\documentclass[accepted]{uai2025} % for initial submission
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.
%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    % \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{bm}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage[noend]{algpseudocode}
% Upright \argmax operator; the starred form places subscripts underneath
% the operator (as limits) in display math.
\DeclareMathOperator*{\argmax}{argmax}
\usepackage{subfigure}
\usepackage{enumitem}
\usepackage{microtype}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% Template example macro: \swap[sep]{a}{b} typesets "b sep a"; the optional
% separator defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% Theorem-like environments. Corollary and Proposition share the Theorem
% counter; Example, Lemma, Definition, and Remark each use their own counter.
\newtheorem{example}{Example}
\newtheorem{thm}{Theorem}
%added by DR from prior version, might not be acceptable. 
% \newtheorem{thm}{Theorem}
\newtheorem{cor}[thm]{Corollary}
\newtheorem{lem}{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem{defn}{Definition}
\newtheorem{rem}{Remark}
% \newtheorem{ex}{Example}

% Shorthand names for the model, the methods, and the evaluation domains,
% so that their typesetting is controlled in one place.
\newcommand{\tao}{\textsf{TaO-MG}}
\newcommand{\mohito}{\textsc{Mohito}}
\newcommand{\pgella}{\textsc{TaO-PGELLA}}
\newcommand{\rideshare}{\textsf{Rideshare}}
\newcommand{\wildfire}{\textsf{Wildfire Suppression}}
% Inline author comment: typesets its argument emphasized and in blue.
% \textcolor and \emph confine the styling to the argument; the declaration
% forms \color and \em would leak into the text that follows \commt{...}.
\newcommand*{\commt}[1]{\textcolor{blue}{\emph{#1}}}

% Relabel the \Require and \Ensure lines of algorithmic blocks as
% "Input:" and "Output:".
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}

\title{MOHITO: Multi-Agent Reinforcement Learning using Hypergraphs for Task-Open Systems}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Gayathri Anil}
\author[1]{Prashant Doshi}
\author[1]{Daniel Redder}
\author[2]{Adam Eck}
\author[3]{Leen-Kiat Soh}
% Add affiliations after the authors
\affil[1]{%
    THINC Lab, School of Computing\\
    University of Georgia\\
    Athens, GA, USA
}
\affil[2]{%
    Computer Science Department\\
    Oberlin College\\
    Oberlin, Ohio, USA
}
\affil[3]{%
    School of Computing\\
    University of Nebraska\\
    Lincoln, Nebraska, USA
}
  
\begin{document}
\maketitle
\begin{abstract}
    Open agent systems are prevalent in the real world, where the sets of agents and tasks change over time.  In this paper, we focus on task-open multi-agent systems, exemplified by applications such as ridesharing, where passengers (tasks)  appear spontaneously over time and disappear if not attended to promptly. Task-open settings challenge us with an action space which changes dynamically. This renders existing reinforcement learning (RL) methods---intended for fixed state and action spaces---inapplicable. Whereas multi-task learning approaches learn policies generalized to multiple known and related tasks, they struggle to adapt to previously unseen tasks. Conversely, lifelong learning adapts to new tasks over time, but generally assumes that tasks come sequentially from a static and known distribution rather than simultaneously and unpredictably. We introduce a novel category of RL for addressing task openness, modeled using a task-open Markov game.  Our approach, MOHITO, is a multi-agent actor-critic schema which represents knowledge about the relationships between agents and changing tasks and actions as dynamically evolving 3-uniform hypergraphs. As popular multi-agent RL testbeds do not exhibit task openness, we evaluate MOHITO on two realistic and naturally task-open domains to establish its efficacy and provide a benchmark for future work in this setting. 
\end{abstract}

\section{Introduction}\label{sec:intro}
In multi-agent systems, each decision-making agent must determine a strategy to achieve collective and/or individual objectives. Learning how to coexist and effectively operate within a shared environment with other agents is challenging but well-studied in \emph{closed} environments where the tasks being accomplished are fixed in time and known in advance.  More challenging is learning how to act in environments with {\em task openness}:  a phenomenon where the set of objectives or tasks is neither static nor predefined. 

Consider ridesharing, such as Uber Pool, operated by autonomous driver agents or robotaxis.  A task in ridesharing is to transport a passenger to their destination.  However, passengers enter spontaneously, in unbounded quantity, with a frequency influenced by exogenous factors. Passengers can also withdraw causing tasks to disappear from the system. Such task openness makes the set of action choices transient, alters action-dependent elements of the problem such as the reward function, and may introduce new relationships between agents. 
%with its peers (e.g., needing to collaborate on some tasks and compete on others), making it challenging for agents to reason optimally. 
As such, actions optimal for one state under one set of tasks might be different from those in the same state under a different set of tasks. Hence, an agent must reason about the changing meaning of actions and availability thereof caused by the entry and exit of tasks.

Recent generalizations of multi-agent RL (MARL) targeting related challenges have produced three important categories of methods. First, \textit{multi-task} learning algorithms learn from multiple related tasks (e.g.,~\citealt{Tanaka03:Multitask,Omidshafiei2017:Multitask,Zhangetal2023:Multitask}) but falter when novel tasks appear. Second, \textit{lifelong} learning enables ongoing adaptation to new tasks over time (e.g.,~\citealt{Thrun1995:Lifelong,Chen2018:Lifelong,Skrynnik24:Lifelong}) but assumes a sequential, and not simultaneous, arrival of the tasks from a known distribution. Third, \textit{out-of-distribution} learning enables agents to detect when their current tasks are different from training (e.g.,~\citealt{Sedlmeier20:OOD,Haider23:OOD}) but does not say how agents should use that information. In short, extant MARL may not apply under task openness due to the underlying constraint where policies map states to a static action set.

In the context of the novel challenges brought about by task openness to MARL, 
%and a lack of relevant, applicable methods, 
this paper contributes the following: 

\begin{itemize}[leftmargin=*,topsep=0in,itemsep=0in]
    \item A first and general decision-making model, {\em task-open Markov games} (\tao), for such multi-agent settings.  
    \item A hypergraph-based knowledge representation schema modeling the relationships between agents, tasks, and actions amid openness.
    \item \mohito{}, a deep reinforcement learning method for \tao{}. \mohito{} interprets the hypergraphs via a graph neural network to implement its multi-agent actor-critic schema, thereby learning a relative evaluation of available actions.
    \item Two domains that naturally manifest this form of openness, rideshare~\citep{aimag} and wildfire suppression~\citep{Eck:AAAI2020}, presented because extant MARL testbeds generally do not exhibit task openness.
\end{itemize}

Evaluations on these domains using standard (domain-agnostic) metrics and domain-centric ones establish \mohito{}'s efficacy and offer insights into its behavior. Importantly, this is the first MARL method to fully target task openness, thereby stimulating further development for this new and pragmatic multi-agent decision-making setting.   


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Background}
\label{sec:background}

We situate task openness in the broader context of open agent systems followed by a review of a well-known framework for modeling multi-agent interactions. 


\subsection{Open Agent Systems}
\label{subsec:oasys}

Openness can manifest in various ways. A recent survey~\citep{aimag} as well as prior  discussions~\citep{Shehory00:Software,Calmet04:Liberal,Jumadinov14:Strategic} recognize three types: \textit{agent openness}, where the set of agents acting in the environment changes over time; \textit{task openness}, where the set of tasks changes over time; and \textit{frame/type openness}, where agents' frames (capabilities, preferences, and reasoning processes) can change, such as when agents acquire new abilities or change roles. Among these categories of open systems, agent openness has received the most attention whereas task openness remains understudied. 

Agent openness was defined more than two decades ago as adding agents beyond the initial number present in a system~\citep{Shehory00:Software}. Subsequent work defines the degree of openness as the complexity of the minimal transformation of a system to add or remove an agent~\citep{Jamroga13:Modularity}. Since then, reasoning methods have shown that explicitly predicting the presence or absence of other agents, where possible, improves global system behavior~\citep{Chandrasekaran:Open,Cohen:OpenDecPOMDPs,Eck:AAAI2020,Kakarlapudi22:Decision}. This leads to the thought that explicitly modeling task openness may have similar success. However, new tasks often arrive due to exogenous factors (e.g., end of a large gathering resulting in many ride-hailing passengers), which may not be possible to model. 


\subsection{Markov Games}
\label{subsec:mg}

Markov games are stochastic games that commonly formalize the multi-agent decision-making process of learning agents that optimize their individual cumulative rewards in competitive (self-interested) or cooperative environments \citep{Shapley:StochasticGames,Littman:MarkovGames}.  Formally, a Markov game is represented by:
\[
M = \langle Ag, S, A, T, R, \gamma, s_0\rangle
\]
\noindent where $\bullet$ $Ag$ is the set of agents operating in the environment; $\bullet$ $S$ is the set of states of the environment encapsulating different situations agents can face;  $\bullet$ $A = \prod_{i \in Ag} A_i$ is the joint action set giving possible combinations of actions taken simultaneously by all agents, with $A_i$ the action set of agent $i$; $\bullet$ $T: S \times A \times S \rightarrow [0, 1]$ is the transition function, specifying the probability of the problem state transitioning from state $s$ to $s'$ when the agents perform their joint action $\mathbf{a}$; $\bullet$ $R: S \times A \rightarrow \mathbb{R}^{|Ag|}$ is the reward function, specifying the collection of individual rewards earned by each agent on joint action $\mathbf{a}$ in state $s$; $\bullet$ $\gamma \in (0, 1)$ represents the discount factor for uncertain future rewards; and $\bullet$ $s_0 \in S$ denotes the initial state of the environment.

In an RL context, the objective of each agent $i \in Ag$ in a Markov game is to learn a policy $\pi_i: S \rightarrow A_i$ that prescribes actions that maximize the agent's expected sum of discounted rewards:
\[
\mathbb{E}_{\tau \sim \pi}\left[\sum_{t=0}^\infty \gamma^t R_i(s_t, \mathbf{a}_t) \,\middle|\, s_0\right]
\]

\noindent where $R_i(s_t, \mathbf{a}_t)$ is the reward of agent $i$ when $i$ chooses action $\pi_i(s_t)$ in state $s_t$ according to its learned policy and the other agents choose the remaining actions in $\mathbf{a}_t$.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{Task-Open MAS}
\label{problem}

Many real-world environments do not abide by the assumptions under which RL generally operates: that the set of tasks agents seek to complete is known in advance, and that those tasks remain in the environment until completed.

Instead, for example, drivers in a ridesharing application (e.g., Uber, Lyft) with vacancy in their vehicles must decide whether to accept new passengers that arrive unexpectedly in their environment. For larger vehicles (e.g., Uber Pool), the driver must balance the needs of multiple simultaneous tasks, such as multiple passengers sharing the same vehicle but desiring different destinations, while possibly accommodating new passengers.

As such, new tasks and goals are introduced over time, and their presence may alter previously planned or learned behavior (e.g., picking up a new passenger changing the intended dropoff order of existing passengers) or cause such behavior to not be optimal. 

Tasks can be seen as outcomes of exogenous or endogenous events which cause tasks to enter or exit. Exogenous factors are driven by nature and independent of the agents' actions, whereas endogenous ones can be influenced by agents' actions. 

For a more nuanced view, let $X = \langle \mathcal{T}^\tau, \mathcal{T}^\omega \rangle$, where $\mathcal{T}^\tau$ includes parameters of tasks resulting from exogenous events such as those causing new passengers to arrive, and $\mathcal{T}^\omega$ contains parameters of tasks affected by endogenous events such as soliciting a passenger. In some circumstances, tasks in $\mathcal{T}^{\tau}$ may be predicted from experience, although reactive behavior is likely necessary.


\subsection{Dynamic Ridesharing}
\label{ridesharing}


We illustrate task openness using the domain of ridesharing, labeled as \rideshare{}, in Fig.~\ref{fig:ridesharing}. Each driver agent $i$ can transport up to $p_i$ passengers simultaneously. Each passenger (task) has a pickup location, a destination, and a fare. Each agent receives the following information from the environment: its own position and capacity, the location of other agents, and the position, destination, and fare of its accepted passengers and those awaiting service.
\begin{figure}[!ht]
\centerline{\includegraphics[width=2.65in]{Figs/rideshare-domain (cropped) (pdfresizer.com).pdf}}
\caption{
% \footnotesize
A rideshare driver operates a vehicle in a task-open MAS where new tasks (passengers) suddenly appear and existing tasks suddenly exit leading to {\em open} action sets $A_i$.}
\label{fig:ridesharing}
\end{figure}

Because passengers appear dynamically, the set of actions $A_i$ of a driver agent $i$ changes over time. For each unserved passenger, the agent can choose to \emph{accept} the passenger for transport.  For each accepted passenger who is not  in their assigned car, the agent can choose to drive to their \emph{pickup} location.  Finally, for each riding passenger, the agent may drive to their destination for \emph{drop off}.  New passengers increase the size of $A_i$ by adding these actions, while completed tasks reduce it. The rewards, $R$, also change as the composition of tasks in the environment changes. New passengers introduce new fares that provide new opportunities for agents to earn rewards.  As a result, each agent's learned utility function must adapt to changes in the set of passengers caused by task openness.

Note that changes in the set of passengers over time \emph{cannot simply be modeled as agent openness}, for which there are existing reinforcement learning and planning solutions \citep{Cohen:OpenDecPOMDPs,Eck:AAAI2020,Rahman:OpenAdHoc,Kakarlapudi22:Decision}. This is because a passenger is not an autonomous actor in \rideshare{}; only driver agents deliberately choose actions to complete tasks.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{Task-Open Markov Games}
\label{sec:model}

Task openness complicates agent decision making within a Markov game by causing many components of the game to potentially change according to task dynamics.  Specifically, the states reflect the features of each task. Changes in task features alter the factored state variables related to the environment (e.g., the current location of each passenger).  Tasks entering or leaving change the space of unique actions available to agents. The transition function must handle the dynamics of both these spaces and possible stochastic changes with current tasks.  The reward function also changes under different tasks, consequently changing the agent's ultimate objective as the expected value of an action may change dramatically as new actions become available.

To support agent reasoning about task openness and handle its compound impacts on the viability and optimality of an action, we propose a time-varying model called the \textbf{task-open Markov game} (\tao{}), formalized as:
\[
\text{TaO-MG} \triangleq \langle {M}, X, \Psi \rangle 
\]
where $M$ is the {\em current} base decision-making model of the problem instance. Here, it is a Markov game as previously described in Section~\ref{subsec:mg}.  $X$ includes (features of) the {\em current} set of tasks in the system. $\Psi$ is the generator function that transforms $M$ when exogenous or endogenous events change the set of available tasks at a given step. These tasks populate $X'=\langle \mathcal{T}'^\tau, \mathcal{T}'^\omega \rangle$. Given $X'$, $\Psi$ updates the current decision-making model $M$ to $M'$ as follows:


\begin{figure*}[ht!]
\subfigure[]{
    \label{fig:hypergraph-rep}
    \includegraphics[width=0.35\linewidth]{Figs/interaction_graph_a (cropped) (pdfresizer.com).pdf}
}
\subfigure[]{
    \label{fig:critic-graph}
    \includegraphics[width=0.35\linewidth]{Figs/interaction_graph_b (cropped) (pdfresizer.com).pdf}
} 
\subfigure[]{
    \label{fig:observation-graph}
    \includegraphics[width=0.23\linewidth]{Figs/interaction_graph_c (cropped) (pdfresizer.com).pdf}
}
\caption{(a) An interaction hypergraph representation of agents, tasks, and action spaces in the \tao{} model for a simple instance of \rideshare{}. (b) The corresponding 2D, critic incidence graph representation of the hypergraph. (c) Agent 2's observation graph: its observed tasks and available hyperedges.}
\label{fig:incidence_graph}
\end{figure*}

    
    \[
        M' =
        \begin{cases} 
          \Psi(M, X') & \text{if } X \text{ transitions to } X'\\ 
          M & \text{otherwise} 
        \end{cases}
    \]
    

    The components of $\Psi$ operate on parameters of the base model $M$ and generate new ones.  Specifically, $\Psi_{S}: S\times X'\xrightarrow{}S'$ updates the state space to include representations of newly added tasks and remove those of exited tasks. $\Psi_A: A \times X' \xrightarrow{} A'$ updates the action space, combining existing actions with those required for new tasks and removing actions no longer associated with any present tasks. $\Psi_T: T \times X'\xrightarrow{}T'$ adapts the transition function to incorporate the new states and actions while excluding those related to exited tasks. 
    %Transitions for new tasks in $X_t$ are updated by comparing the underlying framework $\epsilon$ of the new tasks to the existing framework. This is present in $X^{Ta}$ 
    $\Psi_R: R \times X' \xrightarrow{}R'$ similarly updates the reward function to account for any new or removed states and actions. Note that the rest of the parameters of the Markov game do not change.

Let $\tau = \langle s_t, \mathbf{a}_t, X_t \rangle$ denote a sample experience at timestep $t$. The objective of each agent $i\in Ag$ in \tao{} is to learn a policy $\pi_i: S \times X \rightarrow A_i$ which maximizes the agent's expected sum of discounted rewards, $\mathbb{E}_{\tau\sim\pi} \left[ \sum^\infty_{t=0} \gamma^t R'_{i}(s_t,\mathbf{a}_t) \,\middle|\, s_0 \right]$, in the task-open context where the reward function may change, $R'=\Psi_R(R,X)$.  

   
    %In planning, this constitutes monitoring $X$ and solving the updated $M'$ if $X$ changes. In model-free RL, learning a policy which can adapt to changing $X$.

% For planning or model-based RL, an agent in the system solves the current decision-making model, executing the obtained policy while simultaneously monitoring $X$. If one or more new tasks arrive and $X$ transitions, the agent pauses the execution of its current policy and applies the generator $\Psi$ to the current model $M$ to obtain a new model $\Psi(M,X')$, which the agent, $i$, then solves to obtain the revised policy, $\pi'_i$, that is also cognizant of the new tasks. The goal of an agent in a \tao{} is to maximize its reward with the extra challenge of a non-stationary reward function $R$ that changes with $X$. 
% Formally, in \tao{}, an optimal policy is one that maximizes the sum of discounted rewards from the unbounded space of grouped tasks $\chi$;

% $$
% \tau = <s_t, a_t>, R' = \Psi_R(R, X_t)
% $$
% $$
% \mathbb{E}_{X\sim \chi}\mathbb{E}_{\tau\sim \pi'}\big[\sum_{t=0}^\infty [\gamma^tR'_i(s_t, a_t | s=s_0)]\big].
% $$

% It may be impossible to prove optimality here. We describe this to distinguish the goal from achieving optimality for each individual task $x$.


We utilize a Markov game representation for a given $X$.  We assume that the environment is fully observable and that rewards can be individual. This base model may be replaced by a different type of decision-making process as appropriate (e.g., multi-agent MDP~\citep{10.5555/1029693.1029710}, interactive partially observable Markov decision process (I-POMDP)~\citep{Gmytrasiewicz05:Framework:JAIR}, or decentralized POMDP (Dec-POMDP)~\citep{Oliehoek:DecPOMDP}). A related model to \tao{} is the time-varying Markov decision process (TVMDP)~\citep{Liu:TVMDP,Ornik2021-ts}. TVMDP models a single agent's decision process in environments with exogenous changes to the transition function.  Our \tao{} model can be viewed as extending TVMDPs to multi-agent settings with \emph{dynamic}  states, actions, and rewards.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\section{MARL Under Task Openness}

We present the first RL method for \tao{}, called Models Of Hyper Interactions under Task Openness               (\mohito{}), that adopts a graphical problem representation and engages in actor-critic based model-free learning.

\subsection{Interaction Hypergraphs}
\label{subsec:hyper}

The primary challenge in designing a learning algorithm for a task-open setting lies in accommodating the changing set of actions from the openness. Conventional RL algorithms learn $\pi: S \xrightarrow{}A$ by evaluating each state-action pair. In contrast, our work performs a relative comparison of the {\em current set of actions} to choose an action. That is, a generalized policy must also consider the set of tasks and the corresponding actions available to the agent within that state. It is not trivial to represent these dynamic relationships induced by task openness.
%To represent these dynamic relationships such that the property of openness is captured for the learning process to exploit is non-trivial. 
To address this, we represent actions, tasks, and agents through a graphical construct, which we call an \emph{interaction hypergraph}.  This hypergraph generalizes coordination graphs~\citep{Guestrin02:Context,pmlr-v119-boehmer20a}, which limit interactions to between agents only.   



% \begin{defn} [Interaction Hypergraph]
% An interaction hypergraph is a 3-uniform tripartite hypergraph $\mathcal{G} = \langle Ag^n, X^n, \cup_{i=1}^{|Ag|} A_i^n, E \rangle$ comprised of three types of nodes: \emph{agent nodes} in $Ag^n$ encapsulate the state-specific information of agents; \emph{task nodes} $X^n$ contain state information about the current tasks $X$ in the environment; each agent's \emph{action nodes} $A_i^n$ hold details about the actions available to the agent. $E$ is a set of 3-uniform hyperedges where each hyperedge (agent, task, action) contains one node from each set $Ag^n$, $X^n$, and $\cup_{i=1}^{|Ag|} A_i^n$, respectively.    
% \end{defn}

\begin{defn} [Interaction Hypergraph]
An \textbf{interaction} \textbf{hypergraph} is a 3-uniform (tripartite) hypergraph $\mathcal{G} = \langle \mathcal{N}, \mathcal{X}, \mathcal{A}, E \rangle$ comprised of three types of nodes: \emph{agent nodes} $\mathcal{N} = \{\mathit{node}(i) \mid i \in Ag\}$ store the state-specific information of agents $Ag$, notably including observations of other agents; \emph{task nodes} $\mathcal{X} = \{\mathit{node}(x) \mid x \in X\}$ contain state information about the current tasks $X$ in the environment; \emph{action nodes} $\mathcal{A} = \{\mathit{node}(a) \mid a \in \bigcup_{i \in Ag} A_i\}$ hold details about the unique actions currently available to the agents. $E$ is a set of 3-uniform hyperedges where each hyperedge (agent, task, action) contains one node from each set $\mathcal{N}$, $\mathcal{X}$, and $\mathcal{A}$, respectively.    
\end{defn}

We present the complexity of encoding knowledge of the environment in hypergraphs in Lemma~\ref{lem:interaction-graph-complexity}, with its derivation in Appendices~\ref{appendix:space-complexity} and~\ref{appendix:time-complexity}. Our bound is quadratic, but the more common case of tasks that share all unique actions has a linear bound. 

\begin{lem}[Interaction graph complexity]\label{lem:interaction-graph-complexity} The number of hyperedges is bounded by $\mathcal{O}(|\mathcal{N}| |\mathcal{X}| |\mathcal{A}|)$, and the number of action nodes is bounded by $\mathcal{O}(|Ag||A_i|)$. Thus, we can construct the graph in $\mathcal{O}(|Ag|^2 |X||A_i|)$ space and time.
%so we can construct $\mathcal{G}$ in $\mathcal{O}(|Ag|^2 \cdot |X| \cdot |A_i|) \ll \mathcal{O}(|Ag| \cdot |X| \cdot |A|)$ time and space. $|\mathcal{A}| \in \mathcal{O}(|Ag| \cdot |A_i|) \ll \mathcal{O}(|A|) = \mathcal{O}(|A_i|^{|Ag|})$.
\end{lem}


Figure~\ref{fig:hypergraph-rep} illustrates an interaction hypergraph for a setting of \rideshare{}, introduced previously in Sec.~\ref{ridesharing}. %Note that each action here constitutes an action available for a specific task and no other edges are present.
We mitigate the representational complexity of working with hypergraphs by transforming them into 2-uniform bipartite graphs, referred to as  Levi incidence graphs. Figure~\ref{fig:critic-graph} shows the 2-uniform bipartite graph for the tripartite hypergraph in Fig.~\ref{fig:hypergraph-rep}. This regular graph adds an additional set of nodes $E$ which denote hyperedges. We include \emph{InteractionGraph}, a polynomial-time algorithm in Appendix \ref{appendix:time-complexity}, for constructing an interaction hypergraph from agent observation, and empirical time and memory profiling of our proposed approach in Appendix \ref{appendix:time-profiling}. % This algorithm contributes a time overhead to any further approach we use, but is easily performed in linear time:

% \begin{lem} [Interaction graph complexity] The number of hyperedges is bounded: 
% $|E| \in \mathcal{O}(|Ag^n| \times |X^n| \times |A^n|)$. Therefore, we can construct the graph in $\mathcal{O}(|Ag^n| \times |X^n| \times |A^n|)$ time.
% \end{lem}



%\proof Interaction graphs incorporate all task-actions available to agents while maintaining a linear space\footnote{we include a algorithm to construct interaction graphs from a common observation structure, and a associated linear time complexity proof in appendix...} complexity.

%\begin{lem} (Graph, g, complexity). Let the space used to store a graph be:
%\begin{split}
%    |g| =& (|g_n| \times |features|)\\+& (2 \times|g_e| + |g_e| \times |edge\_features|)
%\end{split}
%\end{lem}
%still deciding how much of the proof to put here


%Recall that at each time step, the interaction hypergraph encapsulates the agents' observable states and tasks, along with the associated action sets. 
%The primary objective of an RL method is to evaluate the problem state via the interactions among agents, tasks, and actions at each time step, thereby enabling the selection of the best action for that time step. 
\subsection{\mohito{}}
\mohito{} leverages the interaction hypergraph's explicit representation of task-open problems to learn a centralized training decentralized execution (CTDE) actor-critic model in a task-open environment.  Each agent has a local actor (or policy) and a local critic. Agents' policies are learned
%, during the training, 
in a centralized manner by exchanging local observations among the critics, thereby leveraging joint observations of the state and the joint action set. This architecture, analogous to MADDPG's~\citep{Lowe2017:MAPE}, allows each agent to learn considering both individual and collective preferences yet remain an autonomous agent during execution.

As illustrated in Fig.~\ref{fig:module_structure}, both the actor and critic are graph neural networks (GNN),  specifically graph attention networks~\citep{GAT}, which are representations amenable to dynamic environments. We use \citet{pmlr-v70-gilmer17a}'s message passing framework to facilitate aggregating knowledge of 
%In this framework, each node in the graph communicates with its neighbors by exchanging and transforming information through message passing. Information about 
the current state of agents, tasks, and available actions. Knowledge is aggregated into the hyperedge nodes of the Levi incidence graph.  Hence, the agents learn how to combine information about dynamic task and action sets as represented in the interaction hypergraphs, so that they can reason about their task-open environment.



%this paragraph is where the gradient should be officially described.
The actor network of agent $i$ processes the agent's {\em observed interaction incidence graph}, or \textit{observation graph}, $G_i \in \mathbf{G}_{\mathbf{O}} = \{G_{1},\dots,G_{|Ag|}\}$ (e.g., Fig.~\ref{fig:observation-graph}). 
%with directed edges converging from agent, task, and action nodes towards the hyperedge nodes. Then the GNN engages in message-passing aggregating  data from the agent, task, and action nodes into the hyperedge node using the hyperedge nodes as the principal decision-making entities. 
 The updated features of each hyperedge node $ed_{ik} \in {E}$ for the $i^{\text{th}}$ agent and $k^{\text{th}}$ node are the actor's evaluation of that hyperedge's connected (agent, task, action) nodes. At each decision point, agent $i$'s actor GNN deterministically selects the hyperedge with the highest evaluation $ed_i=\argmax_{k}\sum_{f} ed_{ik}[f]$, where $f$ indexes the features of $ed_{ik}$. The agent then performs the action linked to that hyperedge. %In other words, the actor GNN selects one of the hyperedge nodes linked to the subject learner agent, serving as the output. On choosing the hyperedge node with the highest value, the corresponding action is obtained for updating the environment through a mapping of the action node connected to it. 
Each actor $\pi^{\bm{\theta}_i}$ is a deterministic model with target $\pi^{\bm{\theta}'_i}$. Actor $i$'s loss function,
\begin{equation}
L_{\pi_i}=  \frac{1}{B} \left[  \sum_{j \in \text{batch}} -Q_i^{\bm{\phi}}(G^{j}_{\mathbf{C}}, ed^j) \right] + \lambda_A |\bm{\theta}_i - \bm{\theta}'_i|, %+ 
    \label{Eq.actor_pg}
\end{equation}
includes the $i^{\text{th}}$ critic's evaluation $Q_i$ of the agent's action preferences, given the hyperedges from all agents $ed^j$ and the critic graph $G^j_{\mathbf{C}}$, averaged over a batch of samples $j \in \text{batch}$ of size $B$, and a regularization component moderated by $\lambda_A \geq 0$. The policy gradient is computed as
\begin{equation}
\begin{split}
    \nabla L_{\pi_i}&=  \frac{1}{B} \left [ \sum_{j\in\text{batch}} \nabla_{\bm{\theta}_i} \pi^{\bm{\theta}_i}(G_i^j)%ed_{i}^{j} 
    \nabla_{ed^{j}_i}\left(-Q_i^{\bm{\phi}}(G_{\mathbf{C}}^j,ed^j)\right) \right ]\\ &+ \lambda_A\frac{\bm{\theta}_i - \bm{\theta}'_i}{|\bm{\theta}_i - \bm{\theta}'_i|}.
    \end{split}
    \label{Eq.actor-gradient}
\end{equation}

% This $Q$ is a function of continuous preference over discrete action choices, $ed$ not $\mathbf{a}$, so we can differentiate over




The critic network processes the {\em critic interaction incidence graph} or \textit{critic graph} (e.g., Fig.~\ref{fig:critic-graph}), which combines agent observation graphs and extracts all observed task features by performing a disjunctive graph join over all observation graphs, $G_{\mathbf{C}}=
%[[G_0 V,...V G_{|Ag|}]]
\bigvee_{i=1}^{|Ag|}G_i$ \citep{bergami2016joininggraphs}. Agent $i$'s critic value $Q^{\bm{\phi}}_i$ with target network $Q^{\bm{\phi'}}_i$ takes as input the critic graph, $G_{\mathbf{C}}$, and all $ed_i$. The target networks $\theta'$ and $\phi'$ are slowly updated, every $K$ batches, by the difference between the main and the target weighted with hyperparameters $\psi_A$ and $\psi_Q$,  respectively.
%Each critic incorporates actor outputs by mean pooling over all GAT-processed state incidence graph nodes. Then combine this with all actor $ed_n$s through a MLP, see figure \ref{module structure}, to yield $Q_n$.
The critic loss,
    \begin{align}
        L_{Q_i}= & \frac{1}{B}\sum_{j\in \text{batch}} \left ( r_i^j + \gamma Q_i^{\bm{\phi}'}(G'^j_{\mathbf{C}}, ed'^j) - Q_i^{\bm{\phi}}(G^{j}_{\mathbf{C}},ed^j) \right )^2 \nonumber\\ 
        & + \lambda_C |\bm{\phi} - \bm{\phi}'|
    \label{Eq.critic-loss}
    \end{align}
is the mean squared error between expected and calculated $Q$ values for each critic with a similar $\lambda_C \geq 0$ regularization parameter. The gradient remains the same as MADDPG's critic gradient except for the use of $ed$ over $a$. The architectures, hyperparameters, and operational details of the actor and critic networks are discussed further in Appendix~\ref{appdx:MOHITO-parameters}. 

\begin{figure}[ht!]
    \centering
    \includegraphics[width=\linewidth]{Figs/TaO-Page-10_revised (cropped) (pdfresizer.com).pdf}
    \caption{\mohito{} uses an actor-critic schema with one actor and critic per agent. Observations (as incidence graphs) are shared between critics enabling centralized training. }
    \label{fig:module_structure}
\end{figure}

\textbf{Algorithm~\ref{alg:MOHITO}} presents the algorithm for \mohito{}. After collecting observation graphs, which capture the varying sets of tasks and associated actions, into a batch (lines 4--9), the algorithm generates the critic graphs (line 10) and utilizes the batch to engage in actor-critic training (lines 12--20) with the loss functions defined previously in Eqs.~\ref{Eq.actor_pg} and~\ref{Eq.critic-loss}. We show the complexity of \mohito{} in Theorem~\ref{thm:policy} with its analysis in Appendices~\ref{appendix:space-complexity} and~\ref{appendix:time-complexity}. The sizes $|\theta|$ and $|\phi|$ are equal to $\text{\#layers} \cdot \text{\#heads} \cdot f^2 + \text{\#heads} \cdot f$, where $f$ is the feature size of $\mathcal{N}$ \citep{GAT}. The numbers of heads and layers are structural parameters of a GAT.

\begin{algorithm}[!ht]
\caption{\mohito
% \\After collecting samples into a batch (lines 4-9), \mohito{} utilizes the batch to engage in actor-critic training (lines 12-20) using actor loss (Eq. \ref{Eq.actor_pg}) and critic loss (Eq. \ref{Eq.critic-loss}).
}

\begin{algorithmic}[1]
\For{ $episode \gets 1$ to $N$}
    \State Get $obs$ from environment with current tasks $X$
    \State $\mathbf{G}_{\mathbf{O}}$:$ [G_1,..., G_{|Ag|}]\gets$ InteractionGraph($obs$) 
    \While{ $episode$ not complete} \Comment{Online} 
    % \State \mathbf{a} = (a_0,...a_{|A_g|}) \  \text{where} \ a_i, ed_i = \pi_i(G_i) \forall i \in agents$
    \State $a_i, ed_i \gets \pi^{\bm{\theta}_i}(G_i) \quad \forall i \in Ag$ 
    \State $\mathbf{a}\gets \langle a_1, \dots, a_{|Ag|} \rangle$ or $\epsilon$-greedy
    %with prob $(1-\epsilon)$ else $\sim \text{uniform}(A)$
    \State $obs',r \leftarrow$ environment$(\mathbf{a})$

    %\State
    
    \State $\mathbf{G'}_\mathbf{O} \gets $ InteractionGraph($obs'$) 
    % \State $G_{\mathbf{O}}', r = \begin{cases} env(\mathbf{a}), \  p(1-\epsilon)\\ env(\mathbf{a}) \ \mathbf{a} \sim \text{uniform}(A), \ \text{else} \end{cases}$
    %Perform $\mathbf{a}$
    %$(a_1, \ldots, a_{|Ag|})$ with 
    %prob. (1 - $\epsilon$) else $\mathbf{a}\sim A$. Get  $G'_{\mathbf{O}}$ and $r$ 
    \State $a'_i, ed'_i \gets \pi^{\bm{\theta'}_i}(G_i') \quad \forall i \in Ag$ 
    \State $G_{\mathbf{C}} \gets \left (\bigvee_{i=1}^{|Ag|}G_i\right ), \quad G'_{\mathbf{C}} \gets \left (\bigvee_{i=1}^{|Ag|} G'_i \right )$
    \State $\text{batch} \gets \text{batch}\cup (G_{\mathbf{C}}, {\bf ed}, r, G'_{\mathbf{C}}, {\bf ed'})$
    
        \If{$|$batch$| = B$} \Comment{Offline learning}
            %CRITIC UPDATE
            \State Obtain $L_{Q_i}$ from Eq.~\ref{Eq.critic-loss} $\forall i \in  Ag$
            \State Backprop. $L_{Q_i}$ and update $Q^{\bm{\phi}_i}$~~~ $\forall i \in  Ag$
            %ACTOR UPDATE
            \State Obtain $L_{\pi_i}$ from Eq.~\ref{Eq.actor_pg} $\forall i \in Ag$
            \State Backprop. $L_{\pi_i}$ and update $\pi^{\bm{\theta}_i}$~~~ $\forall i \in Ag$
            \State batch $\leftarrow$  $\emptyset$
        \EndIf
     \EndWhile
        \For{agent $i$ after every $K$ episodes}
            % \State Slow update 
            \State $ \bm{\theta}'_i  \gets \psi_A \times \bm{\theta}_{i} + (1-\psi_A) \times \bm{\theta}'_{i}$
            \State $ \bm{\phi}'_i  \gets \psi_Q \times \bm{\phi}_{i} + (1-\psi_Q) \times \bm{\phi}'_{i}$
        \EndFor
\EndFor
\end{algorithmic}

\label{alg:MOHITO}
\end{algorithm}

% \begin{thm} [\mohito{} policy complexity]
% Assuming the time and space complexity of a GAT are the same \cite{GAT}, $GAT^{time} = \mathcal{O}(|G^j|\times |features|^2 + |G^j| * |features|)$ then $\pi^{time}= \mathcal{O}(heads \times |features|^2 \times |Ag^n|\times |X^n| \times |A^n|)$, which is \textbf{linear} in the size of the interaction hypergraph.
% \end{thm}


\begin{thm}[\textup{\mohito{}} complexity] \label{thm:policy} We perform $\mathcal{O}(|Ag|)$ network queries in one iteration of training.  Each query is bounded by  $\mathcal{O}(|\theta| |G_\mathbf{C}|)$.  All other procedures are dominated by one query. One iteration of \textup{\mohito{}} for $|\text{batch}|=B$ is bounded by $\mathcal{O}(B|\theta||Ag|^3 |X||A_i|)$.

% $G_\mathbf{C} \subset G_{\mathbf{O}}$
% Assuming the time and space complexity of a GAT are the same \citep{GAT}, $GAT = $ then $\pi= \mathcal{O}(heads \cdot f^2 \cdot |\mathcal{N}|\cdot |\mathcal{X}| \cdot |\mathcal{A}|)$; \textbf{linear} in $G_O$
\end{thm}

%I think it makes the most sense to leave the algorithm inline immediately after the prior paragraphs describing it. What I'm not sure about is whether it makes sense to put the time proof here?


%here I want to show 
%1) how we determine the attention complexity
%2) the components of a single training step 
%3) the final O time complexity for the training step

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Experiments}
\label{setup}

To evaluate the effectiveness of \mohito{}, we empirically test its performance in instances of two experimental domains: the aforementioned \rideshare{} and the \wildfire{} benchmark for studying open agent systems~\citep{Chandrasekaran:Open,Eck:AAAI2020,Kakarlapudi22:Decision}. Domain implementations and \mohito{}'s code are available at \url{https://github.com/oasys-mas/mohito-public}.  

% Where these episodes have stochastic seeds not seen in training. 
%TODO clarify


\subsection{Dynamic Ridesharing}

\label{subsubsub:dynamic-rideshare-setup}
We consider settings with $|Ag|$ = 2, 3, and 4 driver agents, as well as 3 levels of increasing task openness. These nine environments start with a set of initial tasks $|X_0| \sim \left[|Ag|-1, |Ag|+3\right]$. Due to task openness, additional tasks are added stochastically over time based on the \textbf{openness level (OL)}, our simulation of the entry of exogenous tasks. Levels 1, 2 and 3 add 6, 9, and 12 ride requests throughout the episode at randomly sampled times, respectively. At each of those times, 1-3 new tasks are introduced with probabilities 70\%, 20\%, and 10\%, respectively.  We formally define this domain as a \tao{} in Appendix~\ref{appendix:tao-definitions}.
%\noindent \textbf{Training} We train the agents for 20,000 episodes,
%\footnote{To demonstrate performance during training, we also evaluate performance on a set of (1,2) validation runs every (25,50) training episodes for \rideshare{} and \wildfire{} respectively.  We report these results in Appendix A.2.} 
%each episode consisting of at most 100 steps or until all tasks are completed.  After training, we conduct a set of 30 testing runs on each openness level and evaluate the performance of \mohito{} using the performance measures described after our next domain.

\mohito{}'s training in each configuration of number of agents and openness level generally spans 20,000 episodes, by which point we observed stability in the loss. Each episode lasts at most 100 steps or until all tasks are completed (scheduled passengers served). We checkpoint each model every 50 model updates (every batch-size steps, see Appendix~\ref{appdx:MOHITO-parameters} for parameters) and utilize the best-performing policy among the checkpoints. We evaluate \rideshare{} instances on 10 episodes for each set of agents and openness level. 



\subsection{Dynamic Wildfire Suppression}
\label{subsec:wildfire-suppression}

We use \wildfire{} as our second domain, where agents coordinate to extinguish fires before they burn out. Agents do not move. They choose to \textit{suppress} a fire they can reach, or engage in \textit{No-Op} to refill their suppressant. An agent without suppressant may only \textit{No-Op}; agents thus leave the environment when out of suppressant. A fire's size (small or medium) dictates how many agents must attack the fire to stochastically decrease its intensity.
%may make people think that it is possible to put out a medium fire with one agent.
%, with more agents improving the likelihood. 
% Fire intensity ranges within $[0, 4]$ between no fire to a burned out fire lacking additional fuel to burn.
In this cooperative domain, rewards are joint and conditioned on the size of fires. Fires that burn out incur a penalty of $-10$ or $-25$, and fires that are put out award $+20$ or $+400$, for small and medium fires, respectively. Medium fires are more dangerous and require collaboration, so we set a higher reward for their put outs and a higher penalty for their burn outs. Fires can increase in intensity over time if not attended to, and a fire spreads to adjacent cells using a realistic wildfire spread model~\citep{Boychuk09:Fire,Ure15:Fire}.

\begin{figure}[ht!]
    \centering
    \includegraphics[width=.9\linewidth]{Figs/Wildfire_States.pdf}
    \caption{\wildfire{} starting states. Fires start at intensity 2 or 3 (4 is burned out and 0 is put out). Fires lit in cells with a caution icon start at intensity 3.}
    \label{fig:starting-states}
\end{figure}

This environment is naturally task-open. Each grid-cell is a large sector and therefore allows multiple fires in a sector (we cap at 13). Some cells have more fuel, and fires lit in those cells start at a high intensity (of 3), which is one intensity level away from burning out. 
%The fire spread model considers each cell in this grid world a square tract of land of known size, so we also allow multiple fires (i.e., tasks) to exist within a cell simulating how firefighters generally must choose which region and which fire in the chosen region to attack, up to a practical maximum of 13. 
%We also add an exogenous component, "accelerant" which is present in some cells or initial fires that make lit fires there start as medium size at intensity 3, one away from burning out. 
We formally define this problem domain as a \tao{} in Appendix~\ref{appendix:tao-definitions}. \mohito{}'s training in each configuration again spans 20K episodes, and each episode consists of at most 100 steps or until all fires are put out or burn out. We evaluate \wildfire{} instances on 20 episodes for each start state shown in Fig.~\ref{fig:starting-states}.

%Each cell in this grid world represents an area all burning or unlit as states. 
%Each new fire represents a new task for the agents, introducing task-openness. %While rideshare has strictly exogenous factors causing openness, this wildfire configuration is strictly endogenous. %
%We define \textbf{openness levels} 1-3 with increasing values of \textit{base spread} parameterizing the rate at which a fire spreads to a new location. 


%\noindent \textbf{Training} We train agents for 20,000 episodes switching starting, as seen in Fig:\ref{fig:starting-states}, states sequentially, the episode ends if all tasks are complete, burned or put out. Convergence is reached when a moving average has not increased for 5 validations, where we start calculating the moving average once agents consecutively receive an average reward $>50$ for 2 validations. Testing is performed on 30 episodes, 10 of each starting state. 

\vspace{-0.05in}
\subsection{Baselines and Measures}

As there are no prior MARL solutions for task-open environments, we compare \mohito{} with effective baselines inspired by domain-specific strategies. 

\begin{itemize}[leftmargin=*,topsep=0in,itemsep=0in]
    \item \emph{FCFS}: a first-come, first-served policy prioritizing the longest waiting task (as in Uber rides from airports~\citep{airportq} and offensive firefighting~\citep{USFARiskManagement}).
    \item \emph{NTF}: a nearest-task first greedy policy prioritizing the task nearest to completion (e.g., defensive firefighting~\citep{USFARiskManagement}).
    \item \emph{MOHITO-NoTaskNodes}: an ablation of \mohito{} where we do not include task nodes in the incidence graph. Actors choose actions without regard to which task they belong to. The task is then selected randomly among the tasks with that action. %Tasks are chosen randomly and the actor chooses the corresponding action.}
    \item \pgella{}: a multi-agent, task-open adaptation of PG-ELLA~\citep{Ammar14:Online}, a popular RL method for lifelong learning. \pgella{} learns an actor-critic architecture using policy gradient. It is parameterized through fixed-size weights that are global for all tasks and task-specific weights (see Appendix~\ref{appendix:tao-pg-ella} for details).
\end{itemize}


% \begin{itemize}[leftmargin=*,topsep=0in,itemsep=0in]
%     \item \textit{FCFS}: a first-come, first serve policy prioritizing the longest waiting task; i.e., a real world locally-optimal policy for \rideshare{} \cite{airportq} and \wildfire{} (defensive firefighting).
%     \item \textit{NTF}: a nearest-task first greedy policy prioritizing the task nearest to completion; i.e., another common strategy in \rideshare{} (\cite{uber}) and \wildfire{} (offensive firefighting).
%     \item \textit{TaO-PGELLA} (\rideshare{} only): a multi-agent, task-open adaptation of PG-ELLA \cite{Ammar14:Online}, a popular RL method for lifelong learning. It learns centrally and conditions actions on a learned latent subset of all task features as the distribution of tasks is not available in task-open settings (see Appendix A.4 for details).  We do not use \pgella{} in \wildfire{} due to its poor performance in \rideshare{} and fundamental problems with adapting lifelong learning to task-open environments.
% \end{itemize}


\begin{figure}[t]
    \centering
    \subfigure[Rideshare - Episodic Rewards]{ \includegraphics[width=.9\linewidth]{Figs/performance_evaluation_mohito-updated (cropped) (pdfresizer.com)-2_revised.pdf}
    \label{fig:rideshare-rewards}
    }
    \qquad
    \subfigure[Wildfire - Episodic Rewards]{
        \includegraphics[width=.9\linewidth]{Figs/wildfire_rewards.pdf}
        % \caption{Wildfire        %\footnotesize
        % Testing rewards across 3 openness levels and 30 testing episodes for wildfire
        % }
    \label{fig:wildfire-testing-rewards}
    }
    \caption{Mean episodic rewards (with 95\% CI). (a) \mohito{} performs well in \rideshare{}. (b) \wildfire{} shows larger CIs, likely from stochastic transitions.}
\end{figure}

\begin{figure*}
    \subfigure[Rideshare - Passenger Duration]{
    \includegraphics[width=.33\linewidth]{Figs/ride_duration (cropped) (pdfresizer.com)_revised.pdf}
    \label{fig:rideshare-pass-time}
    }
    \subfigure[Wildfire - Fire Duration]{
    \includegraphics[width=.33\linewidth]{Figs/wildfire_duration (cropped) (pdfresizer.com)_revised.pdf}
    \label{fig:wildfire-time-analysis}
    }
    \subfigure[Rideshare - Pooling Efficiency]{
    \includegraphics[width=.33\linewidth]{Figs/pooling-efficacy-updated (cropped) (pdfresizer.com).pdf}
    \label{fig:rideshare-pooling}
    }
\caption{(a) Mean task duration shows passengers spending more time riding with \mohito{} while they are pooled. (b) Mean task duration shows \mohito{} only allowing fires to burn about as long as NTF. (c) The amount of time \mohito{} spent carrying multiple passengers (pooling) vs.\ carrying one or none.} 
%(d) }
%\caption{(a-b) Mean task duration for \rideshare{} and \wildfire{} (c) Average number of steps spent pooling. (a) \mohito{}'s pooling increases the time an average passenger is waiting despite the increased rewards. (b) \mohito{} lets the fires burn longer extending the episode duration to allow for additional rewards from spread fires. (c) The amount of time MOHITO spent carrying multiple passengers (pooling) vs carrying one or none. (D) The Average number of Fires put out (positive) and burned out (negative). We can see \mohito{} has significantly fewer burn outs.} 
\end{figure*}

%We include NTF as a common real world local optima for ridesharing and wildfire: taxis, and defensive fireground operation strategies \cite{fire-strategies}. We choose FCFS because it represents another common policy: uber airport pickup \cite{uber}, and offensive fireground operation. For \rideshare{}, both of these policies involve no pooling of passengers, so for a fair comparison we also consider \pgella{} which allows for this possibility. 
% Wildfire does not have this issue.

\noindent We evaluate the performance of \mohito{} and the baselines using the following performance measures. 

\begin{itemize}[leftmargin=*,topsep=0in,itemsep=0in]
\item \emph{Episodic cumulative rewards}: the sum of rewards earned by all agents across testing episodes, measuring the effectiveness of agents at accomplishing their tasks.
\item \emph{Duration}: the timesteps taken to complete tasks, measuring the efficiency of agents at accomplishing their dynamically changing task set (ride-status shown for \rideshare{}).
\item \emph{Pooling} (\rideshare{} only): Time spent by agents multitasking by carrying multiple passengers simultaneously, a domain measure of efficiency with dynamic tasks.
\item \emph{Fires burned/put out} (\wildfire{} only): the number of fires that burned out or were put out, a direct domain-specific measure of task success or failure as a result of policy behavior.
\end{itemize}

  


\subsection{Evaluations}
\label{subsec:evaluation}

\paragraph{\mohito{} improves on the baselines.} Figure~\ref{fig:rideshare-rewards} presents the mean of cumulative rewards (with 95\% confidence intervals) earned in \rideshare{} for the various openness levels. Similarly, mean rewards in \wildfire{} across all start states are shown in Fig.~\ref{fig:wildfire-testing-rewards}. \mohito{} outperforms all baselines in each openness level with statistical significance (Wilcoxon signed-rank tests, $p<.01$; \rideshare{} $n=30$, \wildfire{} $n=60$) for each domain.\footnote{When all openness levels are considered together, \mohito{}'s performance remains significant except against NTF in \wildfire{}.} 
%\textcolor{blue}{except for NTF in \wildfire{}. 
We do not use \pgella{} in \wildfire{} due to its poor performance in \rideshare{} and fundamental problems with adapting lifelong learning to task-open environments.
% Similarly, we find statistically significant improvement (Wilcoxon signed ranked tests, $n$=90, $p<.01$) in \wildfire{} as seen in Figure \ref{fig:wildfire-testing-rewards}. 
Increased openness offers opportunity to service more passengers and risk when fighting more fires. Rewards generally increase with openness demonstrating that \mohito{} learns to exploit additional task opportunities to increase its performance over the baselines.

\mohito's policy is, in part, similar to NTF in \wildfire{} but \mohito{} earns a slightly higher reward than NTF because it puts out more medium fires compared to NTF (see Fig.~\ref{fig:wildfire-task-counts}a) in each openness level. As such, it displays better collaborative behavior between agents to tackle the most challenging tasks. Nonetheless, NTF is a strong baseline across our domains as it uses task-specific insight to select a greedy action. 

\noindent \textbf{Task-action representation matters.}~ In Fig.~\ref{fig:wildfire-testing-rewards}, we see \mohito{} significantly outperform \emph{MOHITO-NoTaskNodes}. The ablation's policy is not conditioned on task observations, so it cannot focus attacks on specific fires. Meanwhile, the shorter fire durations in Fig.~\ref{fig:wildfire-time-analysis} and fewer burnouts (see Fig.~\ref{fig:wildfire-task-counts}) indicate that  \mohito{} better manages risk by strategically distributing suppressant across fires to reduce burn-outs. This exemplifies the importance of learning a task preference in policies. 


\begin{figure}[!ht]
    \centering
    \subfigure[Put Outs (more is better)]{
    \centering
    \includegraphics[width=1\linewidth]{Figs/wildfire_fires_is_burned_outFalse (cropped) (pdfresizer.com).pdf}
    }
    \subfigure[Burn Outs (less is better)]{
    \centering
    \includegraphics[width=1\linewidth]{Figs/wildfire_fires_is_burned_outTrue (cropped) (pdfresizer.com).pdf}
    }
    \caption{Box plots of the numbers of fires (a) put out and (b) burned out by various methods per episode. The red dot indicates the mean. \mohito{} has significantly fewer burn outs than  \emph{FCFS} and \emph{MOHITO-NoTaskNodes}, even as the openness increases.}
    \label{fig:wildfire-task-counts}
\end{figure}



\label{subsubsubsec:mohito-reasons-about-openness}
%In Fig:\ref{fig:wildfire-testing-rewards}, we see MOHITO also outperforms the baselines in Wildfire Suppression. In both these domains, rewards increase as openness increases, demonstrating that MOHITO learns to exploit additional task opportunities to increase its performance over the baselines.  
% Higher openness level in Wildfire Suppression increases the likelihood that new fires spread from the initial fire.  
%TODO needs to be changed.
%\textcolor{red}{By carefully operating in \wildfire{}, \mohito{} agents learn a strategy of managing risk by controlling the number of  fires. We see in Fig.~\ref{fig:wildfire-task-counts} it maintains a similar number of put-out vs burned out fires between openness levels, while all other baselines let more fires burn out despite putting out more}

%the initial fire so that it does not burn out of control, then quickly putting out new, smaller fires as they appear (which can be addressed by agents individually) 
% in order to increase their total rewards.  This also leads to a task duration similar to NTF 

%. This shows \mohito{} manages the risk of fires igniting beyond its ability to fight, and  thus reasons about  task dynamics. 

%As a result, agents still contain the spread before it grows into a conflagration.  This implies that \mohito{} learns to exploit task-openness and manage the associated uncertainty, adapting its behavior when new tasks bring new reward opportunities while "nipping in the bud" the initial fire and smaller fires to mitigate the amount of additional fires (tasks) appearing in the environment. 


\noindent \textbf{\mohito{} can multitask.}~ In \rideshare{}, we observe that \mohito's converged policies lead to the complex phenomenon of \textit{passenger pooling}. This involves picking up multiple passengers and conveying them efficiently to their destinations. We observe that the pooling behavior increases relative to single passenger rides at lower openness levels and higher agent counts in Fig.~\ref{fig:rideshare-pooling}.
%\footnote{\textcolor{blue}{except for the 3-agents-OL3 configuration where pooling happens more than single tasks. This appears to be due to one agent only performing \textit{No-Op}, a behavior we observe in \rideshare{} that persists throughout training (see Fig.~\ref{fig:rideshare-convergence-check} in Appendix \ref{appdx:MOHITO-parameters}).}}. 
The learned pooling behavior prioritizes the agent's driving efficiency by minimizing total drive distance, albeit at increased time cost as passenger service times are lengthened (cf.\ Fig.~\ref{fig:rideshare-pass-time}).  Such pooling behavior is the motivation for real-world services such as Uber Pool.
%and were learned by \mohito{} agents without explicit prompting.


\noindent \textbf{Lifelong learning underperforms.}~ \pgella{} exhibits poor ability to simultaneously manage existing tasks alongside new ones, resulting in poor rewards (Fig.~\ref{fig:rideshare-rewards}). Upon further investigation, we note that \pgella{} often switches between tasks before completing a particular task. \pgella{} learns policies that fit groups of tasks based on their features as a proxy for knowing all tasks a~priori---as required by the original PG-ELLA. However, the diversity of task characteristics due to task openness appears to lead to agent uncertainty about how to prioritize simultaneous tasks.  



% \begin{figure}[h!]
%     \centering
%     \includegraphics[width=0.4\linewidth]{Figs/R3-sideways.pdf}
%     \caption{The second starting state of wildfire expanded to include 6 additional fires, and 4 additional agents per increment for scalability testing.}
%     \label{fig:R3}
% \end{figure}





%ideally learns a policy for all tasks possible in an environment to learn a combination of appropriate behaviors, but knowing all tasks in advance is impossible with task-openness. Thus, for new tasks, we follow policies learned for prior tasks with similar features . However, 
%for appendix
%Lifelong learning algorithms typically learn separate policy for each task. They share information between these policies. While we can consider multiple tasks as a single joint task for forming policies \cite{norvig}, this is intractable in task-openness where the number of tasks is potentially unbounded. This is incompatible with \tao{}, so our adaptation, \pgella{}, groups new tasks with those previously seen (by task features such as start location, destination location, and ride status), as a proxy for all known tasks. Although reasonable, this approach made the agents more uncertain of which task to prioritize, leading to instability in the resulting policy.  










%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Related Work}
\label{sec:related}


Lifelong/continual and multi-task learning also learn behavior that generalizes across changing tasks though they do not encompass fully the complexities of task openness. 
%These directions often treat tasks as independent games. \tao{} treats them as dynamic components of one game.

\noindent \textbf{Lifelong and continual learning do not choose tasks.}~  Agents learn to improve their performance on future tasks by leveraging experiences with prior tasks~\citep{Chen2018:Lifelong}. Some methods use one shared parameterization of the agents' policy and try to avoid catastrophic forgetting as they learn from new tasks \citep{Thrun1995:Lifelong}. Others separate knowledge learned from tasks then aggregate it. These are typically modeled as contextual MDPs \citep{Mendez22:Modular, Amani23:Lifelong}. One notable approach learns an implicit neural representation by having agents share task knowledge on a known schedule of tasks \citep{Kolouri2023:Lifelong}. In others, agents interact with a sequence of single tasks drawn from a  distribution~\citep{Abel18:Policy}. Across lifelong learning, agents interact with the one present task and do not decide which task to prioritize.

\noindent \textbf{There is a known distribution of tasks in multi-task learning.}~ Agents learn from a set of tasks, typically fixed, with the goal of learning generalizable information across tasks \citep{Tanaka03:Multitask, varghese2021:Multitask}. Similar to lifelong learning, these methods often use a contextual MDP as the model~\citep{Sodhani21:Multitask, Andreas17:Modular} with all contexts available to the agent. Unlike lifelong learning, multi-task learning can also be approached by more static models, a traditional MDP learned via ensemble learning \citep{Rajeswaran17:EPOpt}, and a decentralized POMDP \citep{Omidshafiei2017:Multitask, Zhangetal2023:Multitask}. Prior work also allows agents to explicitly prefer more rewarding tasks \citep{Yuetal2023:Multitask}. However, across all of these approaches, tasks are samples from a fixed known distribution of tasks.

\noindent \textbf{Task openness is also addressed by methods for multi-agent path finding (MAPF)}, which often focuses on the use-case of robots servicing tasks in warehouses \citep{stern-roni}. However, the majority of these methods adopt a fully centralized perspective to the joint problem and build on search techniques \citep{okumura2023lacam}. Recently, RL has been explored as well for MAPF \citep{Skrynnik24:Lifelong}, but agents do not explicitly reason about other agents' actions. RL is used only to navigate the agent to the assigned goal without collision. In contrast, agents using \mohito{} reason about others' actions and choose tasks, which then makes pooling possible in \rideshare.


\noindent \textbf{GNNs have been used for MARL previously.}~ \citet{Jiang2020Graph} and \citet{Rahman:OpenAdHoc}  contribute to RL in open multi-agent systems, both of which target agent openness without considering task openness. Both rely on the use of GNNs~\citep{Wang16:Structural,GAT} that adapt to changing input sizes \citep{Hamilton17:Inductive}. These embed fully-connected coordination graphs and may intrinsically adapt to dynamic team sizes as agents depart or reenter. Our interaction hypergraphs significantly extend the coordination graphs to model tasks and actions, which offers a more expressive representation of the decision-making problem setting allowing its use for task openness. 

%The GNN architecture we use is common in literature. The novelty is how we expand coordination graphs to task openness, and how we maintain differentiability despite discrete actions. 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Concluding Remarks}
\label{sec:conclusion}

Learning how to act in environments with task openness, where the
set of objectives or tasks is neither static nor predefined, is challenging.  This paper presented three contributions towards agent reasoning under task openness: (1) a novel model, \tao, using an interaction hypergraph to encode each agent's  dynamic, open, and task-centric action space; (2) a task-open MARL approach, \mohito{}, based on \tao, which centrally learns to evaluate actions relative to all present tasks; 
%that modifies the  hypergraphs and learns via GNNs to dynamically choose an optimal state-action mapping; 
and (3) two task-open domains, \rideshare{} and \wildfire{}, one competitive and the other cooperative, each with  unique properties that serve as a testbed for exploring and evaluating approaches in this emerging topic. Our experiments demonstrated  \mohito{}'s favorable ability to reason about task dynamics. This shows the sufficiency of the \tao{} model and the viability and effectiveness of \mohito{} in reasoning with task openness. Whereas such openness is a feature of many realistic problems, it may not be easily modeled using traditional games with fixed and bounded sets. 

\mohito{} has theoretical and practical limitations. The current \tao{} model is limited to physical states that are perfectly observed. However, in domains such as \wildfire{}, partial observability is inherent and a key barrier to success. Practically, \mohito{} may suffer from unstable training as tasks enter and leave, especially due to exogenous factors. The regularization component of the loss and the use of graph norm help mitigate this, but it may still be challenging to identify convergence.

A key direction of future work is to consider agent openness alongside task openness. One approach would be to investigate ways of extending both \tao{} and \mohito{} to model and learn in the context of both these types of openness and the associated uncertainty. 

%\textcolor{blue}{Future modeling work involves considering agent frame or type openness, where an agent's capabilities can change over time. Future work towards more general solutions of \tao{}s include learning with unknown task-action relationships, and utilizing out of distribution detection to control learning.} 

%Future work includes agent frame openness, where an agent's capabilities or type can change in the environment, and utilizing out of distribution detection to control task-open learning.


\begin{acknowledgements} 
This research was supported, in part, by a collaborative NSF Grant \#IIS-2312657 (to Doshi), \#IIS-2312658 (to Soh), and \#IIS-2312659 (to Eck). Some of the computing occurred on the Holland Computing Center of the University of Nebraska, which receives support from the university's Office of Research and Economic Development and the Nebraska Research Initiative. We thank the graduate and undergraduate students who contributed to the baselines, the domains, and the testing of \mohito{}: Ceferino Patino IV, Matthew Sentell, Alireza Saleh Abadi, and Tyler Billings.
\end{acknowledgements}

% References
\bibliography{oasys}

\newpage

\onecolumn

\title{MOHITO: Multi-Agent Reinforcement Learning using Hypergraphs for Task-Open Systems\\(Supplementary Material)}
\maketitle
\appendix

\section{Technical Appendix}


\subsection{Space complexity}
\label{appendix:space-complexity}

Here we evaluate the space complexity of \mohito{}. 
% Recall the sets of nodes that comprise the interaction graphs: tasks $X$, actions $A$, agents $Ag$, and hyperedge nodes $E$.

\subsubsection{Interaction and observation graphs}

Consider an arbitrary graph $g$ whose nodes have feature size $f$ and whose edges have feature size $f^e$. The space used to store $g$ is the sum of the space used to store $g$'s edges, $g^e$, and $g$'s nodes, $g^n$, as given in Eq.~\ref{Eq.graphsize}. 

\begin{equation}
    \label{Eq.graphsize}
    \begin{split}
    |g| =& (|g^n| \cdot f)\\+& (2 \cdot |g^e| + |g^e| \cdot f^e)
\end{split}
\end{equation}


Now we use Eq.~\ref{Eq.graphsize} to find $\mathcal{O}(|G_{\mathbf{C}}|)$, the size of \mohito{}'s critic input. Recall that $\mathbf{G}_{\mathbf{O}} \subseteq G_{\mathbf{C}}$, so $|\mathbf{G}_{\mathbf{O}}|=\mathcal{O}(|G_{\mathbf{C}}|)$; thus, we focus on the critic graph rather than the observation graphs, $\mathbf{G}_{\mathbf{O}}$. Both our interaction graphs are composed of \textbf{task observations} $\mathcal{X}$, \textbf{action descriptions} $\mathcal{A}$, \textbf{agent observations} $\mathcal{N}$, and \textbf{hyperedge nodes} $E$ (i.e., $|G^n_{\mathbf{C}}|$ as seen in Eq.~\ref{Eq.graphnodesize}). Here, $f$ is the same for all nodes in $G^n_{\mathbf{C}}$ because all nodes are padded to the largest feature size.


% \textit{Observation} graphs, $S_i\forall i \in Ag$, store public and $i$'s private observations of $[\subset X, \subset Ag, \subset A, \subset E]$. Each of these groups contain nodes with features,  $|n|\forall n \in S^n_i \leq |x| \forall x \in X$ . Observation graphs are used by the actors. Meanwhile the critics use an identical \textit{interaction} graph, $G,$ where $ S_i\subseteq G \forall i \in Ag$, see Eq.~\ref{Eq.graphnodesize}. 

\begin{equation}
\label{Eq.graphnodesize}
    |G^n_{\mathbf{C}}| = |\mathcal{A}| + |\mathcal{N}| + |\mathcal{X}| + |E|
\end{equation}


The environment provides us the number of agents and tasks, so $|\mathcal{N}| = |Ag| \text{ and } |\mathcal{X}| = |X|$.  Let $A_i$ be the largest agent action space. The number of action nodes is bounded  by the number of unique actions, so $|\mathcal{A}| = \mathcal{O}(|Ag| \cdot |A_i|)$.  Each task-action is a relationship between a task $x\in \mathcal{X}$ and an action $a\in \mathcal{A}$, and task-action nodes, $E$, represent an agent's relationship to that task-action. Thus $|E|$ is bounded by $\mathcal{O}(|\mathcal{A}| \cdot |\mathcal{N}| \cdot |\mathcal{X}|)$.

Substituting $|\mathcal{A}|, |\mathcal{N}|, |\mathcal{X}|, |E|$, and Eq.~\ref{Eq.graphnodesize} into Eq.~\ref{Eq.graphsize},

 \begin{equation}
 \label{Eq.interactiongraph}
 \begin{split}
     |G_\mathbf{C}|& \leq f(|\mathcal{A}| + |\mathcal{N}| + |\mathcal{X}| + |E|) + 6 ( |\mathcal{A}| \cdot |\mathcal{N}| \cdot |\mathcal{X}|)\\
     |G_\mathbf{C}|& = \mathcal{O}( f \cdot |Ag|^2 \cdot |A_i| \cdot |X|)
     %  |G_{\mathbf{C}}|& \leq f\left [f() + |\mathcal{N}| + |\mathcal{X}| + (|Ag|\cdot |\mathcal{N}| \cdot |\mathcal{X}| \cdot |A_i|)\right ]
     %   + (6\times |\mathcal{N}|\cdot |X| \cdot |Ag| \cdot |A_i|+ 0)\\
     % |G_{\mathbf{C}}|& = \mathcal{O}\left ( f  \cdot |N| \cdot |X| \cdot |A_i|\right ).
 \end{split}
 \end{equation}

 \subsubsection{\mohito{} execution}

 \mohito{} actors are comprised of \#layer-many graph attention transformers (GAT) \cite{GAT} layers, activated by ReLU layers, and the hyperedge, $ed$, is selected by ArgMax. GAT scale linearly in $|G_{\mathbf{C}}^{n}|$ and $|G^e_{\mathbf{C}}|$ \cite{GAT}. Here we assume they scale polynomially in input and output feature size, $f,f'$, following their defined single head GAT time complexity Eq.~\ref{Eq.gat-time-complexity}. 

\begin{equation}
\label{Eq.gat-time-complexity}
    GAT^{time}= \mathcal{O}(|G^{n}_{\mathbf{C}}|\times f \times f' + |G^e_{\mathbf{C}}| \times f')
\end{equation}

Within \mohito{}, $ f'= \{f,\text{hidden dim}, \text{hidden dim }\times \text{\#heads}\}$ across layers. We substitute $|G^e_{\mathbf{C}}|$, $|G^n_{\mathbf{C}}|$, and simplify with Eq.~\ref{Eq.gat-time-complexity} to get \mohito{} ($M$) execution space complexity Eq.~\ref{Eq.mohito-execution-complexity}.

\begin{equation}
\label{Eq.mohito-execution-complexity}
\begin{split}
M_{exe}^{space} &\leq\    \text{ \#heads}(f^2 \cdot |Ag|^2\cdot |A_i| \cdot |X| + f|E|)\\
M_{exe}^{space}&=\mathcal{O}(\text{\#heads} \times f \times |G_{\mathbf{C}}|)
\end{split}
\end{equation}


\subsubsection{Critic loss computation, \texorpdfstring{$L_{Q}$}{LQ}}

%TODO unclear if this is incorporated into the described GAT space complexity from the GAT paper. I need to just look at the source code. 
The critic performs global mean pooling and linear layers subsequent to the prior actor structure, which are $\mathcal{O}(f \times |G^n_{\mathbf{C}}|)$, and $\mathcal{O}(f \times |Ag|)$, respectively. Both are strictly dominated by $|M^{space}_{exe}|$ because $f\in \mathbb{N}$. The only additional concurrent space requirement is other agents' chosen hyperedges, $[ed_1,\dots,ed_n]$ with $ed\in E$, term two in Eq.~\ref{alg:critic-space}.

\begin{equation}
\label{alg:critic-space}
\begin{split}
    critic_{space} & \leq ( \text{\#heads} \times f \times |G_{\mathbf{C}}|) + (f \times |Ag|)\\
    critic_{space} & = \mathcal{O}( \text{\#heads} \times f \times |G_{\mathbf{C}}|)
\end{split}
\end{equation}


%so we can say $critic_{space} = \mathcal{O}(actor_{space})$. 

%Typically $\phi > \theta$, but when only considering a given graph as variable $\phi,\theta = \mathcal{O}(|x|^m)$ for some constant $m$. \textcolor{red}{For now ignoring this.}

%Now we consider spatial growth during $L_Q$. 
In $L_Q$, we perform two passes, $Q_i^{\bm{\phi}}(j)$ and $Q_i^{\bm{\phi}'}(j)$, per agent per experience in the batch. The size of the Q-values in the output is 1, and $|r^j_i|=1$. The space complexity of storing this loss is dominated by $critic_{space}$ and the batched inputs. We discuss the upper bound of  batch size in Appendix~\ref{appendix:classic-rl}. Removing dominated terms we get the space complexity of critic loss, Eq.~\ref{Eq.critic-loss-space}.

%TODO put some other sentence here referencing the eq
% These dominants reveal the main source of growth is the GAT space complexity as we see comparing Eq.~\ref{Eq.critic-loss} with Eq.~\ref{Eq.gat-time-complexity} and the batch size. 

\begin{equation}
    \label{Eq.critic-loss-space}
    M^{space}_{L_Q} = \mathcal{O}\Big( M^{space}_{exe} + \text{batch size} \times \big( |G_{\mathbf{C}}|+ (|Ag|\times f)\big)\Big)
\end{equation}

\subsubsection{Actor loss computation, \texorpdfstring{$L_{\pi}$}{Lpi}}

As with $L_Q$, the regularization only takes a multiple of parameter space, $\bm{\theta}$ additional space here. Here we calculate $Q_i^{'\bm{\phi}}(j)$ %, or in other words,
using the updated main network $\bm{\phi}$. We do not need the other agents' chosen hyperedges, we need $\mathbf{G}_{\mathbf{O}}$ per batch, and with the assertion that $\mathbf{G}_{\mathbf{O}}\subseteq G_{\mathbf{C}}$, we use the same bound as the critic, Eq.~\ref{Eq.policy-loss}.

\begin{equation}
\label{Eq.policy-loss}
M^{space}_{L_\pi} = \mathcal{O}(M^{space}_{L_Q})
\end{equation}


\subsection{Time Complexity}
\label{appendix:time-complexity} 
Here we evaluate the time complexity of hypergraph construction, policy query, and loss.
\subsubsection{Interaction and observation graphs}  We use PettingZoo~\cite{terry2021pettingzoo} environments, so it is important to consider the additional time to convert a Gymnasium space observation to an interaction graph. Alg.~\ref{alg:critic-graph-construct} shows our interaction graph construction. Identifying unique actions is $\mathcal{O}\left ((|Ag| \times |X| \times |A_i|)\log(|Ag| \times |X| \times |A_i|) \right )$, and concatenation is $\mathcal{O}(|Ag|^2+|X|+|A_i|)$. Here the common case is linear w.r.t.\ $|Ag|$, where agents frequently have the same rather than distinct actions available to them, but in the worst case all agents have unique actions; thus, 

\begin{equation}
\label{Eq.graph-construction-time}
    G^{time}_{\mathbf{C}}=\mathcal{O}(|Ag|^2 \times |X| \times |A_i|)
\end{equation}



\begin{algorithm*}[!ht]
\caption{Interaction Graph Construction
% \\After collecting samples into a batch (lines 4-9), \mohito{} utilizes the batch to engage in actor-critic training (lines 12-20) using actor loss (Eq. \ref{Eq.actor_pg}) and critic loss (Eq. \ref{Eq.critic-loss}).
}
\label{alg:critic-graph-construct}

\begin{algorithmic}[1]
    \Require  
        
    $agents <|Ag|, f>$, 
    
    $tasks <|X|, f>$,

    $actspace[Ag, X]\to [A]$ 

    \Ensure 
    
    $G_{O}$ or $G_C$ \Comment{$G_C$ is returned if all tasks and the union of the action space is given.}
    
    
    \State $actions \gets UNIQUE(actspace.values)$
    
    \State $nodes \gets Concatenate( agents, tasks, actions)$

    \State $edges \gets []$

    \State $taskActions \gets 0$
    \For{$agent \in agents.index$}    
        \For{$task \in tasks.index + |agents|$}                    \State $thisActIndex \gets actspace[agent,task].index$
            \For{$action \in    thisActIndex + |agents| + |tasks|$}
            \State $hyperedge \gets (agent, task, action)$ 
            \State $hyperedge \gets $ pad$(hyperedge, f)$
            \State $APPEND(nodes, hyperedge)$
            \State $APPEND(edges, [agent, hyperedge], [task, hyperedge], [action, hyperedge])$

            \EndFor
        \EndFor
    \EndFor
    
    \State return $(nodes, edges)$

\end{algorithmic}

\end{algorithm*}




\subsubsection{\mohito{} execution}
\label{appendix:policy-time-complexity}

In Appendix~\ref{appendix:space-complexity}, we assume that the time complexity of a single head GAT, Eq.~\ref{Eq.gat-time-complexity}, is equivalent to its space complexity. We make the same assumptions on the value of $f,f'$ here to get a time upper bound for any layer in the GAT; $f,f' \leq f$. To query $\pi$ we multiply by the number of layers because, with our upper bound, time scales linearly with more layers, Eq.~\ref{Eq.time-complexity-execution}.

\begin{equation}
    \label{Eq.time-complexity-execution}
    M_{exe}^{time}=\mathcal{O}( \text{\#layers} \times \text{\#heads} \times f \times |G_{\mathbf{C}}|)
\end{equation}

\subsubsection{Critic loss}

The time complexities of the mean pooling and linear layer are the same as the space complexities mentioned in Appendix~\ref{appendix:space-complexity}: $\mathcal{O}(f \times |G^n_{\mathbf{C}}|)$ and $\mathcal{O}(f \times |Ag|)$, respectively. The only difference here is accounting for all $Q$ passes instead of space growth, Eq.~\ref{Eq.critic-time-complexity}.

\begin{equation}
    \label{Eq.critic-time-complexity}
    \begin{split}
            critic_{time}=& \mathcal{O}(M^{time}_{exe})\\
            M^{time}_{L_Q}=&\mathcal{O}\big(\text{batch size} \times( M^{time}_{exe} + f\times(|G^n_{\mathbf{C}}| + |Ag|))\big)
    \end{split}
\end{equation}

\subsubsection{Actor loss} As in Appendix~\ref{appendix:space-complexity}, these losses must be calculated over separate runs, $L_Q(\dots,\bm{\phi})$ and $L_\pi(\dots,\bm{\phi}')$. Then the mean squared error is calculated, which is trivially $\mathcal{O}(\text{batch size})$. 

\begin{equation}
    \begin{split}
            M^{time}_{L_\pi}& \leq (M^{time}_{exe}*2 + \text{batch size})\\
            M^{time}_{L_\pi}& = \mathcal{O}(M^{time}_{exe}+\text{ batch size})
    \end{split}
\end{equation}


\subsection{\mohito{} profiling}
\label{appendix:time-profiling}


\noindent \textbf{Results confirm \mohito{}'s complexity from Thm.~\ref{thm:policy}.} We scale the second starting state to include linearly more agents and tasks.
We execute \mohito{} from 8 to 100 agents in increments of 4 agents and 3 tasks. This is run on a Linux virtual machine with 4 AMD EPYC 7302 16-Core CPUs and 500GB of DDR4 RAM. We show the time, Fig.~\ref{fig:time-complexity}, and peak memory usage, Fig.~\ref{fig:memory-complexity}, to run \mohito{}. These fit to a quadratic curve and empirically show our success in avoiding exponential complexity. 


\begin{figure}[ht!]
    \centering
    \subfigure[Time profiling on \wildfire. The largest time usage is i/o on unpacking batches during training rather than inference time. %Task openness necessitates ragged data storage of observations leading to a large constant overhead in practice. 
    ]{
    \centering
    \includegraphics[width=0.4\linewidth]{Figs/scalability (cropped) (pdfresizer.com).pdf}
    \label{fig:time-complexity}
    }~~
    \subfigure[Peak memory usage from profiling \wildfire{}. The sequential agent update tracks closely to the total usage. %This follows expectations as the number of graphs required for a \mohito{} iteration is the same as an update.
    ]{
    \centering
    \includegraphics[width=0.4\linewidth]{Figs/scalability_memory (cropped) (pdfresizer.com).pdf}
    \label{fig:memory-complexity}
    }
    \caption{Results from profiling \mohito{} align with Appendix~\ref{appendix:policy-time-complexity}.}
\end{figure}








\subsection{MOHITO Parameters and Structure}
\label{appdx:MOHITO-parameters}




Table~\ref{table:mohito-param} lists all \mohito{} parameters used in our two domains for the experiments shown in this paper. 

\begin{table}[!htp]\centering
\scriptsize
\caption{Parameters For MOHITO training}
\begin{tabular}{l  |c  |c }\toprule
\textbf{MOHITO Parameters} &\textbf{rideshare} &\textbf{wildfire} \\
% $\epsilon$-greedy: exploit strategy & & \\
$\gamma$ & 0.9 &0.99 \\
 \#GAT Layers & 20 & 3\\
 hidden dim & 50 & 24\\
 grad\_clip: max norm & 5 &$0.0$\\\midrule
\multicolumn{3}{l}{\textbf{Environment}}\\\midrule
training seed & 16 & 16\\
max steps per episode& 100 & 100 \\
$|Episodes|$& 20,000 & 20,000\\
$K$: batch size &20 & 16\\
$B$: buffer size & 0 &1,000\\
Graph Norm& yes& no\\
validation frequency & every 5 model updates & 10 model  updates\\\midrule
\multicolumn{3}{l}{\textbf{Actor}}\\\midrule
learning rate & 0.001 & 0.009\\
$\lambda_A$: regularization coefficient& 0.1 & 0.01 \\
$\psi_A$: coefficient for slow-update & 0.05 & 0.005 \\\midrule
\multicolumn{3}{l}{\textbf{Critic}}\\\midrule
learning rate & 0.01 & 0.01\\
$\lambda_Q$ & 0.01 & 0.0 \\
$\psi_Q$ & 0.05 & 0.005\\
Actor Hyperedges & MLP, Fig.~\ref{fig:critic-arch-a} & Pooling, Fig.~\ref{fig:critic-arch-b}\\\bottomrule
\end{tabular}
\label{table:mohito-param}
\end{table}


The architectures of the actor, Fig.~\ref{fig:a-c}, and critic, Fig.~\ref{fig:critic-arch}, are similar up to their final layers. They both contain 2-head GATConv, activation (ReLU), and dropout layers per \textbf{\#GAT Layers}. The critic considers agent actions by incorporating their hyperedge features, either through an MLP at the end of the network (as seen in Fig.~\ref{fig:critic-arch-a} and \rideshare{}), or by incorporating all hyperedges into the critic graph and pooling over the selected ones (as seen in Fig.~\ref{fig:critic-arch-b} and \wildfire{}).
 
\begin{figure}[ht!]
    \centering
    \includegraphics[width=0.8\textwidth]{Figs/actor.pdf}
    \caption{Actor network architecture. It is a series of graph attention transformer layers followed by ReLU activation and dropout. When we use graph norm, it is done before ReLU.}
\label{fig:a-c}
\end{figure}

\begin{figure}[ht!]
    \centering
    \subfigure[The MLP based critic architecture for \mohito{}. This addresses the unbounded number of task-actions by including agent input only from selected actions.]{
    \centering
    \includegraphics[width=0.7\linewidth]{Figs/critic-a (cropped) (pdfresizer.com).pdf}
    \label{fig:critic-arch-a}
    }

    
    \subfigure[The pooling based architecture for \mohito{}. This incorporates all actor hyperedges into the state interaction graph. This addresses the unbounded number of task-actions by keeping the Q-value calculation within the graph attention transformers.]{
    \label{fig:critic-arch-b}
    \centering
    \includegraphics[width=0.7\linewidth]{Figs/critic-b (cropped) (pdfresizer.com).pdf}   
    }
    \caption{The critic architecture for \mohito{}. It is a series of graph attention transformers activated by ReLU and normalized by graph norm at all but the last layer.}
    \label{fig:critic-arch}
\end{figure}


In Fig. \ref{fig:rideshare-convergence-check}, we show \mohito{} validation rewards in \rideshare{} across 3000 episodes. The set of tasks given to MOHITO in training is fixed and different from those used in validation. We trained for a fixed duration for each openness level and agent number then picked the best performing policies for \rideshare{}.  



% Here we show the validation runs for the policies used in the paper, and we discuss a challenge we encountered in working with task-open environments for MARL. In fig:\ref{fig:rideshare-convergence-check} a epoch is one episode of \rideshare{} validation. We adjusted the x-axis for \wildfire{}, fig:\ref{fig:wildfire-convergence-check} to correspond to this. 



%We considered two forms of MOHITO training: (1) a constant set of tasks repeated at each episode, and (2) the testing configuration; where ride-request features and arrival times change each episode. Between our two approaches only (1) converges, and it outperforms (2) at all validation checks, see figure \ref{rideshareValidation}.

% \mohito{} does not explicitly model exogenous or endogenous variables effecting $X$, but in \wildfire{} endogenous features; fire position, are present in our task features, and we see \mohito{ } reason about endogenous openness to earn more rewards, see \ref{subsubsubsec:mohito-reasons-about-openness}.

% \rideshare's openness, on the other hand, is a composite of two hidden variables determining time and density of entering tasks. This takes the already non-stationary multi-agent environment and makes it more non-stationary without relevant observations. Thus we propose this \textbf{training sensitivity} is less a weakness of MOHITO and more a core challenge of exogenous task-openness.
\begin{figure*}[ht]
    \centering
    \includegraphics[width=1.0\linewidth]{Figs/convergence-check-agents.pdf}
    \caption{Some agents earn 0 rewards. This is a common optimum we encountered for the system, where a subset of agents choose to do nothing and other agents dominate the environment.}
    \label{fig:rideshare-convergence-check}
\end{figure*}

%this doesn't converge...
% \wildfire{} uses a set convergence criteria. We deem convergence when a moving average is both $>50$ and not increasing for 5 consecutive validation episodes. The moving average has a window of 5, and we start calculating the moving average after the reward has been above 50, just putting out the center fire, for 2 consecutive validation episodes. Validation is run every 50 model updates. 



%put three wildfire validation plots (across openness), and the validation plots for rideshare. 

%discuss training sensitivity in a decent fashion


\subsection{\tao{} based Definitions of Domains}
\label{appendix:tao-definitions}

\subsubsection{\wildfire{}}

Here we formally define \wildfire{} as a TaO-MG, and we define a general approach to represent task-open action spaces. 

\[ \tao{}  \overset{def}{=} \langle M, X, \Psi \rangle \]

$\mathbf{X}$ is the current set of lit fires. The features of each fire are,

$$
    x = [position, int_{initial}, fType_x]
$$

$\mathbf{M}$ is a Markov game representation for a given set of fires $X$.



\begin{itemize}
    \item $\mathbf{Ag}$ is a fixed set of agents. In this work we show a domain with two firefighters. These agents leave the environment when they run out of suppressant, and as they No-Op, they stochastically regain suppressant and eventually return to the environment. Here leaving/returning simply affects whether they can act on fires. No new agents appear in this domain's experiments. 
    
    \item $\mathbf{S}$ is a set of 4 matrices with the shape of the grid, $\{fType, int, ag, sup\}$. $fType$ contains integers where non-zero elements are burnable tiles. $fType$ corresponds to the number of agents which must collaborate to stochastically fight and put out the fire. $int$ represents the intensity of each fire $[0,n]$ at time $t$. $ag$ represents the location of each agent. $sup$ represents the amount of fire suppressant each agent has at time $t$. 
    
    \item $\mathbf{A}$  is the Cartesian product of all agent action spaces $A_i$. An agent can fight any available fire if they can reach it (via the Chebyshev distance) and they have enough suppressant. Meanwhile, an agent can always No-Op. 
    
    In this work we handle task-open action spaces using hypergraphs, but to provide a more general view we next define agent action spaces using two simpler approaches. Consider an arbitrary discrete-action, task-open environment $env$. In this environment, there exist tasks $X$ such that each task is associated with $[0,\infty)$ actions per agent, $A_{x,i}^t$. The full action space for agent $i$ at time $t$ must identify which action is being taken and on which task it is being taken. We can enumerate the task-actions by shifting the value of each action by the size of the prior task action spaces, Eq.~\ref{Eq.implicit-act-rep}, or we can explicitly represent action selection as a two-part process: select the task first and then the action, i.e., Eq.~\ref{Eq.explicit-act-rep}. 

    \begin{equation}
    A_{i}^t = \bigcup_{x \in X}
    \begin{cases}
        A_{x,i}^t & x=0\\
        A_{x,i}^t + \sum_{x'=X_0}^{x-1}| A_{x', i}^t| & x>0.
    \end{cases}
    \label{Eq.implicit-act-rep}
    \end{equation}
    
    \begin{equation}
        A_{i}^t = \bigcup_{x\in X} (x,A_{x,i}^t).
        \label{Eq.explicit-act-rep}
    \end{equation}

In our examples each task has a single action at any time step, so Eq.~\ref{Eq.implicit-act-rep} is equivalent to Eq.~\ref{Eq.explicit-act-rep} in space requirement. However, each has its benefits: Eq.~\ref{Eq.explicit-act-rep} is more interpretable for an arbitrary domain with more actions, and Eq.~\ref{Eq.implicit-act-rep} is convenient for environment implementation. \wildfire{} uses the explicit action representation and \rideshare{} uses the implicit action representation.


\item $\mathbf{T}$ transitions in wildfire are stochastic and occur on $int$ and $sup$. Fire spread is handled by a 2d convolutional filter over the grid, where $int_{N,S,E,W}$ is determined by the wildfire spread model and scales with the base spread parameter \citep{Boychuk09:Fire,Ure15:Fire}. Note that this convolution applies over the existence of lit fires, so the number of fires in a cell doesn't impact fire spread. Additionally, there is a 0.1 random ignition probability present in all cells. 


    \begin{equation}
        conv = 
        \begin{bmatrix}
            0.0 & int^+_N & 0.0 \\
            int^+_W & 0.1 & int^+_E \\
            0.0 & int^+_S & 0.0
        \end{bmatrix}, \text{2d conv w/o bias}
    \end{equation}

    Suppressant transitions depend on the action of the agent in question: if the agent fought a fire, its suppressant decreases by $1$ with probability $sup^-= 1/3$; if the agent chose No-Op, its suppressant is set to its maximum value, 2, with probability $sup^+=1/2$.

    Intensity transitions are determined separately from fire spread. If enough agents have fought a fire $|attacks| \geq fType$ the fire intensity decreases stochastically by 1, Eq.~\ref{Eq.intensity-decrease}. 
    
    \begin{equation}
        p(int^-_x)=0.8 + 0.12 ~(|attacks_x|-fType_x)
        \label{Eq.intensity-decrease}
    \end{equation}

    Otherwise, fire intensity will increase deterministically, $int^+=1.0$, unless the fire is about to burn out, i.e., $n-1\to n$. In such a case, fire intensity will increase stochastically, $p(int^+, int^+_{burn})$, where $int^+_{burn}=0.2238$ is calculated from the fire spread model (not parameterized by base\_spread). 

    \item $\mathbf{R}$ rewards are joint in wildfire. While there exist penalties for fighting a not-present fire or fighting a fire without suppressant, these do not occur because their tasks are not present in such cases. A reward of 400 is given for putting out the collaborative center fire, 20 is given for putting out any other fire, and penalties of $-10$ and $-25$ are given for a small or medium fire burning out, respectively. 
\end{itemize}


$\mathbf{\Psi}$ is the generator function which converts $M \to M'$ when $X$ changes. $S$, $T$, $A$, and $R$ change when $X\to X'$. 

\begin{itemize}
    \item $\mathbf{S}$ changes when a new task, $x$, enters the environment; its initial intensity and fire type are determined by its position. \emph{Accelerated} cells produce dangerous medium fires that are at state 3; other cells produce small intensity 2 fires. When a task leaves the environment, that fire will not be included in the action space of any agent. While a theoretically infinite number of fires can enter the environment in each cell, we cap the number at 13 for practicality.

    \item $\mathbf{A}$ is updated as the union of present task action spaces.
    
    \item $\mathbf{T}$ is updated to exclude burned out fires, and include newly lit fires. $fire\_spread$ changes according to which fires are now present in the environment, with additional lit cells raising the probability of adjacent fire spread. 
    
    \item $\mathbf{R}$ a new task, $x$, introduces the possibility of new rewards from its future exit whether that be extinguished or burned-out fires. When a task $x$ leaves, its reward is removed until it is reignited. 
\end{itemize}





%TODO needs to be fixed
% \begin{equation}
% \footnotesize
% int_{t+1} = 
% \begin{cases}
%     int_{t}+1 \text{ with } p(int^+) \text{ else } int_t &  |attacks|<fType \land (0<int_t<n-1) \\
%     int_{t}+1 \text{ with } p(int^+,int^{+}_{burn}) \text{ else } int_t & |attacks|<fType \land (int_t=n-1) \\
%     int_{ignite} \text{ with } conv(int_t>0) \text{ else } 0 & int_t=0\\
%     int_{t}-1 \text{ with } p(int^-) \text{ else } int_t & |attacks|\geq fType \land (0<int_t<n)
% \end{cases}
% \end{equation}



% $$
% sup_{t+1,i} = 
% \begin{cases}
% sup_{max} \text{ with } p(sup^+) \text{ else } sup_{t,i} & a_i = No-Op\\
% sup_{t,i}-1 \text{ with } p(sup^-) \text{ else } sup_{t,i} & a_i = attack\\
% \end{cases}
% $$


% $$
% fType_{t+1} = 
% \begin{cases}
%     fType * -1 & (int_{t+1}=0 \land int_{t}>0)
%     \lor (int_{t+1}>0 \land int_t=0)
% \end{cases}
% $$

% \textbf{Rewards} are given jointly in dynamic wildfire. While the individual penalties for attacking a put/burned-out fire, a out of reach fire, and out of suppressant attacks exist, in such cases these tasks are not available to the agents.

\subsubsection{\rideshare{}}

Here we formally define \rideshare{} as a \tao{}. First we define $X$, then the underlying Markov game representation, $M$, and then we address how $\Psi$ updates $M$ to account for a new $X$.

\[ \tao{}  \overset{def}{=} \langle M, X, \Psi \rangle \]


$\mathbf{X}$ is the current set of passengers. Passenger features, see Eq.~\ref{Eq.ride-task-feat}, are: $pick\_loc$ is the initial pick-up location where a task $x$ spawned. $drop\_loc$ is the destination of the passenger. $ride\_fare$ is the rewards earned for dropping this passenger off, determined by Eq.~\ref{Eq.passenger-rewards} using the Manhattan distance. 

\begin{equation}
    x_{i,j} = [pick\_loc, drop\_loc, ride\_fare]
    \label{Eq.ride-task-feat}
\end{equation}
\begin{equation}
\label{Eq.passenger-rewards}
\begin{split}
    ride\_fare =\  & 3 \times \max[ 3, dist(pick\_loc, drop\_loc)]\\& + rand[-1,2]
\end{split}
\end{equation}


\noindent Passengers can also be in one of three states, referred to as $ride\_status$ in the paper: 
\begin{itemize}
    \item \text{Ride request}: A new unassigned task that can be accepted.
    \item \text{Assigned passenger}: A passenger who has been assigned and can be picked up by the assigning agent.
    \item \text{Rider}: A passenger riding with an agent who can be dropped off.
\end{itemize}
This task state is stored by its index: $P_i$ is  the $i^{th}$ ride request, $P_{j,i}$ is the $i^{th}$ accepted passenger for the $j^{th}$ agent if it is not stored with an agent in $s$. We define the rate that $x\to X'$ in rideshare setup~\ref{subsubsub:dynamic-rideshare-setup}.




$\mathbf{M}$ is a Markov game representation for a given set of passengers $X$.

\begin{itemize}
    \item $\mathbf{Ag}$ is a fixed set of agents. In this work we considered 2-, 3-, and 4-agent configurations. 
    \item $\mathbf{s}$ is represented using a matrix of vectors (see below), where the grid cells indicate the agent and task locations. All values of these vectors are static task features. 

$$
% \footnotesize
s = 
\begin{bmatrix}
(\{\}, \{\}) & (\{\}, \{\})  & (\{\}, \{\}) \\
(\{Ag_{3}\},\{\}) & (\{\}, \{\})  & (\{\}, \{\}) \\
(\{\}, \{\}) & (\{\}, \{P_{1}\})  & (\{\}, \{\}) \\
(\{\}, \{\})& (\{\}, \{\}) & (\{Ag_{2}\}, \{P_{21}\}) \\
(\{Ag_{1}\}, \{P_{11}, P_{12}\}) & (\{\}, \{\}) & (\{\}, \{\}) \\
(\{\}, \{\}) & (\{\}, \{\})  & (\{\}, \{\}) \\
        \end{bmatrix}
$$
    
    \item $\mathbf{A}$  is the Cartesian product of all agent action spaces $A_i$ where: $$A_i = \text{No-Op}  \bigcup_{x\in X} \begin{cases}
            \text{accept}_x &\text{x is ride\_request}\\
            \text{pick\_up}_x &\text{x is accepted by i}\\
            \text{drop\_off}_x &\text{x is i's passenger}
        \end{cases}$$

    \item $\mathbf{T}$ is the transition function which determines agent and passenger movement. All movement transitions are deterministic. Agents who use pick\_up$_x$ or drop\_off$_x$ move one cell (with diagonals) closer to $x$, direction determined by A*. When an agent lands on the same cell as $x$, with a pick\_up$_x$, $x$ will become a \textit{passenger}. If an agent lands on $drop\_loc_x$ with drop\_off$_x$ then $x$ is removed from $X$. Agents who use $accept_x$ do not move, and $x$ is added to the set of riding passengers for that agent.
    \item $\mathbf{R}$ is the agent-wise function determining rewards based on the joint action $a$, the $i^{th}$ agent, and tasks $X$. We define this in pieces for simplicity. 

\begin{equation}
% \footnotesize
\begin{split}
    &\text{fare}_i = \begin{cases}
        ride\_fare_x & a_i = \text{ drop\_off}_x \land x\notin X'\\
        0 & a_i \neq \text{drop\_off}_x \lor x\in X'
    \end{cases}\\
    &\text{move\_cost}_i = \begin{cases}
        -1.2 & agent\_pos \neq agent\_pos'\\
        0 & agent\_pos = agent\_pos'
    \end{cases}\\
    &\text{pick\_cost}_i = \begin{cases}
        -0.1 & a_i = \text{ pick\_up}_x \land agent\_pos=x\_pos\\
        0.0 & a_i \neq \text{pick\_up}_x \lor agent\_pos\neq x\_pos
    \end{cases}\\
    &\text{pool\_limit\_cost}_i = \begin{cases}
        -2 & a_i = \text{pick\_up}_x \land |x\in i| > 2\\
        0 & a_i \neq \text{pick\_up}_x \lor |x\in i| \leq 2
    \end{cases}
\end{split}
\end{equation}    

    An additional wait\_cost, $-2$, is added to all agents if any $x$ hasn't changed $ride\_status$ for a count of steps (accept:5, pick\_up:10, drop\_off:10).

    The final cost, unserved\_cost = $-0.5 \times$ (open seats), is applied if the number of unaccepted passengers is greater than the total number of seats across all agents. This, together with the wait cost, discourages model inaction. 

    \begin{equation}
    \begin{split}
            r_i ={}& \text{fare}_i + \text{move\_cost}_i + \text{pick\_cost}_i\\ &+ \text{pool\_limit\_cost}_i + \text{wait\_cost} + \text{unserved\_cost}
    \end{split}
    \end{equation}
\end{itemize}



$\mathbf{\Psi}$ is the generator function which converts $M \to M'$ when $X$ changes. $S$, $T$, and $R$ change when $X\to X'$. 

\begin{itemize}
    \item $\mathbf{S}$ when new passengers, $x$, enter, $x$ is added to the second set of the $x\_pos$ element of $s$ to get $s'$. Then $S'$ is the Cartesian product of $s'$ across possible $pos$ for all $x\in X$ and $i \in Ag$.
    \item $\mathbf{T}$ maintains the same behavior as described prior, but when explicitly defined, it is updated to only include transitions for $S'$ removing the transitions for departing passengers and adding new. 
    \item $\mathbf{R}$ the reward function is redefined to account for new waiting tasks for $wait\_cost$ and $unserved\_cost$. Additionally, any departing passengers are removed from the reward function. 
\end{itemize}









\subsection{\pgella{}} 
\label{appendix:tao-pg-ella}

First described by \citet{Ammar14:Online}, PG-ELLA is an extension of the authors' previous work in linear regression \citep{ella}. It is a lifelong learning algorithm and features a modular structure capable of utilizing a variety of base learners \citep{reinforce,nac} to individually learn policies for like tasks such as cartpole and quadrotor \citep{cartpole,quadrotor}. Information is shared between tasks through a large latent space $L$ which combines with task-specific parameters $s_x$ to form task-specific policy weights $\theta_x = Ls_x$.

PG-ELLA falls short of open-task applications in three key aspects: (1) Multi-Agency: PG-ELLA is a single-agent model; (2) Dynamic Spaces: The action and observation spaces of the environment change as $X$ changes, but $L$ has a fixed size and cannot handle inputs of changing shape; (3) Simultaneous Tasks: The generation of trajectories depends on a single task being present in the environment at a time. We next describe how we address these challenges, and we show them in Fig.~\ref{fig:taopgella}.


\begin{figure}[ht!]
    \centering
    \includegraphics[width=0.6\linewidth]{Figs/tao-pgella.dio-Page-1_revised (cropped) (pdfresizer.com).pdf}
    \caption{Network diagram of \tao{}, showing how we handle dynamic spaces with shared/individual observations, and simultaneously present tasks with task-specific policy concatenation.}
    \label{fig:taopgella}
\end{figure}

\pgella's base learners, such as REINFORCE \citep{reinforce}, have effective existing extensions to multi-agent environments. We use advantage actor-critic (A2C) \citep{pmlr-v48-mniha16}. Rideshare observations can be factored into task-generic, $O_s$, and task-specific, $[o_1, \dots, o_x]$, observations:

\begin{itemize}[leftmargin=*,topsep=0in,itemsep=0in]
    \item \emph{task-generic}: my position, the position of other drivers, the number of passengers I have accepted, and the number of passengers riding with me.
    \item \emph{task-specific}: the position of a passenger, the destination of a passenger, the fare for that passenger, how long a passenger has waited.
\end{itemize}

We concatenate $(O_s,o_x)$ whose shape is now the fixed input size of $\theta_x \forall x$ to solve the dynamic shape challenge. Simultaneous tasks require changes in both representation and training. We consider unique tasks as individual tasks rather than $X$ as one task (the space of $X$ is unbounded in task openness), and in the case of an unbounded number of possible individual tasks, such as continuous fares in \rideshare{}, we discretize the space, grouping similar tasks (i.e., $x \in \chi \subset X$).







We select an action from \pgella's actor by querying all task-specific parameters with present tasks, $\theta_x \iff x \in X$, then perform a softmax over their concatenated logits to select a task-action.

\begin{algorithm}
\caption{TaO-PGELLA $(k, \lambda, \mu)$} \label{alg:marmotellapig}
\begin{algorithmic}[1]
\State $|\chi| \gets 0$
\State $A \gets \mathbf{zeros}_{k \times d, k \times d}$
\State $b \gets \mathbf{zeros}_{k \times d, 1}$
\State $L \gets \mathbf{zeros}_{d, k}$
\While{some task is available}
    \State $L \gets \mathrm{reinitializeZeroColumns}(L)$
    \State $(\mathbb{X, R}) \gets \mathrm{getTrajectories}(\theta)$ \Comment{ Online interaction and $\epsilon$-greedy}
    \State $\langle o_1,...,o_x,O_s\rangle \gets \mathrm{getTasks}(\mathbb{X, R})$
    \For{$o_j \in \langle o_1,...,o_x\rangle $}
        \State $(\mathbb{X}_{(x_j)}, \mathbb{R}_{(x_j)}) \gets \mathrm{filterTrajectories}(\mathbb{X, R}, x_j)$
        \If{$\mathrm{isNewTask}(x_j)$}
            \State $|\chi| \gets |\chi|+1$
        \Else
            \State $A \gets A-(s_{(x_j)}s^\top_{(x_j)})\otimes\Gamma_{(x_j)}$
            \State $b \gets b-\mathrm{vec}(s^\top_{(x_j)}\otimes(\theta^\top_{(x_j)}\Gamma_{(x_j)}))$
        \EndIf
        % \State $\alpha^{(t_i)}, \Gamma^{(t_i)} \gets \mathrm{baseLearn}(\mathbb{T}^{(t_i)}, \mathbb{R}^{(t_i)})$
        \State compute $\theta_{(x_j)}$ and $\Gamma_{(x_j)}$ from $(\mathbb{X}_{(x_j)}, \mathbb{R}_{(x_j)})$
        \State $s_{(x_j)} \gets \mathrm{argmin}_s\ell(L, s, \theta_{x_j},  \Gamma_{(x_j)})$ \Comment{optimize local learner by adam SGD}
        \State $A \gets A+(s_{(x_j)}s^\top_{(x_j)})\otimes\Gamma_{(x_j)}$
        \State $b \gets b+\mathrm{vec}(s^\top_{(x_j)}\otimes(\theta^\top_{(x_j)}\Gamma_{(x_j)}))$
    \EndFor
    \State $L \gets \mathrm{mat}((\frac{1}{|\chi|}A+\lambda I_{k \times d, k \times d})^{-1}\frac{1}{|\chi|}b)$
\EndWhile
\end{algorithmic}
\end{algorithm}

\textbf{Algorithm~\ref{alg:marmotellapig}} presents the algorithm for \pgella. We start by initializing how many $x$ we have encountered, $|\chi|$, and the matrices used with the Hessian to calculate the updated $L$ later (lines 1--4). \textit{reinitializeZeroColumns} resets all zero columns, $L_c\sim \text{Uniform}[-1,1]$. We interact with the environment given our current $\mathbf{\theta}$ to obtain trajectories, ($\mathbb{X}, \mathbb{R}$), exploring with $\epsilon$-greedy (line 7). Next, we perform the parameter updates from PG-ELLA \citep{ella}, looping over all unique $x\in \mathbb{X}$ (lines 9--20). \textit{filterTrajectories} constructs $\mathbb{X}_j, \mathbb{R}_j\gets \{x, r \mid x \in \mathbb{X}\}$, where $x\in \mathbb{X}$ means that the action taken in that trajectory is associated with task $x$ and task $x$ is observed.




% \textcolor{red}{First described by \citet{Ammar14:Online}, PG-ELLA is an extension of the authors' previous work in linear regression \citep{ella}. It features a modular structure capable of utilizing learning a variety of base learners \citep{reinforce,nac} to individually learn policies for like tasks \citep{cartpole, quadrotor}. As they are learned, these task-specific policies are  combined by the main algorithm to form a partially unified set that shares some policy information across tasks. This partial unification comes from the structure of the algorithm's latent-space-defined policy parameters, which are divided into one large latent space $L$ shared between tasks and another, smaller latent space $s_{(x)}$ unique to each task $x$. For each learned task, the policy parameters $\theta_{(x)}$ are defined as follows: $\theta_{(x)}=Ls_{(x)}$.}

% PG-ELLA falls short of open-task applications in three key aspects. (1) Multi-Agency: PG-ELLA is a single agent model; (2) Dynamic Spaces: The action and observation space of tasks in $X$ can change as $X$ changes. $L$ cannot handle changing shape inputs, and (3) Simultaneous Tasks: Trajectory generation is dependent on a single task present in the environment. 

% We extend PG-ELLA to multi-agent settings through a actor-critic schema  analogous to MADDPG \citep{Lowe2017:MAPE}. 

%not really the main problem
%(3) Unbounded tasks, an unbounded $X$ not only introduces the possibility of multiple tasks simultaneously present, but the possibility of an unbounded set of individual tasks (through continuous variables in task features or unbounded integers) exists. Consider ride\_fair in \rideshare{}.


% This method also uses the same centralized training, decentralized execution learning paradigm as the one we use for \mohito{}.



% \begin{figure}[htbp]
%     \centerline{\includegraphics[width=0.5\textwidth]{Figs/joint_pol.png}}
%     \caption{Joining PG-ELLA's task-specific policies together to create a single policy for an environment with a set of tasks. Note that this initial approach requires a fixed observation space, to be addressed later.}
%     \label{fig:joint-policy}
% \end{figure}

% A constraint of PG-ELLA is that all policies must have the same action and observation space utilize the shared latent space, $L$. We address this by splitting our observations into \textit{task-observations}, $[o_{1},...,o_{j}]$ for $j$ tasks, and \textit{shared-observations}, $o_s$.  Only \textit{shared-observations} are sent through the shared latent space, i.e., the driver position in \rideshare{}. This is the dynamic spaces portion of Fig:\ref{fig:taopgella}.

% PG-ELLA operates on an individual task at a time, and while multiple tasks can be considered as one joint task. This means defining $\pi \forall X$ which is unbounded with task-openness. Thus instead of creating a policy for all $\pi_X$ we create task-specific policies, $\pi_x$, which we aggregate through concatenation. This is the simultaneous tasks portion of Fig:\ref{fig:taopgella}. 

% \begin{figure}
%     \centering
%     \includegraphics[width=1.0\linewidth]{Figs/open_pol.png}
%     \caption{Now that we split the observation, we are no longer constrained to have a fixed action and observation space across tasks.}
%     \label{fig:shared-observations}
% \end{figure}

 







\subsection{Tabular RL complexity}
\label{appendix:classic-rl}

In this work we proposed a deep RL solution to \tao{} environments. Here we consider the complexity of \mohito{} if we use classic RL lookup tables in place of our $\pi$ and $Q$ GNNs. We also consider the upper bound for \mohito{} batch size.


% operates similarly to a lookup table choosing specific weights by task, and it manages openness by assuming $x\in \mathbb{X} \forall x$, where $\mathbb{X}$ is a fixed set of possible tasks. However \tao{} is not a lookup table and cannot exactly represent $Q$ or $\pi$ for a arbitrary $X$. 

\subsubsection{\mohito{} lookup space} To encode actions conditioned on $\mathbf{G}_{\mathbf{O}}$, we make the following assumptions:

\begin{enumerate}
    %\item $x\in \mathbb{X} \forall x$, where $\mathbb{X}$ is a set of all possible tasks.
    \item $\Psi_A$ is deterministic such that $E=E'$ if $X=X'$.
    %\item $|X|\leq \Upsilon$, for some given $\Upsilon$. 
    \item All observations are present solely in $Ag$ and $X$.
    \item No agent openness.
    % \item $\forall X', |X| \geq |X'|$
    % \item $\forall x,y \in X, x\neq y$, and $\forall a,a'\in Ag, a\neq a'$
    \item  $|\mathbb{X}_{task}| \geq f$, and $|\mathbb{X}_{agent}| \geq f$.
\end{enumerate}

%\rideshare{} and \wildfire{} follow (1) and (2); (3) violates our definition of openness because it limits the number of present tasks, but it is necessary for an exact lookup table. In other words $\Upsilon=\infty$ in \mohito{}; (4) is not the case for our environments, but is just a question of implementation.
We define our $\pi$ lookup table $\pi(G)\to e\in E$; assumption (1) \textit{allows for this static policy with respect to tasks else we must consider a changing action space despite one observed} $X$. In Eq.~\ref{Eq.space-lookup}, we define the space complexity of tasks as a sum of combinations with replacement. Here $\mathbb{X}_{task}$ is all possible task observations, and  $\mathbb{X}_{agent}$ all possible agent observations. Here we use assumptions (2); \textit{the observation is split into our observation of other agents, ``agent'', and ``task'' observations that describe the state}, and (3); \textit{the number of agent observations, and the number of agents are fixed.} 
\begin{equation}
\label{Eq.space-lookup}
    \begin{split}
    %task nodes
    \pi^{space}_{lookup} =
        \sum^{|X|}_{i=1} \Bigg ( \frac{(|\mathbb{X}_{task}| + i - 1)!
        }{(|\mathbb{X}_{task}|-1)!i!}
        \Bigg) \times |\mathbb{X}_{agent}|^{|Ag|}
        %\sum^{|Ag|}_{j=1} \frac{(f + j - 1)!
        %}{(f-1)!j!}
    \end{split}
\end{equation}

 


We find a lower bound only considering $i=|X|$, where the number of tasks present is equal to the largest ever seen number of tasks, from Eq.~\ref{Eq.space-lookup} to  Eq.~\ref{Eq.omega-bounds}. We use assumption 
(4); \textit{the number of unique task/agent observations are at least their degree of freedom;} to substitute in $f$ and $|Ag|$ making this comparable to our deep RL complexity analysis.
% ,(5); \textit{no duplicate tasks/ agent observations}
% , and (5); \textit{all possible tasks/ agent observations are present in the beginning.}

\begin{equation}
    \label{Eq.omega-bounds}
    \pi^{space}_{lookup} = \Omega \bigg(\frac{(f+|X|-1)!}{(f-1)!|X|!} \times f^{|Ag|}\bigg) 
\end{equation}

\subsubsection{\mohito{} lookup time}

With a lookup table, we hash the observation graph, which is linear, $\mathcal{O}(|G_{\mathbf{C}}|)$. Then we look up the preferred $e\in E$ at $G$, which is constant, $\mathcal{O}(1)$. The same operation is performed for critic lookup.

\begin{equation}
    \pi^{time}_{lookup}=\mathcal{O}(|G_{\mathbf{C}}|)
\end{equation}

\subsubsection{\mohito{} batch size} An ideal batch contains all possible one-step trajectories, ($S$, $A$, $S'$). The range of possible trajectories is bounded when we make the prior lookup table assumptions, as given in Eq.~\ref{Eq.batch-bound}. The parenthesized term represents the state $S = \{\text{observations of agents, observations of tasks}\}$ and is squared to account for noisy transitions $S\to S'$.

\begin{equation}
\label{Eq.batch-bound}
\begin{split}
        \text{batch size}& = \mathcal{O} \Bigg(
    |Ag| |E| 
    \bigg( |\mathbb{X}_{agent}|^{|Ag|} \sum^{|X|}_{i=1} \frac{(|\mathbb{X}_{task}| + i - 1)!
        }{(|\mathbb{X}_{task}|-1)!i!}\bigg)^2\Bigg )
\end{split}
\end{equation}



\end{document}
