\documentclass[accepted]{uai2025} 
%-------------UAI-2025------------------------------------
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
%\newcommand{\swap}[3][-]{#3#1#2} % just an example

%\title{Instructions for Authors: Title in Title Case}
%%% Load required packages here (note that many are included already).
%----------------------------------------------------------------------
\usepackage{balance} % for balancing columns on the final page
\usepackage{enumitem}
\usepackage{graphicx}
\usepackage{amssymb,amsthm}
%\usepackage{amssymb,amsmath,amsthm,tabularx}   % <-- for \eqref
\usepackage[linesnumbered,ruled,vlined,noend]{algorithm2e}
\usepackage{subcaption}
\usepackage{bm}
\usepackage{tikz} 
\usetikzlibrary{arrows.meta}
\usetikzlibrary{positioning, calc, fit}
\usetikzlibrary {shapes.geometric}
\usetikzlibrary{decorations.pathreplacing}
%\usepackage{sidecap}
%\let\labelindent\relax
\usepackage{multicol}
\usepackage{multirow}
%\usepackage{varwidth}
\usepackage{color, soul}
%\usepackage{float}
%\usepackage{hyperref}
%\usepackage[capitalise]{cleveref}


\title{Adaptive Human-Robot Collaboration using Type-Based IRL}

%%% Provide names, affiliations, and email addresses for all authors.
\author[1]{Prasanth Sengadu Suresh}
\author[1]{\href{mailto:<pdoshi@uga.edu>?Subject=Your UAI 2025 paper}{Prashant Doshi}{}}
\author[2]{\href{mailto:<Bikramjit.Banerjee@usm.edu>}{Bikramjit Banerjee}{}}
% Add affiliations after the authors
\affil[1]{%
    THINC Lab, School of Computing\\
    University of Georgia\\
    200 D. W. Brooks Drive, Athens, GA 30602, USA
}
\affil[2]{%
    School of Computing Sciences \& Engg.\\
    University of Southern Mississippi\\
    118 College Dr., Hattiesburg, MS 39406, USA
}

%%% Use this environment to specify a short abstract for your paper.


%%% The code below was generated by the tool at http://dl.acm.org/ccs.cfm.
%%% Please replace this example with code appropriate for your own paper.


%%% Use this command to specify a few keywords describing your work.
%%% Keywords should be separated by commas.

%\keywords{Human-robot collaboration, inverse reinforcement learning, human types}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%% Include any author-defined commands here.
         
% \newcommand{\BibTeX}{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em\TeX}

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand\tab[1][1.25cm]{\hspace*{#1}}
\let\oldemptyset\emptyset
\let\emptyset\varnothing
\newtheorem{mydef}{Definition}
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}
\newcommand{\Real}{\mathbb R}
\newcommand{\eps}{\varepsilon}
\newcommand{\To}{\rightarrow}
\newcommand{\BX}{\bm{B}(X)}
\newcommand{\X}{\mathcal{X}}
\newcommand{\Y}{\mathcal{Y}}
\newcommand{\Z}{\mathcal{Z}}
\newcommand{\T}{\mathcal{T}}
\newcommand{\R}{\mathcal{R}}
\newcommand{\dAIRL}{Dec-AIRL}
\newcommand{\dPPO}{Dec-PPO}
\newcommand{\dMDP}{Dec-MDP}
\newcommand{\tbdMDP}{\textsf{TB-DecMDP}}
\newcommand{\tbdAIRL}{\textsf{TB-DecAIRL}}
\newcommand{\tbdPPO}{\textsf{TB-DecPPO}}
\newcommand*{\addn}[1]{\color{blue}#1}
% Command \empt{var1}{var2}
\newcommand{\empt}[2]{$#1^{\langle #2 \rangle}$}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{document}
\maketitle

\begin{abstract}
Human-robot collaboration (HRC) integrates the consistency and precision of robotic systems with the dexterity and cognitive abilities of humans to create synergy. However, human performance may degrade due to various factors (e.g., fatigue, trust) that can manifest unpredictably, typically resulting in diminished output and reduced quality. To address this challenge toward successful HRCs, we present a human-aware approach to collaboration using a novel multi-agent decision-making framework. {\em Type-based} decentralized Markov decision processes (\tbdMDP{}) additionally model latent, causal decision-making factors influencing agent behavior (e.g., fatigue), leading to dynamic agent types. In this framework, agents can switch between types and each maintains a belief about others' current type based on observed actions while aiming to achieve a shared objective. We introduce a new inverse reinforcement learning (IRL) algorithm, \tbdAIRL{}, which uses \tbdMDP{} to model complex HRCs. \tbdAIRL{} learns a type-contingent reward function and corresponding vector of policies from team demonstrations. Our evaluations in a realistic HRC problem setting establish that modeling human types in \tbdAIRL{} improves robot behavior over the default of ignoring human factors, by increasing throughput in a human-robot produce sorting task.

% In response to these challenges, we develop a human-centric solution using a novel multi-agent decision-making framework called type-based Dec-MDP (\tbdMDP{}), which accounts for changes in human behavior during tasks, incorporating realistic latent decision-making factors such as fatigue or human biases modeled as dynamic agent-types. Additionally, we present a new inverse reinforcement learning (IRL) algorithm---\tbdAIRL{}---which leverages \tbdMDP{} to model the HRC scenario. In this framework, each agent updates its beliefs about the other agents' types based on observed actions and its current belief. Agents then adjust their policies based on these updated beliefs to achieve a shared objective. \tbdAIRL{} learns a type-based reward function and a corresponding vector of policies (one for each agent) from expert team demonstrations. To validate the effectiveness of our approach, we conduct an ablation study comparing \tbdAIRL{} with \dAIRL{}, a method that does not account for agent types. This paper demonstrates that \tbdAIRL{} outperforms the baseline method on a realistic dyadic line sorting task by comparing the throughput of the sorted items.
\end{abstract}
%%% The following commands remove the headers in your paper. For final 
%%% papers, these will be inserted during the pagination process.

%\pagestyle{fancy}
%\fancyhead{}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Introduction}
\label{sec:introduction}

Advances in agent-based decision making are making robots that can take decisions under uncertainty increasingly real. Such intelligence allows platforms such as collaborative robots (cobots) to work safely with humans to effectively contribute to several aspects of human endeavors. Such human-robot collaboration (HRC) enables a paradigm shift in key domains such as healthcare, manufacturing, and other industries by combining the diverse complementary strengths of human and robotic agents to optimize task efficiency and throughput. 

To optimize HRC, cobots must learn to adapt to the dynamic latent factors that influence human action choice. One way to capture these factors in learned behavior is to let (expert) humans perform (demonstrate) the task as a team. Then, we may learn the team's underlying preferences for task state and joint actions using inverse reinforcement learning (IRL)~\citep{ng2000algorithms}, which is known to {\em generalize beyond the demonstrations.}
%which captures expert preferences from demonstrations using an underlying reward function. 
Whereas IRL has found success in several single-expert tasks~\citep{Arora21:Survey}, it remains underexplored in multiagent tasks in contexts such as HRC. Prior work in IRL applied to HRC~\citep{nikolaidis2014computational,sengadu2023dec} assumes that the human teammate follows fixed rule-based behavior throughout, and fails to account for the causal latent factors influencing human behavior such as biases or fatigue. 

To illustrate, consider a packing shed where humans stand across from each other and sort onions on a conveyor. The {\em optimal} sortation discards visibly blemished onions and closely inspects the seemingly unblemished ones before deciding to discard them or return them to the conveyor. Prolonged periods of sorting lead to fatigue, resulting in a decline in sorting speed. Fatigued workers usually take a short break to regain energy. In this HRC use case, a cobot could recognize and adapt to its fatigued teammate by accelerating its sorting speed to maintain overall throughput. This `industrial' mode should be maintained by the cobot only while the human is fatigued and away from the shared workspace, as the increased speed may not be safe for humans.
%This human-centric solution allows the robot to actively adapt to human energy levels throughout the task and maximize team efficiency.

\noindent We make {\bf three} key contributions in this paper:
\begin{enumerate}[leftmargin=*,topsep=0in,noitemsep]
    \item We present a novel multiagent decision-making framework, \tbdMDP{}, which models dynamic  agent types while solving for type-contingent decentralized agent behavior. Agents may detect their teammate's current type and switch their behavior accordingly to maximize team reward.
    \item In the context of \tbdMDP{}, we present a new  IRL method, \tbdAIRL{}, which learns a {\em type-contingent team reward function} and a corresponding policy vector (one for each agent). This output, backed by a theoretical performance guarantee, enables the cobot to adapt to the human agent's dynamic type, promoting seamless HRC in shared workspaces. A type-contingent reward function is novel to IRL.
    \item Finally, we empirically establish the efficacy of \tbdAIRL{} towards successful HRC in simulation and validate it on a produce-sorting task using a physical HRC with a UR3e cobot.
\end{enumerate}

By demonstrating enhancement in collaborative efficacy via better average rewards per episode (on the MAGym~\citep{magym, openaigym} simulated environment), and increased throughput compared to the baseline (on the physical task), we establish the value of \tbdAIRL{} over the default of ignoring human factors. We conclude by providing avenues for future work. 

%========================================================================================================================================================
\section{Background}
\label{sec:background}

% \subsection{Multiagent Inverse Reinforcement Learning}

Multiagent IRL models the expert using a variant of Markov decision process~\citep{Puterman1994}, which it solves optimally. Since HRC is collaborative and decentralized (i.e., the robot may not perfectly observe all attributes of the human's state, such as the human's joint angles) by definition, a \dMDP{}~\citep{Goldman03} is appropriate to model the expert.

A two-agent \dMDP{} can be formally defined as a tuple $\mathcal{DM} \triangleq \langle S, A, T, R \rangle$ where the joint state space is $S = S_i \times S_j$. Here, $S_i$ and $S_j$ are the locally observed states of the two agents $i$ and $j$, respectively, which when combined yield the joint state of the system; $A = A_i \times A_j$ is the set of joint actions of the two agents; $T: S \times A \times S \rightarrow [0,1]$ is the transition function of the multiagent system; and $R: S \times A \rightarrow \mathbb{R}$ is the common reward function.\footnote{This \dMDP{} describes a locally fully observable model whose local states when combined yield the fully observable joint state~\citep{Goldman03}.}  In IRL, the latter is unknown, whereas the rest are usually known. As such, the agents know their local state and any common task attributes and act independently while optimizing a common reward~\citep{melo2011decentralized}. Let $\X^E$ be the set of expert demonstrations, where a complete trajectory $X^E \in \X^E$ is given by
%\begin{small}
\begin{align}
X^E = (\langle s_i^0,s_j^0\rangle, \langle a_i^0,a_j^0\rangle, 
%\langle s_i^1,s_j^1\rangle, \langle a_i^1,a_j^1\rangle, 
\cdots, \langle s_i^\T,s_j^\T\rangle, \langle a_i^\T,a_j^\T\rangle).
\label{eqn:basic-irl-expert-trajs}
\end{align}
%\end{small}
The agent's task is to learn a reward function and a policy profile, $\bm{\pi} = \langle\pi_i,\pi_j\rangle$, that optimizes its return, such that the trajectories generated by $\bm{\pi}$ are indistinguishable from those in $\X^E$. 


\begin{figure*}[tbh!]
    \centering      
    \begin{minipage}[b]{1\textwidth}
    \centering
    \begin{tikzpicture}[node distance={15mm}, thick, main/.style = {draw, circle, minimum size=0.1cm, text width=0.25cm, align=center, font=\footnotesize}]
    
        % Define agent i's nodes
        \node[main] (m_i_t) {$m_i$}; 
        \node[main] (as_i_t) [left=0.9cm of m_i_t] {$\texttt{as}_i$}; 
        \node[main] (a_i_t) [right of= m_i_t] {$a_i$}; 
        \node[main] (ts_i_t) [left of= as_i_t] {$\texttt{ts}$}; 
        \node[main] (as_i_t1) [above of= as_i_t] {$\texttt{as}'_i$}; 
        \node[main] (ts_i_t1) [above of= ts_i_t] {$\texttt{ts}'$};
        \node[main] (m_i_t1) [above of= m_i_t] {$m'_i$};
        \node[main] (a_i_t1) [above of= a_i_t] {$a'_i$}; 
        
        % Define agent j's nodes
        \node[main] (a_j_t) [right of= a_i_t] {$a_j$}; 
        \node[main] (m_j_t) [right of= a_j_t] {$m_j$}; 
        \node[main] (as_j_t) [right=0.9cm of m_j_t] {$\texttt{as}_j$}; 
        \node[main] (ts_j_t) [right of= as_j_t] {$\texttt{ts}$}; 
        \node[main] (as_j_t1) [above of= as_j_t] {$\texttt{as}'_j$}; 
        \node[main] (ts_j_t1) [above of= ts_j_t] {$\texttt{ts}'$};
        \node[main] (m_j_t1) [above of= m_j_t] {$m'_j$};
        \node[main] (a_j_t1) [above of= a_j_t] {$a'_j$}; 

        % Labels for nodes
        \node[fit=(as_i_t)(ts_i_t), inner sep=0pt, outer sep=0pt, draw, ellipse, label=center:{$s_i$}] (s_i_t){};
        \node[fit=(as_i_t1)(ts_i_t1), inner sep=0pt, outer sep=0pt, draw, ellipse, label=center:{$s'_i$}] (s_i_t1){};
        \node[fit=(as_j_t)(ts_j_t), inner sep=0pt, draw, ellipse, label=center:{$s_j$}] (s_j_t){};
        \node[fit=(as_j_t1)(ts_j_t1), inner sep=0pt, draw, ellipse, label=center:{$s'_j$}] (s_j_t1){};

        % Add text label for timestep t and t+1 to the left of nodes ts_i_t and ts_i_t1
        \node[align=left] at ($(s_i_t.west) + (-1, 0)$) {$\bm{t}$};
        \node[align=left] at ($(s_i_t1.west) + (-1, 0)$) {$\bm{t+1}$};

        % Arrows and connections
        \draw[->] (s_i_t.south east) to [out=-45, in=-145, looseness=0.6] (a_i_t); 
        \draw[->] (m_i_t) -- (a_i_t); 
        \draw[->] (a_i_t) -- (s_i_t1); 
        \draw[->] (s_i_t) -- (s_i_t1);
        \draw[->, dotted, thick] (m_i_t) -- (m_i_t1); 
        \draw[->] (s_i_t1) to [out=45, in=145, looseness=0.6] (a_i_t1); 
        \draw[->] (m_i_t1) -- (a_i_t1); 
        
        \draw[->] (s_j_t.south west)  to [out=-145, in=-45, looseness=0.6] (a_j_t); 
        \draw[->] (m_j_t) -- (a_j_t); 
        \draw[->] (a_j_t) -- (s_j_t1); 
        \draw[->] (s_j_t) -- (s_j_t1); 
        \draw[->, dotted, thick] (m_j_t) -- (m_j_t1); 
        \draw[->] (s_j_t1) to [out=145, in=45, looseness=0.6]  (a_j_t1); 
        \draw[->] (m_j_t1) -- (a_j_t1); 
        
        \draw[->] (a_i_t) -- (m_j_t1); 
        \draw[->] (a_j_t) -- (m_i_t1); 
        
        % Interactive state relations
        \draw[red, ->] (s_i_t) to [out=-40, in=-90, looseness=0.8] (s_j_t1.south west); 
        \draw[red, ->] (a_i_t.south) to [out=-40, in=-90, looseness=0.9] (s_j_t1.south west); 
        \draw[blue, ->] (s_j_t)  to [out=-140, in=-90, looseness=0.8] (s_i_t1.south east); 
        \draw[blue, ->] (a_j_t.south) to [out=-140, in=-90, looseness=0.9]  (s_i_t1.south east); 
        \draw[draw=blue,dashed] (-4.1,-1.5) rectangle ++(6.3,4.3);
        \draw[draw=red,dashed] (2.4,-1.5) rectangle ++(6.3,4.3);
    \end{tikzpicture} 
    \caption{\tbdMDP{} graphical model of a dyadic team with agents $i$ (blue) and $j$ (red), for two timesteps $\bm{t}$ and $\bm{t+1}$. Local state of agent $i$ ($s_i$) is a combination of $i$'s private attributes $\texttt{as}_i$ and common task attributes $\texttt{ts}$. Model $m_i$ holds $i$'s current belief over the others' type $\theta_j$. The dotted link updates model $m_i$ using the other agent’s action at $t$. These apply analogously for agent $j$. All agents transition jointly to the next state, as indicated by the dependence (colored links) of each agent's next state on the other's previous state and action.}
    %(shown by the red and blue colored lines).}
    \label{fig:tbdMDP-dependency-graph}
\end{minipage}
\end{figure*}



%----------------------------------------------------------------------------------------------------
\subsection{Decentralized Adversarial IRL}
\label{subsec:dec-airl}
%---------------------------------------------------------------------------------------------------

Decentralized adversarial IRL (\dAIRL{})~\citep{sengadu2023dec} generalizes the single-expert deep-IRL method -- adversarial IRL (AIRL)~\citep{fu2018learning} (which works on the principle of maximum causal entropy~\citep{Ziebart2010:PhDthesis, gleave2022primer}) -- to learn a common reward function for the team from expert demonstrations. AIRL uses a discriminator $D_{\bm{\alpha}} (X)$ to learn a function $f_{\bm{\alpha}} (X)$~\citep{fu2018learning} which at convergence approximates the expert policy's advantage function. \dAIRL{} analytically represents the discriminator as $D_{\bm{\alpha}}(X) = \frac{e^{f_{\bm{\alpha}}(X)}}{e^{f_{\bm{\alpha}}(X)} + \pi(X)}$ and the reward is updated as

\begin{align}
R_{\bm{\alpha}}(X) \leftarrow \log D_{\bm{\alpha}}(X) - \log(1 - D_{\bm{\alpha}}(X)).
\label{eqn:airl-reward-update}
\end{align}

When simplified, Eq.~\ref{eqn:airl-reward-update} yields $f_{\bm{\alpha}}-\log(\pi)$, which is the entropy-regularized reward formulation. In the underlying Dec-MDP,
%~\citep{goldman2004decentralized}
each agent only has access to their local state and some general task attributes. \dAIRL{} uses \dPPO{}---a decentralized generalization of the popular RL method Proximal Policy Optimization (PPO)~\citep{schulman2017proximal}---for the forward rollout. \dPPO{} uses the centralized training, decentralized execution paradigm where the centralized critic network updates its value function using a squared-error loss: $L^{VF}_{t}({\omega}) = \Bigl(V^{\pi_{\omega}}(s^{t}) - \Hat{V}_{t}^{targ} \Bigr)^2$, where $\Hat{V}_{t}^{targ}$ is the per-episode discounted reward-to-go, $V^{\pi_{\omega}}(s^{t})$ is the predicted value of joint state $s^{t}$, and ${\omega}$ is the vector of policy parameters (weights). We consider a dyadic system with agents $i$ and $j$, although our formalism and method conceptually scale to $N$ agents and are not limited to a dyad. The policy loss of agent $i$ is given by
% \begin{small}
\begin{align*}
&L^{CLIP}_i(\omega) = \mathop{\mathbb{E}}_{\pi_{\omega, i}} \Bigl[\min\Bigl(\lambda_i ~ A^{\bm \pi}, \operatorname{clip}(\lambda_i,1-\epsilon, 1+\epsilon) ~ A^{\bm \pi} \Bigr)\Bigr], \\ &\text{where}~~ \lambda_i = \frac{\pi_{\omega,i}(a_i \mid s_i)}{\pi_{\omega,i}^{\mathrm{old}}(a_i \mid s_i)}\text{ is the importance sampling ratio}.
\end{align*}
% \end{small}
%where $\lambda_i$ is the importance sampling ratio. 
$L^{CLIP}_i(\omega)$ provides a pessimistic bound over the final objective by using a surrogate objective that picks the minimum of the clipped and unclipped objectives. Clipping the importance sampling ratio reduces the incentive for moving $\lambda_i$ outside the interval $[1-\epsilon, 1+\epsilon]$. Advantage $A^{\bm \pi}$ is calculated using the reward estimate $R_{\bm{\alpha}}(X)$ from Eq.~\ref{eqn:airl-reward-update}. This clipped surrogate objective, combined with the policy entropy, handles the explore-exploit dilemma. The policy entropy loss and corresponding total loss are given as: $L^{ENT}_i(\omega) = \sigma H \Bigl[\pi_{\omega,i}(s_i) \Bigr], \quad L_i(\omega) = L^{CLIP}_i(\omega) + L^{ENT}_i(\omega)$,
% \begin{small}    
% \begin{align*} 
% L^{ENT}_i(\omega) = \sigma H \Bigl[\pi_{\omega,i}(s_i) \Bigr],  L_i(\omega) &= L^{CLIP}_i(\omega) + L^{ENT}_i(\omega).
% \end{align*}
% \end{small}
% \begin{align} 
% L_i(\omega) =  L^{CLIP}_i(\omega) + L^{ENT}_i(\omega).
% \label{eqn:Dec-PPO-loss}
% \end{align}
where $H$ is the policy entropy and $\sigma$ is the entropy hyperparameter. These loss functions apply analogously for agent $j$. On convergence, the discriminator and the generator return the learned common reward function and the vector of policies, respectively.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Type-Based Decentralized AIRL for Human-Robot teaming}
\label{sec:method}

In this section, we formalize a novel multiagent model, Type-Based Dec-MDP (\tbdMDP{}), and present our novel IRL method, Type-Based Dec-AIRL (\tbdAIRL{}), to learn a type-contingent reward function and the vector of policies (one for each agent), which reason about the other agents' behavioral changes and adapt accordingly.

We model realistic human-robot collaborative settings where agents adjust their behavior based on latent factors (e.g., trust, fatigue), and anticipate and adapt to others' changes to achieve a shared goal. Although we present this work in the context of HRC, it extends to domains where autonomous agents must adapt to changing human conditions (e.g., healthcare and smart homes). For instance, in healthcare, such agents provide personalized care by monitoring patient conditions and intervening as and when needed. In smart homes, agents adjust the air conditioning system according to human preferences, highlighting the need for continuous monitoring and adaptation.

\subsection{Type-Based Collaboration model} 

In addition to shared task attributes, each agent $i$ has its private attributes (which may include mental attributes and thus are not observable by other agents). Each agent also maintains a model of all other agents, including a belief about their joint types. Agent $i$’s initial belief is based on prior knowledge of these types. The combination of $i$’s private attributes and the common task attributes constitutes $i$’s local state ($s_i$). Agent $i$ makes decisions based on $s_i$ and its belief about the other agents' types. At each timestep, agent $i$ observes other agents' actions noisily and updates its belief accordingly (see Fig.~\ref{fig:tbdMDP-dependency-graph}). All agents collaborate in a decentralized manner to optimize a common task-centric reward. Since each agent has perfect access to its own local state and only maintains a belief about the other agents' types, our \tbdMDP{} can be considered a decentralized variant of a mixed-observability MDP~\citep{ong2010planning}.


A \tbdMDP{} is formally defined as:
\[\mathcal{TB}\text{-}\mathcal{DM} \triangleq \Bigl\langle Ag, S, A, T, R, M \Bigr\rangle\]
% \begin{itemize}
%     \item $Ag$ is the set of agent identifiers; let $N = |Ag|$,
%     \item $S = S_i \times S_j \times \ldots S_N$ is the joint state of all agents. $S_i = \texttt{tS} \times \texttt{aS}_i$ is agent $i$'s local state, where $\texttt{tS}$ is the common task-state and $\texttt{aS}_i$ contains $i$'s private attributes including $i$'s types $\Theta_i$,
%     \item $A = A_i \times A_j \times \ldots A_N$ is the joint action, where $A_i$ is the set of local actions of agent $i$,
%     \item $T: S \times A \times S \rightarrow [0,1]$ is the state transition function. Note that agents' types transition exogenously and are not captured by T,
%     \item $R: S \times A \rightarrow \mathbb{R}$ is the common reward function for all agents,
%     \item $M = M_i \times M_j \times \ldots M_N$ is the set of agent models, where $M_i$ is agent $i$'s model given by
%     $$M_i = \Bigl( \Theta_{-i}, b_i, F_i \Bigr)$$
%         \begin{itemize}
%             \item $\Theta_{-i} = \prod_{j \neq i} \Theta_j$ is the combined set of all agents' types except agent $i$,
%             \item $b_i \in \Delta (\Theta_{-i})$ is the current belief held by agent $i$ about the other agents' types $\Theta_{-i}$,
%             \item $F_i: \Theta_{-i} \times A_{-i} \rightarrow \Delta (\Theta_{-i})$ is the type transition function,
%         \end{itemize}
% \end{itemize}

\begin{itemize}[leftmargin=*,topsep=0in,itemsep=0in]
    \item %\textbf{Agent Identifiers $Ag$ and the Number of Agents $N$:}
    %\begin{itemize}
        $Ag$ represents the set of all agent identifiers within the system. Each agent is uniquely identified, allowing for individual tracking and interaction within the environment.
        The total number of agents in the system is denoted by $N$, where $N = |Ag|$. 
        %This cardinality is crucial for determining the complexity of interactions and the scalability of the model.
    %\end{itemize}
    
    %\item %\textbf{Joint State Space $S$:}
        \item The joint state space $S$ is defined as the Cartesian product of the individual local states of all agents, i.e., $S = S_1 \times S_2 \times \ldots \times S_N$. 
        %This represents the complete configuration of the system at any given time.
        The local state $S_i$ is further decomposed into:
        \begin{itemize}
            \item 
            The \textit{task-state} $\texttt{tS}$, which is common across all agents. It encapsulates the shared aspects of the environment or task that all agents are aware of and interact with.
            \item  
            The \textit{agent-specific state} $\texttt{aS}_i$, containing private attributes unique to agent $i$. Note that $\texttt{aS}_i$ is not observable by others. This includes {\addn{$\theta\in{}$}}$\Theta_i$, the \textit{type} of agent $i$, representing its inherent characteristics, capabilities, or roles within the system.
        \end{itemize}
        Then, $S_i = \texttt{tS} \times \texttt{aS}_i$, allowing each agent to maintain both shared and private information.
    
    %\item %\textbf{Joint Action Space $A$:}
    \item The joint action space $A$ is the Cartesian product of the individual action sets of all agents, expressed as $A = A_1 \times A_2 \times \ldots \times A_N$. 
        %This encompasses all possible combinations of actions that agents can take simultaneously.
        Each agent $i$ has a local action set $A_i$, which includes all actions available to that agent. 
        %These actions represent the possible decisions or moves an agent can make at each time step.
    
    %\item \textbf{State Transition Function $T$:}
        \item The state transition function $T: S \times A \times S \rightarrow [0,1]$ defines the probability of transitioning from one joint state to another, given a particular joint action.
        % \item Specifically, $T(s, a, s')$ denotes the probability of moving to state $s'$ when the system is in state $s$ and the joint action $a$ is executed.
        \item It is important to note that an agent type $\theta \in \Theta_i$ transitions \textit{exogenously}, meaning external factors govern its transition and this is not influenced by the state transition function $T$. This separation ensures that while agents can act and influence the environment, their inherent types remain consistent unless altered by external dynamics.
    
    \item %\textbf{Common Reward Function $R$:}
    The reward function $R: S \times A \rightarrow \mathbb{R}$ assigns a real-valued reward to each state-joint action pair.
    %, representing the immediate benefit or cost to the agents for taking action $a$ in state $s$. 
    This reward is \textit{common} to all agents, implying that it reflects a shared objective or goal that all agents are collectively trying to optimize. 
    %This can facilitate cooperative behavior or alignment of incentives among agents.
    
    %\textbf{Agent Models $M$:}
        \item The set of agent models $M = M_1 \times M_2 \times \ldots \times M_N$ encapsulates the internal representations and beliefs each agent holds about the other agents.
        \item Each agent $i$'s model $M_i$ is defined as:
        $M_i = \Bigl( \Theta_{-i},\ b_i,\ F_i \Bigr)$. This includes:
        % \item Breaking down $M_i$:
        \begin{itemize}[leftmargin=*,topsep=0in,itemsep=0in]
            \item %\textbf{Types of Other Agents ($\Theta_{-i}$):}
                $\Theta_{-i} = \prod_{j \neq i} \Theta_j$ represents the combined set of types for all agents except $i$. 
                %This aggregation enables agent $i$ to consider the various types of other agents during decision-making.
            \item %\textbf{Belief $b_i$ about Other Agents' Types :}
                $b_i \in \Delta (\Theta_{-i})$ denotes agent $i$'s current belief distribution over the types of other agents. 
                %Here, $\Delta (\Theta_{-i})$ represents the probability simplex over $\Theta_{-i}$.This belief allows $i$ to reason about and predict the behaviors of other agents based on their types, facilitating strategic interactions.
            
            \item %\textbf{Type Transition Function $F_i$:}
                  $F_i: \Theta_{-i} \times A_{-i} \times \Theta_{-i} \rightarrow [0,1]$ defines the probability of the other agents' types at the next time step given their observed actions and current types.
                
                %\item Specifically, given the current types of other agents and their actions, $F_i$ provides a new belief distribution over $\Theta_{-i}$. This function captures the dynamics of how agent $i$ interprets and adapts to the behaviors of others over time.
        \end{itemize}
    \end{itemize}

This formulation can be broadly seen as an \emph{ex interim} expected utility formalism of Bayesian games~\citep{shoham2008multiagent} where each agent has perfect knowledge of its own type and has a mixed strategy of the others' type. For a dyadic team with agents $i$ and $j$, the joint policy is given as $\bm{\pi} = \langle \pi_i, \pi_j \rangle$ where $\pi_i: S_i \times \Delta (\Theta_{-i}) \rightarrow A_i$.


\subsection{IRL from team demonstrations}

The belief update equation for agent $i$ in a dyad is given as:
\begin{align}     
b'_i(\theta_j'~|~a_j, b_i) &= \beta \sum_{\theta_j \in \Theta_j} \Pr(\theta'_j~|~a_j, \theta_j) \Pr(\theta_j)  \nonumber\\
&= \beta \sum_{\theta_j \in \Theta_j} F_i(\theta'_j~|~a_j, \theta_j)~b_i(\theta_j).
\label{eqn:belief-update}
\end{align}
The experts' joint demonstrations contain state-action pairs as defined in Eq.~\ref{eqn:basic-irl-expert-trajs} and a single belief trajectory corresponding to the expert trajectory:
\begin{align}
\hat{b}^E_{\bm \theta} = (\langle \theta_i^0,\theta_j^0\rangle, \langle \theta_i^1,\theta_j^1\rangle, \ldots, \langle \theta_i^\T,\theta_j^\T\rangle).
\label{eqn:expert-belief-trajs}
\end{align}
The belief update of Eq.~\ref{eqn:belief-update} and generation of the trajectory above occur in the belief module of \tbdAIRL{}'s architecture, which is shown in Fig.~\ref{fig:TB-Dec-AIRL-architecture}. To implement this module, we use a GRU cell~\citep{cho2014learning}, as it has been empirically demonstrated to yield a representation equivalent to the analytical update.

The discriminator takes in the pooled states, pooled actions, and pooled types to distinguish expert and learned samples. This discriminator is then used to obtain the common task-centric reward as defined in Eq.~\ref{eqn:airl-reward-update}. The (joint) discriminator optimization objective is now defined as:
\begin{align}
    &\mathcal{D}_{KL}(\rho_{\bm \pi_\omega} (\bm{s}, b_{\bm{\theta}}, \bm{a}) ~\|~ \rho_{\X^E} (\bm{s}, b_{\bm{\theta}}, \bm{a})) \approx \nonumber\\
    &\max_{\bm \alpha} ~\mathbb{E}_{(\bm{s}, b_{\bm \theta}, {\bm a}) \sim \X^E,\hat{b}^E_{\bm \theta}} \Bigl[ \log D_{\bm \alpha} (\bm{s}, b_{\bm \theta}, {\bm a})\Bigr] +\nonumber\\
    &\mathbb{E}_{(\bm{s}, b_{\bm \theta}, {\bm a}) \sim {\bm \pi_\omega}} \Bigl[ \log \Bigl(1 -  D_{\bm \alpha} (\bm{s}, b_{\bm \theta}, {\bm a}) \Bigr) \Bigr],
    \label{eqn:tb-dairl-objective}
\end{align}
where $\rho$ gives the occupancy measure as in AIRL. Note that divergence $\mathcal{D}_{KL}$
%~\footnote{We mention just $\mathcal{D}_{KL}$ from here on for succinctness. 
%However, the input arguments are the same as the ones shown in \cref{eqn:tb-dairl-objective}.} 
is dependent on the belief module parameters $\bm{\phi}$ through its dependence on the belief states. 

Then, the IRL's objective is to optimize the policy vector, which is written as:
\begin{align}
    &\min_{\omega} \mathcal{D}_{KL} \approx \min_{\omega} \max_{\bm \alpha} \mathbb{E}_{(\bm{s}, b_{\bm \theta}, {\bm a}) \sim \X^E,\hat{b}^E_{\bm \theta}} \Bigl[ \log D_{\bm \alpha} (\bm{s}, b_{\bm \theta}, {\bm a})\Bigr] +\\\nonumber
    &\mathbb{E}_{(\bm{s}, b_{\bm \theta}, {\bm a}) \sim {\bm \pi_\omega}} \Bigl[ \log \Bigl(1 -  D_{\bm \alpha} (\bm{s}, b_{\bm \theta}, {\bm a}) \Bigr) \Bigr]. \nonumber
    % \label{eqn:tb-dppo-objective}
\end{align}
During the forward rollout (PPO) stage, the belief over other agents' type is obtained from the belief module at each timestep and factored into each agent's policy learning to learn a vector of policies that capture the expert's behavioral preferences. The gradient for policy optimization is given by:
\begin{align*}
    \nabla_{\omega} \mathcal{D}_{KL} \approx \nabla_{\omega} & \left[\mathbb{E}_{\bm{\pi}_\omega} \left( \log D_{\bm{\alpha}} (\bm{s}, b_{\bm{\theta}}, \bm{a}) \right. \right.\\ 
    & \left. \left. - \log \left( 1 - D_{\bm{\alpha}} (\bm{s}, b_{\bm{\theta}}, \bm{a}) \right) \right) \right].
    % \label{eqn:tb-dppo-gradient}
\end{align*}
Given fixed belief parameters ($\bm{\phi}$), the required gradient for policy optimization for agent $i$ is obtained as:
\begin{align*}
    &L^{CLIP}_i(\omega) = \mathop{\mathbb{E}}_{\pi_{\omega, i}} \Bigl[\min\Bigl(\lambda_i ~ A^{\bm \pi}, \operatorname{clip}(\lambda_i,1-\epsilon, 1+\epsilon)~A^{\bm \pi}\Bigr)\Bigr] \\
    &\text{where}~~ \lambda_i^t = \frac{\pi_{\omega,i}(a_i^t|s_i^t, b_i^t)}{\pi_{\omega,i}^{old}(a_i^t|s_i^t, b_i^t)},
\end{align*}
and analogously for agent $j$.

The advantage function is computed as $A^{\bm \pi_\omega} = Q^{\bm \pi_\omega} - V^{\bm \pi_\omega}$ with the action value-function $Q^{\bm \pi_\omega}$ given by $Q^{\bm \pi_\omega} = \mathbb{E}_{\bm \pi_\omega} \Bigl[ \sum_{t'=t}^\infty \gamma^{t'-t} \Bigl( \log D_{\bm \alpha} (\bm{s}^{t'}, b_{\bm \theta}^{t'}, {\bm a}^{t'}) - 
    \log \Bigl( 1 - D_{\bm \alpha} (\bm{s}^{t'}, b_{\bm \theta}^{t'}, {\bm a}^{t'}) \Bigr) \Bigr) \Bigr]$ and the state-value function $V^{\bm \pi_\omega}$ is given as $V^{\bm \pi_\omega}(b) = V(\bm{s}, b_{\bm \theta}) = \max_{\alpha \in \Gamma_{\bm \theta}(s)} (\alpha \times b_{\bm \theta})$ where $\alpha = \langle V(\bm{s}, \theta_{1}), V(\bm{s}, \theta_{2}), V(\bm{s}, \theta_{3}) \ldots V(\bm{s}, \theta_{n}) \rangle$.
    
% \begin{small}    
% \begin{align*}
%     &A^{\bm \pi_\omega} = Q^{\bm \pi_\omega} - V^{\bm \pi_\omega}.\nonumber\\
%     &\text{where the action value-function $Q^{\bm \pi_\omega}$ is given by} \\\nonumber
%     &Q^{\bm \pi_\omega} = \mathbb{E}_{\bm \pi_\omega} \Bigl[ \sum_{t'=t}^\infty \gamma^{t'-t} \Bigl( \log D_{\bm \alpha} (\bm{s}^{t'}, b_{\bm \theta}^{t'}, {\bm a}^{t'}) - \\\nonumber
%     &\log \Bigl( 1 - D_{\bm \alpha} (\bm{s}^{t'}, b_{\bm \theta}^{t'}, {\bm a}^{t'}) \Bigr) \Bigr) \Bigr]. \nonumber\\
%     & \text{and the state-value function $V^{\bm \pi_\omega}$ is given as} \\\nonumber
%     &V^{\bm \pi_\omega}(b) = V(\bm{s}, b_{\bm \theta}) = \max_{\alpha \in \Gamma_{\bm \theta}(s)} (\alpha \times b_{\bm \theta}) \\\nonumber
%     &\text{ where } \alpha = \langle V(\bm{s}, \theta_{1}), V(\bm{s}, \theta_{2}), V(\bm{s}, \theta_{3}) \ldots V(\bm{s}, \theta_{n}) \rangle.
% \end{align*}
% \end{small}
 

\begin{figure*}[tbh!]
    \centering   
    \begin{tikzpicture}[
            font=\sf \footnotesize,
            >=LaTeX,
            % Styles
            cell/.style={% For the main box
                rectangle, 
                rounded corners=5mm, 
                draw,
                very thick,
                },
            cell2/.style={% For the main box
            rectangle, 
            rounded corners=2mm, 
            draw,
            thin,
            },
            cell3/.style={% For the main box
            rectangle, 
            rounded corners=5mm, 
            draw,
            very thin,
            },
            operator/.style={% For operators like +  and  x
                circle,
                draw,
                inner sep=-0.5pt,
                minimum height =.2cm,
                },
            function/.style={% For functions
                ellipse,
                draw,
                inner sep=1pt
                },
            ct/.style={% For external inputs and outputs
                circle,
                draw,
                line width = .75pt,
                minimum width=1cm,
                inner sep=0pt,
                },
            gt/.style={% For internal inputs
                rectangle,
                draw,
                minimum width=3mm,
                minimum height=3mm,
                inner sep=1pt
                },
            mylabel/.style={% something new that I have learned
                font=\bf\footnotesize\sffamily
                },
            ArrowC1/.style={% Arrows with rounded corners
                rounded corners=.25cm,
                thick,
                },
            ArrowC2/.style={% Arrows with big rounded corners
                rounded corners=.5cm,
                thick,
                },
            ]
        \node [cell, minimum height=6cm, minimum width=\textwidth] at (0,0) (outer_boundary){} ;
        
        %#################################################### Belief Module #####################################################################
        
        \node [cell, minimum height=5.5cm, minimum width=6cm, anchor=north west, shift={(1,-0.25)}] at (outer_boundary.north west) (belief_module_boundary){} ;
        \node [cell2, fill=green, opacity=0.2, dotted, thick, green, minimum height=4.5cm, minimum width=5.5cm, anchor=north west, shift={(0.15,-0.7)}, label={[mylabel, yshift=-0.45cm]Belief Module $i$}] at (belief_module_boundary.north west) (belief_module_i){} ;
        \node [cell2, fill=yellow, opacity=0.2, dotted, thick, yellow, minimum height=4.5cm, minimum width=5.5cm, anchor=north west, shift={(0.25,0.4)}, label={[mylabel, yshift=-0.45cm]Belief Module $j$}] at (belief_module_i.north west) (belief_module_j){} ;
        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% GRU Cell %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        % Nodes    
        % Draw the cell boundary: 
        \node [cell, minimum height=3.5cm, minimum width=3.5cm, anchor=south west, shift={(1,0.5)}] at (belief_module_i.south west) (grucell) {};
        % \node [dashed, red, cell2, minimum height=3cm, minimum width=0.6cm, anchor=south west, shift={(1.8, 0.35)}] at (grucell.south west) {};
        % \node [dashed, blue, cell2, minimum height=3cm, minimum width=0.85cm, anchor=south west, shift={(0.8, 0.35)}] at (grucell.south west) {};

        % Draw inputs named ibox#
        \node [gt] (sigma1) at ([xshift=1.5cm, yshift=-0.75cm]grucell.west) {$\sigma$};
        \node [gt] (sigma2) [right=0.25cm of sigma1] {$\sigma$};
        \node [gt] (oneminus) [above of= sigma2] {$1-$};
        \node [gt, minimum width=0.5cm] (tanh) [right of= sigma2] {Tanh};

       % Draw operators named mux# , add# and func#
        \node [operator] (mux1) [left of= oneminus] {$\times$};
        \node [operator] (mux2) [above of= tanh] {$\times$};
        \node [operator] (mux3) [above=1cm of oneminus] {$\times$};
        \node [operator] (add1) [right of= mux3] {+};

        % Draw External inputs named as basis h,x
        % \node[ct, label={[mylabel, xshift=0.75cm]Previous hidden state}, align=center, inner sep=0pt, minimum size=4mm] (h) [left=2.25cm of mux3] {$h$};
        \node[ct, label={[mylabel, xshift=0.75cm]}, align=center, inner sep=0pt, minimum size=4mm] (h) [left=2.25cm of mux3] {$h_i$};
        \node[ct, label={[mylabel]}, align=center, inner sep=0pt, minimum size=2mm] (y1) [below of= h] {$b_i (\theta_j)$};
        % \node[gt, label={[mylabel]right:Input}, align=center, inner sep=0pt, minimum size=4mm] (x) [below=2cm of mux1] {$\langle \psi',\psi, a_j \rangle$};
        \node[gt, label={[mylabel]}, align=center, inner sep=0pt, minimum size=4mm] (x) [below=2cm of mux1] {$\langle a_j \rangle$};

        % Draw External outputs named as basis h2,y2
        \node[ct, label={[mylabel]}, align=center, inner sep=0pt, minimum size=4mm] (h2) [right of= add1] {$h'_i$};
        \node[ct, label={[mylabel]}, align=center, inner sep=0pt, minimum size=4mm] (y2) [below of= h2] {$b'_i (\theta_j)$};

    % Start connecting all.
        % Intersections and displacements are used. 
        % Drawing arrows    
        \draw [ArrowC1] (h) -- (mux3) -- (add1) -- (h2);
        \draw [ArrowC1] (mux3.west) -| (y1.east);
        \draw [ArrowC1] (add1) coordinate[auto] -|(y2.west);

        % Inputs
        \draw [ArrowC1] (x) -- coordinate[auto] (link1) (mux1); 
        \draw [ArrowC1] (x -| sigma2)++(-1,0.7) -| (sigma2);
        \draw [ArrowC1] (x -| tanh)++(-2,0.5) -| (tanh);
        \draw [ArrowC1] (x.north)++(0,0.5) -| (sigma1.south); 
        \draw [->, ArrowC1] (mux1) ++(0,0.1) |- (mux3) ;
        \draw [->,ArrowC1] (h.east)++(0.6,0) |- (mux1.west);
        % \draw [->, ArrowC1] (sigma1.north) |- coordinate[auto]  node[pos=0.3, left] {$r_t$} (mux1.east);
        \draw [->, ArrowC1] (sigma1.north) |- (mux1.east);

        % Internal
        \draw [->, ArrowC2] (sigma2) |- (mux2.south west);
        \draw [->, ArrowC2] (mux2) -- (add1);
        \draw [->, ArrowC2] (oneminus) -- (mux3);
        \draw [->, ArrowC2] (tanh) -- node[pos=0.4, right] {$\widetilde{h}_t$}(mux2);
        % \draw [->, ArrowC2] (sigma2)++(0,0.25) -| node[pos=0.4, right] {$z_t$} (oneminus);
        \draw [->, ArrowC2] (sigma2)++(0,0.25) -| (oneminus);
        \draw [ArrowC2] (h.east)++(0.6,0) |- ($(x.north)!0.5!(link1)$);
        %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% End GRU Cell %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
        %#################################################### End Belief Module #################################################################
        %####################################################### Generator #################################################################
        % Draw the cell boundary: 
        \node [cell, fill=magenta, opacity=0.2, minimum height=1.5cm, minimum width=3cm, anchor=north, shift={(2,-1.25)}, label={[mylabel, yshift=-1cm]\tbdPPO{}}] at (outer_boundary.north) (generator) {};
        \node [gt, fill=red, opacity=0.2, minimum height=0.5cm, minimum width=2.5cm, anchor=north, shift={(0,-0.1)}, label={[mylabel, yshift=-0.45cm]Common Critic}]  at (generator.north) (critic) {};
        \node [gt, fill=green, opacity=0.2, minimum height=0.5cm, minimum width=1.25cm, anchor=south west, shift={(0.25,0.1)}, label={[mylabel, yshift=-0.45cm]Actor $i$}]  at (generator.south west) (actor_i) {};
        \node [gt, fill=yellow, opacity=0.2, minimum height=0.5cm, minimum width=1.25cm, anchor=south east, shift={(-0.25,0.1)}, label={[mylabel, yshift=-0.45cm]Actor $j$}]  at (generator.south east) (actor_j) {};            
        %####################################################### End Generator #################################################################
        %####################################################### TB-Dec-MDP #################################################################
        \node [gt, fill=blue, opacity=0.2, minimum height=0.5cm, minimum width=2.5cm, anchor=north, shift={(0,1)}, label={[mylabel, yshift=-0.5cm]\tbdMDP{}}] at (generator.north) (env) {};
        \draw [->] (env) -- coordinate[auto] node[pos=0.4, left] {$\bm{s}$} (generator);
        \draw [->] (belief_module_boundary) -- coordinate[auto] node[pos=0.5, above] {$b_{\bm{\theta}}$} (generator.west);
        \draw [->] (generator) coordinate[auto] -- node[pos=0.5, below] {$\bm{a}$} (belief_module_boundary);
        \draw [->] (generator.75) -- coordinate[auto] node[pos=0.45, right] {$\bm{a}$} (env.-50);
        %####################################################### End TB-Dec-MDP #################################################################         
        %####################################################### Discriminator #################################################################
        % Draw the cell boundary: 
        \node [cell, fill=cyan, opacity=0.2, minimum height=1cm, minimum width=3cm, anchor=south, shift={(0,-1.4)}, label={[mylabel, yshift=-0.75cm]Discriminator $D_{\bm \alpha}$}] at (generator.south) (discriminator) {};
        \draw [->] (discriminator.45) -- coordinate[auto] node[pos=0.45, right] {$R$} (generator.-55);
        \draw [->] (generator.south) -- coordinate[auto] node[pos=0.45, left] {$\langle \hat{\X}, \hat{b}_{\bm \theta} \rangle$} (discriminator.north);
        %####################################################### End Discriminator #################################################################
        %####################################################### Exp Trajs #################################################################
        \node [cell, fill=gray, opacity=0.2, minimum height=1cm, minimum width=5cm, anchor=south, shift={(0,-1.5)}, label={[mylabel, yshift=-1.05cm]{\parbox{5cm}{\centering Simulated expert human-human trajectories}}}] at (discriminator.south) (exp_trajs_1) {};
        
        \draw [->] (belief_module_boundary.-20) -- node[pos=0.45, above] {$\hat{b}^E_{\bm \theta}$} (discriminator);
        \draw [->] (exp_trajs_1.-175) -- coordinate[auto] node[pos=0.45, below] {$\X^E$} (belief_module_boundary.-34);
        %####################################################### End Exp Trajs #################################################################
        %####################################################### Robot Execution #################################################################  
        \node [minimum height=0.2cm, minimum width=2.5cm, inner sep=0pt, anchor=east, shift={(4.5,1)}, label={[mylabel, yshift=-0.05cm]Human-robot task execution}] at (generator.east) (detect-noop) {\includegraphics[width=.14\textwidth]{Figs/start-sort.png}};
        \node [minimum height=0.2cm, minimum width=2.5cm, inner sep=0pt, anchor=east, shift={(4.5,-0.4)}] at (generator.east) (detect-pick) {\includegraphics[width=.14\textwidth]{Figs/thumbsdown-place.png}};
        \node [minimum height=0.2cm, minimum width=2.5cm, inner sep=0pt, anchor=east, shift={(4.5,-1.8)}] at (generator.east) (detect-place) {\includegraphics[width=.14\textwidth]{Figs/fatigued-pick.png}};
        \node [minimum height=0.2cm, minimum width=2.5cm, inner sep=0pt, anchor=east, shift={(4.5,-3.2)}] at (generator.east) (inbin-detect) {\includegraphics[width=.14\textwidth]{Figs/thumbsup-placeinbin.png}};
        
        \draw [->] (exp_trajs_1.north) -- coordinate[auto] node[pos=0.45, right] {$\X^E$} (discriminator.south);
        \draw [decorate,decoration={brace,amplitude=10pt},xshift=0cm] (inbin-detect.south west) -- (detect-noop.north west) node [black,midway] (curly_brace) {};

        \draw [->] (generator) -- coordinate[auto] node[pos=0.45, below] {$\bm{\pi}$} ($(curly_brace.west)+(-0.2,0)$);           
    \end{tikzpicture}
    \caption{The \tbdAIRL{} architecture for a dyadic team with agents \(i\) and \(j\), and simulated human-human expert trajectories (\(\X^E\)). Agent \(i\)'s belief module uses agent \(j\)'s actions to generate belief states of \(j\)'s type, and vice versa, creating joint \(\hat{b}^E_{\bm \theta}\) trajectories. Similarly, \tbdPPO{} interacts with \tbdMDP{}, the type-based reward function \(R\), and the belief module to generate \(\langle \hat{\X}, \hat{b}_{\bm \theta} \rangle\). Both \(\langle \X^E, \hat{b}^E_{\bm \theta} \rangle\) and \(\langle \hat{\X}, \hat{b}_{\bm \theta} \rangle\) train the discriminator \(D_{\bm \alpha}\) to update \(R\) until convergence. The learned policy \(\bm{\pi}\) is then applied in HRC, where the robot follows its learned policy and the human continues to perform as previously in the demonstration. Here, \(h\) denotes the hidden state of the GRU and \(b\) denotes the normalized belief.
}
    \label{fig:TB-Dec-AIRL-architecture}
\end{figure*} 



%===============================================================================
\subsection{Algorithm}
\label{subsec:algorithm}


The \tbdAIRL{} algorithm (Algorithm~\ref{alg:main}) uses the new model $\mathcal{TB-DM}$ without the reward and transition functions (as \tbdAIRL{} is model-free) and the expert trajectories $\X^E$ (see Eq.~\ref{eqn:basic-irl-expert-trajs}) to learn the task's common reward function $R$. It starts by generating a random decentralized policy vector $\bm{\pi^L}$ with generator $G_{\bm \omega}$, loading the pre-trained belief module $B_{\bm \phi}$ with weights $\bm{\phi}$, and initializing the discriminator $D_{\bm{\alpha}}$ with random weights $\bm{\alpha}$ (line 1; see Fig.~\ref{fig:tbdAIRL-stochastic-computation-graph}). The expert belief trajectories $\hat{b}^E_{\bm \theta}$ are then obtained by passing $\X^E$ through $B_{\bm \phi}$ (see~\eqref{eqn:expert-belief-trajs}). This is equivalent to performing a belief update at each timestep as per Eq.~\ref{eqn:belief-update} using the expert state-action trajectories to obtain the expert's belief states at each timestep.

\begin{figure}[!ht]
    \centering      
    \begin{minipage}[b]{\columnwidth}
        \centering
        \begin{tikzpicture}[
            roundnode/.style={circle, draw=gray, fill=white, thick, minimum size=1mm},
            squarednode/.style={rectangle, draw=gray, fill=white, thick, minimum size=1mm},
            >=latex, % Common arrow tip
            node distance=0.25cm,
            every node/.style={font=\small},
            every label/.append style={font=\small},
            every path/.style={draw=gray} % Set the default color for every path to gray
        ]
        
        % Nodes
        \node[roundnode, inner sep=0pt, minimum size=7mm]      (xhat)           {$\hat{\X}, \hat{b}_{\bm \theta}$};
        \node[squarednode]    (xe) [above=0.25cm of xhat]                       {$\X^E , \hat{b}^E_{\bm \theta}$};
        \node[squarednode]    (D)  [above right=0.05cm and 1cm of xhat]         {$D^*(s, B_{\phi}(\theta), a)$};
        \node[squarednode]    (logD)   [right=of D]                             
        {$\log \left(\frac{D^* (\cdot, \cdot, \cdot)}{1 - D^* (\cdot, \cdot, \cdot)}\right)$};
        %D^* (\cdot, \cdot, \cdot) - \log(1 - D^* (\cdot, \cdot, \cdot))$};
        
        % Arrows
        \draw[->] (xhat) -- (D); %node[midway, above] {\footnotesize $\hat{\Y}_{\bm \theta}$};
        \draw[->] (xe) -- (D); %node[midway, above] {\footnotesize $\Y^E_{\bm \theta}$};
        \draw[->] (D) -- (logD);
        
        % Incoming arrows to xhat
        \draw[->] ($(xhat) + (-0.8,0.4)$) -- (xhat) node[at start, above, xshift=-5pt, yshift=-5pt] {\bm{$\phi$}};
        \draw[->] ($(xhat) + (-0.8,-0.3)$) -- (xhat) node[at start, below, xshift=-4pt, yshift=4pt] {\bm{$\omega$}};
        
        % Incoming arrows to xe
        \draw[->] ($(xe) + (-0.8,-0.4)$) -- (xe);
        % \draw[->] ($(xe) + (-0.65, 0.4)$) -- (xe)  node[at start, above, xshift=-1pt, yshift=-6pt] {\footnotesize $\X^E$};
        
        % Loop arrow from logD to xhat
        % \draw[->, thick] (logD.south) -- ++(0,-0.75) -| node[pos=0.25, above] {$R$} (xhat);
       \draw[->, thick] (logD.south) -- ++(0,0) |- node[pos=0.7, below] {$R$} (xhat.east);
        
        \end{tikzpicture}
        \caption{Stochastic computation graph for the expectation: $\mathbb{E}_{\X^E, \hat{b}^E_{\bm \theta}} \Bigl[ \log D^* \Bigr] - \mathbb{E}_{\bm{\pi},\hat{b}_{\bm \theta}} \Bigl[\log (1 - D^*)\Bigr]$ where $D^*$ represents the maximum of $D_{\bm \alpha}$. Notice that both the policy parameters (${\bm \omega}$) and the belief parameters ($\bm{\phi}$) influence the joint-state, joint-action trajectories and belief trajectories ($\hat{b}_{\bm \theta}$) through environment interaction. Circles represent stochastic nodes, rectangles represent deterministic nodes.}
        \label{fig:tbdAIRL-stochastic-computation-graph}
    \end{minipage}%
\end{figure}


The algorithm iterates through updates until training concludes (line 2). In each iteration, it generates joint trajectories $\langle \hat{\X}, \hat{b}_{\bm \theta} \rangle$ using the current policy vector $\bm{\pi^L}$ and belief module $B_{\bm \phi}$ (line 3). (State, belief-state, action)-tuples are then sampled from these trajectories and from $\langle \X^E, \hat{b}^E_{\bm \theta} \rangle$ (line 5). The discriminator is trained to distinguish between expert and learned samples using BCE loss (line 7). The updated reward $R$ is extracted from the trained discriminator (line 8). The generator $G_{\bm \omega}(R)$ is then trained with a centralized critic and decentralized actors using \tbdPPO{} to produce the policy rollout vector (lines 9--11). Finally, the learned reward function $R$ and the converged policy $\bm{\pi^L}$ are returned (line 12).


\begin{algorithm}[tbh!] 
\caption{\tbdAIRL{}}
\label{alg:main}
\SetKwInput{KwInput}{Input}                % Set the Input
\SetKwInput{KwOutput}{Output}              % set the Output
\DontPrintSemicolon
  
  \KwInput{$\mathcal{TB-DM}$ sans $R$ and $T$; Exp trajs $\X^E$ sans other agent types.}
  \KwOutput{Learned joint type-based reward function $R$.}
        
        Initialize generator ($G_{\bm \omega}$) with policy vector $\bm{\pi^L}$, Discriminator $D_{\bm{\alpha}}$, and pre-trained belief module $B_{\bm \phi}$. 
        
        \For{$iter \leftarrow 0$ \KwTo $train$\textunderscore $iters$}{

         Use $\bm{\pi^L}$ and $B_{\bm \phi}$ to step through the environment and generate joint trajectories $\langle \hat{\X}, \hat{b}_{\bm \theta} \rangle$

         Obtain expert state-action and belief trajectories tuple $\langle \X^E, \hat{b}^E_{\bm \theta} \rangle$ by passing $\X^E$ through $B_{\bm \phi}$

         Sample joint $(\bm{s}, b_{\bm \theta}, \bm{a}, \bm{s}', b_{\bm \theta}')$ tuples from $\langle \hat{\X}, \hat{b}_{\bm \theta} \rangle$ and $\langle \X^E, \hat{b}^E_{\bm \theta} \rangle$, respectively
         
         \For{$ep\gets0$ \KwTo $discriminator$\textunderscore $epochs$}{
            Train discriminator $D_{\bm{\alpha}}$ via BCE loss to classify $\langle \X^E, \hat{b}^E_{\bm \theta} \rangle$ from $\langle \hat{\X}, \hat{b}_{\bm \theta} \rangle$
            }
            
        Update reward: $R \leftarrow \log \bigl(D_{\bm{\alpha}}(...) / (1 - D_{\bm{\alpha}}(...))\bigr)$
        
        \For{$ep\gets0$ \KwTo $generator$\textunderscore $epochs$}{
            Train generator $G_{\bm \omega}(R)$ $\leftarrow$ TB-Dec-PPO.
            }
        
        Get updated policy $\bm{\pi^L} \leftarrow G_{\bm \omega}(R)$.
         }
    \Return{$R$, $\bm{\pi^L}$}
\end{algorithm}



%===============================================================================
\subsection{Theoretical Analysis}
\label{sec:theoretical-result}

The type transition kernel $F_i(\theta_j'|a_j,\theta_j)$ forms a Markov chain with state $\theta_j$ and edges guarded by $a_j$. Under the assumption that $F_i$ is irreducible and aperiodic, the type distribution $b_i$ given by Eq.~\ref{eqn:belief-update} will converge to a limiting distribution. Let the joint belief be $b^t({\bm \theta})=\prod_i b_i^t({\bm \theta_{-i}})$. %Eq.~\ref{eqn:belief-update} represents a classic Bayesian belief update where $F_i$ is the likelihood function. 
Then, after sufficient time $t$ elapses, $\mathcal{D}_{TV}(b^{t+1}, b^t)\le \delta\cdot\mathcal{D}_{TV}(b^t, b^{t-1})$ where $0\le \delta <1$ and $\mathcal{D}_{TV}$ denotes the total variation distance. 
%The type transition kernel ($F_i$ in Eq.~\ref{eqn:belief-update}) acts as a contraction mapping, i.e., $\exists~ 0\le \delta <1$ such that  This is justified as Bayesian belief propagation is convergent for the type of networks shown in Eq.~\ref{fig:tbdMDP-dependency-graph}. 
Then, the following  holds (proof is in the Appendix):
\begin{theorem}
    If the $i$-th agent's discriminator error compared to the $i$-th expert is small, that is, $\|D_i^t-D_i^E\| \le \epsilon~\forall i=1,\ldots,N$, then the difference in conditional log-likelihood (LL) of data is bounded:
    \[LL({\mathcal X}|R^E)-LL({\mathcal X}|R^t)\le \frac{8N\epsilon}{1-\gamma(1-\delta/2)},\]
    where $R^E$ and $R^t$ are the true (expert) and learned {\em common} reward functions at iteration $t$.
\end{theorem}

As adversarial inverse learning algorithms have a convergence rate of ${\mathcal O}\Bigl(\frac{1}{(1-\gamma)^3\sqrt{t}}\Bigr)$~\citep{guan2021will}, it follows that as $\epsilon$ decreases at that rate, the average error in log-likelihood approaches $0$.

% =================================================================================
\section{Experiments}
\label{sec:experiments}



We implemented \tbdAIRL{} in Python and evaluated its performance on a use-inspired human-robot collaborative onion sorting domain. Our implementation of the method is available at \url{https://github.com/thinclab/TB-Dec-AIRL}. The objective is to have a human and a UR$3$e cobot stand across from each other and sort onions on a line conveyor. In this collaborative produce sorting domain, the optimal sorting behavior involves quickly assessing each onion on the conveyor. If it is blemished, it should be picked up and discarded into a bin. If it appears unblemished, it should be picked up for a closer inspection. If it is still seen as unblemished, it is returned to the conveyor; otherwise, it is discarded into the bin. Both the human and the cobot operate in a shared workspace and work in a decentralized manner, while the cobot must adapt to changes in the human's sortation behavior due to fatigue.


\subsection{Simulation in MA-Gym}

The simulated environment for the collaborative sorting domain was developed as a discrete state-action domain in MA-Gym~\citep{magym} based on domain knowledge~\citep{sengadu2023dec}.\footnote{It is available for download at \url{https://github.com/prasuchit/ma-gym/tree/master/ma_gym/envs/dec_huro_sorting}.}
In this environment, each agent's state contains $5$ discrete variables: \emph{Onion location} (takes one of $4$ values based on the current onion location); \emph{End-effector location} (takes one of $4$ values based on the current end-effector location); \emph{Prediction} (takes one of $3$ values: blemished, unblemished, or unknown prediction label of the onion in focus); \emph{Self-type} (the subject agent's type); \emph{Indication} (true if the subject agent's type change has been communicated, otherwise false).


%(as is common practice). 

Each agent has $9$ discrete actions: \emph{No-op} (no operation), \emph{Detect} (choose any onion on the conveyor), \emph{Pick} (pick up the chosen onion from the conveyor), \emph{Detect-pick} (combined action to choose any onion on the conveyor and immediately pick it up), \emph{Inspect} (inspect the picked onion), \emph{PlaceOnConveyor} (place the held onion back on the conveyor), and \emph{PlaceInBin} (place the held onion in the discard bin). The last two actions for the human are \emph{Thumbs-up} (gesture indicating unfatigued type to the robot), and \emph{Thumbs-down} (indicating fatigued type to the robot). Similarly, the robot's last two actions are \emph{Speed-up} (increase movement speed to maintain throughput and enter industrial mode) and \emph{Slow-down} (reduce movement speed to return to regular collaborative mode). 

Expert trajectories were recorded using a hand-coded policy vector derived from observing real human-human team demonstrations, which was run in the MA-Gym environment. In the demonstration, one of the humans in the human-human team rests every so often and the other human speeds up during this time to maintain throughput. Otherwise, both humans sort the onions simultaneously as described previously.\footnote{These policies can be accessed at \url{https://github.com/thinclab/TB-Dec-AIRL/blob/main/utils/tb_sorting_simulated_policy.py}.} This allowed us to repeatedly generate a large number of trajectories from different start states. We used a total of $10^6$ timesteps (same for the baseline) and trained the methods for $10^9$ iterations. The rest of the hyperparameters and the training error are provided in the Appendix.

\begin{figure}[!ht]
%[tbh!]
    \centering
    \includegraphics[width=0.33\textwidth]{Figs/Belief_comparison.png}
    %\includegraphics{Figs/Belief_comparison.png}
    \caption{The receiver operating characteristic (ROC) plot comparing the pretrained GRU module's predictions, a classical Bayesian belief module's predictions, and the ground truth, for 100 episodes. 
    %The GRU model is well-trained as it performs near-identical to a classical belief update.
    }
    \label{fig:belief_comparison}
\end{figure}

\begin{figure*}[tbh!]
  \centering
  \begin{subfigure}{0.245\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Figs/Setup.png}
    \caption{}
    \label{fig:hrc-setup}
  \end{subfigure}
  \begin{subfigure}{0.37\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Figs/Thumbs_down.jpg}
    \caption{}
    \label{fig:thumbs-down}
  \end{subfigure}
  \begin{subfigure}{0.37\textwidth}
    \centering
    \includegraphics[width=\linewidth]{Figs/Thumbs_up.jpg}
    \caption{}
    \label{fig:thumbs-up}
  \end{subfigure}
  \caption{$(a)$ HRC sorting setup with a Realsense D435 behind the robot detecting objects on the conveyor and the OAK-D S2 camera to the left of the human monitoring their actions. $(b)$ and $(c)$ a human sorter signaling fatigue and recovery to the robot.}
  \label{fig:human-indications}
\end{figure*}

Each agent's belief update is represented using a pre-trained GRUcell, which is trained with ground-truth labels from the Gym environment and hand-coded policy actions. It learns to predict what the next type of the other agent could be based on the observed action, not {\em when} the type will transition.  We use tanh activation, 64 hidden nodes, Adam optimizer, and cross-entropy loss for each GRUcell. We use batch training until convergence with data collected from complete episodes. Each episode is reset after an arbitrary limit of 100 timesteps. As shown in Fig.~\ref{fig:belief_comparison}, upon convergence, the GRU model is near-identical to a classical Bayesian belief update. 



% A typical human-robot collaborative sorting episode proceeds as follows: Both agents start from a valid state sampled from the start-state distribution and begin sorting according to their respective policies. After a variable number of timesteps, the human's type transitions to a fatigued state. The human then stays idle to recover energy, signaling thumbs-down to the robot. Upon noticing this, the robot updates its belief about the human's type and queries its policy accordingly. This prompts the robot to perform a `Speed-up' action, switching to super-collaborative mode to rapidly pick and sort onions using the `Detect-pick' action. After some time, the human recovers, signals thumbs-up to the robot, and resumes sorting. The robot then updates its belief and returns to a slower, collaborative sorting mode using the ``Slow-down'' action. This process continues until all onions are sorted or a timeout is reached. Deviations from this behavior result in trajectories misaligned with the expert's preferences, yielding a lower reward from the discriminator. %Iteratively, \tbdAIRL{} learns a reward function that accurately reflects the expert's underlying preferences.





\noindent\textbf{Performance comparison with default.} We use a reward function for evaluation which assigns a $+1$ reward for successfully sorting an onion and a $-1$ penalty for incorrect sorting (e.g., placing a bad onion back on the conveyor). Our baseline is a decentralized policy vector learned using a previous method \dAIRL{}~\citep{sengadu2023dec}, which does not model types and therefore the cobot's policy is not type contingent. \dAIRL{} serves as an ablation of \tbdAIRL{}
for {\em disabled beliefs} and helps measure the benefit gained by \tbdAIRL{} from the belief component alone.

\begin{table}[ht!]
\caption{HRC performance comparison on MA-Gym simulation. Expert performance should be seen as an upper bound. Each episode lasts 100 timesteps.}
\label{table:hrc-sorting}
\centering
\renewcommand{\arraystretch}{1.2} % Adjust the row spacing as needed
\setlength{\tabcolsep}{4pt} % Adjust the column spacing as needed
\begin{tabular}{|c|c|c|} 
\hline
\multicolumn{3}{|c|}{Average of $1,000$ episodes} \\
\hline
{\bf Method} & {\bf Onions Sorted Per Eps} & {\bf Eps Reward} \\
\hline
%Upper Bound & & \\
Expert & $64 \pm 1$ & $64 \pm 1$ \\
\hline
\small{\textbf{\tbdAIRL{}}} & \bm{$56 \pm 2$} & \bm{$55.7 \pm 0.78$} \\
\hline
\dAIRL{} & $45 \pm 2$ & $43.2 \pm 0.65$ \\
\hline
\end{tabular}
\end{table}

As depicted in Table~\ref{table:hrc-sorting}, the decentralized policy vector learned by \tbdAIRL{} scores a significantly higher average episode reward compared to the decentralized policies from \dAIRL{} and performs closer to expert behavior. \tbdAIRL{} learns to use the belief over the human's type to adjust robot behavior accordingly. Notice that this adjusted behavior differs from the robot simply defaulting to a single sortation mode as in \dAIRL{}. The decision making learns that when the human is fatigued, fewer onions are sorted, thereby reducing the team's overall reward. This understanding makes the robot choose the {\em Speed-up} action, which in turn increases the team's throughput. Although both methods operate within the same Dec-MDP framework for HRC, the baseline policy relies solely on an agent's local state attributes to decide actions. In contrast, \tbdAIRL{}'s policy accounts for both the agent's local state and their belief about the other agent(s)' type. This enables the cobot to use faster ``industrial-mode'' actions at the appropriate times, leading to higher rewards. 


\subsection{Physical HRC Experiments}

Human processing behaviors may diverge slightly from those observed in the demonstration, and sim2real challenges exist for the cobot in this domain. Consequently, we validate the simulation results using physical human-cobot experiments with five different human sorters to account for any variability. To sort with the human, we utilize a Universal Robots UR3e 6-DOF cobot equipped with a Realsense D435 RGB-D camera for onion detection and an OAK-D S2 RGB-D camera for human action estimation (see Fig.~\ref{fig:human-indications}). The raw RGB frames from the Realsense camera are processed using a pre-trained object detection model YOLOv7, which generates bounding boxes around the onions on the conveyor. By combining these bounding boxes with their corresponding depth information, we compute the real-world 3D locations of the onions using rigid transforms and a pinhole camera model. Concurrently, we employ a hand-tracking method~\citep{zhang2020}, fine-tuned for our application, on the RGB input from the OAK-D camera. This output is similarly processed to determine the 3D location of the human hand to assess the human's actions. Whereas \tbdAIRL{} yields a policy for each agent, only the cobot uses one of the type-contingent policies in the vector to control its behavior. This learned policy, exported to a CSV file, is loaded into a finite-state machine which is used to control the cobot via ROS Noetic.
%according to the policy.
%at each timestep. 
%The cobot monitors the human's actions to assess their fatigue and adjusts its behavior to optimize sorting efficiency and throughput. 
%After the fatigued human signals their condition with a thumbs-down gesture, they remain idle. Upon recognizing this change, the cobot shifts to a `super-collaborative' mode, increasing its speed to sort more onions. After a few timesteps, the human indicates their recovery with a thumbs-up gesture and resumes sorting. The robot then updates its belief of the human's type and returns to its regular `collaborative' mode.

\begin{figure*}[th!]
\centerline{
    \begin{subfigure}{0.3\textwidth}
      \includegraphics[width=\linewidth]{Figs/start-sort.png}
      \caption{}
      \label{fig:start-sort}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
      \includegraphics[width=\linewidth]{Figs/place-inspect.png}
      \caption{}
      \label{fig:place-inspect}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
      \centering
      \includegraphics[width=\linewidth]{Figs/thumbsdown-place.png}
      \caption{}
      \label{fig:thumbsdown-place}
    \end{subfigure}}
\centerline{
    \begin{subfigure}{0.3\textwidth}
      \includegraphics[width=\linewidth]{Figs/fatigued-pick.png}
      \caption{}
      \label{fig:fatigued-pick}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
      \includegraphics[width=\linewidth]{Figs/thumbsup-placeinbin.png}
      \caption{}
      \label{fig:thumbsup-placeinbin}
    \end{subfigure}
    \begin{subfigure}{0.3\textwidth}
      \includegraphics[width=\linewidth]{Figs/unfatigued-pick.png}
      \caption{}
      \label{fig:finish-sort}
    \end{subfigure}}
    \caption{Key frames from a human-robot collaborative sort. In Fig.~\ref{fig:start-sort}, the human and cobot begin sorting with the human inspecting an unblemished onion while the cobot attempts to pick an onion. Fig.~\ref{fig:place-inspect} captures the human placing an onion back on the conveyor while the cobot inspects the picked onion. In Fig.~\ref{fig:thumbsdown-place}, the human indicates fatigue with a thumbs-down gesture to the OAK-D camera. The cobot responds by entering industrial mode in Fig.~\ref{fig:fatigued-pick}, moving faster while the human performs NoOp. Finally, in Fig.~\ref{fig:thumbsup-placeinbin}, the human signals recovery with a thumbs-up gesture, and both agents collaboratively complete the sort in Fig.~\ref{fig:finish-sort}.}
    \label{fig:TB-HRC-sort}
\end{figure*}

\noindent \textbf{Validation of simulation results.} Five university students (not involved in this paper) played the role of the human sorter in the collaboration with small natural variability, and engaged in six rounds of onion sorting each across from the cobot. The team was required to sort fifteen randomly placed onions on a conveyor in each round while the human indicates fatigue no more than two times. During the first three rounds, the cobot followed the policy learned from \tbdAIRL{}, and for the remaining three rounds, it adhered to the \dAIRL{} policy. All trials were successful with no cobot failure in any round (an example sortation is shown in Fig.~\ref{fig:TB-HRC-sort}). In Fig.~\ref{fig:hrc_study_plot}, we show the average time taken per round, under the two conditions of the cobot using the \tbdAIRL{} policy and the baseline policy. The HRC where the cobot is aware of human factors and adapts accordingly took significantly less time to sort the onions compared to one which is not cognizant.
%We recorded the number of onions sorted and the time taken for each round, comparing these results with those from the baseline policy (the ablation study), as shown in \cref{fig:hrc_study_plot}. Clearly, \tbdAIRL{} policy significantly improves upon the baseline.

The human sorter held his or her thumb up or thumb down until it was recognized by the
hand-tracking application. This recognition was not instantaneous and the humans held their thumbs for
slightly varying amounts of time across the 6 rounds. Indeed, this variability in gesture recognition is responsible in part for the variability of the sortation times of the five human-robot teams shown in Fig.~\ref{fig:hrc_study_plot}. However, the two gestures (thumb-up and thumb-down) are quite distinct and the hand-tracking application did not make any mistakes in distinguishing them.


\begin{figure}[!ht]
    \centering
    \includegraphics[width=.9\linewidth]{Figs/HRC_study_plot.png}
    \caption{Average collaborative sort time for 15 onions with a UR$3$e cobot. Each human-robot team performed 3 rounds with the \tbdAIRL{} policy and 3 rounds with the \dAIRL{} policy.}
    \label{fig:hrc_study_plot}
\end{figure}

% \begin{table}[tbh!]
% \caption{\bf Average time taken to sort 300 onions}
% \label{table:avg_time_hrc}
% \setlength{\tabcolsep}{3pt}
% \centering
% \begin{small}
% \begin{tabular}{|c | c | c |} 
% \hline
%  Method & No. onions sorted & Time (mins) \\
%  \hline\hline
%  Human-human & $300$ & $6.72 \pm 0.34$ \\
%  \small{Human-cobot (Dec-AIRL)} & $300$ & $46.583 \pm 0.26$ \\
%  \small{\bf{Human-cobot (TB-Dec-AIRL)}} & \bm{$300$} & \bm{$44.416 \pm 0.12$} \\
%  % 4.8 secs avg per onion for our method 
%  \hline
% \end{tabular}
% \end{small}
% \end{table}

%===============================================================================
\section{Related work}
\label{sec:related_work}

Modeling teammates to enhance decision making is common in fields such as game theory, multiagent planning, and multiagent reinforcement learning. Although previous work in {\em teammate modeling} has had success in toy simulated environments, e.g., SMAC~\citep{samvelyan2019starcraft} and level-based foraging~\citep{albrecht2015game}, the body of research in this area is limited.
% One early work~\citep{albrecht2015game} introduced a Harsanyi-Bellman Ad Hoc Coordination strategy for Bayesian Games~\citep{shoham2008multiagent}, employing user-defined types in a planning framework to optimize actions based on teammates' strategies. 
A prominent approach to modeling other agents is the interactive POMDP (I-POMDP)~\citep{gmytrasiewicz2005framework}, which recursively updates beliefs about other agents' types while solving for their policies. A key distinction of I-POMDPs and game-theoretic models from ours is that the former are often tailored for non-cooperative contexts rather than collaborative systems with shared goals.

\citet{unhelkar2020effective} uses an agent Markov model (AMM) to capture an agent’s mental states, allowing learners to infer the AMM through variational Bayesian inference. 
%However, their approach does not focus on developing behavioral policies for efficient collaboration with other agents. 
Following up on this, the recent work of~\citet{seo2024idil} learns the expert's policy and hidden intent (both part of the AMM) using expectation-maximization and IQ-Learn~\citep{garg2021iq}. Unlike our multiagent collaborative HRC scenario, this method models the human using a single-agent MDP and assumes that the human's type does not change. 

\citet{nikolaidis2015efficient} treats different expert behaviors as partially observable variables and uses expectation-maximization to cluster demonstrations, leading to different reward functions for each expert type through single-agent IRL. However, the model assumes that agent types remain constant throughout task execution, whereas our scenario accounts for {\em dynamic} types.
%changing mid-task due to latent factors, allowing us to learn a common reward function and a vector of policies for each agent.
%
Another prior thread uses a bounded-memory model combined with a mixed-observability MDP to estimate human adaptability~\citep{nikolaidis2015improved, nikolaidis2017mathematical, nikolaidis2017human}, enabling the robot to adjust its actions accordingly. In contrast, \tbdMDP{} adopts an objective perspective to capture the nuances of collaboration, allowing agents to adjust their behaviors as needed, to maximize team rewards. \citet{peternel2018robot} develops a method to adapt the robot's physical behavior online to account for human motor fatigue, using techniques like dynamical movement primitives and adaptive frequency oscillators.

Another prior technique uses the Trust-POMDP model~\citep{chen2020trust}, which maintains a belief over trust as a latent variable, allowing the robot to adapt its policy in response to human interruptions based on their trust in the robot's abilities. This work aligns more closely with active learning~\citep{settles2009active} than with traditional HRC. Recent studies by~\citet{yuan2022multi} and~\citet{jiang2024multi} apply variational inference models with mutual information maximization in decentralized settings to learn a random variable \( z \) that informs each agent about others. These works are similar to~\citet{wang2022co}, in that they assume a shared latent strategy space that remains constant throughout execution, whereas the \tbdMDP{} allows for diverse and dynamic agent types during task execution.

More recently, \citet{van2024simultaneously} focuses on learning both the human's hidden intent and task preferences for effective human-robot collaboration using maximum entropy IRL. They employ a multi-agent MDP model akin to~\citet{wang2022co} to derive the marginalized policy, transition function, and reward function necessary for the robot to maintain a belief over human intent. This uses a centralized framework where all agents are aware of the joint state, and hence is not applicable to realistic scenarios that tend to be naturally decentralized. \citet{prasad2024moveint} utilizes a mixture of Gaussians to model the interactions between humans and robots and to learn the latent space of robot actions. However, the policy is conditioned solely on human movements, meaning the robot reacts to human actions rather than actively collaborating to solve tasks as part of a team. 

%Ultimately, the goal of HRC is to create human-centric solutions that prioritize the time and energy of the human. Techniques that do not validate their results on physical systems may not be applicable in real-world HRC contexts, even if they tackle related concepts.

%===============================================================================
\section{Concluding remarks}
\label{sec:conclusion}

Motivated by the understanding that human behavior changes over time due to latent decision-making factors, we introduced a novel multi-agent model and an associated IRL method. This approach learns from human-human team demonstrations, enabling a collaborative robot (cobot) to work effectively with a human by accommodating changes in the human's type, thereby enhancing collaboration within a shared workspace. Supported by a conditional bound on performance loss, our experimental results indicate that \tbdAIRL{} outperforms a previous decentralized IRL method in which the cobot does not reason about human types. Although \tbdMDP{} is designed for HRC, it generally applies to any multiagent decision making in similar problem settings. \tbdAIRL{} brings robot learning to HRC and its deployment a step closer to real-world collaborative environments.

\noindent \textbf{Limitations.} 
%\tbdAIRL{} and its underlying model, \tbdMDP{}, offer enhanced flexibility for modeling realistic HRC scenarios. However, we 
We assume that expert trajectories are available as state-action pairs (a common assumption in IRL). While the task attributes of the state can be captured using sensors like RGB-D cameras, certain hidden attributes such as the agent's own type are mental states. We leveraged simulated human-human trajectories to seamlessly generate expert trajectories. However, future work may reframe the state definition to attributes that can be fully captured through observable traits, and investigate the impact (e.g., on throughput) of errors and noise in observed actions that could lead to poor belief estimation of others' types. Future studies could also test on larger teams and accommodate asynchronous, durative actions, e.g., using a variant of the Dec-semi-MDP~\citep{sutton1999between, goldman2008communication} model.

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option

This research was supported in part by NSF grant \#IIS-1830421, by a grant from the Georgia Research Alliance, and by an intramural grant from University of Georgia's Precision Agriculture Institute, to PD. All opinions expressed in this paper are those of the authors alone and do not reflect on the sponsors. We would also like to acknowledge useful discussions with Vaibhav Unhelkar from Rice University and constructive feedback from the anonymous reviewers, which have substantially improved this paper. 
    
\end{acknowledgements}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%%% The next two lines define, first, the bibliography style to be 
%%% applied, and, second, the bibliography file to be used.

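% NOTE: the bibliography style is already set to plainnat in the preamble.
% A second \bibliographystyle makes BibTeX report "Illegal, another \bibstyle
% command" and ignore it, so this duplicate is disabled.
%\bibliographystyle{ACM-Reference-Format} 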
\bibliography{BIB/references2}  % .bib

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\newpage
\clearpage

\section*{Appendix}

\subsection*{Proof of Theorem 1}

\begin{proof}

We use max-norms $\|\cdot\|$ over the sets of states, actions and next states. 
% \begin{assumption}
% As belief update is approximated by a neural network, we assume that the belief update progresses toward the true belief at a certain rate that is not unboundedly large but also not too small. 
%  \[\exists0<\delta<\infty, ~s.t.~|D_{KL}(b^\ast(.|h^t)\|b^t(.|h^t))-D_{KL}(b^\ast(.|h^{t+1})\|b^{t+1}(.|h^{t+1}))|\le\log(1+\delta),\]
%  where $b^t(.|h^t)$ and $b^{t+1}(.|h^{t+1})$ are the type-beliefs after the step-$t$ and step-$t+1$ updates given the corresponding observation histories, $h^t$ and $h^{t+1}$. 
% \end{assumption}
% The assumption implies $\left|\sum_\theta b^\ast\log\frac{b^{t+1}}{b^t}\right|\le\log(1+\delta)$$. 
We refer to joint beliefs $b^t(\ldots)=\prod_i b_i^t(\ldots)$. As in Dec-AIRL, we distinguish interactive states ($S_I$) from non-interactive states ($S_{NI}$) where the reward function is given as:
\begin{eqnarray}
R(s,\mathbf{a},s')=\left\{
\begin{aligned}
    &R(s,\mathbf{a},s'), \text{ if }s\in S_I\\
    &\sum_i R_i(s_i, a_i, s'_i),\text{ if }s\in S_{NI}
\end{aligned}
\right.
\end{eqnarray}

Type transition kernel $F_i(\theta_j'|a_j,\theta_j)$ forms a Markov chain with state $\theta_j$ and edges guarded by $a_j$. Under the assumption that $F_i$ is irreducible and aperiodic, the type distribution $b_i$ given by Eq.~\ref{eqn:belief-update} will converge to a limiting distribution. Let the joint beliefs be $b^t({\bm \theta})=\prod_i b_i^t({\bm \theta_{-i}})$. %Eq.~\ref{eqn:belief-update} represents a classic Bayesian belief update where $F_i$ is the likelihood function. 
Then, after sufficient time $t$ elapses, 
\begin{equation}
    \mathcal{D}_{TV}(b^{t+1}, b^t)\le \delta\cdot\mathcal{D}_{TV}(b^t, b^{t-1})
    \label{eqn:assume}
\end{equation}    
where $0\le \delta <1$ and $\mathcal{D}_{TV}$ denotes the total variation distance. 

First, we note the definition and upper-bound of ${\cal D}_{TV}(b^t,b^{t-1})=\frac{1}{2}\sum_\theta|b^t(\theta|h^{t-1})-b^{t-1}(\theta|h^{t-2})|\le 1$, and the fact that if ${\cal D}_{TV}(P,Q)\le\epsilon$ then for any event $A$, $P(A)\ge Q(A)(1-\epsilon/2)$.
Applying these to the property of belief updates (Eq.~\ref{eqn:assume}) we get
\begin{align*}
& b^{t+1}(\theta|h^t)\ge b^t(\theta|h^{t-1})(1-\delta{\cal D}_{TV}(b^t,b^{t-1})/2)\\
& \ge b^t(\theta|h^{t-1})(1-\delta/2)
\end{align*}
Therefore, $\frac{b^{t+1}(\theta|\ldots)}{b^t(\theta|\ldots)}\ge (1-\frac{\delta}{2}),\forall\theta, t$ when $t$ is sufficiently large. Note that this applies to both the expert and generated trajectories. Now, the estimate of the joint reward in non-interactive (NI) states is 
\begin{align*}
    R^t &=\sum_i R_i^t\\
    &=\sum_i\log\left(\frac{D_i^t}{1-D_i^t} \right)
\end{align*}
Similarly, the expert rewards in non-interactive states are $R^E=\sum_i\log\left(\frac{D_i^E}{1-D_i^E} \right)$. Now, the $i$-th agent's discriminator error is $\|D_i^t-D_i^E\|\le \epsilon$, and the $i$-th expert's discriminator value~\footnote{An expert's discriminator attempts to distinguish between expert trajectories and those produced by the expert's policy, which are indistinguishable.} is 0.5, therefore
\begin{align}
    \|R^t-R^E\| &= \|\sum_i\log\left(\frac{D_i^t}{1-D_i^t}\right)-\sum_i\log\left(\frac{D_i^E}{1-D_i^E} \right)\|\nonumber\\
    &=\|\sum_i\log\left(\frac{D_i^t}{D_i^E}\right)-\sum_i\log\left(\frac{1-D_i^t}{1-D_i^E} \right)\|\nonumber\\
    &\le \left|\sum_i\log\left(\frac{1/2+\epsilon}{1/2-\epsilon}\right)-\sum_i\log\left(\frac{1/2-\epsilon}{1/2+\epsilon} \right)\right|\nonumber\\
    &=2\sum_i\log\left(\frac{1/2+\epsilon}{1/2-\epsilon}\right)\nonumber\\
    &\le 8N\epsilon\label{eqn:r-bound}
\end{align}
when $\epsilon$ is small enough and $N$ is the number of agents. For interactive states, the same arguments apply to the joint discriminator, but the bound corresponding to Eq.~\ref{eqn:r-bound} would just be $8\epsilon$. Therefore, we use Eq.~\ref{eqn:r-bound} as the dominant form of this bound.

The log-likelihood of a trajectory ($\X$) given the reward estimate $R^t$ is 
\begin{small}
\begin{align*}
    \log P(\X|R^t) &\propto \sum_t\gamma^t\sum_\theta R^t(s^t,a^t,\theta)b^t(\theta|h^t)\\
    &\ge\sum_t(\gamma(1-\delta/2))^t\sum_\theta R^t(\ldots,\theta) \rho_1(\theta)\\
    &\ge \sum_t(\gamma(1-\delta/2))^t\sum_\theta \left|R^E(\ldots,\theta)-8N\epsilon\right| \rho_1(\theta)\\
    &\ge \log P(\X|R^E) - \sum_t (\gamma(1-\delta/2))^t 8N\epsilon
\end{align*}
\end{small}
Here, $h^t$ is the history of state-action pairs preceding step $t$ and $\rho_1$ is the initial belief. Thus, writing $LL(\X|R)$ for the log-likelihood $\log P(\X|R)$, 
\begin{align*}
    LL(\X|R^E) - LL(\X|R^t) &\le \sum_t (\gamma(1-\delta/2))^t 8N\epsilon\\
    &\le \frac{8N\epsilon}{1-\gamma(1-\delta/2)}
\end{align*}

% Accumulating this over a set of trajectories $\mathcal{X}$, we get 
% \[LL(\mathcal{X}|R^E) - LL(\mathcal{X}|R^t)\le \frac{8|\mathcal{X}|N\epsilon}{1-\gamma(1-\delta/2)}.\]
% Assuming identical priors, $P(R^t)=P(R^E)$, this also gives $LL(R^E|\mathcal{X}) - LL(R^t| \mathcal{X})\le \frac{8|\mathcal{X}|N\epsilon}{1-\gamma(1-\delta/2)}$.
\end{proof}

\subsection*{Hyperparameters of the methods}

For both \tbdAIRL{} and \dAIRL{}, we used $256$ hidden nodes in each of the 2 hidden layers of both the $g$ and $h$ networks within the discriminator, a batch size of 128, a learning rate of 0.0003 for both the discriminator and the actors, a discount factor and a generalized advantage estimation (GAE) parameter of 0.95 each, a max gradient norm of 0.5, and a random seed between 1 and 100. For Dec-PPO, we used the same hyperparameters as mentioned in gSDE~\citep{raffin2022smooth} because they have been tuned and tested on multiple PyBullet environments. During our training, we observed discriminator errors of about 14\% at termination for \tbdAIRL{}. 

\end{document}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

