\documentclass[accepted]{uai2023}
\newcounter{savecntr}% Save footnote counter
\newcounter{restorecntr}% Restore footnote counter

\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{bm}
\usepackage{amssymb,amsmath,amsthm}



\newtheorem{theorem}{Theorem}


%added by zhangzq
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage[switch]{lineno}
\usepackage{diagbox}




%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \usepackage[pdftex,linkcolor=blue,citecolor=blue,backref=page]{hyperref}


\usepackage{amssymb}
\usepackage{comment}
\usepackage{subfigure}
%\usepackage[table]{xcolor}
\usepackage{xcolor}
\definecolor{lightgray}{gray}{0.893}
\usepackage{colortbl}
%\usepackage{subcaption}
 \renewcommand{\thefootnote}{\fnsymbol{footnote}} 

\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Fast Teammate Adaptation in the Presence of Sudden Policy Change}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\renewcommand{\thefootnote}{\fnsymbol{footnote}}
\author{%
  Ziqian Zhang$^1$\thanks{The first two authors contributed equally.}, Lei Yuan$^{1, 2*}$, Lihe Li$^1$, Ke Xue$^1$, Chengxing Jia$^{1, 2}$, Cong Guan$^1$, Chao Qian$^1$, {Yang Yu}$^{1,2}$\thanks{Corresponding Author}\\
  $^1$ National Key Laboratory for Novel Software Technology, Nanjing University\\
  $^2$ Polixir Technologies\\
  %\texttt{guanc@lamda.nju.edu.cn},\\
%   \texttt{chenf@smail.nju.edu.cn},\\
 \texttt{191240076@smail.nju.edu.cn},
  \texttt{\{yuanl, xuek, jiacx, guanc\}@lamda.nju.edu.cn},\\
  \texttt{lilhzq76@gmail.com}, \texttt{\{qianc, yuy\}@nju.edu.cn}
}

  
  \begin{document}
\maketitle

\begin{abstract}
  Cooperative multi-agent reinforcement learning (MARL), where agents coordinate with teammate(s) for a shared goal, may suffer from non-stationarity caused by the policy change of teammates.
  Prior works mainly concentrate on the policy change across episodes,
  ignoring the fact that teammates may suffer from sudden policy change within an episode, which might lead to miscoordination and poor performance. We formulate the problem as an open Dec-POMDP, where we control some agents to coordinate with uncontrolled teammates, whose policies could be changed within one episode. Then we develop a new framework \textit{\textbf{Fas}t \textbf{t}eammates \textbf{a}da\textbf{p}tation (\textbf{Fastap})} to address the problem. Concretely, we first train versatile teammates' policies and assign them to different clusters via the Chinese Restaurant Process (CRP). Then, we train the controlled agent(s) to coordinate with the sampled uncontrolled teammates by capturing their identifications as context for fast adaptation. Finally, each agent applies its local information to anticipate the teammates' context for decision-making accordingly. This process proceeds alternately, leading to a robust policy that can adapt to any teammates during the decentralized execution phase. We show in multiple multi-agent benchmarks that Fastap outperforms multiple baselines in stationary and non-stationary scenarios.
  

\end{abstract}

\section{Introduction}\label{sec:intro}

Cooperative Multi-agent Reinforcement Learning (MARL) has shown great promise in recent years, where multiple agents coordinate to complete a specific task with a shared goal~\citep{oroojlooy2022review}, achieving great progress in various domains (e.g., path finding~\citep{sartoretti2019primal},  active voltage control~\citep{DBLP:conf/nips/WangXGSG21}, and dynamic algorithm configuration~\citep{xue2022multiagent}). Various methods emerge as promising solutions, including policy-based ones~\citep{maddpg,mappo}, value-based series~\citep{vdn,qmix}, and many variants like transformer~\citep{wen2022multiagent}, showing remarkable coordination ability in a wide range of tasks like StarCraft multi-agent challenge (SMAC), Google Research Football (GRF)~\citep{gorsane2022towards}, etc. % xuek 建议这里把 like StarCraft multi-agent challenge (SMAC), Google Research Football (GRF)~\citep{gorsane2022towards}, etc. 删了, 一个是之前刚说过领域广, 这里又介绍; 另一个是etc. 感觉出现的太多了 
Other works investigate different aspects, including  communication among agents~\citep{zhu2022survey}, model learning~\citep{wang2022model}, policy robustness~\citep{guo2022towards}, ad hoc teamwork~\citep{mirsky2022survey}, etc. 

However, one issue that can arise in MARL is non-stationarity~\citep{papoudakis2019dealing} caused by changes in teammates' policies.
Non-stationarity is a hazardous issue for reinforcement learning, either in single-agent reinforcement learning (SARL)~\citep{Padakandla2019ReinforcementLA} or MARL~\citep{papoudakis2019dealing} settings, where the environment dynamics (e.g., transition or reward functions) of a learning system may change over time (inter- or intra-episode).
Many solutions have been developed in SARL to relieve this problem, %by capturing the environment-changing points
including meta-reinforcement learning~\citep{beck2023survey}, strategic retreat~\citep{DBLP:conf/case/DastiderL22}, sticky Hierarchical Dirichlet Process (HDP) prior~\citep{DBLP:conf/iclr/RenSJSWB22}, etc. 
Non-stationarity in MARL is, however, much more complex, as we should consider the policy changes caused by multiple teammates rather than the single change of environment dynamics in SARL.
The majority of works in MARL mainly focus on the non-stationarity during the training phase~\citep{DBLP:journals/ai/AlbrechtS18,DBLP:conf/icml/Kim0RSAHLTH21}, the teammates' policy change across episodes~\citep{qin2022multi,DBLP:conf/icml/HuLPF20}, or when perturbations happen~\citep{guo2022towards} (see related work in App.~A). However, to the best of our knowledge, the sudden policy change of teammates within an episode at deployment time has never been explored, neither in problem formulation nor in efficient algorithm design. Ignoring this issue would result in policy shift and even catastrophic miscoordination, as agents' policies depend on other teammates in MARL~\citep{zhang2021multi}. On the other hand, the successful approaches used in SARL are unsuitable for the MARL setting because of MARL's inherent characteristics (e.g., partial observability). This raises the question: Can we acquire a robust policy that can handle such changes and adapt to the new teammates' policies rapidly?
 
 
In this work, we aim to develop a robust coordination policy for the mentioned issue. 
Concretely, we formulate the problem as an Open Dec-POMDP, where we control multiple agents to coordinate with some uncontrolled teammates, whose policies could be altered unpredictably within one episode. Subsequently, we develop a new training framework Fastap, with which an agent can anticipate the teammates' identification via its local information. Specifically, as similar teammates might possess similarities in their identifications, learning a specific context for each teammate but ignoring the relationships among them could lead to trivial encodings. 
We thus assign them to different clusters via the Chinese Restaurant Process (CRP) to shrink the context search space. For the controlled coordinating policy training, we sample representative teammates to coordinate with by capturing their identifications into distinguishing contexts to augment the joint policy during the centralized training phase. Each agent then utilizes its local information to approximate the global context information. The mentioned processes proceed alternately, and we can finally obtain a robust policy to adapt to any teammates gradually during the decentralized execution phase.


For evaluation, we conduct experiments on different MARL benchmarks where the teammates' policies alter within one episode, including level-based foraging (LBF)~\citep{lbf}, Predator-prey (PP), Cooperative navigation (CN) from MPE~\citep{maddpg}, and a map created from StarCraft Multi-Agent Challenge (SMAC)~\citep{pymarl}. Experimental results show that the proposed Fastap can cluster teammates into distinguishable groups, learn meaningful context to capture teammates' identification, and achieve outstanding performance in stationary and non-stationary scenarios compared with multiple baselines.




\begin{figure*}
  \centering
  \includegraphics[width=0.99\textwidth]{Figures/flow.pdf}
  \caption{The overall framework of Fastap.}
  \label{Structure}
\end{figure*}
\section{Related Work} 
\paragraph{Cooperative Multi-agent Reinforcement Learning}
Many real-world problems are made up of multiple interactive agents, which could usually be modeled as a multi-agent system~\citep{DBLP:journals/access/DorriKJ18}. Among the multitudinous solutions, Multi-Agent Reinforcement Learning (MARL)~\citep{zhang2021multi} has achieved great success, profiting from the powerful problem-solving ability of deep reinforcement learning~\citep{Wang2020DeepRL}. Further, when the agents hold a shared goal, this problem is referred to as cooperative MARL~\citep{oroojlooy2022review}, showing great progress in diverse domains like path finding~\citep{sartoretti2019primal}, active voltage control~\citep{DBLP:conf/nips/WangXGSG21}, and dynamic algorithm configuration~\citep{xue2022multiagent}, etc. Many methods are proposed to facilitate coordination among agents, including policy-based ones (e.g., MADDPG~\citep{maddpg}, MAPPO~\citep{mappo}), value-based series like VDN~\citep{vdn}, QMIX~\citep{qmix}, or other techniques like transformer~\citep{wen2022multiagent} and many variants~\citep{gorsane2022towards}, demonstrating remarkable coordination ability in a wide range of tasks like
SMAC~\citep{pymarl}, Hanabi~\citep{mappo}, GRF~\citep{wen2022multiagent}. 
Besides the mentioned approaches and the corresponding variants, many other methods are also proposed to investigate the cooperative MARL from other aspects, including causal inference among agents~\citep{grimbly2021causal}, policy deployment in an offline way for real-world application~\citep{DBLP:conf/nips/YangMLZZHYZ21},
 communication~\citep{zhu2022survey} for partial observability, model learning for sample efficiency improvement~\citep{wang2022model}, policy robustness when perturbations occur~\citep{guo2022towards,ma3c}, training paradigm like CTDE (centralized training with decentralized execution)~\citep{DBLP:conf/atal/LyuXDA21}, testbed design for continual coordination validation~\citep{DBLP:conf/icml/NekoeiBCC21}, and ad hoc teamwork~\citep{mirsky2022survey}, offline learning in MARL~\cite{guan2023efficient,zhang2023discovering}, etc. 


\textbf{Non-stationarity} is a longstanding topic in single-agent reinforcement learning (SARL)~\citep{Padakandla2019ReinforcementLA,Padakandla2020ASO}, where the environment dynamics (e.g., transition and reward functions) of a learning system may change over time. For SARL, most existing works focus on inter-episode non-stationarity, where decision
processes are non-stationary across episodes, including multi-task setting~\citep{VithayathilVarghese2020ASO}, continual reinforcement learning~\citep{DBLP:journals/jair/KhetarpalRRP22}, meta reinforcement learning~\citep{beck2023survey}, etc., these problems can be formulated as a contextual MDP~\citep{hallak2015contextual}, and could be solved by techniques like task embeddings learning. Other works also consider intra-episode non-stationarity, where an agent may suffer from dynamic drifting within one single episode~\citep{DBLP:conf/rss/KumarFPM21,DBLP:conf/iclr/RenSJSWB22,chen2022an,escp,DBLP:conf/case/DastiderL22,feng2022factored}. %Specifically, HDP-C-MDP~\citep{DBLP:conf/iclr/RenSJSWB22} assumes the latent context to be finite and Markovian, and adapts a sticky Hierarchical Dirichlet Process (HDP) prior for model learning; while FANS-RL~\citep{feng2022factored} assumes the latent context is Markovian and the environment can be modeled as a factored MDP; 
ESCP~\citep{escp} considers the sudden changes one agent may encounter and obtains a robust policy via learning an auxiliary context recognition model. Experiments show that in environments with both in-distribution and out-of-distribution
parameter changes, ESCP can not only better recover the environment encoding, but also adapt more rapidly to the post-change environment.% SeCBAD~\citep{chen2022an} further assumes the environment context usually stays stable for a stochastic period and then changes in an abrupt and unpredictable manner. Linda~\cite{cao2021linda} learns to decompose local information and build awareness for each teammate, which promotes coordination ability in multiple environments.

Different from the SARL setting, non-stationarity is an inherent challenge for MARL, as the agent's policy may become unstable due to the concurrent learning of multiple policies of other agents~\citep{papoudakis2019dealing}. %Previous works mainly focus on solving the non-stationary in the training phase, using techniques like agent modeling~\citep{DBLP:journals/ai/AlbrechtS18}, meta policy adaptation~\citep{DBLP:conf/icml/Kim0RSAHLTH21}, experience sharing~\citep{DBLP:conf/nips/ChristianosSA20}.
Previous works also concentrate on non-stationarity across episodes, and have focused on solving the problem in the training phase using techniques such as multi-task training~\citep{qin2022multi},  training policy for zero-shot coordination~\citep{DBLP:conf/icml/HuLPF20}. 
Despite the progress made, they do not address non-stationarity caused by teammates' sudden policy changes, which is a crucial and urgent need. As for the open MARL, our work takes a different perspective by emphasizing the general coordination and fast adaptation ability of learned controllable agents in the context of MARL.

\textbf{Open Multi-agent System} considers the problem where agents may join or leave while the process is ongoing, causing the system's composition and size to evolve over time~\citep{hendrickx2017open}. In previous works, the multi-agent problem has mainly been modeled for planning, resulting in various problem formulations such as Open Dec-POMDP~\citep{Cohen2017OpenDP}, Team-POMDP~\citep{Cohen2018MonteCarloPF,Cohen2019PowerIF}, I-POMDP-Lite~\citep{Chandrasekaran2016IndividualPI,Eck2019ScalableDP}, CI-POMDP~\citep{Kakarlapudi2022DecisiontheoreticPW}, and others. Recently, some works consider the open multi-agent reinforcement learning problems. GPL~\citep{rahman2021towards} formulates the Open Ad-hoc Teamwork as OSBG and assumes global observability for efficiency, which may be hard to achieve in the real world. Additionally, it uses a GNN-based method that works only in the single-controllable-agent setting and is not scalable enough to be extended to the setting with multiple controllable agents. ROMANCE~\citep{romance} models the policy perturbation issue that arises when testing in a different environment as a limited policy adversary Dec-POMDP (LPA-Dec-POMDP), and then proposes \textbf{Ro}bust \textbf{M}ulti-\textbf{A}ge\textbf{n}t \textbf{C}oordination via \textbf{E}volutionary Generation of Auxiliary Adversarial Attackers (ROMANCE), which enables the trained policy to encounter diversified and strong auxiliary adversarial attacks during training, thus achieving high robustness under various policy perturbations.

 


\section{Problem Formulation} 
The aim of this work is to train multiple controllable agents to interact with other teammates that might suddenly change their policies at any time step within one episode. Therefore we formalize the problem by extending the framework of Dec-POMDP~\citep{oliehoek2016concise} to an Open Dec-POMDP $\mathcal{M}=\langle \mathcal{N}, \mathcal{\bar N}, \mathcal{S}, \mathcal{A}, \mathcal{\bar A}, P, \Omega, O, R,\mathcal{U}, \gamma \rangle$. Here $\mathcal{N}=\{1,...,n\}$, $\mathcal{\bar N}=\{\bar{1},...,\bar{m}\}$ are the sets of controllable agents and uncontrollable teammates, respectively, 
$\mathcal{S}$ stands for the set of states, $\mathcal{A}=\mathcal{A}^1\times\dots\times\mathcal{A}^n$ and $\mathcal{\bar A}=\mathcal{A}^{\bar 1}\times\dots\times \mathcal{A}^{\bar m}$ are the corresponding sets of joint actions for $\mathcal N$ and $\mathcal{\bar N}$,
 $P$, $O$, $R$ denote the corresponding transition, observation, and reward functions, $\Omega$ is the set of observations, $\gamma \in [0, 1)$ is the discount factor, and $\mathcal{U}$ is a probability distribution used to control the frequency of sudden change.

 
At the beginning of each episode, the set of uncontrollable teammates that participate in the cooperation at the very start is denoted by $\mathcal{\bar N}_0\in \mathcal{P}(\mathcal{\bar N})$, where $\mathcal{P}(\cdot)$ stands for the power set, and the waiting time is represented by $u_0\sim \mathcal{U}$.
At each time step $t$, $u_t=u_{t-1}-1$ and $\mathcal{\bar N}_t=\mathcal{\bar N}_{t-1}$ are updated. If $u_t\leq 0$, it will be resampled from $\mathcal{U}$, and a brand new set of uncontrollable teammates $\mathcal{\bar N}_t\in \mathcal{P}(\mathcal{\bar N})$ will replace the previous one. Meanwhile,
controllable agent $i$ receives the observation $o^i=O(s, i)$ and outputs action $a^i\in \mathcal{A}^i$. Notice that the number of uncontrollable teammates is changeable in one episode. The joint action $(\boldsymbol{a}, \boldsymbol{\bar a})$ leads to the next state $s'\sim P(\cdot|s, (\boldsymbol{a}, \boldsymbol{\bar a}))$ and a shared reward $R(s, (\boldsymbol{a}, \boldsymbol{\bar a}))$, where $\boldsymbol{a}=(a^1, ..., a^n)\in \mathcal{A}$ and $\boldsymbol{\bar {a}}\in \{(a^{\bar i})_{\bar i\in {\bar N}}|a^{\bar i}\in \mathcal{A}^{\bar i}, \bar N\in \mathcal{P}(\mathcal{\bar N})\}$.  To relieve the partial observability, the trajectory history $(o^i_1, a^i_1,...o_{t-1}^i, a_{t-1}^i, o_t^i)$ of agent $i$ until time step $t$  is encoded into  $\tau^i_t$ by GRU~\citep{gru}. Under an Open Dec-POMDP, we aim to find an optimal policy when uncontrollable teammates suffer from sudden change. Then, with $\boldsymbol{\tau}_t=\langle \tau^1_t,...,\tau^n_t\rangle$, the formal objective is to find a joint policy $\boldsymbol{\pi}(\boldsymbol{\tau}_t, \boldsymbol{a})$, which maximizes the global value function $Q_{\text{tot}}^{\boldsymbol{\pi}}(\boldsymbol{\tau}, \boldsymbol{a})=\mathbb{E}_{s, \boldsymbol{a}, \boldsymbol{\bar a}}[\sum_{t=0}^\infty \gamma^t R(s, (\boldsymbol{a}, \boldsymbol{\bar a}))|s_0=s, \boldsymbol{a}_0=\boldsymbol{a}, \boldsymbol{\pi}, \boldsymbol{\bar \pi}]$, where $\boldsymbol{\bar \pi}$ is the unknown joint policy of uncontrollable teammates.

\section{Method} \label{sec:3 method}
In this section, we will present the detailed design of Fastap (see Fig.~\ref{Structure}), a novel multi-agent policy learning approach that enables controllable agents to handle the sudden change of teammates' policies and adapt to new teammates rapidly. First, we design an infinite mixture model that formulates the distribution of continually increasing teammate clusters based on the Chinese Restaurant Process (CRP)~\citep{crp} (Sec.~\ref{sec:3.1 crp mixture} and Fig.~\ref{Structure}(a)). Next, we introduce the centralized context encoder learning objective for fast adaptation (Sec.~\ref{sec3.2: context} and Fig.~\ref{Structure}(b)). Finally, considering the popular CTDE paradigm in cooperative MARL, we train each controllable agent to recognize and adapt to the teammate situation rapidly according to its local information (Sec.~\ref{sec3.3: opt} and Fig.~\ref{Structure}(c)).
\subsection{CRP-based Infinite Mixture for Dynamic Teammate Generation} \label{sec:3.1 crp mixture}
To adapt to the sudden change in teammates with diverse behaviors in one episode rapidly during evaluation, we expect to maintain a set of diverse policies to simulate the possibly encountered teammates in the training phase. Nevertheless, it is unreasonable and inefficient to consider every newly generated group of teammates as a novel type while ignoring the similarities among them. This approach lacks scalability in a learning process where teammates are generated incrementally, and it may lead to reduced training effectiveness if teammates with similar behavior are generated.
Accordingly, we expect to acquire clearly distinguishable boundaries of teammates' behaviors by applying a behavior-detecting module to assign teammate groups with similar behaviors to the same cluster. 
To tackle the issue, an infinite Dirichlet Process Mixture (DPM) model~\citep{lee2020neural} could be applied due to its scalability and flexibility in the number of clusters. 
Concretely, we can formulate the teammate generation process as a stream of teammate groups with different trajectory batches $\mathcal{D}_1, \mathcal{D}_2, \dots$, where each batch $\mathcal{D}_k$ is a set of trajectories $\tau = (s_0, \boldsymbol{a}_0, \dots, s_T)$ sampled from the interactions between the $k^{\text{th}}$ teammate group and the environment, and $T$ is the horizon length. Considering the difficulty of trajectory representation due to its high dimension, we utilize a trajectory encoder $E_{\omega_1}$ parameterized by $\omega_1$ to encode $\tau$ into a latent space. Specifically, we partition the trajectory $\tau$ into $\tau^S=(s_0, \dots, s_{T-1}, s_T)$ and $\tau^A=(\boldsymbol{a}_0, \dots, \boldsymbol{a}_{T-1})$,
and a transformer architecture is applied to extract features from the trajectory and represent it as $v=E_{\omega_1}(\tau)$. For the $k^{\text{th}}$ teammate group generated so far, $v_k = \mathbb{E}_{\tau_k\sim \mathcal{D}_k}[E_{\omega_1}(\tau_k)]$ will be used to represent its behavioral type, and $\bar v^m$ is the mean value of the $m^{\text{th}}$ cluster.


If $M$ clusters are instantiated so far, the cluster that the $k^{\text{th}}$ teammate group belongs to will be inferred from the assignment $P(v_k^{(m)}|\tau_k)=P(v_k^{(m)}|\tau^S_k, \tau^A_k), m=1,..., M, M+1$, where $v_k^{(m)}$ denotes that the $k^{\text{th}}$ group belongs to the $m^{\text{th}}$ cluster based on its representation $v_k$. %, and $m=M+1$ indicates that the $k^{\text{th}}$ teammate group is a novel type.
The posterior distribution can be written as:
\begin{equation}
    \begin{aligned}
        P(v_k^{(m)}|\tau^S_k, \tau^A_k)\propto P(v_k^{(m)})P(\tau_k^A|\tau_k^S; v_k^{(m)}),
    \end{aligned}
\end{equation}
where we apply CRP~\citep{crp} to instantiate the DPM model as the prior.
Specifically, for a sequence of teammate groups whose representations are $[v_1, v_2, ... v_k, ...]$, the prior $P(v_k^{(m)})$ is set to be:
\begin{equation}
    \begin{aligned}
        P(v_k^{(m)}) = \begin{cases}
        \frac{n^{(m)}}{k-1+\alpha}, \quad m\leq M\\
        \frac{\alpha}{k-1+\alpha},\quad m=M+1,
        \end{cases}
    \end{aligned}
    \label{priori}
\end{equation}
where $n^{(m)}$ denotes the number of teammate groups belonging to the $m^{\text{th}}$ cluster, $M$ is the number of clusters instantiated so far, $\sum_{m=1}^M n^{(m)}=k-1$, and $\alpha>0$ is a concentration hyperparameter that controls the probability of the instantiation of a new cluster. 

To estimate the predictive likelihood $P(\tau_k^A|\tau_k^S; v_k^{(m)})$, we use an RNN-based decoder $D_{\omega_2}$ that takes $\tau_k^S, v_k^{(m)}$ as input and predicts $\tau_k^A$. The decoder represents each sample as a Gaussian distribution $\mathcal{N}(\mu(\tau_{t}^S, v), \sigma^2(\tau_{t}^S, v))$ where $\tau_t^S=(s_0, \dots, s_t)$, such that
\begin{equation}
    \begin{aligned}
        P(\tau_k^A|\tau_k^S; v_k^{(m)})=& D_{\omega_2}(\tau_k^A|\tau_k^S; v_k^{(m)})\\
        =& \prod_{t=0}^{T-1} D_{\omega_2}(\boldsymbol{a}_t^k|\tau_{k, t}^S, v_k^{(m)}),\\
        \text{where}~ v_k^{(m)}=&\begin{cases}
            \frac{n^{(m)}\bar{v}^{m}+v_k}{n^{(m)}+1}\quad m\leq M\\
            v_k\quad\quad\quad\quad\,\, m=M+1.
        \end{cases}
    \end{aligned}
    \label{likelihood}
\end{equation}


Combining the estimated prior Eqn.~(\ref{priori}) and predictive likelihood Eqn.~(\ref{likelihood}), we are able to decide which cluster the $k^{\text{th}}$ teammate group belongs to and thus acquire clearly distinguishable boundaries of teammates' behavior. After the assignment, the mean value of the $m^{\text{th}}$ cluster will also be updated. Meanwhile, to force the learned representation $v$ to capture the behavioral information of each teammate group and estimate the predictive likelihood more precisely, the encoder $E_{\omega_1}$ and decoder $D_{\omega_2}$ are optimized as:
\begin{equation}
    \begin{aligned}
        \mathcal{L}_{\text{model}}(\boldsymbol{\omega}) = -\log\mathbb{E}_{\tau\sim\cup_{k=1}^K\mathcal{D}_k}[D_{\omega_2}(\tau^A|\tau^S; E_{\omega_1}(\tau))],
    \end{aligned}
\end{equation}
where $K$ is the number of teammate groups generated so far, $\boldsymbol{\omega}=(\omega_1, \omega_2)$. The encoder and decoder are optimized while generating teammate groups (see details in App.~B.1).


\subsection{Centralized Contextualization Learning for Fast Adaptation}\label{sec3.2: context}

After gaining the generated teammates divided into different clusters, this part aims to train a robust policy to handle sudden teammate change and rapidly adapt to the new teammates via conditioning the controllable agents' policies on other teammates' behavior. Despite the diversity and complexity that unknown teammates' behavior exhibits, the CRP formalized before helps acquire clearly distinguishable boundaries based on teammates' behavioral types with regard to high-level semantics. 

Inspired by Environment Sensitive Contextual Policy Learning (ESCP)~\citep{escp}, which aims to guide the context encoder to identify and track the sudden change of the environment rapidly, we expect to utilize a global context encoder $g_\theta$ and local context encoders $\{f_{\phi_i}\}_{i=1}^n$ to embed the historical interactions into a compact but informative representation space. The encoders are supposed to identify a new type of teammate fast so as to recognize the sudden change in time, and we can optimize the encoder by proposing an objective that helps the encoder's output converge to the oracle rapidly at an early time and stay consistent for the remaining steps.


During the centralized training phase, we set $z_t^m=g_\theta(\tau_t^m)$, where $\tau_t^m=(s_0^m, \boldsymbol{a}_0^m, \dots, s_t^m)$ is generated based on the interactions between the paired joint policy $(\boldsymbol{\pi}, \boldsymbol{\bar \pi}^m)$ and the environment,
and $\boldsymbol{\bar \pi}^m$ is the joint policy of uncontrollable teammates belonging to the $m^{\text{th}}$ cluster. Notice that the cluster of teammates is chosen at the beginning of each episode and will not change during training, and sudden change of teammates only happens during evaluation. We can acquire the empirical optimization objective of  $g_\theta$ as:
\begin{equation}
    \begin{aligned}
        \mathcal{L}_{\text{GCE}} = \sum_{m=1}^M\mathbb{E}[||z^m_t-\bar z^m||_2^2]-\log\det(R_{\{\bar z^m\}}),
    \end{aligned}
    \label{loss_ce}
\end{equation}
where $\bar z^m$ is the moving average of all past context vectors used for stabilizing the training process, $\theta$ is the parameter of the global context encoder $g_\theta$, $\det(\cdot)$ denotes the matrix determinant, and $R_{\{\bar z^m\}}$ is a relational matrix. Intuitively, the objective expects to help the encoder's output converge rapidly at an early time and keep it consistent for the remaining steps. Specifically, the former part forces $z_t^m$ to converge fast and stably in one episode, and the latter pushes the expectation of $z_t^m$ to a set of separable but representative latent vectors. The full derivation can be found in App.~B.2.

In practice, a recurrent neural network is applied to instantiate $g_{\theta}$, which takes $\tau_t^m=(s_0^m, \boldsymbol{a}^m_0, ..., s_t^m)$ as input and outputs a multivariate Gaussian
distribution $\mathcal{N}(\mu_\theta(\tau_t^m), \sigma^2_\theta(\tau_t^m))$. Thus the teammates' context is obtained from the Gaussian distribution with the reparameterization trick by $z_t^m\sim g_\theta(\tau_t^m)$. As we can apply Fastap to any value-based method, the global embedding $z_t^m$ could also be integrated into the centralized network. Similarly, the local embedding $e_t^{m, i}$ and local trajectory $\tau_t^{m, i}$ will also be concatenated to calculate the local Q-value $Q^i(\tau_t^{m, i}, e_t^{m, i}, \cdot)$, where the optimization of the local context encoder will be explained in detail in the next part. Therefore, the TD loss $\mathcal{L}_{\text{TD}}=[r_t^m+\gamma\max_{\boldsymbol{a}_{t+1}^m} \bar Q_{\text{tot}}(s_{t+1}^m, \boldsymbol{e}_{t+1}^m, z_{t+1}^m, \boldsymbol{a}_{t+1}^m)- Q_{\text{tot}}(s_{t}^m, \boldsymbol{e}_{t}^m, z_{t}^m, \boldsymbol{a}_{t}^m)]^2$ is utilized to accelerate the centralized contextualization learning, where $\bar Q_{\text{tot}}$ is a periodically updated target Q-network, and $\boldsymbol{e}_{t}^m=(e_t^{m,i})_{i=1}^n$. The overall optimization objective of $g_\theta$ can thus be derived:
\begin{equation}
    \begin{aligned}
        \mathcal{L}_{\text{ADAP}} = \mathcal{L}_{\text{TD}} +\alpha_{\text{GCE}}\mathcal{L}_{\text{GCE}},
    \end{aligned}
    \label{loss_gce}
\end{equation}
where $\alpha_{\text{GCE}}$ is an adjustable hyper-parameter to balance the two optimization objectives.

 \begin{figure*}[!ht]
  \centering
  \includegraphics[width=0.95\textwidth]{Figures/envs.pdf}
  \caption{Experimental environments used in this paper.}
  \label{envs}
\end{figure*}

\subsection{Decentralized Team Situation Recognition and Optimization}\label{sec3.3: opt}
Despite the fact that optimizing Eqn.~(\ref{loss_gce}) helps obtain compact and representative representations $z_t^m$ that could guide individual policies to adapt to teammate sudden change rapidly, partial observability of MARL will not allow agents that execute in a decentralized manner to obtain $z_t^m$ encoded from the global state-action trajectory. Thus, we equip each agent $i$ with a local encoder $f_{\phi_i}$ to recognize the team situation. Concretely, the network architecture of $f_{\phi_i}$ is similar to $g_{\theta}$, $f_{\phi_i}$ takes local trajectory $\tau^{m, i}_t=(o_0^{m, i}, a_0^{m, i},..., o_t^{m, i})$ as input and outputs $e_{t}^{m, i}\sim\mathcal{N}(\mu_{\phi_i}(\tau_t^{m,i}), \sigma^2_{\phi_i}(\tau_t^{m,i}))$. To make $e_{t}^{m, i}$ informatively consistent with $z_t^m$, we introduce a mutual information (MI) objective by maximizing the MI $\mathcal{I}(e_{t}^{m, i};z_t^m
|\tau_t^{m,i})$ between $e_{t}^{m, i}$ and $z_t^m$ conditioned on agent $i$'s local trajectory $\tau^{m, i}_t$. Due to the difficulty of estimating the conditional distribution directly, a variational distribution $q_{\xi}(e_{t}^{m, i}|z_t^m, \tau^{m, i}_t)$ is used to approximate the conditional distribution $p(e_{t}^{m, i}|z_t^m, \tau^{m, i}_t)$. Inspired by the information bottleneck~\citep{DBLP:conf/iclr/AlemiFD017}, we derive a tractable lower bound of the MI objective:
\begin{equation}
    \begin{aligned}
        &\mathcal{I}(e_{t}^{m, i};z_t^m
|\tau_t^{m,i})\geq \\
&\quad \mathbb{E}_{\mathcal{D}}[\log q_{\xi}(e_{t}^{m, i}|z_t^m, \tau^{m, i}_t)]+\mathcal{H}(e_{t}^{m, i}|\tau^{m, i}_t),
    \end{aligned}
\end{equation}
where $\mathcal{H}(\cdot)$ denotes the entropy, and variables of the distributions are sampled from the experience replay buffer $\mathcal{D}$. We defer the full derivation to App.~B.3. We can now rewrite the MI objective as:
\begin{equation}
    \begin{aligned}
        &\mathcal{L}_{\text{MI}}=\\
        &\quad \sum_{m=1}^M\sum_{i=1}^n  \bigl[\mathbb{E}_{\mathcal{D}}[\log q_{\xi}(e_{t}^{m, i}|z_t^m, \tau^{m, i}_t)]+\mathcal{H}(e_{t}^{m, i}|\tau^{m, i}_t)\bigr],
    \end{aligned}
\end{equation}
where the mentioned symbols are defined similarly as in Eqn.~(\ref{loss_ce}). To facilitate the learning process, two local auxiliary optimization objectives are further designed. On the one hand, we expect $e_{t}^{m, i}$ to recognize the team situation and adapt to new teammates that change suddenly as $z_t^m$ does:
\begin{equation}
    \begin{aligned}
        \mathcal{L}_{\text{LCE}} = \sum_{m=1}^M\sum_{i=1}^n\mathbb{E}[||e^{m,i}_t-\bar e^{m, i}||_2^2]-\log\det(R_{\{\bar e^{m, i}\}}).
    \end{aligned}
\end{equation}
On the other hand, to derive the descriptive representation $e^{m, i}_t$ of the specific team situation,  we hope $e^{m, i}_t$ can learn the relationship between controllable agents and the teammates. Therefore, we expect $e^{m,i}_t$ to reconstruct the observations and actions taken by teammates:
\begin{equation}
    \begin{aligned}
        \mathcal{L}_{\text{REC}} = \sum_{m=1}^M\sum_{i=1}^n\mathbb{E}_{\mathcal{D}}[-\log h_{\psi_i}(\boldsymbol{\bar o}_t^m, \boldsymbol{\bar a}_t^m|e_t^{m, i})],
    \end{aligned}
\end{equation}
where $h$ is parameterized by $\psi_i$ for each agent $i$.
As $e_t^{m, i}$ and $\tau_t^{m, i}$ will be concatenated into the input of individual Q network $Q^i(\tau_t^{m, i}, e_t^{m, i},\cdot)$, the TD loss $\mathcal{L}_{\text{TD}}$ is also utilized to 
promote the learning of local context encoder. Thus, the optimization objective  becomes:
\begin{equation}
    \begin{aligned}
        \mathcal{L}_{\text{DEC}} = \mathcal{L}_{\text{TD}}+\alpha_{\text{MI}}\mathcal{L}_{\text{MI}}+\alpha_{\text{LCE}}\mathcal{L}_{\text{LCE}}+\alpha_{\text{REC}}\mathcal{L}_{\text{REC}},
    \end{aligned}
\end{equation}
where $\alpha_{\text{MI}}, \alpha_{\text{LCE}}, \alpha_{\text{REC}}$ are the corresponding adjustable hyperparameters of the three objectives. 
\section{Experiments}
In this section, we design extensive experiments to answer the following questions: 1) Can Fastap achieve high adaptability and generalization ability when encountering teammate sudden change compared to other baselines in different scenarios, and how does each component influence its performance (Sec.~\ref{results})? 2) Can CRP help acquire distinguishable boundaries of teammates' behaviors, and what team situation representation is learned by Fastap (Sec.~\ref{analysis})? 3) What transfer ability does Fastap reveal, and how does each hyperparameter influence its coordination capability (Sec.~\ref{bonus})?

\begin{figure*}
\setlength{\abovecaptionskip}{0cm}
\centering
    \subfigure[LBF (Stationary)]{
    \label{lbf_static}
      \includegraphics[height=28.5mm]
      {Fastap/Figures/Main/lbf_static.pdf}
    }
    \subfigure[PP (Stationary)]{
    \label{simple_tag_static}
      \includegraphics[height=28.5mm]
      {Fastap/Figures/Main/simple_tag_static.pdf}
    }
    \subfigure[CN (Stationary)]{
    \label{simple_spread_static}
      \includegraphics[height=28.5mm]
      {Fastap/Figures/Main/simple_spread_static.pdf}
    }
    \subfigure[10m\_vs\_14m (Stationary)]{
    \label{smac_static}
      \includegraphics[height=28.5mm]
      {Fastap/Figures/Main/smac_static.pdf}
    }

    \subfigure[LBF (Non-Sta.)]{
    \label{lbf_sudden}
      \includegraphics[height=28.5mm]
      {Fastap/Figures/Main/lbf_sudden.pdf}
    }
    \subfigure[PP (Non-Sta.)]{
    \label{simple_tag_sudden}
      \includegraphics[height=28.5mm]
      {Fastap/Figures/Main/simple_tag_sudden.pdf}
    }
    \subfigure[CN (Non-Sta.)]{
    \label{simple_spread_sudden}
      \includegraphics[height=28.5mm]
      {Fastap/Figures/Main/simple_spread_sudden.pdf}
    }
    \subfigure[10m\_vs\_14m (Non-Sta.)]{
    \label{smac_sudden}
      \includegraphics[height=28.5mm]
      {Fastap/Figures/Main/smac_sudden.pdf}
    }  
  \vspace{.5em}
    \caption{Performance comparison with baselines on multiple benchmarks.}
  \label{main_exp}
\end{figure*}



\subsection{Environments and Baselines} \label{envandbas}
We select four multi-agent tasks as our environments, as shown in Fig.~\ref{envs}. Level Based Foraging (LBF)~\citep{lbf} is a cooperative grid world game with agents that are rewarded if they concurrently navigate to the food and collect it.  Predator-prey (PP) and Cooperative navigation (CN) are two scenarios coming from the MPE environment~\citep{maddpg}, where multiple agents (predators) need to chase and encounter the adversary agent (prey) to win the game in PP, and in CN, multiple agents are trained to move towards landmarks while avoiding collisions with each other.
We also create a map 10m\_vs\_14m from SMAC~\citep{pymarl}, where 10 allies are spawned at different points to attack  14 enemies to win.


For baselines, we consider multiple ones and implement them on top of the popular value-based method QMIX~\citep{qmix} for comparison, including (1) the vanilla QMIX without any extra design;
(2) Meta-learning SARL methods: PEARL~\citep{PEARL} uses recently collected context to infer a probabilistic variable describing the task; ESCP~\citep{escp} copes with the sudden change in the environment by learning a context-sensitive policy; 
(3) Context-based MARL approaches: LIAM~\citep{LIAM} predicts teammates' current behaviors based on local observation history to relieve non-stationarity in the training phase; ODITS~\citep{ODITS} applies a centralized ``teamwork situation encoder'' for end-to-end learning to adapt to arbitrary teammates across episodes.  More details about the environments and baselines, and about Fastap, are given in App.~C and App.~D, respectively.


\subsection{Competitive Results and Ablations} \label{results}

\begin{figure}

  \centering
\includegraphics[width=0.48\textwidth]{Fastap/Figures/Main/drop\_v5.pdf}
  \caption{Performance difference in stationary and non-stationary conditions.
  The value is the difference in the performance under non-stationary and stationary settings w.r.t. the best return.
  }
  \label{performancedifference}
\end{figure}
\paragraph{Coordination Ability in Stationary and Non-stationary Settings} 

First, we compare Fastap against the mentioned baselines to investigate the coordination ability under stationary and non-stationary conditions, as shown in Fig.~\ref{main_exp}. We find that all algorithms suffer from coordination ability degradation when teammates behave in a non-stationary manner, indicating that a specific consideration of teammates' sudden policy change in a non-stationary environment is needed.
When only using local information to obtain a context to capture the teammates' information, methods like PEARL and LIAM show indistinctive coordination improvement in stationary and non-stationary settings, and PEARL performs even worse than vanilla QMIX, demonstrating that successful meta-learning approaches in SARL cannot be implemented without modification in the MARL setting. Furthermore, when learning a teammate's behavior context extraction model in both global and local ways, ODITS shows superior performance in the two mentioned conditions, manifesting the necessity of utilizing global states to improve training efficiency. Besides, ESCP also reveals a relatively better coordination capability, demonstrating the effectiveness of optimizing a context encoder with fast adaptability. 
Fastap achieves the best performance on all benchmarks both in stationary and non-stationary conditions, and suffers from the least performance degradation when tested in a non-stationary condition in most environments (see Fig.~\ref{performancedifference}), showing the effectiveness and high efficiency of the proposed method.  
\begin{table*}
\centering
\resizebox{\textwidth}{!}{
\begin{tabular}{l|ccccccc} 
\hline
                          $\mathcal{U}$   & Fastap       & Fastap\_wo\_CRP          & ODITS        & LIAM         & QMIX         & PEARL        &   ESCP\\ 
\hline
stationary                    & $\mathbf{0.642\pm0.008}$ & $0.594\pm0.015$ & $0.637\pm0.008$ & $0.597\pm0.029$ & $0.569\pm0.033$ & $0.507\pm0.021$ &   $0.618\pm0.040$   \\
{\cellcolor[rgb]{0.893,0.893,0.893}}$U[5, 8]$ & {\cellcolor[rgb]{0.893,0.893,0.893}}$\mathbf{0.562\pm0.012}$ & {\cellcolor[rgb]{0.893,0.893,0.893}}$0.400\pm0.020$ & {\cellcolor[rgb]{0.893,0.893,0.893}}$0.352\pm0.002$ & {\cellcolor[rgb]{0.893,0.893,0.893}}$0.415\pm0.026$ & {\cellcolor[rgb]{0.893,0.893,0.893}}$0.306\pm0.038$ & {\cellcolor[rgb]{0.893,0.893,0.893}} $0.288\pm0.019$& {\cellcolor[rgb]{0.893,0.893,0.893}} $0.404\pm0.026$    \\
$U[6, 7]$           & $\mathbf{0.567\pm0.001}$ & $0.444\pm0.314$ & $0.487\pm0.022$ & $0.454\pm0.157$ & $0.444\pm0.221$ & $0.333\pm0.000$  & $0.556\pm0.125$     \\
$U[2, 9]$           & $0.484\pm0.285$ & $0.222\pm0.133$ & $0.416\pm0.182$ & $0.401\pm0.078$ & $0.443\pm0.205$ & $0.205\pm0.114$ &   $\mathbf{0.514\pm0.314}$   \\
$U[3, 6]$           & $\mathbf{0.518\pm0.136}$ & $0.366\pm0.217$ & $0.444\pm0.314$ & $0.388\pm0.283$ & $0.353\pm0.272$ &  $0.264\pm0.066$ &   $0.502\pm0.120$   \\
$U[3, 3]$           & $\mathbf{0.384\pm0.272}$ & $0.246\pm0.141$ & $0.342\pm0.118$ & $0.362\pm0.208$ & $0.222\pm0.314$ & $0.243\pm0.172$ &   $0.271\pm0.157$   \\
\hline
\end{tabular}}
\caption{The final average return $\pm$ std in LBF, where $\mathcal{U}$ is the sudden change probability distribution of open Dec-POMDP that controls the frequency of sudden change, and $U[m, n]$ denotes a discrete uniform distribution parameterized by $m$ and $n$. The row of the original training sudden change distribution $\mathcal{U}=U[5, 8]$ is highlighted as \colorbox{lightgray}{gray}.}
\label{generalization_ood}
\end{table*}


\begin{figure}
\setlength{\abovecaptionskip}{0cm}
  \centering
  \subfigure[LBF]{
  \label{ablation_lbf}
      \includegraphics[width=0.226\textwidth]
      {Fastap/Figures/Ablation/Ablation_LBF_v2.pdf}
  }
  \subfigure[PP]{
  \label{ablation_simpletag}
      \includegraphics[width=0.226\textwidth]
{Fastap/Figures/Ablation/Ablation_SimpleTag_v2.pdf}
  }
  \caption{Ablation Studies.}
  \label{ablation}
\end{figure}

\paragraph{Ablation Studies}
As Fastap is composed of multiple components,  we here design ablation studies on benchmarks LBF and PP to investigate how they impact the coordination performance of Fastap under non-stationary settings. First, for the infinite mixture model of dynamic teammate generation, we derive $\textit{W/o CRP}$ by removing the CRP process and taking each newly generated teammate group as a new cluster. Next, to explore whether a teammate-behavior-sensitive encoder helps improve adaptability, we introduce  $\textit{W/o LCE}$ by removing $\mathcal{L}_{\text{LCE}}$ of local encoders. Furthermore, we introduce $\textit{W/o MI}$ to investigate how maximizing mutual information between global and local contexts accelerates learning efficiency. Finally, $\textit{W/o REC}$ is introduced to check the impact of the auxiliary optimization objective that involves agent modeling. As is shown in Fig.~\ref{ablation}, $\textit{W/o CRP}$ and $\textit{W/o MI}$ suffer the most severe performance degradation in LBF and PP, respectively, manifesting the benefit of the introduction of the CRP model and that teammate-behavior-sensitive encoders do help agents adapt to sudden change of teammates rapidly. Besides, when removing $\mathcal{L}_{\text{MI}}$, the performance gap that $\textit{W/o MI}$ shows in the two benchmarks demonstrates the necessity of utilizing global information to facilitate the learning of local context encoders. Finally, we also find that agent modeling helps learn a more informative context and brings about a slight coordination improvement.
\begin{figure}
  \centering 
    \includegraphics[height=28mm]{Fastap/Figures/Visualization/CRP/crp\_v4.pdf}
    \caption{Cross-Play performance before and after CRP and teammate behavior embeddings.}
    \label{hotmap}
    \label{visual}
\end{figure}
\paragraph{Comparisons in (OOD) Non-stationary Setting}
As this study considers a setting where the frequency of uncontrolled teammates' sudden change follows a fixed probability distribution $\mathcal{U}$, which is set to be a uniform distribution, we evaluate the generalization ability when altering the changing frequency during testing. The experiments on LBF are conducted with the distribution $\mathcal{U}=U[5, 8]$ during training. As shown in Tab.~\ref{generalization_ood}, we compare the final returns of different learned policies in LBF by altering the distribution $\mathcal{U}$. Although different approaches obtain similar coordination ability in stationary conditions, they suffer from strong performance degradation when altering teammates' policy-changing frequency (e.g., ODITS suffers from close to half performance degradation under $\mathcal{U}=U[3, 3]$).  On the other hand, Fastap and ESCP achieve outstanding generalization ability in most in-distribution and OOD settings. 
More specifically, in the stationary setting, Fastap outperforms the best baseline ODITS by $0.005$, while in the original non-stationary setting, the gap to the best baseline increases to $0.147$. We also find that Fastap is inferior to ESCP under $\mathcal{U}=U[2, 9]$. We believe that both methods fail to perform well under the 2-timestep sudden change interval, while Fastap sacrifices a part of the performance under the large-timestep sudden change intervals that might happen in $U[2, 9]$. A more robust policy in diverse conditions would be developed in the future.


\subsection{Teammate Adaptation Analysis} \label{analysis}
Here we conduct experiments to investigate the CRP model and teammate adaptation progress. We first verify whether CRP helps acquire distinguishable boundaries of teammates' behaviors by performing Cross-Play~\citep{DBLP:conf/icml/HuLPF20} experiments on LBF before and after CRP. As shown in the left part of Fig.~\ref{visual}, for generation process of 8 teammate groups, we find that the values on the diagonal from the top left to the bottom right are relatively larger. However, several high performances of other points (e.g., Teammate groups 2 and 3) indicate that the generated teammate groups might share similar behavior. To help relieve the negative influence caused by taking teammate groups with similar behavior as two different types, CRP is applied to learn the behavior type and assign teammates with similar behavior to the same cluster. Further, we sample latent variables generated by $E_{\omega_1}(\tau_k)$ and reduce the dimensionality by principal component analysis (PCA)~\citep{wold1987principal}. We find that latent variables assigned to the same cluster (the ellipse) are distributed in the adjacent areas. Cross-Play experiments are also conducted on the teammate clusters after CRP, and we find from the right part of Fig.~\ref{visual} that teammates belonging to different clusters achieve low performance when paired together, indicating the effectiveness of CRP.


To investigate how teammate-behavior-sensitive encoders help adapt to teammates' sudden change rapidly, we also visualize the fragment snapshot of an episode during testing as shown in Fig.~\ref{snapshot}. When a teammate and two controlled agents are trying to reach an apple and obtain the score as intended, the teammate unexpectedly leaves the team, and they temporarily fail to get the reward. However, the controlled agents learned by Fastap recognize the situation and switch their policy rapidly by moving downward and coordinating with the other teammate to attain the reward. Meanwhile, we record the latent context vector in different timesteps of one episode. Fastap encodes the context to four-dimensional vectors in LBF, and we reduce the dimensionality to one-dimensional scalars by PCA. We scatter the points in Fig.~\ref{context_curve} together with the contexts learned by LIAM and ablation Fastap\_wo\_CRP. The results imply that the contexts learned by Fastap are sensitive to the sudden change of teammates, and when the teammates are stable, the latent context is stable and flat. Despite the fact that agent modeling helps recognize the teammates' behavior, the context curve of LIAM is still hysteretic and unstable. Meanwhile, the ablation Fastap\_wo\_CRP can also adapt to new teammates rapidly, but it fails to recognize the teammates with similar behavior, which results in an unstable latent context (e.g., Teammate Cluster 3).


\begin{figure}
  \centering
    \subfigure[Snapshot]{
        \label{snapshot}
        \includegraphics[height=34.5mm]{Fastap/Figures/Visualization/context/snapshot\_v4.pdf}
        
    }
    \subfigure[Context Curve]
    {\label{context_curve}
    \includegraphics[height=32.5mm]{Fastap/Figures/Visualization/context/context1.pdf}}

    \caption{Teammate adaptation visualization.}
     \label{visual_context}
\end{figure}


\subsection{Transfer and Sensitivity Studies} \label{bonus}
Fastap learns a teammate recognition module to cope with teammates that might change suddenly in one episode. The sudden change distribution $\mathcal{U}$ that controls the frequency of changing is fixed, and a more frequent change or a larger gap of waiting interval tends to make the training more difficult. Here, we investigate the policy transfer ability of Fastap by comparing the performance after fine-tuning and learning from scratch. Concretely, we train Fastap agents under the sudden change distribution $\mathcal{U}_{\text{source}}=U[5, 8]$ for $0.6$M timesteps and initialize the trained network with the saved checkpoint under the target setting with $\mathcal{U}=\mathcal{U}_{\text{target}}=U[3, 6]$. The learning curves demonstrated in Fig.~\ref{transfer} show that agents trained under $\mathcal{U}_{\text{source}}$ possess a jumpstart compared with the random initialization, and we hope it could accelerate the learning in a new environment by reusing previously learned knowledge.

As Fastap includes multiple hyperparameters, here we conduct experiments on benchmark LBF to investigate how each one influences the coordination ability.  $\alpha_{\text{GCE}}$ balances the trade-off between the TD-loss and the global context optimization objective. If it is too small, agents may coordinate excessively in the stationary environment, ignoring the extraction of teammates' context information. On the other hand, if it is too large, agents pay too much attention to teammate identification, with the risk of overfitting to specific teammate types. We thus find each hyperparameter via grid-search. As shown in Fig.~\ref{sensitivity_gce}, we can find that $\alpha_{\text{GCE}}=1$ is the best choice in this benchmark.
$\alpha_{\text{MI}}$ influences the optimization of the local encoder $f_{\phi_i}$ to recognize the team situation, and we can find that $\alpha_{\text{MI}}=0.001$ works well in Fig.~\ref{sensitivity_mi}.
 




\begin{figure}
\setlength{\abovecaptionskip}{0cm}
  \centering
  \subfigure[LBF]{
  \label{transfer_lbf}
      \includegraphics[width=0.222\textwidth]
      {Fastap/Figures/Transfer/Transfer_lbf.pdf}
  }
  \subfigure[PP]{
  \label{transfer_simpletag}
      \includegraphics[width=0.222\textwidth]
{Fastap/Figures/Transfer/Transfer_simple_tag.pdf}
  }
  \caption{Policy Transfer Ability.}
  \label{transfer}
\end{figure}



\begin{figure}
\setlength{\abovecaptionskip}{0cm}
  \centering
  \subfigure[Sensitivity of $\alpha_{\text{GCE}}$]{
  \label{sensitivity_gce}
      \includegraphics[width=0.222\textwidth]
      {Fastap/Figures/Sensitivity/Sensitivity_GCE.pdf}
  }
  \subfigure[Sensitivity of $\alpha_{\text{MI}}$]{
  \label{sensitivity_mi}
      \includegraphics[width=0.222\textwidth]
{Fastap/Figures/Sensitivity/Sensitivity_MI.pdf}
  }
  \caption{Sensitivity Studies on LBF.}
  \label{sensitivity}
\end{figure}




\section{Final Remarks}
In this work, we study the teammates' adaptation problem when some coordinators suffer from the sudden policy change.
We first formalize this problem as an open Dec-POMDP, where some coordinators may sustain policy changes unpredictably within one episode, and we train multiple controllable agents to adapt to this change rapidly. For this goal, we propose Fastap, an efficient approach to learn a multi-agent coordination policy by capturing the teammates' policy-changing information. Extensive experimental results on stationary and non-stationary conditions from different benchmarks verify the effectiveness of Fastap, and the analysis also confirms it from multiple aspects. Our method can be seen as a primary attempt for the open-environment setting~\citep{zhou2022open} in cooperative MARL, and we sincerely hope it can be a solid foothold for applying MARL to practical applications.
For future work, research on the changing of the action/observation space of the MARL system, or on utilizing techniques like transformer~\citep{DBLP:conf/nips/VaswaniSPUJGKP17} to obtain a generalist coordination policy for non-stationarity from diverse sources and degrees, is of great value.  

\section*{Acknowledgments}
 This work is supported by National Key Research and Development Program of China (2020AAA0107200), the National Science Foundation of China (61921006, 62022039, 62276124), and the program B for Outstanding Ph.D. candidate of Nanjing University.


\bibliography{ref}

\end{document}
