% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr}
% The labels of the main paper (zhao_598) are imported exactly once, by the
% \myexternaldocument{zhao_598} call further down in this preamble. Calling
% \externaldocument{zhao_598} here as well would read the same .aux file a
% second time and re-issue every imported label definition.


% https://www.overleaf.com/learn/how-to/Cross_referencing_with_the_xr_package_in_Overleaf
%----Helper code for dealing with external references----
% (by cyberSingularity at http://tex.stackexchange.com/a/69832/226)

\usepackage{xr}
\makeatletter

% \addFileDependency{<file name with extension>}
% Announces <file> as a dependency of this document in the log output.
% Every line of the body ends in `%' so that the macro expands to no stray
% space tokens, wherever it is called.
\newcommand*{\addFileDependency}[1]{%
  % latexmk will find this if $recorder=0; however, in that case it will
  % ignore #1 if it is a .aux or .pdf file etc. and it exists. If it does
  % not exist, it will appear in the list of dependents regardless.
  \typeout{(#1)}%
  % Makes the file appear in \listfiles output --- not strictly necessary,
  % and latexmk does not use this.
  \@addtofilelist{#1}%
  % latexmk will find this message if #1 does not exist (yet).
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
% Imports the labels of <basename> through xr's \externaldocument and
% announces both <basename>.tex and <basename>.aux as dependencies.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}%
  \addFileDependency{#1.aux}%
}
%------------End of helper code--------------

% put all the external documents here!
\myexternaldocument{zhao_598}



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Some potentially useful packages
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

% hyperref makes hyperlinks in the resulting PDF.
% If your build breaks (sometimes temporarily if a hyperlink spans a page)
\usepackage{hyperref}


% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

% For theorems and such
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}

% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% `theorem' owns the counter, which is numbered within each section; every
% other environment below is declared with [theorem] and therefore shares
% that counter.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% Environments declared under the amsthm `definition' style.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% Environments declared under the amsthm `remark' style.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{textcase}
\usepackage{multirow}
% \usepackage[dvipsnames]{xcolor}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% self-defined commands-----------------------------------
% --- Sets, data and joint quantities ---
\newcommand{\replay}{\mathcal{D}}            % data set the TD losses take their expectation over
\newcommand{\jointaction}{\mathbf{a}}        % joint action of all agents
\newcommand{\jointhist}{\boldsymbol{\tau}}   % bold tau, passed to the joint Q-values next to \jointaction
\newcommand{\statespace}{\mathcal{S}}        % state space
\newcommand{\actionspace}{\mathcal{A}}       % presumably the action space -- TODO confirm
\newcommand{\indparam}{\boldsymbol{\phi}}    % presumably the independent-network parameters -- TODO confirm
\newcommand{\depparam}{\boldsymbol{\psi}}    % presumably the dependency-network parameters -- TODO confirm
\newcommand{\EE}{\mathbb{E}}                 % expectation operator
% --- Q-value notation ---
\newcommand{\qi}{Q_{i}}                      % per-agent Q-value
% Multi-letter labels are wrapped in \mathit so that they are typeset as one
% word rather than as a product of single-letter variables.
\newcommand{\qjt}{Q_{\mathit{joint}}}        % joint Q-value
\newcommand{\supidp}[1]{#1^{\mathit{idp}}}   % marks #1 as the independent variant
\newcommand{\supdep}[1]{#1^{\mathit{dep}}}   % marks #1 as the dependent variant
% --- Text labels used as subscripts of the bonus coefficients c ---
\newcommand{\subact}{\text{act}}
\newcommand{\subboot}{\text{boot}}
\newcommand{\subrew}{\text{rew}}
\newcommand{\mixer}{\text{Mixer}}            % name of the mixing network
% Lower-case roman numeral of the number #1.
\newcommand{\rom}[1]{\romannumeral #1}

% Sets its argument in blue (presumably an author annotation -- TODO confirm).
\newcommand{\xutong}[1]{\textcolor{blue}{#1}}
\usepackage{easyReview}

% Superscript asterisk in a zero-width, left-aligned box, so that it takes
% up no horizontal space.
\NewDocumentCommand{\anote}{}{\makebox[0pt][l]{$^*$}}
% --------------------------------------------------------



%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<x>}{<y>} typesets <y><sep><x>; the optional <sep> defaults to `-'.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Conditionally Optimistic Exploration for Cooperative Deep Multi-Agent Reinforcement Learning (Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Xutong Zhao}
\author[3]{Yangchen Pan}
\author[4]{Chenjun Xiao}
\author[1,2,6]{Sarath Chandar}
\author[1,5]{Janarthanan Rajendran}
% Add affiliations after the authors
\affil[1]{
    Mila - Quebec AI Institute
}
\affil[2]{
    \'Ecole Polytechnique de Montr\'eal
}
\affil[3]{
    University of Oxford
  }
\affil[4]{
    University of Alberta
}
\affil[5]{
    Universit\'e de Montr\'eal
}
% Affiliation 6 is cited by \author[1,2,6]{Sarath Chandar} above, so it has
% to be defined; otherwise that affiliation marker points at nothing.
\affil[6]{
    Canada CIFAR AI Chair
}

  
  \begin{document}
  
\onecolumn %% Remove this line if a two-column layout is desired for the supplement
\maketitle


\appendix

\section{Pseudo-count for Deep RL}\label{apx:pseudo-count}
Counting visitations in high-dimensional or continuous state space could be challenging.
This section introduces how we approximate counts by applying the static hashing \citep{tang2017exploration} method, a well-established count approximation approach in RL, adopted successfully in works such as \citet{rashid2020optimistic}.

In particular, the state $s \in \statespace$ is projected to a lower-dimensional feature space by $\phi(s) = \operatorname{sgn}(A g(s)) \in \{-1, 1\}^k$, 
where $g: \statespace \rightarrow \mathbb{R}^D$ is an optional pre-processing function,
$A \in \mathbb{R}^{k \times D}$ is a projection matrix with entries drawn i.i.d.\ from a unit Gaussian distribution $\mathcal{N}(0,1)$,
and $\operatorname{sgn}(\cdot)$ is the element-wise sign function.
This method clusters similar states in $\statespace$ to one feature in a small, countable feature space, which enables us to count.
The $k$ value controls the granularity of state approximation: higher $k$ leads to more distinguishable features yet less generalizability across similar states.
We record the visitation count for the tuple of the state feature $\phi(s)$ and all agents' joint action $\jointaction$, denoted by $N(s, \jointaction)$ for simplicity of notation.
Note that for each agent $i$, the count up to its action $a_i$ satisfies:
\begin{align*}
    N(s, a_{<i}, a_i) &= \sum_{a_{i+1}} N(s, a_{<i}, a_i, a_{i+1}) \\
        &= \sum_{a_{>i}} N(s, a_{<i}, a_i, a_{>i}),
\end{align*}
where $a_{<i}$ and $a_{>i}$ denote the joint actions taken by preceding and subsequent agents of $i$, respectively.
This relationship shows that we can obtain any count up to $a_i$ by summing up the counts of all joint actions that share the prefix $(a_{<i}, a_i)$ at state $s$.
This relationship is naturally aligned with the tree structure, where the total count of each node equals the number of action sequences going through that node.
Thus we are able to perform optimistic exploration using conditional counts.


\section{Environment details}\label{apx:exp-environ}

\paragraph{Multi-Agent Particle Environment}
Multi-Agent Particle Environment (MPE) \citep{lowe2017multi,mordatch2018emergence} is a suite of two-dimensional navigation tasks where the entities in the environment obey physics properties.
We choose three tasks that do not involve agent-wide communication: \textsl{Sparse Spread}, \textsl{Sparse Tag}, and \textsl{Adversary}.
In the first two tasks, reward signals are sparse and agents receive positive rewards only when they jointly complete the task.
They are almost fully observable, except that each agent does not observe the velocity of other agents.
\textsl{Adversary} is fully observable.


\paragraph{Level-Based Foraging}
Level-Based Foraging (LBF) \citep{albrecht2015game,christianos2020shared,papoudakis2020benchmarking} is a set of food-collection tasks in a grid-world.
Each agent or food item is assigned a level value, such that a group of agents can pick up a food item if the sum of agents' levels is greater than or equal to the item's level.
Agents receive a positive reward only when a food item is picked up, hence LBF requires efficient coordinated exploration.
We choose four tasks with different grid dimensions, number of agents, and number of food items.
By default, they are all fully observable.

\paragraph{StarCraft Multi-Agent Challenge}
StarCraft Multi-Agent Challenge (SMAC) \citep{samvelyan2019starcraft} consists of battle tasks where a group of agents learns to defeat another group.
Each agent could only observe entities within a fixed-sized window.
All tasks have dense rewards, and agents start engaging immediately after the game starts.
As \citet{mahajan2019maven} point out, SMAC tasks are not designed to evaluate cooperative exploration.
In order to assess coordination in partially-observable and non-stationary settings, we choose one easy task \textit{2s-vs-1sc} and one hard task \textit{3s-vs-5z}.


\section{Evaluation Protocol}\label{apx:eval-protocol}
In each task we train all algorithms for four million timesteps.
During training we perform $41$ evaluations at constant timestep intervals, that is, 100k timestep intervals, and at each evaluation point we evaluate for 100 episodes.
We train each algorithm with parameter sharing, where all agent networks share the same set of parameters, and the one-hot identity of each agent as additional network input helps the neural network to develop diverse behaviour.


We evaluate algorithms' performance in a task by two metrics: maximum returns and average returns.
The maximum return refers to the highest mean evaluation return across five seeds achieved at one evaluation point during training.
This metric evaluates algorithms' best-reached performance in a task.
The average return is the evaluation return averaged over all evaluation points during training.
This metric reflects both sample efficiency and final performance.


\section{Additional results}\label{apx:more-results}

Table~\ref{tab:max-return-all} summarizes the \textit{maximum} returns for all eight algorithms (including the ablations) in all nine tasks,
which also reports the maximum win-rates in SMAC tasks.
Figure~\ref{fig:marl-learning-curves-ablation} presents learning curves of the evaluation returns achieved during training by ablations in all nine tasks.
Sparse-reward tasks have bold titles.


\begin{table*}[!htb]
    \tiny
    \centering
    \caption{Maximum Returns and 95\% Confidence Interval for All Eight Algorithms in All Nine Tasks, and Maximum Win-rates for SMAC Tasks.}\label{tab:max-return-all}
    \begin{tabular}{*{3}{l}*{8}{c}}
      \toprule % from booktabs package
      && \multicolumn{1}{l}{\textbf{Tasks \textbackslash Algs.}} & \multicolumn{1}{c}{COE} & \multicolumn{1}{c}{COE-Cond-IQ} & \multicolumn{1}{c}{COE-Cond-CQ} & \multicolumn{1}{c}{UCB-Ind} & \multicolumn{1}{c}{UCB-Cen} & \multicolumn{1}{c}{EMC} & \multicolumn{1}{c}{MAVEN} & \multicolumn{1}{c}{QMIX} \\
      \midrule
      \multirow{3}{*}{\rotatebox[origin=c]{90}{MPE}}
      && Adversary & $22.68 \pm 0.80$ & $19.18 \pm 1.70$ & $24.14 \pm 0.83$ & $23.16 \pm 1.28$ & $23.02 \pm 0.93$  & $22.03 \pm 2.12$  & $23.52 \pm 1.50$  & $22.70 \pm 1.61$  \\
      && Sparse Tag & $1.60 \pm 0.41$ & $0.16 \pm 0.18$ & $1.98 \pm 0.77$ & $1.28 \pm 0.31$ & $1.44 \pm 0.05$  & $1.23 \pm 0.35$  & $0.06 \pm 0.03$ & $1.16 \pm 0.29$  \\
      && Sparse Spread & $2.11 \pm 1.86$ & $0.99 \pm 0.85$ & $1.46 \pm 1.05$ & $1.51 \pm 1.06$ & $1.80 \pm 1.15$  & $1.31 \pm 0.92$  & $0.43 \pm 0.85$  & $1.46 \pm 0.28$  \\
      \midrule
      \multirow{4}{*}{\rotatebox[origin=c]{90}{LBF}}
      && 10x10-3p-3f & $0.99 \pm 0.01$ & $0.98 \pm 0.02$ & $0.98 \pm 0.01$ & $0.98 \pm 0.02$ & $0.99 \pm 0.01$ & $0.96 \pm 0.04$  & $0.37 \pm 0.18$ & $0.94 \pm 0.03$ \\
      && 15x15-3p-5f & $0.45 \pm 0.10$ & $0.36 \pm 0.09$ & $0.29 \pm 0.15$ & $0.37 \pm 0.08$ & $0.31 \pm 0.14$  & $0.24 \pm 0.04$ & $0.04 \pm 0.01$ & $0.20 \pm 0.02$ \\
      && 15x15-4p-3f & $0.93 \pm 0.03$ & $0.89 \pm 0.02$ & $0.63 \pm 0.13$ & $0.75 \pm 0.11$ & $0.48 \pm 0.31$ & $0.71 \pm 0.13$ & $0.06 \pm 0.01$ & $0.51 \pm 0.09$ \\
      && 15x15-4p-5f & $0.69 \pm 0.08$ & $0.38 \pm 0.05$ & $0.32 \pm 0.07$ & $0.52 \pm 0.20$ & $0.57 \pm 0.15$  & $0.50 \pm 0.08$ & $0.05 \pm 0.01$ & $0.33 \pm 0.04$ \\
      \midrule
      \multirow{4}{*}{\rotatebox[origin=c]{90}{SMAC}}
      &\multirow{2}{*}{\rotatebox[origin=c]{90}{ret}} & 2s-vs-1sc & $20.25 \pm 0.01$ & $19.57 \pm 0.73$ & $20.24 \pm 0.00$ & $15.88 \pm 7.79$ & $20.19 \pm 0.07$  & $20.22 \pm 0.06$ & $20.22 \pm 0.04$  & $20.16 \pm 0.05$ \\
      && 3s-vs-5z & $21.32 \pm 0.75$ & $21.16 \pm 0.56$ & $21.47 \pm 0.59$ & $16.93 \pm 4.24$ & $19.86 \pm 5.03$  & $14.84 \pm 4.19$ & $20.15 \pm 1.43$  & $18.57 \pm 3.01$  \\
      \cmidrule{2-11}
      &\multirow{2}{*}{\rotatebox[origin=c]{90}{win}} & 2s-vs-1sc & $1.00 \pm 0.00$ & $0.92 \pm 0.09$ & $1.00 \pm 0.00$ & $0.77 \pm 0.38$ & $0.99 \pm 0.01$  & $1.00 \pm 0.00$ & $1.00 \pm 0.00$ & $0.99 \pm 0.00$ \\
      && 3s-vs-5z & $0.97 \pm 0.00$ & $0.93 \pm 0.05$ & $0.98 \pm 0.02$ & $0.56 \pm 0.45$ & $0.61 \pm 0.37$  & $0.27 \pm 0.37$ & $0.87 \pm 0.16$  & $0.65 \pm 0.30$  \\
      \bottomrule
    \end{tabular}
\end{table*}


\begin{figure*}[!htb]
  \centering
  \includegraphics[width=0.9\linewidth]{img/marl_learning_curves_ablation.pdf}
  \caption{Episodic Returns and 95\% Confidence Interval for All Ablations in All Tasks.}\label{fig:marl-learning-curves-ablation}
\end{figure*}



\section{Ablation details}\label{apx:ablation}

In this section, we present in detail the ablation variants
introduced in \cref{sec:exp-ablation}.

COE-Cond-IQ directly adopts the idea of UCT, without considering the partial observability issue of each agent.
In order to enable decentralized execution, we simultaneously learn a Q-value function dependent on preceding agents' actions and its independent counterpart.
Similar to the MACPF factorization \citep{wang2022more}, each agent $i$ has an independent Q-network $\supidp{\qi} (\tau_i, a_i; \phi_i)$ parameterized by $\phi_i$,
and a dependency correction network $\supdep{c}_i(\tau_i, a_i | a_{<i}; \psi_i)$ parameterized by $\psi_i$,
whose sum constructs the dependent Q-network $\supdep{\qi}(\tau_i, a_i | a_{<i}; \phi_i, \psi_i) = \supidp{\qi} (\tau_i, a_i; \phi_i) + \supdep{c}_i(\tau_i, a_i | a_{<i}; \psi_i)$.


\begin{figure*}[!tb] %!htb
  \centering
  \includegraphics[width=1.0\linewidth]{img/COE_architecture-COE_cond-v2.png}
  \caption{Learning Framework for COE-Cond-CQ.}\label{fig:coe-cond-cq-framework}
\end{figure*}

Individual agent's action-value networks $\supdep{\qi}$ and $\supidp{\qi}$ are separately trained by minimizing the mean-squared TD error on each Q-network:
\begin{align}
    \supdep{\mathcal{L}}_i( \psi_i ) &= \EE_\replay [ (\supdep{\qi}(\tau_i, a_i | a_{<i}) - \supdep{y}_i )^2 ] \label{eq:td-err-dep-i} \\
    \supidp{\mathcal{L}}_i( \phi_i ) &= \EE_\replay [ (\supidp{\qi}(\tau_i, a_i) - \supidp{y}_i )^2 ] \label{eq:td-err-ind-i}
\end{align}
where $\supdep{y}_i = (r + \gamma \max_{a_i'} (\supdep{\qi}(\tau_i', a_i' | a_{<i}) ) )$ and $\supidp{y}_i = (r + \gamma \max_{a_i'} (\supidp{\qi}(\tau_i', a_i') ) )$ are the update targets, and $\replay$ contains trajectory data collected by $\supdep{\qi}$'s.
To ensure $\supdep{\qi}$ and $\supidp{\qi}$ achieve the same performance, they are constructed and trained in a way that strengthens their coupling:
$\supdep{\qi}$ is the combination of $\supidp{\qi}$ and a correction network;
during training the same mini-batch of trajectory data sampled from $\replay$ is used to compute both $\supdep{\mathcal{L}}_i$ and $\supidp{\mathcal{L}}_i$.


COE exploration is applied to this variant in a similar way as being applied to value decomposition methods.
The optimistic bonus is added to $\supdep{\qi}$ at action selection during training.
Note that for each agent $i$ the optimistic TD update target is applied to both \cref{eq:td-err-dep-i} and \cref{eq:td-err-ind-i}:
\begin{align}\label{eq:ucb-target-i}
    & y_i = \left( r(s, \jointaction) + \frac{c_\subrew}{\sqrt{N(s, a_{<i}, a_i)}} \right)
    + \gamma \max_{a_i'} \left( Q_i(\tau_i', a_i') + \frac{c_\subboot}{\sqrt{N(s', a_{<i}', a_i')}} \right),
\end{align}
where $c_\subrew, c_\subboot \in \mathbb{R}_+$ are hyper-parameters controlling the scale of the optimistic bias in reward and bootstrapped target, respectively.
During decentralized execution, agents take actions according to $\supidp{\qi}$'s only.


We name this variant COE-Cond-IQ as it could be considered as a direct adoption of UCT to IQL \citep{tan1993multi}.
As opposed to the utility function that learns implicit dependency via centralized training in value decomposition methods, each agent learns a Q-value function that explicitly captures the correlation among agents by conditioning on previous agents' actions.
COE-Cond-IQ also complies with the CTDE paradigm.
However, it ignores the partial observability of each individual agent.
Each agent only has access to its own local trajectory history.


Another ablation we introduce is COE-Cond-CQ, which combines centralized training and COE-Cond-IQ.
The learning framework of COE-Cond-CQ is illustrated in \cref{fig:coe-cond-cq-framework}.
The same mixing network $\mixer(\cdot ; \theta)$ we use in COE is used to compute both dependent and independent joint Q-values:
\begin{align}
    & \supdep{\qjt}(\jointhist, \jointaction) = \mixer \left( [ \supdep{\qi}(\tau_i, a_i | a_{<i}) ]^N_{i=1}, s; \theta \right) \label{eq:q-joint-dep-condcq} \\
    & \supidp{\qjt}(\jointhist, \jointaction) = \mixer \left( [ \supidp{\qi}(\tau_i, a_i) ]^N_{i=1}, s; \theta \right) \label{eq:q-joint-ind-condcq}
\end{align}
Similarly, centralized training also optimizes both dependent and independent mean-squared TD error:
\begin{align}
    \supdep{\mathcal{L}}( [\psi_i]^N_{i=1}, \theta ) = \EE_\replay [ (\supdep{\qjt}(\jointhist, \jointaction) - \supdep{y} )^2 ] \label{eq:td-err-dep-condcq} \\
    \supidp{\mathcal{L}}( [\phi_i]^N_{i=1}, \theta ) = \EE_\replay [ (\supidp{\qjt}(\jointhist, \jointaction) - \supidp{y} )^2 ] \label{eq:td-err-ind-condcq}
\end{align}
where $\supdep{y} = (r + \gamma \max_{\jointaction'} (\supdep{\qjt}(\jointhist', \jointaction') ) )$ and $\supidp{y} = (r + \gamma \max_{\jointaction'} (\supidp{\qjt}(\jointhist', \jointaction') ) )$ are update targets for dependent and independent networks, respectively.
Exploration is performed the same way as COE, and action selection is performed the same way as COE-Cond-IQ.


In the ablation UCB-Ind, each agent performs UCB-based exploration independently.
It is straightforward to obtain UCB-Ind: we simply replace any conditional count terms in COE with independent counts, which do not rely on other agents' actions.

The ablation UCB-Cen augments the global reward with an intrinsic reward $\frac{c_\subrew}{\sqrt{N(s, \jointaction)}}$.
Agents learn optimistic Q-values through centralized training.



\section{Hyperparameter settings}\label{apx:hyperparam}

To perform hyperparameter optimization we follow the same protocol presented by \citet{papoudakis2020benchmarking}.
We select one task from each benchmark environment and optimize the hyperparameters of all algorithms in this task.
In particular, we select \textit{Sparse Tag} from MPE, \textit{15x15-3p-5f} from LBF, and \textit{3s-vs-5z} from SMAC.
We perform a coarse grid search on hyperparameter settings and train each configuration with three seeds.
We identify the best configuration according to the maximum evaluation returns.
This best configuration on each task is then used for all tasks in the respective environment for the final experiments with five seeds.


For methods that use intrinsic reward --- i.e. COE, EMC, and MAVEN --- we only test constant intrinsic reward scales.
For COE, the hyperparameter combination with $c_\subact = c_\subrew = c_\subboot = 0$ is ignored as this setting refers to the greedy-action QMIX.
For MAVEN, we determine the hyperparameter settings according to the original paper and its accompanying codebase.
In particular, we sweep the intrinsic scales only when ``MI intrinsic'' is True.
The hyperparameters ``MI intrinsic'' and ``RNN discriminator'' cannot both be True.
When MAVEN uses $\varepsilon$-greedy, the epsilon annealing time is 50k timesteps.
Every epsilon annealing schedule --- utilized by either MAVEN or QMIX --- is linear with an initial value of $1.0$ and a final value of $0.0$.


\begin{table}[!htb] % !htb
    \centering
    \caption{Common QMIX Hyperparameters for All Algorithms across All Tasks.}\label{tab:apx-hyperparam-common}
    \begin{tabular}{cc}
      \toprule 
      Hyperparameter Name & Value \\
      \midrule
      hidden dimension & 128 \\
      reward standardization & True \\
      network type & GRU \\
      evaluation epsilon & 0 \\
      target update & 0.01 (soft) \\
      \bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Hyperparameters for COE: Values Swept in Grid-search and Best Configuration for each Benchmark.}\label{tab:apx-hyperparam-coe}
    \begin{tabular}{*{5}{c}}
      \toprule 
      Hyperparameter Name & Swept values & MPE & LBF & SMAC \\
      \midrule
      learning rate & 0.0001/0.0003/0.0005 & 0.0001 & 0.0003 & 0.0005 \\
      feature dimension $k$ & 8/12/16 & 8 & 16 & 8 \\
      $c_\subact$ & 0/0.01/0.05 & 0.01 & 0.01 & 0 \\
      $c_\subboot$ & 0/0.01/0.05 & 0 & 0 & 0 \\
      $c_\subrew$ & 0/0.01/0.05 & 0.05 & 0 & 0.05 \\
      \bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Hyperparameters for COE-Cond-IQ: Values Swept in Grid-search and Best Configuration for each Benchmark.}\label{tab:apx-hyperparam-coe-cond-iq}
    \begin{tabular}{*{5}{c}}
      \toprule 
      Hyperparameter Name & Swept values & MPE & LBF & SMAC \\
      \midrule
      learning rate & 0.0001/0.0003/0.0005 & 0.0001 & 0.0001 & 0.0005 \\
      feature dimension $k$ & 8/12/16 & 8 & 8 & 16 \\
      $c_\subact$ & 0/0.01/0.05 & 0 & 0.05 & 0.01 \\
      $c_\subboot$ & 0/0.01/0.05 & 0 & 0 & 0 \\
      $c_\subrew$ & 0/0.01/0.05 & 0.05 & 0 & 0 \\
      \bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Hyperparameters for COE-Cond-CQ: Values Swept in Grid-search and Best Configuration for each Benchmark.}\label{tab:apx-hyperparam-coe-cond-cq}
    \begin{tabular}{*{5}{c}}
      \toprule 
      Hyperparameter Name & Swept values & MPE & LBF & SMAC \\
      \midrule
      learning rate & 0.0001/0.0003/0.0005 & 0.0001 & 0.0003 & 0.0005 \\
      feature dimension $k$ & 8/12/16 & 12 & 16 & 8 \\
      $c_\subact$ & 0/0.01/0.05 & 0.05 & 0.01 & 0.05 \\
      $c_\subboot$ & 0/0.01/0.05 & 0 & 0 & 0.01 \\
      $c_\subrew$ & 0/0.01/0.05 & 0.05 & 0.01 & 0.01 \\
      \bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Hyperparameters for UCB-Ind: Values Swept in Grid-search and Best Configuration for each Benchmark.}\label{tab:apx-hyperparam-ucb-indep}
    \begin{tabular}{*{5}{c}}
      \toprule 
      Hyperparameter Name & Swept values & MPE & LBF & SMAC \\
      \midrule
      learning rate & 0.0001/0.0003/0.0005 & 0.0001 & 0.0003 & 0.0005 \\
      feature dimension $k$ & 8/12/16 & 8 & 12 & 12 \\
      $c_\subact$ & 0/0.01/0.05 & 0.01 & 0.01 & 0 \\
      $c_\subboot$ & 0/0.01/0.05 & 0 & 0.01 & 0 \\
      $c_\subrew$ & 0/0.01/0.05 & 0 & 0.01 & 0.01 \\
      \bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Hyperparameters for UCB-Cen: Values Swept in Grid-search and Best Configuration for each Benchmark.}\label{tab:apx-hyperparam-ucb-central}
    \begin{tabular}{*{5}{c}}
      \toprule 
      Hyperparameter Name & Swept values & MPE & LBF & SMAC \\
      \midrule
      learning rate & 0.0001/0.0003/0.0005 & 0.0001 & 0.0003 & 0.0005 \\
      feature dimension $k$ & 8/12/16 & 8 & 8 & 16 \\
      $c_\subrew$ & 0/0.01/0.05 & 0.05 & 0.01 & 0.05 \\
      \bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Hyperparameters for EMC: Values Swept in Grid-search and Best Configuration for each Benchmark.}\label{tab:apx-hyperparam-emc}
    \begin{tabular}{*{5}{c}}
      \toprule 
      Hyperparameter Name & Swept values & MPE & LBF & SMAC \\
      \midrule
      learning rate & 0.0001/0.0003/0.0005 & 0.0001 & 0.0003 & 0.0005 \\
      curiosity scale & 0.001/0.005/0.01/0.05/0.1/0.5/1.0 & 0.01 & 0.001 & 0.001 \\
      \bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Hyperparameters for MAVEN: Values Swept in Grid-search and Best Configuration for each Benchmark.}\label{tab:apx-hyperparam-maven}
    \begin{tabular}{*{5}{c}}
      \toprule 
      Hyperparameter Name & Swept values & MPE & LBF & SMAC \\
      \midrule
      learning rate & 0.0001/0.0003/0.0005 & 0.0003 & 0.0001 & 0.0005 \\
      RNN discriminator & True/False & False & False & False \\
      MI intrinsic & True/False & True & True & True \\
      curiosity scale & 0.001/0.005/0.01/0.05/0.1/0.5/1.0 & 0.01 & 0.05 & 0.005 \\
      noise bandit & True/False & True & False & True \\
      epsilon start & 0.0/1.0 & 1.0 & 1.0 & 1.0 \\
      \bottomrule
    \end{tabular}
\end{table}

\begin{table}[!htb]
    \centering
    \caption{Hyperparameters for QMIX: Values Swept in Grid-search and Best Configuration for each Benchmark.}\label{tab:apx-hyperparam-qmix}
    \begin{tabular}{*{5}{c}}
      \toprule 
      Hyperparameter Name & Swept values & MPE & LBF & SMAC \\
      \midrule
      learning rate & 0.0001/0.0003/0.0005 & 0.0001 & 0.0001 & 0.0005 \\
      epsilon anneal & 50,000/200,000 & 50,000 & 50,000 & 50,000 \\
      \bottomrule
    \end{tabular}
\end{table}







\bibliography{zhao_598}

\end{document}
