\documentclass[accepted]{uai2022}

\usepackage[american]{babel}
\usepackage{natbib}
\bibliographystyle{plainnat}
\renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{booktabs,url}
\usepackage{algorithm}
\usepackage{algorithmic}

%\usepackage{balance} % for balancing columns on the final page
\usepackage{rotating}
\usepackage{wrapfig}
\usepackage{latexsym}
\let\Bbbk\relax
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{enumitem}
\usepackage{xcolor}
\usepackage{microtype}
\usepackage{tabularx}
\usepackage{relsize}
\usepackage{subcaption}
\usepackage{nccmath}

% THEOREMS -------------------------------------------------------
% Theorem-like environments (amsthm). Each has its own counter, except
% corollaries, which are numbered in the same sequence as theorems.
\newtheorem{thm}{Theorem}
\newtheorem{cor}[thm]{Corollary} % shares the thm counter
\newtheorem{lem}{Lemma}
\newtheorem{prop}{Proposition}
\newtheorem{defn}{Definition}
\newtheorem{obs}{Observation}

% MATH -----------------------------------------------------------
% Auto-sized paired delimiters: norm, absolute value, and set braces.
\newcommand{\norm}[1]{\left\Vert#1\right\Vert}
\newcommand{\abs}[1]{\left\vert#1\right\vert}
\newcommand{\set}[1]{\left\{#1\right\}}
% Shorthand symbols.
\newcommand{\Real}{\mathbb R}
\newcommand{\eps}{\varepsilon}
\newcommand{\To}{\rightarrow}
\newcommand{\BX}{\mathbf{B}(X)}
% "+=" typeset as a single relation symbol.
\newcommand{\plusequals}{\mathrel{\mathord{+}\hspace*{-1pt}\mathord{=}}}
% Inline reviewer comment in red, emphasized text.
% \textcolor and \emph confine the styling to the argument; the previous
% declaration forms (\color, \em) leaked into the text following the comment.
\newcommand*{\commt}[1]{\textcolor{red}{\emph{#1}}}
\newcommand{\Org}{\textsf{Org}}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\title{Reinforcement Learning in Many-Agent Settings Under Partial Observability: Supplementary File}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\author[1]{\href{mailto:<kh45436@uga.edu>?Subject=Reinforcement Learning in Many-Agent Settings Under Partial Observability UAI 2022 paper}{Keyang~He}}
\author[1]{\href{mailto:<pdoshi@uga.edu>?Subject=Reinforcement Learning in Many-Agent Settings Under Partial Observability UAI 2022 paper}{Prashant~Doshi}}
\author[2]{\href{mailto:<bikramjit.banerjee@usm.edu>?Subject=Reinforcement Learning in Many-Agent Settings Under Partial Observability UAI 2022 paper}{Bikramjit~Banerjee}}

\affil[1]{
THINC Lab\\
Department of Computer Science\\
University of Georgia\\
Athens, GA, USA
}

\affil[2]{
School of Computing Sciences and Engineering\\
University of Southern Mississippi\\
Hattiesburg, MS, USA
}

\begin{document}
\maketitle

\section{Dynamic Programming Algorithm}
\begin{algorithm} [!ht]
\caption{\small Computing configuration distribution $Pr(\mathcal{C}|b_0(M_1), b_0(M_2), \ldots, b_0(M_N))$}
\label{alg}
\small
\begin{algorithmic}
\REQUIRE $\langle b_0(M_1), b_0(M_2), \ldots, b_0(M_N) \rangle$ 
\ENSURE $P_N$, which is the distribution $Pr(\boldsymbol{\mathcal{C}}^{\boldsymbol{a}_{-0}})$ represented as a trie.
\STATE Initialize $c^{a_i}_0 \leftarrow (0,\dots,0)$, $P_0[c^{a_i}_0] \leftarrow 1.0$
%\STATE Initialize $P_0[c^{a_i}_0] \leftarrow 1.0$
\FOR {$k = 1$ to $N$}
\STATE Initialize $P_k$ to be an empty trie
\FOR {$c^{a_i}_{k-1}$ from $P_{k-1}$}
\FOR {$a^{a_i}_k \in A^{a_i}_k$ such that $\pi^{a_i}_k(a^{a_i}_k) > 0$}
\STATE $c^{a_i}_k \leftarrow c^{a_i}_{k-1}$
\IF {$a^{a_i}_k \neq \emptyset$}
\STATE $c^{a_i}_k(a^{a_i}_k) \overset{+}{\leftarrow} 1$
\ENDIF
\IF {$P_k[c^{a_i}_k]$ does not exist}
\STATE $P_k[c^{a_i}_k] \leftarrow 0$
\ENDIF
\STATE $P_k[c^{a_i}_k] \overset{+}{\leftarrow} P_{k-1}[c^{a_i}_{k-1}] \times \pi^{a_i}_k(a^{a_i}_k)$
\ENDFOR
\ENDFOR
\ENDFOR
\RETURN $P_N$
\end{algorithmic}
\label{alg:dp}
\end{algorithm}

\section{Proof of Proposition 1}
Here we assume a common model of noise, $P(a_j^o|a_k^e)$, where the subject agent observes action $a_j^o$ from another agent when the latter executed action $a_k^e$, as
\begin{equation}
    P(a_j^o|a_k^e) = \begin{cases}1-\delta & \text{if } a_j^o=a_k^e\\ \frac{\delta}{|A|-1} & \text{otherwise}\end{cases}
\end{equation}
for some small $\delta$. The effect of such noise from the private observation of an individual agent's action can be aggregated over $N$ agents in terms of $\delta$ as follows. Suppose the observed configuration, $\omega_0'$, is $\mathcal{C}^o=(\#a_1^o,\#a_2^o,\ldots,\#a_{|A|}^o)$, and the true configuration is $\mathcal{C}^e=(\#a_1^e,\#a_2^e,\ldots,\#a_{|A|}^e)$. Then the probability of an error in the observation of a configuration is 
\begin{align*}
P(\text{error})&=\sum_{\mathcal{C}^e}\sum_{\mathcal{C}^o\neq\mathcal{C}^e}P(\mathcal{C}^o\land\mathcal{C}^e)\\
&=\sum_{\mathcal{C}^e}\sum_{\mathcal{C}^o\neq\mathcal{C}^e}P(\mathcal{C}^o|\mathcal{C}^e)P(\mathcal{C}^e)
\end{align*}
where
\begin{align}
P(\mathcal{C}^e) &= \prod_{i}\theta_i^{\#a_i^e},\ \text{and}\nonumber\\
P(\mathcal{C}^o|\mathcal{C}^e) &= \prod_{(j,k)\in A\times A} P(a_j^o|a_k^e)^{n_{jk}}\nonumber\\
\text{s.t.}\quad & (\sum_jn_{jk}=\#a_k^e)\land (\sum_kn_{jk} = \#a_j^o).\label{eqn:conditions}
\end{align}
Let $m^{oe}_i=\min\{\#a_i^o,\#a_i^e\}$. Then $P(\mathcal{C}^o|\mathcal{C}^e)$ can be maximized by setting the diagonal of the matrix $[n_{jk}]$ as $n_{ii}=m^{oe}_i$, and distributing the remaining weight $N-\sum_i m^{oe}_i$ to the off-diagonal positions while satisfying Eq.~\ref{eqn:conditions}. This yields 
\begin{align*}
    P(\mathcal{C}^o|\mathcal{C}^e) &\le (1-\delta)^{\sum_i m^{oe}_i}\left(\frac{\delta}{|A|-1}\right)^{N-\sum_i m^{oe}_i}\\
    &\le (1-\delta)^{N-1}\left(\frac{\delta}{|A|-1}\right)
\end{align*}
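where the second inequality holds because $\mathcal{C}^o\neq\mathcal{C}^e$ forces at least one action to be misobserved, i.e., $\sum_i m^{oe}_i\le N-1$, and because $\frac{\delta}{|A|-1}\le 1-\delta$ for small $\delta$. Furthermore, the number of solutions of Eq.~\ref{eqn:conditions} is at most $\prod_i(m^{oe}_i +1)=O(N^{|A|})$. Hence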
\begin{align*}
P(\text{error})&\le N^{|A|}(1-\delta)^{N-1}\left(\frac{\delta}{|A|-1}\right).
\end{align*}
The above is a decreasing function of $N$ when $N>\frac{|A|}{\log\left(1/(1-\delta)\right)}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\begin{figure*}[ht!]
\begin{subfigure}{0.23\textwidth}
\includegraphics[width=1\linewidth]{ep1}
\caption{\small Tree}
\label{fig:ep_tree}
\end{subfigure}
\begin{subfigure}{0.23\textwidth}
\includegraphics[width=1\linewidth]{ep2}
\caption{\small Star}
\label{fig:ep_star}
\end{subfigure}
\begin{subfigure}{0.23\textwidth}
\includegraphics[width=1\linewidth]{ep3}
\caption{\small Fully Connected}
\label{fig:ep_all}
\end{subfigure}
\begin{subfigure}{0.3\textwidth}
\includegraphics[width=1\linewidth]{ep4}
\caption{\small Battlefield}
\label{fig:ep_ma}
\end{subfigure}
\caption{Cumulative reward of learned policies in ($a$) tree structure, ($b$) star structure, and ($c$) fully connected structure. ($d$) Win rate against pre-trained agents in the MAgent battlefield domain.}
\label{fig:episodes}
\end{figure*}

\section{Policy Value with Respect to Episodes}
We choose to use time in hours as the metric for demonstrating the efficiency of the tested algorithms. We provide additional plots that use episodes as the metric in Fig.~\ref{fig:episodes}. QMIX and MF-AC do not converge to the optimal policy given the same number of episodes as IA2C-BU; however, it takes QMIX and MF-AC only about one third of the time to finish one episode compared to IA2C-BU.
\end{document}

