\documentclass[accepted]{uai2023} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like


%% Choose your variant of English; be consistent
\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{times}
\usepackage{soul}
\usepackage{url}
\usepackage[utf8]{inputenc}
\usepackage{graphicx,color}
\usepackage{amsmath, bm}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{dsfont}
\usepackage{booktabs}

\usepackage{bbm}
\usepackage{nicefrac}
\urlstyle{same}

\usepackage[noend]{algorithm2e}

\RestyleAlgo{ruled}
\usepackage{mathtools}
\mathtoolsset{showonlyrefs}
\urlstyle{same}

\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\newtheorem{lemma}{Lemma}
\newtheorem{problem}{Problem}
\newtheorem{remark}{Remark}


%%%%%%%% Tikz setup %%%%%%%%
\usepackage{pgfplots}
% Map the Unicode minus sign (U+2212) to an ordinary minus.
% The replacement text must NOT contain U+2212 itself: with inputenc that
% makes the mapping self-referential, so typesetting the character would
% expand to itself without end. \ensuremath keeps the result a proper
% minus sign whether the character appears in text or in math mode.
% NOTE(review): U+2212 presumably comes from externally generated pgfplots
% figures (e.g. tick labels) — confirm against the included figure files.
\DeclareUnicodeCharacter{2212}{\ensuremath{-}}
\usepgfplotslibrary{groupplots, dateplot}
\usetikzlibrary{patterns, shapes, automata, arrows, shapes.arrows, positioning, decorations.pathreplacing, calligraphy, calc}

\pgfplotsset{compat=newest}

\usepackage{tikz}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% CUSTOM LEGEND SETUP%%%%%%%%%%%%%%%%%%%%
\usepgfplotslibrary{external}
\pgfplotsset{compat=newest}
% customlegend: draws a pgfplots legend outside of any axis environment.
%   Optional argument #1: pgfplots keys, passed verbatim to \pgfplotsset and
%   kept local by the \begingroup/\endgroup pair opened and closed below.
% The pgfplots internals are reached through \csname ... \endcsname because
% their names contain `@', which cannot appear in a control word here
% without \makeatletter.
\newenvironment{customlegend}[1][]{%
	\begingroup
	% inits/clears the lists (which might be populated from previous
	% axes):
	\csname pgfplots@init@cleared@structures\endcsname
	\pgfplotsset{#1}%
}{%
	% draws the legend:
	\csname pgfplots@createlegend\endcsname
	\endgroup
}%

% makes \addlegendimage available (typically only available within an
% axis environment):
% \def (rather than \newcommand) is used, so any existing definition of
% \addlegendimage is overwritten without an error.
\def\addlegendimage{\csname pgfplots@addlegendimage\endcsname}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> defaults to `-'.
% NOTE(review): labelled as an example by its own comment and not used in
% the visible part of this file — confirm before removing.
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Differential Privacy in Cooperative Multiagent Planning}

\author[1]{Bo~Chen\thanks{Indicates equal contribution.}}
\author[1]{Calvin~Hawkins$^*$}
\author[2]{Mustafa~O.~Karabag$^*$}
\author[2]{Cyrus~Neary$^*$}
\author[1]{Matthew~Hale}
\author[2]{Ufuk~Topcu}

 \affil[1]{
The University of Florida
 }
  \affil[2]{
The University of Texas at Austin
 }
  
\begin{document}
%%%%%%%%%%%%%%%%% MACROS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Colors
\definecolor{minDependencyPolicy}{RGB}{180, 65, 161}
\definecolor{baselinePolicy}{RGB}{46, 36, 47}

%Math symbols
\newcommand{\expectation}{\mathbb{E}}
\newcommand{\kl}{KL}
\newcommand{\entropy}{H}
\newcommand{\distribution}{\Delta}
\newcommand{\probabilityMeasure}{\mu}
\newcommand{\genericRandomVar}{Y}
\newcommand{\genericRandomVarSupport}{\mathcal{\genericRandomVar}}
\newcommand{\genericDistribution}{Q}
\newcommand{\genericDistributionSupport}{\mathcal{\genericDistribution}}
\newcommand{\genericFunction}{f}
\newcommand{\genericFunctionAlt}{g}
\newcommand{\emptyString}{\varepsilon}

\newcommand{\epsilonTransition}{\alpha}

\newcommand{\genericString}{w}
\newcommand{\constantNumber}{K}
\newcommand{\genericSet}{V}

% MDP
\newcommand{\mdp}{\mathcal{M}}
\newcommand{\mdpState}{s}
\newcommand{\mdpStateAlt}{y}
\newcommand{\mdpInitialState}{\mdpState_{I}}
\newcommand{\mdpStateSpace}{\mathcal{S}}
\newcommand{\mdpAction}{a}
\newcommand{\mdpActionAlt}{b}
\newcommand{\mdpActionSpace}{\mathcal{A}}
\newcommand{\mdpReward}{\mathcal{R}}
\newcommand{\mdpTransition}{\mathcal{T}}
\newcommand{\outdegree}{\rho}
\newcommand{\feasibleSet}{D_{\jointPolicy,T}}
\newcommand{\policy}{\pi}

\newcommand{\mdpPath}{\xi}
\newcommand{\mdpStateSeq}{h}
\newcommand{\mdpPathDist}{\Gamma}
\newcommand{\mdpValue}{v}
\newcommand{\mdpStateActionProcess}{X}
\newcommand{\mdpJointPathProcess}{\bm{X}}
\newcommand{\mdpStationaryStateActionProcess}{\bar{X}}
\newcommand{\mdpStateActionProcessAlt}{Y}
\newcommand{\mdpMixedStateActionProcess}{\bar{\mdpStateActionProcess}}
\newcommand{\mdpStateRandomVar}{S}
\newcommand{\mdpActionRandomVar}{A}

\newcommand{\timeHorizon}{T}
\newcommand{\randomReachTime}{\eta}

%%%%%%%%% CHANGE THESE %%%%%%%%%%%%%%
\newcommand{\joint}{joint}
\newcommand{\fullcommunication}{full}
\newcommand{\fullyimaginary}{full\text{ }img}
\newcommand{\imaginary}{img}
\newcommand{\intermittent}{int}

% Markov game
\newcommand{\game}{\bm{\mdp}}
\newcommand{\gameState}{\bm{\mdpState}}
\newcommand{\gameStateAlt}{\bm{\mdpStateAlt}}
\newcommand{\gameActionAlt}{\bm{\mdpActionAlt}}
\newcommand{\gameInitialState}{\bm{\mdpInitialState}}
\newcommand{\gameStateSpace}{\bm{\mdpStateSpace}}
\newcommand{\gameAction}{\bm{\mdpAction}}
\newcommand{\gameActionSpace}{\bm{\mdpActionSpace}}
\newcommand{\gameTransition}{\bm{\mdpTransition}}
\newcommand{\gameReward}{\bm{\mdpReward}}
\newcommand{\gameStateRandomVar}{\bm{\mdpStateRandomVar}}
\newcommand{\gameActionRandomVar}{\bm{\mdpActionRandomVar}}
\newcommand{\gameStateActionProcessAlt}{\bm{\mdpStateActionProcessAlt}}
% Effective-length operator for trajectories. Typeset as an upright operator
% name: a bare `len' in math mode is set as the product of the three italic
% variables l, e and n, with the wrong spacing before the argument.
% \operatorname comes from amsmath (loaded via mathtools) and is math-mode only.
\newcommand{\len}{\operatorname{len}}
\newcommand{\expectedLength}{l}
\newcommand{\pathSet}{W}
\newcommand{\reachPathSet}{R}
\newcommand{\gameAbsorbingState}{\gameState_\alpha}

\newcommand{\jointPolicy}{\bm{\policy}}
\newcommand{\localPolicy}{\policy}

% Subsets of the joint state space used by the reach-avoid objective.
% \targetSet: the set the team must reach (subscript T).
\newcommand{\targetSet}{\gameStateSpace_{\mathcal{T}}}

% \deadSet: the set the team must avoid (subscript A).
% \deadSetPrime: states from which the target set is reached with
%   probability 0 under any collection of local policies (subscript D).
% NOTE(review): \doneSet (subscript E) is not used in the visible part of
%   the file — its meaning should be confirmed where it is first used.
\newcommand{\deadSet}{\gameStateSpace_{\mathcal{A}}}
\newcommand{\deadSetPrime}{\gameStateSpace_{\mathcal{D}}}
\newcommand{\doneSet}{\gameStateSpace_{\mathcal{E}}}
% End-of-process states, subscripted with \epsilonTransition (= \alpha).
% \gameProcessEndState typesets identically to \gameAbsorbingState.
\newcommand{\gameProcessEndState}{\gameState_{\epsilonTransition}}
\newcommand{\mdpProcessEndState}{\mdpState_{\epsilonTransition}}

\newcommand{\gamePath}{\bm{\mdpPath}}
\newcommand{\gamePathDist}{\bm{\mdpPathDist}}
\newcommand{\gameValue}{\bm{\mdpValue}}
\newcommand{\gameStateActionProcess}{\bm{\mdpStateActionProcess}}
\newcommand{\totalCorrelation}{C}
\newcommand{\totalCorrelationUpperBound}{\bar{\totalCorrelation}}

% multi-agent macros
\newcommand{\numAgents}{N}

%Communication systems
\newcommand{\probabilityFailureForever}{p}
\newcommand{\probabilityFailureOneStep}{q}
\newcommand{\sequenceCommAvailibility}{\Lambda}
\newcommand{\oneStepCommAvailibility}{\lambda}

% Optimization problem macros
\newcommand{\expectedLengthCoef}{\delta}
\newcommand{\totalCorrelationCoef}{\beta}
\newcommand{\occupancyVar}{x}

% Running example
% Single-letter labels for entities in the running example.
% \rover, \robot and \agent all expand to the same letter R, so they are
% indistinguishable in the typeset output.
% \goal expands to T, the same letter as \timeHorizon.
\newcommand{\rover}{R}
\newcommand{\robot}{R}
\newcommand{\agent}{R}
\newcommand{\base}{B}
\newcommand{\goal}{T}

% Privacy macros
\newcommand{\adj}{\textnormal{Adj}}
\newcommand{\oblivious}{tr}
\newcommand{\private}{pr}
\newcommand{\genericPath}{\bm{v}}
\newcommand{\genericAgentPath}{v}

\newcommand{\genericPathAlt}{\bm{w}}
\newcommand{\genericAgentPathAlt}{w}

\newcommand{\privatePath}{\Tilde{\bm{z}}}
\newcommand{\privateAgentPath}{\Tilde{z}}

\newcommand{\privateMdpState}{\Tilde{\mdpState}}
\newcommand{\privateGameState}{\Tilde{\gameState}}
\newcommand{\privacyLevel}{\epsilon}
\newcommand{\adjParam}{k}
\newcommand{\trueStateProb}{\tau}

\newcommand{\probONB}{p_{\textrm{onb}}}
\newcommand{\probOffline}{p_{\textrm{off}}}
\newcommand{\probRepair}{p_{\textrm{r}}}

\newcommand{\directedGraph}{G}
\newcommand{\graphEdges}{E}

%%%%%%%%%%%%%%% START DOCUMENT %%%%%%%%%%%%%%%%%%%%
\maketitle

\begin{abstract}
Privacy-aware multiagent systems must protect agents' sensitive data while simultaneously ensuring that agents accomplish their shared objectives. Towards this goal, we propose a framework to privatize inter-agent communications in cooperative multiagent decision-making problems. We study sequential decision-making problems formulated as cooperative Markov games with reach-avoid objectives. We apply a differential privacy mechanism to privatize agents' communicated symbolic state trajectories, and analyze tradeoffs between the strength of privacy and the team's performance. For a given level of privacy, this tradeoff is shown to depend critically upon the total correlation among agents' state-action processes. We synthesize policies that are robust to privacy by reducing the value of the total correlation. Numerical experiments demonstrate that the team's performance under these policies decreases by only \(6\) percent when comparing private versus non-private implementations of communication. By contrast, the team's performance decreases by \(88\) percent when using baseline policies that ignore total correlation and only optimize team performance.
\end{abstract}

\section{Introduction}
In cooperative multiagent systems, a team of decision-making agents interacts with a shared environment to accomplish a common objective~\citep{Cao2013distributed,Parker2016robot}.
In these systems, inter-agent communication is often necessary for the successful coordination of the team; each agent typically relies on information pertaining to its teammates while making its own decisions.
However, this communicated information may be sensitive.
For example, it may be beneficial for autonomous vehicles to share location data while solving multi-vehicle routing problems. 
However, this would reveal the passengers' sensitive location data.
Smart grids are another example in which households connected to the grid are incentivized to work cooperatively by sharing the power status of their appliances. However, this information could reveal the personal habits and schedules of the tenants~\citep{farokhi2017fisher}.
Privacy-aware multiagent systems should protect the agents' sensitive data while simultaneously ensuring that the agents are able to accomplish their common objective.


In this work, we develop such privacy-aware multiagent systems.
In particular, we study sequential multiagent decision problems formulated as cooperative Markov games with reach-avoid objectives.
We assume that a trusted central aggregator is used to synthesize a collection of local policies for the team of agents \emph{a priori}.
In general, the local policy of a particular agent maps from the state information of some subset of its teammates, to the agent's local action space.
However, during policy execution, the agents want to keep their individual state trajectories private from their teammates and from potential eavesdroppers (the aggregator is not involved at run time). 
When the local policies do not take privacy into consideration, their performance under private communications can decrease dramatically, as shown by our numerical results.
Thus, we develop a framework to privatize the inter-agent communications required to execute the policies, and to synthesize policies that are performant under private communications.

We use \textit{differential privacy}~\citep{dwork2014algorithmic} to develop a framework providing formal privacy guarantees in multiagent systems.
In the Markov game, each agent is modeled by a Markov decision process (MDP), and we are concerned with privatizing the state trajectories of these MDPs.
We implement differential privacy using the Online Mechanism for Markov chains presented by~\citet{chen2022differential}. 
Under an assumption on the structure of the dependencies of the agents' local policies on their teammates' states, we show that this mechanism guarantees differential privacy for the symbolic state trajectories produced by the MDPs.
The mechanism also provides an efficient method for agents to generate private states in real-time, and it ensures that each agent's private trajectory is feasible with respect to the underlying dynamics of its MDP.
Finally, the strength of the privacy guarantees can be tuned by each agent.

Our specific contributions in this work are as follows:
\begin{enumerate}

    \item \textit{A framework for differential privacy in multiagent systems.} We propose a framework for differential privacy in multiagent planning problems.
    The framework allows for the decentralized execution of local policies under private inter-agent communications.
    
    \item \textit{Theoretical results: Analyzing the tradeoff between privacy and performance.} 
    We bound the team's success probability under private communications in terms of the strength of privacy and total correlation of agents' state-action processes.
    
    \item 
    \textit{Synthesis of policies robust to private communications.}
    By minimizing this total correlation value, we synthesize policies for the agents that achieve high performance under strong levels of privacy.
    
\end{enumerate}

Numerical experiments demonstrate the strong performance of the synthesized policies, even with private communications.
We observe that under private communications: 1) the proposed minimum-dependency policies are \(84\) percent more performant than baseline policies that only optimize the team's performance under truthful communications and that ignore total correlation, 2) as the total correlation decreases, the team's performance increases, and 3) the performance of the minimum-dependency policies is robust to the level of privacy enforced by the privacy mechanism.

Despite the importance of privacy in multiagent systems~\citep{such2014privacysurvay}, existing algorithms for multiagent planning and learning typically do not examine the tradeoff between privacy and team performance, and many do not consider privacy at all.
\citet{nissim2014distributed,brafman2015privacy,tovzivcka2016privacy,vstolba2018quantifying,vstolba2022privacy} explore the notion of \textit{strong privacy} in multiagent planning problems for deterministic environments. \citet{tovzivcka2016privacy,vstolba2018quantifying,vstolba2022privacy} focus on the privacy of the planning process itself, where sensitive information refers to specific states and actions that are kept hidden from adversaries while synthesizing a plan. \citet{nissim2014distributed,brafman2015privacy} develop algorithms that ensure agents do not share sensitive states or actions when executing a distributed planning algorithm for deterministic environments. \citet{hefner2022privacy} extends the notion of strong privacy to stochastic systems and develops a distributed value iteration algorithm for privacy-preserving planning. 
These works are concerned with hiding a private portion of each agent's states, and they do not consider mechanisms in which the agents achieve privacy by altering their shared information. By contrast, our work studies a differential privacy mechanism that alters the state trajectories of the agents during multiagent communication in stochastic environments. More closely related to our approach, \citet{Ye2022} uses differential privacy to privatize the local information of the agents.
However, different from our problem setting, they consider logistic-like problems modeled with Graph-STRIPS.

Meanwhile, differential privacy has been studied in the context of planning and reinforcement learning for MDPs \citep{garcelon2020local,qiao2022offline,gohari2021differential}. 
However, these works study single-agent problems and they are mainly concerned with privatizing value functions, reward values, or transition probabilities.
Our work instead considers the multiagent setting and we define differential privacy over symbolic state trajectories.
In particular, we extend the differential privacy mechanism presented by \citet{chen2022differential} to multiagent planning problems, and we study the impact of privacy on the team's performance.

Decentralized policy execution has gained attention for planning and reinforcement learning in multiagent MDPs \citep{becker2003transition,rashid2018qmix,son2019qtran,oliehoek2016concise,Karabag2022}.
As a byproduct of decentralized policy execution, these algorithms may achieve privacy in the sense that agents do not communicate locally available information.
However, these works do not explicitly consider privacy or give privacy guarantees. Furthermore, it may not be possible to obtain high performance under fully decentralized policy execution.
For this reason, we allow for private communications and use total correlation as a soft decentralization metric, which enables the synthesis of policies that are performant under private communications.

\section{Preliminaries}
\label{sec:prelims}

\subsection{Cooperative Markov Games }
\label{sec:prelim_games}
Given a finite collection of $\numAgents$ agents indexed by $i\in\{1,2,\dots,\numAgents\},$ we model the dynamics of agent $i$ with an MDP $\mdp^i$. An MDP is a tuple $\mdp^i = (\mdpStateSpace^i,\mdpInitialState^i,\mdpActionSpace^i,\mdpTransition^i),$ where $\mdpStateSpace^i$ is agent $i$'s finite set of local states, $\mdpInitialState^i\in\mdpStateSpace^i$ is an initial state, $\mdpActionSpace^i$ is agent $i$'s finite set of local actions, and $\mdpTransition^i:\mdpStateSpace^i\times\mdpActionSpace^i\to\distribution(\mdpStateSpace^i)$ is a transition probability function, where $\distribution(\mdpStateSpace^i)$ denotes the set of probability distributions over the state space $\mdpStateSpace^i.$ For brevity, we use $\mdpTransition^i(s^i,a^i,y^i)$ to denote the probability of $y^i$ given by the distribution $\mdpTransition^i(s^i,a^i).$ A state $\mdpState^i_j\in\mdpStateSpace^i$ is called a \emph{feasible state} of another state $\mdpState^i_k\in\mdpStateSpace^i$ if there exists an action $a^i\in\mdpActionSpace^i$ such that $\mdpTransition^i(\mdpState^i_k,\mdpAction^i,\mdpState^i_j)>0$. 

Given such a collection of agents, we formulate the team's decision problem as a cooperative Markov game with independent transitions $\game.$ A cooperative Markov
game involving $\numAgents$ agents, each of which is modeled by an MDP $\mdp^i = (\mdpStateSpace^i,\mdpInitialState^i,\mdpActionSpace^i,\mdpTransition^i),$ is given by the tuple $\game = (\gameStateSpace,\gameInitialState,\gameActionSpace,\gameTransition).$ Here, $\gameStateSpace=\mdpStateSpace^1\times\dots\times\mdpStateSpace^\numAgents$ is the joint state space, $\gameInitialState=(\mdpInitialState^1,\dots,\mdpInitialState^\numAgents)$ is the joint initial state, $\gameActionSpace=\mdpActionSpace^1\times\dots\times\mdpActionSpace^\numAgents$ is the joint action space, and $\gameTransition:\gameStateSpace\times\gameActionSpace\to\distribution(\gameStateSpace)$ is the joint transition probability function. 
For brevity, we use $\gameTransition(\gameState,\gameAction,\gameStateAlt)$ to denote the probability of $\gameStateAlt$ given by the distribution $\gameTransition(\gameState,\gameAction).$ 
$\gameTransition$ is defined as $\gameTransition(\gameState,\gameAction,\gameStateAlt)=\prod_{i=1}^\numAgents \mdpTransition^i(\mdpState^{i},\mdpAction^{i},\mdpStateAlt^{i})$ for all $\gameState=(\mdpState^1,\dots,\mdpState^\numAgents)\in\gameStateSpace,$ $\gameStateAlt=(\mdpStateAlt^1,\dots,\mdpStateAlt^\numAgents)\in\gameStateSpace$ and $\gameAction=(\mdpAction^1,\dots,\mdpAction^\numAgents)\in \gameActionSpace.$
%Let $\gameState\in\gameStateSpace$ and $\gameAction\in\gameActionSpace$ denote a joint state and action, respectively. 

For notational convenience, we use $\gameState^{-i} \in \gameStateSpace^{-i} = \mdpStateSpace^1 \times \ldots \times \mdpStateSpace^{i-1} \times \mdpStateSpace^{i+1} \times \ldots \times \mdpStateSpace^\numAgents$ to denote the states of agent $i$'s teammates, excluding agent $i$ itself. 
Similarly, $\gameAction^{-i}$ and $\gameActionSpace^{-i}$ denote the actions of agent $i$'s teammates and the set of all possible actions of teammates, respectively.

A (stationary) local policy \(\localPolicy^{i} : \gameStateSpace \to \distribution(\mdpActionSpace^{i})\) of agent \(i\) is a mapping from a particular joint state to a probability distribution over the actions of agent \(i\). Given that the team is in joint state \(\gameState\), \(\localPolicy^{i}(\gameState, \mdpAction^{i})\) denotes the probability that action \(\mdpAction^{i}\) is selected by \(\localPolicy^{i}\) for agent \(i\). We define a (stationary) joint policy \(\jointPolicy\) to be a collection of local policies, \(\lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}\). 


In a truthful communication setting, at each timestep, each agent \(i\) observes its local state \(\mdpState^{i}_{t}\), and communicates this information with all of its teammates. 
Each agent then uses the state information communicated by its teammates, and its local policy \(\localPolicy^{i}\), to sample an action \(\mdpAction^{i} \in \mdpActionSpace^{i}\) to execute.

In this work, we consider team reach-avoid problems.
The centralized planning problem is to solve for a collection of local policies \(\lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}\) maximizing the probability that the team reaches a target set \(\targetSet \subseteq \gameStateSpace\) of states from the team's initial joint state \(\gameInitialState\), while avoiding a set \(\deadSet \subseteq \gameStateSpace\) of states.
We call this probability value the success probability.
More formally, we say that a state-action \textit{trajectory} \(\gamePath = \gameState_0 \gameAction_0 \gameState_1 \gameAction_1 \ldots\) successfully reaches the target set \(\targetSet\) if there exists some time \(M\) such that \(\gameState_{M} \in \targetSet\) and for all \(t < M\), \(\gameState_t \not \in \deadSet\).
While we focus on reach-avoid problems, our framework can be applied to settings with generic rewards.

We use $\occupancyVar_{\gameState, \gameAction}$ to denote the occupancy measure of the state-action pair $(\gameState, \gameAction)$, i.e., the expected number of times that action $\gameAction$ is taken at state $\gameState$. 
Similarly, $\occupancyVar_{\mdpState^i, \mdpAction^i}$ denotes the occupancy measure of the state-action pair $\left(\mdpState^i, \mdpAction^i\right)$ for agent $i$, where $\occupancyVar_{\mdpState^i, \mdpAction^i}=\sum_{\gameState^{-i} \in \gameStateSpace^{-i}} \sum_{\gameAction^{-i} \in \gameActionSpace^{-i}} \occupancyVar_{(\mdpState^i, \gameState^{-i}), (\mdpAction^{i}, \gameAction^{-i})}.$ 
Let \( \deadSetPrime\) be the states from which the probability of reaching \(\targetSet\) is \(0\) under any collection of local policies. The following assumption ensures that every trajectory satisfies or violates the reachability specification in finite time. 
\begin{assumption}
    The total occupancy measure is finite at states \(\gameStateSpace \setminus (\targetSet \cup \deadSetPrime)\), i.e., \(\sum_{\gameState \in \gameStateSpace\setminus(\targetSet \cup \deadSetPrime), \gameAction \in \gameActionSpace} \occupancyVar_{\gameState, \gameAction} < \infty\).
\end{assumption}
% 
A state-action trajectory $\mdpPath^i$ of the MDP $\mdp^i$ is a sequence $\mdpPath^i=\mdpState_0^i \mdpAction_0^i \mdpState_1^i \mdpAction_1^i\dots$ such that for all $t=0,1,\dots,$ $\mdpTransition^i(\mdpState_t^i,\mdpAction_t^i,\mdpState_{t+1}^i)>0.$ We use $\gamePath=\gameState_{0}\gameAction_{0}\gameState_{1}\dots$ to denote the joint state-action trajectory of all agents and $\gamePath^{-i}=\gameState_{0}^{-i}\gameAction_{0}^{-i}\gameState_{1}^{-i}\gameAction_{1}^{-i}\dots$ to denote the joint state-action trajectory with agent $i$ excluded. 
% Note that \(\gamePath\) and \(\gamePath^{-i}\) are both strings of vectors. 
We define the effective length of a trajectory as \(\len(\gamePath = \gameState_0 \gameAction_0\ldots) = \min\lbrace t+1 \mid \gameState_{t} \in \targetSet \cup \deadSetPrime \rbrace \).
Let agent $i$'s state trajectory up to time $t$ be $\mdpStateSeq_t^i=\mdpState_0^i \mdpState_1^i \dots \mdpState_t^i.$ We are concerned with the privacy of $\mdpStateSeq_t^i$ so that agents can execute their policy without revealing sensitive information. 

\subsection{Differential Privacy}
Differential privacy is enforced by a \emph{mechanism}, which is a randomized map. 
We enforce differential privacy on a per-agent basis, an approach sometimes called ``local differential privacy''~\citep{Duchi2019minimax}.
For nearby local state trajectories, a mechanism must produce local private trajectories that are approximately indistinguishable.
The definition of ``nearby'' is given by an adjacency relation using the Hamming distance~\citep{Schulz2003distance} denoted by $d(\genericAgentPath^i,\genericAgentPathAlt^i)$, which is a metric that measures the minimum number of substitutions that can be applied to a local trajectory $\genericAgentPath^i$ of agent $i$ to convert it to $\genericAgentPathAlt^i$.

Next, we define the notions of adjacency and local differential privacy for the Markov game.
Let $\feasibleSet=\left\{(\gameState_0\dots\gameState_T)\mid \forall t,\exists\gameAction_t, \gameTransition{(\gameState_{t},\gameAction_{t},\gameState_{t+1})}\jointPolicy(\gameState_t,\gameAction_t)>0\right\}$ denote the set of all feasible joint state trajectories of $\game$ under a joint policy $\jointPolicy$.
% 
\begin{definition}[Adjacency]
\label{dfn:adjacency}
Fix a length $T\in\mathbb{N}^+$ and an adjacency parameter $\adjParam\in\mathbb{N}^+$. For a Markov game $\game$ with state space $\gameStateSpace$ and a joint policy $\jointPolicy$, the adjacency relation for agent $i$ is $\adj_{T,\adjParam}^i=\{(\genericPath,\genericPathAlt)\in \feasibleSet \times \feasibleSet\ |\ d(\genericAgentPath^i,\genericAgentPathAlt^i)\leq \adjParam,\text{ and } \forall j\neq i, \genericAgentPath^j=\genericAgentPathAlt^j\}.$
\end{definition}
%
For agent $i$, two $T$-length joint trajectories are adjacent if the Hamming distance between agent $i$'s corresponding local trajectories is less than or equal to $\adjParam$, and the local trajectories of the rest of the team remain the same. 
%The adjacency relation of agent $i$ must account for the behavior of the rest of the network: a malicious agent may infer the local trajectories of one agent using others' trajectories.
We next introduce the definition of word local differential privacy.


\begin{definition}[Word Local Differential Privacy]
\label{dfn:word_dp}
Fix a probability space $(\Omega,\mathcal{F},\mathbb{P}),$ an adjacency parameter $k\in\mathbb{N}^+,$ a length $T\in\mathbb{N}^+,$ and a privacy parameter $\privacyLevel>0$. For a Markov game $\game$ with state space $\gameStateSpace$ and a joint policy $\jointPolicy$, a mechanism $M:\feasibleSet\times\Omega\to \distribution((\mdpStateSpace^i)^T)$ is $\privacyLevel$-word local differentially private for agent $i$ if, for all trajectories $(\genericPath,\genericPathAlt)\in \adj_{T,k}^i$ and all $L\subseteq (\mdpStateSpace^i)^T,$ it satisfies
$\mathbb{P}[M(\genericPath)\in L]\leq e^\privacyLevel \mathbb{P}[M(\genericPathAlt)\in L].$
\end{definition}


Consider two candidate trajectories for agent $i$ where one of them is the true local trajectory and the other is an adjacent local trajectory. 
Intuitively, word local differential privacy guarantees that given agent $i$'s private local trajectory, a malicious agent cannot reliably tell which candidate trajectory is agent $i$'s true local trajectory, even if the malicious agent has access to the true trajectories of agent $i$'s teammates.
Definition~\ref{dfn:word_dp} is an extension of Definition 2 from~\citet{chen2022differential} for the multiagent setting. The privacy parameter $\privacyLevel$ controls the strength of privacy and a smaller $\privacyLevel$ implies stronger privacy. In the literature, $\privacyLevel$ typically ranges from 0.01 to 10~\citep{Hsu2014DifferentialPA}.

\section{Problem Formulation and Assumptions}
\label{sec:prob_form}
In this section, we state the problem of privatizing inter-agent communications in a cooperative Markov game and introduce the relevant assumptions. We begin with the problem statements.
Consider $N$ agents playing a cooperative Markov game with a reach-avoid objective as introduced in~\S\ref{sec:prelim_games}.

\begin{problem}
\label{prb:mechanism}
Design an online privacy mechanism that provides $\privacyLevel$-word local differential privacy (Definition~\ref{dfn:word_dp}) for the state trajectory~$\mdpStateSeq_t^i=\mdpState_0^i \mdpState_1^i \dots \mdpState_t^i$ of agent $i$ in real time, i.e., without knowledge of $\mdpState^i_{t+1},\mdpState^i_{t+2},\dots$ at time $t.$ 
\end{problem}

\begin{problem}
    \label{prb:execution}
    Define an algorithm for the decentralized execution of policies \(\lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}\) under private communications.
\end{problem}

\begin{problem}
\label{prb:bound}
Given a collection of local policies \(\lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}\), provide a bound on the probability of success under private communications $\gameValue^{\private}$. Use this bound to analyze the tradeoffs between privacy and performance in the multiagent system.
\end{problem}

\begin{problem}
\label{prb:synthesis}
Synthesize policies for the multiagent system that achieve high performance under strong levels of privacy, by taking into account the tradeoffs analyzed in Problem~\ref{prb:bound}.
\end{problem}

\paragraph{Privacy Assumptions:} We define the information that the agents provide to the central planner and the information that they hide. We then illustrate this setting with an example.

We assume that each agent trusts a central planner to design local policies. Each agent allows the planner to access its individual MDP, denoted as $\mdp^i$ for each $i\in[\numAgents]$, where $[\numAgents]=\{1,2,\dots,\numAgents\}$. The planner also has knowledge of the game's objective, which can be specified as reach and avoid sets $\targetSet$ and $\deadSet$ or as a reward function. The central planner uses this information to provide each agent with a local policy $\localPolicy^{i}$.
These local policies are assumed to be stationary and the action distribution of each agent is independent of the actions of its teammates given the joint state. This means that the central planner will not synthesize policies that compromise privacy: agent $i$ does not gain knowledge of any other agent's actions by sampling its own local policy $\localPolicy^i.$ 
We additionally assume that the initial joint state, $\bm{\mdpInitialState}$, is public information.

Furthermore, we assume that the local policies synthesized by the central planner have an \textit{acyclic dependency structure}.
Let \(G = ([\numAgents], E)\) be a directed graph, where \(E \subseteq [\numAgents] \times [\numAgents]\). 
We use \(\directedGraph\) to define the dependency structure of the local policies in the sense that an edge \((i,j) \not \in E\) if and only if 
\(\localPolicy^{i}(\gameState, \mdpAction^{i}) = \localPolicy^{i}(\gameStateAlt, \mdpAction^{i})\) for all 
\(\gameState=(\mdpState^1,\dots, \mdpState^\numAgents) \in \gameStateSpace\) and \(\gameStateAlt=(\mdpStateAlt^1,\dots,\mdpStateAlt^\numAgents) \in \gameStateSpace\) with \(\mdpState^{k} = \mdpStateAlt^{k}\) for all \(k \neq j\). 
In words, \((i,j) \notin E\) if the local policy of agent \(i\) does not depend on the state of agent \(j\). 
The assumption on the acyclic dependency structure of the agents' policies can then be defined as follows.

\begin{assumption} \label{assumption:acyclicdependency}
The graph \(G' = ([\numAgents], E')\) is a directed acyclic graph, where \(E' = E\setminus\lbrace (i,i) \mid i \in [\numAgents]\rbrace\).
\end{assumption}

While we formulate the problem as a cooperative Markov game from the point of view of a central planner, we note that each of the agents only cooperates with its teammates insofar as it follows the policy provided to it by the planner. 
Hence, the agents do not necessarily trust each other.
As an example, the agents in a smart grid might be incentivized to work cooperatively to improve the overall efficiency of the grid.
However, these agents might still have individual privacy concerns: adversaries can infer the power-consumption habits of individuals, e.g., the level of occupancy of a household, from the data shared with the rest of the grid~\citep{farokhi2017fisher}.

We accordingly assume that the agents do not have access to each other's transition probabilities or actions. 
The agents also do not observe whether the reach-avoid specification is satisfied or violated. 
Each agent \emph{only} receives a local policy from the central planner and private state information from the agents that this policy depends on, as defined by \(\directedGraph\).

Lastly, we note that the methods presented in this paper can be applied when each agent has a different privacy level, i.e., different values of $\privacyLevel$. However, for convenience we assume that each agent has the same privacy parameter~$\privacyLevel.$ 

\begin{example}
In this example, the sensitive information is the location of two drivers, Alice and Bob, who work for a taxi service.
A central planner employed by the taxi service generates local policies for Alice and Bob to follow.
These local policies dynamically assign each of the drivers to a passenger based on their location proximity.
Alice and Bob thus need to share their locations with each other in order to follow their respective policies.
However, Alice and Bob would prefer to keep their locations private.
Consequently, they use privacy mechanisms while communicating their locations.
For example, they randomize their location data before sharing it so that their true locations are not revealed. 

With this private information, Alice and Bob then execute the local policies synthesized by the central planner. 
However, because they are sharing perturbed location data, the local policies may not be executed as efficiently as they could be if they had access to each other's true locations.
The central planner should take the privacy mechanisms into account while synthesizing local policies in order to balance privacy and performance.
\end{example}

\section{Implementing Local Policies with Private Communications}\label{sec:implemntation}
In this section, we solve Problems~\ref{prb:mechanism} and~\ref{prb:execution}. Specifically, in~\S\ref{subsec:privacy_implementation}, we modify the online mechanism for Markov chains from~\citet{chen2022differential} to privatize state trajectories of an MDP. Then, in~\S\ref{subsec:policy_implementation} we detail how each agent can use other agents' private state information to execute its local policy.


\subsection{Implementing Differential Privacy}\label{subsec:privacy_implementation}
We enforce privacy on a per-agent basis. 
That is, we develop a mechanism for agent $i$ to share its local state trajectory $\mdpStateSeq_t^i=\mdpState_1^i \mdpState_2^i \dots \mdpState_t^i\in (\mdpStateSpace^i)^t$ during policy execution while satisfying word local differential privacy from Definition~\ref{dfn:word_dp}. 
To achieve this, agent $i$ will only share a private state trajectory $\Tilde{\mdpStateSeq}_t^i=\privateMdpState_1^i \privateMdpState_2^i \dots \privateMdpState_t^i\in (\mdpStateSpace^i)^t.$ 
To generate $\Tilde{\mdpStateSeq}_t^i$ in real time, agent $i$ uses an online mechanism $M_{\mdpStateSeq_t^i}$ to generate an individual private state $\privateMdpState_t^i$ at each time step $t.$ 


At every time step $t$, each agent needs to communicate its private state with the agents corresponding to its predecessors in the acyclic graph \(\directedGraph'\), to allow them to execute their local policies.
However, the differential privacy guarantee of Definition~\ref{dfn:word_dp} holds over the entire $T$-length state trajectory. This means that even though agents are communicating at each time step, we provide privacy to their entire $T$-length trajectories. We now define the online privacy mechanism.


\begin{definition}[Online Mechanism~\citep{chen2022differential}]
\label{dfn:priv_mech}
Fix a probability space $(\Omega,\mathcal{F},\mathbb{P})$ and an MDP $\mdp^i=(\mdpStateSpace^i,\mdpInitialState^i,\mdpActionSpace^i,\mdpTransition^i)$. Given a state trajectory $\mdpStateSeq_t^i=\mdpState_1^i \mdpState_2^i \dots \mdpState_t^i\in (\mdpStateSpace^i)^t,$ with an initial state $\mdpInitialState^i$, define the online mechanism $M_{\mdpStateSeq_t^i}$ that generates a private trajectory $\Tilde{\mdpStateSeq}^i_t=\privateMdpState_1^i \privateMdpState_2^i \dots \privateMdpState_t^i\in (\mdpStateSpace^i)^t$ such that $\privateMdpState_t^i$ is sampled from the distribution $\mathbb{P}[\privateMdpState_t^i]=\probabilityMeasure^i_\privacyLevel(\privateMdpState_t^i|\mdpState_t^i,\privateMdpState_{t-1}^i)$ where $\probabilityMeasure^i_\privacyLevel$ is computed by Algorithm~\ref{alg:privacy_construction}.
\end{definition}
% 
\begin{algorithm}
\caption{Online Mechanism Construction}
\label{alg:privacy_construction}
\KwIn{Probability of true transition $\trueStateProb_\privacyLevel$}
\KwOut{$\probabilityMeasure^i_{\privacyLevel}$}
\For{$(\mdpState^{i}_t, \privateMdpState^{i}_{t-1}, \privateMdpState^{i}_t)\in \mdpStateSpace^i\times \mdpStateSpace^i \times \mdpStateSpace^i$}{
            \uIf{$\mdpState^{i}_t=\privateMdpState^{i}_t$\ and $\beta(\privateMdpState^{i}_t,\privateMdpState^{i}_{t-1})=1$}{
                $\probabilityMeasure^i_{\privacyLevel}(\privateMdpState^{i}_t\ |\ \mdpState^{i}_t,\privateMdpState^{i}_{t-1})=\trueStateProb_\privacyLevel(\privateMdpState^{i}_{t-1}).$
            }
            \uElseIf{$\mdpState^{i}_t\neq \privateMdpState^{i}_t$\ and $\beta(\privateMdpState^{i}_t,\privateMdpState^{i}_{t-1})=1$}{
                $\probabilityMeasure^i_{\privacyLevel}(\privateMdpState^{i}_t\ |\ \mdpState^{i}_t,\privateMdpState^{i}_{t-1})=\frac{1-\trueStateProb_\privacyLevel(\privateMdpState^{i}_{t-1})\beta(\mdpState^{i}_t,\privateMdpState^{i}_{t-1})}{\rho(\privateMdpState^{i}_{t-1})-\beta(\mdpState^{i}_t,\privateMdpState^{i}_{t-1})}.$
            }
            \Else{
            $\probabilityMeasure^i_{\privacyLevel}(\privateMdpState^{i}_t\ |\ \mdpState^{i}_t,\privateMdpState^{i}_{t-1})=0.$}
}
\end{algorithm}
% 
In Algorithm~\ref{alg:privacy_construction}, the feasibility indicator function $\beta$ is defined for all $\mdpState^i,\mdpStateAlt^i\in\mdpStateSpace^i$ as
\begin{equation*}
    \beta(\mdpState^i,\mdpStateAlt^i)=\begin{cases}
    1, & \text{if } \exists\, \mdpAction^i\in\mdpActionSpace^i \text{ s.t. } \mdpTransition^i(\mdpStateAlt^i,\mdpAction^i,\mdpState^i)>0,\\
    0, & \text{otherwise},
    \end{cases}
\end{equation*}
and the out-degree $\rho$ is defined for each state $\mdpState^i\in\mdpStateSpace^i$ as
$\rho(\mdpState^i) = \lvert\{\mdpStateAlt^{i}\in\mdpStateSpace^i \mid \exists\, \mdpAction^i\in\mdpActionSpace^i \text{ s.t. } \mdpTransition^i(\mdpState^i,\mdpAction^i,\mdpStateAlt^{i})>0\} \rvert.$


Definition~\ref{dfn:priv_mech} and Algorithm~\ref{alg:privacy_construction} define a privacy mechanism in the form of a conditional probability distribution $\probabilityMeasure^i_{\privacyLevel}.$ To implement the mechanism, agent $i$ samples a private output $\privateMdpState^i_t$ from the probability distribution $\probabilityMeasure^i_{\privacyLevel}(\cdot|\ \mdpState_t^i,\privateMdpState_{t-1}^i)$ at each time step $t.$ The mechanism is constructed such that the probability $\probabilityMeasure^i_{\privacyLevel}(\privateMdpState_{t}^i\ |\ \mdpState_t^i,\privateMdpState_{t-1}^i)$ is positive if $\privateMdpState^i_t$ is feasible from the most recent private state $\privateMdpState^i_{t-1},$ and $0$ otherwise. This prevents the mechanism from outputting private trajectories that are not feasible with respect to the dynamics of $\mdp^i.$ When the true, sensitive state $\mdpState_t^i$ is feasible from the previous private output $\privateMdpState^i_{t-1},$ the mechanism outputs $\mdpState_t^i$ with probability $\trueStateProb_\privacyLevel(\privateMdpState^i_{t-1})$ and outputs each of the other feasible states with equal probability, such that these probabilities sum to $1-\trueStateProb_\privacyLevel(\privateMdpState^i_{t-1})$. We refer to the event of outputting the sensitive state $\mdpState_t^i$ at time $t$ as a ``true transition'' and to $\trueStateProb_\privacyLevel(\privateMdpState^i_{t-1})$ as the ``probability of true transition''. In~\S\ref{sec:analytical_results}, we establish a requirement for this mechanism to achieve word local differential privacy.


\subsection{Private Policy Execution}\label{subsec:policy_implementation}
In this section, we solve Problem~\ref{prb:execution} and define an algorithm for the decentralized execution of local policies \(\lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}\) under private communications (Algorithm \ref{alg:policy_exec}).

Recall that we assume the agents use local policies with an acyclic dependency structure defined by the directed graph \(\directedGraph' = ([\numAgents], \graphEdges')\).
Let \(Pred(i)\) and \(Succ(i)\) denote the set of predecessors and successors of node \(i\) in \(G'\), respectively. 
Let $\Tilde{\gameState}^{Succ}_{t,i}=(\Tilde{\mdpState}^{j}_{t})$ for all $j\in Succ(i)$ be the tuple containing the private states of agent $i$'s successors.
Note that formally, \(\localPolicy^{i}\) is defined on the joint state space of the entire team. However, Assumption \ref{assumption:acyclicdependency} ensures that the action distribution defined by \(\localPolicy^{i}\) only depends on \((\Tilde{\gameState}^{Succ}_{t,i}, \mdpState_{t}^{i})\), the local states of some subset of the agents. 
To more readily highlight the information dependencies in the problem, it is with a slight abuse of notation that we use \((\Tilde{\gameState}^{Succ}_{t,i}, \mdpState_{t}^{i})\) to denote the inputs provided to each local policy \(\localPolicy^{i}\) in Algorithm \ref{alg:policy_exec}.

\begin{algorithm}
\caption{Privatized Policy Execution}
\label{alg:policy_exec}
\SetKwBlock{DoParallel}{Every agent \(i\) does in parallel}{end}
\SetKwBlock{ForDoParallel}{for \(t=0,1,\ldots\) every agent \(i\) does in parallel}{end}
\textbf{Input for every agent \(i\):} Local policy \(\localPolicy^{i}\)

Set \(\privateMdpState^{i}_{0} = \mdpInitialState^{i}\) for all \(i\in [\numAgents]\).

\ForDoParallel{
    Set \(\hat{\gameState}_{t,i} = (\Tilde{\gameState}^{Succ}_{t,i}, \mdpState^{i}_{t})\).
       
    Sample an action $\mdpAction_{t}^{i} \sim \localPolicy^{i}(\hat{\gameState}_{t,i}).$
    
    Execute $\mdpAction_{t}^i$ and transition to $\mdpState_{t+1}^i\sim\mdpTransition^i(\mdpState_{t}^i,\mdpAction_{t}^i).$

    Share $\privateMdpState_{t+1}^{i}\sim\probabilityMeasure^i_{\privacyLevel}(\cdot|\mdpState_{t+1}^i,\privateMdpState_{t}^i)$ with agents in $Pred(i)$. 
}

\end{algorithm}

During private policy execution, the agents communicate potentially false information.
To sample actions from their local policies, each agent must therefore maintain an estimate of the states of its successors in \(\directedGraph'\).
We assume that each agent constructs this estimate using the privatized information it receives from its teammates.
In detail, agent \(i\) knows its own local state \(\mdpState^{i}_{t}\) and the private state of its successors, \(\privateMdpState^{j}_{t}\)  for \(j\in Succ(i)\) at time \(t\). 
Agent $i$'s estimate of the relevant teammate states is thus given by \(\hat{\gameState}_{t,i} = (\Tilde{\gameState}^{Succ}_{t,i}, \mdpState^{i}_{t}).\) 
Then, agent \(i\) samples an action \(\mdpAction_{t}^{i}\) for itself from \(\localPolicy^{i}\) using the estimate \(\hat{\gameState}_{t,i}\). 
We note that the agents do not communicate during the action selection phase since the local policies are independent given the joint state. 
After choosing an action \(\mdpAction_{t}^{i}\), agent \(i\) executes this action and transitions to a next state \(\mdpState_{t+1}^i\). 
In the next time step \(t+1\), agent \(i\) samples a private state \(\privateMdpState^{i}_{t+1}\) using~$\probabilityMeasure^i_{\privacyLevel}$ and shares this private state with the agents corresponding to its predecessors in \(\directedGraph'\).

\section{Privacy and Performance Tradeoffs}
\label{sec:analytical_results}
In this section, we address Problem \ref{prb:bound} and analyze the tradeoff between performance and privacy when executing a collection of local policies with private communications. 

In the single-agent setting, \citet{chen2022differential} showed word differential privacy of the agent's state trajectories generated by the online mechanism. 
We extend this result to the multiagent setting using Assumption \ref{assumption:acyclicdependency},
which ensures that at time \(t\) the future true state trajectory \(\mdpState^{i}_{t+1}\ldots\) of agent \(i\) is statistically independent from its past private state trajectory \(\Tilde{\mdpStateSeq}_t^i=\privateMdpState_1^i \privateMdpState_2^i \dots \privateMdpState_t^i \) given $\mdpStateSeq_t^i=\mdpState_1^i \mdpState_2^i \dots \mdpState_t^i$. 
We discuss the necessity of this independence in the proof of Theorem \ref{lem:privacy_req}, which is provided in the supplementary material.

\begin{theorem}
\label{lem:privacy_req}
Fix a length $T\in\mathbb{N}^+$, an adjacency parameter $\adjParam\in\mathbb{N}^+,$ and a privacy parameter $\privacyLevel\geq0.$ Under Assumption \ref{assumption:acyclicdependency}, the online mechanism (Definition~\ref{dfn:priv_mech}) is $\privacyLevel$-word locally differentially private (Definition~\ref{dfn:word_dp}) with respect to the Adjacency relation $\adj_{T,\adjParam}^{i}$ in Definition~\ref{dfn:adjacency} if $\trueStateProb_\privacyLevel(\privateMdpState^i_{t-1})$ satisfies
\begin{equation}
    \trueStateProb_\privacyLevel(\privateMdpState^i_{t-1})=\frac{1}{(\rho(\privateMdpState^i_{t-1})-1)e^{-\nicefrac{\privacyLevel}{\adjParam}}+1}.
\end{equation}
\end{theorem}

As privacy strengthens (i.e., as $\privacyLevel$ decreases to $0$), $\trueStateProb_\privacyLevel(\privateMdpState^i_{t-1})$ approaches $1/\rho(\privateMdpState^i_{t-1})$, which implies the privacy mechanism will sample the private state \(\privateMdpState^{i}_{t}\) more uniformly from the set of feasible next states of $\privateMdpState^i_{t-1}$. 
Conversely, as privacy weakens (i.e., as $\privacyLevel$ increases), $\trueStateProb_\privacyLevel(\privateMdpState^i_{t-1})$ increases as well, indicating a larger probability of revealing the true state.

Having established the differential privacy guarantees of Algorithm \ref{alg:policy_exec}, we now focus on performance guarantees. In order to succeed under private communications, the agents' local policies should be as indifferent as possible to the other agents' states. In other words, agents' behaviors should be made nearly independent from each other. 


The collection of local policies induces a joint policy \(\jointPolicy = \lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}\). To measure the dependencies between the agents, we use a quantity called the ``total correlation'' of the joint policy~\citep{Karabag2022}. Let $\gameStateRandomVar_t$ be a random variable denoting the joint state of the agents at time $t$ under the joint policy \(\jointPolicy\) with no privatization, $\gameActionRandomVar_t$ be a random variable denoting the joint action of the agents at time $t$, $\mdpStateRandomVar_t^i$ be a random variable denoting the state of agent $i$ at time $t$, and $\mdpActionRandomVar_t^i$ be a random variable denoting the action of agent $i$ at time $t$.
The total correlation $\totalCorrelation_{\jointPolicy}$ of a joint policy $\jointPolicy = \lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}$ is
\begin{equation}
\totalCorrelation_{\jointPolicy} = \sum_{i=1}^{\numAgents} \entropy(\mdpStateRandomVar_0^i\mdpActionRandomVar_0^i\ldots\mdpStateRandomVar_\randomReachTime^i)-\entropy(\gameStateRandomVar_0\gameActionRandomVar_0\ldots\gameStateRandomVar_\randomReachTime)\label{eq:proof_tc_def}
\end{equation}
where \(H(Y) \coloneqq -\sum_{y \in \mathcal{Y}} \operatorname{Pr}(Y=y) \log (\operatorname{Pr}(Y=y))\) is the entropy of a discrete random variable $Y$ with support $\mathcal{Y}$, and \(\randomReachTime\) denotes the random hitting time to \(\targetSet \cup \deadSetPrime\), i.e., the effective end of the trajectory in terms of the reach-avoid specification~\citep{Karabag2022}. 

The following result relates the success probability under private communications to the success probability under truthful communications (i.e., without privacy). 
The proof of the theorem is included in the supplementary material.


\begin{theorem}\label{thm:performance_bound}
Fix a privacy parameter $\epsilon>0$ and adjacency parameter $\adjParam.$
Given $\numAgents$ agents implementing local policies $\jointPolicy = \lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}$ with private communications according to Algorithm~\ref{alg:policy_exec}, let $\gameValue^{\private}$ be the success probability under private communications and let $\gameValue^{\oblivious}$ be the success probability under truthful communications (no privacy). Then,
\begin{align}
\label{eq:thm_1_eq}
\gameValue^{\private}& \geq \gameValue^{\oblivious}
    -\sqrt{1 - e^{-\totalCorrelation_{\jointPolicy}} \left((\outdegree_{m}-1)e^{-{\epsilon}/{\adjParam}}+1\right)^{-\numAgents \expectedLength^{\oblivious}}},
\end{align}
where $\totalCorrelation_{\jointPolicy}$ is defined in~\eqref{eq:proof_tc_def}, $\outdegree_{m}=\max_{i \in [\numAgents], \mdpState^{i}\in \mdpStateSpace^{i}}\outdegree(\mdpState^{i})$ is the max out-degree, \(\expectedLength^{\oblivious} = \expectation_{\gamePath \sim \gamePathDist^{\oblivious}}[\len(\gamePath)]\) is the expected joint trajectory length when $\jointPolicy$ is executed with no privacy, and $\gamePathDist^{\oblivious}$ is the probability distribution over joint trajectories induced by the joint policy executed with no privacy.
\end{theorem}
The term \(\left((\outdegree_{m}-1)e^{-{\epsilon}/{\adjParam}}+1\right)^{-\numAgents \expectedLength^{\oblivious}}\) in Theorem~\ref{thm:performance_bound} represents the probability of the event that the private state trajectories are the same as the true state trajectories. 
The term \(e^{-\totalCorrelation_{\jointPolicy}}\) in Theorem \ref{thm:performance_bound} is a proxy to account for the event that the private state trajectories are different from the true state trajectories. In this event, the agents can still succeed if the local policies are independent of the other agents' states. A lower total correlation implies lower dependencies between the agents, and that the agents are more likely to succeed. We note that the equality holds in~\eqref{eq:thm_1_eq} when agents communicate truthfully, i.e., $\privacyLevel=\infty$, and each agent acts totally independently from other agents, i.e., $\totalCorrelation_{\jointPolicy} =0$. 

We remark that given the privacy mechanism and the acyclic dependency structure described in Assumption \ref{assumption:acyclicdependency}, one could formulate the policy synthesis as a decision-making problem in a partially observable MDP (POMDP). 
Such a POMDP-based formulation would yield optimal policies in terms of team performance under private communications. 
However, the synthesis procedure for POMDPs is computationally challenging because the optimal policies are history-dependent. 
For this reason, in the next section, we consider the class of stationary joint policies and avoid the computational challenges that arise when keeping track of all the potential histories. 
We instead use a ``soft decentralization'' metric to synthesize policies that make the agents insensitive to inaccuracies in the communicated information.


\section{Policy Synthesis}
\label{sec:policy_synthesis}
In this section, we present an algorithm for the synthesis of a collection of local policies $\jointPolicy = \lbrace \localPolicy^{i} \rbrace_{i=1}^{\numAgents}$ that remains performant, even under private communications.

We aim to maximize the reach-avoid probability under private communications by maximizing the lower bound on $\gameValue^{\private}$ given in Theorem~\ref{thm:performance_bound}. Since the bound is complex in nature and it is a monotone function of its variables, we instead aim to solve the following optimization problem involving constants \(\delta > 0\) and \(\beta > 0\): 
\begin{equation}
    \sup_{\jointPolicy} \gameValue^{\oblivious} - \delta\expectedLength^{\oblivious}-\beta\totalCorrelation_{\jointPolicy}.\label{eq:proxy_optimization_problem}
\end{equation}
In order to solve this optimization problem, we follow the methodology presented in \citet{Karabag2022}.
Using the stationarity of \(\jointPolicy\), the terms \(\gameValue^{\oblivious}\) and \(\expectedLength^{\oblivious}\) can be represented with linear functions of the occupancy measure variables \(\occupancyVar_{\gameState, \gameAction}\) of the joint state-action space.
The term \(-\entropy(\gameStateRandomVar_0\gameActionRandomVar_0\ldots\gameStateRandomVar_\randomReachTime)\) in \(\totalCorrelation_{\jointPolicy}\) can be represented with a convex function of the occupancy measure variables. 
However, the individual entropy terms \(\entropy(\mdpStateRandomVar_0^i\mdpActionRandomVar_0^i\ldots\mdpStateRandomVar_\randomReachTime^i)\) in \(\totalCorrelation_{\jointPolicy}\), which correspond to the entropies of hidden Markov models, do not have closed-form expressions.
As a proxy, we replace each of these terms with an upper bound, which is a concave function of the occupancy measure variables.
The set of stationary joint policies can be represented with affine equality constraints on the occupancy measure variables.
The objective function of the resulting optimization problem only contains convex and concave functions of the occupancy measure variables, and the constraints are affine.
We thus use the convex-concave procedure~\citep{Lanckriet2009convergence,Yuille2001procedure} to solve for a local optimum.
We refer interested readers to \citet{Karabag2022} for more details on the formulation of this optimization problem.

After solving for the optimal values $\occupancyVar_{\gameState,\gameAction}^*$ of the occupancy measure variables, we compute the local policies. Recall that in Assumption~\ref{assumption:acyclicdependency}, we assumed the policy dependencies between the agents are acyclic. In order to compute local policies \(\localPolicy^{i}(\gameState, \mdpAction^{i})\) that satisfy Assumption~\ref{assumption:acyclicdependency}, we marginalize the joint occupancy measure using a desired dependency graph \(G' = ([\numAgents], E')\). 
Formally, for every \(\gameState=(\mdpState^1,\dots, \mdpState^\numAgents) \in \gameStateSpace,\)
\[\localPolicy^i(\gameState, \mdpAction^{i}) = \frac{\sum_{\gameStateAlt \in \bm{\mathcal{Y}}^{i}_{\gameState}}\sum_{\gameAction^{-i} \in \gameActionSpace^{-i}} \ \occupancyVar^{*}_{\gameStateAlt, (\mdpAction^{i}, \gameAction^{-i})}}{\sum_{\gameStateAlt \in \bm{\mathcal{Y}}^{i}_{\gameState}} \sum_{\gameAction \in \gameActionSpace} \ \occupancyVar^{*}_{\gameStateAlt, \gameAction}}\] 
where \(\bm{\mathcal{Y}}^{i}_{\gameState} = \lbrace (\mdpStateAlt^1,\dots,\mdpStateAlt^\numAgents) \in \gameStateSpace \ | \ \forall (i,j) \in E,  \mdpStateAlt^{j} = \mdpState^{j}\rbrace\).
We note that, instead of postprocessing the joint occupancy variables, one could alternatively enforce this assumption by including additional bilinear equality constraints in the policy synthesis optimization problem.

\section{Numerical Experiments}
\label{sec:expirements}
Numerical experiments demonstrate the robustness to private communication enjoyed by the policies synthesized using the procedure described in \S \ref{sec:policy_synthesis}.
In each experiment, we solve \eqref{eq:proxy_optimization_problem} to synthesize minimum-dependency local policies \(\{\localPolicy_{MD}^{i}\}_{i=1}^{\numAgents}\) for the agents in the team.
We use \(\jointPolicy_{MD}\) to denote the joint policy that results from the concurrent execution of these local policies, described in \S \ref{subsec:policy_implementation}.

We compare the performance of \(\jointPolicy_{MD}\) to that of a collection of baseline local policies \(\{\localPolicy_{base}^{i}\}_{i=1}^{\numAgents}\), 
which are synthesized by optimizing the team's performance under truthful communications without taking the total correlation value into account.
That is, the baseline policies are constructed by solving \eqref{eq:proxy_optimization_problem} with \(\delta\) and \(\beta\) set to zero, and subsequently marginalizing the policies to satisfy Assumption \ref{assumption:acyclicdependency} (as described at the end of \S \ref{sec:policy_synthesis}).
We use \(\jointPolicy_{base}\) to refer to the joint policy resulting from the concurrent execution of \(\{\localPolicy_{base}^{i}\}_{i=1}^{\numAgents}\).
Code to reproduce all experiments and analysis is available at \href{https://github.com/cyrusneary/differential_privacy_in_mas}{https://github.com/cyrusneary/differential\_privacy\_in\_mas}.

\subsection{Two-Agent Navigation Example}
\label{sec:experiments_navigation}

We begin by considering the multiagent navigation example introduced by \citet{Karabag2022}.
Two agents operate in a common environment, which consists of two large open areas connected by two separate corridors.
Each of the two agents begins in one of the large open areas and they must use the corridors to navigate past each other without colliding, in order to reach their target locations.
In addition to the risk of collisions, the environment is constructed so that one of the corridors poses a small level of risk: if an agent uses that corridor, there is a chance they could transition to a dead state and never reach their target.
In such an environment, jointly navigating the corridors without colliding necessitates coordination between the agents.

The environment is implemented as a grid of cells, each of which corresponds to an individual local state.
At any given timestep, each agent can choose to move in any direction or remain in place.
Each agent slips with a small probability when it takes an action, resulting in the agent moving to one of its neighboring states instead of its intended target state.

\begin{figure}[t!]
    \centering
    % This file was created with tikzplotlib v0.9.12.
    \begin{tikzpicture}
    
    \begin{axis}[
    width=0.95*\columnwidth, 
    height=4.0cm,
    legend cell align={left},
    legend columns = 1,
    legend style={
      fill opacity=0.8,
      draw opacity=1,
      text opacity=1,
      %   at={(2.5cm, 1.4cm)},
      at={(3.0cm, 0.2cm)},
      anchor=south west,
      draw=white!80!black
    },
    tick align=inside,
    tick pos=left,
    x grid style={white!69.0196078431373!black},
    xlabel={\footnotesize Number of Convex-Concave Iterations},
    xmajorgrids,
    xmin=0.0, xmax=80.0,
    xtick style={color=black},
    y grid style={white!69.0196078431373!black},
    ylabel={\footnotesize Success Prob.},
    ymajorgrids,
    ymin=0.0, ymax=1.0562822829432,
    ytick style={color=black},
    ytick={0.0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
    yticklabels={0.0, 0.2, ,0.4, ,0.6, , 0.8, , 1.0},
    ]
    
    % This file was created with tikzplotlib v0.10.1.
    % \begin{tikzpicture}
    \definecolor{darkgray176}{RGB}{176,176,176}
    
    \addplot [ultra thick, minDependencyPolicy, dashed]
    table {%
    0 0.23
    1 0.224
    2 0.281
    3 0.328
    4 0.434
    5 0.508
    6 0.554
    7 0.675
    8 0.7
    9 0.692
    10 0.733
    11 0.755
    12 0.766
    13 0.819
    14 0.875
    15 0.926
    16 0.941
    17 0.946
    18 0.955
    19 0.956
    20 0.948
    21 0.956
    22 0.956
    23 0.953
    24 0.94
    25 0.957
    26 0.961
    27 0.951
    28 0.962
    29 0.966
    30 0.958
    31 0.958
    32 0.96
    33 0.953
    34 0.946
    35 0.959
    36 0.96
    37 0.952
    38 0.96
    39 0.971
    40 0.968
    41 0.947
    42 0.967
    43 0.959
    44 0.961
    45 0.959
    46 0.956
    47 0.946
    48 0.964
    49 0.971
    50 0.96
    51 0.962
    52 0.966
    53 0.962
    54 0.963
    55 0.963
    56 0.959
    57 0.963
    58 0.953
    59 0.959
    60 0.967
    61 0.966
    62 0.968
    63 0.968
    64 0.968
    65 0.964
    66 0.962
    67 0.965
    68 0.974
    69 0.965
    70 0.969
    71 0.971
    72 0.959
    73 0.954
    74 0.97
    75 0.952
    76 0.959
    77 0.965
    78 0.968
    79 0.975
    }; 
    \addplot [ultra thick, baselinePolicy, dashed]
    table {%
    0 0.981
    1 0.981
    2 0.981
    3 0.981
    4 0.981
    5 0.981
    6 0.981
    7 0.981
    8 0.981
    9 0.981
    10 0.981
    11 0.981
    12 0.981
    13 0.981
    14 0.981
    15 0.981
    16 0.981
    17 0.981
    18 0.981
    19 0.981
    20 0.981
    21 0.981
    22 0.981
    23 0.981
    24 0.981
    25 0.981
    26 0.981
    27 0.981
    28 0.981
    29 0.981
    30 0.981
    31 0.981
    32 0.981
    33 0.981
    34 0.981
    35 0.981
    36 0.981
    37 0.981
    38 0.981
    39 0.981
    40 0.981
    41 0.981
    42 0.981
    43 0.981
    44 0.981
    45 0.981
    46 0.981
    47 0.981
    48 0.981
    49 0.981
    50 0.981
    51 0.981
    52 0.981
    53 0.981
    54 0.981
    55 0.981
    56 0.981
    57 0.981
    58 0.981
    59 0.981
    60 0.981
    61 0.981
    62 0.981
    63 0.981
    64 0.981
    65 0.981
    66 0.981
    67 0.981
    68 0.981
    69 0.981
    70 0.981
    71 0.981
    72 0.981
    73 0.981
    74 0.981
    75 0.981
    76 0.981
    77 0.981
    78 0.981
    79 0.981
    }; 
    \addplot [ultra thick, minDependencyPolicy, mark=*, mark size=1, mark options={solid}]
    table {%
    0 0.151
    1 0.171
    2 0.226
    3 0.278
    4 0.345
    5 0.41
    6 0.462
    7 0.497
    8 0.549
    9 0.55
    10 0.58
    11 0.584
    12 0.644
    13 0.715
    14 0.809
    15 0.858
    16 0.865
    17 0.897
    18 0.873
    19 0.892
    20 0.88
    21 0.895
    22 0.89
    23 0.9
    24 0.903
    25 0.885
    26 0.915
    27 0.89
    28 0.909
    29 0.927
    30 0.906
    31 0.919
    32 0.914
    33 0.92
    34 0.92
    35 0.906
    36 0.904
    37 0.897
    38 0.902
    39 0.922
    40 0.919
    41 0.917
    42 0.929
    43 0.916
    44 0.94
    45 0.934
    46 0.925
    47 0.928
    48 0.936
    49 0.936
    50 0.931
    51 0.94
    52 0.931
    53 0.926
    54 0.926
    55 0.931
    56 0.941
    57 0.932
    58 0.935
    59 0.94
    60 0.932
    61 0.934
    62 0.933
    63 0.935
    64 0.943
    65 0.946
    66 0.935
    67 0.945
    68 0.933
    69 0.929
    70 0.936
    71 0.95
    72 0.944
    73 0.936
    74 0.945
    75 0.935
    76 0.93
    77 0.946
    78 0.949
    79 0.942
    };
    \addplot [ultra thick, baselinePolicy]
    table {%
    0 0.097
    1 0.097
    2 0.097
    3 0.097
    4 0.097
    5 0.097
    6 0.097
    7 0.097
    8 0.097
    9 0.097
    10 0.097
    11 0.097
    12 0.097
    13 0.097
    14 0.097
    15 0.097
    16 0.097
    17 0.097
    18 0.097
    19 0.097
    20 0.097
    21 0.097
    22 0.097
    23 0.097
    24 0.097
    25 0.097
    26 0.097
    27 0.097
    28 0.097
    29 0.097
    30 0.097
    31 0.097
    32 0.097
    33 0.097
    34 0.097
    35 0.097
    36 0.097
    37 0.097
    38 0.097
    39 0.097
    40 0.097
    41 0.097
    42 0.097
    43 0.097
    44 0.097
    45 0.097
    46 0.097
    47 0.097
    48 0.097
    49 0.097
    50 0.097
    51 0.097
    52 0.097
    53 0.097
    54 0.097
    55 0.097
    56 0.097
    57 0.097
    58 0.097
    59 0.097
    60 0.097
    61 0.097
    62 0.097
    63 0.097
    64 0.097
    65 0.097
    66 0.097
    67 0.097
    68 0.097
    69 0.097
    70 0.097
    71 0.097
    72 0.097
    73 0.097
    74 0.097
    75 0.097
    76 0.097
    77 0.097
    78 0.097
    79 0.097
    };
    
    % \end{tikzpicture}
    
    
    \end{axis}
    
    \begin{customlegend}[
        legend columns=1, 
        legend cell align={left},
        legend style={
            align=left, 
            column sep=0.5ex, 
            font=\footnotesize, 
            draw=white!50!black,
            fill=white,
            fill opacity=0.8,
            rounded corners=1mm,
            at={(61.0mm, 19.0mm)},
            row sep=-0.08cm,
        }, 
        legend entries={\(\jointPolicy_{MD}\) (ours) Privacy \(\privacyLevel = 1.0\), \(\jointPolicy_{base}\) Privacy \(\privacyLevel = 1.0\), \(\jointPolicy_{MD}\) (ours) w/o Privacy, \(\jointPolicy_{base}\) w/o Privacy,}
    ]
    \addlegendimage{ultra thick, minDependencyPolicy, mark=*, mark size=1, mark options={solid}}
    \addlegendimage{ultra thick, baselinePolicy}
    \addlegendimage{ultra thick, minDependencyPolicy, dashed}
    \addlegendimage{ultra thick, baselinePolicy, dashed}
    \end{customlegend}
    
    \end{tikzpicture}

    \caption{
    Probability of task success as a function of the number of iterations of the policy synthesis procedure for the two-agent navigation experiment.
    }
    \label{fig:multiagent_navigation_results_during_policy_synthesis}
\end{figure}

\begin{figure}[t!]
    \centering
    % This file was created with tikzplotlib v0.10.1.
    \begin{tikzpicture}
    \definecolor{darkgray176}{RGB}{176,176,176}
    
    % Axis for the success-probability vs. total-correlation curve.
    % The x-range is the tight bounding range exported by tikzplotlib for the
    % total-correlation values in the table below.
    \begin{axis}[
    width=0.95*\columnwidth, 
    height=3.2cm,
    tick align=inside,
    tick pos=left,
    x grid style={darkgray176},
    xmajorgrids,
    xmin=2.3184187339362, xmax=5.28759802936976,
    xtick style={color=black},
    y grid style={darkgray176},
    ymajorgrids,
    ymin=0.0, ymax=1.0,
    ytick style={color=black},
    % Ticks every 0.1, but only every other tick carries a label.
    ytick={0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
    yticklabels={0.0, , 0.2, ,0.4, ,0.6, , 0.8, , 1.0},
    ylabel style={align=center},
    ylabel={\footnotesize Success Prob.},
    % Use the same symbol for the minimum-dependency policy as the text and the
    % other figures' legends (\jointPolicy_{MD}).
    xlabel={\footnotesize Total Correlation Value of the Synthesized Policy \(\jointPolicy_{MD}\)},
    ]
    
    \addplot [ultra thick, minDependencyPolicy, mark=*, mark size=2, mark options={solid}]
    table {%
    5.15263533412278 0.151
    4.68315799524924 0.171
    4.39170326856943 0.226
    4.21395294917153 0.278
    4.11560418830195 0.345
    4.0664990574179 0.41
    4.0382724880815 0.462
    4.01201964692952 0.497
    3.98033289098386 0.549
    3.94126881661411 0.55
    3.89059492078465 0.58
    3.81548532419666 0.584
    3.69152974862599 0.644
    3.49555978179105 0.715
    3.25220033928505 0.809
    3.04760075299328 0.858
    2.9266338974042 0.865
    2.86263007993679 0.897
    2.82518849700872 0.873
    2.79879143253659 0.892
    2.7773051196462 0.88
    2.75854818176313 0.895
    2.74171583915934 0.89
    2.72649026216253 0.9
    2.7125888366637 0.903
    2.69977822678362 0.885
    2.6879019550388 0.915
    2.67688768770299 0.89
    2.6666635470067 0.909
    2.6571089641803 0.927
    2.6480963282199 0.906
    2.63950801507142 0.919
    2.63123083634076 0.914
    2.62315268700401 0.92
    2.61516112642168 0.92
    2.60714490897642 0.906
    2.59899801058677 0.904
    2.59062555345212 0.897
    2.58195419778296 0.902
    2.57294340178707 0.922
    2.56360160687682 0.919
    2.55399969661561 0.917
    2.54427584498694 0.929
    2.53463171849739 0.916
    2.5253067388059 0.94
    2.5165405771671 0.934
    2.50853021230751 0.925
    2.50139860047124 0.928
    2.49518314060074 0.936
    2.48984689064893 0.936
    2.48530124554364 0.931
    2.48142608181387 0.94
    2.47808862349647 0.931
    2.47519861043089 0.926
    2.47274751258593 0.926
    2.47066588715846 0.931
    2.46886363478232 0.941
    2.46728312502176 0.932
    2.46588533392269 0.935
    2.46464129099376 0.94
    2.46352823148981 0.932
    2.4625290607641 0.934
    2.46162816549776 0.933
    2.4608139556685 0.935
    2.46007532058983 0.943
    2.4594025213932 0.946
    2.45878810585337 0.935
    2.45822372805964 0.945
    2.45770269694576 0.933
    2.45721876173375 0.929
    2.45676623525647 0.936
    2.45634058879823 0.95
    2.45593538427419 0.944
    2.45554848171814 0.936
    2.45517465206821 0.945
    2.45481029851174 0.935
    2.45445227695792 0.93
    2.45409684651753 0.946
    2.45374081260005 0.949
    2.45338142918318 0.942
    };
    \end{axis}
    
    \node at (4.5cm, 1.35cm) [font=\footnotesize, align=left] {Privacy parameter \(\privacyLevel = 1.0\)};
    
    \end{tikzpicture}
    
    \caption{
    Probability of team success under private communications as a function of the total correlation of the synthesized policies.
    }
    \label{fig:success_prob_vs_corr_two_agent_navigation}
\end{figure}

\begin{figure}[t!]
    \centering
    % Success probability of both policies as the privacy parameter is swept.
    % Originally exported with tikzplotlib v0.10.1.
    \begin{tikzpicture}

        % Color of the grid lines.
        \definecolor{darkgray176}{RGB}{176,176,176}

        \begin{axis}[
            % Plot geometry.
            width=0.95*\columnwidth,
            height=3.2cm,
            % Tick placement.
            tick align=inside,
            tick pos=left,
            % Horizontal axis: privacy parameter.
            x grid style={darkgray176},
            xmajorgrids,
            xmin=-0.0,
            xmax=10.0,
            xtick style={color=black},
            % Vertical axis: success probability. Ticks every 0.1, with only
            % every other tick labeled.
            y grid style={darkgray176},
            ymajorgrids,
            ymin=0.0,
            ymax=1.0,
            ytick style={color=black},
            ytick={0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
            yticklabels={0.0, , 0.2, ,0.4, ,0.6, , 0.8, , 1.0},
            ylabel={\footnotesize Success Prob.},
            xlabel={\footnotesize Privacy Parameter \(\privacyLevel\)},
        ]

            % Curve drawn in the minimum-dependency policy color.
            % Columns: privacy parameter, success probability.
            \addplot [ultra thick, minDependencyPolicy, mark=diamond*, mark size=3, mark options={solid}]
            table {%
                0.01 0.94
                1.009 0.929
                2.008 0.951
                3.007 0.94
                4.006 0.94
                5.005 0.941
                6.004 0.953
                7.003 0.949
                8.002 0.946
                9.001 0.943
                10 0.946
            };

            % Curve drawn in the baseline policy color (same columns as above).
            \addplot [ultra thick, baselinePolicy, mark=diamond*, mark size=3, mark options={solid}]
            table {%
                0.01 0.082
                1.009 0.087
                2.008 0.102
                3.007 0.126
                4.006 0.155
                5.005 0.236
                6.004 0.319
                7.003 0.413
                8.002 0.52
                9.001 0.63
                10 0.731
            };

        \end{axis}

        % Hand-placed legend, drawn outside the axis environment; the
        % \addlegendimage styles mirror the two \addplot styles above.
        \begin{customlegend}[
            legend columns=1,
            legend style={
                align=left,
                column sep=1ex,
                font=\footnotesize,
                draw=white!50!black,
                fill=white,
                fill opacity=0.8,
                rounded corners=1mm,
                at={(23.0mm, 13.0mm)},
            },
            legend entries={\(\jointPolicy_{MD}\), \(\jointPolicy_{base}\)}
        ]
            \addlegendimage{ultra thick, minDependencyPolicy, mark=diamond*, mark size=3, mark options={solid}}
            \addlegendimage{ultra thick, baselinePolicy, mark=diamond*, mark size=3, mark options={solid}}
        \end{customlegend}

    \end{tikzpicture}

    \caption{Probability of team success under a variety of levels of privacy. Smaller values of the privacy parameter \(\privacyLevel\) correspond to a stronger level of privacy.}
    \label{fig:success_prob_vs_privacy_level_two_agent_navigation}
\end{figure}

While synthesizing \(\jointPolicy_{MD}\), we set \(\expectedLengthCoef = 0.01\) and \(\totalCorrelationCoef = 0.4\) in \eqref{eq:proxy_optimization_problem}.
We fix an adjacency parameter of \(\adjParam = 3\) while constructing the privacy mechanisms used in all experiments.
To define the acyclic dependencies between the agents' local policies, we use the directed graph \(\directedGraph'\) with nodes \(\{1,2\}\) and with a single edge \(\graphEdges' = \{(1,2)\}\)---the first agent's local policy depends on the local state of the second agent, but not vice versa.

\textbf{Minimum-dependency policies achieve a success probability \(84\) percentage points higher than that of the baseline under private communications.}
Figure~\ref{fig:multiagent_navigation_results_during_policy_synthesis} illustrates the probability of success of \(\jointPolicy_{MD}\) throughout the policy synthesis procedure.
We plot the success probability resulting from both private \((\privacyLevel = 1.0)\) and truthful implementations of communication.
For comparison, we also plot the results of \(\jointPolicy_{base}\).
We estimate the plotted probability values by simulating \(1{,}000\) rollouts of the policies at each iteration, and computing the empirical rate at which the team reaches its target set.
While the baseline policy achieves a success probability of \(0.98\) under non-private communication, its success probability drops to \(0.10\) when communications are privatized.
By contrast, even under private communication, \(\jointPolicy_{MD}\) enjoys a probability of success of \(0.94\).

Intuitively, \(\jointPolicy_{MD}\) is more performant under private communications than \(\jointPolicy_{base}\) because it renders the actions of each agent independent from the states of its teammate.
We observe that \(\jointPolicy_{base}\) results in both agents using the same corridor to navigate to their respective goals, which requires the agents to condition their actions on each other's states at every timestep.
By contrast, \(\jointPolicy_{MD}\) results in each agent using a different corridor to navigate to its goals, regardless of the states and actions of its teammate.
This joint behavior is much less likely to result in collisions when communications are privatized.

\textbf{Lower total correlation values result in higher success probabilities under private communications.}
Figure~\ref{fig:success_prob_vs_corr_two_agent_navigation} illustrates the team's success probability and the total correlation of each of the joint policies obtained throughout policy synthesis.
As the total correlation of \(\jointPolicy_{MD}\) decreases during policy synthesis, the policy's performance under private communications significantly increases.
This result provides a strong empirical justification for the use of total correlation as a regularizer during policy synthesis.

\textbf{The performance of \(\jointPolicy_{MD}\) is robust to the level of privacy enforced by the differential privacy mechanism.}
Recall that the parameter \(\privacyLevel\) controls the strength of privacy enforced by the differential privacy mechanism.
Lower values of \(\privacyLevel\) correspond to stronger levels of privacy---the mechanism is more likely to perturb the state trajectories of the agents.
In Figure~\ref{fig:success_prob_vs_privacy_level_two_agent_navigation}, we observe that the performance of \(\jointPolicy_{MD}\) remains consistently high, regardless of the value of \(\privacyLevel\).
By contrast, the performance of \(\jointPolicy_{base}\) is highly sensitive to \(\privacyLevel\); it decreases significantly for moderate to strong levels of privacy.

\subsection{Four-Agent SysAdmin Example}
\label{sec:experiments_sysadmin}

\begin{figure}[t!]
    \centering
    % Layout constants for the diagram. They are defined inside the float, so
    % they stay local to this figure.
    \def\horizontaldistance{1.5cm}
    \def\nodedistance{2.5cm}

    \def\stateOutlineThickness{0.2mm}
    \def\edgeThickness{0.2mm}

    % Defaults for the picture: automatic label placement and an arrow tip at
    % the end of every path (used by the self-loops below).
    \tikzset{auto, ->, >=stealth', node distance=\nodedistance, node/.style={scale=0.8, minimum size=0pt, inner sep=0pt}}
    % Shape of the self-loops.
    \tikzset{every loop/.style={min distance=8mm,in=45,out=135,looseness=10}}

    \begin{tikzpicture}[scale=0.8]

        % Draw the states, left to right: 0 (in repair), 1 (nominal),
        % 2 (in need of repair), 3 (offline). All four use the same font size.
        \node[state, line width=\stateOutlineThickness, font=\footnotesize, minimum size=0.7cm] (s_repair) {\(0\)};

        \node[state, line width=\stateOutlineThickness, right=\horizontaldistance of s_repair, font=\footnotesize, minimum size=0.7cm] (s_nominal) {\(1\)};

        \node[state, line width=\stateOutlineThickness, right=\horizontaldistance of s_nominal, align=center, font=\footnotesize, minimum size=0.7cm] (s_unhealthy) {\(2\)};

        \node[state, line width=\stateOutlineThickness, right=\horizontaldistance of s_unhealthy, align=center, font=\footnotesize, minimum size=0.7cm] (s_offline) {\(3\)};

        % Draw the forward transitions taken under the wait action.
        \path [draw, -latex, line width=\edgeThickness] (s_repair.east) -- node [above, yshift=0.0mm, xshift=0.0mm, align=left, font=\footnotesize] {wait, \(\probRepair\)} (s_nominal.west);

        \path [draw, -latex, line width=\edgeThickness] (s_nominal.east) -- node [above, yshift=0.0mm, xshift=0.0mm, align=left, font=\footnotesize] {wait, \(\probONB\)} (s_unhealthy.west);

        \path [draw, -latex, line width=\edgeThickness] (s_unhealthy.east) -- node [above, yshift=0.0mm, xshift=0.0mm, align=left, font=\footnotesize] {wait, \(\probOffline\)} (s_offline.west);

        % Paths to the repair state. All three use the same arrow tip (-latex)
        % as the forward transitions so the diagram is visually consistent.
        \path [draw, line width=\edgeThickness, -latex] (s_nominal.south) to [bend left, in=150, out=30] node [above, align=left, font=\footnotesize] {repair, \(1\)} (s_repair.south);

        \path [draw, line width=\edgeThickness, -latex] (s_unhealthy.south) to [bend left, in=150, out=25] node [above, align=left, font=\footnotesize, xshift=0.6cm, yshift=0.1cm] {repair, \(1\)} (s_repair.south);

        \path [draw, line width=\edgeThickness, -latex] (s_offline.south) to [bend left, in=150, out=20] node [above, align=left, font=\footnotesize, xshift=1.5cm, yshift=0.2cm] {repair, \(1\)} (s_repair.south);

        % Self loops. Each loop names its own state as the target; the loop
        % style returns to the start point regardless, so this only makes the
        % source match what is drawn.
        \path (s_repair.north) edge [loop above] node [above, align=left, font=\footnotesize] {wait, \(1-\probRepair\)} (s_repair.north);
        \path (s_nominal.north) edge [loop above] node [above, align=left, font=\footnotesize] {wait, \(1-\probONB\)} (s_nominal.north);
        \path (s_unhealthy.north) edge [loop above] node [above, align=left, font=\footnotesize] {wait, \(1-\probOffline\)} (s_unhealthy.north);
        \path (s_offline.north) edge [loop above] node [above, align=left, font=\footnotesize] {wait, \(1\)} (s_offline.north);

    \end{tikzpicture}
    \caption{
    Local transition dynamics of the SysAdmin example. A label \((a,p)\) refers to a transition happening with probability \(p\) under action \(a\).
    }
    \label{fig:sys_admin_local_dynamics}
\end{figure}

We now consider a variant of the multiagent system administration example from \citet{guestrin2003efficient,choudhury2021scalable}.
A collection of servers must coordinate to provide a consistent level of service, while simultaneously performing necessary maintenance.
Each server is modeled as an individual agent with four local states: nominal \(\mdpState^{i} = 1\), in need of repairs \(\mdpState^{i} = 2\), in repair \(\mdpState^{i} = 0\), and offline \(\mdpState^{i} = 3\).
At any timestep, each agent may choose to continue operation or to initiate a repair.
We assume the local transition dynamics of the agents, illustrated in Figure~\ref{fig:sys_admin_local_dynamics}, to be independent.

The team's task is to reach a target joint state in which all of the servers are operating nominally.
However, we impose the additional constraints that, at any given time during operation, the team is allowed at most two offline servers and at most two servers in the repair state.
If either of these constraints is violated, the team fails the task.

In this example, we set \(\probRepair = 0.9\), \(\probONB = 0.1\), and \(\probOffline = 0.1\). We set the policy synthesis coefficients \(\expectedLengthCoef\) and \(\totalCorrelationCoef\) to \(0.001\) and \(0.1\), respectively, and we use an adjacency parameter of \(\adjParam = 1\) in the differential privacy mechanism.
The acyclic graph \(\directedGraph\) is defined such that the local policy of the first agent depends only on its own local state, the policy of the second agent depends on its own local states as well as those of the first agent, the third agent depends on the local states of the first three agents, and so on.

\textbf{\(\jointPolicy_{MD}\) consistently outperforms \(\jointPolicy_{base}\) under a variety of initial system configurations and privacy levels.}
Figure~\ref{fig:sys_admin_bar_chart} compares the probability of success achieved by the proposed minimum-dependency policy \(\jointPolicy_{MD}\) to that achieved by the baseline \(\jointPolicy_{base}\).
In this example, under truthful communication, it is possible for the team to achieve a success probability of \(1.0\) from any initial configuration.
However, when communication is private, \(\jointPolicy_{MD}\) consistently outperforms \(\jointPolicy_{base}\).
In the considered initial configurations, even under the strongest level of privacy, \(\jointPolicy_{MD}\) achieves a probability of success above \(96\) percent.
By contrast, we observe that under private communications \(\jointPolicy_{base}\) typically achieves success probabilities of less than \(50\) percent.

\begin{figure}[t!]
    \centering
    % Grouped bar chart of success probabilities in the SysAdmin example.
    % Originally exported with tikzplotlib v0.10.1; the unused palette colors
    % from the export have been removed.
    \begin{tikzpicture}
    
    % Only color referenced below (grid style keys of the axis).
    \definecolor{darkgray176}{RGB}{176,176,176}
    
    \begin{axis}[
    width=1.0*\columnwidth, 
    height=3.2cm,
    tick align=inside,
    tick pos=left,
    x grid style={darkgray176},
    xmin=-0.1796, xmax=2.85,
    % The x axis carries no ticks; groups are labeled by hand below the axis.
    xmajorticks=false,
    y grid style={darkgray176},
    ylabel={\footnotesize Success Prob.},
    ymin=0, ymax=1.05,
    ytick style={color=black},
    ytick={0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0},
    yticklabels={0.0, , 0.2, ,0.4, ,0.6, , 0.8, , 1.0},
    ]
    
    % Bars are drawn as rectangles in axis coordinates. Each group starting at
    % x = 0, 1, 2 holds three minDependencyPolicy-colored bars followed by three
    % baselinePolicy-colored bars. The fill pattern encodes the privacy level,
    % as listed in the legend at the bottom of this picture.

    % Pattern: north west lines.
    \draw[draw=minDependencyPolicy, pattern=north west lines, pattern color=minDependencyPolicy] (axis cs:-0.05,0) rectangle (axis cs:0.05,0.982);
    \draw[draw=minDependencyPolicy, pattern=north west lines, pattern color=minDependencyPolicy] (axis cs:0.95,0) rectangle (axis cs:1.05,0.974);
    \draw[draw=minDependencyPolicy, pattern=north west lines, pattern color=minDependencyPolicy] (axis cs:1.95,0) rectangle (axis cs:2.05,0.967);
    
    \draw[draw=baselinePolicy, pattern=north west lines, pattern color=baselinePolicy] (axis cs:0.415,0) rectangle (axis cs:0.515,0.323);
    \draw[draw=baselinePolicy, pattern=north west lines, pattern color=baselinePolicy] (axis cs:1.415,0) rectangle (axis cs:1.515,0.273);
    \draw[draw=baselinePolicy, pattern=north west lines, pattern color=baselinePolicy] (axis cs:2.415,0) rectangle (axis cs:2.515,0.367);
    
    % Pattern: grid.
    \draw[draw=minDependencyPolicy, pattern=grid, pattern color=minDependencyPolicy] (axis cs:0.06,0) rectangle (axis cs:0.16,0.984);
    \draw[draw=minDependencyPolicy, pattern=grid, pattern color=minDependencyPolicy] (axis cs:1.06,0) rectangle (axis cs:1.16,0.981);
    \draw[draw=minDependencyPolicy, pattern=grid, pattern color=minDependencyPolicy] (axis cs:2.06,0) rectangle (axis cs:2.16,0.979);
    
    \draw[draw=baselinePolicy, pattern=grid, pattern color=baselinePolicy] (axis cs:0.525,0) rectangle (axis cs:0.625,0.338);
    \draw[draw=baselinePolicy, pattern=grid, pattern color=baselinePolicy] (axis cs:1.525,0) rectangle (axis cs:1.625,0.242);
    \draw[draw=baselinePolicy, pattern=grid, pattern color=baselinePolicy] (axis cs:2.525,0) rectangle (axis cs:2.625,0.362);
    
    % Pattern: crosshatch.
    \draw[draw=minDependencyPolicy, pattern=crosshatch, pattern color=minDependencyPolicy] (axis cs:0.17,0) rectangle (axis cs:0.27,0.992);
    \draw[draw=minDependencyPolicy, pattern=crosshatch, pattern color=minDependencyPolicy] (axis cs:1.17,0) rectangle (axis cs:1.27,0.996);
    \draw[draw=minDependencyPolicy, pattern=crosshatch, pattern color=minDependencyPolicy] (axis cs:2.17,0) rectangle (axis cs:2.27,0.997);
    
    \draw[draw=baselinePolicy, pattern=crosshatch, pattern color=baselinePolicy] (axis cs:0.635,0) rectangle (axis cs:0.735,0.503);
    \draw[draw=baselinePolicy, pattern=crosshatch, pattern color=baselinePolicy] (axis cs:1.635,0) rectangle (axis cs:1.735,0.284);
    \draw[draw=baselinePolicy, pattern=crosshatch, pattern color=baselinePolicy] (axis cs:2.635,0) rectangle (axis cs:2.735,0.448);
    
    
    \end{axis}
    
    % Label the bars with the corresponding policies.
    % The first label is placed at absolute picture coordinates; the rest are
    % chained to the right of it.
    \node(init1MDPolicy) [] at (0.62cm, -0.2cm) {\(\jointPolicy_{MD}\)};
    \node(init1BasePolicy) [right=0.0cm of init1MDPolicy] {\(\jointPolicy_{base}\)};
    
    \node(init2MDPolicy) [right=0.1cm of init1BasePolicy] {\(\jointPolicy_{MD}\)};
    \node(init2BasePolicy) [right=0.0cm of init2MDPolicy] {\(\jointPolicy_{base}\)};
    
    \node(init3MDPolicy) [right=0.1cm of init2BasePolicy] {\(\jointPolicy_{MD}\)};
    \node(init3BasePolicy) [right=0.00cm of init3MDPolicy] {\(\jointPolicy_{base}\)};
    
    % Put in the initial configuration labels: one brace per bar group, with
    % the description of the initial configuration centered underneath.
    \draw [decorate, decoration = {calligraphic brace, mirror}, very thick] (0.2cm, -0.4cm) --  (2.1cm, -0.4cm);
    \draw [decorate, decoration = {calligraphic brace, mirror}, very thick] (2.5cm, -0.4cm) --  (4.3cm, -0.4cm);
    \draw [decorate, decoration = {calligraphic brace, mirror}, very thick] (4.6cm, -0.4cm) --  (6.5cm, -0.4cm);
    
    \node(init1Label) [font=\footnotesize, align=center, text width=2cm] at (1.1cm, -1.0cm) {One server in need of repair.};
    \node(init2Label) [font=\footnotesize, align=center, text width=2cm] at (3.4cm, -1.0cm) {Two servers in repair.};
    \node(init3Label) [font=\footnotesize, align=center, text width=2cm] at (5.55cm, -1.0cm) {Two in repair, two offline.};
    
    \node(labelForLabels) [font=\footnotesize, align=center, text width=1.5cm] at (-0.7cm, -1.0cm) {Initial\\config:};
    
    % Legend mapping each fill pattern to its privacy level. The swatches are
    % gray because the pattern, not the color, carries the meaning here.
    \begin{customlegend}[
        legend columns=4, 
        legend style={
            align=left, 
            column sep=1.0ex, 
            font=\footnotesize, 
            % draw=white!50!black,
            draw=none,
            fill=white,
            fill opacity=0.8,
            rounded corners=1mm,
            at={(65.0mm, 22.0mm)},
        }, 
        legend entries={\(\privacyLevel = 0.1\), \(\privacyLevel = 1.0\), \(\privacyLevel = 10.0\)}
    ]
    \addlegendimage{area legend,pattern=north west lines, pattern color=gray}
    \addlegendimage{area legend,pattern=grid, pattern color=gray}
    \addlegendimage{area legend,pattern=crosshatch, pattern color=gray}
    \end{customlegend}
    
    \end{tikzpicture}
    
    \caption{
    Probability of success in the SysAdmin example under a variety of initial configurations and privacy levels.
    }
    \label{fig:sys_admin_bar_chart}
\end{figure}

\subsection{Additional Discussion}
\label{subsec:additional_experimental_discussion}

In addition to the differences between the values of the team's probability of success under \(\jointPolicy_{MD}\) and \(\jointPolicy_{base}\), we also observe a significant change in the expected length of the trajectories that result from these policies.
Under truthful communication in the SysAdmin experiments, the expected lengths of the trajectories induced by \(\jointPolicy_{base}\) range from \(30\) to \(40\) timesteps, depending on the initial configuration of the system.
For \(\jointPolicy_{MD}\) these values range from \(3\) to \(6\) timesteps.

This observation yields insight into the different qualitative properties of the policies.
\(\jointPolicy_{base}\) induces conservative behavior that maximizes the team's probability of success (under truthful communication) by requiring the agents to wait for specific joint states before taking the repair action.
The actions of each agent are highly dependent on the states of its teammates.
On the other hand, \(\jointPolicy_{MD}\) achieves nearly the same probability of success as \(\jointPolicy_{base}\), but the agents act quickly and accept a small level of risk in order to reduce the dependencies of their actions on their teammates' states.
More specifically, \(\jointPolicy_{MD}\) results in each agent selecting the repair action with a much higher probability whenever it is in need of repair, regardless of the communicated states of its teammates.
While this behavior results in the team occasionally transitioning to a failure state in which three or more agents are simultaneously under repair, it also significantly lowers the dependencies between the agents. 

The inclusion of the total correlation as a regularization term prevents the policy synthesis procedure from making the agents highly interdependent in order to achieve marginally higher performance.
This tradeoff becomes relevant when the inter-agent communications are imperfect, which is the case in privatized multiagent systems.

Finally, we remark that in some settings there may not exist a collection of highly independent policies that also achieve a high performance.
In such cases, we may not observe as large of a gap in performance between \(\jointPolicy_{MD}\) and \(\jointPolicy_{base}\) under private communication.
However, even in these settings, the total correlation may act as an indicator that it is infeasible to achieve strong performance and privacy simultaneously.

\section{Conclusions}
\label{sec:conclusions}

This paper presents a framework to privatize inter-agent communications in cooperative multiagent decision-making problems. Specifically, we adopt a differential privacy mechanism to protect the symbolic state trajectories of agents. We provide theoretical results to analyze the tradeoff between the strength of privacy and the team’s performance. We synthesize robust policies for agents by reducing the total correlation among them. Numerical results demonstrate that the minimum-dependency policies achieve high performance under strong levels of privacy, whereas the team performance of baseline policies that ignore total correlation decreases dramatically under private communications.  

\begin{acknowledgements}
    This work was supported in part by AFRL FA9550-19-1-0169, AFRL FA8651-23-F-A008, ARL ACC-APG-RTP W911NF1920333, ARO W911NF-20-1-0140, NASA 80NSSC21M0071, NSF 1943275, and ONR N00014-21-1-2502.
\end{acknowledgements}

\bibliography{bibliography}

\end{document}
