% \documentclass{uai2023} % for initial submission
\documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs}
% commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example



\usepackage{algorithm, algorithmic}


\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{subcaption}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{caption}
\usepackage{tikz}
\usepackage{dsfont}
\newcommand{\N}{\mathbb{N}}
\newcommand{\Z}{\mathbb{Z}}
\newcommand{\E}{\mathbb{E}}
\newcommand{\PR}{\mathbb{P}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\TV}{\mathrm{TV}}
\newcommand{\dist}{\mathrm{dist}}
\newcommand{\sS}{\mathcal{S}}
\newcommand{\sA}{\mathcal{A}}
\newcommand{\sZ}{\mathcal{Z}}
\newcommand{\sN}{\mathcal{N}}
\newcommand{\sP}{\mathcal{P}}
\newcommand{\Pro}{\mathcal{P}}
\newcommand{\sM}{\mathcal{M}}
\newcommand{\KL}{\mathrm{KL}}
\newcommand{\sI}{\mathcal{I}}
\newcommand{\diag}{\textup{diag}}
\newcommand{\td}{\textup{TD}}
\newcommand{\tdh}{\widehat{\text{TD}}}
\newcommand{\khop}{{\color{black}\kappa}}
\newcommand{\rhok}{{\color{black}\rho^{\khop+1}}}
\newcommand{\fk}{{\color{black}f(\khop)}}
\newcommand{\nik}{{\color{black}N_i^{\khop}}}
\newcommand{\njk}{{\color{black}N_j^{\khop}}}
\newcommand{\nikc}{{\color{black}N_i^{\khop_c}}}
\newcommand{\nikg}{{\color{black}N_i^{\khop_G}}}
\newcommand{\agentk}{i_0}
\newcommand{\nkkr}{{\color{black}N_{\agentk}^{\khop_r}}}
\newcommand{\nikr}{{\color{black}N_i^{\khop_r}}}
\newcommand{\njkr}{{\color{black}N_j^{\khop_r}}}
\newcommand{\nkkc}{{\color{black}N_{\agentk}^{\khop_c}}}
\newcommand{\Mkc}{{\color{black}\sM_{\nkkc}}}
\newcommand{\ukkr}{{\color{black}U_{\agentk}^{\khop_r}}}
\newcommand{\uikr}{{\color{black}U_i^{\khop_r}}}
\newcommand{\ujkr}{{\color{black}U_j^{\khop_r}}}
\newcommand{\nikphi}{{\color{black}N_i^{\khop_\varphi}}}
\newcommand{\nminusik}{{\color{black}N_{-i}^{\khop}}}
\newcommand{\nminusjk}{{\color{black}N_{-j}^{\khop}}}
\newcommand{\supphi}{\psi}
\newcommand{\supepoch}{K}
\newcommand{\supr}{\tilde r}
\newcommand{\regret}{{\color{black}\text{Avg-Nash-Regret}}}
\newcommand{\icrerr}{{\color{black}\epsilon_{critic,i}}}
\newcommand{\crerr}{{\color{black}\epsilon_{critic}}}
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\abs}[1]{\left\lvert#1\right\rvert}
\newcommand{\Sp}[1]{\left(#1\right)}
\newcommand{\Mp}[1]{\left[#1\right]}
\newcommand{\Bp}[1]{\left\{#1\right\}}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand{\ri}{(\romannumeral1)}
\newcommand{\rii}{(\romannumeral2)}
\newcommand{\riii}{(\romannumeral3)}
\newcommand{\osi}[1]{\overset{\ri}{#1}}
\newcommand{\osii}[1]{\overset{\rii}{#1}}
\newcommand{\osiii}[1]{\overset{\riii}{#1}}



% if you use cleveref..
\usepackage[capitalize,noabbrev]{cleveref}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}

\title{Convergence Rates for Localized Actor-Critic in\\ Networked Markov Potential Games}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:zhouzhao20@mails.tsinghua.edu.cn}{Zhaoyi Zhou}}
\author[2]{\href{mailto:zchen458@caltech.edu}{Zaiwei Chen}}
\author[2]{\href{mailto:yihengl@caltech.edu}{Yiheng Lin}}
\author[2]{\href{mailto:adamw@caltech.edu}{Adam Wierman}}
% Add affiliations after the authors
\affil[1]{%
    Institute for Interdisciplinary Information Sciences\\
    Tsinghua University
}
\affil[2]{%
    The Computing $+$ Mathematical Sciences (CMS) Department, California Institute of Technology
}
  
\begin{document}
\maketitle


\setlist[itemize]{noitemsep,nosep}
\setlist[enumerate]{noitemsep,nosep}


\begin{abstract}
We introduce a class of networked Markov potential games where agents are associated with nodes in a network. Each agent has its own local potential function, and the reward of each agent depends only on the states and actions of agents within a neighborhood.  In this context, we propose a localized actor-critic algorithm.  The algorithm is scalable since each agent uses only local information and does not need access to the global state.  Further, the algorithm overcomes the curse of dimensionality through the use of function approximation.  Our main results provide finite-sample guarantees up to a localization error and a function approximation error. Specifically, we achieve an $\tilde{\mathcal{O}}(\tilde{\epsilon}^{-4})$ sample complexity measured by the averaged Nash regret. This is the first finite-sample bound for multi-agent competitive games  that does not depend on the number of agents.
\end{abstract}

\section{Introduction}\label{sec:intro}
Large-scale systems where agents interact competitively with each other have received significant attention recently, motivated by applications such as power systems \citep{shi2022stability}, EV charging \citep{lee2022systems}, and board games \citep{silver2017mastering}. Controlling such systems can be challenging due to the scale of the system, uncertainty about the model, communication constraints, and the interaction between agents. Inspired by the recent success of reinforcement learning (RL), there is an increasing interest in applying RL methods to environments with multi-agent interactions.  However, in multi-agent RL (MARL), the analysis of the system behavior becomes challenging due to the time-varying nature of the environment faced by each agent, which results from the (time-varying) competitive decisions of other agents. As a result, the theoretical analysis of MARL, especially in the competitive setting, is still limited, particularly when it comes to large-scale systems.

Results on MARL in competitive settings to this point have tended to focus on games with a small number of players, e.g., $2$-player zero-sum stochastic games \citep{littman1994markov}, or games with special structure, e.g., Markov potential games (MPGs) \citep{fox2022independent}.  MPGs in particular provide a setting in which the challenges of large-scale systems can be studied. The intuition behind an MPG  parallels that of classical (one-shot) potential games. Specifically, the existence of a potential function guarantees that agents can converge to a global equilibrium even when using greedy localized updates. MPGs have wide-ranging applications including variants of congestion games \citep{leo2021convergeMPG,fox2022independent}, medium access control \citep{macua2018learning}, and the stochastic lake game \citep{dechert2006stochastic}. However, existing theoretical results for MPGs rely on the assumption that a centralized global state exists and can be observed by each individual agent. Such an assumption rules out applications in many large-scale systems including transportation networks \citep{zhang2016control} and social networks \citep{application_chakrabarti2008epidemic}, where the global state space can be exponentially large in the number of agents and/or each agent can only observe its own local state.

A promising approach for the design of scalable and local MARL algorithms in competitive settings is to exploit the networked structure of practical applications to design algorithms with sample complexity that only depends on \emph{local} properties of the network instead of the \emph{global} state. This approach has recently been successful in the case of cooperative MARL.  For example, \citet{qu2019scalableMARL,lin2021multi,zhang2022global} provide scalable localized algorithms with sample complexity that does not depend on the number of agents. However, to this point, local algorithms that exploit network structure do not exist in the competitive MARL setting. Thus, we ask: \emph{Can we design a scalable and local algorithm with finite-time bounds for networked MARL with competitive agents?}

\subsection{Main Contributions}
We address the question above by introducing a class of networked Markov potential games (NMPGs) as the networked counterpart of classical MPGs. Importantly, NMPGs represent a broader class of games than MPGs, and draw focus to algorithm design that uses only local information. 

We design a localized actor-critic algorithm that is a combination of independent policy gradient and localized TD$(\lambda)$ with linear function approximation. Notably, our algorithm is \textit{model-free}, uses only \textit{local information}, and  successfully incorporates \textit{function approximation}. This avoids both the need for communication of the global state and the so-called ``curse of dimensionality'' in MARL. 

Our main results provide a finite-sample bound on the averaged Nash regret for our proposed algorithm, which implies an $\tilde{\mathcal{O}}(\tilde{\epsilon}^{-4})$ sample complexity (where $\tilde{\epsilon}$ is the accuracy) up to an approximation error of using local information and a function approximation error. To our knowledge, we are the first to develop a localized algorithm in competitive MARL settings with provable performance guarantees that do not depend on the number of agents. 

Our results are enabled by a novel analysis of the critic in our localized actor-critic framework.  In particular, we propose a localized cost evaluation problem, a new MARL setting to investigate the performance of a local algorithm under a fixed policy. As a critical part of the proof, we propose a novel concept called a ``sub-chain'' that connects local algorithms to their global counterparts, enabling performance bounds via bounds on the gap between the two. 



\subsection{Related Work}
\textbf{Markov Potential Games.} Our work adds to the literature on MPGs in MARL. Analytic results for non-cooperative MARL are challenging to obtain because agents learn in a non-stationary environment as other agents update their policies.  As a result, existing analysis has focused on special cases like $2$-player stochastic games \citep{littman1994markov}, adversarial team Markov games \citep{kalogiannis2022efficiently}, and MPGs \citep{fox2022independent}.  The case of MPGs has received considerable attention recently because the potential games are broadly applicable \citep{leo2021convergeMPG} and the existence of potential functions enables provable guarantees \citep{zhang2021gradientStochasticGame,pmlr-v162-ding22b,fox2022independent,zhang2022logBarrierSoftmax}.  While these papers provide algorithms with provable convergence guarantees, they assume that all agents share a common global state and can observe the global state to decide local actions.  An important open question is understanding how to learn in settings where global information is not available.  Our work studies the MARL setting where each agent has its own local state and can only decide local actions based on the local states.

\textbf{MARL in Networked Systems.} The Markov decision process (MDP) model we study is inspired by a series of works on Networked MARL \citep{qu2019scalableMARL,lin2021multi,zhang2022global}, where RL agents are located on a network. In such models, the local state transition of an agent is affected by its own local state/action and its direct neighbors' local states. Networked MARL is applicable to a wide range of applications, including communication networks \citep{application_communication}, social networks \citep{application_chakrabarti2008epidemic}, and traffic networks \citep{zhang2016control}. Compared with general MARL, the additional structure of networked MARL enables us to establish a critical exponential decay property on the local $Q$-functions, which leads to the design of localized  actor-critic algorithms \citep{qu2019scalableMARL,lin2021multi}. All prior works on networked MARL study the case when agents cooperatively maximize the sum of all local rewards. In contrast, our work studies a non-cooperative NMPG in which each agent has its own objective. 

Another approach to study MARL problems is to use mean-field control (MFC) \citep{gu2021mean, mondal2022on, mondal2022can}. The major difference between the mean-field setting and our setting is that mean-field MARL focuses on homogeneous agents, while we allow each agent to have different transition probabilities and local policies.

\paragraph{Finite-Sample Analysis of TD-Learning Variants.} TD-learning and its variants are widely used for policy evaluation in RL, which plays a critical role in most policy-space algorithms. The asymptotic analysis of TD-learning dates back to \citet{tsitsiklis1994asynchronous,jaakkola1994convergence}, while finite-sample convergence bounds have received attention in the last decade. In TD-learning, function approximation is a useful technique to reduce the dimension of learning parameters at the cost of incurring an approximation error that depends on the function class. Recently, many breakthroughs have been made on finite-sample error bounds for TD-learning with function approximation \citep{bhandari2018finite,Srikant2019FiniteTimeEB,dalal2018finite,yu2009convergence}. Meanwhile, in multi-agent settings, localized TD-learning is crucial for limiting communication and the need for global information \citep{lin2021multi}. Our work provides a novel finite-sample error bound for localized TD-learning with function approximation.



\section{Problem Description}
\label{sec:setting}

\textbf{Network Structure.} We study MARL in the context of networked multi-agent Markov games. Specifically, we consider a setting with $n$ agents that are associated with an undirected graph $\mathcal{G} = (\mathcal{N},\mathcal{E})$, where  $\mathcal{N}=\{1,2,\ldots,n\}$ is the set of nodes and $\mathcal{E}\subseteq \mathcal{N}\times\mathcal{N}$ is the set of edges. We denote by $\text{dist}(i,j)$ the graph distance between agents $i$ and $j$. The local state space and local action space of agent $i$ are denoted by $\mathcal{S}_i$ and $\mathcal{A}_i$, respectively, which are both finite sets. The global state is denoted as $s = (s_1,\ldots,s_n)\in  \mathcal{S}:= \prod_{i=1}^n\mathcal{S}_i$ and the global action is defined similarly.
For any subset $I\subseteq \sN$, we use $s_I$ to denote the joint state of the agents in $I$ and use $\sS_I:=\prod_{i\in I} \sS_i$ to denote the joint state space of agents in $I$. Similarly, we define $a_I$ and $\sA_I$ as the joint action and joint action space of the agents in $I$. Denote $\mu\in \Delta(\sS)$ as the initial state distribution, where $\Delta(\mathcal{S})$ denotes the $|\mathcal{S}|$-dimensional probability simplex.

\textbf{Transition Probabilities.} At time $t\geq 0$, given current state $s(t)$ and action $a(t)$, for each agent $i\in\mathcal{N}$, its successor state $s_i(t+1)$ is independently generated according to the following transition probability, which is only dependent on its neighbors' states and its own action: 
\begin{align*}
    \sP(s(t+1)\,|\, s(t),a(t)) \!=\! \prod_{i=1}^n \sP_i(s_i(t+1)\,|\, s_{\mathcal{N}_i}(t),a_i(t)),
\end{align*}
where $\mathcal{N}_i=\{i\}\cup \{j\in\mathcal{N}\mid (i,j)\in\mathcal{E}\}$ denotes the neighborhood of $i$, including $i$ itself. In addition, given an arbitrary integer $\khop\geq 0$, we use $\nik$ to denote the $\khop$-hop neighborhood of $i$, i.e., $\nik=\{i\}\cup \{j\in\mathcal{N}\mid \text{dist}(i,j)\leq \kappa\}$,
and use $-\nik=\sN\setminus\nik$ to denote the set of agents that are not in $\nik$. We use $U_i^{\kappa}=\nik\setminus\{i\}$ to denote the agents in the $\kappa$-hop neighborhood of $i$, excluding $i$ itself.

\begin{remark}
We require that each agent's transition probability depends only on the states of its neighbors and its own action, which is common in networked MARL literature \citep{qu2019scalableMARL, zhang2022global}. Intuitively, it implies that the impact from far-away agents on the network is ``negligible'', which eventually leads to the exponential decay property (cf. Lemma \ref{le:truncated_Q}).
\end{remark}

\textbf{Reward Function.} Each agent $i\in \sN$ is associated with a deterministic reward function $r_i:\mathcal{S}\times\mathcal{A}\mapsto [0,1]$. The interval $[0, 1]$ is chosen without loss of generality over the set of bounded reward functions. In general, agent $i$'s reward depends on the global state and the global action. Due to the network structure, we assume that there exists a non-negative integer $\kappa_r$ such that the reward function of each agent depends only on the states and the actions of the agents within its $\kappa_r$-hop neighborhood (including itself), i.e., $r_i(s,a)=r_i(s_{\nikr},a_{\nikr})$ for all $i$. This makes intuitive sense as we expect the dependence between two agents to weaken as their graph distance grows.


\textbf{Policy.}
In this work, we consider stationary policies \citep{zhang2021multi}. Specifically, each agent $i\in \sN$ is associated with a localized policy $\xi_i:\mathcal{S}_i\mapsto\Delta(\mathcal{A}_i) $. Given a subset $I\subseteq\mathcal{N}$, we define $\xi_I:\mathcal{S}_I\mapsto\Delta(\mathcal{A}_I)$ as the joint policy of agents in $I$. Note that $\xi_I(a_I\mid s_I)=\prod_{i\in I}\xi_i(a_i\mid s_i)$. 
We use $\Xi_i$ to denote agent $i$'s local policy space, and $\Xi_I$ to denote the joint policy space of agents in $I$. When $I=\mathcal{N}$, we omit the subscript and just write $\xi$ for $\xi_\mathcal{N}$ (and $\Xi$ for $\Xi_\mathcal{N}$).
Throughout, we also use $\xi=(\xi_1,\xi_2,\cdots,\xi_n)$ to highlight the local policy components. In this work, we will frequently work with softmax policies, which are defined as
\begin{equation}
    \xi_i^{\theta_i}(a_i|s_i)=\frac{\exp(\theta_{i,s_i,a_i})}{\sum_{a_i'\in\sA_i}\exp(\theta_{i,s_i,a_i'})},\;\forall\;i,s_i,a_i,
\end{equation}
where $\xi_i^{\theta_i}$ stands for agent $i$'s local policy parametrized by the weight vector $\theta_i\in \R^{\abs{\sS_i} \abs{\sA_i}}$.
We denote $\theta=(\theta_1,\theta_2,\cdots,\theta_n)$ as the parameter of a global policy $\xi^\theta$. 


\textbf{Value Function.}
Given a global policy $\xi$ and an agent $i$, we define agent $i$'s $Q$-function $Q_i^\xi\in\mathbb{R}^{|\mathcal{S}||\mathcal{A}|}$ as
\begin{align*}
    Q_i^{\xi}(s,a) 
    = \sum_{t=0}^{\infty} \gamma^t\E_{\xi} \left[ r_i(s(t),a(t)) \,\middle|\,s(0)=s,a(0)=a\right]
\end{align*}
for all $(s,a)$, where $\gamma\in (0,1) $ is the discount factor, and $\E_{\xi}[\,\cdot\,]$ is taken w.r.t. the randomness in the (stochastic) policy $\xi$ and the  transition probabilities.  With $Q_i^\xi$ defined above, the averaged $Q$-function $\overline Q_i^{\xi}\in\mathbb{R}^{|\mathcal{S}||\mathcal{A}_i|}$ and the value function $V_i^{\xi}\in\mathbb{R}^{|\mathcal{S}|}$ of agent $i$ are defined as $\overline Q_i^{\xi}(s,a_i)=\E_{a_{-i}\sim \xi_{-i}(\cdot|s_{-i})}[Q_i^{\xi}(s,a_i,a_{-i})]$ for all $(s,a_i)$ and $V_i^{\xi}(s)=\E_{a_i\sim \xi_i(\cdot|s_i)}[\overline Q_i^{\xi}(s,a_i)]$ for all $s$,
where we use $s_{-i}$, $a_{-i}$, and $\xi_{-i}$ to denote the joint state, the joint action, and the joint policy of the agents in $\sN\setminus\{i\}$, respectively.  With the initial state distribution $\mu$, we define  $J_i(\xi)=\E_{s\sim\mu}[V_i^{\xi}(s)]$. Finally, we define the advantage function of agent $i$ as $A_i^{\xi}(s,a)=Q_i^{\xi}(s,a)-V_i^{\xi}(s)$ for all $(s,a)$, and the averaged advantage function of agent $i$ as $\overline A_i^{\xi}(s,a_i)=\overline Q_i^{\xi}(s,a_i)-V_i^{\xi}(s)$ for all $(s,a_i)$. When the policy uses softmax parameterization with parameter $\theta$, we may abuse notation and use the policy parameter $\theta$ to represent the policy $\xi^{\theta}$ for simplicity. For example, we may write $J_i(\theta)$ for $J_i(\xi^{\theta})$.


\textbf{Discounted State Visitation Distribution.}
Given a policy $\xi$ and an initial state $s'$, we define the \textit{discounted state visitation distribution} as $d^{\xi}_{s'}(s)=(1-\gamma)\sum_{t=0}^{\infty}\gamma^t {\Pr}^{\xi}[s(t)=s\ |\ s(0)=s']$
for all $s\in\mathcal{S}$,
where $\Pr^{\xi}[s(t)=s\ |\ s(0)=s']$ denotes the probability that $s(t)=s$ given that the initial state is $s'$ and the global policy is $\xi$. We use $d^{\xi}(s):=\E_{s'\sim \mu}[d^{\xi}_{s'}(s)]$ to represent the discounted state visitation distribution when the initial state distribution is $\mu$. 

\section{Networked MPGs}\label{sec:MLPG}
Our focus is a class of networked multi-agent Markov games, which we call NMPGs and define as follows.

\begin{definition}
\label{def:local_MPG} A multi-agent Markov game is called a $\kappa_G$-NMPG (where $\kappa_G$ is a non-negative integer) if
there exists a set of local potential functions $\{\Phi_i \}_{i\in \sN}$, where $\Phi_i:\Xi\rightarrow \R$ for all $i\in \sN$, such that the following equality holds for any $i\in \sN$, $j\in \nikg$, $\xi_j,\xi_j'\in\Xi_j$, and $\xi_{-j}\in \Xi_{-j}$:
\begin{equation}
\label{eq:approx_MPG}
   J_j(\xi_j',\xi_{-j})\!-\!J_j({\xi_j,\xi_{-j}})\! =\! \Phi_i(\xi_j',\xi_{-j})\!-\!\Phi_i(\xi_j,\xi_{-j}).  
\end{equation}
\end{definition}

Definition \ref{def:local_MPG} states that when agent $j$ changes its local policy, the change in its objective function $J_j(\cdot,\xi_{-j})$ can be measured by the change of local potential functions from any agent in its $\kappa_G$-hop neighborhood. The non-negative integer $\kappa_G$ is determined by the networked MPG setting and reflects the extent to which the networked MPG is relaxed from an MPG. Recall that in the definition of a standard MPG \citep{leo2021convergeMPG}, there exists a (global) potential function $\Phi$ such that Eq. (\ref{eq:approx_MPG}) holds with $\Phi_i$ being replaced by $\Phi$ for all $i$. Therefore, an MPG is always an NMPG (by choosing $\Phi_i=\Phi$ for all $i$), and hence NMPG represents a strictly broader class of games. More discussions are given in Appendix F.1, and a concrete example of an NMPG is presented in Section \ref{subsec:examples}.  

Due to the boundedness of the reward function and Eq. (\ref{eq:approx_MPG}), the local potential functions are uniformly bounded from above and below, i.e., there exist $\Phi_{\min},\Phi_{\max}>0$ such that
$\Phi_i(\xi)\in [\Phi_{\min},\Phi_{\max}]$ for all $i\in\mathcal{N}$ and $\xi\in\Xi$. See Appendix F.6 for more details.

Unlike in single-agent RL or cooperative MARL, the optimal policy is not well-defined in the competitive setting, and thus our goal is to design algorithms that learn Nash equilibria of NMPGs. We next introduce the concepts of Nash equilibrium, Nash gap, and averaged Nash regret.
\begin{definition}
    A global policy $\xi$ is a Nash equilibrium if $J_i(\xi_i,\xi_{-i})\geq J_i(\xi_i',\xi_{-i})$ for all $\xi_i'\in \Xi_i$ and $i\in\mathcal{N}$.
\end{definition}
To measure the performance of a policy by its ``distance'' to a Nash equilibrium, we use the Nash gap.

\begin{definition}
\label{def:nash_regret}
    Given a global policy $\xi$, agent $i$'s Nash gap and the global Nash gap are defined as
\begin{align*}
     \text{NE-Gap}_i(\xi):=\;&\max_{\xi_i'} J_i(\xi_i',\xi_{-i})-J_i(\xi_i,\xi_{-i}),\\
     \text{NE-Gap}(\xi):=\;&\max_{i\in \sN} \text{NE-Gap}_i(\xi).
\end{align*}
\end{definition}
With $\text{NE-Gap}(\cdot)$ defined above, given $\hat{\epsilon}>0$, we say that a policy $\xi$ is an $\hat{\epsilon}$-approximate Nash equilibrium if $\text{NE-Gap}(\xi)\leq \hat{\epsilon}$. When using a softmax policy with parameter $\theta$, we may abuse the notation to denote $\text{NE-Gap}_i(\theta)$ for $\text{NE-Gap}_i(\xi^\theta)$ and also $\text{NE-Gap}(\theta)$ for $\text{NE-Gap}(\xi^\theta)$.

While Definition \ref{def:nash_regret} enables us to measure the performance of a single policy, in MARL, most algorithms iterate over a sequence of policies. To measure the performance of a sequence of policies, we use the averaged Nash regret, which is defined in the following. 
\begin{definition}\label{def:avg_NR}
    Given a sequence of $M$ policies $\{\xi(0),\xi(1),\ldots,\xi(M-1)\}$, the averaged Nash regret of agent $i$ and the global averaged Nash regret are defined as
    \begin{align*}
        \regret_i(M)&=\frac{1}{M} \sum_{m=0}^{M-1} \text{NE-Gap}_i(\xi(m)),\\
        \regret(M)&=\max_{i\in\sN} \regret_i(M).
    \end{align*}
\end{definition}

Note that a similar concept  called ``Nash Regret'' was previously introduced in \cite{pmlr-v162-ding22b}, and is defined as 
\begin{align}\label{eq:NR}
    \text{Nash-Regret}(M)=\frac{1}{M} \sum_{m=0}^{M-1} \max_{i\in \sN}\text{NE-Gap}_i(\xi(m)).
\end{align}
By using Jensen's inequality and the fact that the maximum of a set of non-negative real numbers is at most their sum, we easily have $\regret(M)=\Theta (\text{Nash-Regret}(M))$. See Appendix F.4 for the proof. As a result, $\regret(M)$ and $\text{Nash-Regret}(M)$ have the same rate of convergence (up to a multiplicative constant that depends on the number of agents).

\subsection{An Example of NMPGs}\label{subsec:examples}
To illustrate the model, we present an extension of classical congestion games \citep{roughgarden2004bounding} and distributed welfare games \citep{marden2013distributed}. In this example, $n$ agents are located on a traffic network $\mathcal{T} = (\mathcal{V}, \zeta)$, where $\mathcal{V}$ denotes the set of nodes and $\zeta$ denotes the set of directed edges with self-loops.\footnote{Note that the traffic network and the communication network $\mathcal{G}$ may be different.} The objective of each agent $i$ is to commute from its start node $h_i$ to its destination $d_i$. In this example, the local state $s_i(t)$ of agent $i$ at time $t$ is its current location (a node $v \in \mathcal{V}$). By choosing a directed edge $(v, u) \in \zeta$ as its local action $a_i(t)$ at time $t$, agent $i$ will transit to state $s_i(t+1)=u$ at time $t+1$.  Without loss of generality, we assume an agent will stay at the same node after it arrives at its destination.

The reward of agent $i$ is defined as $r_i(t)=0$ if $s_i(t) = d_i$, $r_i(t)=-\Bar{\epsilon}$ if $s_i(t+1) = s_i(t) \neq d_i$, and $r_i(t)=-\Bar{\epsilon} - N(a_i(t), t)$ otherwise, where $\Bar{\epsilon} > 0$ is a constant and $N(e, t)$ denotes the number of agents that choose edge $e$ at time $t$. The reward is designed so that the agent incurs a time cost of $\Bar{\epsilon}$ for every step spent on its trip and a congestion cost of $N(a_i(t), t)$ depending on the traffic on the edge it travels through. The congestion cost is avoided if the agent chooses to wait at its current location (i.e., $s_i(t+1) = s_i(t)$). Each agent's goal is to maximize its expected discounted cumulative reward $\mathbb{E}\left[\sum_{t=0}^\infty \gamma^t r_i(t)\right]$.

To see that this congestion game fits in our NMPG framework, consider the following communication network $\mathcal{G}$: agents $i$ and $j$ are neighbors if and only if there exists a global policy $\xi$ such that $\sum_{t = 0}^\infty \Pr(s_i(t) = s_j(t), s_i(t)\not=d_i, s_j(t)\not=d_j) > 0$. Under this communication network, the transition kernel is completely local because the next state of any agent $i$ is decided completely locally and the local reward of agent $i$ is a function that depends on the $1$-hop local states and actions $(s_{\mathcal{N}_i^1}, a_{\mathcal{N}_i^1})$. 
We provide more discussion of this example and numerical simulations using it in Appendix A. 


\section{Algorithm Design}\label{sec:alg}

We now present a novel algorithm for solving NMPGs.  Our approach uses a combination of independent policy gradient (IPG) with localized TD-learning to form a localized actor-critic framework.

\subsection{Actor: Independent Policy Gradient}
Suppose that the agents have complete knowledge about the underlying model (e.g., reward function and transition dynamics). Then a popular approach for solving MPGs is to use IPG, which is presented in Algorithm \ref{alg:exact_IPG} \citep{leo2021convergeMPG,zhang2021gradientStochasticGame,pmlr-v162-ding22b,fox2022independent,zhang2022logBarrierSoftmax}. 
\begin{algorithm}
    \caption{Independent Policy Gradient  }
    \label{alg:exact_IPG}
    \begin{algorithmic}[1]
    \STATE \textbf{Input:} Initialization $\theta_i(0)=0$, $\forall\; i\in \sN$.
        \FOR{$m=0,1,2,\cdots,M-1$}
               \STATE  $\theta_i(m+1)=\theta_i(m)+\beta\nabla_{\theta_i} J_i(\theta(m))$ for all $i\in\mathcal{N}$
        \ENDFOR
    \end{algorithmic}
\end{algorithm}

In each round of Algorithm \ref{alg:exact_IPG}, all agents simultaneously update their policies by implementing gradient ascent (in the policy space) w.r.t. their own objective functions (cf. Algorithm \ref{alg:exact_IPG} Line 3). Notably, to carry out Algorithm \ref{alg:exact_IPG}, each agent only needs to know its own policy. While Algorithm \ref{alg:exact_IPG} is promising, it is not a model-free algorithm as computing the gradient requires knowledge of the underlying MDP model.  This motivates the design of a critic to help estimate the gradient.

\subsection{Critic: Localized TD$(\lambda)$ with Linear Function Approximation}\label{subsec:critic_algorithm}
To motivate the design of the critic, we first present an explicit expression of the policy gradient of agent $i$ \citep{sutton1999PGT}:
\begin{align}
    \nabla_{\theta_i}J_i(\theta) =\;&\sum_{t=0}^{\infty}\gamma^t \E_{\xi^{\theta}}\big[\nabla_{\theta_i}\log \xi_i^{\theta_i}(a_i(t)|s_i(t))\nonumber\\
        &\times \overline Q_{i}^{\theta}(s(t),a_i(t))\big]. \label{eq:PGT}
\end{align}
Similar versions of policy gradient theorems under different multi-agent settings were previously developed in \cite{zhang2022logBarrierSoftmax,mao2022Decent-MARL}. For completeness, we present a proof of Eq. (\ref{eq:PGT}) in Appendix F.2.

In view of Eq. (\ref{eq:PGT}), to estimate $\nabla_{\theta_i}J_i(\theta)$, the key is to construct an estimate of the averaged $Q$-function $\overline Q_{i}^{\theta}$. However, directly estimating the averaged $Q$-function of agent $i$ requires information about the global state, incurring long-distance communication. 
To localize the algorithm, we introduce a hyper-parameter $\kappa_c\in \N$, and for each agent, we learn an approximation of the averaged $Q$-function (which we refer to as the $\kappa_c$-truncated averaged $Q$-function) using only information in its $\kappa_c$-hop neighborhood. 

\textbf{Truncated Averaged $Q$-functions.}
Given the non-negative integer $\kappa_c$, agent $i\in \sN$, and a global policy parameter $\theta$, we define $\mathcal Q_i^{\theta,\kappa_c}$ as the class of $\kappa_c$-truncated averaged $Q$-functions w.r.t. $\overline Q_i^{\theta}$. Specifically,
\begin{align*}
    &\mathcal Q_i^{\theta,\kappa_c}=\left\{\overline Q_i^{\theta,\kappa_c} \in\mathbb{R}^{|\mathcal{S}_{\nikc}||\mathcal{A}_i|}\;\middle|\;\exists\, u_i\in \Delta(\mathcal{S}_{-\nikc})\text{ s.t. }\right.\\
    &\left.\overline Q_i^{\theta,\kappa_c}(s_{\nikc},a_{i}) =\mathbb{E}_{s_{-\nikc}\sim u_i}\left[\overline Q_i^{\theta}(s_{\nikc},s_{-\nikc},a_i)\right],\right.\\
    &\left.\forall\;(s_{\nikc},a_{i})\in \mathcal{S}_{\nikc}\times \mathcal{A}_i\right\}.
\end{align*}
Note that when $\kappa_c \geq \max_{i,j}\text{dist}(i,j)$, there is essentially no truncation, i.e., any element in $\mathcal Q_i^{\theta,\kappa_c}$ is equal to $\overline Q_i^{\theta}$.
When $\kappa_c<\max_{i,j}\text{dist}(i,j)$, we have the following \emph{exponential-decay property}. See Appendix F.3 for the proof.


\begin{lemma}\label{le:truncated_Q}
For any $\kappa_c\in \N$, agent $i$, and global policy parameter $\theta$, it holds  that 
{\begin{align}
    & \sup_{\overline Q_{i}^{\theta,\kappa_c}\in \mathcal Q_i^{\theta,\kappa_c}}\max_{s,a_i}\abs{\overline Q_{i}^{\theta,\kappa_c}(s_{\nikc},a_{i})-\overline Q_i^{\theta}(s,a_i)} \nonumber \\
    \leq\;& \frac{2\min\left( \gamma^{\kappa_c-\kappa_r+1},1\right)}{1-\gamma} .
\end{align}}
\end{lemma}
In view of Lemma \ref{le:truncated_Q}, the $\kappa_c$-truncated averaged $Q$-function approximates the averaged $Q$-function (at a geometric rate) as $\kappa_c$ increases. Therefore, it is enough for the critic to estimate an arbitrary $\kappa_c$-truncated averaged $Q$-function within the class $\mathcal Q_i^{\theta,\kappa_c}$. It is worth noting that the use of truncated $Q$-functions and the exponential-decay property have been widely exploited in the cooperative MARL literature
for communication and dimension reduction in recent years \citep{qu2019scalableMARL,gu2022mean,lin2021multi}.  In this work, we show how to use such an approach in a non-cooperative setting for the first time. 

\textbf{Linear Function Approximation.} While using the $\kappa_c$-truncated $Q$-functions enables us to overcome the computational bottleneck as the number of agents increases, there is still the challenge due to the curse of dimensionality. To further reduce the parameter dimension, we use linear function approximation. To be specific, for each $i\in\mathcal{N}$, let $\phi_i: \sS_{\mathcal{N}_i^{\kappa_c}}\times \sA_i\rightarrow \R^{d_i}$ be a feature mapping of agent $i$. Then, with weight vector $w_i\in\mathbb{R}^{d_i}$, we consider approximating the $\kappa_c$-truncated $Q$-functions using $\hat Q_{i}(s_{\mathcal{N}_i^{\kappa_c}},a_i,w_i)=\langle\phi_i(s_{\mathcal{N}_i^{\kappa_c}},a_i),w_i \rangle$ for all $(s_{\mathcal{N}_i^{\kappa_c}},a_i)$. Let $\tilde\phi_i(s,a_i)=\phi_i(s_{\nikc},a_i)$ for any $i\in \sN$, $s\in \sS$, and $a_i\in \sA_i$. That is, given an agent $i$, for each pair $(s,a_i)$ of global state and local action, we look at the states of agents in agent $i$'s $\kappa_c$-hop neighborhood  (i.e., $s_{\mathcal{N}_i^{\kappa_c}}$) and agent $i$'s action (i.e., $a_i$) and assign the vector $\phi_i(s_{\mathcal{N}_i^{\kappa_c}}, a_i)$ to $\tilde{\phi}_i(s,a_i)$. Then agent $i$'s feature matrix $\Omega_i$ is defined to be an $|\mathcal{S}||\mathcal{A}_i|\times d_i$ matrix with its $(s,a_i)$-th row being $\tilde\phi_i^\top(s,a_i)$, where $(s,a_i)\in\mathcal{S}\times \mathcal{A}_i$. 

We propose a novel policy evaluation algorithm called localized TD$(\lambda)$ with linear function approximation, which is presented in Algorithm \ref{alg:LPES}. The algorithm can be viewed as an extension of the classical TD$(\lambda)$ with linear function approximation \citep{tsitsiklis1997analysis} to the case where we estimate the $\kappa_c$-truncated averaged $Q$-functions using local information. 




\begin{algorithm}[H]
    \caption{Localized TD($\lambda$) with Linear Function Approximation}
    \label{alg:LPES}
    \begin{algorithmic}[1]
    \STATE \textbf{Input}: Target policy $\xi^\theta$,
    positive integers $K$ and $\kappa_c\geq \kappa_r$, initializations $w_i(0)=0$ for all $i$, step size $\alpha>0$, $\lambda\in [0,1)$, and $\epsilon>0$.
    \STATE Construct $\epsilon$-exploration policy $\hat \xi_i(a_i|s_i)=(1-\epsilon)\xi^{\theta_i}_i(a_i|s_i)+\epsilon /\abs{\sA_i}$, for all $i,a_i$, and $s_i$.
    \label{line:critic_sample_beg}
    \STATE The agents use the joint policy $\hat\xi=(\hat{\xi}_1,\hat{\xi}_2,\cdots,\hat{\xi}_n)$ to collect a sequence of samples $\tau=\{(s(t),a(t),r(t))\}_{0\leq t\leq K-1}\cup \{s(K)\}$ 
    \FOR {$i=1,2,\cdots,n$}
    \STATE $\tau|_{(i,\kappa_c)}:=\{(s_{\mathcal{N}_i^{\kappa_c}}(t),a_i(t),r_i(t))\}_{0\leq t\leq K-1}\cup \{s_{\mathcal{N}_i^{\kappa_c}}(K)\}$
        \FOR{$t=0,1,\cdots, K-1$}
            \STATE $\delta_{i}(t) = \phi_i(s_{\mathcal{N}_i^{\kappa_c}}(t),a_i(t))^\top w_i(t)
            -r_i(t) -\gamma
            \phi_i(s_{\mathcal{N}_i^{\kappa_c}}(t+1),a_i(t+1))^\top  w_i(t)$
            \STATE $w_i(t+1)=w_i(t)-\alpha\delta_{i}(t)\zeta_{i}^{\kappa_c}(t)$ 
            \STATE $\zeta_{i}^{\kappa_c}(t+1) =(\gamma\lambda)\zeta_{i}^{\kappa_c}(t)+\phi_i(s_{\mathcal{N}_i^{\kappa_c}}(t+1),a_i(t+1))$
        \ENDFOR
        \label{line:critic_upd_end}
    \ENDFOR
    
    \STATE \textbf{Return} $\{w_i(K)\}_{i\in\mathcal{N}}$.
    \end{algorithmic}
\end{algorithm}

Note from Algorithm \ref{alg:LPES} Line $2$ that we use $\epsilon$-exploration policies to ensure exploration in localized TD$(\lambda)$. Denote the set of all $\epsilon$-exploration policies by $\Xi^\epsilon$. Importantly, agent $i$ requires only the states and the actions of the agents in its $\kappa_c$-hop neighborhood to carry out the algorithm, where $\kappa_c$ can be viewed as a tunable parameter that trades off the communication effort and the accuracy. In particular, the larger $\kappa_c$ is, the closer the $\kappa_c$-truncated averaged $Q$-function is to the true averaged $Q$-function, albeit at a cost of requiring more communication among agents.


\subsection{Localized Actor-Critic}
Combining IPG with localized TD($\lambda$), we arrive at a localized actor-critic algorithm for solving NMPGs, which is presented in Algorithm \ref{alg:approx_IPG}. 

\begin{algorithm*}[h]
    \caption{Localized Actor-Critic}\label{alg:approx_IPG}
    \begin{algorithmic}[1]
        \STATE \textbf{Input}: Non-negative integers $M$, $T$, $K$, $H$, $\kappa_c\geq \kappa_r$, and a positive real number $\epsilon>0$, initializations $\theta_i(0)=0$ for all $ i$, and $\Delta_{i}^0(m)=0$ for all $i$ and $m$.
        \FOR{$m=0,1,2,\cdots,M-1$} 
             \STATE  All agents simultaneously execute localized TD$(\lambda)$ with linear function approximation (with $K$ iterations) to estimate their $\kappa_c$-truncated averaged $Q$-function $T_{\kappa_c}^i    \overline{Q}_i^{\theta(m)}$, $i\in\sN$, and output weight vectors $\{w_i^m\}_{i\in\sN}$. \hfill $\vartriangleright$ Critic Update
        \FOR{$t=0,1,\cdots,T-1$}
        \STATE The agents use the joint policy $\xi^{\theta(m)}=(\xi_1^{\theta_1(m)},\xi_2^{\theta_2(m)},\cdots,\xi_n^{\theta_n(m)})$ to collect a sequence of samples $\{(s^t(k),a^t(k))\}_{0\leq k\leq H-1}$
        \STATE $\eta_i^t(m)=\sum_{k=0}^{H-1}\gamma^k \nabla_{\theta_i}\log \xi_i^{\theta_i(m)}(a_i^t(k)|s_i^t(k)) \phi_i(s_{\mathcal{N}_i^{\kappa_c}}^t(k),a_i^t(k))^\top  w_i^m  $
        \STATE $\Delta_i^{t+1}(m) =\frac{t }{t+1}\Delta_i^t(m)+\frac{1}{t+1}\eta_i^t(m)$
    \ENDFOR
    \STATE  $\theta_i(m+1)=\theta_i(m)+\beta \Delta_i^T(m)$\hfill $\vartriangleright$ Actor Update
    \ENDFOR
    \end{algorithmic}
\end{algorithm*}

The algorithm consists of three major steps. First, in Algorithm \ref{alg:approx_IPG} Line $3$, each agent calls localized TD$(\lambda)$ with linear function approximation for policy evaluation and outputs a weight vector $w_i^m$ for all $i\in\mathcal{N}$. Then, in Algorithm \ref{alg:approx_IPG} Lines $4$ -- $8$, each agent uses the averaged $Q$-function estimate to iteratively construct an estimate of the independent policy gradient. Specifically, since the independent policy gradient is an expected discounted sum of the averaged $Q$-functions (cf. Eq. (\ref{eq:PGT})), we essentially construct an estimator $\Delta_i^T(m)$ (cf. Algorithm \ref{alg:approx_IPG} Line $8$) of it by taking average of total $T$ samples $\{\eta_i^t(m)\}_{0\leq t\leq T-1}$ (cf. Algorithm \ref{alg:approx_IPG} Line $6$). Finally, in Algorithm \ref{alg:approx_IPG} Line $9$, using the estimated gradient, each agent implements an approximate version of the IPG algorithm presented in Algorithm \ref{alg:exact_IPG}.

Compared with Algorithm \ref{alg:exact_IPG}, Algorithm \ref{alg:approx_IPG} has the following strengths: (1) the algorithm is model-free, (2) due to the use of truncated $Q$-functions, each agent only requires information from its $\kappa_c$-hop neighborhood to carry out the algorithm, which eliminates long-distance communication along the network, and (3) the algorithm, to some extent, overcomes the curse of dimensionality thanks to the use of linear function approximation.



\section{Algorithm Analysis}\label{sec:analysis}

We next present the main results of the paper.  We formally state our assumptions in Section \ref{subsec:assumptions} and then present convergence bounds for Algorithms \ref{alg:exact_IPG}, \ref{alg:LPES}, and \ref{alg:approx_IPG} in Section \ref{subsec:theorems}.  A proof sketch of our main theorems is given in Section \ref{subsec:proof_idea}.

\subsection{Assumptions}\label{subsec:assumptions}
We make the following assumptions.
\begin{assumption}\label{assump:pot_exp_decay}
    There exists a decreasing function $\nu:\N\rightarrow \R^+$ such that:
    \begin{align}       &\abs{\Phi_i(\theta_{\nik},\theta_{-\nik}')-\Phi_i(\theta_{\nik},\theta_{-\nik})} \nonumber\\
    &\leq \nu(\kappa)\max_{j\in-\nik}\norm{\theta_{j}'-\theta_{j}},\;\forall\;\kappa\in \N,
    \end{align}
    where $\Phi_i(\theta)$ is the short-hand notation for $\Phi_i(\xi^\theta)$.
\end{assumption}

Assumption \ref{assump:pot_exp_decay} captures the idea that, for each agent, its potential function is less impacted by the agents far away, and can be viewed as a generalization of the decay property of the $Q$-functions in the existing literature to the networked MPG setting \citep{qu2019scalableMARL, lin2021multi, zhang2022global}. In the extreme case where $\kappa$ exceeds the diameter $ \max_{i,j}\text{dist}(i,j)$ of the network, we have $\nu(\kappa)=0$. Note that this assumption is automatically satisfied for our illustrative example in Section \ref{subsec:examples}, where changing the policy of an agent will only affect its direct neighbors. In Appendix F.5, we show that this assumption is also satisfied when each local potential function admits a stage-wise representation \citep{zhang2022logBarrierSoftmax}.


\begin{assumption}\label{assump:explore}
    It holds that $\inf_{\theta} \min_{s\in\sS} d^\theta(s)>0$, where we recall that $d^\theta$ is the discounted state visitation distribution under a softmax policy $\xi^\theta$.
\end{assumption}
Assumption \ref{assump:explore} states that every state can be visited with positive probability under any policy, which easily holds when the initial state distribution $\mu(\cdot)$ is supported on the entire state space. This  assumption is standard and has been used in, e.g.,  \cite{zhang2022logBarrierSoftmax,agarwal2021theory,pmlr-v119-mei20b}. Under Assumption \ref{assump:explore}, we define
$D=1/\inf_{\theta} \min_{s\in\sS}d^{\theta}(s)$, which is finite.

\begin{assumption}\label{assump:MC}
There exists a joint policy $\xi$ such that the Markov chain $\{s(t)\}$ induced by $\xi$ is uniformly ergodic.
\end{assumption}

Under Assumption \ref{assump:MC}, \cite[Lemma 4]{zhang2022global} implies a uniform exploration property for the Markov chain $\{(s(t),a(t))\}$ induced by any policy with entries bounded away from zero, which includes $\epsilon$-exploration policy. Therefore, for any $\hat{\xi}\in\Xi^\epsilon$, the Markov chain $\{(s(t),a(t))\}$ induced by $\hat{\xi}$ has a unique stationary distribution, denoted by $\overline{\pi}^{\hat{\xi}}\in\Delta(\sS\times\sA)$, which satisfies $\pi_{\min}:=\inf_{\hat \xi\in \Xi^{\epsilon}}\min_{i\in \sN} \min_{s_{\mathcal{N}_i^{\kappa_c}},a_i} \overline \pi^{\hat \xi} (s_{\mathcal{N}_i^{\kappa_c}},a_i)>0$. 

While Assumption \ref{assump:explore}, to some extent, already ensures uniform exploration of our policy class, we further impose Assumption \ref{assump:MC} to deal with the Markovian sampling in Algorithm \ref{alg:approx_IPG}. This type of assumption is standard in the existing literature even for the single-agent setting \citep{Srikant2019FiniteTimeEB,tsitsiklis1997analysis}.

\begin{assumption}\label{assump:lin_indep}
    For all $i\in \sN$, the feature mapping is normalized so that $\max_{i,s,a_i}\|\Tilde{\phi}_i(s,a_i)\|\leq 1$.
    In addition, the feature matrix $\Omega_i$ (the row vectors of which are $\{\Tilde{\phi}_i^\top (s,a_i)\}_{(s,a_i)\in\mathcal{S}\times \mathcal{A}_i}$) has linearly independent columns.
\end{assumption}
Assumption \ref{assump:lin_indep} is indeed without loss of generality because neither disregarding dependent features nor performing feature normalization changes the approximation power of the function class \citep{bertsekas1996neuro}. 

To state our last assumption, let $D^{\hat \xi}\in\mathbb{R}^{|\mathcal{S}||\mathcal{A}|\times |\mathcal{S}||\mathcal{A}|}$ be the diagonal matrix with diagonal entries $ \{\overline \pi^{\hat \xi}(s,a)\}_{(s,a)\in \sS\times \sA} $. Since $D^{\hat{\xi}}$ has strictly positive diagonal entries under Assumption \ref{assump:MC} and the feature matrix $\Omega_i$ has linearly independent columns for all $i$, we have $\underline{\lambda}:=\min_{i\in \sN} \inf_{\hat \xi\in \Xi^{\epsilon}}\lambda_{\min}(\Omega_i^\top D^{\hat{\xi}} \Omega_i)>0$, where $\lambda_{\min}(\cdot)$ returns the smallest eigenvalue of a positive definite matrix.
For any $i\in\sN$ and $\theta\in\mathbb{R}^{|\mathcal{S}||\mathcal{A}|}$, let $c_i(\theta):= \min_s \sum_{a_i^*\in{\argmax}_{a_i}\overline Q_i^{\theta}(s,a_i)} \xi_i^{\theta_i}(a_i^*|s_i)$.
\begin{assumption}\label{assump:c_inf}
$c:=\inf_{m\geq 0}\min_{1\leq i\leq n} c_i(\theta(m))>0$,  where $\{\theta(m)\}_{m\geq 0}$ are the policy parameters encountered along the algorithm trajectory (cf. Algorithm \ref{alg:approx_IPG}).
\end{assumption}
The inequality stated in Assumption \ref{assump:c_inf} is called a non-uniform Łojasiewicz inequality \citep{zhang2022logBarrierSoftmax,pmlr-v119-mei20b}, which is used to connect the NE-Gap with the gradient of the objective function through gradient domination. This assumption automatically holds in the existing literature when the policy gradient is exact \citep{zhang2022logBarrierSoftmax}. However, for Algorithm \ref{alg:approx_IPG}, due to the more challenging model-free setup and the presence of noise in sampling, $c$ is not necessarily strictly positive, which motivates Assumption \ref{assump:c_inf} as a means for analytical tractability. Further relaxing this assumption is our immediate future direction.
One approach for removing Assumption \ref{assump:c_inf} is to regularize the problem (e.g., using log-barrier regularization like in \cite{zhang2021gradientStochasticGame}), which prevents the policy generated by IPG from being deterministic, albeit at a cost of introducing an asymptotic bias due to regularization.

\subsection{Results}\label{subsec:theorems}
We are now ready to present our main results. We first present the averaged Nash-regret bound of the IPG algorithm (cf. Algorithm \ref{alg:exact_IPG}) as a warm-up, then we present the finite-sample bound of Algorithm \ref{alg:approx_IPG}, which involves a critic error. Finally, we present a concise bound of the critic estimation error when using our localized TD$(\lambda)$ with linear function approximation. Given an arbitrary integer $\kappa$, let $n(\kappa) := \max_{i\in \mathcal{N}} |\nik|$ be the size of the largest $\kappa$-hop neighborhood.
\begin{theorem}\label{thm:tab_softmax_Nash_regret}
    Consider $\{\theta_i(m)\}_{0\leq m\leq M-1}$ generated by Algorithm \ref{alg:exact_IPG}. 
    Suppose that Assumptions 
    \ref{assump:pot_exp_decay}, \ref{assump:explore}, and
    \ref{assump:c_inf} are satisfied, and the step size $\beta=\frac{(1-\gamma)^3}{6n(\kappa_G)}$. Then, 
    \begin{align}
        &\regret(M)\nonumber \\
        \leq\; & \mathcal{O}\Sp{\frac{D}{c}\sqrt{\frac{\max_{j\in\sN} |\sA_j|n(\kappa_G)(\Phi_{\max}-\Phi_{\min})}{(1-\gamma)^3 M}}}\nonumber \\
        & + \mathcal{O}\Sp{\frac{D\sqrt{\max_{j\in\sN} |\sA_j|\nu(\kappa_G)}}{c(1-\gamma)}} \label{eq:IPG_bound}.
    \end{align}
\end{theorem}
The first term on the right-hand side of Eq. (\ref{eq:IPG_bound}) goes to zero at a rate of $\mathcal{O}(M^{-1/2})$, which matches with the existing convergence rate of IPG for solving MPGs \citep{zhang2022logBarrierSoftmax}. Note that, unlike in existing results, the total number of agents $n$ does not appear in the bound. Instead, we have $n(\kappa_G)$, which captures the impact of network structure. The second term on the right-hand side of Eq. (\ref{eq:IPG_bound}) arises because of the relaxation from MPG to NMPG (see Definition \ref{def:local_MPG}), which decreases with $\kappa_G$, and vanishes when $\kappa_G\geq \max_{i,j}\text{dist}(i,j)$.  

We next move on to study Algorithm \ref{alg:approx_IPG}. 
\begin{theorem}\label{thm:main}
    Consider $\{\theta_i(m)\}_{0\leq m\leq M-1}$ generated by Algorithm \ref{alg:approx_IPG}. 
Suppose that Assumptions 
    \ref{assump:pot_exp_decay} -- \ref{assump:c_inf} are satisfied, and
    $\beta= \frac{(1-\gamma)^3}{24n(\kappa_G)}$. Then,  
    \begin{align}
        &\E\left[\regret(M)\right] \nonumber\\
        \leq & \frac{\sqrt{\max_{j\in\sN} |\sA_j|}D }{c}\bigg\{ \mathcal{O}\Sp{\frac{\sqrt{n(\kappa_G)(\Phi_{\max}-\Phi_{\min})}}{(1-\gamma)^{1.5}M^{1/4}}} \nonumber\\
        &+\!\mathcal{O}\Sp{\frac{\sqrt{\nu(\kappa_G)}}{1-\gamma}}\!+\!\mathcal{O}\Sp{\frac{\sqrt{n(\kappa_G)}[1+(1-\gamma)\epsilon_{\text{critic}}]}{(1-\gamma)^2M^{1/4}}} \nonumber\\
        &+\!\mathcal{O}\Sp{\frac{ \sqrt{n(\kappa_G)}\epsilon_{\text{critic}}^{1/2}}{(1-\gamma)^{1.5}}}\!+\!\mathcal{O}\Sp{\frac{ \sqrt{n(\kappa_G)}\gamma^{H/2}}{(1-\gamma)^2}}\bigg\},\label{eq:bound:appro}
    \end{align}
    where $\epsilon_{\text{critic}}$ stands for the critic estimation error in policy evaluation:
    \begin{align*}   \epsilon_{\text{critic}}=\sup_{\theta,i}\mathbb{E}^{1/2}\left[\sup_{s,a_i}\left|\overline Q_{i}^{\theta}(s,a_i)-\phi_i(s_{\mathcal{N}_i^{\kappa_c}},a_i)^\top  w_i^\theta\right|^2\right].
    \end{align*}
\end{theorem}
The first two terms on the right-hand side of Eq. (\ref{eq:bound:appro}) are analogous to the two terms on the right-hand side of the IPG error bound presented in Theorem \ref{thm:tab_softmax_Nash_regret}, with the second term being the localization error. The last three terms are approximation errors for the independent policy gradient, which (in the order in which they appear in the bound) consist of an error incurred by using a finite average (Algorithm \ref{alg:approx_IPG} Lines $4$ -- $8$) to approximate an expectation (cf. Eq. (\ref{eq:PGT})), a critic error, and an error incurred by using a finite sum (Algorithm \ref{alg:approx_IPG} Line $6$) to approximate an infinite sum (cf. Eq. (\ref{eq:PGT})).



To establish an overall sample complexity bound of Algorithm \ref{alg:approx_IPG}, we need to specify how the critic error decays as a function of the number of iterations in localized TD$(\lambda)$ with linear function approximation, which is presented in the following.  
\begin{theorem}\label{thm:critic_short}
Consider $\{w_i(K)\}_{i\in\mathcal{N}}$ generated by Algorithm \ref{alg:LPES}. Suppose that Assumption \ref{assump:MC} is satisfied. Then, with appropriately chosen step size $\alpha$ (see Appendix D for the explicit requirements) and large enough $K$, we have 
    \begin{align}
    \epsilon_{\text{critic}} 
    \leq \; & \mathcal{O}(1-(1-\gamma)\underline{\lambda}\alpha)^{\frac{K}{2}}
    +\mathcal{O}\left[\frac{\alpha \log(1/\alpha)}{(1-\gamma)\underline{\lambda}}\right]^{1/2} \nonumber\\ 
    &+\mathcal{O}\left(\frac{\epsilon_{\text{app}}}{\pi_{\min}(1-\gamma)}\right)
    + \mathcal{O}\left(\frac{\gamma^{\kappa_c-\kappa_r}}{1-\gamma}\right) \nonumber\\ &+\mathcal{O}\left(\frac{n\epsilon}{(1-\gamma)^2}\right),\label{eq:bound:critic}
    \end{align}
    where 
    $\epsilon_{\text{app}}$ stands for the function approximation error. See Appendix D for the explicit definition.
\end{theorem}
The first two terms on the right-hand side of Eq. (\ref{eq:bound:critic}) represent the convergence bias (which has geometric convergence rate) and the variance (which decreases with the step size $\alpha$), and their behaviors agree with existing results on stochastic approximation \citep{Srikant2019FiniteTimeEB,chen2019nonlinearSA}. The third term arises from using linear function approximation and vanishes in the tabular setting where we use a complete basis. The fourth term represents the error between the averaged $Q$-function and the $\kappa_c$-truncated averaged $Q$-function, which is introduced to overcome the scalability issue when the number of agents increases. Note that the fourth term decays exponentially with the choice of $\kappa_c$, and vanishes when $\kappa_c$ is greater than the diameter (i.e., $\max_{i,j}\text{dist}(i,j)$) of the network. The last term arises because of using $\epsilon$-exploration behavior policies to ensure sufficient exploration.
 

Combining Theorem \ref{thm:main} and Theorem \ref{thm:critic_short} leads to the following sample complexity bound. 

\begin{corollary}\label{co:sample_complexity}
To achieve $\mathbb{E}[\regret(M)]\leq \tilde{\epsilon}+\mathcal{E}_{\text{EX}}+\mathcal{E}_{\text{FA}}+\mathcal{E}_{\text{LO}}$,
the sample complexity is $\tilde{\mathcal{O}}(\tilde{\epsilon}^{-4})$, where $\mathcal{E}_{\text{EX}}$ stands for the induced error from exploration (cf. the last term on the right-hand side of Eq. (\ref{eq:bound:critic})), $\mathcal{E}_{\text{FA}}$ stands for the function approximation error (cf. the third term on the right-hand side of Eq. (\ref{eq:bound:critic})), and $\mathcal{E}_{\text{LO}}$ stands for the induced error from localization (cf. the summation of the second last term on the right-hand side of Eq. (\ref{eq:bound:critic}) and the second term on the right-hand side of Eq. (\ref{eq:bound:appro})).
\end{corollary}

In Corollary \ref{co:sample_complexity}, the presence of $\mathcal{E}_{\text{EX}}+\mathcal{E}_{\text{FA}}+\mathcal{E}_{\text{LO}}$ is due to the fundamental limits of the problem, such as the approximation power of the function class, using truncated averaged $Q$-functions to approximate global averaged $Q$-functions, and using ``soft'' policies to ensure exploration. 


In single-agent RL, popular algorithms such as $Q$-learning and natural actor-critic are known to achieve $\tilde{\mathcal{O}}(\tilde{\epsilon}^{-2})$ sample complexity \citep{qu2020finite,lan2022policy}. While we study the more challenging setting of using localized algorithms to solve MARL problems, it is an interesting direction to investigate whether there is a fundamental gap.
In addition, while Localized Actor-Critic (cf. Algorithm \ref{alg:approx_IPG}) is an independent learning algorithm, our theoretical results require all agents to follow the same learning dynamics, which suggests some implicit coordination among the agents. Although this is common in the existing literature \citep{leo2021convergeMPG, pmlr-v162-ding22b, zhang2022logBarrierSoftmax}, developing completely independent learning dynamics is an interesting future direction.

\subsection{Proof Sketch}\label{subsec:proof_idea}
\textbf{Analysis of the Actor.}
At a high level, we use a Lyapunov approach to analyze the policy update, where the potential function is a natural choice of the Lyapunov function. The key is to bound $\Phi_i(\theta(m+1))-\Phi_i(\theta(m))$, $i\in\mathcal{N}$, in each iteration using the gradient of objective function $J_i(\cdot)$, which is related to NE-Gap of agent $i$ through the non-uniform Łojasiewicz inequality \citep{zhang2022logBarrierSoftmax,pmlr-v119-mei20b}. To exploit the network structure and to remove the raw dependence on the total number of agents in the NMPG setting, instead of directly bounding $\Phi_i(\theta(m+1))-\Phi_i(\theta(m))$, we perform the following decomposition:
\begin{align*} 
&\Phi_i(\theta(m+1))-\Phi_i(\theta(m)) \\
=&\underbrace{\left[ \Phi_i(\theta_{\nikg}(m+1),\theta_{-\nikg}(m)) - \Phi_i(\theta(m))\right]}_{(a)} \\ 
&+ \underbrace{\left[\Phi_i(\theta(m+1))-\Phi_i(\theta_{\nikg}(m+1),\theta_{-\nikg}(m))\right]}_{(b)}.
\end{align*}
The term $(a)$ captures the policy change of the agents inside the $\kappa_G$-hop neighborhood of agent $i$, and the first step of bounding it is to use the smoothness property of the potential function, which is similar to that of \cite{zhang2022logBarrierSoftmax}. However, unlike existing analysis of IPG, we also need to bound the error in approximating the gradient, which can be decomposed into three error terms:
\begin{enumerate}
    \item[$e_1$:] error due to estimating the averaged $Q$-function, which is exactly  the critic error;
    \item[$e_2$:] error due to the  randomness in the trajectory sampling (see Algorithm \ref{alg:approx_IPG} Lines $4$ -- $8$), which has zero mean;
    \item[$e_3$:] error resulting from truncating the sample trajectory at horizon $H$ (see Algorithm \ref{alg:approx_IPG} Line $6$), which decays exponentially with $H$.
\end{enumerate}

Term $(b)$ results from the policy change of agents outside the $\kappa_G$-hop neighborhood of agent $i$, and is a decreasing function of $\kappa_G$ (cf. Assumption \ref{assump:pot_exp_decay}). 

\textbf{Analysis of the Critic.}
The critic is designed to perform policy evaluation of a softmax policy $\xi^\theta$ using localized TD$(\lambda)$ with linear function approximation. Similar to \cite{chen2019nonlinearSA,Srikant2019FiniteTimeEB}, we formulate localized TD($\lambda$) as a stochastic approximation algorithm and again use a Lyapunov approach to establish the finite-sample bound of the difference between $w_i(K)$ and $w_i^\theta$, where $w_i^\theta$ is the solution to a properly defined projected Bellman equation associated with agent $i$.

The challenge lies in bounding the difference between the $Q$-function associated with the weight vector $w_i^\theta$ (denoted by $Q(w_i^\theta)$) and the true averaged $Q$-function $\overline{Q}_i^\theta$ of policy $\xi^{\theta}$, which we decompose into a function approximation error, an error due to using $\epsilon$-exploration policy, and an error due to truncating the averaged $Q$-function at its $\kappa_c$-hop neighborhood, and bound them separately. 
To achieve that, we develop a novel approach 
involving the construction of a ``sub-chain'', which is an auxiliary Markov chain with state space $\sS_{\mathcal{N}_i^{\kappa_c}}\times \sA_i$. 
See Appendix D for more details. 


\section{Conclusion}
We study MARL in the context of MPGs and introduce a networked structure that allows agents to learn equilibria using local information.  In particular, we develop a localized actor-critic framework for minimizing the averaged Nash regret of NMPGs. Importantly, the algorithm is scalable and uses function approximation. We provide finite-sample convergence bounds to theoretically support our proposed algorithm and conduct numerical simulations to demonstrate its empirical effectiveness. 

An immediate future direction is to investigate whether there is a fundamental gap in the convergence rates between localized MARL algorithms and single-agent RL algorithms.  It is also interesting to see if localized algorithms (with provable guarantees) can be designed to solve other classes of games beyond NMPGs.

\begin{acknowledgements}
This work is supported by NSF Grants CNS-2146814, CPS-2136197, CNS-2106403, NGSDI-2105648, with additional support from Amazon AWS. Yiheng Lin was supported by PIMCO graduate fellowship in Data Science and Amazon AI4Science fellowship. Zaiwei Chen was supported by PIMCO postdoctoral fellowship in Data Science and the Simoudis Discovery Prize.
\end{acknowledgements}



\bibliography{zhou_290}

\end{document}