
\section{Preliminaries}
\label{sec:preliminaries}
%
%The terms Markov games and Markov games are interchangeable. We use both of them in this paper.

%{\par \textbf{Markov games }}
We consider an episodic Markov game (MG) of the form $\M=(\S,\A,H,\P,r)$, with state space $\S$; action space $\A=\A_1\times\dots\times\A_n$, with $\A_i$ being the action space of agent $i\in[n]$; number of steps per episode (or episode length) $H$, where we assume the non-bandit case $H\geq 2$; and transition probability measures $\P = \{\P_h\}_{h\in[H]}$, where $\P_h(\cdot\mid x,a)$ denotes the transition kernel over the state at step $h+1$ if all players take the action profile $a\in\A$ in state $x\in\S$ at step $h$. % at step $h\in[H]$. 
%
We denote agent $i$'s reward function profile by $r_i = \{ r^i_h\}_{h=1}^H$ with $r^i_h: \cS \times \A \to [0,1]$.\footnote{We assume deterministic rewards for simplicity.} 
%We define the reward function profile $r=(r_1,\dots,r_n)$.
For any agent $i\in[n]$, its action taken at step $h$ is denoted by $a_{i,h}\in\A_i$, 
%transition probability $\P:\S\times\A\to\S$ with $\P(s'|s,a) = \P(s'|s,a_1,\dots,a_n)$ being the probability of transitioning to the state $s'$ given the state $s$ and the actions $a$ taken by the agents, reward function $r=(r_1,\dots,r_n)$, $r_i:\S\times\A\to[0,1]$.
%
and we let %its action profile by 
$a_i = \{ a_{i,h}\}_{h=1}^H$. 
%We define an action profile as $a=(a_1,\dots,\pi_H)\in\A$.
%
We assume every agent has a finite action space, while the state space can be arbitrarily large or even continuous. 
% We assume $r_i(s,a)\in[0,1]$ for every $i, a\in\A, s\in\S$. 
%Given an MG $\M$ and a fixed state $s\in\S$, we define the associated \emph{stage game} $\M_s=(\A,r(s,\cdot))$; i.e., a game of $n$ players with action space $\A$ and reward functions $r(s,\cdot):=(r_1(s,\cdot),\dots,r_n(s,\cdot))$.

We denote agent $i$'s policy by $\pi_i = \{ \pi_{i,h}\}_{h=1}^H$ with $\pi_{i,h}: \cS \to \Delta(\A_i)$. 
%, and define $\pi_{-i}=\{\pi_j\}_{j\in[n]\setminus\{i\}}$. 
%
With some abuse of notation, we also let $\pi_h:\S\to\Delta(\A)$ denote the (joint) policy taken by the agents over the joint action space at time step $h\in[H]$ -- whether the subindex $k$ in $\pi_k$ refers to an agent or to a time step will be clear from the context.
%; such policy can be \emph{factorized} whenever, given a whether or not the measure $\pi_h$ can be factorized (i.e., $\pi_h(\A)=\pi_{1,h}(\A_1)\times\dots\times\pi_{n,h}(\A_n)$).} 
Let $\pi$ be the joint policy of all agents. 
%to denote the set of all the agents' policies 
%%define 
%$\pi=(\pi_1,\dots,\pi_n)$. 
We say $\pi$ is a product policy (across agents) $\pi=\pi_1\times\dots\times\pi_n$ when, conditioned on the same state, the action of every agent can be sampled independently according to their own policy, i.e., $\pi_h(x)\in \Delta(\A_1)\times\dots\times\Delta(\A_n)$ for every $x\in\S$, $h\in[H]$.
%
%We define the policy $\pi=(\pi_1,\dots,\pi_n)$. 
For agent $i$, we define her value function $V_h^{i,\pi}: \S \to \R$ at the $h$-th step as $V_h^{i,\pi}(x) = \E_{\pi}[\sum_{h'=h}^H r_{h'}^i (s_{h'}, a_{h'})  \given s_h = x]$ and her Q-function or action-value function $Q_h^{i,\pi}: \S \times \A \to \R$ as $Q_h^{i,\pi}(x,a):=\E_{\pi}[ \sum_{h'=h}^H r_{h'}^i(s_{h'}, a_{h'})  \given s_h = x, a_h = a]$, where the expectation $\mathbb{E}_{\pi}$ is taken with respect to both the randomness in the transitions $\P$ and the randomness inherent in the policy $\pi$. If agent $i$ follows policy $\nu$ and the rest of the agents follow the joint policy $\pi_{-i}$, we denote agent $i$'s associated value function at step $h$ by $V_h^{i,\nu,\pi_{-i}}$, i.e., by placing a superscript with $i$'s policy before the (joint) policy of the rest of the agents; consequently,  $V_h^{i,\pi_i,\pi_{-i}}=V_h^{i,\pi}$. 

%When the given reward function is understood from the context, we will omit it from the notation of the value and action-value functions.
%
%The Nash equilibrium (NE) is defined as the policy profile $(\pi_1^*,\dots,\pi_n^*)$ such that 
%

We now define our solution concept of interest.

\begin{definition}[Nash equilibrium for Markov games]
\label{def:NashEq-MG}
Given an initial state $s_o\in\cS$, a product policy profile $\pi^*$ is called a \emph{Nash equilibrium} (NE) if $V^{i,\pi^*_i,\pi^*_{-i}}_1(s_o)\geq V^{i,\pi_i,\pi^*_{-i}}_1(s_o)$ for any $i\in[n]$ and any policy $\pi_i$, and it is called an \emph{$\epsilon$-NE} if $V^{i,\pi_i,\pi^*_{-i}}_1(s_o)-V^{i,\pi^*_i,\pi^*_{-i}}_1(s_o)\leq\epsilon$ for any $i\in[n]$ and any policy $\pi_i$. 
\end{definition}
%
%For simplicity, we let $V_h^{\dagger}(x; r) = V_h^{\pi^\dag, \nu^\dag}(x; r)$ and $Q_h^{\dagger}(x,a,b; r) = Q_h^{\pi^\dag, \nu^\dag}(x,a,b; r)$ denote the value function and Q-function under the NE $(\pi^\dag, \nu^\dag)$ at step $h\in[H]$. 
%Given a state $x\in\S$, the best response against Player 1 with policy $\pi$ is defined as $\bre_2(\pi):=\argmin_\nu V_1^{\pi, \nu}(x; r)$ and the one  against Player 2 with policy $\nu$ is defined as $\bre_1(\nu):=\argmax_\pi V_1^{\pi, \nu}(x; r)$. Note that $V_1^{\bre_1(\tilde{\nu}), \tilde{\nu}}(x; r) \geq V_1^{\dag}(x; r)  \geq V_1^{\tilde{\pi}, \bre_2(\tilde{\pi})}(x; r)$ holds for any policies $\tilde{\pi}$ and $\tilde{\nu}$ and $x\in\S$. We also introduce the notation $V^*_1(s; r) = \max_{\pi, \nu} V^{\pi, \nu}_1(s; r)$ and the associated optimal value function and optimal Q-function for the $h$-th step as $V^*_h(x; r)$ and $Q^*_h(x, a, b; r)$.
%
%\pccomment{
% I am considering an infinite-horizon MDP's which is amenable to policy learning (e.g., as in~\cite{RZ-ZR-NL:21}); however, we could also consider instead an episodic MDP, which is amenable to other type of stage-wise analysis on the Q- or V- functions (e.g, as in~\cite{WM-RS:22}) (in which case we usually assume a fixed initial state $s_o$).}
%
%We now define the problem we study in this paper more precisely.
%
%\begin{definition}[Learning problem]
%We study the following problem: develop a learning algorithm that provides the players policies (restricted to some history of past information) across time,
%
%%according to some stated history that if players follow independently by just observing the realization of their own stage reward functions $r_i$ \pcedit{(and past history of cumulative actions?)}, then their 
%
%so that such policies converge to an stationary $\epsilon$-Nash equilibrium in time that is polynomial in parameters $\epsilon, |S|, |A_i|, i\in [n]$. 
%\end{definition}

%Given a state $x\in\S$, we say agent $i\in[n]$ plays its \emph{best response} against a fixed policy profile $\pi_{-i}$ of the rest of the agents if its policy is defined by $\bre_i(\pi_{-i}):=\argmax_{\nu\in(\Delta(\A_i))^H} V_1^{i,\nu, \pi_{-i}}(x; r)$. Note that we can easily characterize a Nash equilibrium (Definition~\ref{def:NashEq-MG}) using best-responses.
We say agent $i\in[n]$ plays a \emph{best response} policy against the policy profile $\pi_{-i}$ of the rest of the agents if it follows a policy $\bre_i(\pi_{-i})$ satisfying $\bre_i(\pi_{-i})\in\argmax_{\nu} V_h^{i,\nu, \pi_{-i}}(x)$ for any $(x,h)\in\S\times[H]$. Note that we can easily characterize a Nash equilibrium (Definition~\ref{def:NashEq-MG}) using best responses.

For any function $f: \mathcal{S} \to \mathbb{R}$, we define the transition operator as $(\mathbb{P}_{h}f)(x, a) =\E_{x'\sim\P_h(\cdot|x,a)}[f(x')]$. 
%and the Bellman operator as $(\mathbb{B}_{h}f)(x, a) = r_{h} +  (\mathbb{P}_{h}f)(x, a)$ for each step $h \in [H]$. 
For any $i\in[n]$, the Bellman equation associated with a policy $\pi$ is: $Q^{i,\pi}_h(x,a)=(r_h^i+\Pe_h V^{i,\pi}_{h+1})(x,a)$, $V^{i,\pi}_h(x)= \E_{a\sim \pi_h(x)}[Q^{i,\pi}_h(x,a)]$, with $V^{i,\pi}_{H+1}(x)=0$, for any $(x,a)\in\S\times\A$ and $h\in[H]$.
%Let $\pi^* = \argmax_{\pi} V^\pi_1(x)$, $V^*_h=V^{\pi^*}_h(x)$, and $Q^*_h(x, a) = Q^{\pi^*}_h(x, a)$.

In this paper, we consider linear MGs.

{\par \textbf{Linear (function approximation in) Markov Games.}} Under a linear MG setting, there exists a known feature map $\phi: \mathcal{S} \times \mathcal{A} \to \mathbb{R}^d$ such that for every $h\in[H]$, there exist $d$ unknown (signed) measures $\mu_{h} = \left(\mu_{h}^{(1)}, \dots, \mu_{h}^{(d)}\right)$ over $\mathcal{S}$ and an unknown vector $\theta_{h} \in \mathbb{R}^d$ such that $\P_{h}(x'| x, a) = \langle \phi(x, a), \mu_{h}(x')\rangle$, $r_{h}(x, a) = \langle \phi(x, a), \theta_{h}\rangle$ for all $(x, a, x') \in \mathcal{S} \times \mathcal{A} \times \mathcal{S}$. We assume the non-scalar case with $d\geq 2$ and that the feature map satisfies $\|\phi(x, a) \|\leq 1$ for all $(x, a) \in \mathcal{S} \times \A$ and $\max\{\|\mu_{h}(\S)\|, \|\theta_{h}\|\} \leq \sqrt{d}$ at each step $h \in [H]$, where (with an abuse of notation) $\|\mu_{h}(\mathcal{S})\| = \int_{\mathcal{S}}\|\mu_{h}(x)\|dx$. Note that the transition kernel $\P_h(\cdot|x,a)$ may have infinite degrees of freedom since the measure $\mu_h$ is unknown.

%======

{\par \textbf{Performance Metric.}} We consider that all agents are learning during a total of $K$ episodes, starting %. We assume that all agents start 
at some initial state $s_o \in \cS$ at the beginning of each episode. 
%Given an agent $p\in[P]$ at a step $h\in[H]$ of an episode $k\in[K]$, let $x_h^{k,p}$ be the state, $a_h^{k,p}$ be the action taken according to some policy $\pi_h^{k,p}$, and $r_h^{k,p}:=r_h(x_h^{k,p},a_h^{k,p})$ be the reward obtained. Let $\pi^{k,p}:=\{\pi^{k,p}_h\}_{h=1}^H$. 
%
%
%Let $\pi^*$ be some Nash equilibrium of the underlying Markov game. 
For a set of policies $\{\pi^{k}\}_{k\in[K]}$ provided by an online MARL algorithm, we use the following regret performance metric:  %performance is measured by 
%\begin{equation}
%    \label{eq:regret_nash}
%    \textnormal{Regret}(K) = \sum_{k=1}^K\sum_{i=1}^n (V_1^{i,\pi^*}(s_o) - V_1^{i,\pi_i^k,\pi^*_{-i}}(s_o)).
%\end{equation}
\begin{equation}
    \label{eq:regret_nash}
    \textnormal{Regret}(K) = \sum_{k=1}^K\max_{i\in[n]} (V_1^{i,\bre_i(\pi^k_{-i}),\pi^k_{-i}}(s_o) - V_1^{i,\pi^k}(s_o)).
\end{equation}
The idea behind such regret is that, at episode $k\in[K]$, $\max_{i\in[n]} (V_1^{i,\bre_i(\pi^k_{-i}),\pi^k_{-i}}(s_o) - V_1^{i,\pi^k}(s_o))=0$ if and only if the (product) policy $\pi^k$ is a Nash equilibrium for the Markov game.
%$\textnormal{Regret}(K) = 0$ if and only if $\pi^k$ is a Nash equilibrium for all $k\in[K]$.

{\par \textbf{Static games.}} We also consider that the $n$ agents can play a static game, keeping their respective action spaces. Given that each agent has an associated reward function $g_i:\A\to\R$ in a static game, the game is defined by the tuple  $(g_1,\dots,g_n)$. 
Given $a\in\A$, we define $a_{-i}$ as the respective element of $\A_{-i}=\A_1\times\dots\times\A_{i-1}\times\A_{i+1}\times\dots\times \A_n$. 
%
%
%For this static game
%%also define a (pure) strategy profile of all agents as $a^*=(a_1^*,\dots,a_n^*)$ with $a^*_i\in\A_i$ being the (pure) strategy of agent $i$. 
%we 
%define a strategy profile as $\nu^*\in\Delta(\A)$ with $\nu^*_i\in\Delta(\A_i)$ being the strategy of agent $i$. 
%%Given an agent $i\in[n]$, we use the notation $\nu_{-i}$ to denote the strategy profile of all the rest of the agents and let $\A_{-i}=\A_1\times\dots \A_{i-1}\times\A_{i+1}\times\dots\times \A_n$, and thus $\nu_{-i}\in\Delta(\A_{-i})$. 
%
%For a static game we consider $\nu\in\Delta(\A_i)$ to be a strategy profile, and we say it is a product one when 
%
We consider a tuple $\nu=(\nu_1,\dots,\nu_n)$ with $\nu_i\in\Delta(\A_i)$, $i\in[n]$, to be a strategy profile, and let $\nu_{-i}$ be the tuple $\nu$ without its $i$th element. In this work, we consider strategy profiles $\nu\in\Delta(\A)$ as product measures $\nu(a)=\prod^n_{i=1}\nu_i(a_i)$; a similar consideration follows for $\nu_{-i}\in\Delta(\A_{-i})$, $i\in[n]$. A strategy profile $\nu^*$ is a Nash equilibrium if $\nu^*_i\in\argmax_{\nu\in\Delta(\A_i)}\E_{\substack{a_i\sim\nu\\a_{-i}\sim\nu^*_{-i}}}[g_i(a)]$ for every $i\in[n]$.
%$\E_{a_i\sim\nu^*_i,i\in[n]}[g_{i}(a)]\geq \E_{\substack{a_i\sim\nu_i\\a_{-i}\sim\nu^*_{-i}}}[g_{i}(a)]$ for any $i\in[n]$ and any policy $\nu_i\in\Delta(\A_i)$. 
%We are interested in two special classes of strategy profiles:
%Nash equilibria for static games: \emph{global optimal} and \emph{saddle} equilibria.

%\begin{definition}[Global optimal and saddle Nash equilibria~\citep{Hu2003NashQ}]
%\label{def:global-saddle}
%A strategy profile $a^*=(a_1^*,\dots,a_n^*)$ of the static game $(g_1,\dots,g_n)$ is:
%\begin{enumerate}[label=(\roman*)]
%    \item a \emph{global optimal} (Nash) equilibrium if $g_{i}(a^*)\geq g_{i}(a)$ for any $a\in\A$; and 
%    \item a \emph{saddle} Nash equilibrium if $g_{i}(a^*)\geq g_{i}(a_i,a^*_{-i})$ for any $i\in[n]$ and any policy $a_i\in\A_i$, and $g_{i}(a^*)\leq g_{i}(a^*_i,a_{-i})$ for any $a_{-i}\in\A_{-i}$.
%\end{enumerate}
%%$\nu^*_i\in\Delta(\A_i)$, if $\E_{a_i\sim\nu^*_i,i\in[n]}[g_{i}(a)]\geq \E_{\substack{a_i\sim\nu_i\\a_{-i}\sim\nu^*_{-i}}}[g_{i}(a)]$    % 
%\end{definition}

\begin{definition}[Global optimal and saddle Nash equilibria~\citep{Hu2003NashQ}]
\label{def:global-saddle}
A strategy profile $\nu^*$ of the static game $(g_1,\dots,g_n)$ is:
\begin{enumerate}[label=(\roman*)]
    \item a \emph{global optimal} (Nash) equilibrium if $\E_{a\sim\nu^*}[g_{i}(a)]\geq \E_{a\sim\nu}[g_{i}(a)]$ for any $i\in[n]$ and any strategy profile $\nu\in\Delta(\A)$; and 
    \item a \emph{saddle} Nash equilibrium if $\E_{a\sim\nu^*}[g_{i}(a)]\geq \E_{\substack{a_i\sim\nu_i\\a_{-i}\sim\nu^*_{-i}}}[g_{i}(a)]$ for any $i\in[n]$ and any $\nu_i\in\Delta(\A_i)$, and $\E_{a\sim\nu^*}[g_{i}(a)]\leq \E_{\substack{a_i\sim\nu^*_i\\a_{-i}\sim\nu_{-i}}}[g_{i}(a)]$ for any strategy profile $\nu_{-i}\in\Delta(\A_{-i})$.
\end{enumerate}
%$\nu^*_i\in\Delta(\A_i)$, if $\E_{a_i\sim\nu^*_i,i\in[n]}[g_{i}(a)]\geq \E_{\substack{a_i\sim\nu_i\\a_{-i}\sim\nu^*_{-i}}}[g_{i}(a)]$     
\end{definition}

%It is called an \emph{$\epsilon$-NE} if $V^{i,\pi^*_i,\pi^*_{-i}}(s_o)\geq V^{i,\pi_i,\pi^*_{-i}}(s_o)-\epsilon$. 

%==================================================
%==================================================
