\section{Neuro-symbolic CSGs}\label{sec:nscsgs}

% \startpara{Games and equilibria}
% \dave{Like LICS, $i$ or $Ag_i$ for an agent?}
% A normal-form game $\mathsf{N}$ is a tuple $(N, A, u)$, where $N=\{1,\dots,n$\} is a set of agents, $A= A_1 \times \cdots \times A_n$ where $A_i$ is a set of actions for $\agent_i$, and $u=(u_1,\dots,u_n)$ where $u_i:A \to \mathbb{R}$ is a utility function for $\agent_i$. For any finite set $X$, we will write $\mathbb{P}(X)$ for the set of probability distributions over $X$. For a normal-form game $\mathsf{N}$, a strategy $\mu_i$ for $\agent_i$ is a distribution over its action set, i.e., $\mu_i \in \mathbb{P}(A_i)$. We denote by $\mu=\mu_{-i}[\mu_i]=(\mu_1,\dots,\mu_n)$ $(i\in N)$ the strategy profile, where $\mu_{-i}$ refers to the strategy profile except $\mu_i$. Given a strategy profile $\mu$, the expected value of $u_i$ is computed by $\mathbb{E}^{\mu}[u_i]=\sum_{(a_1,\dots,a_n) \in A} u_i(a_1,\dots,a_n)\prod_{j=1}^n \mu_j(a_j)$. A strategy profile $\mu^*=(\mu_1^*,\dots,\mu_n^*)$ is a Nash equilibrium (NE) if, $\mathbb{E}^{\mu^*}[u_i]\ge\mathbb{E}^{\mu_{-i}^*[\mu_i]}[u_i]$ for all $\mu_i\in \mathbb{P}(A_i)$ and all $i\in N$. An NE $\mu^*$ is a social welfare Nash equilibrium (SWNE) if $\mathbb{E}^{\mu^*}[\sum_{i=1}^n u_i]\ge\mathbb{E}^{\mu}[\sum_{i=1}^n u_i]$ for all NEs $\mu$ of $\mathsf{N}$. 
% \marta{define correlated profile here}

We begin by describing \emph{neuro-symbolic concurrent stochastic games (NS-CSGs)}~\citep{RY-GS-GN-DP-MK:22},
the modelling formalism that we use in this paper, for which we then define our notions of equilibria.

An NS-CSG comprises a number of interacting neuro-symbolic agents acting in a shared environment. 
% We consider $E$ to be a special agent, and will thus focus on (extensive-form) concurrent games over the set $N\cup \{E\}$ of agents. %Our model can be viewed as concurrent, stochastic game extension of \cite{MEA-EB-PK-AL:20,MEA-EB-PK-AL:20-2}\marta{check}. 
Each agent has finitely many local states and actions, and is additionally endowed with a perception mechanism implemented as a neural network (NN), 
through which it can observe the state of the environment, storing the observations locally in \emph{percepts}.
For the purposes of this paper it suffices to assume that an NN is a function $f:\mathbb{R}^{m_1}\to\mathbb{R}^{m_2}$
between finite-dimensional real vector spaces.
%The agents execute concurrently, transitioning between their local states probabilistically via joint actions. 
Formally, an NS-CSG is defined as follows.

\begin{defi}[NS-CSG]\label{defi:NS-CSG}
A neuro-symbolic concurrent stochastic game (NS-CSG)\ $\csg$ comprises
agents $(\agent_i)_{i \in N}$, for $N=\{1,\dots,n\}$, and an environment $E$ where:
\[
\agent_i  = (S_i,A_i,\Delta_i,\obs_i,\delta_i) \; \mbox{for $i \in N$}, \; \;
E = (S_E,\delta_E)
\]
and we have:
\begin{itemize}
    \item $S_i = \Loc_i\times \Per_i$ is a set of states for $\agent_i$, where $\Loc_i \subseteq \mathbb{R}^{b_i}$ and $\Per_i \subseteq \mathbb{R}^{d_i}$ are finite sets of local states and percepts,
    respectively;
    
    \item $S_E\subseteq \mathbb{R}^e$ is a finite or infinite set of environment states;
  
    \item $A_i$ is a nonempty finite action set for $\agent_i$,
    and $A \coloneqq (A_1 \cup \{ \bot \}) \times \cdots \times (A_n \cup \{ \bot \})$ is the set of \emph{joint actions},
    where $\bot$ is an idle action disjoint from $\cup_{i=1}^n A_i$;

    \item $\Delta_i: S_i \to 2^{A_i}$ is an available action function, defining the actions $\agent_i$ can take in each state;

    \item $\obs_i : (S_1 \times \cdots \times S_n \times S_E)\to \Per_i$ is an observation function for $\agent_i$, mapping  the state of all agents and the environment to a percept of the agent, implemented via an NN classifier;
    
    \item $\delta_i:S_i \times A \to \mathbb{P}(\Loc_i)$ is a probabilistic transition function for $\agent_i$,
    where $\mathbb{P}(X)$ denotes the set of probability distributions over a set $X$,
    determining the distribution over the agent's next local state given its current state and the joint action;
    
    \item $\delta_E:S_E \times A \to S_E$ is a deterministic environment transition function determining the environment's next state given its current state and joint action.
\end{itemize}
\end{defi}

% \begin{defi}[Neuro-symbolic agent]
% A neuro-symbolic agent, or agent, is a tuple $i=(S_i,A_i,\Delta_i,obs_i,\delta_i)$, where
% \begin{itemize}
%     \item $S_i=\loc_i\times \per_i\subseteq \mathbb{R}^{b_i}\times\mathbb{R}^{d_i}$ is a nonempty finite set of local states $(\loc_i,\per_i)$, comprising a private state $\loc_i\in \loc_i$ and a percept $\per_i\in \per_i$ storing the state of the environment observed by the agent;
    
%     \item $A_i$ is a nonempty finite set of actions for the agent;
    
%     \item $\Delta_i:S_i \to 2^{A_i} \setminus \{\varnothing\}$ is an available action function defining the actions the agent can take in a local state;
    
%     \item $obs_i:S_i\times S_E\to \per_i$ is an observation function mapping a pair of a local state $s_i\in S_i$ and an environment state $s_E\in S_E\subseteq\mathbb{R}^m$ to a percept $\per_i$ and is implemented via an NN $f_i:\mathbb{R}^{b_i+d_i+m}\to\mathbb{R}^{d_i}$;
    
%     \item $\delta_i:S_i\times (A_1\cup\{\perp\})\times\cdots\times (A_n\cup\{\perp\})\times (A_E\cup\{\perp\})\to Dist(\loc_i)$ is a probabilistic local transition function determining the distribution over the next private state given a local state and a joint action, where $\perp$ is an idle action disjoint from $\cup_{i=1}^nA_i\cup A_E$ and $A_E$ is the action set of the environment (defined later). 
% \end{itemize}
% \end{defi}

% We slightly abuse notation and write $\delta_i(s_i,\alpha,\loc_i)$ instead of $\delta_i(s_i,\alpha)(\loc_i)$

% \ruitodo{If we prefer to consider the neuro-symbolic agent with continuous private state and probabilistic local transition function, then one way as \cite{MEA-EB-PK-AL:20-2} did is to assume that the probabilistic local transition function determines a finite set of possible next privates given its current local state and the joint action.}

% Each agent has no access to the local states of other agents. 

% Formally, the semantics of an NS-CSG is an infinite state CSG,
% as described in~\citep{RY-GS-GN-DP-MK:22}.
% For brevity, we present here an informal description of the semantics.

Each (global) state $s$ of NS-CSG $\csg$
comprises the state $s_i=(\loc_i,\per_i)\in S_i$ of each agent $\agent_i$
and the state $s_E\in S_E$ of the environment.
Starting from some initial state, the game evolves as follows.
%
First, each agent $\agent_i$ observes the state of the agents and the environment to generate a new percept $\per_i'$ according to its observation function $\obs_i$ implemented via an NN.
Then, each agent $\agent_i$ synchronously chooses one of the actions from the set $\Delta_i(s_i)$
of actions available in its state $s_i$, or idles (action $\bot$) if no action is available. This results in a joint action $\alpha = (a_1, \dots, a_n) \in A$.
Each agent $\agent_i$ then updates its local state to $\loc_i'\in \Loc_i$ according to the probabilistic local transition function $\delta_i$, applied to the agent's state $(\loc_i, \per_i')$ and joint action $\alpha$. The environment updates the environment state to $s_E'\in S_E$ according to the environment transition function $\delta_E$, applied to its state $s_E$ and joint action $\alpha$. Thus, the game reaches the state $s'=(s_1',\dots, s_n', s_E')$, where $s_i' = (\loc_i', \per_i')$ for $i \in N$. For simplicity, we consider here deterministic environments, but the results can be directly extended to discrete probabilistic environments with finite branching. 

For brevity, we omit the formal semantics of an NS-CSG,
which can be found in~\citep{RY-GS-GN-DP-MK:22}.
Note that, in this paper, we consider a slight variant of that semantics,
which differs in the point at which observations are made during each transition.

%
NS-CSGs form a subclass of continuous-state CSGs in which the transition function has a particular structure:
agent and environment states are distinguished,
and an NN-based observation function is used
to group together environment states that share the same characteristics.
This provides a trade-off between exploiting the full generality of a continuous-state CSG model
and allowing for tractable computational methods for its analysis.

% Formally, the semantics of an NS-CSG is an infinite-state CSG,
% as described in
% For brevity, we present here an informal description of the semantics.

% For continuous-state POSGs, a finite probability table used in finite state spaces is not enough for the representation of the observation function due to the uncountable state space.
% Here, NNs are used to group the environment states which have the same characteristics, and afterwards to return the associated finite characteristics.

% Our NS-CSGs reduce to a variant of the model in \cite{MEA-EB-PK-AL:20} with discrete local states and deterministic environments when agents' transition functions are restricted to Dirac distributions.

%Stochastic environments are left as future work, as they are likely to require a different solution method. 
% \dave{say why no probability in env}
%\marta{there is no probability in env because we would need continuous distributions?}

% \begin{defi}[Environment]
% The environment is a tuple $E=(S_E,A_E,\Delta_E,\delta_E)$, where
% \begin{itemize}
%     \item $S_E\subseteq \mathbb{R}^m$ is a nonempty (possibly infinite) set of (possibly continuous) environment states;
    
%     \item $A_E$ is a nonempty finite set of actions available to $E$;
    
%     \item $\Delta_E:S_E\to A_E$ is an available action function defining actions the environment can take in each state;
    
%     %\item $\Delta_E:S_E\to 2^{\bar{A}_E}$ is an action assignment function;
    
%     \item $\delta_E:S_E\times(A_1\cup\{\perp\})\times\cdots\times (A_n\cup\{\perp\})\times(A_E\cup\{\perp\})\to S_E$ is an environment transition function.
% \end{itemize}
% \end{defi}

% The environment state can be accessed by all agents. 

% With the definitions of agents and environment, we are ready to define the semantics of NS-CSGs as the induced concurrent stochastic games comprising $n$ agents transitioning synchronously between global states, formed as a product of the states of agents and the environment state. 

% \begin{defi}[Semantics of an NS-CSG]\label{semantics-def}
% Given an NS-CSG $\csg$ consisting of $n$ agents and an environment, the semantics of $\csg$ is the CSG $\sem{\csg} = (N,S,(A_i)_{i \in N},(\Delta_i )_{i \in N},\delta)$ where:
% \begin{itemize}
%     \item $S=S_1\times \cdots \times S_n\times S_E$ is the set of (global) states,
%     which contain both discrete and continuous elements;
%     \item $\delta: (S \times ((A_1 \cup \{\bot \}) \times \cdots \times (A_n  \cup \{\bot \}))) \to \mathbb{P}(S)$ is the partial probabilistic transition function, where for states $s=(s_1,\dots,s_n,s_E),s'=(s_1',\dots,s_n',s_E')\in S$ and joint action $\alpha=(a_1,\dots,a_n)\in A$, if $a_i \in \Delta_i(s_i)$ when $\Delta_i(s_i) \neq \varnothing$ and $a_i= \bot$ otherwise, then $\delta(s,\alpha)$ is defined and if $s_i=(\loc_i,\per_i)$, $s_i'=(\loc_i',\per_i')$, $\per_i'=\obs_i(s)$ %$\delta_i((\loc_i, \per_i'),\alpha)(\loc'_i)>0$ 
%     for all $i \in N$  and $s_E'=\delta_E(s_E,\alpha)$,  then
%         \begin{equation}\label{eq:defi-delta}
%             \delta(s,\alpha)(s')= \mbox{$\prod_{i=1}^n$} \delta_i((\loc_i, \per_i'),\alpha)(\loc'_i)
%         \end{equation}
%     and otherwise $\delta(s,\alpha)(s')=0$.
% \end{itemize}
% \end{defi}

% \begin{defi}[Neuro-symbolic concurrent stochastic game]\label{defi:NS-CSG} A neuro-symbolic concurrent stochastic game (NS-CSG) is a tuple $\csg=(N,E,S,\bar{S},A,\Delta,obs,\delta)$, where
% \begin{itemize}
%     \item $N=\{1,\dots,n\}$ is a finite set of agents and $E$ is the environment;
%     \item $S=S_1\times\cdots\times S_n\times S_E$ is a nonempty set of (global) states and $\bar{S}\subseteq S$ is a set of initial (global) states;
%     \item $A= (A_1\cup\{\perp\})\times\cdots\times (A_n\cup\{\perp\})\times(A_E\cup\{\perp\})\setminus\{(\perp,\dots,\perp)\}$ is a finite set of joint actions;
    
%     \item $\Delta:S\to2^{\cup_{i=1}^nA_i\cup A_E} \setminus \{\emptyset\}$ is an action assignment function, where for $s=(s_1,\dots,s_n,s_E)$ we have $a_i \in \Delta(s)$ iff $a_i \in \Delta_i(s_i)$ for all $i \in N \cup \{E\}$;
    
%     \item $obs:S\to Per_1\times\cdots\times Per_n$ is an observation function, where for $s=(s_1,\dots,s_n,s_E)\in S$, then $obs(s)=(obs_1(s_1,s_E),\dots,obs_n(s_n,s_E))$;
    
%     \item $\delta:S\times A\to Dist(S)$ is a probabilistic transition function, where for $s=(s_1,\dots,s_n,s_E)\in S$, $\alpha=(a_1,\dots,a_n,a_E)\in A$, and $s'=(s_1',\dots,s_n',s_E')\in S$,
%     \begin{itemize}
%         \item if $s_i'=(\loc_i',per_i')$, $per_i'=obs_i((\loc_i,per_i),s_E)$, $\delta_i((\loc_i, per_i'),\alpha)(prv'_i)>0$ and $s_E'=\delta_E(s_E,\alpha)$,  then $\delta(s,\alpha)(s')=\prod_{i=1}^n \delta_i((\loc_i, per_i'),\alpha)(prv'_i)$, where $s_i=(\loc_i,\per_i)$;
%         \item otherwise, $\delta(s,\alpha)(s')=0$.
%     \end{itemize}
    
% %    \martatodo{are we using atomic prop?}
%     % \item $AP$ is a set of atomic propositions and $L:S\to2^{AP}$ is a labelling function.
% \end{itemize}
% \end{defi}



% \ruitodo{Not sure whether the action assignment functions could be designed independently for all agents. Besides, the motivation or common applications for action assignment functions are not quite clear to me.}

% \martatodo{Action assignment simply reflects which actions are enabled in the given state, and idle action reflect the fact that the agent's state does not change. For modelling convenience, we specify action assignment (could be different for each agent) and use the joint action to model a global transition}

% \ruitodo{For example, in my opinion, for the case of autonomous driving, the collision avoidance can be captured by the joint action assignment function, so it seems that at certain scenarios the action assignment functions are coupled. Here, we adopt the independent action assignment functions temporarily in Definitions 1 and 2, and we can discuss this setup further.}

% The game begins in an initial global state $s$, comprising a local state $s_i=(\loc_i,\per_i)\in S_i$ for each agent $i\in N$ and an environment state $s_E\in S_E$, and evolves as follows. First, each agent $i\in N$ observes the environment state $s_E$ and combines its local state $s_i$ to generate a new percept $\per_i'$ according to its observation function $obs_i$ implemented via an NN. Then, each agent $i\in N$ chooses an action $a_i$ according to a strategy and the  environment $E$ chooses an action $a_E$ by action function $\Delta_E$. They synchronously perform the selected actions. Then, each agent $i\in N$ updates its private state according to the probabilistic local transition function $\delta_i$, the local state $(\loc_i, \per_i')$ and the selected joint action to $\loc_i'\in \loc_i$, and the environment updates the environment state according to the environment transition function $\delta_E$ and the selected joint action to $s_E'\in S_E$. Thus, the game reaches a new state $s'=(s_1',\dots, s_n', s_E')$, where $s_i' = (\loc_i', \per_i')$. Our NS-CSGs reduce to a variant of the model in \cite{MEA-EB-PK-AL:20} with discrete local states and deterministic environments when agents' transition functions are restricted to Dirac distributions.

%\marta{Need to say when our system reduces to Akitunde NS-MAS}
%Below we illustrate the NS-CSGs on an example. %\marta{say example is from Akitunded, who actually propose VCAS[2], which is a 2-agent MAS based on VCAS, also cite original VCAS paper}


%\martatodo{Here we need a small example that demonstrates the expressive power of the model, see my suggestions by email; if we want VCAS we need to add stochasticity, or esle if we want the current game from Fig 2 then we need to add perception }

%\ruitodo{I have added stochasticity into VCAS. I also did some experiments for this stochastic VCAS and got some sensible results ($\epsilon_{\textup{own}}=0.1$ and $\epsilon_{\textup{int}}=0$). It seems that the strategy synthesis for our model currently does not support stochastic models. I'll discuss with Gabriel about this.}

%\martatodo{Sounds good, please comment when you have discussed}

%\ruitodo{Sorry, I tested the strategy synthesis again and found that it supports the stochastic model. So there's no problem now.}

Our use of NNs as perception functions to yield observations is in line with a recent trend in autonomous systems, where agents make decisions based on the output of NNs; see, for instance, the probabilistic observation functions extracted from NNs by abstracting them with the help of robustness verification tools~\citep{RC-CI-RM-CP-MAS-GV:22}.

To illustrate NS-CSGs, we model the VerticalCAS Collision Avoidance Scenario \citep{KDJ-MJK:19,KDJ-SS-JBJ-MJK:19} presented as a two-agent neuro-symbolic system (VCAS[2]) in \citep{MEA-EB-PK-AL:20}. Our model differs in that we separate the states of the agents and the environment state by adding
to the agents' states a variable that measures their trust in the advisory's output,
whereas \citep{MEA-EB-PK-AL:20} replicates the climb rates in both agents' local states and the environment state.
We update the agents' trust level probabilistically to account for possible uncertainty.

%\marta{Can we say the unfolding of our model corresponds to that of \cite{MEA-EB-PK-AL:20} if distributions are Dirac?}

\input{figures/tex/vcas-geometry}

%\marta{Consider rearranging the example by putting some of the descriptions in text and only summarising the formal tuple as a game, similarly to the parking example}
% \marta{Attempting to rearrange to make it easier to understand}

% \ruitodo{It's worth distinguishing these terms: VerticalCAS is a collision avoidance system introduced in \cite{KDJ-MJK:19,KDJ-SS-JBJ-MJK:19}, VCAS[2] is more like a model with two agents both equipped with a VerticalCAS, introduced in \cite{MEA-EB-PK-AL:20}, and there seems to be no VCAS before.}\marta{AS far as I know the full protocol is called ACAS Xu, and VCAS=VerticalCAS is its part} \ruitodo{I looked through most of references on VerticalCAS, but they didn't abbreviate VerticalCAS to VCAS, though we could do this first.}\marta{We have already used this, which I assumed came from other papers http://www.fun2model.org/papers/wlpk20.pdf}

\begin{exam}\label{vcas-example}
 In the VCAS[2] system (Figure~\ref{fig:vcas-geometry}) there are
 two aircraft (ownship and intruder: $\agent_i$ for $i\in \{\textup{own}, \textup{int} \}$), each of which is equipped with an NN-controlled collision avoidance system called VCAS. Each second, VCAS issues an advisory ($ad_i$) from which, together with the current trust level ($tr_i$) in the previous advisory, the pilot needs to make a decision about accelerations, aiming at avoiding a near mid-air collision (NMAC) \citep{MEA-EB-PK-AL:20-2}.
 
%  a region where two aircraft are separated by less than $100$ ft vertically and $500$ ft horizontally. 

 The input to VCAS is $\smash{(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},t)}$, recording the relative altitude $h$ of the two aircraft, the climb rate $\dot{h}_{\textup{own}}$ of the ownship, the climb rate $\dot{h}_{\textup{int}}$ of the intruder, and the time $t$ until loss of their horizontal separation. VCAS is implemented via nine feed-forward NNs $F=\{ f_j:\mathbb{R}^4\to\mathbb{R}^9  \,|\, j \in [9] \}$, each of which corresponds to an advisory and outputs the scores of the nine possible advisories, where $[k]$ is the set $\{1,\dots,k\}$. Each advisory provides a set of accelerations for the agent to select from. There are four trust levels $\{1, 2, 3, 4\}$ indicating the trust scores. The trust level is increased probabilistically if the executed action is compliant with the current advisory, and decreased otherwise. We formulate VCAS[2] as an NS-CSG with the agents $\agent_i$ for $i\in \{\textup{own}, \textup{int} \}$ and the environment defined as follows:
\begin{itemize}
    \item $s_i=(tr_i, ad_i)$ is a state of the agent $\agent_i$ with local state $tr_i{\in}[4]$ and percept $ad_i{\in}[9]$;
    
    \item $s_E=(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},t)$ is an environment state;
    
    \item $A_i$ is a finite set of accelerations ($\ddot{h}_i$);
    
    % \item $A_i=\{0,\pm3.0, \pm 7.33, \pm 9.33, \pm 9.7, \pm11.7\}$, where $a_i \in A_i$ is an acceleration $\ddot{h}_i$;  
    % \item $\Delta_i(s_i) = A_i$ returns a set of available accelerations in $s_i$;
    \item $\Delta_i(s_i)$ returns the set of accelerations available in state $s_i$;
    
    % \item the available action function $\Delta_i$ returns two non-zero acceleration actions \cite{MEA-EB-PK-AL:20} given a local state (see \appxref{sec:appendix-b}), plus zero acceleration;
    
    \item the observation function $\obs_i$ is implemented via $F$;
    
    % is given by $ad_{i}'=obs_{i}(ad_{i},s_E)$, where $obs_{\textup{own}}(ad_{\textup{own}},s_E)=\textup{argmax}(f_{ad_{\textup{own}}}(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},\tau))$ and $obs_{\textup{int}}(ad_{\textup{int}},s_E)=\textup{argmax}(f_{ad_{\textup{int}}}(-h,\dot{h}_{\textup{int}},\dot{h}_{\textup{own}},\tau))$;
    
    \item the local transition function $\delta_i$ updates the agent's trust level according to its current trust level, its updated advisory and its executed action;
    
    % if $a_i$ is compliant with $ad_i'$ (i.e., $a_i$ is non-zero), when $be_i\leq3$, then $be_i'=be_i+1$ with probability $1-\epsilon_i$ and $be_i'=be_i$ with probability $\epsilon_i$, and when $be_i=4$, then $be_i'=be_i$; otherwise, when $be_i\ge2$, then $be_i'=be_i-1$ with probability $1-\epsilon_i$ and $be_i'=be_i$ with probability $\epsilon_i$, and when $be_i=1$, then $be_i'=be_i$, where $\epsilon_i \in [0,1]$.
    
    % then $be_i'=be_i+1$ with probability $1-\epsilon_i$ and $be_i'=be_i$ with probability $\epsilon_i$ if $be_i\leq3$, and $be_i'=be_i$ if $be_i=4$; otherwise, then $be_i'=be_i-1$ with probability $1-\epsilon_i$ and $be_i'=be_i$ with probability $\epsilon_i$ if $be_i\ge2$, and $be_i'=be_i$ if $be_i=1$, where $\epsilon_i \in [0,1]$.

    % \item the local transition function $\delta_i$ computes a belief level according to the current belief level $be_i$, the updated advisory $ad_i'$ and the executed action $a_i$: if $a_i$ is compliant with $ad_i'$ (i.e., $a_i$ is non-zero), then $be_i'=be_i+1$ with probability $\epsilon$ if $be_i\leq3$ and $be_i'=be_i$ if $be_i=4$; otherwise, $be_i'=be_i-1$ if $be_i\ge2$ and $be_i'=be_i$ if $be_i=1$.
    
    \item the environment transition function $\delta_E(s_E,\alpha)$ is defined as: $h'=h-\Delta t(\dot{h}_{\textup{own}}-\dot{h}_{\textup{int}})-0.5\Delta t^2(\ddot{h}_{\textup{own}}-\ddot{h}_{\textup{int}})$, $\dot{h}_{\textup{own}}'=\dot{h}_{\textup{own}}+\ddot{h}_{\textup{own}}\Delta t$, $\dot{h}_{\textup{int}}'=\dot{h}_{\textup{int}}+\ddot{h}_{\textup{int}}\Delta t$ and $t'=t-\Delta t$, where $\Delta t=1$ is the time step. 
\end{itemize}
%  Each aircraft is endowed with an observation function implemented via nine feed-forward NNs $F=\{ f_i:\mathbb{R}^4\to\mathbb{R}^9  \,|\, i \in [9] \}$, each of which corresponds to an advisory and outputs the scores over nine possible advisories, where 
%     $[k]$ is the set $\{1,\dots,k\}$. Each advisory will provide a set of accelerations for the agent to select from.
  
%   $f_{ad_i}:\mathbb{R}^4\to\mathbb{R}^9$ with four inputs, seven hidden layers of 45 nodes and nine outputs representing the score of each possible advisory. There are nine NNs $F=\{ f_i:\mathbb{R}^4\to\mathbb{R}^9  \,|\, i \in [9] \}$, each of which corresponds to an advisory, where 
%     $[k]$ is the set $\{1,\dots,k\}$.
 
%  Each aircraft is endowed with a perception function implemented via a feed-forward NN $f_{ad_i}:\mathbb{R}^4\to\mathbb{R}^9$ with four inputs, seven hidden layers of 45 nodes and nine outputs representing the score of each possible advisory. There are nine NNs $F=\{ f_i:\mathbb{R}^4\to\mathbb{R}^9  \,|\, i \in [9] \}$, each of which corresponds to an advisory, where 
% $[k]$ is the set $\{1,\dots,k\}$.

% \marta{I don't understand how two actions are provided by advisory?}
% \ruitodo{They have a table summarising which two actions should be provided for a given advisory. For example, if the advisory is "DNC: Do Not Climb" coded as $ad=2$, then the table gives two available actions $\{-9.33 ft/s^2,-7.33 ft/s^2\}$.}
% \marta{I am asking because the transition function only has one action, so is the table part of the model?}
% \ruitodo{Yes, they defined a protocol function for each agent to reflect the table. So I was wondering if we also need this protocol function. I will add it in case we need.}
% \marta{thanks, this is helpful}\marta{We should check if we can avoid prot by using available action mapping}

% \ruitodo{I agree. I have replaced $prot_i$ with $\Delta_i$, which is called available action mapping, aligned with the action assignment function $\Delta$}


% Each advisory will provide two non-zero acceleration actions for the agent to select from, except that the agent is also allowed to adopt zero acceleration.
 %
%  The belief in the previous advisory and previous advisory (percept) are stored in a state of agent $s_i=(be_i, ad_i)$. 

%  and nine possible advisories.
%  The  current advisory is computed 
%  from the previous advisory $ad_i$ and environment state $s_E$ using the observation function $obs_i$.
%  %
%  The trust level is increased probabilistically if the current advisory is compliant with the executed action, and decreased otherwise. VCAS[2] can be formulated as an NS-CSG with the agents $\agent_i$ for $i\in \{\textup{own}, \textup{int} \}$ and the environment as follows:
% \begin{itemize}
%     \item $s_i=(tr_i, ad_i)$ is a state of the agent $\agent_i$ with local state $tr_i{\in}[4]$ and percept $ad_i{\in}[9]$;
    
%     \item $s_E=(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},\tau)$ is an environment state;
    
%     \item $A_i$ is a finite set of accelerations ($\ddot{h}_i$);
    
%     % \item $A_i=\{0,\pm3.0, \pm 7.33, \pm 9.33, \pm 9.7, \pm11.7\}$, where $a_i \in A_i$ is an acceleration $\ddot{h}_i$;  
%     % \item $\Delta_i(s_i) = A_i$ returns a set of available accelerations in $s_i$;
%     \item $\Delta_i(s_i) = A_i$ for all $s_i \in [4] \times [9]$;
    
%     % \item the available action function $\Delta_i$ returns two non-zero acceleration actions \cite{MEA-EB-PK-AL:20} given a local state (see \appxref{sec:appendix-b}), plus zero acceleration;
    
%     \item observation function $obs_i$ is implemented via $F$;
    
%     % is given by $ad_{i}'=obs_{i}(ad_{i},s_E)$, where $obs_{\textup{own}}(ad_{\textup{own}},s_E)=\textup{argmax}(f_{ad_{\textup{own}}}(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},\tau))$ and $obs_{\textup{int}}(ad_{\textup{int}},s_E)=\textup{argmax}(f_{ad_{\textup{int}}}(-h,\dot{h}_{\textup{int}},\dot{h}_{\textup{own}},\tau))$;
    
%     \item the local transition function $\delta_i$ updates its trust level according to its current trust level, its updated advisory and its executed action;
    
%     % if $a_i$ is compliant with $ad_i'$ (i.e., $a_i$ is non-zero), when $be_i\leq3$, then $be_i'=be_i+1$ with probability $1-\epsilon_i$ and $be_i'=be_i$ with probability $\epsilon_i$, and when $be_i=4$, then $be_i'=be_i$; otherwise, when $be_i\ge2$, then $be_i'=be_i-1$ with probability $1-\epsilon_i$ and $be_i'=be_i$ with probability $\epsilon_i$, and when $be_i=1$, then $be_i'=be_i$, where $\epsilon_i \in [0,1]$.
    
%     % then $be_i'=be_i+1$ with probability $1-\epsilon_i$ and $be_i'=be_i$ with probability $\epsilon_i$ if $be_i\leq3$, and $be_i'=be_i$ if $be_i=4$; otherwise, then $be_i'=be_i-1$ with probability $1-\epsilon_i$ and $be_i'=be_i$ with probability $\epsilon_i$ if $be_i\ge2$, and $be_i'=be_i$ if $be_i=1$, where $\epsilon_i \in [0,1]$.

%     % \item the local transition function $\delta_i$ computes a belief level according to the current belief level $be_i$, the updated advisory $ad_i'$ and the executed action $a_i$: if $a_i$ is compliant with $ad_i'$ (i.e., $a_i$ is non-zero), then $be_i'=be_i+1$ with probability $\epsilon$ if $be_i\leq3$ and $be_i'=be_i$ if $be_i=4$; otherwise, $be_i'=be_i-1$ if $be_i\ge2$ and $be_i'=be_i$ if $be_i=1$.
    
%     \item the environment transition function $\delta_E(s_E,\alpha)$ is defined as: $h'=h-\Delta\tau(\dot{h}_{\textup{own}}-\dot{h}_{\textup{int}})-0.5\Delta\tau^2(\ddot{h}_{\textup{own}}-\ddot{h}_{\textup{int}})$, $\dot{h}_{\textup{own}}'=\dot{h}_{\textup{own}}+\ddot{h}_{\textup{own}}\Delta\tau$, $\dot{h}_{\textup{int}}'=\dot{h}_{\textup{int}}+\ddot{h}_{\textup{int}}\Delta\tau$ and $\tau'=\tau-\Delta\tau$, where $\Delta\tau=1$ is the time step. 
% \end{itemize}

% The environment $E$ is modelled as follows:
% \begin{itemize}
%     \item the set of environment states is $S_E=[-3000,3000]\times[-2500,2500]\times[-2500,2500]\times[0,40]$, with $s_E=(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},\tau)$ as above;

%     \item the environment transition function $\delta_E(s_E,\alpha)$ is defined as: $h'=h-\Delta\tau(\dot{h}_{\textup{own}}-\dot{h}_{\textup{int}})-0.5\Delta\tau^2(\ddot{h}_{\textup{own}}-\ddot{h}_{\textup{int}})$, $\dot{h}_{\textup{own}}'=\dot{h}_{\textup{own}}+\ddot{h}_{\textup{own}}\Delta\tau$, $\dot{h}_{\textup{int}}'=\dot{h}_{\textup{int}}+\ddot{h}_{\textup{int}}\Delta\tau$ and $\tau'=\tau-\Delta\tau$, where $\Delta\tau=1$ is the time step. 
% \end{itemize}

\end{exam}

%\marta{While the individual components are clear, it is difficult to understand the overall game - more information about the global states and example of how belief is chosen would be helpful, for example a picture?}

% \rev{First, each agent $i\in N$ chooses an action $a_i$ according to its strategy defined later and the  environment $E$ chooses an action $a_E$ by protocol function $prot_E$.} They synchronously perform the selected actions. Then, each agent $i\in N$ updates its private state according to the probabilistic local transition function and the selected joint action to $\loc_i'\in \loc_i$, and the environment updates the environment state according to the environment transition function and the selected joint action to $s_E'\in S_E$. Finally, each agent $i\in N$ observes the updated environment state $s_E'$ and combines its updated private state $\loc_i'$ as well as last percept $\per_i$ to generate a new percept $\per_i'$ according to its observation function implemented via an NN. Thus, a new local state $s_i'=(prv'_i,\per_i')\in S_i$ is \rev{obtained}.


% Given the local state $s_i=(\loc_i,\per_i)\in S_i$ for each agent $i\in N$ and environment state $s_E\in S_E$, the game evolves as follows. \rev{First, each agent $i\in N$ chooses an action $a_i$ according to its strategy defined later and the  environment $E$ chooses an action $a_E$ by protocol function $prot_E$.} They synchronously perform the selected actions. Then, each agent $i\in N$ updates its private state according to the probabilistic local transition function and the selected joint action to $\loc_i'\in \loc_i$, and the environment updates the environment state according to the environment transition function and the selected joint action to $s_E'\in S_E$. Finally, each agent $i\in N$ observes the updated environment state $s_E'$ and combines its updated private state $\loc_i'$ as well as last percept $\per_i$ to generate a new percept $\per_i'$ according to its observation function implemented via an NN. Thus, a new local state $s_i'=(prv'_i,\per_i')\in S_i$ is \rev{obtained}.


% {\color{red} Essentially, we are going to handle with a class of perfect information stochastic games, provided that we focus on fully-observable strategies here. Furthermore, we pay our attention to finite horizon. As for the solution concepts, we are interested in SPE, which indeed possess some good properties and can also provide us two common alternatives of strategies. The first one is the traditional SPE (i.e., the agent can memorize what happened before), for which the existence is guaranteed while all the paths (or histories), whose number is huge, have to be considered. The second one is called stationary SPE (i.e., the agent has no memory), in which the strategies just depend on the current state, greatly simplifying the strategy space. However, the stationary SPE may not exist \cite{KAH-SCS:20}. I prefer the first one, because determining the existence of the stationary SPE is hard, as far as I know.

% Additionally, we need to agree on the formula of the reward functions, because different reward functions can model the considered scenarios to a different accuracy degree. There are some candidates: accumulated rewards and reach-avoid rewards. I think it just depends on what specific problems we want to solve in the implementation part. 
% }

% \martatodo{I agree that reward formulas are important and will be decided based on examples. Accumulated rewards are likely to be the easiest. For stationary SPE, are there any restrictions on the system that ensure existence? There will be many advantages of working with stationary strategies.}

\startpara{Game Tree Unfolding}
%
The finite-horizon evolution of an NS-CSG $\csg$ from a given global state $s$
can be unfolded into a finite tree in the usual way by applying \emph{strategies} to select actions.
We distinguish between (past) \emph{histories} of a given state and its (future) \emph{paths}.

% \marta{There is a lot of notation here which is hard to parse for people without the right background. Ideally, there would be some small examples of histories, paths, action choice for VCAS, such as the assignment of beliefs}

% \ruitodo{I agree. I have tried it but it is hard to exemplify these concepts concisely through VCAS, because this example has many variables for a state. Did I understand this correctly?}

We assume that the duration of the game is finite with $K$ stages. A history $h$ of $\csg$ in stage $\ell\in[0,K]$ is a sequence $h=s^0\xrightarrow{\alpha^0}s^1
\xrightarrow{\alpha^1}\cdots\xrightarrow{\alpha^{\ell-1}}s^{\ell}$ where $s^k
\in S$, $\alpha^k\in A$ and $\delta(s^k,\alpha^k)(s^{k+1})>0$. The prefix 
% $s^0\xrightarrow{\alpha^0}s^1
% \xrightarrow{\alpha^1}\cdots\xrightarrow{\alpha^{\bar{\ell}-1}}s^{\bar{\ell}}$ 
of $h$ ending in stage $\bar{\ell}$ is denoted by $h_{\leq \bar{\ell}}$ for any $\bar{\ell}\leq \ell$. The set of all histories in stage $\ell$ for all initial states (for an initial state $s$) is denoted by $H^\ell$ ($H^\ell_s$), the set of all histories before stage $K$ is $H^{<K}=\cup_{0\leq\ell<K}H^\ell$ ($H_s^{<K}=\cup_{0\leq\ell<K}H_s^\ell$) and the set of all histories from $s$ is
% $H=\cup_{0\leq\ell\leq K}H^\ell$
$H_s=H_s^{<K} \cup H_s^K$. We denote by $last(h)$ the last state of the history $h\in H_s$. If $h\in H^{<K}$, we denote by $\textup{Succ}(h)$ the set of one-stage successors of $h$. 

% We assume that the action selection at each stage depends on the history and can be randomised. 
For a state $s=(s_1, \dots, s_n, s_E)$, the available actions of $\agent_i$ are denoted by $A_i(s)$, i.e., $A_i(s)$ equals $\Delta_i(s_i)$ if $\Delta_i(s_i) \neq \varnothing$ and equals $\{ \bot \}$ otherwise, and we denote by $A(s)$ the possible joint actions in a state, i.e., $A(s)= A_1(s) \times \cdots \times A_n(s)$.

% For each agent $i\in N\cup\{E\}$, we consider fully observable strategies, in which the action at each stage depends on the history and can be randomised.

% \martatodo{What is an alternative to fully observable?}

% \ruitodo{In game theory, some works use "perfect monitoring" to indicate such a information pattern.}

We can now define strategies, strategy profiles and correlated profiles.
In each case, we follow~\citep{RY-GS-GN-DP-MK:22} in assuming a \emph{fully observable} setting as a baseline,
i.e., where decisions are made based on the full state of the NS-CSG,
not just the parts of it revealed by the agents' observation functions.
An extension to partial observability
(i.e., where the NS-CSG represents a continuous-state partially observable stochastic game)
is left for future work.

\begin{defi}[Strategy]
A \emph{strategy} for $\agent_i$ is a function $\sigma_i:H^{<K}\to \mathbb{P}(A_i\cup\{\perp\})$ such that, if $\sigma_i(h)(a_i)>0$, then $a_i\in A_i(last(h))$.
A \emph{strategy profile} $\sigma = (\sigma_1,\dots,\sigma_n)$ comprises a strategy for each agent.
We denote by $\Sigma_i^\textup{N}$ the set of all strategies for $\agent_i$ and by $\Sigma^\textup{N}=\Sigma_1^\textup{N}\times\cdots\times\Sigma_n^\textup{N}$ the set of all strategy profiles.
% Each strategy profile $\sigma=(\sigma_1,\dots,\sigma_n)\in\Sigma^\textup{N}$ is also written as $\sigma=\sigma_{-i}[\sigma_i]$ for any $i\in N$.
\end{defi}

Alternatively, we can use a \emph{correlated profile}, in which agent choices are correlated.
For brevity, we refrain from formally defining a correlation mechanism
(such as public signals) and map directly to joint actions.

\begin{defi}[Correlated profile]
A \emph{correlated profile} is a function $\tau : H^{<K} \to \mathbb{P}(A)$ such that if $\tau(h)(\alpha) > 0$, then $\alpha = (a_1,\dots, a_n)$ and $a_i\in A_i(last(h))$ for all $i \in N$. We denote by $\Sigma^\textup{C}$ the set of correlated profiles.
\end{defi}

% A \emph{deviation} for $\agent_i$ is a function $\varsigma_i: H^{<K} \times A_i \rightarrow A_i$ such that if $\varsigma_i(h, a_i)$ is defined, then $A_i(last(h)) \neq \{ \perp \}$ and $a_i \in A_i(last(h))$. We denote by $\Sigma^{\textup{D}}_i$ the set of deviations for $\agent_i$. We use $\tau_{-i}[\varsigma_i]$ to the scenario where $\agent_i$ adopts $\varsigma_i(h, a_i)$ if its suggested action by $\tau$ at $h$ is $a_i$.

% \martatodo{Please check, there is confusion about whether the environment is included in the strategy profile and rewards}

% \ruitodo{Since the environment has actions, we have to specify how to select an environment action at each state if it is not dummy. One way to avoid defining the strategy for environment is to add a protocol function which selects environment actions.}

% We denote by $\sigma=(\sigma_1,\dots,\sigma_n)\in\Sigma$ the strategy profile, where $\Sigma=\Sigma_1\times\cdots\times\Sigma_n$ is the set of strategy profiles. We also write the strategy profile $\sigma=\sigma_{-i}[\sigma_i]$ for any $i\in N$.

A (future) path $\pi$ of $\csg$ starting from a history $h\in H^\ell$ in stage $\ell$ until the game ends in stage $K$ is a sequence $\pi=s^{\ell}\xrightarrow{\alpha^{\ell}}
\cdots\xrightarrow{\alpha^{K-1}}s^{K}$ where $s^{\ell}=last(h)$, $s^k
\in S$, $\alpha^k\in A$ and $\delta(s^k,\alpha^k)(s^{k+1})>0$. 
% The set of all paths starting in stage $\ell$ (for a history $h\in H^\ell$) is denoted as $FPaths_\ell$ ($FPaths_{\ell,h}$). 
For path $\pi$, $\pi(k)$ is the $(k+1)$th state, $\pi[k]$ the action associated with the $(k+1)$th transition from $\pi(k)$ to $\pi(k+1)$, and $last(\pi)$ the final state. 

% \startpara{Observability}
% %
% NS-CSGs model neuro-symbolic agents,
% whose operation depends on particular perception functions, which may result in imperfect information. However, in this paper we consider \emph{full observability},
% i.e., where agents' decisions can depend on the full state space. It is straightforward to extend the semantics above to
% \emph{partially observable} CSGs (POSGs) by including the observation function defined by $\obs_i$ in \defiref{semantics-def}
% and restricting to observationally-equivalent strategies. We here focus on the simpler (but still challenging) case of full observability for the following reasons.
% %
% First, the fully observable case represents an important baseline,
% against which partially observable scenarios can later be evaluated.
% Second, the solution presented here can be directly used when converting imperfect-information games to perfect information, such as the mechanism in \citep{NB-AB-AL-QG:20,VK-MS-NB-MB-VL:22}.


% We denote by $[m]$ the set $\{1,\dots,m\}$. According to the action assignment function, each agent $i\in N\cup \{E\}$ selects an action at state $s$ from its available actions $A_i(s):=\Delta(s)\cap A_i$. A path (history) $\pi$ of $\csg$ is a sequence $\pi=s^0\xrightarrow{\alpha^0}s^1
% \xrightarrow{\alpha^1}\cdots$ where $s^k
% \in S$, $\alpha^k\in A$ and $\delta(s^k,\alpha^k)(s^{k+1})>0$. The length of a finite path $\pi$ is defined as the number of involved states, and denoted by $|\pi|$. We use $FPaths_{\csg}^k$ ($FPaths_{\csg,s}^k$) to denote the set of paths (starting in state $s$) of length $k\in\mathbb{Z}$ for the game $\csg$. This work considers the finite-horizon games with $K$ steps (transitions). Denote by $FPaths_{\csg}=\cup_{k\in[K+1]}FPaths_{\csg}^k$ ($FPaths_{\csg,s}=\cup_{k\in[K+1]}FPaths_{\csg,s}^k$) the set of paths (starting in state $s$) of length no more than $K+1$. For a path $\pi$, we denote by $\pi(k)$ the $(k+1)$th state, $\pi[k]$ the action associated with the $(k+1)$th transition from $\pi(k)$ to $\pi(k+1)$, and $last(\pi)$ the final state. We assume that all paths start and end in a state.


% Now, we consider the part of a path which is known to agent $i$. A local (environment) path $\pi_E$ of the environment $E$ is a sequence $\pi_E=s_E^0\xrightarrow{\alpha^0}s_E^1\xrightarrow{\alpha^1}\cdots$ where $s_E^k
% \in S_E$, $\alpha^k\in A$, and $s_{E}^{k+1}=\delta_E(s_E^{k},\alpha^k)$ for all $k\ge0$. A local path $\pi_i$ of agent $i\in N$ is a sequence $\pi_i=s_i^0\xrightarrow{\alpha^0}s_i^1\xrightarrow{\alpha^1}\cdots$ where $s_i^k
% \in S_i$, $\alpha^k\in A$, $s^{k+1}_i=(\loc_i^{k+1},\per_i^{k+1})$, $\delta_i(s_i^{k},\alpha^k)(prv_{i}^{k+1})>0$ and $\per_i^{k+1}=obs_i((\loc_i^{k+1},\per_i^k),s_E^{k+1})$ for all $k\ge0$. Thus, a path $\pi$ can also be written as $\pi=(\pi_1,\dots,\pi_n,\pi_E)$. We use $FPaths_{\csg}^i$ ($FPaths_{\csg,s}^i$) to denote the set of finite  paths (starting in state $s$) for agent $i\in N\cup \{E\}$. For a finite path $\pi_i$, we denote by $\pi_i(k)$ agent $i$'s $(k+1)$th local state, and $\pi_i[k]$ agent $i$'s action associated with the $(k+1)$th transition from $\pi_i(k)$ to $\pi_i(k+1)$. If $\pi_i$ is finite, $last(\pi_i)$ is the final local state. 

\startpara{Rewards} We endow NS-CSGs with \emph{rewards} that define agents' objectives.
% Rewards and equilibria were not considered in \cite{MEA-EB-PK-AL:20}.
We use $r=(r_i)_{i\in N}$ where each agent $\agent_i$ has a reward structure $r_i=(r_i^A,r_i^S)$
comprising action reward function $r_i^A:S\times A\to \mathbb{R}$
and state reward function $r_i^S:S\to \mathbb{R}$.
%  For a path $\pi$, the immediate reward at the $(k+1)$th transition for agent $i\in N$ is computed by $r_i^A(\pi(k),\pi[k])+r_i^S(\pi(k))$. 
An \emph{objective profile} is $Y=(Y_1,\dots,Y_n)$,  where
$Y_i(\pi)$ is the accumulated reward of $\agent_i$
until the final stage $K$,
along a path $\pi$ that starts in some stage $\ell \in [0, K]$: %, is defined as: 
%
\begin{equation*}
    Y_i(\pi){=}\!\sum_{k=0}^{K-\ell-1}\!\!\Big(r_i^A(\pi(k),\pi[k])+r_i^S(\pi(k))\Big) + r_i^S(last(\pi)).
\end{equation*}
%
Given a strategy profile %\dave{strat prof} 
$\sigma\in\Sigma ^\textup{N}$, we denote by $\mathbb{E}_{\ell,h}^{\sigma}[Y_i]$ the expected value of $Y_i$ when starting from $h\in H^\ell$ at the $\ell$th stage until the game ends.
% and ending at the $K$th stage. 
Given %\dave{correl. prof} 
a correlated profile $\tau \in \Sigma^{\textup{C}}$, we denote by $\mathbb{E}_{\ell}^{\tau}[Y_i, a_i'| a_i, h]$ the expected value of $Y_i$ when starting from $h\in H^\ell$ at the $\ell$th stage until the game ends,
% and \marta{needs checking, not clear where K is from} ending at the $K$th stage, 
under the strategy that $\agent_i$ takes the actual action $a_i'$ instead of the recommended action $a_i$ at $h$, and otherwise the recommendation by $\tau$ is followed by all agents.

% For a strategy profile $\sigma$ and a history $h\in H^\ell$ ($\ell<K$), we denote by $FPaths_{\ell,h}^{\sigma}$ the set of paths starting in $h$ at stage $\ell$ under the profile $\sigma$. The probability of a path $\pi\in FPaths_{\ell,h}^\sigma$ is given by $\textbf{P}^{\sigma}(\pi)$, defined as:
% \begin{equation*}
%     \begin{aligned}
%     &\prod\nolimits_{k=0}^{K-\ell-1}\Big(\big(\prod\nolimits_{i}\sigma_i(h\xrightarrow{\pi[0]}\cdots\xrightarrow{\pi[k-1]}\pi(k))(\pi_i[k])\big)\\
%     &\times\delta(\pi(k),\pi[k])(\pi(k+1))\Big).
%     \end{aligned}
% \end{equation*}
% Given $\sigma\in\Sigma_{\csg}$ and $h\in H^\ell$, we the expected value of $Y_i$ is computed by $\mathbb{E}_{\ell,h}^{\sigma}[Y_i]=\sum_{\pi\in FPaths_{\ell,h}^\sigma}\textbf{P}^{\sigma}(\pi)Y_i(\pi)$, where $FPaths_{\ell,h}^\sigma$ is a finite set because $\delta_E$ is deterministic. 

An NS-CSG is \emph{zero-sum} if $\sum_{i=1}^n\big(r_i^A(s,\alpha) + r_i^S(s)\big)=0$ for all $s \in S$ and all $\alpha \in A$; otherwise, it is \emph{nonzero-sum}.

\startpara{Social Welfare Subgame-Perfect Equilibria} 
A \emph{Nash equilibrium} (NE) ensures that no agent has an incentive to deviate unilaterally from their strategy.
Here we work with \emph{subgame-perfect Nash equilibria} (SPNEs)~\citep{MJO:04},
% a refinement of NEs
which are NEs in every state of the game.
% \rui{Just to remind that, this property is only valid for NE, but not for CE (the reason is obscure through a few words)}\marta{I revised to clarify}
An SPNE is therefore an NE of every subgame of the original game: the agents' behaviour from any point in the game onward forms an NE of the continuation game, regardless of what happened before.
We also consider the less well studied notion of \emph{subgame-perfect correlated equilibria} (SPCEs)~\citep{CM-GG:07}.
For an SPCE, no agent can expect to gain by disobeying the recommendation of the correlated profile after any history of play.
% The SPE is also credited for satisfying one-shot deviation principle \citep{JW:13}, that is, no agent can increase its reward by changing its strategy at a stage from the SPE. 

The formal definitions of both types of subgame-perfect equilibria (SPE) follow, where, for a strategy profile $\sigma=(\sigma_1,\dots,\sigma_n)$ and $i\in N$, we also write $\sigma=\sigma_{-i}[\sigma_i]$, with $\sigma_{-i}$ denoting the strategies of all agents other than $\agent_i$. For SPCEs, we again omit a correlation mechanism and abuse notation by expressing it as individual deviations from the recommended actions associated with a correlated profile $\tau$.
% \dave{The formal definition of SPE follows, where we denote by $\mu=\mu_{-i}[\mu_i]=(\mu_1,\dots,\mu_n)$ $(i\in N)$ the strategy profile, where $\mu_{-i}$ refers to the strategy profile except $\mu_i$.}

% \startpara{Social Welfare Subgame-Perfect Equilibrium} We consider the subgame-perfect (Nash) equilibrium (SPE) \citep{MJO:04}, a refinement of NE. Since an SPE is an NE of every subgame of the original game, the agents' behaviour from any point in the game onward represents an NE of the continuation game, regardless of what happened before. The SPE is also credited for satisfying one-shot deviation principle \citep{JW:13}, that is, no agent can increase its reward by changing its strategy at a stage from the SPE. An NS-CSG is \emph{zero-sum} if $\sum_{i=1}^n\big(r_i^A(s,\alpha) + r_i^S(s)\big)=0$ for all $s \in S$ and all $\alpha \in A$; otherwise, it is \emph{nonzero-sum}.

%\marta{why? motivation?}

% An SPE also necessarily satisfies one-shot deviation principle \cite{JW:13}, that is, no agent can benefit \marta{this is not clear} through rewards by deviating a single decision at a stage from their SPE.
% Next we introduce several equilibria for the NS-CSGs.

\begin{defi}[Subgame-perfect equilibrium]\label{defi-SPE}
For an initial state $s \in S$, a strategy profile $\sigma^*=(\sigma_1^*,\dots,\sigma_n^*) \in \Sigma^{\textup{N}}$ is a \emph{subgame-perfect Nash equilibrium} (SPNE) if $\mathbb{E}_{\ell,h}^{\sigma^*}[Y_i]\ge\mathbb{E}_{\ell,h}^{\sigma_{-i}^*[\sigma_i]}[Y_i]$ for all $\sigma_i\in\Sigma_i^{\textup{N}}$, all $i\in N$, all $0\leq\ell<K$ and all $h\in H^{\ell}_s$. A correlated profile $\tau^* \in \Sigma^{\textup{C}}$ is a \emph{subgame-perfect correlated equilibrium} (SPCE) if $\mathbb{E}_{\ell}^{\tau^*}[Y_i,a_i | a_i, h] \ge \mathbb{E}_{\ell}^{\tau^*}[Y_i, a_i' | a_i, h]$ for all $a_i, a_i' \in A_i(last(h))$, all $i \in N$, all $0\leq\ell<K$ and all $h \in H^{\ell}_s$.
\end{defi}

% \begin{defi}[Social welfare]\label{defi-SW}The social welfare $W_{\ell,h}^{\sigma}$ of a history $h\in H^{\ell}$ ($\ell<K$) under a strategy profile $\sigma$ is defined as the sum of expected values of objective profiles $Y_i$ starting in $h$ for all agents $i\in N$, that is, $W_{\ell,h}^{\sigma}=\mathbb{E}_{\ell,h}^{\sigma}[Y_1+\cdots+Y_n]$.
% \end{defi}

We emphasize that the SPE is defined here for a given initial state. Since multiple SPEs can exist, we introduce additional optimality constraints. First, we define the \emph{social welfare} $W_{\ell,h}^{\sigma}$ ($W_{\ell,h}^{\tau}$, resp.) of a history $h\in H^{\ell}$ ($\ell<K$) under a strategy profile $\sigma$ (a correlated profile $\tau$, resp.) as the sum of expected values of objective profiles $Y_i$ starting in $h$ for all agents, that is, $W_{\ell,h}^{\sigma}=\mathbb{E}_{\ell,h}^{\sigma}[\sum_{i=1}^n Y_i]$ ($W_{\ell,h}^{\tau} = \mathbb{E}_{\ell,h}^{\tau}[\sum_{i=1}^n Y_i]$, resp.).
Social-welfare optimal SPNE and SPCE are then defined as follows.
% Next we introduce an important class of SPE for both SPNE and SPCE.

\begin{defi}[Social welfare SPE]
For an initial state $s\in S$, an SPNE $\sigma^*$ is a social welfare optimal SPNE (SW-SPNE) of $\csg$ if $W_{0,s}^{\sigma^*}\ge W_{0,s}^{\sigma}$ for all SPNEs $\sigma$ of $\csg$. An SPCE $\tau^*$ is a social welfare optimal SPCE (SW-SPCE) of $\csg$ if $W_{0,s}^{\tau^*}\ge W_{0,s}^{\tau}$ for all SPCEs $\tau$ of $\csg$.
\end{defi}
%
% \rev{We as a coordinator} want to find an SW-SPE and then \rev{dispatch the corresponding distribution over actions to each agent at each stage.} 
%
Notice that, starting from a fixed initial state, SW-SPNE and SW-SPCE
are \emph{globally optimal}, i.e., optimal with respect to the social welfare achieved over
the finite horizon from that start state.

Our approach of defining optimality in terms of the value from a fixed initial state
is further motivated by the following result,
% because defining them over multiple states causes issues with existence.
which reveals that SW-SPNEs and SW-SPCEs do not possess the property of subgame perfection on social welfare,
i.e., an SPNE or SPCE with optimal social welfare at one state might induce a non-optimal social welfare at another state as the game moves forward.

\begin{lema}[No optimal subgame perfection]\label{pro:subgame-perfection} 
For an initial state $s\in S$, an NS-CSG may have no SPNE (resp., SPCE) that is an SW-SPNE (resp., SW-SPCE) for all its subgames.
\end{lema}
% \begin{proof}
% We postpone the proof to \appxref{sec:appendix-a}.
% \end{proof}

Proofs of this and all other results in the paper
can be found in the appendix.
%
Note also that this and the following results
are stated in the context of NS-CSGs,
%in order to closely connect with the new model,
but they also apply to general CSGs with discrete states and actions. % over a finite horizon.
