\documentclass[accepted]{uai2022}


\usepackage[american]{babel}
% add packages here
\usepackage[table,dvipsnames]{xcolor}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{bm}
\usepackage[compress]{cite}
\usepackage{enumitem}
\usepackage{dsfont}
\usepackage{bbm}
\usepackage{bbold}
\usepackage{hhline}
\usepackage{mathtools}
\usepackage{algpseudocode}
\usepackage{pgfplots}
\usepackage[caption=false]{subfig}
\usepackage{lineno}
\usepackage{multirow}
\usepackage{amsfonts}
\usepackage{textcomp}
\usepackage{adjustbox}
\usepackage{tabularx}
\usepackage[table]{xcolor}
\usepackage{colortbl}
\usepackage{tablestyles}
\usepackage{figures/aircraftshapes}
\usepackage{pgflibrarysnakes}
% \usepackage[colorlinks=true]{hyperref}
\usepackage{titling}
% \usepackage[british]{babel}
\usepackage{stmaryrd}
\usepackage{istgame}
\usepackage{transparent}
\usepackage[bottom]{footmisc}

\usepackage{tikz}
\usetikzlibrary{arrows.meta,shapes,decorations,patterns}
\tikzset{%
	>={Latex[width=2mm,length=2mm]},
	% Specifications for style of nodes:
	base/.style = {rectangle, rounded corners, draw=black,
		minimum width=2cm, minimum height=1cm,
		text centered, font=\sffamily},
	activityStarts/.style = {base, fill=blue!30},
	startstop/.style = {base, fill=red!30},
	activityRuns/.style = {base, fill=green!30},
	process/.style = {base, minimum width=2.5cm, fill=orange!15,
		font=\ttfamily},
}

\usepackage{centernot}
\usepackage{version,xspace}
\usepackage{environ}
\usepackage{blkarray}
\usetikzlibrary{automata,positioning,arrows,through}
\usepackage{threeparttable}
\usepackage{mathrsfs}
\usepackage{algorithm}
\usepackage{algorithmicx}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables


% new nice boxes for comments
% \usepackage[prependcaption,colorinlistoftodos]{todonotes}
% \newcommand{\todoinb}[1]{\todo[inline,color=blue!60, linecolor=orange!250]{\small#1}}
% \newcommand{\todoing}[1]{\todo[inline,color=green!60, linecolor=orange!250]{\small#1}}
% \newcommand{\todoiny}[1]{\todo[inline,color=yellow!80, linecolor=orange!250]{\small#1}}
% \newcommand{\todoinr}[1]{\todo[inline,color=red!60, linecolor=orange!250]{\small#1}}

% ---------------------------------------------------------------------------
% Author annotation macros (colours come from xcolor).
%   \weadd            switches the current colour to blue (no grouping: the
%                     change lasts until the enclosing group ends).
%   \<name>rev{text}  typesets revised text in the author's colour.
%   \<name>todo{text} typesets a "From <Name>: ..." note in typewriter font.
%   \<name>{text}     inline remark: bold initial(s) followed by sans-serif text.
%   \<name>side{text} same remark placed in the margin.
% Bold initials use \textbf rather than the obsolete {\bf ...} switch.
% ---------------------------------------------------------------------------
\newcommand{\weadd}{\color{blue}}

% Rui
\newcommand{\ruirev}[1]{{\color{orange} #1}}
% Own paragraph, ragged right, with a star in the margin.
\newcommand{\ruitodo}[1]{\par\noindent{\raggedright\color{red}\texttt{From Rui: #1}\par\marginpar{$\star$}}}
\newcommand{\rui}[1]{{\color{red} \textbf{R:} \textsf{#1}}}

% Dave
\newcommand{\dave}[1]{{\color{teal} \textbf{D:} \textsf{#1}}}
\newcommand{\daveside}[1]{\marginpar{\scriptsize \color{teal} \textbf{D:} \textsf{#1}}}

% Marta
% Own paragraph, ragged right, with a star in the margin.
\newcommand{\martatodo}[1]{\par\noindent{\raggedright\color{blue}\texttt{From Marta: #1}\par\marginpar{$\star$}}}
\newcommand{\marta}[1]{{\color{green} \textbf{M:} \textsf{#1}}}
\newcommand{\martaside}[1]{\marginpar{\footnotesize \color{green} \textbf{M:} \textsf{#1}}}

% Gabriel
\newcommand{\gabrieltodo}[1]{{\color{orange}\texttt{From Gabriel: #1}}}
\newcommand{\gabriel}[1]{{\color{orange} \textbf{GS:} \textsf{#1}}}
\newcommand{\gabrielrev}[1]{{\color{blue} #1}}

% Generic revision highlight.
\newcommand{\rev}[1]{{\color{red} #1}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%\usepackage{showframe}
\newtheorem{defi}{\textbf{Definition}}
\newtheorem{thom}[defi]{\textbf{Theorem}}
\newtheorem{asp}[defi]{\textbf{Assumption}}
\newtheorem{rek}[defi]{\textbf{Remark}}
\newtheorem{pro}[defi]{\textbf{Proposition}}
\newtheorem{lema}[defi]{\textbf{Lemma}}
\newtheorem{algm}[defi]{\textbf{Algorithm}}
\newtheorem{pbm}[defi]{\textbf{Problem}}
\newtheorem{cor}[defi]{\textbf{Corollary}}
\newtheorem{tab}[defi]{\textbf{Table}}
\newtheorem{exam}{\textbf{Example}}

% Cross-reference helpers: each prints the environment name followed by the
% number of the referenced label. A tie (~) is used throughout so that the
% name and the number are never separated by a line break.
\newcommand{\defiref}[1]{Definition~\ref{#1}}
\newcommand{\thomref}[1]{Theorem~\ref{#1}}
\newcommand{\aspref}[1]{Assumption~\ref{#1}}
\newcommand{\rekref}[1]{Remark~\ref{#1}}
\newcommand{\proref}[1]{Proposition~\ref{#1}}
\newcommand{\algoref}[1]{Algorithm~\ref{#1}}
\newcommand{\pbmref}[1]{Problem~\ref{#1}}
\newcommand{\corref}[1]{Corollary~\ref{#1}}
\newcommand{\lemaref}[1]{Lemma~\ref{#1}}
\newcommand{\tabref}[1]{Table~\ref{#1}}
\newcommand{\examref}[1]{Example~\ref{#1}}
\newcommand{\appxref}[1]{Appendix~\ref{#1}}


% Notation shortcuts (math mode).
% Calligraphic O.
\newcommand{\cO}{\mathcal{O}}
% Multi-letter identifiers, set with \mathit so that each reads as one word
% rather than as a product of single-letter variables.
\newcommand{\obs}{\mathit{obs}}
\newcommand{\Obs}{\mathit{Obs}}
\newcommand{\Prv}{\mathit{Prv}}
\newcommand{\Loc}{\mathit{Loc}}
\newcommand{\Per}{\mathit{Per}}
\newcommand{\prv}{\mathit{prv}}
\newcommand{\loc}{\mathit{loc}}
\newcommand{\per}{\mathit{per}}
% Sans-serif symbols; note that \posg and \game both expand to \mathsf{G}.
\newcommand{\csg}{\mathsf{C}}
\newcommand{\posg}{\mathsf{G}}
\newcommand{\game}{\mathsf{G}}
\newcommand{\nfgame}{\mathsf{N}}
\newcommand{\agent}{\mathsf{Ag}}
\newcommand{\equilibrium}{\mathsf{T}}
% Double square ("semantic") brackets around the argument; \llbracket and
% \rrbracket are provided by the stmaryrd package loaded above.
\newcommand{\sem}[1]{\llbracket {#1} \rrbracket}

% Run-in paragraph heading: a 6pt vertical skip, no paragraph indentation,
% then the title in bold followed by a period. \textbf replaces the obsolete
% {\bf ...} font switch.
\newcommand{\startpara}[1]{{%
\vskip6pt\noindent
\textbf{#1.}}}

\input{sections/macros}

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%

\title{Finite-horizon Equilibria for Neuro-symbolic Concurrent Stochastic Games \\ ~ \\ (Supplementary material)}


% Add authors
\author[1]{Rui~Yan \thanks{Equal Contributions.}}
\author[1]{Gabriel~Santos\hspace{2.5pt}$^*$}
\author[2]{Xiaoming~Duan}
\author[3]{David~Parker}
\author[1]{Marta~Kwiatkowska}
% Add affiliations after the authors
\affil[1]{%
    Department of Computer Science\\
    University of Oxford\\
    Oxford, UK
}
\affil[2]{%
     Department of Automation\\
     Shanghai Jiao Tong University\\
     Shanghai, China
}
\affil[3]{%
    School of Computer Science\\
    University of Birmingham\\
    Birmingham, UK
  }

% \date{}

\begin{document}
\maketitle

\appendix 

\section{Proofs of Main Results}\label{sec:appendix-a}

To prove Lemmas 6 and 8, we introduce the following example.

\begin{exam}\label{example-1}
Consider a two-stage two-agent game with deterministic transitions in Fig.~\ref{fig:example1}, in which each agent has two actions: $\{U,D\}$ for agent $1$ and $\{L,R\}$ for agent $2$. Non-leaf and leaf nodes, containing the node numbers, are marked with circles and rectangles, respectively. For clarity, several histories reaching stage $2$ are not displayed here. Edges are labelled with the associated joint actions. The payoff vectors below leaf nodes are the terminal rewards, while the payoff vectors below non-leaf nodes denote the unique equilibrium payoffs (expected accumulated rewards) from these nodes to the leaf nodes, where $\phi$ is negative. The immediate rewards along the edges are assumed to be zero.

By GBI, there are three NEs at node $4$: $\mu^{4(1)}=\{(1,0),(1,0)\}$, $\mu^{4(2)}=\{(1/5,4/5),(1,0)\}$ and $\mu^{4(3)}=\{(0,1),(0,1)\}$, and the respective equilibrium payoffs are $V^{4(1)}=(0,8)$, $V^{4(2)}=(0,8/5)$ and $V^{4(3)}=(5,2)$. The NE and the equilibrium payoff at the initial node $1$ depend on which NE is considered at node $4$. If $V^{4(1)}$ or $V^{4(2)}$ is selected, then there is a unique NE at node $1$: $\mu^{1(1)}=\{(1,0),(1,0)\}$ with equilibrium payoff $(1,1+\phi)$. If $V^{4(3)}$ is chosen, then there is a unique NE at node $1$: $\mu^{1(2)}=\{(0,1),(1,0)\}$ with equilibrium payoff $(5,2)$.    
\end{exam}

\setcounter{figure}{5}

\begin{figure}[ht]
    \centering
    \begin{tikzpicture}[
    roundnode/.style={circle, radius = 0.5mm, draw=white, inner sep=0pt, minimum size=12pt},
    squarednode/.style={rectangle, draw=white,  inner sep=0pt, minimum size=12pt},
    ]
    % nodes
    \node[roundnode,fill=red!60]  (node1) at (0,0) {1};
    \node[roundnode,fill=red!45]  (node2) at (-3,-1) {2};
    \node[roundnode,fill=red!45]  (node3) at (-1,-1) {3};
    \node[roundnode,fill=red!45]  (node4) at (1,-1) {4};
    \node[roundnode,fill=red!45]  (node5) at (3,-1) {5};
    \node[squarednode,fill=red!30] (node6) at (-2,-2.5) {6};
    \node[squarednode,fill=red!30] (node7) at (0,-2.5) {7};
    \node[squarednode,fill=red!30] (node8) at (2,-2.5) {8};
    \node[squarednode,fill=red!30] (node9) at (4,-2.5) {9};
    % edges
    \path[draw,-, above] (node1) to node [pos=0.85] {\scriptsize (U,L)}(node2);
    \path[draw,-, right] (node1) to node [left, pos=0.6] {\scriptsize (U,R)}(node3);
    \path[draw,-, left] (node1) to node [right, pos=0.6] {\scriptsize (D,L)}(node4);
    \path[draw,-, above] (node1) to node [pos=0.85] {\scriptsize (D,R)}(node5);
    \path[draw,-, above] (node4) to node [pos=0.9] {\scriptsize (U,L)}(node6);
    \path[draw,-, right] (node4) to node [left, pos=0.775] {\scriptsize (U,R)}(node7);
    \path[draw,-, left] (node4) to node  [right, pos=0.775] {\scriptsize (D,L)}(node8);
    \path[draw,-, above] (node4) to node [pos=0.9] {\scriptsize (D,R)}(node9);
    % payoffs
    \node[below=2mm] at (node6) {\footnotesize $(0,8)$};
    \node[below=2mm] at (node7) {\footnotesize $(0,0)$};
    \node[below=2mm] at (node8) {\footnotesize $(0,0)$};
    \node[below=2mm] at (node9) {\footnotesize $(5,2)$};
    \node[below=2mm] at (node2) {\footnotesize $(1,1+\phi)$};
    \node[below=2mm] at (node3) {\footnotesize $(3,\phi)$};
    \node[below=2mm] at (node5) {\footnotesize $(0,0)$};
    \end{tikzpicture}
    \caption{A two-stage game tree with two agents with $\phi<0$.}
    \label{fig:example1}
\end{figure}

\paragraph{Proof of Lemma 6.}
We consider the game in \examref{example-1}. Given $\phi<0$, the SW-SPNE and SW-SPCE starting at node $1$ are the same and unique with social welfare $5+2=7$, in which the strategy at node $4$ is $\mu^{4(3)}$. However, the SW-SPNE and SW-SPCE for the subgame starting at node $4$ are both $\mu^{4(1)}$ instead of $\mu^{4(3)}$, which completes the proof.

\paragraph{Proof of Proposition 7.}
It is well known in game theory that, for a normal-form game, (mixed-strategy) NEs always exist \citep{JN:51} and all NEs are fully characterized by the set of feasible solutions of a nonlinear program with compact constraints \citep{MJO:04}. This implies that the SWNEs, which are NEs maximising social welfare, always exist as well. Since every NE is a CE and all CEs are fully characterized by the set of feasible solutions of a linear program with compact constraints \citep{RJA:74}, SWCEs always exist, which completes the proof.


\paragraph{Proof of Lemma 8.}
We consider \examref{example-1} again. Since $\mu^{4(1)}$ has the maximum social welfare, then Generalized BI via SWE feeds $V^{4(1)}$ to node $1$ for both the case of SWNE and SWCE, thus leading to node $1$'s social welfare $W_{0,s}^{\mu}=2+\phi$. However, node $1$'s social welfare $W_{0,s}^{\mu^*}$ under both SW-SPNE and SW-SPCE $\mu^*$ is $7$. Thus, if $\phi$ is negative enough, the difference $W_{0,s}^{\mu^*}-W_{0,s}^{\mu}=5-\phi$ is positive and unbounded.  

% {\color{red}Do we need to prove Proposition 8?}

% \rui{It seems to be straightforward, as we allow mixed-strategy NEs at each history. The compactness of the set of NEs for a normal-form game guarantees the existence of the SWNE. Dave: ok, but let's state that explicitly since we imply that all omitted proofs are in the appx}

% \gabrieltodo{Fix refs.}

\paragraph{Proof of Theorem 9.}
The conclusions (i) and (ii) are straightforward by the encoding procedure. The sets of feasible solutions to (4) and (5) are not empty, as (mixed-strategy) NEs of a normal-form game always exist \citep{JN:51}, and thus so do CEs. Additionally, they are compact by noting the constraints (1), (2) and (3). Then, the conclusions (iii) and (iv) follow from the continuity of the objective function.


\paragraph{Proof of Theorem 10.}
In Algorithm 2, step $1$ returns a feasible solution to the nonlinear program (4) or (5) (depending on the equilibrium type $\equilibrium$). Since the variables of the nonlinear program $P$ (step $5$) are independent of the frozen variables due to the game tree structure and the history selection (or region construction), the pair $(\mu,V)$ in steps $7$ and $8$ is still a feasible solution to (4) or (5). The conclusions follow from the coordinate descent optimization with constraints \citep{SJW:15}.


\section{Further Details for Algorithms}\label{sec:appendix-b}

\subsection{Approximation Algorithms}

FSI is described in Sec.~4 and is summarised as 
Algorithm~2. In Fig.~\ref{fig:FSI-example}, we give an illustration of the approach:
FSI freezes all variables related to the red histories and optimizes over the blue history, where each node contains the current equilibrium payoff.

\begin{figure}[!h]
    \centering
\begin{tikzpicture}[level distance=9mm]
\tikzstyle{every node}=[fill=red!50,circle,inner sep=1pt]
\tikzstyle{level 1}=[sibling distance=20mm,
set style={{every node}+=[fill=red!40]}]
\tikzstyle{level 2}=[sibling distance=5mm,
set style={{every node}+=[fill=red!20]}]
\node [fill=blue!50] {8}
child {node {4}
child {node {3}}
child {node {2}}
child {node {2}}
child {node {1}}
}
child {node {6}
child {node {3}}
child {node {5}}
child {node {4}}
child {node {2}}
}
child {node [fill=blue!45] {6} edge from parent[color=blue!60,very thick]
child[black] {node {2} edge from parent[color=black,thin]}
child[black] {node [fill=blue!30] {3} edge from parent[color=blue!60,very thick] }
child[black] {node {2} edge from parent[color=black,thin]}
child[black] {node {1} edge from parent[color=black,thin]}
}
child {node {3}
child {node {2}}
child {node {3}}
child {node {3}}
child {node {1}}
};
\end{tikzpicture}
\caption{An example for Frozen Subgame Improvement.}
\label{fig:FSI-example}
\end{figure}

We also suggest an alternative approach for the selection of histories in FSI, shown in \algoref{alg:find-history}. It returns a history by starting from the initial state $s$ and repeatedly moving to a successor until stage $K-1$ is reached: with probability $1-\epsilon$, it moves to the successor with the maximum social welfare indicated by the current equilibrium payoff $V$ (if there are multiple such successors, we select one randomly), and otherwise it moves to a successor sampled uniformly at random, where $\mathsf{UNIFORM}(\cdot)$ is a uniform sampling function.

\setcounter{algorithm}{2}

\begin{algorithm}[h]
	\caption{Finding a History by Maximum Social Welfare}
	\textbf{Input:} histories $H_s$, distribution $\mu$, equilibrium payoff $V$, exploration rate $\epsilon\in[0,1]$
	
	\textbf{Output:} a history $h\in H_s^{K-1}$
	\begin{algorithmic}[1]
	\State $h\gets s$
	\Repeat
	    \State $h'\gets\arg\max_{h''\in\textup{Succ}(h)}\sum_{i\in N}V^{h''}_i$
	    \If{$\mathsf{UNIFORM}([0,1])>\epsilon$}
	    \State $h \gets h'$
	    \Else
	    \State $h\gets \mathsf{UNIFORM}(\textup{Succ}(h))$
	    \EndIf
	\Until{$h\in H_s^{K-1}$}
	\State \Return $h$
	\end{algorithmic}
	\label{alg:find-history}
\end{algorithm}

\section{Further Details for Case Studies}\label{sec:appendix-c}

\subsection{Automated Parking}

The formal details of the NS-CSG model for the automated parking case study are as follows.
%
There are two players (vehicles) $\{ \agent_i \}_{i \in N}$ for $N=\{1,2\}$ and two parking slots $M=\{1,2\}$ in a $5\times 4$  grid $C$. The coordinate of the cell in the $i$th row and $j$th column is denoted by $(i,j)$. Thus, $C=\{(i,j)\,|\,i\in[5],j\in[4]\}$. The coordinates of two parking slots are $y_1=(2,4)$ and $y_2=(5,1)$. Fig.~3 shows the grid. Vehicles are forbidden to enter the red cells and have to follow the traffic rules indicated by black arrows.

The environment state is $s_E=(x_1,x_2)$, where $x_i\in C$ is vehicle $i$'s coordinate. Each agent $i\in N$ is as follows:

\begin{itemize}
    \item a state of agent $\agent_i$ is $s_i=(loc_i,(x_{1},x_{2}))$, where the local state $loc_i$ is dummy, and the coordinates $x_{k}\in C$ ($k\in N$) of two vehicles constitute the percept;
    
    \item actions include four directions $\textup{U}=(0,1)$, $\textup{D}=(0,-1)$, $\textup{L}=(-1,0)$, and $\textup{R}=(1,0)$. We assume that $\agent_1$ is twice as fast as $\agent_2$, i.e., $A_2=\{\textup{U}, \textup{D}, \textup{L}, \textup{R}\}$ and $A_1=(A_2\times A_2)\setminus\{ \textup{UD}, \textup{DU}, \textup{LR}, \textup{RL}\}$;  %\gabrieltodo{not quite. it does not make sense to have UD, DU, LR, RL.}
    
    % \item let $\Delta_i(s_i)=\bar{A}_i$ for any $s_i\in S_i$;
    
    \item the available action function is such that $a_i \in \Delta_i(s_i)$ iff taking action $a_i$ at $s_i$ does not break the traffic rules or enter a red cell; 
    
    \item observation function $obs_i$ computes the cells where two vehicles are, i.e., $obs_i(s_1,s_2, s_E)=(x_{1},x_{2})$;
    
    \item the local transition function $\delta_i$ is dummy. 
    
    
    %two local transition functions $tr_i^1$ and $tr_i^2$ are selected with probabilities $0.8$ and $0.2$ respectively, where $tr_i^1(s_i,\alpha)=\hat{y}_i'$ with $\hat{y}_i'$ as the closest parking slot to $x_i$ and $tr_i^2(s_i,\alpha)=\hat{y}_i'$ with $\hat{y}_i'$ as the second closest parking slot to $x_i$. If there are many candidates for $\hat{y}_i'$, then they will be selected equally.
    
    
    % Thanks If $a_i=\perp$, then $\delta_i(s_i,\alpha)(\hat{y}_i)=1$ and $\delta_i(s_i,\alpha)(\hat{y}_i')=0$ for $\hat{y}_i'\neq \hat{y}_i$.
    % \begin{equation*}
    %  \delta_i(s_i,\alpha)(\hat{y}_i')= \frac{\textup{exp}(\beta_i U_{\hat{y}_i'})}{\sum_{\ell\in\bar{C}}\textup{exp}(\beta_i U_{\ell})}, 
    % \end{equation*}
    % where $U_{l}\in\mathbb{R}$ is the utility of vehicle $i$ selecting goal parking slot $\ell$ given local state $s_i$ and joint action $\alpha$, and the noise is measure by the rationality parameter $\beta_i\in\mathbb{R}_{\ge0}$. 
    
\end{itemize}

For $\alpha=(a_1,a_2)\in A_1 \times A_2$, $\delta_E(s_E,\alpha)=(x_1',x_2')$ where $x_i'=x_i+a_i$ for all $i\in N$.
The two vehicles start from $x^0_1=(3,1)$ and $x^0_2=(2,2)$.

There are two reward structures. The first one is plain time minimizing: $r_i^A(s,\alpha)=0$; if $x_1=x_2$, then $r_i^S(s)=-20$; if $x_1\neq x_2$ and $x_i=y_j$ for some $j\in M$, then $r_i^S(s)=0$; $r_i^S(s)=-1$ otherwise. The second one is time minimizing with bonus, in which we add a bonus of $5.5$ to agent $2$ at a designated cell (in yellow): $r^S_2(s)=5.5-1=4.5$ if $x_2=(1,2)$ when $k\leq1$. 

% \begin{itemize}
%     \item if $
%     \bar{x}_i\neq \bar{y}_j$ for all $j\in M$: $r_i^S(s_i)=-1$, and if there exists a $j\in M$ such that $\bar{x}_{i}+a_i=\bar{y}_{j}$ and $\bar{x}_k+a_k\neq\bar{y}_j$ for all $k\neq i$, then $r_i^A(s_i,\alpha)=10$; if $\bar{x}_i+a_i\neq\bar{y}_j$ for all $j\in M$ and $\bar{x}_i+a_i\neq \bar{x}_k+a_k$ for all $k\neq i$, then $r_i^A(s_i,\alpha)=0$; if there exists a $k\in N$ such that $\bar{x}_i+a_i= \bar{x}_k+a_k$, then $r_i^A(s_i,\alpha)=-20$;
    
%     \item if $
%     \bar{x}_i=\bar{y}_j$ for some $j\in M$: if $\bar{x}_i\neq \bar{x}_k$ for all $k\neq i$, then $r_i^S(s_i)=10$ and $r_i^A(s_i,\alpha)=0$; otherwise, $r_i^S(s_i)=-20$ and $r_i^A(s_i,\alpha)=0$.
% \end{itemize}

This example was modelled using the PRISM-games modelling language, since the simplicity of the perception mechanism lets it be reduced to a discrete-state CSG.
%
% Table~\ref{tab:parking} shows statistics for the models constructed and the time for their verification.



\subsection{Two-Agent Aircraft Collision Avoidance Scenario}

 In the VCAS[2] system (Figure 1) there are
 two aircraft (ownship and intruder, denoted by $\agent_i$ for $ i\in \{\textup{own}, \textup{int} \}$), each of which is equipped with an NN-controlled collision avoidance system called VCAS. Each second, VCAS issues an advisory ($ad_i$) from which, together with the current trust in the previous advisory ($tr_i$), the pilot needs to make a decision about accelerations, aiming at avoiding a near mid-air collision (NMAC), a region where two aircraft are separated by less than $100$ ft vertically and $500$ ft horizontally. 

 The environment state $\smash{s_E=(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},t)}$ records the altitude $h$ of the intruder relative to the ownship (ft), the vertical climb rate $\dot{h}_{\textup{own}}$ of the ownship (ft/sec), the vertical climb rate $\dot{h}_{\textup{int}}$ of the intruder (ft/sec), and the time $t$ until loss of horizontal separation of the two aircraft (sec).
 
 Each aircraft is endowed with a perception function implemented via a feed-forward NN $f_{ad_i}:\mathbb{R}^4\to\mathbb{R}^9$ with four inputs, seven hidden layers of 45 nodes and nine outputs representing the score of each possible advisory. There are nine NNs $F=\{ f_i:\mathbb{R}^4\to\mathbb{R}^9  \,|\, i \in [9] \}$, each of which corresponds to an advisory.

% \marta{I don't understand how two actions are provided by advisory?}
% \ruitodo{They have a table summarising which two actions should be provided for a given advisory. For example, if the advisory is "DNC: Do Not Climb" coded as $ad=2$, then the table gives two available actions $\{-9.33 ft/s^2,-7.33 ft/s^2\}$.}
% \marta{I am asking because the transition function only has one action, so is the table part of the model?}
% \ruitodo{Yes, they defined a protocol function for each agent to reflect the table. So I was wondering if we also need this protocol function. I will add it in case we need.}
% \marta{thanks, this is helpful}\marta{We should check if we can avoid prot by using available action mapping}

% \ruitodo{I agree. I have replaced $prot_i$ with $\Delta_i$, which is called available action mapping, aligned with the action assignment function $\Delta$}

Each advisory will provide two non-zero acceleration actions for the agent to select from, except that the agent is also allowed to adopt zero acceleration.
 %
 The trust in the previous advisory and the previous advisory itself (percept) are stored in a state of the agent $s_i=(tr_i, ad_i)$. There are four trust levels $\{4, 3, 2, 1\}$ and nine possible advisories \citep{MEA-EB-PK-AL:20-2}.
 The current advisory is computed 
 from the previous advisory $ad_i$ and environment state $s_E$ using the observation function $obs_i$.
 %
 The trust level is increased probabilistically if the current advisory is compliant with the executed action, and decreased otherwise.
 
Formally, each agent $\agent_i$ for $i\in \{\textup{own}, \textup{int} \}$ and the environment $E$ are defined as follows:
\begin{itemize}
    \item $s_i=(tr_i, ad_i)$ is a state of the agent $\agent_i$ with local state $tr_i{\in}[4]$ and percept $ad_i{\in}[9]$;
    
    \item the set of environment states is $S_E=[-3000,3000]\times[-2500,2500]\times[-2500,2500]\times[0,40]$, with $s_E=(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},t)$ as above;
    
    \item $A_i=\{0,\pm3.0, \pm 7.33, \pm 9.33, \pm 9.7, \pm11.7\}$, where $a_i \in A_i$ is an acceleration $\ddot{h}_i$;  
    
    \item the available action function $\Delta_i$ returns two non-zero acceleration actions \citep{MEA-EB-PK-AL:20} shown in Table \ref{tab:advisory} given a state of the agent, plus zero acceleration;
    
    \item observation function $obs_i$, implemented via $F$, is given by $ad_{i}'=obs_{i}(ad_{i},s_E)$, where $obs_{\textup{own}}(ad_{\textup{own}},s_E)=\textup{argmax}(f_{ad_{\textup{own}}}(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}},t))$ and $obs_{\textup{int}}(ad_{\textup{int}},s_E)=\textup{argmax}(f_{ad_{\textup{int}}}(-h,\dot{h}_{\textup{int}},\dot{h}_{\textup{own}},t))$;
    
    \item the local transition function $\delta_i$ computes a trust level according to the current trust level $tr_i$, the updated advisory $ad_i'$ and the executed action $a_i$, for a given $\epsilon_i \in [0,1]$: if $a_i$ is compliant with $ad_i'$ (i.e., $a_i$ is non-zero), then $tr_i'=tr_i+1$ with probability $1-\epsilon_i$ and $tr_i'=tr_i$ with probability $\epsilon_i$ when $tr_i\leq3$, and $tr_i'=tr_i$ when $tr_i=4$; otherwise, $tr_i'=tr_i-1$ with probability $1-\epsilon_i$ and $tr_i'=tr_i$ with probability $\epsilon_i$ when $tr_i\ge2$, and $tr_i'=tr_i$ when $tr_i=1$;
    
    \item the environment transition function $\delta_E(s_E,\alpha)$ is defined as: $h'=h-\Delta t(\dot{h}_{\textup{own}}-\dot{h}_{\textup{int}})-0.5\Delta t^2(\ddot{h}_{\textup{own}}-\ddot{h}_{\textup{int}})$, $\dot{h}_{\textup{own}}'=\dot{h}_{\textup{own}}+\ddot{h}_{\textup{own}}\Delta t$, $\dot{h}_{\textup{int}}'=\dot{h}_{\textup{int}}+\ddot{h}_{\textup{int}}\Delta t$ and $ t'= t-\Delta t$, where $\Delta t=1$ is the time step. 
    
    % then $tr_i'=tr_i+1$ with probability $1-\epsilon_i$ and $tr_i'=tr_i$ with probability $\epsilon_i$ if $tr_i\leq3$, and $tr_i'=tr_i$ if $tr_i=4$; otherwise, then $tr_i'=tr_i-1$ with probability $1-\epsilon_i$ and $tr_i'=tr_i$ with probability $\epsilon_i$ if $tr_i\ge2$, and $tr_i'=tr_i$ if $tr_i=1$, where $\epsilon_i \in [0,1]$.

    % \item the local transition function $\delta_i$ computes a trust level according to the current trust level $tr_i$, the updated advisory $ad_i'$ and the executed action $a_i$: if $a_i$ is compliant with $ad_i'$ (i.e., $a_i$ is non-zero), then $tr_i'=tr_i+1$ with probability $\epsilon$ if $tr_i\leq3$ and $tr_i'=tr_i$ if $tr_i=4$; otherwise, $tr_i'=tr_i-1$ if $tr_i\ge2$ and $tr_i'=tr_i$ if $tr_i=1$.
\end{itemize}

% The environment $E$ is modelled as follows:
% \begin{itemize}
%     \item the set of environment states is $S_E=[-3000,3000]\times[-2500,2500]\times[-2500,2500]\times[0,40]$, with $s_E=(h,\dot{h}_{\textup{own}},\dot{h}_{\textup{int}}, t)$ as above;
%     \item a single idle action $A_E=\{\perp\}$ and thus the available action function is $\Delta_E:S_E\to \{\perp\}$;
% %\marta{Why is this separate from the rest of environment?}
% \item the environment transition function $\delta_E(s_E,\alpha)$ is defined as: $h'=h-\Delta t(\dot{h}_{\textup{own}}-\dot{h}_{\textup{int}})-0.5\Delta t^2(\ddot{h}_{\textup{own}}-\ddot{h}_{\textup{int}})$, $\dot{h}_{\textup{own}}'=\dot{h}_{\textup{own}}+\ddot{h}_{\textup{own}}\Delta t$, $\dot{h}_{\textup{int}}'=\dot{h}_{\textup{int}}+\ddot{h}_{\textup{int}}\Delta t$ and $ t'= t-\Delta t$, where $\Delta t=1$ is the time step. 
% \end{itemize}

% The NS-CSG model used in the VCAS[2] case study was described in Example~\ref{vcas-example}.

% The acceleration actions available to the agent for each advisory \cite{MEA-EB-PK-AL:20} are shown in Table \ref{tab:advisory}.
%

\setcounter{table}{2}

\input{figures/tex/advisory_table}
%
When computing the equilibria presented in Fig.~4, we use two reward structures, with the first given by $r^S_{\textup{own}}(s) = r^S_{\textup{int}}(s) = h$ if $k = t_{\textup{init}} - t$, and 0 otherwise. For the zero-sum case, the reward for the intruder is negated. In both cases, action rewards are set to 0 for all state-action pairs, i.e., $r^A_{\textup{own}}(s, \alpha) = r^A_{\textup{int}}(s, \alpha) = 0$ for all $s \in S$ and $\alpha \in A$.

This case study was developed by extending the implementation available in \citep{MEA-EB-PK-AL:20-files}. We first modified the original code in order to consider all actions recommended by the advisory system plus the action corresponding to zero acceleration. We then developed this model further by adding trust values to the states of the agents and the corresponding probabilistic updates as described in Section~2. In both cases, we built a game tree by considering all states the system could be in and translated it into a PRISM-games model.

% \input{figures/tex/vcas_strategy}

We also consider another reward structure with additional preferences: (i) not only safety but also trust matters; and (ii) reducing fuel consumption is desired in addition to maintaining safety.  %on the top of a surplus safety. 
More specifically, if $|h|\leq 200$, then $r_{i}^A(s, \alpha)=0$ and $r_{i}^S(s)=|h|/h_{\max}+tr_{i}/4$; if $|h| > 200$, then $r_{i}^A(s, \alpha) = -|\ddot{h}_{i}| / \ddot{h}_{\max}$ and $r_{i}^S(s)=0$ for $i \in \{\textup{own}, \textup{int}\}$, where $h_{\max}$ and $\ddot{h}_{\max}$ are the maximal absolute values of all altitudes and accelerations in the generated game tree, respectively. The initial values are $h=50$, $\dot{h}_{\textup{own}}=-5$, $tr_{\textup{own}}=4$, $\dot{h}_{\textup{int}}=5$ and $tr_{\textup{int}}=4$.

% Table \ref{tab:VCAS} shows that the model construction is much more costly than BI, and the verification takes less time than either of these due to short horizon ($k= t$).

% In Section~5 we discussed equilibria strategies for different values of the uncertainty parameter $\epsilon_{\textup{own}}$. Fig.~\ref{fig:vcas-strategy} (left) illustrates that following the advisories is the best strategy when safety and trust are the priority. The strategy in Fig.~\ref{fig:vcas-strategy} (right) shows that violations may occur (i.e., zero action is adopted after $s^2$) when the altitude is expected to reach a safer level in the future.



% \ruitodo{I only add strategies and states for the two most interesting cases. The others can be attached in the appendix if necessary.}
% \martatodo{Looks good - can you say what is interesting about these strategies?}

% \ruitodo{The first case shows that following the advisories is the best strategy when the safety and trust are priorities. The second case shows that violations probably occur (i.e., zero action is adopted like the strategy after $s^2$) when the altitude can reach more than a quite safe level.}

% \input{figures/tex/vcas_table}

\bibliography{references}

% \input{figures/tex/vcas_strategy_table}

\end{document}