% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b>, then <sep>, then <a>.
% The optional <sep> defaults to "-". (Example macro kept from the template.)
\newcommand\swap[3][-]{#3#1#2}

% \title{Instructions for Authors: Title in Title Case}

% % The standard author block has changed for UAI 2022 to provide
% % more space for long author lists and allow for complex affiliations
% %
% % All author information is authomatically removed by the class for the
% % anonymous submission version of your paper, so you can already add your
% % information below.
% %
% % Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2022 paper}{Jane~J.~von~O'L\'opez}{}}
% \author[1]{Harry~Q.~Bovik}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% % Add affiliations after the authors
% \affil[1]{%
%     Computer Science Dept.\\
%     Cranberry University\\
%     Pittsburgh, Pennsylvania, USA
% }
% \affil[2]{%
%     Second Affiliation\\
%     Address\\
%     …
% }
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
%   \begin{document}
% \maketitle

% \begin{abstract}
%   This is the abstract for this article.
%   It should give a self-contained single-paragraph summary of the article's contents, including context, results, and conclusions.
%   Avoid citations; but if you do, you must give essentially the whole reference.
%   For example: This whole paper is devoted to praising É. Š. Åland von Vèreweg's most recent book (“Utopia's government formation problems during the last millenium”, Springevier Publishers, 2016).
%   Also, do not put mathematical notation and abbreviations in your abstract; be descriptive.
%   So not “we solve \(x^2+A xy+y^2\), where \(A\) is an RV”, but “we solve quadratic equations in two unknowns in which a single coefficient is a random variable”.
%   The reason is that mathematical notation will not display correctly when the abstract is reused on the proceedings website, for example, and that one should not assume the abstract's reader knows the abbreviation.
%   Of course the same remarks hold for your paper's title.
% \end{abstract}

% \section{Introduction}\label{sec:intro}
% UAI 2022 papers have to be prepared using \LaTeX.
% To start writing your paper, copy \texttt{uai2022-template.tex} and replace title, authorship, and content with your own.

% The UAI 2022 paper style is based on a custom \textsf{uai2022} class.
% The class file sets the page geometry and visual style.\footnote{%
%     The class uses the packages \textsf{adjustbox}, \textsf{environ}, \textsf{letltxmacro}, \textsf{geometry}, \textsf{footmisc}, \textsf{caption}, \textsf{textcase}, \textsf{titlesec}, \textsf{titling}, \textsf{authblk}, \textsf{enumitem}, \textsf{microtype}, \textsf{lastpage}, and \textsf{kvoptions}.
% }
% The class file also loads basic text fonts.\footnote{%
%     Fonts loaded are \textsf{times} (roman), \textsf{helvet} (sanserif), \textsf{courier} (fixed-width), and \textsf{textcomp} (common symbols).
% }
% \emph{You may not modify the geometry or style in any way, for example, to squeeze out a little bit of extra space.}
% (Also do not use \verb|\vspace| for this.)
% Feel free to use convenience functionality of loaded packages such as \textsf{enumitem}.
% The class enables hyperlinking by loading the \textsf{hyperref} package.

% You are free to load any packages available in \TeX{Live}~2020 that are compatible with the UAI class.\footnote{In case this template or your submission does not compile, always first make sure your \TeX\ installation is up-to-date.}
% (Mik\TeX{} and Mac\TeX{} generally contain the same packages.)
% Do not load conflicting packages—you will get an error message—, as this complicates creating the proceedings.
% Please avoid using obsolete commands, such as \verb|\rm|, and obsolete packages, such as \textsf{epsfig}.\footnote{%
%     See \url{https://ctan.org/pkg/l2tabu}.
% }

% \swap[ ]{in the header of your source file.}{Feel free to include your own macros}

% \section{General Formatting Instructions}
% As a general rule: \emph{follow the template}.

% \subsection{Authorship}
% Reviewing is double-blind.
% However, you can already fill in your author names and affiliations in the \verb|\author| block in the preamble following the example of the template because the class will remove it as long as the option \textsf{accepted} is not passed to the class.
% Nevertheless, make sure any other information in the paper does not disclose your identity, for example URLs to supplementary material.

% \subsection{Sectioning}
% Three numbered sectioning commands are provided: \verb|\section|, \verb|\subsection|, and \verb|\subsubsection|.
% Please respect their order, so do not put a \verb|\subsubsection| directly beneath a \verb|\section|.
% One unnumbered sectioning command is provided, \verb|\paragraph|.
% It can be used directly below any numbered section level.
% Do not use any other sectioning commands.

% \subsubsection{Typing the Section Titles}
% The \verb|\section| and \verb|\subsection| titles are uppercased by the class.
% Please type them in title case.
% (This is used in the PDF bookmarks.)
% Please also write the \verb|\subsubsection| titles in title case.

% \paragraph{What is title case?}
% \href{https://en.wikipedia.org/wiki/Title_case}{Wikipedia} explains:
% \begin{quote}
%     Title case or headline case is a style of capitalization used for rendering the titles of published works or works of art in English.
%     When using title case, all words are capitalized except for ‘minor’ words (typically articles, short prepositions, and some conjunctions) unless they are the first or last word of the title.
% \end{quote}

% \subsection{References, Citations, Footnotes}\label{sec:etc}
% \subsubsection{Cross-Referencing}
% Always use \verb|\label| and \verb|\ref|—or a command with a similar effect—when cross-referencing.
% For example, this subsection is Section~\ref{sec:etc}.

% \subsubsection{Citations}
% Citations should include the author's last name and year.
% They should be part of the sentence.
% An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
% An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
% Do not use a parenthetical citation where a textual one is appropriate.
% An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

% We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
% The reference style you use should be compatible with the author-year citations.
% Both the citation style and reference style used should be consistent.

% For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
% For example, writing
% “I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
% Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

% \subsubsection{Footnotes}
% You can include footnotes in your text.\footnote{
%     Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
% }
% The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
%     A footnote is material put at the foot of a page.
% }
% for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.

% \section{Math}\label{sec:math}
% The class file does not load any math support package like \textsf{amsmath}\footnote{%
%   See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
% }.
% We advise using the \textsf{mathtools}\footnote{%
%   See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
% }
% package, which extends \textsf{amsmath} with fixes and even more useful commands.
% Feel free to load other support packages for symbols, theorems, etc.

% Use the \textsf{amsmath} environments for displayed equations.
% So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
% An \texttt{equation}:
% \begin{equation}\label{eq:example}
%   0 = 1 - 1.
% \end{equation}
% Two \texttt{align}'ed equations:
% \begin{align*} % no numbers with starred version
%   1 + 2 &= 3,\\
%   1 - 2 &= -1.
% \end{align*}
% Equations can also be put inline, of course.
% For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
% (Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

% The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.

% \section{Floats}\label{sec:floats}
% Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
% Please do not force them to go in the middle of a paragraph.
% They must respect the column width.

% Two-column floats are possible.
% They appear at the top of the next page, so strategic placement may be necessary.
% For an example, see Figure~\ref{fig:tikz}.
% They may not enter the margins.
% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% All material in floats should be legible and of good quality.
% So avoid very small or large text and pixelated or fuzzy lines.

% \subsection{Figures}\label{sec:figures}
% Figures should go in the \texttt{figure} environment and be centered therein.
% The caption should go below the figure.
% Use \verb|\includegraphics| for external graphics files but omit the file extension.
% Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
% Do not use \verb|\epsfig| or \verb|\psfig|.
% If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
% For example, see Figure~\ref{fig:toronto}.
% \begin{figure}
%   \centering
%   \includegraphics[width=0.7\linewidth,page=3]{toronto}
%   \caption{A View of a Nice City.}\label{fig:toronto}
% \end{figure}

% Do not use \verb|\graphicspath|.
% If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

% \subsection{Tables}\label{sec:tables}
% Tables should go in the \texttt{table} environment and be centered therein.
% The caption should go above the table and be in title caps.
% For an example, see Table~\ref{tab:data}.
% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \subsection{Algorithms}\label{sec:algorithms}
% You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
% Use the environment defined in the package to create a centered float with an algorithm inside.

% \section{Back Matter}
% There are a some final, special sections that come at the back of the paper, in the following order:
% \begin{itemize}
%   \item Author Contributions
%   \item Acknowledgements
%   \item References
% \end{itemize}
% They all use an unnumbered \verb|\subsubsection|.

% For the first two special environments are provided.
% (These sections are automatically removed for the anonymous submission version of your paper.)
% The third is the ‘References’ section.
% (See below.)

% (This ‘Back Matter’ section itself should not be included in your paper.)

% \begin{contributions} % will be removed in pdf for initial submission,
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions.
%     This is a nice way of making clear who did what and to give proper credit.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \begin{acknowledgements} % will be removed in pdf for initial submission,
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% \bibliography{uai2022-template}

% \appendix
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% \section{Math font exposition}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

% \end{document}
\usepackage{dsfont}
\usepackage{times}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{bm}
\usepackage{xcolor}
\usepackage{amssymb}
\usepackage{xparse}

% Theorem-like environments (amsthm).
% Each environment owns an independent counter that is numbered within
% \section, i.e. printed as <section>.<number>.
\newtheorem{theorem}{Theorem}[section]
\newtheorem{Proposition}{Proposition}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{coroll}{Corollary}[section]
\newtheorem{assumption}{Assumption}[section]
\newtheorem{remark}{Remark}[section]
\newtheorem{exmp}{Example}[section]
% A `proof' theorem is intentionally left disabled: amsthm already provides
% a proof environment.
% \newtheorem{proof}{Proof}[section]
% Named colours (xcolor). The `rgb' model takes components in [0,1],
% the `RGB' model takes integer components in 0..255.
\definecolor{applegreen}{rgb}{0.55, 0.71, 0.0}
\definecolor{flame}{rgb}{0.89, 0.35, 0.13}
\definecolor{blued}{RGB}{70,197,221}
% --- Probabilistic operators -------------------------------------------------
% Starred form: sub-/superscripts are set as limits in display style.
\DeclareMathOperator*{\EX}{\mathbb{E}}               % expectation
\DeclareMathOperator*{\Var}{\mathbb{V}\mathrm{ar}}   % variance
\DeclareMathOperator*{\Cov}{\mathbb{C}\mathrm{ov}}   % covariance

% --- Optimisation operators --------------------------------------------------
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\arginf}{arg\,inf}
\DeclareMathOperator*{\argsup}{arg\,sup}
\DeclareMathOperator*{\esssup}{ess\,sup}

% --- Miscellaneous -----------------------------------------------------------
% Upright word "Regret" for use inside formulas.
\newcommand{\Regret}{\text{Regret}}
% \indi{<event>}: indicator of <event>, set as a double-struck 1 followed by
% auto-sized braces around <event>.
\newcommand{\indi}[1]{\mathds{1}\left\{#1\right\}}
% Big Cartesian-product operator.
% NOTE(review): \bigtimes is not defined by any package loaded in this
% preamble as far as can be seen here -- confirm before using \cartesian.
\DeclareMathOperator*{\cartesian}{\bigtimes}

% Abbreviation macros.
%   \xspace  adds the following interword space unless punctuation comes next.
%   \@       resets the space factor, so the space after the abbreviation's
%            final punctuation is a normal interword space rather than a
%            sentence-ending one.
% xspace is loaded here because every macro below expands to \xspace and the
% package is not loaded anywhere else in the preamble; without it the macros
% stop with "Undefined control sequence" on first use.
\usepackage{xspace}
\DeclareRobustCommand{\eg}{e.g.,\@\xspace}
\DeclareRobustCommand{\ie}{i.e.,\@\xspace}
\DeclareRobustCommand{\wrt}{w.r.t.\@\xspace}
% NOTE(review): this redefines the kernel math symbol \wp (Weierstrass p);
% the symbol is no longer available under that name after this line.
\DeclareRobustCommand{\wp}{w.p.\@\xspace}
 \usepackage{thm-restate}
% thmtools environments, numbered within \section.
% NOTE(review): `thm' prints "Theorem" and `prp' prints "Proposition" with
% counters separate from the `theorem' / `Proposition' environments declared
% with \newtheorem earlier in this preamble -- confirm that only one family
% is used in the body, otherwise numbers can repeat.
\declaretheorem[numberwithin=section,name=Theorem]{thm}
\declaretheorem[numberwithin=section,name=Proposition]{prp}
% \quotes{<text>}: <text> between typographic double quotes.
\DeclareRobustCommand{\quotes}[1]{``#1''}

% \norm{<x>}: auto-sized double-bar norm of <x>.
\newcommand{\norm}[1]{\left\|#1\right\|}

% \bmr{<x>}: bold upright <x>.
\newcommand{\bmr}[1]{\bm{\mathrm{#1}}}

% Calligraphic S, calligraphic A, and their product.
\newcommand{\Ss}{\mathcal{S}}
\newcommand{\As}{\mathcal{A}}
\newcommand{\SAs}{\mathcal{S} \times \mathcal{A}}
% \newcommand{\gr}[1]{\todo[color=blued, inline]{\small {\bf Giorgia} #1}}
% \Nat[<k>] / \Reals[<k>]: blackboard-bold N / R. When the optional argument
% is given, the subscript "\ge <k>" is attached; when it is omitted or empty,
% no subscript text is produced.
% The emptiness test uses \detokenize (e-TeX) instead of \ifthenelse/\isempty:
% \isempty comes from the xifthen package, which this preamble never loads, so
% the previous definitions failed with "Undefined control sequence" on use.
\newcommand{\Nat}[1][]{\mathbb{N}_{\if\relax\detokenize{#1}\relax\else\ge #1\fi}}
\newcommand{\Reals}[1][]{\mathbb{R}_{\if\relax\detokenize{#1}\relax\else\ge #1\fi}}

% Essential infimum; starred so that subscripts are set as limits in display style.
\DeclareMathOperator*{\essinf}{ess\,inf}

% Single-letter operators (unstarred: subscripts stay at the side).
\DeclareMathOperator{\Ham}{\mathcal{H}}% calligraphic H, labelled "hamiltonian" by the authors
\DeclareMathOperator{\Hes}{\mathcal{J}}% calligraphic J, labelled "hessian" by the authors -- NOTE(review): confirm the intended meaning
\DeclareMathOperator{\A}{A}% upright operator "A" (the earlier "hessian" label here was a copy-paste slip)
% --- Bold symbols ------------------------------------------------------------
% \mathbr{<x>}: bold <x>; base macro for the matrix/vector shortcuts below.
\newcommand{\mathbr}[1]{\bm{\mathbf{#1}}}

% Bold Latin letters.
\newcommand{\vW}{\mathbr{W}}
\newcommand{\vM}{\mathbr{M}}
\newcommand{\vQ}{\mathbr{Q}}
\newcommand{\vA}{\mathbr{A}}
\newcommand{\vB}{\mathbr{B}}
\newcommand{\vx}{\mathbr{x}}
\newcommand{\identity}{\mathbr{I}}
\newcommand{\grads}{\mathbr{\Psi}}

% Bold Greek letters.
\newcommand{\vtheta}{\bm{\theta}}
\newcommand{\vomega}{\bm{\omega}}
\newcommand{\vphi}{\bm{\phi}}
\newcommand{\vvarphi}{\bm{\varphi}}
\newcommand{\vpsi}{\bm{\psi}}
\newcommand{\vmu}{\bm{\mu}}
\newcommand{\vSigma}{\bm{\Sigma}}
\newcommand{\vlambda}{\bm{\lambda}}

% --- Blackboard-bold and calligraphic symbols ---------------------------------
\newcommand{\E}{\mathbb{E}}
\newcommand{\reals}{\mathbb{R}}
\newcommand{\BigO}{\mathcal{O}}
\newcommand{\state}{\mathcal{S}}
\newcommand{\actions}{\mathcal{A}}
\newcommand{\transitions}{\mathcal{P}}
\newcommand{\reward}{\mathcal{R}}

% --- Operators and small helpers ----------------------------------------------
% \nrm{<x>}: auto-sized double-bar norm of <x>.
\newcommand{\nrm}[1]{\left\|#1\right\|}
% Upright "d".
\newcommand{\de}{\mathrm{d}}
% Nabla with subscript \vtheta.
\newcommand{\grad}{\nabla_{\vtheta}}
\newcommand{\traj}{\tau}
\newcommand{\vvec}{\mathrm{vec}}
\newcommand{\old}{\text{old}}

% --- Layout ---------------------------------------------------------------------
% \ra{<factor>}: set \arraystretch to <factor>.
\newcommand{\ra}[1]{\renewcommand{\arraystretch}{#1}}

% --- Names ----------------------------------------------------------------------
% NOTE(review): the introduction calls the proposed algorithm TMG-OPVI;
% confirm that \algname (LOGEL) is still wanted.
\newcommand{\algname}{LOGEL}
% \MDP: calligraphic M. \ensuremath makes the macro usable both in running
% text and inside math mode; the previous hard-coded $...$ toggled math mode
% off (or opened display math) when the macro was used within a formula.
\newcommand{\MDP}{\ensuremath{\mathcal{M}}}
% \footnoteref{<label>}: print a footnote mark that repeats the number of the
% footnote carrying \label{<label>}, without creating a new footnote text.
% The body uses kernel-internal names containing `@', so it must be read
% between \makeatletter and \makeatother; otherwise it is tokenised as
% \protected followed by the characters "@xdef..." and errors on first use.
\makeatletter
\newcommand\footnoteref[1]{\protected@xdef\@thefnmark{\ref{#1}}\@footnotemark}
\makeatother
% Bare indicator symbol (double-struck 1 from dsfont), without an argument.
\newcommand\indicator{\mathds{1}}
% \usepackage[colorinlistoftodos, textwidth=18mm]{todonotes}
% \newcommand{\todomr}[1]{\todo[color=orange!30!yellow!10, inline]{\small MR: #1}}
% \newcommand{\todomrout}[1]{\todo[color=orange!30!yellow!10]{\scriptsize MR: #1}}
% \newcommand{\todogr}[1]{\todo[color=blue!10, inline]{\small GR: #1}}
% \newcommand{\todogrout}[1]{\todo[color=blue!10]{\scriptsize GR: #1}}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros

% Paper title, consumed by \maketitle.
\title{Learning in Markov Games: can we exploit a general-sum opponent?}
% The date is set to a single blank space, i.e. no date text is supplied.
\date{ }
% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2022 paper}{Jane~J.~von~O'L\'opez}{}}
% Authors; the bracketed number refers to the \affil entry with the same index.
\author[1]{Giorgia Ramponi}
\author[2]{Marcello Restelli}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    ETH AI Center\\
    Zurich, Switzerland
}
\affil[2]{%
    Computer Science Dept.\\
    Politecnico di Milano\\
    Milan, Italy
}
% \affil[3]{%
%     Another Affiliation\\
%     Address\\
%     …
%   }
  
  \begin{document}
\maketitle

% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{I use this title instead because the last one was very long}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}






\begin{abstract}%
% In this paper, we study the online learning problem in two-player general-sum Markov Games. We consider the online setting where we control a single player playing against an arbitrary opponent with the goal of minimizing regret. Previous works only consider the zero-sum Markov Games setting, in which the two agents are completely adversarial. However, in some cases, the two agents may have different reward functions without being adversarial. Furthermore, by learning our opponent's reward function, we can exploit this information to increase our performance. This involves a stronger notion of regret with respect to the one used in previous works. 

% We show that the new regret minimization problem is significantly harder than in standard Markov Decision Processes, deriving a lower bound on the expected regret of any ``good'' learning strategy. Then we propose a novel optimistic algorithm that nearly matches the proposed lower bound. Proving these results requires overcoming several new challenges that are not present in Markov Decision Processes or zero-sum Markov Games. 

In this paper, we study the learning problem in two-player general-sum Markov Games. We consider the online setting where we control a single player, playing against an arbitrary opponent to minimize the regret. Previous works only consider the zero-sum Markov Games setting, in which the two agents are completely adversarial. However, in some cases, the two agents may have different reward functions without having conflicting objectives. This involves a stronger notion of regret than the one used in previous works. This class of games, called general-sum Markov Games, is far from being well understood and studied.

We show that the new regret minimization problem is significantly harder than in standard Markov Decision Processes and zero-sum Markov Games. To do this, we derive a lower bound on the expected regret of any ``good'' learning strategy, which shows a constant dependence on the number of deterministic policies that is not present in zero-sum Markov Games and Markov Decision Processes. Then we propose a novel optimistic algorithm that nearly matches the proposed lower bound. Proving these results requires overcoming several new challenges that are not present in Markov Decision Processes or zero-sum Markov Games.

\end{abstract}



\section{Introduction}
Reinforcement Learning (RL) \citep{sutton2018reinforcement} is an area of Machine Learning that studies sequential decision-making problems, where a learning agent interacts with an unknown environment to maximize its rewards. 
In recent years, RL methods have made substantial progress in solving real-world problems (e.g., beating the world champion player of Go \citep{silver2017mastering}, solving real-time strategy games \citep{openai} and Poker \citep{moravvcik2017deepstack,brown2018superhuman}, in autonomous driving \citep{shalev2016safe}, learning communications and emergent behaviours \citep{foerster2016learning,lowe2017multi,bansal2018emergent}, providing solutions to robotic control problems \citep{lillicrap2015continuous}, and managing the power consumption of households \citep{chung2020distributed}).
All of these challenging real-world problems can be framed in a Multi-Agent RL (MARL) context. In MARL, multiple agents act in the same environment to optimize their objectives. However, despite the empirical success of MARL algorithms, the theoretical understanding of MARL is still relatively limited.

The MARL framework is usually modeled as a Markov Game (MG) \citep{shapley1953stochastic}, which is an extension of Markov Decision Processes (MDPs) \citep{sutton1998introduction}. In general, learning in MGs is harder than learning in MDPs. The complications arise from the fact that all agents affect both the transitions and the rewards of the other agents, while the agents can have completely different, even conflicting, objectives. Moreover, the agents, lacking knowledge of the transition model, have to estimate it through interaction, as in single-agent RL problems, but they also need to infer and learn the other agent's policy.
% in order to construct powerful algorithms. 
In the literature \citep{xie2020learning,wei2017online,tian2020provably}, the learning problem in MGs has been divided into two settings: \textit{online} and \textit{offline}. In the \textit{online} setting, the algorithm controls only one agent and aims to maximize its rewards in a multi-agent environment. On the other hand, the \textit{offline} setting aims at providing self-play algorithms, i.e., algorithms that control all the agents, or at least it is assumed that all the agents use the same algorithm. While the \textit{offline} setting has received considerable attention, it fails at modeling many use cases of practical interest. For example, in many robot control problems, artificial agents interact with humans, who are non-controllable agents; or, in card/video games, it is unrealistic for the opponent to use the same learning algorithm as our agent. While the \textit{online} setting is more suitable to model the previous examples, it remains less studied and understood. Moreover, algorithms designed for the \textit{online} setting usually assume that the non-controllable\footnote{By non-controllable we mean that the policy of the second agent cannot be decided by our algorithm.} agent has a conflicting objective, i.e., that the problem we are facing is a zero-sum Markov Game. However, there are many real-world problems where the other agent has a different, but not completely adversarial, objective.

% Learning in Markov Games can be divided into two different settings: when we have control on both agents (\textit{offline} setting), or when we control only one of the two agents (\textit{online} setting).
In this paper, we consider the \textit{online} learning problem in a two-player general-sum turn-based Markov Game. More precisely, we consider the problem of learning in Markov Games where there is one agent that we control and that can observe the interaction between the non-controllable agent and the environment. We are interested in answering the following question, left open in the literature:
\begin{center}
\emph{``Can we design a provably efficient algorithm for Markov Games exploiting a general-sum opponent?''}
\end{center}
This problem was also raised by \citet{xie2020learning} as an interesting open direction. In that paper, the authors present the problem of learning in the presence of a \textit{weak opponent}, i.e., an opponent that is not totally adversarial (as in zero-sum games). As suggested by the authors, in this case, the guarantee involves a stronger notion of regret than the minimax one.


In this paper, we answer this question \textit{partially} in the affirmative. In fact, we show that the regret minimization problem in this context is more complex than in standard MDPs. Our contributions can be summarized as follows:
\begin{enumerate}
    \item First, we introduce a stronger notion of regret for the problem, since the minimax one \citep{xie2020learning, tian2020provably} does not capture the nature of the interaction.
    \item Second, we provide a novel lower bound for the regret minimization problem, which shows that the exploration problem in general-sum MGs is harder than in MDPs and in zero-sum MGs.
    \item Finally, we provide an algorithm, called \textit{Turn-based Markov Game OPtimistic Value Iteration} (TMG-OPVI), which nearly matches the proposed lower bound.
\end{enumerate}

\textbf{Paper outline}
The paper is structured as follows. We start by reviewing some related works (Section~\ref{sec:relatedworks}). Then we introduce the notation (Section~\ref{sec:preliminaries}) and provide a formal introduction to the problem (Section~\ref{sec:problem}). In Section~\ref{sec:lowerbound}, we derive a lower bound on the expected regret of any ``good'' learning strategy that captures the exploration challenges in this context. In particular, the lower bound clearly shows that regret minimization in Stochastic Games is significantly more complex than in standard MDPs. Then, in Section~\ref{sec:algorithm}, we propose an algorithm that nearly matches the proposed lower bound.

\begin{remark}
The considered setting is easier than the simultaneous MG setting and the one in which the agent can change its type over time. We decided to consider this framework for two reasons: 1) it is similar to the online learning zero-sum MG setting considered in the literature \citep{xie2020learning}, and, most importantly, 2) we prove that even in this setting, which is a special case of the simultaneous one and of the one considered by \citet{sessa2020learning}, the learning problem is hard.
\end{remark}
% \todogr{finish introduction}
%This is where the content of your paper goes.
%\begin{itemize}
%  \item Use the \texttt{\textbackslash documentclass[anon,12pt]\{alt2022\}} option during submission process -- this automatically hides the author names listed under \texttt{\textbackslash altauthor}. Do not include author names in the remainder of the text, and to the extent possible, avoid directly identifying the authors. You should still include all relevant references, including your own, and any other relevant discussion, even if this might allow a reviewer to infer the author identities. 
%\item The \textsf{jmlr} class automatically loads \textsf{natbib}
%and automatically sets the bibliography style, so you don't need to
%use \verb|\bibliographystyle|.
%This sample file has the citations defined in the accompanying
%BibTeX file \texttt{jmlr-sample.bib}. For a parenthetical
%citation use \verb|\citep|. For example: ``ALT 2020 proceedings
%\citep{ALT2020}". For a textual citation use
%\verb|\citet|. For example: ``The proceedings were edited by \citet{ALT2020}''.
%Both commands may take a comma-separated list.
%
%These commands have optional arguments and have a starred
%version. See the \textsf{natbib} documentation for further
%details.\footnote{Either \texttt{texdoc natbib} or
%\url{http://www.ctan.org/pkg/natbib}}
%
%\end{itemize}
\section{Related works}\label{sec:relatedworks}

After the introduction of the concept of Markov Games~\citep{shapley1953stochastic}, many RL algorithms were proposed to learn in this setting. However, the theoretical study of this context is quite limited compared to the empirical one (see the surveys by \citealp{zhang2019multi,da2019survey,hernandez2019survey,papoudakis2019dealing}). Only recently, there has been a growing interest in providing algorithms with strong sample-complexity and regret guarantees for the two theoretical MARL settings: \textit{offline} and \textit{online}. In the \textit{online} setting, the algorithm controls only one agent, which has to maximize its own reward function. Instead, in the \textit{offline} setting, the algorithm controls all the agents in the MG.

\textbf{Offline setting}
 Most recent works provide results in the zero-sum offline setting, where both model-free \citep{bai2020near,zhang2020robust} and model-based \citep{bai2020provable,sidford2020solving,li2020exploration,liu2020sharp,zhang2020model} algorithms were proposed with near-optimal sample complexity and regret guarantees. For the model-based setting, the prevalent approach is to assume to have access to a generative model, such as in \citep{sidford2020solving,zhang2020model}, where the authors provide non-asymptotic results on the number of queries to the generator. However, in \citep{liu2020sharp} the authors proposed a model-based algorithm for the zero-sum setting without access to a generative model and which matches the information-theoretic lower bound.  Furthermore, in this recent work, the authors also proposed the first line of provably sample-efficient algorithms for multi-player general-sum games. In \citep{li2020exploration} the authors introduced, instead, an algorithm to learn a Nash Equilibrium in the multi-player general-sum setting. Very recently, the first algorithm to deal with sample complexity in the general-sum games (not Markov) that achieves an $\epsilon$-Stackelberg Equilibrium was introduced \citep{bai2021sample}. In this paper, the authors consider the \textit{bandit feedback} setting i.e., they can see only the random samples of the rewards received by the two players. The authors identify a fundamental gap between the exact value of the Stackelberg equilibrium and its estimated version using finite samples. This result gives insights into the hardness of learning in General-sum games even when the setting is stateless and the algorithm has the control of both the leader and the follower.


\textbf{Online setting}
The online setting is only studied, as far as we know, in the zero-sum setting. The first work that analyzes the problem of online learning in Stochastic Games is \citep{brafman2002r}. In this paper, the authors propose the famous R-MAX algorithm that deals with the zero-sum average-reward setting and provides the first regret bound for the setting. In \cite{wei2017online} the authors provide an algorithm for zero-sum Stochastic games that extends UCRL2, but which works under strong reachability assumptions. This algorithm significantly improves the regret bound of R-MAX. \cite{xie2020learning} propose an algorithm with a ``weak'' regret notion (the minimax defined before), which is compatible with a zero-sum game. This is the first work that considers linear function approximation in Markov Games. The authors analyze both the \textit{offline} and \textit{online} settings and their algorithms achieve near-optimal regret bounds. Instead, \cite{tian2020provably} introduce the online setting with bandit-feedback, called also \textit{agnostic} setting. In this case, the agent cannot observe any interaction between the other agent and the environment. The authors extend the method of \cite{bai2020near} to deal with this setting. In \citep{xie2020learning} the authors leave as open question how to construct an algorithm to achieve optimal regret exploiting a ``weak opponent'', i.e., an opponent who is not totally adversarial (as in zero-sum games). In our work, we give the first solution to this problem.

\textbf{Adversarial MDPs} The adversarial MDP problem is strictly related to the Markov Game setting. Most of the works in this setting consider adversarial rewards~\citep{even2009online,gergely2010online,zimin2013online,rosenberg2019online,jin2020learning,dick2014online},
i.e., the presence of an opponent who can change the received rewards. This setting is substantially different from Markov Games as the opponent can affect only the rewards and not the transitions model. Other works, instead, consider also adversarial transitions \citep{yu2009arbitrarily,neu2012adversarial,lykouris2019corruption}. This setting is quite challenging and the algorithms to solve this problem do not provide a $\mathcal{O}\left(\sqrt{T}\right)$ regret bound. These approaches can be applied in the bandit setting where we cannot see any feedback from the other agent, with the scope of constructing an algorithm robust to these perturbations.

\section{Preliminaries}\label{sec:preliminaries}
In this section, we formally describe the background that will be employed throughout the remainder of the paper.

\subsection{Turn-based Markov Games: background and notation}
We consider the two-player finite-horizon Markov Game setting \citep{shapley1953stochastic, xie2020learning, bai2020near} $MG = (\state, \actions_1, \actions_2, \transitions, \reward_1, \reward_2, \mu, H)$, where $\state$ is the finite state space, $\actions_1$, $\actions_2$ are the finite action spaces respectively for the first and the second player, $\transitions \in \Delta^{\state}_{\state \times \actions_1 \times \actions_2}$ is the transition kernel, $\reward_i: \state \times \actions_1 \times \actions_2 \rightarrow [0,1]$ is the reward function of the $i$-th player, $\mu$ is the initial state distribution and $H$ is the horizon. In a turn-based MG at each state only one player takes an action. The state space $\state$ is partitioned into $\state = \state_1 \cup \state_2$, $\state_1 \cap \state_2 = \emptyset$, where $\state_i$ is the set of states where it is $i$'s turn to play. For each state $s \in \state$, let $I(s) \in \{1, 2\}$ be a function that indicates the current player to play. A stochastic policy for the $i$-th ($i \in \{1,2\}$) player is a sequence of $H$ functions $\pi := (\pi_{h}: \state \rightarrow \Delta_{\actions_i})_{h \in [H]}$. We define as $\pi_1 = (\pi_{1,1}, \dots, \pi_{1,K})$ and $\pi_2 = (\pi_{2,1}, \dots, \pi_{2,K})$ the two sequences of policies that, respectively, are played by our agent (agent $1$) and the other agent (agent $2$). 
% \todogr{add the fact that the reward is a random realization of the reward function etc.}

\paragraph{Value Functions} The value function and action-value function, given policies $\pi_1$ and $\pi_2$, are defined for agent $i \in \{1,2\}$, for each time step $h \in [1, H]$, state $s \in \state$, action $a \in \actions$, as follows:
\begin{equation*}
    V^{\pi_1, \pi_2}_{i,h}(s) = \EX\left[\sum_{t=h}^H \reward_i(s_t, a_t)|s_h = s\right],
\end{equation*}
\begin{equation*}
    Q^{\pi_1, \pi_2}_{i,h}(s,a) = \EX\left[\sum_{t=h}^H \reward_i(s_t, a_t)|s_h = s, a_h=a\right],
\end{equation*}
where $a_t \sim \pi_{I(s_t)}$ and $s_{t+1} \sim \mathcal{P}(\cdot|s_t,a_t)$. Furthermore, we denote with $V^{\pi_1,\pi_2}_1=\EX_{s\sim\mu}[V^{\pi_1, \pi_2}_{1,1}(s)]$ and $V^{\pi_1,\pi_2}_2=\EX_{s\sim\mu}[V^{\pi_1, \pi_2}_{2,1}(s)]$ the expected returns for the two agents.

The interaction between the two agents proceeds in episodes, where at the beginning of each episode the agents decide which policy to play. We indicate with $K$ the number of episodes played by the two agents. The agent observes the states, the actions played by the two agents, and noisy feedback of the agents' reward functions, i.e. $\widetilde{r}_{i,h}$ sampled from a distribution with mean $\reward_i(s_h,a_h)$.







% In this thesis, we consider the two-player finite-horizon (episodic) Stochastic Game setting, $SG = (N, \state, \actions, \transitions, \reward, \gamma, \mu, H)$, described in Chapter \ref{ch:marl}, i.e., an SG such that $N = 2$, $H < \infty$ and $\gamma=1$. As we have introduced in Chapter \ref{ch:marl}, for this setting, we can construct a value-iteration-like algorithm that converges to an Equilibrium policy, under the assumption that the admissible policies are only the Markovian ones. 


%While the \textit{offline} setting has received considerable attention, it fails at modeling many use-cases of practical interest. For example, in many robot control problems the agents interact with humans which are non-controllable agents; or card/video-games, where it is unrealistic that the opponent will use the same learning algorithm. On the other hand while the online setting is more suitable to model previous examples, it remains less studied. 






\section{Problem statement}\label{sec:problem}
In this section, we introduce the online learning problem in Turn-based General-sum Markov Games. We remark that in these games, at each step $h$ the agent $I(s_h)$ has to decide the action $a_h$ to be taken and the two players receive respectively rewards $\reward_1(s_h,a_h)$ and $\reward_2(s_h,a_h)$; then, the system transitions to the next state $s_{h+1} \sim \transitions(\cdot| s_h, a_h)$. 

The algorithm controls only agent $1$. We do not know the policy $\pi_2$ that agent $2$ will play at iteration $k$ as well as the reward function $\reward_2$ that agent $2$ is optimizing. The goal is to learn a sequence of policies $\pi_1=\{\pi_{1,1}, \dots, \pi_{1,K}\}$ that minimizes the total (expected) \textit{regret}, defined by:
\begin{equation}
\EX[\Regret(K)] = \sum_{k=1}^K \left(V^\star_1 - V^{\pi_{1,k}, \pi_{2,k}}_1\right),
\end{equation}
where $V^\star_1$ is the value of the benchmark pair of policies $\pi_1^\star, \pi_2^\star$ against which our algorithm is compared. In the literature, the common way of defining $V^\star_1$ \citep{xie2020learning, wei2017online} is the minimax value, defined as:
\begin{equation}
V^{\text{minimax}} = \min_{\pi_2 \in \Pi_2} \max_{\pi_1 \in \Pi_1} V^{\pi_1, \pi_2}_1.
\end{equation}
The two policies $\pi_1, \pi_2$ correspond to the Nash Equilibrium of the $MG = (\state, \actions_1, \actions_2, \transitions, \reward_1, -\reward_1, \mu, H)$, i.e. a zero-sum Markov Game where the agent $2$ is minimizing agent $1$'s reward function. The minimax benchmark policy is suitable to account for adversarial settings, where the other agent can adversarially change its policy to maximize the regret, or when we are in a zero-sum game. Furthermore, it was shown that in some cases, it is necessary to adopt the minimax benchmark since, otherwise, the regret minimization problem would be too difficult. An example is the \textit{agnostic} setting when agent $1$ cannot observe any information regarding the interaction between agent $2$ and the environment \citep{tian2020provably}.

However, in general, minimax policies do not capture the nature of the general-sum setting where the agent $2$ just wants to maximize its own reward function. In fact, in the latter setting, agent $1$ could hope to perform better when facing a \textit{non-competitive} opponent (i.e., one whose reward is not the opposite of our reward function). A benchmark that better fits the general-sum setting is the Stackelberg Equilibrium of the game:
\begin{equation}
V^{\text{SE}}_1 = \max_{\pi_1 \in \Pi_1} V^{\pi_1, \text{br}(\pi_1)}_1,
\end{equation}
where $\text{br}: \Pi_1 \rightarrow \Pi_2$ corresponds to a function that selects a best response policy for the agent $2$ to each policy of the agent $1$. Although this setting can not be applied in the \textit{agnostic} case, in many real-world scenarios, it is plausible that the agent $1$ can observe the interactions between agent $2$ and the environment. Moreover, in some cases, the agent $1$ can observe also agent $2$ rewards or it can at least recover them (for example, using IRL approaches). In these cases, it is more reasonable to use the more challenging regret notion. 

Since the \textit{uncontrollable} agent is rational, we can reasonably suppose that it plays the best response, as is already done in the literature \citep{balcan2015commitment,peng2019learning,sessa2020learning}. More formally, the agent $2$, given the policy $\pi^i_1 \in \Pi_1$, follows the policy $\pi^{*,i}_{2}$ such that:
\begin{equation*}
\pi^{*,i}_{2} \in \argmax_{\pi_2 \in \Pi_2} V^{\pi^i_1,\pi_2}_2,
\end{equation*}
i.e., it plays its \textit{best response}. This creates an inherently \textit{asymmetrical} interaction: the first agent can be seen as a \textit{leader}, who decides the policy to be played in an episode, and the second agent can be seen as a \textit{follower}, who can see the leader's policy and adapts its response to it. So, as in the game-theory literature \citep{balcan2015commitment,peng2019learning,sessa2020learning}, we make the following assumption:
\begin{assumption}
For every policy $\pi_1 \in \Pi_1$ the second uncontrollable agent will always play the same best response policy $br(\pi_1)$, where $br: \Pi_1 \rightarrow \Pi_2$. Furthermore, $br(\pi_1)$ is deterministic.
\end{assumption}
Under this assumption the goal of our agent is well-defined and consists in finding the policy $\pi_1 \in \Pi_1$ that is optimal under the second agent's best response policy:
\begin{equation*}
\pi^\star_1 \in \argmax_{\pi_1 \in \Pi_1} V_1^{\pi_1,br(\pi_1)}.
\end{equation*}
This corresponds to finding the Stackelberg Equilibrium of the game.

We remark that agent $1$ does not know the policies that the second agent will play, i.e., the $br$ function is unknown \footnote{We can also suppose that the \textit{uncontrollable} agent will play an arbitrary mapping between its policies and the controllable agent ones. However, if we suppose it plays the best response, we need only to know the reward function, to know the $br$ function.}. From an online learning perspective, the regret that the algorithm has to minimize is defined as:
\begin{equation}
\label{eq:regredef}
\EX[\Regret(K)] = K V_1^{\text{SE}} - \sum_{k=1}^K V_1^{\pi_{1,k}, br(\pi_{1,k})}.
\end{equation} 


\paragraph{Bandit vs Turn-based MG}
Obviously, this problem can be seen as solving a stochastic multiarmed bandit problem~\citep{lattimore2020bandit}. In this case, the arms are the policies, and the agent at each episode receives a random realization of its expected return. So, this problem can be solved with standard bandit algorithms such as UCB1 \citep{auer2002finite}. However, as we will explain in the next section, this is not the best we can do. In fact, the regret would not scale sublinearly with the number of possible policies, as it happens with standard bandit algorithms (where the regret is $\mathcal{O}\left(\sqrt{|\Pi_1|K}\right)$). However, we show in Section \ref{sec:lowerbound} that the regret has a \emph{constant} dependence on the number of policies (i.e., not multiplicative of K). 
In fact, we prove a lower bound and an upper bound on the regret in which the $\mathcal{O}\left(\sqrt{K}\right)$ term does not scale with the number of possible policies; the number of possible policies enters only through a \textit{constant} term.

% \paragraph{Comparison between Nash Equilibrium and Stackelberg Equilibrium} In this chapter, we use the notion of Stackelberg Equilibrium (SE) rather than the Nash Equilibrium (NE) to formalize the regret. There are two reasons to adopt the SE concept, one more philosophical and the other more practical. The SE, differently from the NE, models the asymmetric interaction between the two agents and, moreover, models many real-world problems (e.g., in the security domain and network routing) that cannot be modeled with the NE. Moreover, from a practical viewpoint, at every stage game, the computation of the NE is also PPAD complete for two agents with $|\actions| > 2$ \cite{papadimitriou1992inefficient}; on the other hand, computing the SE requires polynomial complexity\cite{coniglio2020computing}.


\begin{figure*}[h!]
\centering
\includegraphics[scale=1]{mdp2.pdf}
\caption{The composite Turn-based Markov Game constructed for the lower bound. The states belonging to $\state_2$ are in orange, the ones belonging to $\state_1$ in blue. The dashed lines correspond to the transition probabilities when taking action $a^\star$, the others to those when taking any other action $a \in \actions$ with $a \ne a^\star$. The dots indicate the chain composed of $N$ states. We omit the self-loop in $s_f$, which corresponds to the fact that $s_f$ is a sink state.}
\label{fig:mdplower}
\end{figure*}
\section{Hardness of Learning in Turn-based two-player Markov Games}
\label{sec:lowerbound}
In this section, we provide a lower bound on the expected regret defined in Equation \ref{eq:regredef}. We remark that agent $1$, i.e., the one who is controlled by the algorithm, sees the actions taken by the second agent as well as its rewards. 

We consider the Turn-based MG (TMG) shown in Figure \ref{fig:mdplower}.  That is, there are $N+3$ states, $A$ actions with $N,A \in \mathbb{N}$, and  $H = N+1$. The state space is defined as $\state = \state_1 \cup \state_2$, $\state_1 \cap \state_2 = \emptyset$. Agent $2$ controls the starting state $s_0$, $\state_2 = \{s_0\}$, identified in the figure with the orange color. The state space of agent $1$, instead, is equal to $\state_1 = \{s_f, s_p, s_1, \dots, s_N\}$, i.e. it controls the blue states in Figure \ref{fig:mdplower}. In state $s_0$, agent $2$ can choose between a \textit{good} action $a^*$ and a \textit{failure} action $a_f$. The reward functions of the two agents are:
\begin{align*}
&\reward_1(s,a) = \left\{ \begin{matrix}
1 & \text{if } s = s_{N} \\
0 & \text{otherwise}
\end{matrix} \right.\\
&\reward_2(s,a) =  \left\{ \begin{matrix}
R & \text{if } s = s_{N} \\
R_f & \text{if } s = s_{f}\\
0 & \text{otherwise} 
\end{matrix} 
\right.
\end{align*}
where $R,R_f \in \reals$.
The transition model of the TMG is defined as follows:
\begin{itemize}
    \item In state $s_0$, $\transitions(s_1|s_0, a^\star) = 1$, $\transitions(s_f|s_0, a_f) = \delta$ and $\transitions(s_0|s_0, a_f) = 1-\delta$. Then, if the agent $2$ takes the \textit{good} action, the TMG transits to state $s_1$, otherwise the next state is $s_f$ with probability $\delta$ and $s_0$ with probability $1-\delta$.
    \item From state $s_f$ with any action we continue to stay in state $s_f$, i.e., $\transitions(s_f|s_f,a) = 1$ $\forall a \in \actions$. 
     \item From state $s_p$ with any action we continue to stay in state $s_p$, i.e., $\transitions(s_p|s_p,a) = 1$ $\forall a \in \actions$. 
    \item From all the other states $s_i$ with $i \in [N]$: $\transitions(s_{i+1}|s_i,a^\star) = \delta + \epsilon$ and $\transitions(s_{\textcolor{red}{p}}|s_i,a^\star) = 1 - \delta - \epsilon$; instead, for any other action $a \in \actions$, $\transitions(s_{i+1}|s_i,a) = \delta$ and $\transitions(s_{\textcolor{red}{p}}|s_i,a) = 1 - \delta$. 
\end{itemize} 
The second agent has only two response functions: $\pi_{2,i}(s_0) = a^*$ or $\pi_{2,i}(s_0)= a_f$ with $i \in [1,H]$\footnote{It is easy to see that all other policies are not optimal, as either it prefers to gain $R_f$ or $R$. More details on it are provided in Appendix \ref{sec:lbproof}.}. Obviously, which of the two it plays depends on the policy that agent $1$ decides to take at the beginning of the episode. In the next proposition we prove that there always exist two values for $R$ and $R_f$ such that the only policies of agent $1$ that induce the second agent to take action $a^*$ are the ones such that $\pi_{1,i}(a^*|s_{i}) = 1$ for all $i \in [2,H]$. We call this set of policies $\Pi_1^*$.

\begin{restatable}{prp}{prop}
For every $\delta, \epsilon \in (0,1)$, $H > 3$, there exist two values $R$ and $R_f$ such that agent $2$ will play action $a^*$ only if agent $1$ plays a policy $\pi \in \Pi_1^*$. 
% If $R_f = \frac{(\delta+\epsilon)^{H-2}(\delta+c) R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}}$, there exists a constant $0 < c < \epsilon$  such that the only policy that induces the second agent to play $a^\star$ is $\pi_1(a^\star|\cdot) = 1$ for all $s \in \state_1$.
\end{restatable}
% \begin{proof}
% \begin{small}
% We start by calling $\pi_2^{a^\star}$ and $\pi_2^{a_f}$ the policies of the second agent that choose respectively $a^\star$ and $a_f$ in $s_0$.
% To be $\pi_1^\star$ the only policy that induces the second agent to play $a^\star$ two things have to happen:
% \begin{enumerate}
% \item $V_2^{\pi_1^\star, \pi_2^{a^\star}} > V_2^{\pi_1^\star, \pi_2^{a_f}}$,
% \item $V_2^{\pi_1, \pi_2^{a^\star}} < V_2^{\pi_1^\star, \pi_2^{a_f}}$,
% \end{enumerate}
% where $\pi_1 \in \Pi_1$ is every policy in $\Pi_1$ such that $\pi_1 \ne \pi_1^\star$. We now show that using the proposed two proposed rewards $R$ and $R_f$ fulfills these two conditions. We start by evaluating the two value functions. We remind that $H = N+1$.
% \begin{align*}
% &V_2^{\pi_1^\star, \pi_2^{a^\star}} = (\delta+\epsilon)^{H-1} R, \\
% &V_2^{\pi_1, \pi_2^{a^\star}} \le (\delta+\epsilon)^{H-2} \delta R,\\
% &V_2^{\pi_1^\star, \pi_2^{a_f}} = V_2^{\pi_1, \pi_2^{a_f}} = \sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}R_f,
% \end{align*}
% where the second equation is due the fact that the policy that achieves the greatest expected return is the one that chooses the action $a^\star$ in all the states except to state $s_1$.

% We now show that the first condition holds.
% \begin{align*}
% & (\delta+\epsilon)^{H-1} R> \sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}R_f \\
% & \frac{(\delta+\epsilon)^{H-1} R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}} > R_f \\
% & \frac{(\delta+\epsilon)^{H-1} R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}} > \frac{(\delta+\epsilon)^{H-2}(\delta+c) R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}}.  
% \end{align*}
% And it is always true if $\epsilon > c$.

% It is easy to see that the second condition also holds since:\begin{align*}
% & (\delta+\epsilon)^{H-2}\delta R < \sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}R_f \\
% & \frac{(\delta+\epsilon)^{H-2}\delta R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}} < R_f \\
% & \frac{(\delta+\epsilon)^{H-2}\delta R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}} < \frac{(\delta+\epsilon)^{H-2}(\delta+c) R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}}.  
% \end{align*}
% \end{small}
% \end{proof}

% Then, for an appropriate value, $R_f$ the second agent will choose the action $a^*$ only if agent 1 chooses a policy that always takes the correct action $a^*$.

\paragraph{Intuition on lower bound} From this construction, we show that agent $2$ has the power to hide part of the MG. In fact, in all cases where agent $1$ plays a policy different from the optimal one, we cannot acquire any further information about the transition model since we only visit state $s_f$ and $s_0$ until the end of the episode. Intuitively, we can notice that, in the worst case, we have to play all the policies in $\Pi_1$ before finding the policy that leads us to acquire information about the states in the chain. In fact, only with a policy $\pi \in \Pi_1^*$ the agent $2$ allows us to visit states other than $s_f$. In the following theorem, we will formally prove this intuition.

\begin{restatable}[Lower bound for online Turn-based Stochastic Game]{thm}{lb}
% \begin{theorem}
\label{thm:lb}
Let $\mathfrak{A}$ be a ``good'' learning algorithm, where with ``good'' we indicate an algorithm such that its expected regret is upper bounded by $ \mathcal{O}\left(CK^{\alpha}\right)$ with $\alpha < 1$ in all Turn-based Markov Games\footnote{We note that algorithms that satisfy this assumption exist. For instance, applying UCB over the set of policies $\Pi_1$ yields regret $\mathcal{O}\left(\sqrt{|\Pi_1|K}\right)$.}.
Then we can create a Turn-based Markov Game such that the expected regret is lower bounded by:
\begin{equation}
\label{eq:regret1}
\EX[\Regret^{\mathfrak{A}}(K)] \ge \Omega\left(H\sqrt{SAK}\right).
\end{equation}
 Furthermore, we can create a Turn-based Markov Game with $S$ states, $A$ actions and horizon $H=S-1$, and a specific initial distribution $\mu$ such that the expected regret of $\mathfrak{A}$ after $K$ episodes is lower bounded by:
\begin{equation}
\label{eq:regret2}
\EX[\Regret^{\mathfrak{A}}(K)] \ge \Omega\left(A^{HS}\right). 
\end{equation}
\end{restatable}

% 
\subsection{Proof sketch of Theorem \ref{thm:lb}}
In this part, we provide a sketch of the proof of the lower bound for the online Turn-based Markov Game problem. The complete proof can be found in Supplementary \ref{sec:lbproof}. We start by proving Equation \ref{eq:regret1}. We can easily notice that a Markov Decision Process is a special case of a Turn-based Markov Game, in which the state space of the second agent $\state_2 = \emptyset$. Then from this consideration, we can state that the worst-case lower bound for MDPs can also be applied for TMGs \citep{jaksch2010near,domingues2020episodic}:
\begin{equation*}
\EX[\Regret^{\mathfrak{A}}(K)] \ge \Omega\left(H\sqrt{SAK}\right).
\end{equation*} 
To prove Equation \ref{eq:regret2}, instead, we rely on standard information-theoretic arguments used to prove lower bounds in episodic MDPs and bandit problems. More precisely, we use the following lemma due to \cite{simchowitz2019non} (lemma H.1).
\begin{restatable}[Lower bound for online Turn-based Stochastic Game]{lemma}{lbinformation}
\label{lemma:lbinformation}
Let $\mathcal{TMG} = \left(\state, \actions, H, \reward, \mu, \transitions\right)$ and 
$\mathcal{TMG}' = \left(\state, \actions, H, \reward, \mu, \transitions'\right)$ 
be two TMGs with the same state space $\state$, action space $\actions$, 
initial state distribution $\mu$ and horizon $H$.
Fix a number of episodes $K \ge 1$ and let $\mathcal{F}_K$ be the filtration generated by all rollouts up to episode $K$. Then for any $\mathcal{F}_K$-measurable random variable $Z \in \left[0,1 \right]$,
\begin{align*}
\sum_{s,a} \mathbb{E}^{\mathfrak{A}}_\mathcal{TMG}\left[N_K(s,a)\right]KL(\transitions(\cdot|s,a), \transitions^\prime(\cdot|s,a)) \ge \\ \text{kl}(\mathbb{E}^{\mathfrak{A}}_\mathcal{TMG}[Z], \mathbb{E}^{\mathfrak{A}}_{\mathcal{TMG}^\prime}[Z])
\end{align*}
where $\text{kl}(x,y) = x \log\left(\frac{x}{y}\right) + (1-x) \log\left(\frac{1-x}{1-y}\right)$ is the binary KL-divergence and $KL(\cdot, \cdot)$ denotes the KL-divergence between two probability laws and $N_k(s,a)$ is the number of times the state-action pair $(s,a)$ is visited till iteration $k$.
\end{restatable}
Then we construct an alternative $MG'$ such that $MG'$ coincides with the $MG$ except in the transition from $s_0$ to $s_f$:
\begin{equation*}
\transitions^\prime(s_f|s_0,a_f) = \delta + \epsilon, \quad
\transitions^\prime(s_0|s_0,a_f) = 1 - \delta - \epsilon.
\end{equation*}
Changing this transition the agent $2$ will always play the action $a_f$. Knowing that the two games differ only in this transition, by construction, the sum of Lemma \ref{lemma:lbinformation} reduces to consider in the left-hand side only the pair state $s_0$ and action $a_f$. From this consideration, we create two events such that, the KL divergence is greater than $\mathcal{O}\left(A^{SH}\right)$.


\subsection{Discussion on Theorem \ref{thm:lb}}
Theorem \ref{thm:lb} shows that learning in general-sum MGs is exponentially harder than learning in MDPs. This result proves that when we are not in control of the environment it is hard to explore it in a smart way. Furthermore, we would like to remark that the setting we are analyzing also supposes a strong assumption about the not-controllable agent behavior: it can answer only with the same deterministic optimal policy. However, by removing this assumption, i.e., assuming agent $2$ can choose any optimal policy, our result continues to apply. 

The proof of the lower bound implicitly says that we can create very small sub-optimality gaps for the second agent and that the regret must scale with the inverse of them regardless of the suboptimality gaps of the first agent. Although we do not explicitly show this, it is intuitive to see why it can happen. In fact, agent $1$ does not pay for the small suboptimality gap of the not-controllable agent but for its gap that can be potentially very high. We leave as future work to prove a problem-dependent lower bound for TMGs.

This lower bound is the first one that states the difficulties in learning in general-sum Markov Games with the possibility to see the other agent's reward function and actions. Other lower bounds were derived for the general-sum setting. In~\citep{bai2020near} the authors proposed a lower bound to underline the difficulties to learn against an adversarial opponent. In~\citep{tian2020provably}, instead, the authors show the statistical hardness of learning with only bandit feedback, i.e., in an \textit{agnostic} setting. However, these two settings are harder than the one proposed in this section, and, for this reason, the results cannot be applied.



%\section{Comparison with existing lower bounds}
%\gr{se hai tempo}
\section{TMG Optimistic Policies Value Iteration}\label{sec:algorithm}
\label{sec:algotsg}
In this section, we propose an algorithm, called \textit{Turn-based Markov Game Optimistic Policies Value Iteration} (TMG-OPVI), that nearly matches the lower bound proposed in the previous section. 
% The idea of the algorithm is the following: given the set of policies $\Pi_1$ for the first agent, it stores a table recording the policy played by the second player. 
We assume that $\Pi_1$ is any set of policies (similarly to~\cite{abbasi2013online}), not necessarily corresponding to the full set of all deterministic Markov policies, and let $M$ be the cardinality of the policy set $\Pi_1$. 

\paragraph{TMG-OPVI algorithm} TMG-OPVI is a variant of Optimistic Value Iteration~\citep{azar2013minimax}, an optimistic regret minimization algorithm for finite-horizon MDPs. The algorithm proceeds as follows. Given the set of policies $\Pi_1$ for the first agent, it stores a table recording the policy played by the second player. For every $i \in [M]$, $k \in [K]$, $h \in [H]$ we denote with $\actions^i_{k,h}(s) \subseteq \actions$ the set of plausible actions, i.e., the set of actions that can be played by the second agent, in state $s$ at step $h$ for policy $\pi_{i} \in \Pi_1$ at the beginning of episode $k$. Since, given agent $1$'s policy, the response policy of the other agent is deterministic and unique by assumption (see Section~\ref{sec:problem}), when we play the policy $\pi_{i}$ and we observe in state $s \in \state_2$, at time step $h$, the action $\pi_{2,h}(s)$, we can set $\actions^i_{k,h}(s) = \{\pi_{2,h}(s)\}$. 

As common in optimistic value iteration algorithms~\citep{azar2013minimax}, we shall build upper confidence bounds to the value function of each policy by adding bonus terms based on confidence intervals on the rewards and transition probabilities. Formally, for every $k \in [K]$, state $s \in \state$ and action $a \in \actions$ we derive the bonus term, based on Hoeffding's concentration inequality, for the reward function and the expected value function:
\begin{align*}
% \resizebox{\linewidth}{!}{
b^r_k(s,a) = \sqrt{\frac{2\log\left(\frac{4SAHk}{\delta}\right)}{N_k(s,a)}}, \\ b^\transitions_{k}(s,a) = H\sqrt{\frac{2S\log\left(\frac{4SAHk}{\delta}\right)}{N_k(s,a)}}.
\end{align*}
Furthermore, we indicate with 
% maintain a confidence interval for the the reward function and a confidence interval for the transition model based on Hoeffding inequality:
%\begin{equation}
%\widehat{\reward}_{1,k}(s,a) + \sqrt{\frac{2\log\left(\frac{4SAk}{\delta}\right)}{N_k(s,a)}} \quad \quad \widehat{\transitions}_{1,k}(s^\prime|s,a) + \sqrt{\frac{2S\log\left(\frac{4SAk}{\delta}\right)}{N_k(s,a)}},
%\end{equation}
 $\widehat{\reward}_{1,k}(s,a)$ and $\widehat{\transitions}_{1,k}(s^\prime|s,a)$ the sample means of respectively the observed rewards and transitions up to (and not including) episode $k$.

Based on this, at the beginning of each episode $k \in [K]$ the algorithm computes for each policy $\pi^i_1$ with $i \in [M]$ an optimistic approximation $\widetilde{V}^i_{1,k}$ of the expected return $V^i_{1}$. Recursively, for each $h \in [H]$ and $s \in \state$, the optimistic approximation $\widetilde{V}^i_{1,k,h}(s)$ of the value function $V^i_{1,h}(s)$ is equal to:
\begin{align*}
% \resizebox{\linewidth}{!}{
    \widetilde{V}^i_{1,k,h}(s) = &  \widehat{\reward}(s,\pi^i_{1,h}(s)) + \sum_{s^\prime \in \state} \widehat{\transitions}(s^\prime|s,\pi^i_{1,h}(s)) \\& \times \widetilde{V}^i_{1,k,h+1}(s^\prime) + b_{k}(s,\pi^i_{1,h}(s)),
\end{align*}
if $I(s)=1$ and, otherwise, is equal to:
\begin{align*}
% \resizebox{\linewidth}{!}{
    \widetilde{V}^i_{1,k,h}(s) = & \max_{a \in \actions^i_{k,h}(s)}\widehat{\reward}(s,a) + \sum_{s^\prime \in \state} \widehat{\transitions}(s^\prime|s,a) \\& \times \widetilde{V}^i_{1,k,h+1}(s^\prime)  + b_{k}(s,a),
\end{align*}
% \begin{footnotesize}
% \begin{align}
% \label{eq:valopt}
%  &\widetilde{V}^i_{1,k,h}(s) = \Bigg\{\begin{matrix} \widehat{\reward}(s,\pi^i_{1,h}(s)) + \sum_{s^\prime \in \state} \widehat{\transitions}(s^\prime|s,\pi^i_{1,h}(s)) \widetilde{V}^i_{1,k,h+1}(s^\prime) + b_{k}(s,\pi^i_{1,h}(s)) & \text{if } I(s) = 1 \\
%  \max_{a \in \actions^i_{k,h}(s)}\widehat{\reward}(s,a) + \sum_{s^\prime \in \state} \widehat{\transitions}(s^\prime|s,a) \widetilde{V}^i_{1,k,h+1}(s^\prime)  + b_{k}(s,a) & \text{if } I(s) = 2\end{matrix},
% \end{align}
% \end{footnotesize}
where $b_{k}(s,\pi^i_{1,h}(s)) =  b^r_{k}(s,\pi^i_{1,h}(s)) +  b^\transitions_{k}(s,\pi^i_{1,h}(s))$.

\paragraph{Two levels of optimism}
Note that we use two levels of optimism: one for the unknown transition probabilities and rewards, and one for the unknown actions of the second agent. More precisely, if we have already seen the action that the second agent will play in a state $s$ with a policy $\pi^i_1$ we use this information to estimate the value function, otherwise we act optimistically by taking the maximum over all plausible actions. The pseudocode of TMG-OPVI is reported in Algorithm~\ref{alg:tbs}.

\begin{algorithm}[t]
\caption{TMG-OPVI}
\label{alg:tbs}
\small
\begin{algorithmic}[1]
\STATE \textbf{Input:} $\mathcal{S}$, $\mathcal{A}$, $H$, $\Pi_1=\{\pi^1_1, \dots, \pi^M_1\}$
\STATE Initialize $\mathcal{A}_{1,h}^i(s) = \mathcal{A}$ for all $s \in \mathcal{S}$, $h \in [H]$, and $i \in [M]$
\FOR{episodes $1,2,\dots,K$}
\STATE Compute $\widetilde{V}_{1,k}^{i}$ for all $i \in [M]$
\STATE Play $\pi^{I_k}_1$ with $I_k \in \argmax_{i \in [M]} \widetilde{V}_{1,k}^{i}$ 
\STATE  Observe $(s_{k,1},a_{k,1},\dots,s_{k,H-1},a_{k,H-1},s_{k,H})$
\STATE Compute the plausible actions for all $s \in \Ss$, $h \in [H]$ and $i \in [M]$: $$\mathcal{A}_{k+1,h}^{i}(s) = \begin{cases} 
	\{ a_{k,h} \} & \text{if } i=I_k \text{ and } s=s_{k,h}  \\ \mathcal{A}_{k,h}^{i}(s) & \text{otherwise} \end{cases} $$
\ENDFOR
\end{algorithmic}
\end{algorithm}

\subsection{Regret Guarantees}
In this section, we give a regret bound for the proposed algorithm. The result exploits the determinism of the other agent's policies in order to match the lower bound derived in the previous section. 

\paragraph{Estimation of the transition model} The main idea behind the proof is that after having played a certain number of times every policy, we know in every state that is reachable what action the agent $2$ will play. At this point, we have reduced our problem to an MDP. In fact, when we know the best response function of agent $2$, for every policy we can create a policy that is the union of the policy of agent $1$ and agent $2$. At this point, the uncertainty comes only from the transition model and the reward function. It is important to note that we do not need to know explicitly the set of reachable states, but the algorithm implicitly will estimate correctly the agent $2$ policy after having visited all of them.

Before stating the result we need to introduce some quantities. We define $d^i_h(s)$ as the probability of visiting state $s$ at time step $h$ when playing policies $\pi^i_{1}$ and $\text{br}(\pi^i_{1})$. Then, we define the set 
\begin{equation*}
S_{2,h}^{+,i} = \{ s \in \state_2 \text{ such that } d^i_h(s) > 0 \}.
\end{equation*}
We define $d = \min_{i \in [M]} \min_{h \in [H]} \min_{s \in S_{2,h}^{+,i}} d^{i}_h(s)$, i.e., the minimum probability of visiting a ``reachable'' state. In the following theorem we provide an upper bound on the regret of the TMG-OPVI algorithm.

\begin{restatable}[Regret of TMG-OPVI]{thm}{opvi}
\label{thm:tsgreg}
Let $TMG = (\state, \actions, \transitions, \mu, \reward_1, \reward_2, H)$ with $\state = \state_1 \cup \state_2$ and $\state_1 \cap \state_2 = \emptyset$ be the finite-horizon TMG of our problem. Then the expected regret of TMG-OPVI at every episode $K>0$ is bounded by:
\begin{equation*}
\EX[\Regret(K)] \le \mathcal{O}\left( MSH\overline{K} + SH\sqrt{AHK\log\left(SAK^2H\right)}\right),
\end{equation*}
where $\overline{K}$ is the first integer such that $\overline{K} > \frac{\log\left(MS\overline{K}^2\right)}{-\log\left(1-d\right)}$.
\end{restatable}

\subsection{Proof sketch of Theorem \ref{thm:tsgreg}}
In this section, we provide a proof sketch of Theorem \ref{thm:tsgreg}. The complete proof can be found in Appendix \ref{apx:proofalg}. We consider the fact that, at some point, if we play a certain policy $\pi^i_1$ we will observe every state that can be reached playing $\pi^i_1$ and $\text{br}(\pi^i_1)$. 
In fact, there exists an iteration $\overline{K}$ such that each state $s \in S_{2,h}^{+,i}$ with $i \in [M]$ has been visited at least one time. After $\overline{K}$, agent $1$ has complete knowledge of the best response function $\text{br}$\footnote{It is important to notice that we do not have to know the sets of reachable states, but we use these sets only for a proof purpose.}. From this iteration, the algorithm is facing a single-agent problem, where the joint policy is derived by the union of the policies of the two agents:
\begin{equation*}
\pi^i(s) = \Bigg \{ \begin{matrix}
\pi^i_1(s) & \text{if} & I(s) = 1 \\
\actions_{k,h}^i(s) & \text{if} & I(s) = 2
\end{matrix},
\end{equation*}
where in this case $\actions_{k,h}^i(s)$ is a singleton for every state $s \in \state_2$ and policy $\pi^i_1$ with $i \in [M]$. Then we can proceed with our proof considering this new single-agent problem.

\subsection{Discussion on Theorem \ref{thm:tsgreg}}
The regret bound above nearly matches the proposed lower bound. In fact, if we instantiate the set of policies of agent $1$ equal to all the possible deterministic policies, then $M = A^{HS}$ where $A$ is the cardinality of the action space and $S$ the cardinality of the state space and $H$ is the horizon. Instead, the second term of the regret is comparable with the worst-case lower bound for MDPs~\citep{azar2013minimax}. 

It is interesting to note that, with respect to \textit{agnostic} MGs \citep{tian2020provably}, we achieve better regret guarantees in terms of $K$ in our setting, whereas they achieve a regret upper bound of $\mathcal{O}\left({K}^{\frac{2}{3}}\right)$. We also achieve stronger regret guarantees with respect to adversarial MDPs \citep{abbasi2013online}, where transitions and rewards can change adversarially. The two settings are quite similar, since also in the setting considered in this paper the transitions change \textit{adversarially}, as agent $2$ influences the transitions of the MG. However, in that case, differently from ours, the regret that is achieved is $\mathcal{O}\left(\sqrt{K \log\left(M\right)}\right)$, while our regret does not depend on the number of policies in the $K$ term. On the other hand, their constant dependence on the number of policies is $\mathcal{O}\left(\log(M)\right)$, while we obtain a constant dependence on the number of policies. Clearly, the two settings are different, so it is hard to compare them, but our result shows that, using the information of the non-controllable agent, we can achieve better performance. 




\section{Conclusions}
\textbf{Contributions} In this paper, we provide the first insights into the online learning problem in general-sum Stochastic Games. Although there are some recent results in solving the problem in the zero-sum (aka competitive) setting, there are no other works that take into account the problem or consider that we could face a non-competitive opponent. We have shown that the problem is much more complicated than in a zero-sum MG and an MDP. The main problems arise from the limited control on the environment's exploration. We underline this difficulty by providing a novel lower bound (Section \ref{sec:lowerbound}), which proves that the regret scales constantly with the number of deterministic policies that can be played by the controllable agent. This creates a big gap between what we can obtain learning in MDPs with respect to general-sum MGs. Then we show how to build a provably efficient algorithm in Section \ref{sec:algotsg}. Our algorithm, TMG-OPVI, achieves optimal performance nearly matching the proposed lower bound. We would like to underline that this is the first paper that considers the online learning problem in the general-sum Markov Game, and we think that our findings help in the understanding of the MARL problem.

\textbf{Future directions and discussion on learning in Markov Games} Currently, there is a need for a formal understanding of the online MARL problem to construct provably efficient learning algorithms for this context. As our result suggested, the MARL setting poses novel challenges, especially in the well-known exploration-exploitation dilemma, i.e., the trade-off between gathering new information and exploiting it: in a multi-agent environment, the agent needs to explore not only to understand the underlying environment but also to learn the other agents' behaviors. Moreover, from our findings, it is clear that the algorithm design and the resulting performance guarantees heavily depend on any knowledge about the opponents, either known a priori or obtainable during the learning process. 

Furthermore, there are open problems also in the agnostic setting, presented in \citep{tian2020provably}, as it is possible to achieve better theoretical (regret) guarantees and construct algorithms with optimal sample complexity. This scenario, having no assumptions on the opponents, is widely applicable to capture real-world problems. On the other hand, assuming to have the possibility to observe other agents' interactions with the environment or having some previous knowledge about the other agents (as having access to a finite set of opponents \citep{balcan2015commitment} or considering a larger set of opponents' classes with some regularity assumptions \citep{sessa2020learning}) we could hope to obtain better theoretical guarantees. An unexplored, but promising future direction would be considering some structural relation between the best response of the agent $2$ and agent $1$. It could overcome the problem of the constant dependence on the number of deterministic policies. 

Another interesting future direction is to better show the relationship between the sub-optimality gap of the non-controllable agent and the one of the controllable agent. To prove this it is necessary to prove a problem-dependent lower bound. We leave this analysis as future work.

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    We thank Andrea Tirinzoni for valuable discussions.
\end{acknowledgements}

    % \bibliographystyle{plainnat}

\bibliography{uai2022-template.bib}

% \begin{thebibliography}{}
% \setlength{\itemindent}{-\leftmargin}
% \makeatletter\renewcommand{\@biblabel}[1]{}\makeatother
% \bibitem{} J.~Alspector, B.~Gupta, and R.~B.~Allen (1989).
%     \newblock Performance of a stochastic learning microchip.
%     \newblock In D. S. Touretzky (ed.),
%     \textit{Advances in Neural Information Processing Systems 1}, 748--760.
%     San Mateo, Calif.: Morgan Kaufmann.

% \bibitem{} F.~Rosenblatt (1962).
%     \newblock \textit{Principles of Neurodynamics.}
%     \newblock Washington, D.C.: Spartan Books.

% \bibitem{} G.~Tesauro (1989).
%     \newblock Neurogammon wins computer Olympiad.
%     \newblock \textit{Neural Computation} \textbf{1}(3):321--323.
% \end{thebibliography}
% \onecolumn

% \include{supplement}


\onecolumn
\appendix

\section*{Notation}
We provide the notation used in the following proofs and in the main paper.
\begin{table}[htbp]
\centering % to have the caption near the table
\begin{tabular}{r p{10.5cm} }
$\mathcal{S}$ & State space\\
$\mathcal{S}_i$ & State space of agent $i$\\
$\mathcal{A}_i$ & Action space of agent $i$\\
$\mathcal{P}$ & Transition kernel\\
$\reward_i$ & Agent $i$'s reward function\\
$\mu$ & Initial state distribution\\
$H$ & Horizon \\
$Q^{\pi_1,\pi_2}_{i,h}(s,a)$ & Agent $i$'s Q-value with policies $\pi_1$ and $\pi_2$\\
$V^{\pi_1,\pi_2}_{i,h}(s)$ & Agent $i$'s value function with policies $\pi_1$ and $\pi_2$\\
$V^{\pi_1,\pi_2}_{i}$ & Agent $i$'s expected return with policies $\pi_1$ and $\pi_2$\\
% $\widetilde{\pi}_{i,k}$ & Estimated agent's best response policy for configuration $p_i$ at episode $k$\\
%$\widetilde{V}^{i,l}$ & {\color{red}Optimistic configurator's expected return for the configuration $p_i$ with best response policy, when the configurator has already played the configuration $p_i$ $l$ times }\\
% $\Delta_i = V^{*}-V^{i}$ & Suboptimality gap of the configuration $p_i$ \\
$K$ & Number of episodes \\
$N^i_{k,h}(s)$ & Number of times the state $s$ is visited following policies $\pi^i_1$ and $\text{br}(\pi^i_1)$ \\
$d^i_h(s)$ & Probability of visiting state $s$ at time step $h$ following policies $\pi^i_1$ and $\text{br}(\pi^i_1)$ \\
$\mathcal{S}^{+,i}_{2,h}$ & Set of states $s \in \mathcal{S}_2$ such that $d^i_h(s) > 0$
% $N_k(s)$ & Number of visits of state $s$ before episode $k$ \\
% $N^i_{k,h}(s)$ & Number of visits of state $s$ at step $h$ before episode $k$ with configuration $p_i $ \\
% $\underline{r}_{o,k}(s)$ & Lower confidence value for the agent's reward \\
% $\overline{r}_{o,k}(s) $ & Upper confidence value for the agent's reward \\
% $\widehat{r}_{o,k}(s) $ &  Sample mean of observed rewards \\
% $\underline{Q}^{i}_{o,k,h}(s,a)$ & Lower confidence value of the agent's Q-function with configuration $p_i$ \\
% $\overline{Q}^{i}_{o,k,h}(s,a)$ & Upper confidence value of the agent's Q-function with configuration $p_i$ \\
% $\mathcal{A}_{k,h}^i(s)$ & Set of agent's plausible actions in state $s$ at step $h$ up to episode $k$\\
% $d_h^i(s)$ & Visitation probability the state $s$ at step $h$ with configuration $p_i$ under the agent's best response policy $\pi_i$\\
% $\widetilde{d}_h^i(s)$ & Visitation probability the state $s$ at step $h$ with configuration $p_i$ under the estimated agent's best response policy $\widetilde{\pi}_{i,k}$\\
\end{tabular}
\end{table}
\newpage
% The correct transition model is the following:
% \begin{itemize}
%     \item In state $s_0$, $\transitions(s_1|s_0, a^\star) = 1$, $\transitions(s_f|s_0, a_f) = \delta$ and $\transitions(s_0|s_0, a_f) = 1-\delta$. Then, if the agent $2$ takes the \textit{good} action, the TMG transits to state $s_1$, otherwise the next state is $s_f$ with probability $\delta$ and $s_0$ with probability $1-\delta$.
%     \item From state $s_f$ with any action we continue to stay in state $s_f$, i.e., $\transitions(s_f|s_f,a) = 1$ $\forall a \in \actions$. 
%      \item From state $s_p$ with any action we continue to stay in state $s_p$, i.e., $\transitions(s_p|s_p,a) = 1$ $\forall a \in \actions$. 
%     \item From all the other states $s_i$ with $i \in [N]$: $\transitions(s_{i+1}|s_i,a^\star) = \delta + \epsilon$ and $\transitions(s_{\textcolor{red}{p}}|s_i,a^\star) = 1 - \delta - \epsilon$; instead, for any other action $a \in \actions$, $\transitions(s_{i+1}|s_i,a) = \delta$ and $\transitions(s_{\textcolor{red}{p}}|s_i,a) = 1 - \delta$. 
% \end{itemize} 

\section{Lower bound}


\label{sec:lbproof}
\prop*
\begin{proof}
% \begin{small}
First of all, we note that for agent $2$ there are only two possible policies: $\pi_{2,h}(a^*|s_0) = 1$ $\forall h \in [H]$ and $\pi_{2,h}(a_f|s_0) = 1$ $\forall h \in [H]$.
% \todogr{explain better this concept}
We denote with:
\begin{align*}
    \pi_2^{a^*} = \pi_{2,h}(a^*|s_0) = 1 \quad \quad \forall h \in [H], \\
    \pi_2^{a_f} = \pi_{2,h}(a_f|s_0) = 1 \quad \quad \forall h \in [H],
\end{align*}
the policies of the second agent that choose respectively $a^\star$ and $a_f$ in $s_0$ and with $\pi_1^*$ a policy $\pi \in \Pi^*_1$. We have to create a \textit{gap} between every policy $\pi \in \Pi^*_1$ and the other policies $\pi \in \Pi_1\setminus \Pi^*_1$, in order to make $\pi \in \Pi^*_1$ the only policies that
induce the second agent to play $a^\star$. To do it, two things have to happen:
\begin{enumerate}
\item $V_2^{\pi_1^\star, \pi_2^{a^\star}} > V_2^{\pi_1^\star, \pi_2^{a_f}}$,
\item $V_2^{\pi_1, \pi_2^{a^\star}} < V_2^{\pi_1^\star, \pi_2^{a_f}}$,
\end{enumerate}
where $\pi_1 \in \Pi_1 \setminus \Pi^*_1$. Recall that $H = N+1$. We start by evaluating the value functions.
\begin{align}
&V_2^{\pi_1^\star, \pi_2^{a^\star}} = (\delta+\epsilon)^{H-1} R, \label{eq:v1} \\
&V_2^{\pi_1, \pi_2^{a^\star}} \le (\delta+\epsilon)^{H-2} \delta R, \label{eq:v2}\\
&V_2^{\pi_1^\star, \pi_2^{a_f}} = V_2^{\pi_1, \pi_2^{a_f}} = \sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}R_f, \label{eq:v3}
\end{align}
where equation \ref{eq:v1} is due to the fact that, playing policies $\pi^*_1$ and $\pi_2^{a^\star}$, agent $2$ receives $R$ only if it follows the whole chain (without ending up in state $s_p$); equation \ref{eq:v2} is an upper bound on the value that every policy $\pi \in \Pi_1 \setminus \Pi^*_1$ can achieve, and it represents the expected return that agent $2$ achieves with the policy that chooses the action $a^\star$ in all the states except state $s_1$; equation \ref{eq:v3} is the value of choosing $\pi_2^{a_f}$.

We now analyze condition 1. 
\begin{align*}
& (\delta+\epsilon)^{H-1} R> \sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}R_f \\
& \frac{(\delta+\epsilon)^{H-1} R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}} > R_f \\
& \frac{(\delta+\epsilon)^{H-1} R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}} > \frac{(\delta+\epsilon)^{H-2}(\delta+c) R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}}, 
\end{align*}
where we set $R_f = \frac{(\delta+\epsilon)^{H-2}(\delta+c) R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}}$. It is always true if $\epsilon > c$.

We now need to use the same $R_f$ also for condition 2:
\begin{align*}
& (\delta+\epsilon)^{H-2}\delta R < \sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}R_f \\
& \frac{(\delta+\epsilon)^{H-2}\delta R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}} < R_f \\
& \frac{(\delta+\epsilon)^{H-2}\delta R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}} < \frac{(\delta+\epsilon)^{H-2}(\delta+c) R}{\sum_{h=1}^{H-1}(H-h)\delta(1-\delta)^{h-1}}. 
\end{align*}
The inequality is also true setting $c > 0$.
% \end{small}
\end{proof}
\lb*
\begin{proof}
In this part, we will prove the lower bound for the online Turn-based Stochastic Game problem. We start by stating that a Markov Decision Process is a special case of a Turn-based Stochastic Game, in which the state space of the second agent $\state_2 = \emptyset$. Then from this consideration, we can state that the worst-case lower bound for MDPs can be also applied for TSGs \citep{jaksch2010near,domingues2020episodic}:
\begin{equation*}
\EX[\Regret^{\mathfrak{A}}(K)] \ge \Omega\left(H\sqrt{SAK}\right).
\end{equation*} 

We need to prove the lower bound in equation \ref{eq:regret2}. To prove this result, we rely on standard information-theoretic methods to prove lower bounds in episodic MDP and bandit problems. We start by stating a lemma taken from \citep{simchowitz2019non}. Since the proof is analogous we omitted it here, but it can be found in the original paper \citep{simchowitz2019non}. We indicate with $N_K(s,a) = \sum_{k=1}^K \sum_{h=1}^H \indi{s_{k,h} = s, a_{k,h} = a}$.
\lbinformation*
% \label{lemma:lbinformation}
Let $\mathcal{TSG} = \left(\state, \actions, H, \reward, \mu, \transitions\right)$ and 
$\mathcal{TSG}' = \left(\state, \actions, H, \reward, \mu, \transitions'\right)$ 
be two TSGs with the same state space $\state$, action space $\actions$, 
initial state distribution $\mu$ and horizon $H$.
 Fix a number of episodes $K \ge 1$ and let $\mathcal{F}_K$ be the filtration generated by all rollouts up to episode $K$. Then for any $\mathcal{F}_K$-measurable random variable $Z \in \left[0,1 \right]$,
\begin{equation}
\sum_{s,a} \mathbb{E}^{\mathfrak{A}}_\mathcal{TSG}\left[N_K(s,a)\right]KL(\transitions(\cdot|s,a), \transitions^\prime(\cdot|s,a)) \ge \text{kl}(\mathbb{E}^{\mathfrak{A}}_\mathcal{TSG}[Z], \mathbb{E}^{\mathfrak{A}}_{\mathcal{TSG}^\prime}[Z])
\end{equation}
where $\text{kl}(x,y) = x \log\left(\frac{x}{y}\right) + (1-x) \log\left(\frac{1-x}{1-y}\right)$ is the binary KL-divergence and $KL(\cdot, \cdot)$ denotes the KL-divergence between two probability laws.
% \end{lemma}
We apply this lemma as follows. For the fixed state-action pair $(s_0,a_f)$, we define an alternative $\mathcal{TSG}'$ to be the TSG that coincides with $\mathcal{TSG}$ except that:
\begin{equation*}
%\transitions'(s_{i+1}|s_i,a^\star) = \delta \quad \transitions(s_f|s_i,a^\star) = 1-\delta \quad
% \transitions(s_{i+1}|s_i,a) = \delta + \epsilon \quad \transitions(s_f|s_i,a) = 1 - \delta - \epsilon
\transitions^\prime(s_f|s_0,a_f) = \delta + \epsilon \quad
\transitions^\prime(s_0|s_0,a_f) = 1 - \delta - \epsilon.
\end{equation*}
For this game, there are no policies that induce the second agent to play $a^\star$ since it always gains more by playing $a_f$ and taking $R_f$.

By construction the two games $\mathcal{TSG}$ and $\mathcal{TSG}'$ differ only at $s_0$, $a_f$. Thus the lemma \ref{lemma:lbinformation} becomes:
\begin{equation*}
\mathbb{E}^{\mathfrak{A}}_\mathcal{TSG}\left[N_K(s_0,a_f)\right]KL(\transitions(\cdot|s_0,a_f), \transitions^\prime(\cdot|s_0,a_f)) \ge \text{kl}(\mathbb{E}^{\mathfrak{A}}_\mathcal{TSG}[Z], \mathbb{E}^{\mathfrak{A}}_{\mathcal{TSG}^\prime}[Z]).
\end{equation*}

We define $N_K(a) = \sum_{k=1}^K \indi{a_{k,1} = a}$. We define the following two events:
\begin{align*}
\mathcal{E}^a_K = \{ N_K(a_f) \ge \overline{K}\}, \quad \quad  \mathcal{E}^{s_f}_K =\{ \sum_{k=1}^K \indi{s_2 = s_f} \ge \overline{K} \}
\end{align*}
i.e., at episode $K$ the number of times the second agent has played action $a_f$ at time step $1$ and the number of times $s_f$ is visited at time step $2$ are both at least $\overline{K}$, where $\overline{K}$ is a constant to be chosen later. We define $\indi{\mathcal{E}_K^a, \mathcal{E}_K^{s_f}}$ as the indicator random variable that is $1$ if events $\mathcal{E}_K^a$ and $\mathcal{E}_K^{s_f}$ both happen and $0$ otherwise. We now evaluate the expectation $\mathbb{E}^\mathfrak{A}_\mathcal{TSG}[\indi{\mathcal{E}_K^a, \mathcal{E}_K^{s_f}}]$. We start by stating that:
\begin{align*}
\mathbb{E}^\mathfrak{A}_\mathcal{TSG}[\indi{\mathcal{E}_K^a, \mathcal{E}_K^{s_f}}] \le \mathbb{E}^\mathfrak{A}_\mathcal{TSG}[\indi{\mathcal{E}_K^a}].
\end{align*}
Then we note that we have assumed that the algorithm $\mathfrak{A}$ is ``good'' in the sense that its regret is bounded by $\mathcal{O}\left(CK^{\alpha}\right)$. From this assumption and considering that we do not pay regret only if the second agent plays $a^\star$ in state $s_0$, we can say that:
\begin{align}
\mathbb{E}^\mathfrak{A}_\mathcal{TSG}[\indi{\mathcal{E}_K^a, \mathcal{E}_K^{s_f}}] \le \mathbb{E}^\mathfrak{A}_\mathcal{TSG}[\indi{\mathcal{E}_K^a}] \le \frac{CK^\alpha}{\overline{K}}, \label{p:006}
\end{align}
applying the Markov inequality.

Then we evaluate $\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[\indi{\mathcal{E}_K^a, \mathcal{E}_K^{s_f}}]$. We start considering the fact that in the modified $\mathcal{TSG}^\prime$ the second agent always plays the action $a_f$, so:
\begin{align*}
\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[\indi{\mathcal{E}_K^a, \mathcal{E}_K^{s_f}}] = \mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[\indi{\mathcal{E}_K^{s_f}}],
\end{align*}
%Then the probability to visit the state $s_f$ at time step $2$ more than $\overline{K}$ is equal to the probability to visit the state $s_0$ at time step $2$ less than $\overline{K}$. We indicate this event with:
%\begin{align*}
%\mathcal{E}^{s_0}_K =\{ \sum_{k=1}^K \indi{s_2 == s_0} \le \overline{K} \}
%\end{align*}
and, in order to lower bound the value $\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[\indi{\mathcal{E}_K^{s_f}}]$, we write:
\begin{align*}
\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[\indi{\mathcal{E}_K^{s_f}}] & =  1-\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[\indi{\neg \mathcal{E}_K^{s_f}}].
\end{align*}
Here $\neg \mathcal{E}_K^{s_f} = \{ \sum_{k=1}^K \indi{s_2 = s_f} < \overline{K}\}$. Noting that, in $\mathcal{TSG}^\prime$, the random variable $\sum_{k=1}^K \indi{s_2=s_f}$ has a binomial distribution with probability of success $\delta + \epsilon$, we upper bound $\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[\indi{\neg \mathcal{E}_K^{s_f}}] $ with:
\begin{align}
 \mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[\indi{\neg \mathcal{E}_K^{s_f}}] \le  \frac{(K-\overline{K})(\delta + \epsilon)}{(K(\delta+\epsilon) - \overline{K})^2}, \label{p:0009}
\end{align}
applying the upper bound from \citep{feller1957introduction} for the cumulative distribution function of the binomial distribution.

For small $\epsilon$ we have (see lemma \ref{lemma:aux1}):
\begin{align*}
KL(\transitions(\cdot|s_0,a_f), \transitions'(\cdot|s_0,a_f)) \le  2\epsilon.
\end{align*}


Then applying lemma \ref{lemma:lbinformation}, where we call $Z = \indi{\mathcal{E}_K^a, \mathcal{E}_K^{s_f}}$:
\begin{align}
\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}}\left[N_K(s_0,a_f)\right] 2\epsilon 
&\ge \mathbb{E}^\mathfrak{A}_{\mathcal{TSG}}[Z]  \log\left(\frac{\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}}[Z]}{\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[Z]}\right) + (1-\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}}[Z])  \log\left(\frac{1-\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}}[Z]}{1-\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[Z]}\right) \nonumber
\\
&\ge (1-\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}}[Z])  \log\left(\frac{1}{1-\mathbb{E}^\mathfrak{A}_{\mathcal{TSG}^\prime}[Z]}\right) - \log(2) \label{p:0002}
\\&\ge \left(1-\frac{CK^\alpha}{\overline{K}}\right) \log \left(\frac{1}{1 - \left(1-\frac{(K-\overline{K})(\delta + \epsilon)}{(K(\delta+\epsilon) - \overline{K})^2} \right)}\right) - \log(2) \label{p:0003} \\& 
=    \left(1-\frac{CK^\alpha}{\overline{K}}\right) \log \left(\frac{(K(\delta+\epsilon) - \overline{K})^2}{(K-\overline{K})(\delta + \epsilon)}\right) - \log(2) \nonumber\\
& \ge  \left(1-\frac{CK^\alpha}{\overline{K}}\right) \log \left(\frac{K^2(\delta+\epsilon) - \overline{K}}{K-\overline{K}}\right) - \log(2) \nonumber.
\end{align}
where \eqref{p:0002} is due to Lemma 5 in \citet{domingues2020episodic}, and in \eqref{p:0003} we apply inequalities \eqref{p:006} and \eqref{p:0009}.
Setting $\overline{K} = C2K^\alpha$:
\begin{align*}
\EX_{\mathcal{TSG}}\left[N_K(s_0,a_f)\right] 2\epsilon & \ge \frac{1}{2}  \log \left(\frac{K^2(\delta+\epsilon) - C2K^\alpha}{K-C2K^\alpha}\right) - \log(2).
\end{align*}
Then, setting $\epsilon = \frac{1}{4A^S}$, we obtain:
\begin{align*}
\EX_{\mathcal{TSG}}\left[N_K(s_0,a_f)\right] \ge A^S  \log \left(\frac{K^2(\delta+\epsilon) - C2K^\alpha}{K-C2K^\alpha}\right) - \log(2),
\end{align*}
and if $\delta > \frac{2K-C2K^\alpha}{K^2}$, then the logarithm is always greater than $\log(2)$, hence:
\begin{align*}
\EX_{\mathcal{TSG}}\left[N_K(s_0,a_f)\right] \ge A^S \log(2)  - \log(2).
\end{align*}
Then, since every time the action $a_f$ is taken in state $s_0$ we cannot reach state $s_N$ in $H$ steps and we pay a regret equal to $1$, the expected regret is lower bounded by:
\begin{align*}
\EX[\Regret(K)]  \ge \EX_{\mathcal{TSG}}\left[N_K(s_0,a_f)\right] \ge \Omega \left(A^S\right).
\end{align*}
Then the result follows.
\newpage
%So we can evaluate the $\EX_{\mathcal{TSG}}[Z] = 1$ since the second player will choose an action that will take our agent to state $s_f$ we cannot escape from this state for the remaining $H-1$ timesteps.

%For this game there are two policies which are optimal for our agent, $\pi^\star$ and $(\pi^\star)'$ that is equal to $\pi^\star$ except that $(\pi^\star)'(a|s_i) = 1$ and $(\pi^\star)'(a'|s_i) = 0$ for every $a' \in \actions$.
%By construction the two games $\mathcal{TSG}$ and $\mathcal{TSG}'$ differ only at $s_i$, $a$. Thus:
%\begin{equation*}
%\EX_{\mathcal{TSG}}\left[N_K(s_i,a)\right]KL(\transitions(\cdot|s_i,a), \transitions^\prime(\cdot|s_i,a)) \ge \text{kl}(\EX_{\mathcal{TSG}}[Z], \EX_{\mathcal{TSG}^\prime}[Z]).
%\end{equation*}
%We need now to define the random variable $Z$ to prove our result. To do this we introduce the following event:
%\begin{equation}
%\mathcal{E} = \{\text{Playing $(\pi^\star)'$ $K$ times }N_K(s_f) \ge K(H-2)\},
%\end{equation}
%where $N_K(s_f)$ is the number of time we visit state $s_f$ in $K$ episodes.
%and we define $Z$ as the indicator random variable that is $1$ if event $E$ happens and $0$ otherwise. So we can evaluate the $\EX_{\mathcal{TSG}}[Z] = 1$ since the second player will choose an action that will take our agent to state $s_f$ we cannot escape from this state for the remaining $H-1$ timesteps. On the other hand the $\EX_{\mathcal{TSG}'}[Z] = (1-\delta-\epsilon)^K$, since we could hand up in state $s_f$ only at time step $3$ when the agent at step $2$ is in state $s_1$ takes the action $(\pi^\star)'(s_1)$ and it ends up in state $s_f$. Then:
%\begin{equation*}
%\text{kl}(\EX_{\mathcal{TSG}}[Z], \EX_{\mathcal{TSG}^\prime}[Z]) = \log \left(\frac{1}{(1-\delta-\epsilon)^K}\right),
%\end{equation*}
%and:
%\begin{align*}
%\EX_{\mathcal{TSG}}\left[N_K(s_i,a)\right]KL(\transitions(\cdot|s_i,a), \transitions^\prime(\cdot|s_i,a)) \ge \log \left(\frac{1}{(1-\delta-\epsilon)^K}\right).
%\end{align*}
%Since following \cite{} there exists a universal constant $c$ such that $KL(\transitions(\cdot|s_i,a), \transitions^\prime(\cdot|s_i,a))\le c\epsilon^2$, then:
%\begin{equation*}
%\EX_{\mathcal{TSG}}\left[N_K(s_i,a)\right]c\epsilon^2\ge \log\left(\frac{1}{(1-\delta-\epsilon)^K}\right).
%\end{equation*}
%Setting $\epsilon = \frac{c}{\sqrt{A^S}}$ then we have that:
%\begin{equation*}
%\EX_{\mathcal{TSG}}\left[N_K(s_i,a)\right]\ge \log\left(\frac{1}{(1-\delta)^K}\right)A^S.
%\end{equation*}
%Then the result follows.
\end{proof}

\section{Proof of Theorem  \ref{thm:tsgreg}}
\label{apx:proofalg}
\opvi*
\begin{proof}
We start by defining for every state $s \in \state$, policy $\pi_1^i$ with $i \in [M]$, opponent's policy  $\text{br}(\pi^{i}_1)$, and time step $h \in [H]$, $d^{i}_{h}(s)$ as the probability of visiting $s$ under these policies. 
Then we define:
\begin{align*}
S_{2,h}^{+,i} = \{ s \in \state_2 \text{ such that } d^i_h(s) > 0 \},
\end{align*}
as the set of states with positive probability of being visited playing policies $\pi^{i}_1$ and $\text{br}(\pi^{i}_1)$.
We define:
\begin{equation*}
    d = \min_{i \in [M]} \min_{h \in [H]} \min_{s \in S_{2,h}^{+,i}} d^{i}_h(s),
\end{equation*}
the minimum probability of visiting a ``reachable'' state.

Moreover, we define $N_{k,h}^i(s)$ as the number of times a state $s \in \mathcal{S}$ is visited at time step $h$ while playing policy $\pi_1^i$ up to iteration $k-1 \le K$,
\begin{equation*}
    N_{k,h}^i(s) = \sum_{k'=1}^{k-1} \indi{I_{k'} = i, \, s_{k',h} = s}.
\end{equation*}


Then we define for each policy $\pi_1^i$ with $i \in [M]$ the following event:
\begin{equation*}
    \mathcal{E}_i = \{ \forall h \in [H], \ \forall s \in S_{2,h}^{+,i} : N_{k,h}^i(s) > 0 \},
\end{equation*}
i.e., under the event $\mathcal{E}_i$ every reachable opponent's state is visited at least once. Then we introduce the indicator random variable $\indi{\mathcal{E}_i}$, which is equal to $1$ if the event $\mathcal{E}_i$ holds and $0$ otherwise.

We can then decompose the regret as:
\begin{align*}
\Regret &= \sum_{k=1}^K V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})} \\
&=  \underbrace{\sum_{k=1}^K \left(V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})}\right) \indi{\neg \mathcal{E}_{\text{I}_{k}}}}_{A} + \underbrace{\sum_{k=1}^K \left(V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k},  br(\pi_{1,k})} \right) \indi{\mathcal{E}_{\text{I}_{k}}}}_{B}
\end{align*}

We start by bounding the A part. We rewrite the regret making explicit its dependence on the policy $i$.
\begin{align}
&\sum_{k=1}^K \left(V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})}\right) \indi{\neg \mathcal{E}_{\pi_{1,k}}}  \nonumber \\&= \sum_{k=1}^K\sum_{i \in [M]} \left(V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})}\right) \indi{\neg \mathcal{E}_{\pi_{1,k}}} \indi{\pi_{1,k} = \pi_i}. \label{eq:aterm}
\end{align}
%Now we define as $d = \min_{i \in [M]} \min_{h \in [H]} \min{s \in S^{+,i}} d^{i}_h(s)$ the minimum probability of visit each state, at each time step $h \in [H]$ and policy $\pi_i$ with $i \in [M]$. 
Fixing a policy $\pi_i$ we can say that:
\begin{align}
&\sum_{k=1}^K \left(V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})}\right) \indi{\neg \mathcal{E}_{\pi_{1,k}}} \indi{\pi_{1,k} = \pi_i} \nonumber\\
& \le \sum_{k=1}^{\overline{K}}\left( V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})} \right) \indi{\neg \mathcal{E}_{\pi_{1,k}},\pi_{1,k} = \pi_i, N_k(\pi_i) \le \overline{K}} \label{p:00010}\\&+ \sum_{k=\overline{K}}^\infty \left(V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})} \right)\indi{\neg \mathcal{E}_{\pi_{1,k}},\pi_{1,k} = \pi_i, N_k(\pi_i) > \overline{K}} \le H\overline{K}, \nonumber
\end{align}
 where we use Lemma~\ref{lemma:visitationonline} in~\eqref{p:00010}, and $\overline{K}$ is the smallest integer satisfying the inequality $\overline{K} >\frac{\log\left(MSH^2\overline{K}\right)}{\log\left(\frac{1}{1-d}\right)}$.
 
Then we can bound Equation~\eqref{eq:aterm}:
\begin{align*}
&\sum_{i \in [M]} \sum_{k =1}^K \left( V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})} \right) \indi{\neg \mathcal{E}_{\pi_{1,k}}} \indi{\pi_{1,k} = \pi_i} \le MH\overline{K}.
\end{align*}
%with probability $1-\delta$. So setting $\delta = KH$ term A is equal to:
%\begin{align*}
%&\sum_{i \in [M]} \sum_{k =1}^K V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k}, br(\pi_{1,k})} \indi{\neg \mathcal{E}_{\pi_{1,k}}} \indi{\pi_{1,k} = \pi_i} \\& \le MH\overline{K} + 1.
%\end{align*}
%We have to underline that in this way $\overline{K}$ has to be grater than $\frac{\log\left(\frac{MS}{K}\right)}{\log\left(\frac{1}{1-d}\right)}$.
%We define the event $\mathcal{E}$:
%\begin{equation*}
%    \mathcal{E}^k = \{ \forall i \in [M] \quad \forall s \in S_2^{+,i} N_k^i(s) > 0 \}
%\end{equation*}
%
%So we can bound the regret as:
%\begin{equation*}
%    \EX[\text{Regret}] \le  \underbrace{\EX[\text{Regret}|\neg \mathcal{E}]}_{\text{A}} + \underbrace{\EX[\text{Regret}| \mathcal{E}]}_{\text{B}}
%\end{equation*}
%
%We start bounding A. We define as $d = \min_{i \in [M]} \min_{h \in [H]} \min{s \in S^{+,i}} d^{i}_h(s)$ so using lemma \ref{} we can state that with probability $1-\delta_k$, after $k$ step every state $s$ is visited, under every policy $\pi^i$ at least:
%\begin{equation*}
%N_k^i(s) \ge (k-1)d - \sqrt{\frac{k-1}{2} \log \left(\frac{1}{\delta_k}\right)}.
%\end{equation*}
%Then we can say that after $\widehat{K}$ episodes such that $(\widehat{K}-1)d - \sqrt{\frac{\widehat{K}-1}{2} \log \left(\frac{1}{\delta_k}\right)}$ with probability 
%
%\begin{equation*}
%    \EX[\text{Regret}|\neg E] \\
%    \le 
%\end{equation*}

Then we bound the term B. When the event $\mathcal{E}_i$ holds, we know the behavior of the policy $br(\pi^i_1)$ in every state $s \in \state^{+,i}_2$. It is then easy to see that, in this case, we are facing a single-agent problem: the single-agent policy is obtained as the union of the policies of the two agents. The policy that is played can be written as:
\begin{equation}
\pi^i(s) = \begin{cases}
\pi^i_1(s) & \text{if } I(s) = 1, \\
\actions_{k,h}^i(s) & \text{if } I(s) = 2,
\end{cases}
\end{equation}
where in this case $\actions_{k,h}^i(s)$ is a singleton for every state $s \in \state_2$ and policy $\pi^i_1$ with $i \in [M]$.

\begin{align*}
\sum_{k=1}^{K} \left( V_1^{\pi_1^\star,br(\pi_1^\star)} - V_1^{\pi_{1,k},  br(\pi_{1,k})} \right) & \le \sum_{k=1}^{K} \left( \widetilde{V}_1^{\pi_{1,k},  br(\pi_{1,k})} - V_1^{\pi_{1,k},  br(\pi_{1,k})} \right),
\end{align*}
with probability $1-\delta$, where we used optimism to bound the regret\footnote{In Lemma~\ref{lemma:confint} we prove that the confidence intervals hold with probability $1-\delta$.}.
Then, for a specific episode $k \le K$ and time step $h \le H$:
\begin{align}
    &\widetilde{V}^{\pi_k}_{1,k,h}(s_{k,h}) -  V^{\pi_k}_{1,k,h}(s_{k,h}) \\&= \widehat{\mathcal{R}}_1(s_{k,h}, a_{k,h}) + b^r_{k,h} - \mathcal{R}_1(s_{k,h}, a_{k,h}) + \widehat{\mathcal{P}}(\cdot|s_{k,h}, a_{k,h}) \widetilde{V}^{\pi_k}_{1,k,h+1} + b^\transitions_{k,h} - \mathcal{P}(\cdot|s_{k,h}, a_{k,h}) V^{\pi_k}_{1,k,h+1} \\
    & =  \underbrace{b^r_{k,h} + \widehat{\mathcal{R}}_1(s_{k,h}, a_{k,h}) - \mathcal{R}_1(s_{k,h}, a_{k,h})}_{\Delta^R_{k,h}} + \underbrace{b^\transitions_{k,h} + (\widehat{\mathcal{P}}(\cdot|s_{k,h}, a_{k,h})-\mathcal{P}(\cdot|s_{k,h}, a_{k,h})) \widetilde{V}^{\pi_k}_{1,k,h+1}}_{\Delta^P_{k,h}} \\&\phantom{=}+\underbrace{\mathcal{P}(\cdot|s_{k,h}, a_{k,h})( \widetilde{V}^{\pi_k}_{1,k,h+1} - V^{\pi_k}_{1,k,h+1}) - (\widetilde{V}^{\pi_k}_{1,k,h+1}(s_{k,h+1}) - V^{\pi_k}_{1,k,h+1}(s_{k,h+1}))}_{\Delta^V_{k,h}} \\ & \phantom{=}+ \widetilde{V}^{\pi_k}_{1,k,h+1}(s_{k,h+1}) - V^{\pi_k}_{1,k,h+1}(s_{k,h+1}).
\end{align}
We are going to bound the different $\Delta$ terms. We define $\Delta^S_{k,h+1}(s) = \widetilde{V}^{\pi_k}_{1,k,h+1}(s) -   V^{\pi_k}_{1,k,h+1}(s)$, and we can say that with probability $1-\delta$:
\begin{align*}
\sum_{k=1}^K\sum_{h=1}^H\Delta^V_{k,h} &= \sum_{k=1}^K\sum_{h=1}^H \left( \EX_{s \sim \mathcal{P}(\cdot|s_{k,h}, a_{k,h})}
[\Delta^S_{k,h+1}(s)] - \Delta^S_{k,h+1}(s_{k,h+1}) \right) \\
& \le \sqrt{2KH\log\left(\frac{1}{\delta}\right)}
\end{align*}
where we apply the Azuma--Hoeffding inequality, since the summands form a martingale difference sequence.

For the second term $\Delta^R_{k,h}$ we apply the confidence intervals on the reward function:
\begin{align}
\sum_{k=1}^K\sum_{h=1}^H \Delta^R_{k,h} &\le \sum_{k=1}^K\sum_{h=1}^H 2b^r_{k,h} =  \sum_{k=1}^K\sum_{h=1}^H \sum_{s,a \in \state \times \actions} 2b^r_{k,h} \indi{s_{k,h} = s} \indi{a_{k,h} = a} \nonumber\\
&=  \sum_{k=1}^K\sum_{h=1}^H \sum_{s,a \in \state \times \actions} 2 \sqrt{\frac{2\log\left(\frac{4SAHk}{\delta}\right)}{N_k(s,a)}} \indi{s_{k,h} = s} \indi{a_{k,h} = a} \label{p:201}\\
&\le 2\sqrt{2\log\left(\frac{4SAHK}{\delta}\right)} \sum_{s,a \in \state \times \actions}\sum_{k=1}^K\sum_{h=1}^H  \sqrt{\frac{1}{N_k(s,a)}} \indi{s_{k,h} = s} \indi{a_{k,h} = a} \label{p:202}\\
&\le 2\sqrt{2\log\left(\frac{4SAHK}{\delta}\right)} \left(\sum_{s,a \in \state \times \actions}\sum_{i=1}^{N_K(s,a)}  \sqrt{\frac{1}{i}}\right) \label{p:203}\\
& \le  4\sqrt{2SAKH \log\left(\frac{4SAHK}{\delta}\right) }, \nonumber
\end{align}
with probability $1-\delta$. In line~\eqref{p:201} we write the confidence intervals explicitly, in line~\eqref{p:202} we bound $k$ inside the logarithm by its maximum value $K$, and in line~\eqref{p:203} we rearrange the sum by state-action pair, so that each visit to a pair contributes the inverse square root of its current visit count.

The terms $\Delta^P_{k,h}$ can be bounded as:
\begin{align*}
\sum_{k=1}^K \sum_{h=1}^H \Delta^P_{k,h} & \le \sum_{k=1}^K \sum_{h=1}^H 2b^\transitions_{k,h} \le 2 H\sum_{k=1}^K \sum_{h=1}^H \sqrt{\frac{2S\log\left(\frac{4SAHk}{\delta}\right)}{N_k(s_{k,h},a_{k,h})}} 
\le 4HS\sqrt{2AKH\log\left(\frac{4SAHK}{\delta}\right)},
\end{align*}
with probability $1-\delta$.

Putting everything together, and including the regret suffered on the events where the confidence intervals do not hold (which occur with probability at most $\delta$):
\begin{align*}
\EX[\Regret(K)] &\le MH\overline{K} + (1-\delta) 8SH\sqrt{2AHK \log\left(\frac{4SAHK}{\delta}\right) } \\&+ 2\delta KH + (1-\delta)\sqrt{2KH\log\left(\frac{1}{\delta}\right)} + \delta KH
\end{align*}
Setting $\delta = \frac{1}{3KH}$, the result follows.
%For this reason we will consider in the rest of the proof that we have only one agent and we are facing a single-agent finite horizon RL problem. The rest of the proof follows same derivation as for UCBVI.

%\begin{align*}
%    \EX[\text{Regret}| \mathcal{E}] =& \sum_{k=1}^K V^*_1(s_{k,1}) - V^{\pi_k}_1(s_{k,1}) \\
%    &\le \sum_{k=1}^K \tilde{V}^{\pi_k}_1(s_{k,1}) -  V^{\pi_k}_1(s_{k,1}), \\
%\end{align*}
%where we used the optimism to bound the regret.
%Then, for a specific episode $k \le K$ and time step $h \le H$:
%\begin{align*}
%    \widetilde{V}^{\pi_k}_1(s_{k,h}) -  V^{\pi_k}_1(s_{k,h}) &= \widetilde{\mathcal{R}}_1(s_{k,h}, a_{k,h}) - \mathcal{R}_1(s_{k,h}, a_{k,h}) + \widetilde{\mathcal{P}}(\cdot|s_{k,h}, a_{k,h}) \widetilde{V}^{\pi_k}_{1,k,h} - \mathcal{P}(\cdot|s_{k,h}, a_{k,h}) V^{\pi_k}_{1,k,h} \\
%    & \le \underbrace{\widetilde{\mathcal{R}}_1(s_{k,h}, a_{k,h}) - \mathcal{R}_1(s_{k,h}, a_{k,h})}_{\Delta^R_{k,h}} + \underbrace{(\widetilde{\mathcal{P}}(\cdot|s_{k,h}, a_{k,h})-\mathcal{P}(\cdot|s_{k,h}, a_{k,h})) \widetilde{V}^{\pi_k}_{1,k,h}}_{\Delta^P_{v,k}} \\&\phantom{=}+\underbrace{( \widetilde{V}^{\pi_k}_{1,k,h} - V^{\pi_k}_{1,k,h})\mathcal{P}(\cdot|s_{k,h}, a_{k,h})}_{\Delta^V_{k,h}}
%\end{align*}
%So we bound the different $\Delta$ terms:
%\begin{align*}
%    \Delta^R_{k,h} \le c.i.
%\end{align*}
%
%\begin{align*}
%    \Delta^P_{k,h} &\le \norm{\widetilde{\mathcal{P}}(\cdot|s_{k,h}, a_{k,h})-\mathcal{P}(\cdot|s_{k,h}, a_{k,h})} \norm{\widetilde{V}^{\pi_k}_{1,k,h}} = c.i. H
%\end{align*}
%
%\begin{align*}
%    \Delta^V_{k,h} \le \EX_{x \sim \mathcal{P}(\cdot|s_{k,h}, a_{k,h})}[\widetilde{V}^{\pi_k}_{1,k,h+1}(x) - V^{\pi_k}_{1,k,h+1}(x)]
%\end{align*}
\end{proof}
\newpage
\section{Auxiliary Lemmas}
\begin{lemma}
\label{lemma:aux1}
For sufficiently small $\epsilon > 0$, the KL divergence between two Bernoulli distributions $p$ and $q$ with parameters $\delta$ and $\delta+\epsilon$, respectively, satisfies:
\begin{equation*}
    KL(p|q) \le 2\epsilon
\end{equation*}
\end{lemma}
\begin{proof}
We start by recalling that, for every $x > -1$:
\begin{equation*}
    \log(1+x) \ge \frac{x}{1+x}
\end{equation*}
Then:
\begin{align*}
    KL(p|q) &= \delta \log\left(\frac{\delta}{\delta+\epsilon}\right) + (1-\delta) \log\left(\frac{1-\delta}{1-\delta-\epsilon}\right) \\
    &= \delta \log\left(\frac{1}{1+\frac{\epsilon}{\delta}}\right) + (1-\delta) \log\left(\frac{1}{1-\frac{\epsilon}{1-\delta}}\right) \\
    &= - \delta \log\left(1+\frac{\epsilon}{\delta}\right) - (1-\delta) \log\left(1-\frac{\epsilon}{1-\delta}\right) \\
    &\le (1-\delta)\frac{\frac{\epsilon}{1-\delta}}{1-\frac{\epsilon}{1-\delta}} = \frac{\epsilon}{1-\frac{\epsilon}{1-\delta}} \le 2 \epsilon,
\end{align*}
where the last inequality is true if $\frac{\epsilon}{1-\delta}\le\frac{1}{2}$.
\end{proof}
In the following lemma we prove that if, for every policy $\pi_i$ with $i \in [M]$, every state $s \in \state_2^{+,i}$ is visited at each time step $h \in [H]$ with probability at least $d$, then after the policy has been played ``enough'' times every reachable state has been visited at least once.
\begin{lemma}
\label{lemma:visitationonline}
For each policy $\pi_i$ with $i \in [M]$ if $N_k(\pi_i) \geq \bar{K}$, where $\bar{K} \geq \frac{\log\left(\frac{MSH}{\delta}\right)}{\log\left(\frac{1}{1-d}\right)}$, then every state $s \in \state_2^{+,i}$ is visited at least one time with probability $1-\delta$.
\end{lemma}
\begin{proof}
\begin{small}
We start by bounding the probability that, for at least one policy, there is at least one reachable state that has not yet been visited:
\begin{align*}
\mathrm{P}\{ &\exists \pi_1^i \text{ with } i \in [M], \quad \exists s \in \state^{+,i}_2, \quad \exists h \in [H] \text{ such that } N_{k,h}^i(s) = 0 \text{ and } N_{k}(\pi^i_1) \ge \overline{K}\} \le \\
&\sum_{i \in [M]} \sum_{s \in \state^{+,i}_2} \sum_{h \in [H]} \mathrm{P}\{ N_{k,h}^i(s) = 0 \text{ and } N_{k}(\pi^i_1) \ge \overline{K}\} \le M S H (1 - d)^{\overline{K}}.
\end{align*}
We require this probability to be at most $\delta$:
\begin{align*}
&M S H (1 - d)^{\overline{K}} \le \delta \\
& \overline{K} \log\left(\frac{1}{1-d}\right) \ge \log\left(\frac{MSH}{\delta}\right) \\
& \overline{K} \ge \frac{\log\left(\frac{MSH}{\delta}\right)}{\log\left(\frac{1}{1-d}\right)}.
\end{align*}
Then the result follows.
\end{small}
\end{proof}

Then we provide the lemma for the confidence intervals:
\begin{lemma}
\label{lemma:confint}
The confidence intervals derived from the bonuses $b^r$ and $b^\transitions$ hold with probability at least $1-\delta$.
\end{lemma}
\begin{proof}
\begin{small}
We recall that the bonus terms used are respectively:
\begin{align*}
b^r_k(s,a) = \sqrt{\frac{2\log\left(\frac{4SAHk}{\delta}\right)}{N_k(s,a)}} \quad \quad b^\transitions_{k,h}(s,a) = h\sqrt{\frac{2S\log\left(\frac{4SAHk}{\delta}\right)}{N_k(s,a)}}.
\end{align*}
The bonus terms are derived directly from Hoeffding's concentration inequality and a union bound.

\end{small}
\end{proof}
% If your paper is accepted and the title of your paper is very long,
% the style will print as headings an error message. Use the following
% command to supply a shorter title of your paper so that it can be
% used as headings.
%
%\runningtitle{I use this title instead because the last one was very long}

% If your paper is accepted and the number of authors is large, the
% style will print as headings an error message. Use the following
% command to supply a shorter version of the authors names so that
% they can be used as headings (for example, use only the surnames)
%
%\runningauthor{Surname 1, Surname 2, Surname 3, ...., Surname n}

% Supplementary material: To improve readability, you must use a single-column format for the supplementary material.

\vfill
% \end{document}
\end{document}
