% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

%% my preamble
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{dsfont}
\usepackage{subcaption}
% \usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{hyperref}

\algrenewcommand\algorithmicindent{1.0em}%
% Upright "arg max" / "arg min" operators.
% \limits keeps the subscript underneath the operator in inline math as well
% as in displays. \mathrm replaces the obsolete {\rm ...} font switch, and \,
% is the thin space conventionally used inside these operator names.
\newcommand*{\argmax}{\mathop{\mathrm{arg\,max}}\limits}
\newcommand*{\argmin}{\mathop{\mathrm{arg\,min}}\limits}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}

\allowdisplaybreaks[1]

%%% HELPER CODE FOR DEALING WITH EXTERNAL REFERENCES
\usepackage{xr}
\makeatletter
% \addFileDependency{<file>}: writes "(<file>)" to the terminal/log, appends
% <file> to LaTeX's file list, and prints "No file <file>." when the file
% cannot be found.
% NOTE(review): presumably the "(<file>)" line is there so that build tools
% scanning the log pick up the dependency -- confirm.
\newcommand*{\addFileDependency}[1]{
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<base>}: passes <base> to xr's \externaldocument and
% registers <base>.tex and <base>.aux through \addFileDependency.
\newcommand*{\myexternaldocument}[1]{
    \externaldocument{#1}
    \addFileDependency{#1.tex}
    \addFileDependency{#1.aux}
}
%%% END HELPER CODE
\myexternaldocument{abe_504-supp}

\title{Mutation-Driven Follow the Regularized Leader for Last-Iterate Convergence in Zero-Sum Games}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<abe\_kenshi@cyberagent.co.jp>?Subject=Your UAI 2022 paper}{Kenshi Abe}{}}
\author[2]{Mitsuki Sakamoto}
\author[2]{Atsushi Iwasaki}
% Add affiliations after the authors
\affil[1]{%
    CyberAgent, Inc.
}
\affil[2]{%
    University of Electro-Communications
}
  
\begin{document}
\maketitle

\begin{abstract}
In this study, we consider a variant of the Follow the Regularized Leader (FTRL) dynamics in two-player zero-sum games.
FTRL is guaranteed to converge to a Nash equilibrium when time-averaging the strategies, while a lot of variants suffer from the issue of limit cycling behavior, i.e., lack the last-iterate convergence guarantee.
To this end, we propose mutant FTRL (M-FTRL), an algorithm that introduces mutation for the perturbation of action probabilities.
We then investigate the continuous-time dynamics of M-FTRL and provide the strong convergence guarantees toward stationary points that approximate Nash equilibria under full-information feedback.
Furthermore, our simulation demonstrates that M-FTRL can enjoy faster convergence rates than FTRL and optimistic FTRL under full-information feedback and surprisingly exhibits clear convergence under bandit feedback.
\end{abstract}

\begin{figure*}[t!]
    \centering
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figs/continuous_time/brps-rd-crop.pdf}
        \subcaption{RD}\label{fig:FTRL-trajectory}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figs/continuous_time/brps-rmd-mu0.01-crop.pdf}
        \subcaption{RMD ($\mu=0.01$)}\label{fig:M-FTRL-trajectory-0.01}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figs/continuous_time/brps-rmd-mu0.1-crop.pdf}
        \subcaption{RMD ($\mu=0.1$)}\label{fig:M-FTRL-trajectory-0.1}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figs/continuous_time/brps-rmd-mu1-crop.pdf}
        \subcaption{RMD ($\mu=1.0$)}\label{fig:M-FTRL-trajectory-1.0}
    \end{minipage}
    \caption{
    Learning dynamics of RD and RMD in biased Rock-Paper-Scissors.
    The red dot represents the Nash equilibrium point of the game.
    }
    \label{fig:learningdynamics}
\end{figure*}

\section{Introduction}
Our study focuses on the problem of learning an equilibrium in two-player zero-sum games.
In order to find an equilibrium in two-player zero-sum games, we need to solve a minimax optimization (or saddle-point optimization) in the form of $\min_{x}\max_{y}f(x,y)$.
Motivated by advances of multi-agent reinforcement learning~\citep{busoniu2008comprehensive} and Generative Adversarial Networks (GANs)~\citep{goodfellow2014generative}, the development of algorithms that efficiently approximate the solution of the minimax optimization is attracting considerable interest \citep{blum:agt:2007,daskalakis2017training}.

There are a lot of studies focusing on developing no-regret learning algorithms where the iterate-average strategy profile converges to a Nash equilibrium of two-player zero-sum games~\citep{banerjee2005efficient,zinkevich2007regret,daskalakis2011near}.
However, well-known no-regret learning algorithms such as Follow the Regularized Leader (FTRL) are shown to cycle and fail to converge without time-averaging~\citep{mertikopoulos2018cycles,bailey2018multiplicative}.
In recent years, several studies have developed and analyzed algorithms whose trajectory of updated strategies directly converges to an equilibrium without forming a cycle, such as optimistic FTRL (O-FTRL)~\citep{daskalakis2017training,daskalakis2018last,mertikopoulos2018optimistic,wei2020linear,lei2021last}.
This convergence property is known as {\it last-iterate convergence}.
However, establishing the explicit convergence rates of optimistic multiplicative weights update, which is tantamount to O-FTRL with entropy regularization, requires that the equilibrium in underlying games must be unique \citep{daskalakis2018last,wei2020linear}.

In this study, as an alternative, we propose mutant FTRL\footnote{An implementation of our method is available at \url{https://github.com/CyberAgentAILab/mutant-ftrl}.} (M-FTRL), an algorithm that introduces mutation for the perturbation of action probabilities.
We first identify the discrete-time version of the M-FTRL dynamics and then modify it to the continuous-time version to provide the theoretical analysis. We prove the following: 
1) M-FTRL dynamics induced by the entropy regularizer is equivalent to replicator-mutator dynamics (RMD)~\citep{hofbauer:mor:2009,zagorsky:plosone:2013,Bauer:2019};
2) for general regularization functions, the strategy trajectory of M-FTRL converges to a stationary point of the RMD;
3) the trajectory of M-FTRL with the entropy regularizer converges to an approximate Nash equilibrium at an exponentially fast rate.
To the best of our knowledge, we are the first to provide the convergence result for RMD in two-player zero-sum games.

Furthermore, our simulation demonstrates that M-FTRL can enjoy faster convergence rates than FTRL and optimistic FTRL under full-information feedback, i.e., M-FTRL converges to a stationary point, which approximates a Nash equilibrium, faster. 
It also exhibits clear convergence under partial-information or bandit feedback, where each player takes the feedback about the payoffs from his or her chosen actions. 
We empirically observe last-iterate convergence behavior of the M-FTRL dynamics under bandit feedback, just as under full-information feedback, while neither FTRL nor O-FTRL exhibits such behavior.  
This is surprising because it is an open question if a last-iterate convergence guarantee is provided under bandit feedback. 


\section{Related Literature}
\paragraph{Average-iterate convergence}
There are a lot of previous studies focusing on developing no-regret learning algorithms that enjoy average-iterate convergence in two-player zero-sum games \citep{cesa2006prediction,zinkevich2007regret,hofbauer:mor:2009,syrgkanis2015fast}.
FTRL is one of the most widely studied no-regret learning algorithms and has been shown to be convergent if the equilibrium is deterministic or strict \citep{mertikopoulos2018cycles,giannou2021convergence}.
If the equilibrium strategy is a mixed strategy with full support, FTRL's trajectory can be recurrent \citep{mertikopoulos2018cycles}.
For extensive-form games, counterfactual regret minimization \citep{zinkevich2007regret} and its variants have been developed as a no-regret learning algorithm \citep{gibson2012generalized,tammelin2014solving,lanctot:nips:2017,schmid2019variance,brown2019solving,davis2020low}.
However, for most of these algorithms, convergence of the last-iterate strategy has not been proven.

\paragraph{Last-iterate convergence}
In recent years, various algorithms using an optimistic online learning framework \citep{rakhlin2013online,rakhlin2013optimization} have been proposed for last-iterate convergence in minimax optimization.
Optimistic gradient descent ascent \citep{daskalakis2017training,mertikopoulos2018optimistic,wei2020linear} and optimistic multiplicative weights update \citep{daskalakis2018last,wei2020linear,lei2021last} are the variants of O-FTRL, and they have been shown to enjoy the last-iterate convergence guarantee in constrained and unconstrained saddle optimization problems. 
Furthermore, \citet{nguyen2021last} have proposed the no-regret learning algorithm, which exhibits the last-iterate convergence in asymmetric repeated games.
In contrast to their optimistic modification of FTRL, which boosts updates for expected utilities, 
our method is motivated by replicator-mutator dynamics and provides an alternative way to enjoy the last-iterate convergence guarantee.   


\paragraph{Replicator-mutator dynamics}
Evolutionary game theory has been strongly related to learning dynamics. 
In fact, it is well-known that cross learning converges to the replicator dynamics (RD) in the continuous-time limit \citep{borgers:jet:1997,Bloembergen2015}, similarly to FTRL. 
On the other hand, RMD \citep{Hofbauer1998} has been overlooked in the context of learning.  
Introducing mutation empirically makes numerical errors in computation small \citep{zagorsky:plosone:2013}. 
However, it makes it difficult to analyze the properties of the dynamics. Some notable exceptions report that mutation stabilizes the dynamics~\citep{bomze:geb:1995,Bauer:2019}. If $\pi^{\mu}$ is an interior stationary point of RMD with mutation rate $\mu$, then $\pi^{\mu}$ is an $\varepsilon$-Nash equilibrium of the underlying game for $\varepsilon=\mu$ \citep{Bauer:2019}. Also, evolutionary game dynamics such as RD typically exhibit continua of stationary points and are unlikely to converge to a unique, stable stationary point. Mutation dissolves continua of neutrally stable equilibria into isolated, asymptotically stable ones \citep{bomze:geb:1995}.

\section{Preliminaries}
\subsection{Two-Player Zero-Sum Normal-Form Game}
A two-player normal-form game is defined by utility functions $u_i\in [-u_{\max}, u_{\max}]^{A_1\times A_2}$, where $A_i$ is the finite action space for player $i\in \{1, 2\}$.
In a two-player zero-sum normal-form game, $u_i$ satisfies $u_1(a_1, a_2)+u_2(a_1, a_2)=0$ for all $a_1\in A_1$ and $a_2\in A_2$.
In this game, each player $i$ selects action $a_i\in A_i$ simultaneously.
Then, player $i$ receives utility $u_i(a_1, a_2)$.
Let us denote $\pi_i\in \Delta(A_i)$ as a {\it mixed strategy} for player $i$, where $\Delta(A_i):=\{p \in [0,1]^{|A_i|} ~|~ \sum_{a_i\in A_i}p(a_i)=1\}$ represents the probability simplex on $A_i$.
We define a {\it strategy profile} as $\pi=(\pi_1, \pi_2)$.
For a given strategy profile $\pi$, the expected utility for player $i$ is given by $v_i^{\pi}=\mathbb{E}_{a\sim \pi}\left[u_i(a_1, a_2)\right]$.
We further define the conditional expected utility of taking action $a_i\in A_i$ as $q^{\pi}_i(a_i)=\mathbb{E}_{a_{-i}\sim \pi_{-i}}[u_i(a_i, a_{-i}) | a_i]$, where $-i$ represents the opponent of player $i$.
Finally, we denote the conditional expected utility vector as $q_i^{\pi}=(q_i^{\pi}(a_i))_{a_i\in A_i}$.


\subsection{Nash Equilibrium and Exploitability}
A common solution concept for two-player games is a {\it Nash equilibrium} \citep{nash1951non}, where no player can improve his/her expected utility by deviating from his/her specified strategy.
In two-player zero-sum normal-form games, a Nash equilibrium $\pi^{\ast}=(\pi_1^{\ast}, \pi_2^{\ast})$ ensures the following condition: $\forall \pi_1\in \Delta(A_1), \forall \pi_2\in \Delta(A_2),$
\begin{align*}
    v_1^{\pi_1^{\ast}, \pi_2} \geq v_1^{\pi_1^{\ast}, \pi_2^{\ast}} \geq v_1^{\pi_1, \pi_2^{\ast}}.
\end{align*}
An {\it $\epsilon$-Nash equilibrium} $(\pi_1, \pi_2)$ is an approximation of a Nash equilibrium, which satisfies the following inequality:
\begin{align*}
    \max_{\tilde{\pi}_1\in \Delta(A_1)}v_1^{\tilde{\pi}_1, \pi_2} + \max_{\tilde{\pi}_2\in \Delta(A_2)}v_2^{\pi_1, \tilde{\pi}_2} \leq \epsilon.
\end{align*}
Furthermore, we call $\mathrm{exploit}(\pi):=\max_{\tilde{\pi}_1\in \Delta(A_1)}v_1^{\tilde{\pi}_1, \pi_2} + \max_{\tilde{\pi}_2\in \Delta(A_2)}v_2^{\pi_1, \tilde{\pi}_2}$ the {\it exploitability} of a given strategy profile $\pi$.
Exploitability is a metric for measuring how close $\pi$ is to a Nash equilibrium $\pi^{\ast}$ in two-player zero-sum games \citep{johanson2011accelerating,johanson2012finding,lockhart2019computing,timbers2020approximate,abe2020off}.
From the definition, a Nash equilibrium $\pi^{\ast}$ has the lowest exploitability of $0$.

\subsection{Problem Setting}
In this study, we consider the setting where the game is played repeatedly for $T$ iterations.
At each iteration $t\in [T]$, each player $i$ determines the (mixed) strategy $\pi_i^t\in \Delta(A_i)$ based on the past-observed feedback.
Then, each player $i$ observes the new feedback.
In this study, we focus on two feedback cases: {\it full-information feedback} and {\it bandit feedback}.
At the end of the iteration $t$ under full-information feedback, player $i$ observes the conditional expected utility vector $(q_i^{\pi^t}(a_i))_{a_i \in A_i}$ as feedback.
Under bandit feedback, each player $i$ chooses an action $a_i^t$ according to $\pi_i^t$.
Then, each player observes the realized utility $u_i(a_1^t, a_2^t)$.

FTRL is a widely used learning algorithm in the repeated game setting.
For player $i$, FTRL methods are defined with {\it regularization function} $\psi_i: \Delta(A_i)\to \mathbb{R}$, which is strictly convex and continuously differentiable on $\Delta(A_i)$.
In FTRL, each player $i$ determines her strategy $\pi_i^t$ at iteration $t$ as follows:
\begin{align*}
    \pi_i^t &= \argmax_{p\in \Delta(A_i)} \left\{\eta\left\langle y_i^t, p\right\rangle - \psi_i(p)\right\},\\
    y_i^t(a_i) &= \sum_{s=1}^{t-1} q^{\pi^s}_i(a_i),
\end{align*}
where $\eta>0$ is the learning rate.

\subsection{Other Notations}
We denote the interior of the probability simplex $\Delta(A_i)$ by $\Delta^{\circ}(A_i) := \{p \in \Delta(A_i) ~|~ \forall a_i\in A_i, ~p(a_i) > 0\}$.
For a strictly convex and continuously differentiable function $\psi$, the associated {\it Bregman divergence} is defined as $D_{\psi}(x, x')=\psi(x) - \psi(x') - \langle \nabla \psi(x'), x-x'\rangle$.
The {\it Kullback-Leibler divergence}, which is the Bregman divergence with the entropy regularizer $\psi(x)=\sum_i x_i \ln x_i$, is denoted by $\mathrm{KL}(x, x')=\sum_i x_i\ln \frac{x_i}{x_i'}$.
Besides, we define the sum of Bregman divergences and sum of Kullback-Leibler divergences as $D_{\psi}(\pi, \pi')=\sum_{i=1}^2D_{\psi_i}(\pi_i, \pi_i')$ and $\mathrm{KL}(\pi, \pi')=\sum_{i=1}^2\mathrm{KL}(\pi_i, \pi_i')$, respectively.

\section{Mutant Follow the Regularized Leader}
In this section, we introduce {\it Mutant Follow the Regularized Leader} (M-FTRL), which is inspired by the RMD~\citep{Hofbauer1998,zagorsky:plosone:2013}. 
Let us see what happens in a biased version of the Rock-Paper-Scissors game, see Table~\ref{tab:biased-rps}.
Figure~\ref{fig:learningdynamics} compares trajectories of RD and RMD with varying mutation parameters $\mu$ (see (\ref{eq:rmd}) for the differential equation of RMD).
Note that $\mu$ represents the parameter that controls the strength of mutation.
Figure~\ref{fig:FTRL-trajectory} shows that the trajectories form a cycle and never converge to the Nash equilibrium because the game is intransitive. 
Note, however, that the time-averaged trajectory of FTRL converges to interior Nash equilibria in two-player zero-sum games~\citep{hofbauer:mor:2009}. 
In contrast, Figures~\ref{fig:M-FTRL-trajectory-0.01} and \ref{fig:M-FTRL-trajectory-0.1} exhibit a clear convergence to the unique stationary point, which is almost equivalent to the interior Nash equilibrium (the red dot) without taking the time average. As the mutation parameter increases to $1.0$, although the stationary point becomes far from the Nash equilibrium, it is still asymptotically stable in Figure~\ref{fig:M-FTRL-trajectory-1.0}. 
Thus, mutation is expected to ensure that the trajectory of a learning dynamics reaches an approximated equilibrium.

\subsection{Algorithm}
\begin{figure}[t!]
\begin{algorithm}[H]
    \caption{Mutant Follow the Regularized Leader with adaptive reference strategies for player $i$.}
    \label{alg:m-ftrl}
    \begin{algorithmic}[1]
    \Require{Time horizon $T$, learning rate $\eta$, regularization function $\psi_i$, mutation parameter $\mu$, update frequency $N$, initial strategy $\pi_i^0$}
    \State $c_i\gets \left(\frac{1}{|A_i|}\right)_{a_i \in A_i}$
    \State $\tau \gets 0$
    \State Initialize $z_i^0$ so that $\pi_i^{0} = \argmax_{p\in \Delta(A_i)} \left\{\left\langle z_i^0, p\right\rangle - \psi_i(p)\right\}$
    \For{$t=1,2,\cdots, T$}
        \State Compute strategy $\pi_i^t$ by $$\pi_i^t = \argmax_{p\in \Delta(A_i)} \left\{\left\langle z_i^t, p\right\rangle - \psi_i(p)\right\}$$
        \For{$a \in A_i$}
            \State $\!z_i^{t+1}(a)\!\gets\! z_i^{t}(a) \!+ \eta \!\left(\! q_i^{\pi^t}(a) \!+\! \frac{\mu}{\pi_i^t(a)}\!\left(\!c_i(a)\!-\!\pi_i^t(a)\right)\!\right)\!$
        \EndFor
        \State $\tau \gets \tau + 1$
        \If{$\tau = N$}
            \State $c_i \gets \pi_i^t$
            \State $\tau\gets 0$
        \EndIf
    \EndFor
    \end{algorithmic}
\end{algorithm}
\end{figure}
We propose a discrete-time version of the M-FTRL algorithm under two feedback cases: full-information feedback and bandit feedback.
First, we provide the strategy update rule under full-information feedback:
\begin{align}
    \label{eq:m-ftrl_discrete}
    \pi_i^t &= \argmax_{p\in \Delta(A_i)} \left\{\eta\left\langle \sum_{s=1}^{t-1}q_i^{\mu, s}, p\right\rangle - \psi_i(p)\right\}, \\
    q_i^{\mu, s}(a_i) &= q_i^{\pi^s}(a_i) + \frac{\mu}{\pi_i^s(a_i)}\left(c_i(a_i)-\pi_i^s(a_i)\right), \nonumber
\end{align}
where $\eta>0$ is the learning rate, $\mu>0$ is the {\it mutation parameter}, and $c_i\in \Delta^{\circ}(A_i)$ is the {\it reference strategy}.

As shown in Figures~\ref{fig:M-FTRL-trajectory-0.01}--\ref{fig:M-FTRL-trajectory-1.0}, strategies $\pi_i^t$ updated by (\ref{eq:m-ftrl_discrete}) would converge to the stationary point, which is different from the Nash equilibrium of the original game.
The stationary point is a $2\mu$-Nash equilibrium of the original game, and it is not a Nash equilibrium unless $(c_1, c_2)$ is a Nash equilibrium (see Theorem \ref{thm:expoitability_bound}).
Therefore, for convergence to a Nash equilibrium of the original game, we introduce a technique to adapt the reference strategy.
That is, we copy probabilities from $\pi_i^t$ into $c_i$ every $N(\leq T)$ iterations.
This technique is similar to the direct convergence method by \citet{perolat2021poincare}.
The pseudo-code of our algorithm with adaptive reference strategies is presented in Algorithm \ref{alg:m-ftrl}.

Under bandit feedback, each player $i$ needs to estimate $q_i^{\mu, t} = \left(q_i^{\pi^t}(a_i) + \frac{\mu}{\pi_i^t(a_i)}\left(c_i(a_i)-\pi_i^t(a_i)\right)\right)_{a_i\in A_i}$ from the realized utility $u_i(a_1^t, a_2^t)$.
Similarly to \citet{wei2018more,ito2021parameter}, we construct the following estimator $\hat{q}_i^{\mu, t}$:
\begin{align}
    \label{eq:estimator}
    \!\hat{q}_i^{\mu, t}(a_i) \!=\! \frac{u_i(a_1^t, a_2^t)}{\pi_i^t(a_i^t)}\!\mathds{1}[a_i = a_i^t] \!+\! \frac{\mu}{\pi_i^t(a_i)}\!\left(c_i(a_i) \!-\! \pi_i^t(a_i)\right).
\end{align}
It is easy to confirm that $\hat{q}_i^{\mu, t}$ is an unbiased estimator of $q_i^{\mu, t}$.
Under bandit feedback, M-FTRL updates the strategy $\pi_i^t$ by the following update rule, which uses $\hat{q}_i^{\mu, t}$ instead of $q_i^{\mu, t}$ in (\ref{eq:m-ftrl_discrete}):
\begin{align*}
    \pi_i^t &= \argmax_{p\in \Delta(A_i)} \left\{\eta\left\langle \sum_{s=1}^{t-1}\hat{q}_i^{\mu, s}, p\right\rangle - \psi_i(p)\right\}.
\end{align*}
Note that M-FTRL does not require any information about the opponent's strategy $\pi_{-i}^t$ under bandit feedback.


\section{Theoretical Analysis}
In this section, we provide the theoretical relationship between RMD and M-FTRL and the last-iterate convergence guarantee of M-FTRL.
Instead of the discrete-time version of M-FTRL algorithm, we analyze the theoretical properties of the following continuous-time version of M-FTRL dynamics:
\begin{align}
    \label{eq:m-ftrl}
    \pi_i^t &= \argmax_{p\in \Delta(A_i)} \left\{\left\langle z_i^t, p\right\rangle - \psi_i(p)\right\}, \\
    z_i^t(a_i) &= \int_0^t \left(q^{\pi^s}_i(a_i) + \frac{\mu}{\pi_i^s(a_i)}\left(c_i(a_i)-\pi_i^s(a_i)\right)\right) ds. \nonumber
\end{align}

First, we show that this dynamics is a generalization of RMD \citep{Bauer:2019}.
That is, the dynamics of M-FTRL with the entropy regularizer $\psi_i(p)=\sum_{a_i\in A_i}p(a_i)\ln p(a_i)$ induces RMD:
\begin{theorem}
\label{thm:rmd}
The dynamics defined by (\ref{eq:m-ftrl}) with the entropy regularizer $\psi_i(p)=\sum_{a_i\in A_i}p(a_i)\ln p(a_i)$ is equivalent to replicator-mutator dynamics:
\begin{align}
\label{eq:rmd}
\begin{aligned}
    \frac{d}{dt}\pi_i^t(a_i) =& \pi_i^t(a_i)\left(q_i^{\pi^t}(a_i) - v_i^{\pi^t}\right) \\
    &+ \mu\left(c_i(a_i)-\pi_i^t(a_i)\right).
\end{aligned}  \tag{RMD}
\end{align}
\end{theorem}
The proof of this theorem is shown in Appendix \ref{sec:appendix_proof_thm_rmd}.

From here, we derive the relationship between the stationary point $\pi^{\mu}$ of (\ref{eq:rmd}) (i.e., the strategy profile that satisfies $\frac{d}{dt}\pi_i^{\mu}(a_i)=0$ for all $i\in \{1,2\}$ and $a_i\in A_i$) and the updated strategy profile $\pi^t$.
Note that, from Lemma 3.3 in \citet{Bauer:2019}, for any $\mu>0$ there exists $\pi^{\mu}\in \prod_{i=1}^2\Delta^{\circ}(A_i)$ such that $\pi^{\mu}$ is a stationary point of (\ref{eq:rmd}).
Thus, $\pi^{\mu}$ is well-defined.
We first derive the time derivative of the (sum of) Bregman divergence between $\pi^{\mu}$ and $\pi^t$:
\begin{theorem}
\label{thm:bregman_div}
Let $\pi^{\mu}\in \prod_{i=1}^2\Delta(A_i)$ be a stationary point of (\ref{eq:rmd}).
Then, $\pi^t$ updated by M-FTRL satisfies that:
\begin{align*}
    &\frac{d}{dt}D_{\psi}(\pi^{\mu}, \pi^t) \\
    &= - \mu\sum_{i=1}^2\sum_{a_i\in A_i}c_i(a_i)\left(\sqrt{\frac{\pi_i^t(a_i)}{\pi_i^{\mu}(a_i)}}-\sqrt{\frac{\pi_i^{\mu}(a_i)}{\pi_i^t(a_i)}}\right)^2.
\end{align*}
Furthermore, if the regularizer is entropy $\psi_i(p)=\sum_{a_i\in A_i}p(a_i)\ln p(a_i)$, then $\pi^t$ satisfies that:
\begin{align*}
    \frac{d}{dt}\mathrm{KL}(\pi^{\mu}, \pi^t) \leq -\mu \xi\mathrm{KL}(\pi^{\mu}, \pi^t),
\end{align*}
where $\xi=\min_{i\in \{1,2\}, a_i\in A_i}\frac{c_i(a_i)}{\pi_i^{\mu}(a_i)}$.
\end{theorem}

The first statement implies that $\frac{d}{dt}D_{\psi}(\pi^{\mu}, \pi^t)=0$ holds if and only if $\pi^t=\pi^{\mu}$, and $\forall \pi^t\neq \pi^{\mu}, \frac{d}{dt}D_{\psi}(\pi^{\mu}, \pi^t)<0$.
Thus, by Lyapunov arguments \citep{khalil2015nonlinear}, the Bregman divergence between $\pi^{\mu}$ and $\pi^t$ converges to $0$, and then $\pi^t$ converges to $\pi^{\mu}$.
Note that Theorem \ref{thm:bregman_div} holds for all stationary points of (\ref{eq:rmd}).
This means that for a fixed $\mu$ and $(c_i)_{i=1}^2$, the stationary point is unique.
From the second statement, we can show that exponential convergence rates can be achieved when using the entropy regularizer:
\begin{corollary}
\label{cor:KL_bound}
Assume that the regularizer is entropy $\psi_i(p)=\sum_{a_i\in A_i}p(a_i)\ln p(a_i)$.
Then, M-FTRL's trajectory converges to a stationary point of (\ref{eq:rmd}) exponentially fast, i.e., 
\begin{align*}
\mathrm{KL}(\pi^{\mu}, \pi^t) \leq \mathrm{KL}(\pi^{\mu}, \pi^0)\exp\left(-\mu \xi t\right).
\end{align*}
\end{corollary}

Finally, combining this corollary and Lemma 3.5 in \citet{Bauer:2019}, we can derive the exploitability bound of $\pi^t$.
\begin{theorem}
Assume that the regularizer is entropy $\psi_i(p)=\sum_{a_i\in A_i}p(a_i)\ln p(a_i)$.
Then, the exploitability for M-FTRL is bounded as:
\label{thm:expoitability_bound}
\begin{align*}
    &\mathrm{exploit}(\pi^t) \leq 2\mu \\
    &+ 2u_{\max}\sqrt{(\ln 2)\mathrm{KL}(\pi^{\mu}, \pi^0)}\exp\left(-\frac{\mu\xi}{2}  t\right).
\end{align*}
\end{theorem}
Theorem \ref{thm:expoitability_bound} means that $\pi^t$ converges to a $2\mu$-Nash equilibrium exponentially fast.
The proof of the theorem is shown in Section \ref{sec:proof_thm_exploitability}.

\subsection{Proof Sketch of Theorem \ref{thm:bregman_div}}
We sketch below the proof of Theorem \ref{thm:bregman_div}.
The complete proof and the associated lemmas are presented in Appendix \ref{sec:appendix_proof_lem_bregman_div}-\ref{sec:appendix_proof_thm_bregman_div}.

\paragraph{Proof of the first part of Theorem \ref{thm:bregman_div}.}
First, we derive the time derivative of the Bregman divergence between $\pi\in \prod_{i=1}^2\Delta(A_i)$ and $\pi^t$:
\begin{lemma}
\label{lem:bregman_div}
For any $\pi\in \prod_{i=1}^2\Delta(A_i)$, $\pi^t$ updated by M-FTRL satisfies that:
\begin{align*}
    \frac{d}{dt}&D_{\psi}(\pi, \pi^t) \\
    =& \sum_{i=1}^2 v_i^{\pi_i^t, \pi_{-i}} + 2\mu - \mu\sum_{i=1}^2\sum_{a_i\in A_i}c_i(a_i)\frac{\pi_i(a_i)}{\pi_i^t(a_i)}.
\end{align*}
\end{lemma}
The proof of Lemma \ref{lem:bregman_div} stems from the fact that $D_{\psi}(\pi, \pi^t)=\sum_{i=1}^2\!\left(\max_{p\in \Delta(A_i)}\left\{\left\langle z_i^t, p\right\rangle \!-\! \psi_i(p)\right\}-\langle z_i^t, \pi_i\rangle + \psi_i(\pi_i)\right)$.

Next, we derive the relationship between the expected utilities $v^{\pi^{\mu}}$ and $v^{\pi_i', \pi_{-i}^{\mu}}$ for any $\pi_i'\in \Delta(A_i)$:
\begin{lemma}
\label{lem:rmd_property}
Let $\pi^{\mu}\in \prod_{i=1}^2\Delta(A_i)$ be a stationary point of (\ref{eq:rmd}).
Then, for any $i\in \{1, 2\}$ and $\pi_i' \in \Delta(A_i)$:
\begin{align*}
    &v_i^{\pi_i',\pi_{-i}^{\mu}} = v_i^{\pi^{\mu}} + \mu - \mu\sum_{a_i\in A_i}c_i(a_i)\frac{\pi_i'(a_i)}{\pi_i^{\mu}(a_i)}.
\end{align*}
\end{lemma}
This result can be shown by the fact that $\pi^{\mu}$ is the stationary point of (\ref{eq:rmd}), i.e., $\pi_i^{\mu}(a_i)\left(q_i^{\pi^{\mu}}(a_i) - v_i^{\pi^{\mu}}\right) + \mu\left(c_i(a_i)-\pi_i^{\mu}(a_i)\right)=0$ for all $i\in \{1, 2\}$.

By combining Lemmas \ref{lem:bregman_div} and \ref{lem:rmd_property}, we can obtain:
\begin{align*}
    \frac{d}{dt}&D_{\psi}(\pi^{\mu}, \pi^t) \\
    =&  \sum_{i=1}^2 v_i^{\pi_i^t,\pi_{-i}^{\mu}} + 2\mu - \mu\sum_{i=1}^2\sum_{a_i\in A_i}c_i(a_i)\frac{\pi_i^{\mu}(a_i)}{\pi_i^t(a_i)} \\
    =& \sum_{i=1}^2v_i^{\pi^{\mu}} \!+\! 4\mu \!-\! \mu\sum_{i=1}^2\sum_{a_i\in A_i}c_i(a_i)\left(\frac{\pi_i^t(a_i)}{\pi_i^{\mu}(a_i)}+\frac{\pi_i^{\mu}(a_i)}{\pi_i^t(a_i)}\right) \\
    =& 4\mu - \mu\sum_{i=1}^2\sum_{a_i\in A_i}c_i(a_i)\left(\frac{\pi_i^t(a_i)}{\pi_i^{\mu}(a_i)}+\frac{\pi_i^{\mu}(a_i)}{\pi_i^t(a_i)}\right) \\
    =& - \mu\sum_{i=1}^2\sum_{a_i\in A_i}c_i(a_i)\left(\sqrt{\frac{\pi_i^t(a_i)}{\pi_i^{\mu}(a_i)}}-\sqrt{\frac{\pi_i^{\mu}(a_i)}{\pi_i^t(a_i)}}\right)^2,
\end{align*}
where the third equality follows from $\sum_{i=1}^2v_i^{\pi^{\mu}}=0$ by the definition of zero-sum games.
This concludes the first statement of the theorem.

\paragraph{Proof of the second part of Theorem \ref{thm:bregman_div}.}
Let us define $\xi_i=\min_{a_i\in A_i}\frac{c_i(a_i)}{\pi_i^{\mu}(a_i)}$.
From the first part of the theorem, we have:
\begin{align}
\label{eq:J_dot_KL_main}
    &\frac{d}{dt}D_{\psi}(\pi^{\mu}, \pi^t) \nonumber \\
    &= - \mu\sum_{i=1}^2\sum_{a_i\in A_i}c_i(a_i)\left(\frac{\pi_i^t(a_i)}{\pi_i^{\mu}(a_i)}+\frac{\pi_i^{\mu}(a_i)}{\pi_i^t(a_i)} - 2\right) \nonumber\\
    &= - \mu\sum_{i=1}^2\sum_{a_i\in A_i}\frac{c_i(a_i)}{\pi_i^{\mu}(a_i)}\frac{(\pi_i^t(a_i)-\pi_i^{\mu}(a_i))^2}{\pi_i^t(a_i)} \nonumber\\
    &\leq - \mu\sum_{i=1}^2\xi_i\sum_{a_i\in A_i}\frac{(\pi_i^t(a_i)-\pi_i^{\mu}(a_i))^2}{\pi_i^t(a_i)} \nonumber\\
    &\leq - \mu\sum_{i=1}^2\xi_i\ln \left(1 + \sum_{a_i\in A_i}\frac{(\pi_i^t(a_i)-\pi_i^{\mu}(a_i))^2}{\pi_i^t(a_i)}\right) \nonumber\\
    &= - \mu\sum_{i=1}^2\xi_i\ln \left(\sum_{a_i\in A_i}\pi_i^{\mu}(a_i)\frac{\pi_i^{\mu}(a_i)}{\pi_i^t(a_i)}\right) \nonumber\\
    &\leq - \mu\sum_{i=1}^2\xi_i\sum_{a_i\in A_i}\pi_i^{\mu}(a_i)\ln \left(\frac{\pi_i^{\mu}(a_i)}{\pi_i^t(a_i)}\right) \nonumber\\
    &= - \mu\sum_{i=1}^2\xi_i\mathrm{KL}(\pi_i^{\mu}, \pi_i^t) \leq - \mu\xi\sum_{i=1}^2\mathrm{KL}(\pi_i^{\mu}, \pi_i^t),
\end{align}
where the second inequality follows from $x \geq \ln(1+x)$ for all $x>0$, and the third inequality follows from the concavity of the $\ln(\cdot)$ function and Jensen's inequality for concave functions.
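On the other hand, if $\psi_i(p)=\sum_{a_i\in A_i}p(a_i)\ln p(a_i)$, then $D_{\psi_i}(\pi_i^{\mu}, \pi_i^t)=\mathrm{KL}(\pi_i^{\mu}, \pi_i^t)$ for each $i\in \{1,2\}$, and hence $D_{\psi}(\pi^{\mu}, \pi^t)=\mathrm{KL}(\pi^{\mu}, \pi^t)$.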
From this fact and (\ref{eq:J_dot_KL_main}), we have:
\begin{align*}
    \frac{d}{dt}\mathrm{KL}(\pi^{\mu}, \pi^t) \leq - \mu\xi\mathrm{KL}(\pi^{\mu}, \pi^t).
\end{align*}
This concludes the second statement of the theorem.
\qed

\begin{figure*}[t!]
    \centering
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.1\linewidth]{figs/full_feedback/brps/brps_full_feedback_exploitability.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.1\linewidth]{figs/full_feedback/multiple/multiple_full_feedback_exploitability.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.1\linewidth]{figs/full_feedback/random/random_payoff_size_10_full_feedback_exploitability.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.1\linewidth]{figs/full_feedback/random/random_payoff_size_50_full_feedback_exploitability.pdf}
    \end{minipage}
    \caption{
    Exploitability of $\pi^t$ for M-FTRL, FTRL, and O-FTRL under full-information feedback.
    }
    \label{fig:exploitability_full}
\end{figure*}
\begin{figure*}[t!]
    \centering
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/full_feedback/brps/brps_full_feedback_MFTRL_trajectories_triangle_0.pdf}
        \subcaption{M-FTRL with a fixed reference strategy}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/full_feedback/brps/brps_full_feedback_MFTRL Update Mutation Probabilities_trajectories_triangle_0.pdf}
        \subcaption{M-FTRL with adaptive reference strategies}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/full_feedback/brps/brps_full_feedback_FTRL_trajectories_triangle_0.pdf}
        \subcaption{FTRL}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/full_feedback/brps/brps_full_feedback_OFTRL_trajectories_triangle_0.pdf}
        \subcaption{O-FTRL}
    \end{minipage}
    \caption{
    Trajectories of $\pi^t$ for M-FTRL, FTRL and O-FTRL in BRPS under full-information feedback.
    We set the initial strategy profile to $\pi_i^0=\frac{1}{|A_i|}$ for $i\in \{1, 2\}$.
    The black point represents the equilibrium strategy.
    The blue/red points represent the initial/final points, respectively.
    }
    \label{fig:trajectory_brps_full}
\end{figure*}
\begin{figure}[t!]
    \centering
    \begin{minipage}[t]{0.49\linewidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/full_feedback/multiple/multiple_full_feedback_MFTRL_start_end_points_triangle_0.pdf}
        \subcaption{M-FTRL with a fixed reference strategy}
    \end{minipage}
    \begin{minipage}[t]{0.49\linewidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/full_feedback/multiple/multiple_full_feedback_OFTRL_start_end_points_triangle_0.pdf}
        \subcaption{O-FTRL}
    \end{minipage}
    \caption{
    Initial strategies and final strategies for player $1$ in $100$ instances (M-Eq under full-information feedback).
    The green dashed line represents the set of equilibrium strategies for player $1$.
    The blue/red points represent the initial/final points, respectively.
    }
    \label{fig:start_end_points_multiple_full}
\end{figure}
\begin{figure*}[t!]
    \centering
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.1\linewidth]{figs/bandit_feedback/brps/brps_bandit_feedback_exploitability.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.1\linewidth]{figs/bandit_feedback/multiple/multiple_bandit_feedback_exploitability.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.1\linewidth]{figs/bandit_feedback/random/random_payoff_size_10_bandit_feedback_exploitability.pdf}
    \end{minipage}
    \begin{minipage}[t]{0.24\textwidth}
        \centering
        \includegraphics[width=1.1\linewidth]{figs/bandit_feedback/random/random_payoff_size_50_bandit_feedback_exploitability.pdf}
    \end{minipage}
    \caption{
    Exploitability of $\pi^t$ for M-FTRL, FTRL, and O-FTRL under bandit feedback.
    }
    \label{fig:exploitability_bandit}
\end{figure*}
\begin{figure*}[t!]
    \centering
    \begin{minipage}[t]{0.33\textwidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/bandit_feedback/brps/brps_bandit_feedback_MFTRL_trajectories_triangle_0.pdf}
        \subcaption{M-FTRL with a fixed reference strategy}
    \end{minipage}
    \begin{minipage}[t]{0.33\textwidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/bandit_feedback/brps/brps_bandit_feedback_FTRL_trajectories_triangle_0.pdf}
        \subcaption{FTRL}
    \end{minipage}
    \begin{minipage}[t]{0.33\textwidth}
        \centering
        \includegraphics[width=1.0\linewidth]{figs/bandit_feedback/brps/brps_bandit_feedback_OFTRL_trajectories_triangle_0.pdf}
        \subcaption{O-FTRL}
    \end{minipage}
    \caption{
    Trajectories of $\pi^t$ for M-FTRL, FTRL and O-FTRL in BRPS under bandit feedback.
    We set the initial strategy profile to $\pi_i^0=\frac{1}{|A_i|}$ for $i\in \{1, 2\}$.
    The black point represents the equilibrium strategy.
    The blue/red points represent the initial/final points, respectively.
    }
    \label{fig:trajectory_brps_bandit}
\end{figure*}

\subsection{Proof of Theorem \ref{thm:expoitability_bound}}
\label{sec:proof_thm_exploitability}
From the definition of exploitability, we have:
\begin{align}
\label{eq:exploitability_bound_pi_t}
    &\mathrm{exploit}(\pi^t) = \sum_{i=1}^2 \max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^t} \nonumber\\
    &= \sum_{i=1}^2 \left(\max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}} \right. \nonumber\\
    &\left. ~~~~~~~~~+ \max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^t}- \max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}}\right) \nonumber\\
    &\leq \sum_{i=1}^2 \!\left(\max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}} \!+\! \max_{\tilde{\pi}_i\in \Delta(A_i)} \left(v_i^{\tilde{\pi}_i, \pi_{-i}^t}- v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}}\right)\!\right) \nonumber\\
    &\leq \sum_{i=1}^2 \left(\max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}} \right. \nonumber\\
    &\left. ~~~~~~~~~+ \|\pi_i^{\mu} - \pi_i^t\|_1\max_{\tilde{\pi}_{-i}\in \Delta(A_{-i})} \|q_i^{\pi_i^t, \tilde{\pi}_{-i}}\|_{\infty}\right) \nonumber\\
    &\leq \sum_{i=1}^2 \left(\max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}} + u_{\max}\sqrt{2(\ln 2)\mathrm{KL}(\pi_i^{\mu}, \pi_i^t)}\right),
\end{align}
where the second inequality follows from H\"{o}lder's inequality, and the last inequality follows from Lemma 11.6.1 in \citet{thomas2006elements}.

From Lemma 3.5 of \citet{Bauer:2019}, a stationary point $\pi^{\mu}$ of (\ref{eq:rmd}) satisfies that for all $i\in \{1, 2\}$ and $a_i\in A_i$, $q_i^{\pi^{\mu}}(a_i) - v_i^{\pi^{\mu}} \leq \mu$.
Therefore, the term $\max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}}$ can be bounded as:
\begin{align}
\label{eq:exploitability_bound_pi_mu}
    &\sum_{i=1}^2 \max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}} \!=\! \sum_{i=1}^2 \left(\max_{\tilde{\pi}_i\in \Delta(A_i)} v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}} - v_i^{\pi^{\mu}}\right) \nonumber\\
    &= \sum_{i=1}^2 \left(\max_{a_i\in A_i} q_i^{\pi^{\mu}}(a_i) - v_i^{\pi^{\mu}}\right) \leq 2\mu,
\end{align}
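where the first equality follows from $\sum_{i=1}^2v_i^{\pi^{\mu}}=0$ by the definition of zero-sum games, and the second equality holds because $v_i^{\tilde{\pi}_i, \pi_{-i}^{\mu}}$ is linear in $\tilde{\pi}_i$ and hence is maximized at a pure action.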
By combining (\ref{eq:exploitability_bound_pi_t}), (\ref{eq:exploitability_bound_pi_mu}), and Corollary \ref{cor:KL_bound}, we have:
\begin{align*}
    &\mathrm{exploit}(\pi^t)  \leq 2\mu + u_{\max}\sum_{i=1}^2\sqrt{2(\ln 2) \mathrm{KL}(\pi_i^{\mu}, \pi_i^t)} \\
    &\leq 2\mu + u_{\max}\sqrt{2(\ln 2) }\sqrt{2\sum_{i=1}^2\mathrm{KL}(\pi_i^{\mu}, \pi_i^t)} \\
    &\leq 2\mu + 2\sqrt{\ln 2}u_{\max}\sqrt{\mathrm{KL}(\pi^{\mu}, \pi^0)\exp\left(-\mu \xi t\right)} \\
    &= 2\mu + 2u_{\max}\sqrt{(\ln 2)\mathrm{KL}(\pi^{\mu}, \pi^0)}\exp\left(-\frac{\mu \xi}{2} t\right),
\end{align*}
where the second inequality follows from $\sqrt{a} + \sqrt{b}\leq \sqrt{2(a+b)}$ for $a,b>0$.
This concludes the statement.
\qed

\section{Experiments}
In this section, we empirically evaluate M-FTRL.
We compare its performance to those of FTRL and O-FTRL.

We conduct experiments on the following games: biased rock-paper-scissors (BRPS), a normal-form game with multiple Nash equilibria (M-Eq), and random utility games.
BRPS and M-Eq have the following utility matrices, respectively:
\begin{table}[h!]
    \centering
    \begin{minipage}[t]{0.23\textwidth}
    \centering
    \caption{Biased RPS utilities}
    \label{tab:biased-rps}
    \begin{tabular}{cccc}
    \hline
      & R  & P  & S  \\ \hline
    R & $0$  & $-0.1$  & $0.3$ \\
    P & $0.1$ & $0$  & $-0.1$ \\
    S & $-0.3$  & $0.1$ & $0$  \\ \hline
    \end{tabular}
    \end{minipage}
    \begin{minipage}[t]{0.23\textwidth}
    \centering
    \caption{M-Eq utilities}
    \label{tab:m-eq}
    \begin{tabular}{cccc}
    \hline
      & $y_1$  & $y_2$  \\ \hline
    $x_1$ & $0.1$ & $-0.2$ \\
    $x_2$ & $-0.4$ & $0.3$ \\
    $x_3$ & $-1$  & $0.9$  \\ \hline
    \end{tabular}
    \end{minipage}
\end{table}

The set of Nash equilibria in M-Eq is given by:
\begin{align*}
    &\Pi^{\ast}_1 \!=\! \left\{ x\in \Delta^3 |~   x_2 = -\frac{22}{12}x_1 + \frac{19}{12}; ~x_3 = \frac{10}{12}x_1 - \frac{7}{12}   \right\}, \\
    &\Pi^{\ast}_2 = \left\{\left(\frac{1}{2}, \frac{1}{2}\right) \right\}.
\end{align*}
For random utility games, we generate each component in a utility matrix uniformly at random in $[0, 1]$.
We consider random utility games with action sizes $|A_1|=|A_2|=10$ and $|A_1|=|A_2|=50$.
For each game, we average the results for $100$ instances.
We generate the initial strategy profile $\pi^0$ uniformly at random in $\prod_{i=1}^2\Delta^{\circ}(A_i)$ for each instance.
We use the entropy regularizer $\psi_i(p)=\sum_{a_i\in A_i}p(a_i)\ln p(a_i)$ in all experiments.

\subsection{Full-Information Feedback}
First, we provide the results under full-information feedback.
In these experiments, we analyze the performance of M-FTRL with a fixed reference strategy $c_i=\left(\frac{1}{|A_i|}\right)_{a_i\in A_i}$ and one with adaptive reference strategies (Algorithm \ref{alg:m-ftrl}).
We set the learning rate to $\eta=10^{-1}$ for all algorithms, and set the mutation parameter to $\mu=10^{-2}$ for M-FTRL.
For M-FTRL with adaptive reference strategies, we set $N=4{,}000$ in BRPS and M-Eq, and $N=20{,}000$ in the random utility games.

Figure \ref{fig:exploitability_full} shows the average exploitability of $\pi^t$ updated by each algorithm.
We find that the exploitability of M-FTRL converges to a constant value faster than FTRL and O-FTRL.
Furthermore, by adapting the reference strategy, the exploitability of M-FTRL's strategy profile quickly converges to $0$.
We provide additional experimental results with varying mutation parameters $\mu\in \{10^{-3}, 5\times 10^{-3}, 10^{-2}, 10^{-1}, 1\}$ in Appendix \ref{sec:sensitivity_analysis_mu}.

Next, we compare the trajectories of strategies updated by each algorithm.
Figure \ref{fig:trajectory_brps_full} shows the trajectories of $\pi^t$ updated by each algorithm from an instance of BRPS.
Note that in this figure, we set the initial strategy to $\pi_i^0=\frac{1}{|A_i|}$ for $i\in \{1, 2\}$.
We can observe that FTRL's strategies cycle around the Nash equilibrium strategy, and O-FTRL's strategies gradually approach the Nash equilibrium strategy.
Unlike these methods, M-FTRL's strategies quickly approach the stationary point.
Figure \ref{fig:start_end_points_multiple_full} shows the initial strategies and final strategies for player $1$ in M-Eq.
We find that M-FTRL's strategy profile converges to a unique stationary point regardless of the setting of the initial point, while O-FTRL's strategy profile converges to a different Nash equilibrium for each instance.
This result highlights the uniqueness property of the stationary point from Theorem \ref{thm:bregman_div}.

\subsection{Bandit Feedback}
Next, we provide the results under bandit feedback.
We set the learning rate to $\eta=10^{-4}$ for all algorithms, and set the mutation parameter to $\mu=10^{-2}$ for M-FTRL.
In the bandit feedback experiments, we focus on the performance of M-FTRL with a fixed reference strategy $c_i=\left(\frac{1}{|A_i|}\right)_{a_i\in A_i}$.
In FTRL and O-FTRL algorithms, we use the unbiased estimator by \citet{lattimore2020bandit} as the estimator of $q_i^{\pi^t}$ so that the estimator takes values in $(-\infty, u_{\max}]$ for computational stability.
We provide further details on the estimator in Appendix \ref{sec:appendix_lattimore_estimator}.
Note that M-FTRL does not need this estimator; for M-FTRL, it suffices to use the importance-weighted estimator in (\ref{eq:estimator}).

Figure \ref{fig:exploitability_bandit} shows the average exploitability of $\pi^t$ updated by each algorithm, and Figure \ref{fig:trajectory_brps_bandit} shows the trajectories of $\pi^t$ updated by each algorithm from an instance of BRPS.
We can see that unlike the experimental results under full-information feedback, O-FTRL's trajectory does not converge to a Nash equilibrium.
On the other hand, M-FTRL's trajectory converges near a stationary point.
These results suggest that M-FTRL has the last-iterate convergence property even under bandit feedback.

\section{Conclusion}
In this study, we proposed M-FTRL, a simple FTRL algorithm that incorporates mutation
for last-iterate convergence to a stationary point.
We proved that the M-FTRL dynamics induced by the entropy regularizer is equivalent to RMD.
Besides, we showed that the trajectory of M-FTRL with general regularization functions converges to a stationary point of the RMD. 
The numerical simulation reveals that M-FTRL outperforms the state-of-the-art FTRL and O-FTRL in a variety of two-player zero-sum games.
In future studies, we will extend the M-FTRL algorithm and its theoretical analysis to more complex games, such as extensive-form games and Markov games.

\begin{acknowledgements}
Atsushi Iwasaki was supported by JSPS KAKENHI Grant Numbers JP21H04890 and JP20K20752.
\end{acknowledgements}

\bibliography{abe_504}

\end{document}
