\section{Proof of Theorem \ref{T2}}\label{A2}
The proof is a combination of three parts:

\begin{itemize}
    \item[1] The KL-divergence is a decreasing function throughout the composition of Extra-MWU in a period, starting from an arbitrary initial point. (Proposition \ref{decreasing_KL})
    \item[2] Discrete-time LaSalle invariance principle, which provides a sufficient condition for a discrete dynamical system to converge. (Proposition \ref{DLIP})
    \item[3] A characterization of attractors of periodic dynamical system. (Proposition \ref{attrat})
\end{itemize}


	%Let $h_1 : \Delta_m^{\circ} \to \BR$, $\ h_1(\tb{x}_1) = \sum^m_{i =1}\tb{x}_{1,i} \ln(\tb{x}_{1,i})$, and $h_2 : \Delta_n^{\circ} \to \BR$, $\ h_2(\tb{x}_2) = \sum^n_{j=1}\textbf{x}_{2,j}\ln(\textbf{x}_{2,j})$ be the negative entropy function, and
	%\begin{align*}
	%	h_1^* : & \BR^{m} \to \Delta_m^{\circ}\\
	%	& \tb{y}_1 \to \max_{\tb{x}_1 \in \Delta_m^{\circ}} \{ \langle \tb{y}_1,\tb{x}_1\rangle - h_1(\tb{x}_1) \},\\
	%	h_2^*:&\BR^{n} \to \Delta_n^{\circ}\\
	%	& \tb{y}_2 \to \max_{\tb{x}_2 \in \Delta_n^{\circ}} \{ \langle \tb{y}_2,\tb{x}_2\rangle - h_2(\tb{x}_2) \}
	%\end{align*}

Let $h:\ \Delta_m \to \BR$ be the negative entropy function, i.e., $h(\tb{x})=\sum_{i=1}^{m}\textbf{x}_i\ln \textbf{x}_i$, and let $h^*(\cdot)$ be the convex conjugate of $h(\cdot)$, i.e.,
	\begin{align*}
		h^*:\ \BR^m &\to \BR \\
		\textbf{y} &\to  \max_{\tb{x} \in \Delta_m} \{ \langle \tb{y},\tb{x}\rangle - h(\tb{x}) \}.
	\end{align*}
Note that we have $\nabla h(\tb{x}) = \left(1+\ln (\tb{x}_i)\right)^m_{i=1}$.
\begin{lem} [Page 148 in \cite{shalev2012online}] We have
		\begin{align*}
			h^*(\tb{y}) =  \ln \left( { \sum^m_{s=1}e^{\tb{y}_{s}}} \right),\
			\nabla h^*(\tb{y}) = \left( \frac{e^{\tb{y}_{i}} }{ \sum^m_{s=1}e^{\tb{y}_{s}}} \right)^m_{i=1}.
		\end{align*}
\end{lem}

\begin{defn}
We define the equivalence relation ``$\sim$'' between two vectors in $\BR^{m}$ as follows: for two vectors $\tb{y}$ and $\tb{y}' \in \BR^m$,
\begin{align*}
     \tb{y} \sim \tb{y}' \iff  
     & \exists \ \tb{c} = (c,...,c) \in \BR^m, \\ &\text{such that}\
    \tb{y} - \tb{y}' = \tb{c}.
\end{align*}
We denote the quotient space of $\BR^m$ modulo the above equivalence relation by $\BR^m / \sim$, and use $[\tb{y}]$ to represent the equivalence class that $\tb{y}$ lies in.
\end{defn}

\begin{rem}
    With the equivalence relation defined above, the function $\nabla h^*$ can be thought of as a function defined on $\BR^m / \sim$, i.e.,
    \begin{align*}
        \nabla h^* :  \BR^m / \sim &\to \Delta_m \\
          [\tb{y}] &\to  \left( \frac{e^{\tb{y}_{i}} }{ \sum^m_{s=1}e^{\tb{y}_{s}}} \right)^m_{i=1}.
    \end{align*}

    Moreover, the function $\nabla h (\cdot)$ can be thought of as a function taking values in $\BR^m / \sim$, i.e.,
    \begin{align*}
        \nabla h :  \Delta_m &\to \BR^m / \sim \\
         \tb{x} &\to [ \left(1+\ln (\tb{x}_i)\right)^m_{i=1}  ].
    \end{align*}
\end{rem}
In the following, for a vector $\tb{y} \in \BR^m$, we will use $[\tb{y}]$ to represent the equivalence class in $\BR^m / \sim$ that $\tb{y}$ lies in.
\begin{lem}\label{inverse}
$\nabla h^*(\cdot)$ and $\nabla h(\cdot)$ are inverse functions to each other, i.e., we have both
$
\nabla h^* \circ \nabla h : \Delta_m \to \Delta_m 
$
and
$
\nabla h \circ \nabla h^* : \BR^m / \sim \to \BR^m / \sim 
$
are identity maps.
\end{lem}

\begin{proof} It is straightforward to verify that
\begin{align*}
    [\tb{y}] \stackrel{ \nabla h^*}{\longrightarrow} \left( \frac{e^{\tb{y}_i}}{ \sum^m_{s=1} e^{\tb{y}_s} } \right)^m_{i=1} \stackrel{ \nabla h}{\longrightarrow} \left[ \left(1+ \ln \left(\frac{e^{\tb{y}_i}}{ \sum^m_{s=1} e^{\tb{y}_s}} \right) \right)^m_{i=1}  \right],
\end{align*}
and note that $\left[ \left(1+ \ln \left(\frac{e^{\tb{y}_i}}{ \sum^m_{s=1} e^{\tb{y}_s}} \right) \right)^m_{i=1}  \right] = [\tb{y}]$, since the two vectors differ by the same constant $1 - \ln \left( \sum^m_{s=1} e^{\tb{y}_s} \right)$ in every coordinate; thus $\nabla h \circ \nabla h^* = \text{Id}$.

Similarly, one can verify that $\nabla h^* \circ \nabla h = \text{Id}$.
\end{proof}








\begin{lem}\label{Fenchel}
    $h : \Delta_m^{\circ} \to \BR$ is $1$-strongly convex, and $h^{*} (\cdot)$ has $1$-Lipschitz continuous gradients.
\end{lem}
\begin{proof}
    It can be computed that for $i,j\in[m]$ with $i \neq j$,
	\begin{align*}
		\frac{\partial h}{\partial \tb{x}_i}=\ln(\tb{x}_i)+1,\ 
		\frac{\partial^2 h}{\partial \tb{x}_i^2}=\frac{1}{\tb{x}_i},\ 
		\frac{\partial^2 h}{\partial \tb{x}_i \partial \tb{x}_j}=0.
	\end{align*}
	From $\tb{x}\in \Delta_m^{\circ}$, we have that $ \frac{1}{\tb{x}_i}\ge 1$.
	So the Hessian of $h$ is a diagonal matrix with each diagonal element at least $1$, and hence $h$ is $1$-strongly convex. The statement about $h^*(\cdot)$ follows from the standard Fenchel duality between strong convexity of a function and Lipschitz continuity of the gradient of its conjugate, for example, see Theorem 1 in \cite{zhou2018fenchel}.
\end{proof}


%\begin{defn}
%We define the equivalence relation $\sim$ between two vectors in $\BR^{m}$ as follows : For two vectors  $\tb{y}$ and $\tb{y}' \in \BR^m$,
%\begin{align*}
%     \tb{y} \sim \tb{y}' \iff  
%     & \exists \ \tb{c} = (c,...,c) \in \BR^m, \\ &\text{such that}\
%    \tb{y} - \tb{y}' = \tb{c}.
%\end{align*}
%We denote the space generated by $\BR^m$ module the above equivalence relation as $\BR^m / \sim$, and use $[\tb{y}]$ to represent the equivalence class that $\tb{y}$ lies in. 
%\end{defn}

%We use function $g$ to represent the equivalence relation, that is
%\begin{align*}
%    g(\tb{x})=g(\tb{x}')\iff \tb{x}\sim \tb{x}'.
%\end{align*}

%Denote function $l: \BR^m/\sim\to \BR^m$ assigns $\tb{z} \in \BR^m/\sim$ to an arbitrary element in the set $g^{-1}(\tb{z})$.


The vanilla Multiplicative Weights Updates algorithm (MWU) for one player can be written as the following function :
\begin{align*}
		\MWU :   \Delta_{m} \times \BR^{m}/\sim &\to \Delta_{m}  \\
		 (\tb{x},[\tb{y}]) & \to  \left(\frac{\tb{x}_{i} e^{ \tb{y}_{i}}}{ \sum^m_{s=1} \tb{x}_{s} e^{\tb{y}_{s}} }   \right)^m_{i=1}.
\end{align*}


\begin{defn} We define a function $\phi : \Delta_m \times \BR^m / \sim \to \BR^m / \sim$ as follows:
\begin{align*}
    \phi : \Delta_m \times \BR^m / \sim &\to \BR^m / \sim \\
     (\tb{x},[\tb{y}]) &\to \left[ \nabla h(\tb{x}) + \tb{y} \right].
\end{align*}
    
\end{defn}



%\ping{Not complete lemma}
%\begin{lem}\label{lem: hinv} Denote $\nabla \tilde{h}=g\circ \nabla h : \Delta_m \to \BR^m/\sim$ is a homeomorphism between these two topology space, and its inverse map is $\nabla \tilde{h}^*= \nabla h^*\circ l$.
%\end{lem}
%\begin{proof}
%		We first illustrate that for any $\tb{z}\in \BR^m/\sim $, $(g\circ \nabla h\circ \nabla h^*\circ l)(\tb{z})=\tb{z} $.
%		Suppose $ l(\tb{z})=\tb{y}$, which equals to $g(\tb{y})=\tb{z}$. 
%		Then 
%		\begin{align*}
%			&\nabla h^*(\tb{y})= \left( \frac{e^{\tb{y}_i} }{ \sum^m_{s=1}e^{\tb{y}_s}} \right)^m_{i=1}\\
%			\rightarrow & \nabla h\circ \nabla h^*(\tb{y})=\left( {\tb{y}_i} -\ln( \sum^m_{s=1}e^{\tb{y}_s})\right)^m_{i=1}\\
%			\rightarrow &g(\left( {\tb{y}_i} -\ln( \sum^m_{s=1}e^{\tb{y}_s})\right)^m_{i=1})=g(\tb{y})=\tb{z}\\
%			\rightarrow & (g\circ \nabla h\circ \nabla h^*\circ l)(\tb{z})=\tb{z}.
%		\end{align*}
%		Then we prove the other side. For any $\tb{x}\in \Delta_m\times \Delta^n$, $\nabla h(\tb{x})=\left(\ln(\tb{x}_i)+1\right)_{i=1}^n$. Suppose $g(\left(\ln(\tb{x}_i)+1\right)_{i=1}^n)=\tb{z}$. Then $l(\tb{z})=\left(\ln(\tb{x}_i)+c\right)_{i=1}^n$, for some $c\in \BR$. Hence, 
%		\begin{align*}
%			& \nabla h^*(\left(\ln(\tb{x}_i)+c\right)_{i=1}^n)\\
%			=& \left( \frac{e^{\ln(\tb{x}_i)+c} }{ \sum^m_{s=1}e^{\ln(\tb{x}_s)+c}} \right)^m_{i=1}\\
%			=& \left( \frac{e^{\ln(\tb{x}_i)} }{ \sum^m_{s=1}e^{\ln(\tb{x}_s)}} \right)^m_{i=1}\\
%			=& \left( \frac{\tb{x}_i }{ \sum^m_{s=1}\tb{x}_s} \right)^m_{i=1}\\
%			=&\tb{x},
%		\end{align*}
%		where the last equality comes from $\tb{x}\in \Delta_m$. The below equalities imply that for any $\tb{x}\in \Delta_m$, $( \nabla h^*\circ l\circ g\circ \nabla h)(\tb{x})=\tb{x}$.
%		Then we complete the proof of the lemma.
%\end{proof}


\begin{prop}\label{prop: commutative}
The following diagram is commutative :
		
\begin{align*}
\xymatrix{
& \Delta_m \times \BR^m / \sim \ar[dl]_{\MWU} \ar[dr]^{\phi (\cdot)} & \\
 \Delta_m   \ar@/^/[rr]^{\nabla h (\cdot) } && \BR^m / \sim \ar@/^/[ll]^{\nabla h^*(\cdot)}
}
\end{align*}
\end{prop}

\begin{proof}
    For any $(\tb{x},[\tb{y}])\in  \Delta_{m} \times \BR^{m}/\sim$, our goal is to prove that
		\begin{enumerate}
		  \item  $\nabla h  \circ \MWU (\tb{x},[\tb{y}]) = \phi(\tb{x},[\tb{y}])$,
            \item  $\nabla h^*  \circ \phi (\tb{x},[\tb{y}]) = \MWU(\tb{x},[\tb{y}])$.
		\end{enumerate}
		We start by proving the first item. A direct calculation gives
  \begin{align*}
      \nabla h  \circ \MWU (\tb{x},[\tb{y}]) & = \nabla h \left( \left(\frac{\tb{x}_{i} e^{ \tb{y}_{i}}}{ \sum^m_{s=1} \tb{x}_{s} e^{\tb{y}_{s}} }   \right)^m_{i=1}   \right)\\
      & = \left[\left(    1 + \ln \left( \frac{\tb{x}_i e^{\tb{y}_i} }{\sum^m_{s=1}\tb{x}_s e^{\tb{y}_s}} \right)    \right)^m_{i=1}\right] \\
      & = \left[\left(1 + \tb{y}_i + \ln(\tb{x}_i) -  \ln \left( \sum^m_{s=1}\tb{x}_s e^{\tb{y}_s} \right) \right)^m_{i =1} \right] \\
      & = \left[\left( \tb{y}_i + \ln(\tb{x}_i)  \right)^m_{i =1} \right],
  \end{align*}
and
\begin{align*}
    \phi(\tb{x},[\tb{y}]) = \left[ \left(1 + \ln (\tb{x}_i) + \tb{y}_i \right)^m_{i=1}\right].
\end{align*}
Since, as equivalence classes, we have $\left[\left( \tb{y}_i + \ln(\tb{x}_i)  \right)^m_{i =1} \right] = \left[ \left(1 + \ln (\tb{x}_i) + \tb{y}_i \right)^m_{i=1}\right]$, this proves the first item.

For the second item, a direct calculation gives

\begin{align*}
    \nabla h^*  \circ \phi (\tb{x},[\tb{y}]) & = \nabla h^* \left( \left[ (1+\ln(\tb{x}_i) +\tb{y}_i)^m_{i=1}\right] \right) \\
    & =  \nabla h^* \left( \left[ (\ln(\tb{x}_i) +\tb{y}_i)^m_{i=1}\right] \right) \\
    & = \left( \frac{e^{\ln (\tb{x}_i) + \tb{y}_i}}{\sum^m_{s=1} e^{\ln (\tb{x}_s) + \tb{y}_s} } \right)^m_{i=1}\\
    & = \left(\frac{\tb{x}_ie^{\tb{y}_i}}{ \sum^m_{s=1} \tb{x}_se^{\tb{y}_s}}\right)^m_{i=1}\\
    & = \MWU(\tb{x},[\tb{y}]).
\end{align*}
This proves the second item.

\end{proof}

\begin{lem}\label{lem: h-y} For arbitrary $\tb p \in \Delta_m$, if $\tb{y} \in \BR^m$ and $\tb{x} =\nabla h^*([\tb{y}]) $, then
\begin{align*}
        \langle \nabla h(\tb{x}) - \tb{y}, \tb{x} - \tb p \rangle = 0.
\end{align*}
\end{lem}

\begin{proof}
    From Lemma \ref{inverse}, we have
		\begin{align*}
			\nabla h(\nabla h^*([\tb {y}]))=\tb{y}+\tb{c},
		\end{align*}
		where $\tb{c}$ is a constant vector, i.e., all of its coordinates are equal. Therefore $\langle \nabla h(\tb{x}) - \tb{y}, \tb{x} - \tb p \rangle $ can be rewritten as
		\begin{align*}
			&\langle \tb c, \tb{x} - \tb p \rangle \\
			=&\langle \tb c, \tb{x} \rangle -\langle \tb c,  \tb p \rangle\\
			=&0.
		\end{align*}
		The second equality holds because $\textbf{x}$ and $\textbf{p}$ both belong to the simplex, i.e., $\sum \tb{x}_i = \sum \tb{p}_i = 1$, so that $\langle \tb c, \tb{x} \rangle = \langle \tb c, \tb p \rangle$.
\end{proof}

\begin{lem}\label{lem: MWU continuous}We have
\begin{align*}
       \lVert \MWU(\tb{x},[\tb{y}_1]) -  \MWU(\tb{x},[\tb{y}_2]) \rVert \le \lVert \tb{y}_1 - \tb{y}_2 \rVert.
\end{align*}
\end{lem}

\begin{proof} From Proposition \ref{prop: commutative}, we have
\begin{align*}
     \lVert \MWU(\tb{x},[\tb{y}_1]) -  \MWU(\tb{x},[\tb{y}_2]) \rVert & = 
     \lVert  \nabla h^* \left( \phi(\tb{x},[\tb{y}_1]) \right) -   \nabla h^* \left( \phi(\tb{x},[\tb{y}_2]) \right) \rVert \\
     & \le  \lVert  \phi(\tb{x},[\tb{y}_1])  -   \phi(\tb{x},[\tb{y}_2])  \rVert \\
     & =  \lVert  \tb{y}_1  -   \tb{y}_2  \rVert,
\end{align*}
  where the inequality follows from the fact that $h^*$ has a $1$-Lipschitz continuous gradient (see Lemma \ref{Fenchel}), and the last equality is from the definition of $\phi$.
\end{proof}

\begin{lem}[Three-points identity \cite{chen1993convergence}]\label{lem: kl x x'}
For any $\tb{p}, \tb{x}, \tb{x}' \in \Delta_m$, the following equality holds
\begin{align*}
    \KL(\tb{p},\tb{x}') = \KL(\tb{p},\tb{x}) + \KL(\tb{x},\tb{x}') +
    \langle \left( \ln(\tb{x}'_i/\tb{x}_i) \right)^{m}_{i=1}, (\tb{x}_i - \tb{p}_i)^{m}_{i=1} \rangle.
\end{align*}
    
\end{lem}

\begin{proof}
    By definition, it holds that
		\begin{align*}
			&\KL(\tb{p},\tb{x}')=h(\textbf{p})-h(\textbf{x}')- \langle \nabla h(\textbf{x}'),\textbf{p}-\textbf{x}'\rangle,\\
			&\KL(\tb{p},\tb{x})=h(\textbf{p})-h(\textbf{x}) - \langle \nabla h(\textbf{x}) ,\textbf{p}-\textbf{x} \rangle,\\
			&\KL(\tb{x},\tb{x}')=h(\textbf{x})-h(\textbf{x}')- \langle \nabla h(\textbf{x}'),\textbf{x}-\textbf{x}'\rangle.
		\end{align*}
		Then computing $\KL(\tb{p},\tb{x}') - \KL(\tb{p},\tb{x}) - \KL(\tb{x},\tb{x}')$ and rearranging gives
		\begin{align*}
			\KL(\tb{p},\tb{x}') = \KL(\tb{p},\tb{x}) + \KL(\tb{x},\tb{x}') +
			\langle  \nabla h(\textbf{x}')-\nabla h(\textbf{x}), \textbf{x}-\textbf{p} \rangle.
		\end{align*}
		By replacing $\nabla h(\textbf{x})$ with $\left(\ln \textbf{x}_i+1\right)_{i=1}^{m}$, the result can be concluded.
\end{proof}


\begin{lem}\label{lem: kl x p} Let $\tb{x}^{\dagger} = \MWU(\tb{x},\tb{y})$, then
\begin{align*}
    \KL(\tb{p},\tb{x}^{\dagger}) = \KL(\tb{p},\tb{x}) - \KL(\tb{x}^{\dagger} ,\tb{x} ) + \langle \tb{y},\tb{x}^{\dagger} - \tb{p} \rangle.
\end{align*}
\end{lem}

\begin{proof}
    In Lemma \ref{lem: kl x x'}, replacing $\tb{x}$ by $\tb{x}^{\dagger}$ and $\tb{x}'$ by $\tb{x}$ and rearranging, we obtain
    \begin{align*}
        \KL(\tb{p},\tb{x}^{\dagger}) & =  \KL(\tb{p},\tb{x}) - \KL(\tb{x}^{\dagger} ,\tb{x} ) + \langle  \nabla h(\tb{x}^{\dagger}) -  \nabla h(\tb{x}) ,\tb{x}^{\dagger} - \tb{p} \rangle \\
        & = \KL(\tb{p},\tb{x}) - \KL(\tb{x}^{\dagger} ,\tb{x} ) + \langle  \phi(\tb{x},[\tb{y}]) - \nabla h(\tb{x}) ,\tb{x}^{\dagger} - \tb{p} \rangle \\
        & =  \KL(\tb{p},\tb{x}) - \KL(\tb{x}^{\dagger} ,\tb{x} ) + \langle  \tb{y} ,\tb{x}^{\dagger} - \tb{p} \rangle,
    \end{align*}
where the second equality comes from Proposition \ref{prop: commutative} and the last equality is from the definition of $\phi$ and the fact that for any two vectors $\tb{y},\tb{y}' \in [\tb{y}]$, we have
$\langle \tb{y}, \tb{x}^{\dagger} - \tb{p} \rangle = \langle \tb{y}', \tb{x}^{\dagger} - \tb{p} \rangle$, because the coordinates of $\tb{x}^{\dagger} - \tb{p}$ sum to zero.
\end{proof}

Let $\CF_{i} : \Delta_m \times \Delta_n \to \Delta_m \times \Delta_n$ be the (Extra-MWU) algorithm with payoff matrix $A_i$. For any initial condition $(\tb{x}_0,\tb{y}_0)$ and any $i \in [\CT]$, the following proposition shows that the KL-divergence decreases after one application of the composition
\begin{align*}
    \tilde{\CF}_i = \CF_{i+\CT-1} \circ \CF_{i+\CT-2} \circ ...\circ \CF_{i+1} \circ \CF_{i}.
\end{align*}

	\begin{prop}\label{decreasing_KL} For any $i \in [\CT]$ and $n$, if the step size $\eta$ in (Extra-MWU) satisfies $\eta \cdot \max_{t \in [\CT]}\lVert A_t \rVert < 1$, then we have
		\begin{align*}
			\KL\left( (\tb{x}_1^*,\tb{x}_2^*), \tilde{\CF}_i (\tb{x}_1^{n\CT+i},\tb{x}_2^{n\CT+i})  \right) \le \KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i},\tb{x}_2^{n\CT+i})  \right),
		\end{align*}
		and equality holds if and only if $ (\tb{x}_1^{n\CT+i},\tb{x}_2^{n\CT+i})=(\tb{x}_1^*,\tb{x}_2^*)$.
	\end{prop}
	
	\begin{proof}
		In fact, from $\tilde{\CF}_i (\tb{x}_1^{n\CT+i},\tb{x}_2^{n\CT+i})= (\tb{x}_1^{(n+1)\CT+i},\tb{x}_2^{(n+1)\CT+i})$, it holds that
		\begin{align*}
			&\KL\left( (\tb{x}_1^*,\tb{x}_2^*), \tilde{\CF}_i (\tb{x}_1^{n\CT+i},\tb{x}_2^{n\CT+i})  \right) - \KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i},\tb{x}_2^{n\CT+i})  \right)\\
			=&\KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{(n+1)\CT+i},\tb{x}_2^{(n+1)\CT+i}) \right) - \KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i},\tb{x}_2^{n\CT+i})  \right)\\
			=&\sum_{j=0}^{\CT-1} \left( \KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i+j+1},\tb{x}_2^{n\CT+i+j+1}) \right) - \KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i+j},\tb{x}_2^{n\CT+i+j})  \right) \right).
		\end{align*}

    In the following we will prove for any $j \in [\CT]$, we have
    \begin{align*}
        \KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i+j+1},\tb{x}_2^{n\CT+i+j+1}) \right) - \KL \left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i+j},\tb{x}_2^{n\CT+i+j})\right) < 0,
    \end{align*}
    which implies Proposition \ref{decreasing_KL}.
		
  In following, for a fixed $j \in [\CT]$, we use $\textbf{x} $ to represent $(\textbf{x}_1^{n\CT+i+j},\textbf{x}_2^{n\CT+i+j})$, $\textbf{x}^\dagger$ to represent $(\textbf{x}_1^{n\CT+i+j+\frac{1}{2}},\textbf{x}_2^{n\CT+i+j+\frac{1}{2}})$, and $\textbf{x}^\ddagger$ to represent $( \textbf{x}_1^{n\CT+i+j+1},\textbf{x}_2^{n\CT+i+j+1})$. Similarly, we use $\textbf{y}$  to represent $(\textbf{y}_1^{n\CT+i+j},\textbf{y}_2^{n\CT+i+j})$, 
		$\textbf{y}^\dagger$  to represent $(\textbf{y}_1^{n\CT+i+j+\frac{1}{2}},\textbf{y}_2^{n\CT+i+j+\frac{1}{2}})$.
		
		By the definition of (Extra-$\MWU$), for each player $k \in \{1,2\}$ we have
		\begin{align*}
            \tb{x}_k^{n\CT+i+j+\frac{1}{2}}&=\MWU(\tb{x}_k^{n\CT+i+j},\tb{y}_k^{n\CT+i+j} ),\\
            \tb{x}_k^{n\CT+i+j+1}&=\MWU(\tb{x}_k^{n\CT+i+j},\tb{y}_k^{n\CT+i+j+\frac{1}{2}}),
		\end{align*}
		which leads to
		\begin{align*}
			\textbf{x}_k^\dagger&=\MWU(\textbf{x}_k,\textbf{y}_k),\\
		\textbf{x}_k^\ddagger&=\MWU(\textbf{x}_k,\textbf{y}^\dagger_k).
		\end{align*}
%		Let $\textbf{x}^\ddagger=\MWU(\textbf{x}^\dagger,\textbf{y}^\dagger)$. 
		Applying Lemma~\ref{lem: kl x p} to $\textbf{x}^\ddagger=\MWU(\textbf{x},\textbf{y}^\dagger)$, i.e., replacing $\textbf{x}^\dagger$ with $\textbf{x}^\ddagger$, $\textbf{y}$ with $\textbf{y}^\dagger$ and $\textbf{p}$ with $\textbf{x}^*$, we have
		\begin{align*}
			\KL(\textbf{x}^*,\textbf{x}^\ddagger)-\KL(\textbf{x}^*,\textbf{x})=-\KL(\textbf{x}^\ddagger,\textbf{x})+\langle \textbf{y}^\dagger,\textbf{x}^\ddagger-\textbf{x}^* \rangle.
		\end{align*}

		Taking $\textbf{p}=\textbf{x}^\ddagger$ in Lemma~\ref{lem: kl x p} (applied to $\textbf{x}^\dagger=\MWU(\textbf{x},\textbf{y})$) and rearranging, we have
		\begin{align*}
\KL(\textbf{x}^\ddagger,\textbf{x})=\KL(\textbf{x}^\ddagger,\textbf{x}^\dagger)+\KL(\textbf{x}^\dagger,\textbf{x})-\langle \textbf{y},\textbf{x}^\dagger-\textbf{x}^\ddagger\rangle.
		\end{align*}
		Combining the above two equalities, it holds that
		\begin{align*}
			&\KL(\textbf{x}^*,\textbf{x}^\ddagger)-\KL(\textbf{x}^*,\textbf{x})\\
			=&-\KL(\textbf{x}^\ddagger,\textbf{x}^\dagger)-\KL(\textbf{x}^\dagger,\textbf{x})+\langle \textbf{y}^\dagger,\textbf{x}^\ddagger-\textbf{x}^* \rangle+\langle \textbf{y},\textbf{x}^\dagger-\textbf{x}^\ddagger\rangle\\
			=&-\KL(\textbf{x}^\ddagger,\textbf{x}^\dagger)-\KL(\textbf{x}^\dagger,\textbf{x})+\langle \textbf{y}^\dagger,\textbf{x}^\dagger-\textbf{x}^* \rangle+\langle \textbf{y}^\dagger-\textbf{y},\textbf{x}^\ddagger-\textbf{x}^\dagger\rangle\\
			\le & -\frac{1}{2}\norm{\textbf{x}^\ddagger-\textbf{x}^\dagger}^2-\frac{1}{2}\norm{\textbf{x}^\dagger-\textbf{x}}^2+\langle \textbf{y}^\dagger,\textbf{x}^\dagger-\textbf{x}^* \rangle+\frac{1}{2}\norm{\textbf{y}^\dagger-\textbf{y}}^2+\frac{1}{2}\norm{\textbf{x}^\ddagger-\textbf{x}^\dagger}^2\\
			=&\frac{1}{2}\norm{\textbf{y}^\dagger-\textbf{y}}^2-\frac{1}{2}\norm{\textbf{x}^\dagger-\textbf{x}}^2+\langle \textbf{y}^\dagger,\textbf{x}^\dagger-\textbf{x}^* \rangle.
		\end{align*}
        Next, we estimate $\norm{\textbf{y}^\dagger-\textbf{y}}^2$
 and $\langle \textbf{y}^\dagger,\textbf{x}^\dagger-\textbf{x}^* \rangle$.
 Recall the definition of $\textbf{y}$ and $\textbf{y}^\dagger$ : 
		\begin{align*}
			\textbf{y}&=(\textbf{y}_1^{n\CT+i+j},\textbf{y}_2^{n\CT+i+j})\\
			&=(\eta A_{n\CT+i+j}\textbf{x}_2^{n\CT+i+j},-\eta A_{n\CT+i+j}^\top \textbf{x}_1^{n\CT+i+j})\\
			&=(\eta A_{i+j}\textbf{x}_2^{n\CT+i+j},-\eta A_{i+j}^\top \textbf{x}_1^{n\CT+i+j})\\
			&=\eta\cdot
			\begin{bmatrix}
				&A_{i+j}\\
				-A_{i+j}^\top&
			\end{bmatrix}\textbf{x},
		\end{align*}
		and
		\begin{align*}
			\textbf{y}^\dagger&=(\textbf{y}_1^{n\CT+i+j+\frac{1}{2}},\textbf{y}_2^{n\CT+i+j+\frac{1}{2}})\\
			&=(\eta A_{n\CT+i+j}\textbf{x}_2^{n\CT+i+j+\frac{1}{2}},-\eta A_{n\CT+i+j}^\top \textbf{x}_1^{n\CT+i+j+\frac{1}{2}})\\
			&=(\eta A_{i+j}\textbf{x}_2^{n\CT+i+j+\frac{1}{2}},-\eta A_{i+j}^\top \textbf{x}_1^{n\CT+i+j+\frac{1}{2}})\\
			&=\eta\cdot
			\begin{bmatrix}
				&A_{i+j}\\
				-A_{i+j}^\top&
			\end{bmatrix}\textbf{x}^\dagger.
		\end{align*}
Then we have that
\begin{align*}
	\norm{\textbf{y}^\dagger-\textbf{y}}^2\le \eta^2 \norm{A_{i+j}}^2\cdot \norm{\textbf{x}^\dagger-\textbf{x}}^2,
\end{align*}
and 
\begin{align*}
	&\langle \textbf{y}^\dagger,\textbf{x}^\dagger-\textbf{x}^* \rangle\\
	=&\eta\left(-(\textbf{x}_1^*)^\top A_{i+j} \textbf{x}_2^{n\CT+i+j+\frac{1}{2}}+(\textbf{x}_1^{n\CT+i+j+\frac{1}{2}})^\top A_{i+j} \textbf{x}_2^*\right)\\
	=&\eta\left((\textbf{x}_1^*)^\top A_{i+j}\textbf{x}_2^*-(\textbf{x}_1^*)^\top A_{i+j} \textbf{x}_2^{n\CT+i+j+\frac{1}{2}}\right)+\eta\left((\textbf{x}_1^{n\CT+i+j+\frac{1}{2}})^\top A_{i+j} \textbf{x}_2^*-(\textbf{x}_1^*)^\top A_{i+j}\textbf{x}_2^*\right)\\
	\le&0,
\end{align*}
where the last inequality holds because $(\textbf{x}_1^*,\textbf{x}_2^*)$ is an equilibrium in which $\textbf{x}_1$ is the maximizing player and $\textbf{x}_2$ is the minimizing player, so each of the two bracketed differences is nonpositive.

Let $q = \max_{t \in [\CT]} \lVert A_t \rVert $, then we have
	\begin{align*}
		&\KL(\textbf{x}^*,\textbf{x}^\ddagger)-\KL(\textbf{x}^*,\textbf{x})\\
		\le & \frac{1}{2}(\eta^2q^2-1)\norm{\textbf{x}^\dagger-\textbf{x}}^2+\langle \textbf{y}^\dagger,\textbf{x}^\dagger-\textbf{x}^* \rangle\\
		\le &\frac{1}{2}(\eta^2q^2-1)\norm{\textbf{x}^\dagger-\textbf{x}}^2 < 0.
	\end{align*}
	Then it can be concluded that
	\begin{align*}
		 \KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i+j+1},\tb{x}_2^{n\CT+i+j+1}) \right) - \KL\left( (\tb{x}_1^*,\tb{x}_2^*), (\tb{x}_1^{n\CT+i+j},\tb{x}_2^{n\CT+i+j})  \right)< 0,
	\end{align*}
	which leads to the result.
\end{proof}


\begin{prop}[Discrete-time LaSalle invariance principle ,\ \cite{la1976stability}]\label{DLIP} Let  $G$ be any set in $\BR^m$. Consider a difference equations system defined by a map $T : G \to G$ that is well defined for any $x \in G$ and continuous at any $x \in G$. Suppose there exists a scalar map $V : \bar{G} \to \BR$ satisfying 
\begin{itemize}
    \item $V(x)$ is continuous at any $x \in \bar{G}$,
    \item $V\left(T(x)\right) - V(x) \le 0$ for any $x \in G$.
\end{itemize}
    For any $x_0 \in G$, if the solution to the following initial-value problem
    \begin{align*}
        x(n+1) = T(x(n)), x(0) = x_0,
    \end{align*}
is such that $\{ x(n) \}^{\infty}_{n=1}$ is bounded and $x(n) \in G$ for any $n \in \BN$, then there exists some $c \in \BR$ such that 
$x(n) \to M \cap V^{-1}(c)$ as $n \to \infty$, where 
\begin{align*}
    V^{-1}(c) = \{ x \in \BR^m \lvert V(x) = c \},
\end{align*}
and $M$ is the largest invariant set in $E= \{x \in G \ \lvert \  \ V(T(x))-V(x)=0 \}$.
\end{prop}

%\textcolor{red}{Li: $\tilde{\CF}_i$ doesn't transit $(\tb{x}_0,\tb{y}_0)$, should be start from $(\tb{x}_{i-1},\tb{y}_{i-1})$.}
\begin{prop}
    For any $i \in [\CT]$ and $(\tb{x}^0_1,\tb{x}^0_2) \in \Delta_m \times \Delta_n$, we have
    \begin{align*}
        \lim_{n \to \infty} \tilde{\CF}^n_i \left((\tb{x}^0_1,\tb{x}^0_2)\right) = (\tb{x}_1^*,\tb{x}_2^*).
    \end{align*}
\end{prop}


\begin{proof} In Proposition \ref{DLIP}, we replace the dynamical system $T$ by $\tilde{\CF}_i$ and the scalar map $V$ by the $\KL$-divergence $\KL \left( (\tb{x}_1^*,\tb{x}_2^*), \cdot \right)$.
Note that, since the $\KL$-divergence is defined to be $+\infty$ on the boundary of the simplex, $\KL \left( (\tb{x}_1^*,\tb{x}_2^*), \cdot \right)$ is a continuous function on the simplex.

From Proposition \ref{decreasing_KL}, the invariant set $M$ can only be the singleton set $\{ (\tb{x}_1^*,\tb{x}_2^*) \}$, thus we have
\begin{align*}
        \lim_{n \to \infty} \tilde{\CF}^n_i \left((\tb{x}^0_1,\tb{x}^0_2)\right) = (\tb{x}_1^*,\tb{x}_2^*).
    \end{align*}
\end{proof}

The following proposition characterizes the attractors of a periodic dynamical system.

\begin{prop}[Theorem 3 in \cite{franke2003attractors}]\label{attrat2} Let $\Omega$ be an attractor for the $\CT$-periodic dynamical system $\{f_0,f_1,\dots,f_{\CT-1}\}$. Then $\Omega = \cup^{\CT-1}_{i=0} \Omega_i$, where $\Omega_i$ is an attractor for the map $f_{i+\CT-1} \circ \dots \circ f_{i}$, for $i \in [\CT]$.
\end{prop}

Now Theorem \ref{T2} directly follows from Proposition \ref{attrat}, as it has been shown in our case $\Omega_i = \{ (\tb{x}_1^*,\tb{x}_2^*) \}$.

