\newpage
\subsection{Proof of Theorem~\ref{thm:exp3-ucb}}\label{proof:exp3ucb}

\begin{theorem*}
[Last iterate convergence of EXP3-UCB under limited information]
In a limited information setting of a repeated general-sum Stackelberg game with noisy bandit feedback, applying EXP3-UCB, with $\alpha = \mathcal{O}\left(T^{-\frac{1}{3}}\right)$, $\eta = \mathcal{O}\left(T^{-\frac{1}{3}}\right)$, for $a\neq a_{se}$, with probability at least $1-3\delta$,
\[
\mP\big[a\neq a_{se} \big] \leq \alpha + A\exp\left( -\Delta_2  T^{\frac{2}{3}} + 2\sqrt{2A \log\frac{2}{\delta}} T^{\frac{1}{3}} +  \frac{C_1 A B}{\Delta_1^2}\log \frac{T}{\delta} \log \frac{1}{\delta} \right),
\]
where $C_1$ is a constant and $\Delta_2$ is the leader's suboptimality reward gap to the Stackelberg equilibrium:
\[
    \Delta_2 = \min_{a\neq a_{se}} \big[\mu_l(a_{se},b_{se}) - \mu_l(a,\tbr(a))\big].
\]
\end{theorem*}

\bigskip
We first prove the following two lemmas which will then be used to prove Theorem~\ref{thm:exp3-ucb}. We denote $ \tr_t(a) = \frac{1}{\Tilde{x}_{t}(a)} r_{l,t}(a, \mF_{t}(a)) \mI\{a_t = a\}$. We denote $r_t(a) = \mu_{l}(a, \mF_{t}(a))$ and $\mu(a) = \mu_{l}(a, \tbr(a))$ in this section when there is no ambiguity.
\begin{lemma}\citep{wu2022multi}\label{lemma:concentration}
For $a\in\mA$, $T>0$, with probability at least $1-\delta$,
\begin{equation*}
     \Big|\sum_{t=1}^T \eta(\tr_t(a)-r_t(a))\Big| < \sqrt{2  \eta^2\frac{A}{\alpha} T \log\frac{2}{\delta}}.
\end{equation*}
\end{lemma}
\begin{proof}[\textbf{Proof of Lemma~\ref{lemma:concentration}}]
Fix $T>0$ and any action $a$. Let $\Xi_t=\eta (\tr_t(a)-r_t(a))$ and $ S_t=\sum_{\tau=1}^t \Xi_{\tau}$. Since $T$ is fixed, $\mathbb{E}[S_t]$ is bounded. Moreover, we have
\begin{align*}
    \mathbb{E}[S_t|S_{t-1},\cdots,S_1] &= S_{t-1} + \eta \cdot \mathbb{E}[\tr_t(a)-r_t(a)|\mG_{t-1}]=S_{t-1}.
\end{align*}
By definition, $\{S_t\}_{t=1}^T$ is a martingale. Applying Azuma's inequality to $\{S_t\}$, for any $x>0$ and $1\leq t \leq T$, we have
\begin{equation}\label{eq:Benn}
    \mathbb{P}[|S_t| \geq x]\leq 2\exp{\Big(-\frac{x^2}{2W}\Big)},
\end{equation}
where $W$ is an upper bound on $\sum_{\tau=1}^t \mathbb{E}[\Xi_{\tau}^2|\mG_{\tau-1}]$. Note that
\begin{align*}
\notag
\mathbb{E}[\Xi_t^2|\mathcal{G}_{t-1}]&= \eta^2\mathbb{E}\Big[\big(0-r_t(a)\big)^2 \cdot (1-x_{t}(a)) + \Big(\tr_t(a)-r_t(a)\Big)^2 \cdot x_{t}(a)\Big|\mathcal{G}_{t-1}\Big] \\  \notag
&=\eta^2\mathbb{E}\Big[r_{l,t}^2(a, \mF_{t}(a))\cdot\frac{1}{x_t(a)}-r_t^2(a)\Big|\mathcal{G}_{t-1}\Big] \\ \label{eq:xisquareupper}
& \leq \frac{A}{\alpha}\eta^2.
\end{align*}
Taking $t=T$ and $W=\frac{A}{\alpha}\eta^2 T$ in Eq.~\eqref{eq:Benn}, the right-hand side is at most $\delta$ whenever
\begin{equation*}\label{eq:solvex}
    x\geq \sqrt{2W\log \frac{2}{\delta}},
\end{equation*}
which implies that, with probability at least $1-\delta$,
\[
\Big|\sum_{t=1}^T \eta(\tr_t(a)-r_t(a))\Big| < \sqrt{2  \eta^2\frac{A}{\alpha} T \log\frac{2}{\delta}}.
\]
\end{proof}

\begin{lemma}\label{lemma: bound for wbr and wwr}
We set $\delta=T^{-\beta}$ in $\text{ucb}_{f,t}(\cdot,\cdot)$ with $\beta\geq 3$. Let $T_{wbr}(a)$ denote the number of rounds in which the follower did not play the best response when the leader played $a$. Then, with probability at least $1-\delta$, for all $a\in\mA$, we have
% \begin{equation*}
%     T_{wbr}(a)\leq \mO\left(B\frac{\log(T/\delta)}{\Delta_1^2} + B\log\left(\frac{AB}{\delta}\right)\right)=\mO\left(B\frac{\log(T/\delta)}{\Delta_1^2}\right).
% \end{equation*}
\begin{equation*}
    T_{wbr}(a)\leq \mO\left(B\frac{\log(T/\delta)}{\Delta_1^2}\right).
\end{equation*}
Similarly, let $T_{wwr}(a)$ denote the number of wrong-recognition rounds of $\twr(a)$ in Algorithm~\ref{alg:UCB-FM} for each action $a\in\mA$. Then, with probability at least $1-\delta$, for all $a\in\mA$, we have
% \begin{equation*}
%     T_{wwr}(a)\leq \mO\left(B\frac{\log(ABT/\delta)}{\Delta_1^2} + B\log\left(\frac{AB}{\delta}\right)\right)=\mO\left(B\frac{\log(ABT/\delta)}{\Delta_1^2} \right).
% \end{equation*}
\begin{equation*}
    T_{wwr}(a)\leq \mO\left(B\frac{\log(ABT/\delta)}{\Delta_1^2} \right).
\end{equation*}
\end{lemma}
\begin{proof}
% See Theorem 10.14 of \citet{orabona2019modern} for a detailed proof. 
Combining Theorem 10.14 of~\citet{orabona2019modern} with Bernstein's inequality yields both bounds.
\end{proof}



\begin{proof}[\textbf{Proof of Theorem~\ref{thm:exp3-ucb}}]
We denote $\mu(a) = \mu_{l}(a, \tbr(a))$ in this section when there is no ambiguity. For $a\neq a_{se}$, based on E.4 of \citet{yu2022learning}, we have
\begin{equation}\label{eq:bound for not best response}
\begin{aligned}
\sum_{t=1}^T \Big[r_t(a) - r_t(a_{se})\Big]
& = \sum_{t=1}^T \Big[\Big(r_t(a) - \mu(a)\Big) + \Big(\mu(a) - \mu(a_{se})\Big) + \Big(\mu(a_{se}) - r_t(a_{se})\Big)\Big] \\
& \leq \sum_{t=1}^T \Big(r_t(a) - \mu(a)\Big) - \Delta_2 T + \sum_{t=1}^T \Big(\mu(a_{se})- r_t(a_{se})\Big) \\
& \leq 4 s_{m} \sum_{a^\prime\in\mA} T_{wbr}(a^\prime) - \Delta_2 T,
\end{aligned}
\end{equation}

where $s_m$ denotes the largest interval between the $i$-th and the $(i+1)$-th time the leader chooses action $a$, over all $i\in [n(a)-1]$. With probability at least $1-\delta$, we have
\[
s_{m} \leq \mO\left(\frac{A}{\alpha} \log \frac{1}{\delta}\right).
\]

We denote $y_t(a) = \sum_{\tau=1}^t \eta \tr_\tau(a)$. For $a\neq a_{se}$, combining Lemma~\ref{lemma:concentration} with Eq.~\eqref{eq:bound for not best response}, we have
\begin{equation*}
\begin{aligned}
y_T(a) - y_T(a_{se})
&= \sum_{t=1}^T \eta \left[\tr_t(a) - \tr_t(a_{se})\right] \\ 
&= \sum_{t=1}^T \eta \left[\tr_t(a) - r_t(a)\right] + \sum_{t=1}^T \eta \left[r_t(a) - r_t(a_{se})\right] +\sum_{t=1}^T \eta \left[r_t(a_{se}) - \tr_t(a_{se})\right] \\
& \leq 2\sqrt{2  \eta^2\frac{A}{\alpha} T \log\frac{2}{\delta}} + 4 \eta s_{m} \sum_{a^\prime\in\mA} T_{wbr}(a^\prime) - \Delta_2 \eta T.
\end{aligned}
\end{equation*}




For any $a\neq a_{se}$, we set $\alpha = \mO\left(T^{-\frac{1}{3}}\right)$, $\eta = \mO\left(T^{-\frac{1}{3}}\right)$, by the union bound, with probability at least $1-3\delta$,
% at least $1-3A\delta$
\begin{equation}\label{eq:bound for probability}
\begin{aligned}
x_{T+1}(a) 
&= \frac{\exp (y_T(a))}{\sum_{a^\prime \in \mA} \exp(y_T(a^\prime))} 
\leq \frac{\exp (y_T(a))}{\exp (y_T(a_{se}))}  \\
&\leq \exp\left( -\Delta_2 T^{\frac{2}{3}} + 2\sqrt{2A \log\frac{2}{\delta}} T^{\frac{1}{3}} +  \frac{C_1 A B}{\Delta_1^2}\log \frac{T}{\delta} \log \frac{1}{\delta} \right).
\end{aligned}
\end{equation}
Note that
\begin{equation*}
    \mP\big[a_{T}\neq a_{se}\big] = \sum_{a\neq a_{se}} \tx_{T}(a) \leq  \alpha + \sum_{a\neq a_{se}} x_T(a).
\end{equation*}
So, combined with Eq.\eqref{eq:bound for probability}, we have reached our conclusion. 

\end{proof}

% \noindent
\textbf{Remark:} 
Last-iterate convergence refers to the phenomenon where the sequence of actions converges to the optimal action or the sequence of policies converges to the optimal policy, formulated as $\lim_{t \to \infty} a_t = a_\star$. It represents a particularly strong form of convergence, which is inevitably a stronger notion than the average-iterate convergence~\citep{wu2022multi, abe2022last, cai2024uncoupled}: an algorithm demonstrating last-iterate convergence is proven to also be capable of achieving average-iterate convergence whereas the reverse is not necessarily true. Our findings corroborate this view, indicating that $\lim_{t \to \infty} P(a_t \neq a_\star)=0$ leads to $\lim_{t \to \infty} a_t = a_\star$.

The formulations of the bounds presented in both Theorem 2 and Theorem 3 are conventional within the learning theory community, attributable to divergent analytical approaches applied to EXP3 and UCB type algorithms. 
Generally speaking, Theorem 2 introduces a decay rate of $\mathcal{O}(T^{-1/3} + \exp(-T^{2/3}))$, contrasting with the linear decay in $T$ observed in Theorem 3. This distinction suggests a comparative advantage for Theorem 3, contingent upon a specific constraint on $T$. Nevertheless, given that both bounds are influenced by the gaps, denoted as $\Delta_1, \Delta_2$, a thorough comparison necessitates consideration of these gap values.
