\newpage
\subsection{Proof of Theorem~\ref{thm:ucbe-ucb}}\label{proof:ucbeucb}
\begin{theorem*}[Last iterate convergence of UCBE-UCB  under limited information]
In a limited information setting of a repeated general-sum Stackelberg game with noisy bandit feedback, applying UCBE-UCB with $S_0 = \mathcal{O}\left(\frac{B}{\varepsilon^3}\log\frac{ABT}{\delta}\right)$, $\varepsilon = \min\{\Delta_1,\Delta_2\}$, for $T \geq \mathcal{O}\left(\frac{A S_0}{\Delta_1^2}\right)$, with probability at least $1-3\delta$, we have:
\[
\mathbb{P}\big(a_T \neq a_{se}\big) \leq \frac{\delta}{T}.
\]
\end{theorem*}

\bigskip \bigskip
\begin{lemma}\label{lemma:ucb bound}
Let $\tU_l(a,b)=\hmu(a,b) + \sqrt{\frac{S_1}{n(a,b)}}$ with $n(a,b)\geq 1$ and $S_1 = 2 \log \frac{ABT}{\delta}$. When $n(a,b)\geq  \mO\left(\frac{\log (ABT/\delta)}{\varepsilon^2}\right)$, with probability at least $1-\frac{\delta}{ABT}$, we have
\[
\tU_l(a,b) \in \left[ \mu(a,b)-\frac{\varepsilon}{4}, \mu(a,b)+\frac{\varepsilon}{4} \right].
\]
A similar result holds for the follower. 
\end{lemma}

\begin{proof}[\textbf{Proof of Lemma~\ref{lemma:ucb bound}}]
(1) When $n(a,b)\geq \mO\left(\frac{\log (ABT/\delta)}{\varepsilon^2}\right)$, by the Hoeffding inequality, with probability at least $1-\frac{\delta}{ABT}$, we have
\[
\hmu(a,b) \in \left[ \mu(a,b)-\frac{\varepsilon}{8}, \mu(a,b)+\frac{\varepsilon}{8} \right].
\]

(2) When $n(a,b)\geq \mO\left(\frac{S_1}{\varepsilon^2}\right)$, 
\[
\sqrt{\frac{S_1}{n(a,b)}} \leq \frac{\varepsilon}{8} .
\]

So combining (1) and (2), when $n(a,b)\geq \mO\left(\max\left\{\frac{\log (ABT/\delta)}{\varepsilon^2},  \frac{S_1}{\varepsilon^2}\right\}\right)$, with probability at least $1-\frac{\delta}{ABT}$, we have
\[
\tU_l(a,b) \in \left[ \mu(a,b)-\frac{\varepsilon}{4}, \mu(a,b)+\frac{\varepsilon}{4} \right].
\]
The bound for the UCB term of the follower is completely analogous and thus omitted.
\end{proof}


\begin{proof}[\textbf{Proof of Theorem~\ref{thm:ucbe-ucb}}]
We next consider only the rounds in which the leader plays a fixed action $a\in\mA$. With a slight abuse of notation, $t$ indexes the $t$-th time that the leader plays the action $a$.
We denote $\tucb_l(a) = \hmu(a) + \sqrt{\frac{S_0}{n(a)}}$. We ignore the subscript $t$ when there is no ambiguity.

Notice that when $n(a) < \frac{S_0}{4}$, $\tucb_l(a) > 2$, and when $n(a) \geq S_0$, $\tucb_l(a) \leq 2$. So when the total number of rounds $T$ ($T \geq \mO\left(A S_0\right)$) is large enough, the leader will play action $a$ for at least $\frac{S_0}{4}$ rounds.

We next bound the difference between the cumulative reward the leader receives and the cumulative reward under best responses,
\begin{align*}
&\left|\sum_{t=1}^{n(a)} \big(r_{l,t}(a, b_t)-\mu_l(a, \tbr(a))\big)\right| \\
=& \left|\sum_{t=1}^{n(a)} \big(r_{l,t}(a, b_t) - \mu_l(a, \tbr(a))\big)\mI\left\{b_t = \tbr(a)\right\}
+ \sum_{t=1}^{n(a)} \big(r_{l,t}(a, b_t) - \mu_l(a, \tbr(a))\big)\mI\left\{b_t \neq \tbr(a)\right\}\right| \\
% \leq & \sum_{t=1}^{n(a)} \left(r_{l,t}(a, \tbr(a)) - \mu_l(a, \tbr(a))\right)\mI\left\{b_t = \tbr(a)\right\} + \sum_{t=1}^{n(a)} \mI\left\{b_t \neq \tbr(a)\right\}\\
\leq & \left|\sum_{t=1}^{n(a)} \big(r_{l,t}\left(a, \tbr\left(a\right)\right) - \mu_l\left(a, \tbr(a)\right)\big)\mI\left\{b_t = \tbr(a)\right\}\right| + T_{wbr}(a).
\end{align*}
% We have with probability at least $1- \frac{\delta}{ABT}$,
% \[
% T_{wbr}(a)=\sum_{t=1}^{n(a)} \mI\left\{b_t \neq \tbr(a)\right\} \leq t_0 + 2B + \mO\left( \log \frac{ABT}{\delta}\right) = \mO\left( \frac{B}{\varepsilon^2} \log \frac{ABT}{\delta}\right) 
% \]
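Here $T_{wbr}(a)=\sum_{t=1}^{n(a)} \mI\left\{b_t \neq \tbr(a)\right\}$ denotes the number of rounds in which the follower does not play $\tbr(a)$. When $n(a)-T_{wbr}(a)\geq\mO\left(\frac{\log (ABT/\delta) }{\varepsilon^2}\right)$, by Lemma~\ref{lemma:ucb bound}, with probability at least $1- \frac{\delta}{ABT}$,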
% \[
% \left|\hat{r}_l(a, \tbr(a))-\hmu_l(a, \tbr(a))\right| \leq \frac{\varepsilon}{8}.
% \]
\[
\frac{1}{n(a)}\left|\sum_{t=1}^{n(a)} \big(r_{l,t}\left(a, \tbr\left(a\right)\right) - \mu_l\left(a, \tbr(a)\right)\big)\mI\left\{b_t = \tbr(a)\right\}\right| \leq \frac{\varepsilon}{8}.
\]
So when $n(a) \geq T_{wbr}(a) + \mO\left(\frac{\log (ABT/\delta)}{\varepsilon^2}\right)$ and $n(a) \geq \frac{8}{\varepsilon} T_{wbr}(a)$, with probability at least $1- \frac{\delta}{ABT}$, we have
\begin{align*}
    &\frac{1}{n(a)} \left| \sum_{t=1}^{n(a)} \big(r_{l,t}(a, b_t)-\mu_l(a, \tbr(a))\big) \right|
\leq  \frac{\varepsilon}{8} + \frac{1}{n(a)} T_{wbr}(a)
\leq  \frac{\varepsilon}{8} + \frac{\varepsilon}{8} = \frac{\varepsilon}{4}.
\end{align*}
We set $S_0\geq \mO\left( \frac{B}{\varepsilon^3} \log\frac{ABT}{\delta}\right)$, which guarantees $n(a)\geq \mO\left( \frac{B}{\varepsilon^3} \log\frac{ABT}{\delta}\right)$, and hence, by the above analysis, the following inequality holds:
\[
\left|\hmu_l(a) - \mu_l(a, \tbr(a)) \right| \leq \frac{\varepsilon}{4}.
\]
When $n(a)\geq \mO\left( \frac{S_0}{\varepsilon^2} \right)$, $\sqrt{\frac{S_0}{n(a)}}\leq \frac{\varepsilon}{8}$, then with probability at least $1-\frac{\delta}{T}$, for all $a\in\mA$, we have,
\[
\left|\tucb_l(a) - \mu_l(a, \tbr(a))\right| \leq \frac{3\varepsilon}{8} < \frac{\varepsilon}{2}.
\]
So $\argmax_{a\in\mA}\tucb_l(a) = a_{se}$.
Then by the union bound, with probability at least $1-3\delta$, we have 
\begin{equation*}
    \mP\big[a_T\neq a_{se}\big]\leq \frac{\delta}{T}.
\end{equation*}
\end{proof}