\section{Proofs for Section 4}
\subsection{Proof of Theorem~\ref{thm:non-convergence}}\label{proof:non-convergence}
\begin{table}[ht]
\vspace{-1mm}
\centering
%\setlength{\tabcolsep}{12mm}{
\begin{tabular}{|c|c|c|}
\hline
Leader / Follower   & $b_1$ & $b_2$  \\
\hline
$a_1$  &  (0.95, 0.3)  &  (0.9, 0.2)    \\
\hline
$a_2$  &  (1, 0.8)  &   (0, 0.79)     \\
\hline         	
\end{tabular}
%}
\caption{Payoff matrix of a specific Stackelberg game. Each entry lists the (leader, follower) payoffs.}
\label{tab:non-convengence}
\vspace{-4mm}
\end{table}
We apply the UCB-UCB algorithm to a specific Stackelberg game, as depicted in Table~\ref{tab:non-convengence}, where the Stackelberg equilibrium is identified as $(a_2, b_1)$. The UCB algorithms for the leader and follower are defined respectively as:
\begin{equation*}
\text{ucb}_{l,t}(a) = \hat{\mu}_{l,t}(a) + \sqrt{\frac{2\log(T/\delta)}{n_t(a)}},
\end{equation*}
and
\begin{equation*}
\text{ucb}_{f,t}(a,b) = \hat{\mu}_{f,t}(a,b) + \sqrt{\frac{2\log(T/\delta)}{n_t(a,b)}}.
\end{equation*}
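For simplicity of exposition, we assume that the players observe their true rewards in each round, so that the empirical means coincide with the true means. Since the leader's reward under $a_1$ is at least $0.9$ regardless of the follower's response, $\text{ucb}_{l,t}(a_1) > 0.9$. When the leader plays $a_2$, the follower's UCB values for $b_1$ and $b_2$ are: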
\begin{align*}
\text{ucb}_{f,t}(a_2,b_1) &= \mu_f(a_2,b_1) + \sqrt{\frac{2\log(T/\delta)}{n_t(a_2,b_1)}}, \\
\text{ucb}_{f,t}(a_2,b_2) &= \mu_f(a_2,b_2) + \sqrt{\frac{2\log(T/\delta)}{n_t(a_2,b_2)}}.
\end{align*}
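Let $n_t(a_2, b_1) = 2k_1 \log(T/\delta) + 1$ and $n_t(a_2, b_2) = 2k_2 \log(T/\delta)$. Consider the last round in which the follower chose $b_1$ in response to $a_2$: at that point $b_1$ had been played $n_t(a_2, b_1) - 1$ times and $b_2$ at most $n_t(a_2, b_2)$ times. For the follower to have preferred $b_1$ over $b_2$ under the UCB rule, the following inequality must hold: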
\begin{equation*}
\mu_{f}(a_2,b_1)+ \sqrt{\frac{2\log(T/\delta)}{n_t(a_2,b_1)-1}} > \mu_{f}(a_2,b_2)+ \sqrt{\frac{2\log(T/\delta)}{n_t(a_2,b_2)}}.
\end{equation*}
Given $\mu_{f}(a_2,b_1) - \mu_{f}(a_2,b_2) = 0.01$, it follows that:
\begin{equation*}
k_2 > \frac{1}{\left(0.01 + \sqrt{\frac{1}{k_1}}\right)^2}.
\end{equation*}
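The right-hand side is increasing in $k_1$, and setting $k_1 = 10^4$ gives $k_2 > \frac{1}{4} \times 10^4 = \frac{1}{4}k_1$. Hence, if $k_1 + k_2 = \frac{5}{4} \times 10^4$, then $k_1 < 10^4$ and $k_2 > \frac{1}{4} \times 10^4$, so that $k_1 < 4k_2$; that is, $b_1$ accounts for less than a $\frac{4}{5}$ fraction of the follower's responses to $a_2$. Given $n_t(a_2) = n_t(a_2, b_1) + n_t(a_2, b_2)$ and assuming $n_t(a_2) = \frac{5}{4} \times 10^4 \log(T/\delta)$, for sufficiently large $T$, the leader's UCB for $a_2$ becomes: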
\begin{align*}
\text{ucb}_{l,t}(a_2) &= \frac{n_t(a_2, b_1)\mu_{l}(a_2,b_1) + n_t(a_2, b_2)\mu_{l}(a_2,b_2)}{n_t(a_2)} + \sqrt{\frac{2\log(T/\delta)}{n_t(a_2)}} < 0.9,
\end{align*}
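which contradicts the leader's UCB rule: since $\text{ucb}_{l,t}(a_1) > 0.9$, the leader would have preferred $a_1$ over $a_2$. Thus, $n_t(a_2) < \frac{5}{4} \times 10^4 \log(T/\delta) = \mathcal{O}(\log(T/\delta))$, i.e., the leader plays the optimal action $a_2$ at most $\mathcal{O}(\log(T/\delta))$ times. Consequently, for sufficiently large $T$, the leader plays the suboptimal action $a_1$ in all but $\mathcal{O}(\log(T/\delta))$ rounds and therefore incurs regret linear in $T$.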



% We consider applying UCB-UCB algorithm to a specific Stackelberg game (Table~\ref{tab:non-convengence}), and the Stackelberg equilibrium of this game is $(a_2, b_1)$.
% The leader's UCB algorithm is:
% \[
% \text{ucb}_{l,t}(a)= \hmu_{l,t}(a)+\sqrt{\frac{ 2  \log (T/\delta) }{n_t(a)}}.
% \]

% The follower's UCB algorithm is:
% \[
% \text{ucb}_{f,t}(a,b)= \hmu_{f,t}(a,b)+ \sqrt{\frac{ 2  \log (T/\delta) }{n_t(a,b)}}.
% \]
% We assume players will receive actual true rewards at each round for simplicity of the proof illustration. 
% So $\tucb_{l,t}(a_1)>0.9$, and 
% \[
% \text{ucb}_{f,t}(a_2,b_1)= \mu_f(a_2,b_1) + \sqrt{\frac{ 2  \log (T/\delta) }{n_t(a_2,b_1)}}, \text{ucb}_{f,t}(a_2,b_2)= \mu_f(a_2,b_2) + \sqrt{\frac{ 2  \log (T/\delta) }{n_t(a_2,b_2)}}.
% \]
% In the following assume that $n_t(a_2, b_1)=2k_1 \log (T/\delta) + 1$, $n_t(a_2, b_2)=2k_2 \log (T/\delta)$. 
% According to the mechanism of UCB algorithm, we have
% % Reason: This is because when the follower want to play $b_1$, the UCB term of $b_1$ must greater than $b_2$
% \[
% \mu_{f}(a_2,b_1)+ \sqrt{\frac{ 2  \log (T/\delta) }{n_t(a_2,b_1)-1}}>\mu_{f}(a_2,b_2)+ \sqrt{\frac{ 2  \log (T/\delta) }{n_t(a_2,b_2)}}.
% \]
% As $\hmu_{f,t}(a_2,b_1)-\hmu_{f,t}(a_2,b_2)=0.01$, we have
% \[
% k_2 > \frac{1}{\left(0.01+\sqrt{\frac{1}{k_1}}\right)^2}.
% \]
% Assume that $k_2=10^4$, we have $k_1>\frac{1}{4}k_2$.
% If $k_1 + k_2= \frac{5}{4}\times 10^4$, then $k_1<4 k_2$.
% Noticed that we have $n_t(a_2)=n_t(a_2, b_1) + n_t(a_2, b_2)$. Assume $n_t(a_2)= \frac{5}{4}\times 10^4 \log (T/\delta)$, for big enough $T$, we have
% \begin{align*}
% \text{ucb}_{l,t}(a_2)
% &= \hmu_{l,t}(a_2)+\sqrt{\frac{ 2  \log (T/\delta) }{n_t(a_2)}} \\ 
% &= \frac{n_t(a_2, b_1)\mu_{l}(a_2,b_1)+n_t(a_2, b_2)\mu_{l}(a_2,b_2)}{n_t(a_2)}+ \sqrt{\frac{ 2  \log (T/\delta) }{n_t(a_2)}}<0.9,
% \end{align*}
% which is a contradictory. So $n_t(a_2)< \frac{5}{4}\times 10^4 \log (T/\delta)=\mathcal{O}\left(\log (T/\delta)\right)$, which implies the leader will only play the optimal action no more than $\mathcal{O}\left(\log (T/\delta)\right)$ times. So we conclude that for big enough $T$ the leader will suffer regret linear in $T$.
