\newpage
\section{Proof for Section 6}
\subsection{Proof for Theorem~\ref{thm:EXP3-FMUCB}}\label{proof:exp3fmucb}
\begin{theorem*}[Last iterate convergence of EXP3-FMUCB with noisy side information]
Suppose the leader uses EXP3 with parameters $\alpha = \mO\left(T^{-\frac{1}{3}}\right)$ and $\eta = \mO\left(T^{-\frac{1}{3}}\right)$, and the follower uses FMUCB. Let $T_{f,w}$ be the total number of rounds, out of $T$, in which the follower does not play the best manipulation strategy. Then, with probability at least $1-3\delta$,
\[
T_{f,w} \leq  \mO\left(\frac{ A^2 B }{\alpha \varepsilon^2} \log \frac{ABT}{\delta} \log\frac{1}{\delta}\right), \quad\text{and}
\]
\[
\mP\big[a\neq a_{se} \big] \leq \alpha + A\exp\left( -\Delta_3 T^{\frac{2}{3}} + 2\sqrt{2A \log\frac{2}{\delta}} T^{\frac{1}{3}} + \frac{C_2 A^2 B}{\varepsilon^2}\log \frac{ABT}{\delta} \log\frac{1}{\delta}\right),
\]
where $\varepsilon = \min\{\Delta_4, \Delta_5, \Delta_6\}$, $C_2$ is a constant.
\end{theorem*}
\bigskip\bigskip

% We study the performance of Algorithm~\ref{alg:UCB-FM} with the assumption that the leader will play every action $a\in\mA$ in turn, to find out how many wrong manipulation follower will conduct. 
We study the rounds in which the follower fails to carry out the manipulation when the leader plays $a$. We drop the subscript $t$ when there is no ambiguity. These failures fall into the following two cases.
\paragraph{Wrong recognition for $\twr(a)$} Let $t_1$ denote the number of rounds in which the follower has not yet identified $\twr(a)$ correctly for every $a\in\mA$. By Lemma~\ref{lemma: bound for wbr and wwr}, we have
\begin{equation*}
    t_1 \leq \sum_{a\in\mA} T_{wwr}(a) \leq \mO\left(AB\frac{\log(ABT/\delta)}{\Delta_1^2} + AB\log\left(\frac{AB}{\delta}\right)\right).
\end{equation*}
\paragraph{Wrong recognition of manipulation when $\twr(a)$ is correct for all $a\in\mA$}
We next study the wrong manipulation rounds when $\twr(a)$ is correct for all $a\neq a_{se}$. For any action pair $(a,b)\in \mA \times \mB$, we consider whether it will be selected as the best manipulation pair or under what condition it will be eliminated by Algorithm~\ref{alg:UCB-FM} when it is not the best manipulation pair. We classify and discuss two different situations:

(i) $\mu_f(a,b) \geq \mu_f(a_{fm},b_{fm})$, but for $\hat{a} = \argmin_{a' \neq a} \mu_{l}(a',\text{wr}(a'))$ we have $\mu_l(\hat{a},\text{wr}(\hat{a})) \geq \mu_l(a,b)$. Recall that $\mu_{l}(\hat{a}, \twr(\hat{a}))-\mu_{l}(a,b) = \Delta_6 \geq \varepsilon$. When $n(a, b) > t_2 = \mO\left(\frac{\log (ABT / \delta)}{\varepsilon^2}\right)$ and $n(\hat{a}, \twr(\hat{a})) > t_2$,
by Lemma~\ref{lemma:ucb bound}, the following inequality holds
\begin{equation*}
\hmu_{l,t}(a,b) \leq \mu_{l}(a,b)+\frac{\varepsilon}{8} < \mu_{l}(\hat{a}, \twr(\hat{a})) - \frac{\varepsilon}{4}
\leq U_{l,t}(\hat{a},\twr(\hat{a})),
\end{equation*}
which means that $(a,b)$ will be eliminated by Algorithm~\ref{alg:UCB-FM} through Line 4. Hence at most $2t_2$ extra rounds are needed to eliminate the wrong manipulation pair $(a,b)$.


(ii) $\mu_f(a,b) < \mu_f(a_{fm},b_{fm})$.
When $n(a,b)>t_3 = \mO\left(\max\left\{\frac{\log (ABT/\delta)}{\varepsilon^2},  \frac{S_1}{\varepsilon^2}\right\}\right)$, by Lemma~\ref{lemma:ucb bound}, we have
\[
U_l(a,b) \leq \mu(a,b)+\frac{\varepsilon}{4} < \mu(a_{fm},b_{fm}) < U_l(a_{fm},b_{fm}).
\]
So the number of extra wrong manipulation rounds for $(a,b)$ is at most $t_3$.

% (iii) $(a,b) = (a_{fm}, b_{fm})$. We denote $\hat{a} = \argmin_{\hat{a} \neq a} \mu_{l}(\hat{a},\text{wr}(\hat{a}))$
% With probability at least $1-\frac{\delta}{ABT}$, 
% \[
% U_l(\hat{a},\text{wr}(\hat{a})) \leq \mu_l(\hat{a},\text{wr}(\hat{a})) < \mu_l(a,b) = \mu_l(a_{fm}, b_{fm}).
% \]
% In this case $(a,b)$ will not be eliminated due to our special UCB design. 

% So combining the upper bound for $T_{wwr}$, (i), and (ii), we have the upper bound for total number of rounds that the follower did not play the optimal manipulation strategy as follows, by union bound we have with probability at least $1 - \delta$
% \[
% \mE\left[T_{f,w}\right]
% \leq \frac{A}{\alpha} \left(t_1 + 2 AB t_2\right) 
% \leq \frac{A}{\alpha} \left(At_0+ 2AB + 2 AB t_2\right) 
% \leq \mO\left(\frac{ \beta A^2 B }{\alpha \varepsilon^2} \log \frac{A B T}{\delta}\right).
% \]


% First we should make clear that when the leader's uniform exploration parameter is $\alpha$, one single estimation mistake of calculating the manipulation strategy will caused at most $\frac{A}{\alpha}$ wrong manipulation rounds.  For instance, if the manipulation strategy of follower is $\mF = \left\{\mF(a):a\in\mA\right\}$, if follower use $\mF(a)$ to response $a$ for $n(\mF(a))$ rounds, it will consume at most $\frac{A}{\alpha}$ total rounds of the game in expectation.

Once the leader has played more than $\mO\left(\frac{A \log(1/\delta)}{\alpha} t_0 \right)$ rounds, the uniform exploration with parameter $\alpha$ guarantees that, with probability at least $1-\delta$, $n(a)\geq t_0$. So by the union bound, with probability at least $1-3\delta$,
\begin{equation*}
    T_{f,w}\leq \mO\left(\frac{A}{\alpha}\log\frac{1}{\delta}\left(t_1 + AB t_2 +AB t_3\right)\right)=\mO\left(\frac{ A^2 B }{\alpha \varepsilon^2} \log \frac{ABT}{\delta} \log\frac{1}{\delta}\right).
\end{equation*}

We denote $\mu_m(a) = \mu_{l}(a, \mF_{fm}(a))$ in this section when there is no ambiguity. For $a\neq a_{se}$, we have
\begin{equation*}
\begin{aligned}
\sum_{t=1}^T \Big(r_t(a) - r_t(a_{fm})\Big)
& = \sum_{t=1}^T \Big[\Big(r_t(a) - \mu_m(a)\Big) + \Big(\mu_m(a) - \mu_m(a_{fm})\Big) + \Big(\mu_m(a_{fm}) - r_t(a_{fm})\Big)\Big] \\
& = \sum_{t=1}^T \Big(r_t(a) - \mu_m(a)\Big) - \Delta_3 T + \sum_{t=1}^T \Big(\mu_m(a_{fm})- r_t(a_{fm})\Big) \\
& \leq  \mO\left(\frac{A\log(1/\delta)}{\alpha}t_1\right)  - \Delta_3 T + 2 s_{m} B (t_2 +t_3).
\end{aligned}
\end{equation*}
Analogous to the proof of Theorem~\ref{thm:exp3-ucb} in Appendix~\ref{proof:exp3ucb}, we can complete the proof of Theorem~\ref{thm:EXP3-FMUCB}. 