\section{Manipulative follower with noisy side information}\label{sec:farsighted_bandit}
%\hp{add motivation} \yl{I don't if it is suitable to introduce the importance of noisy bandit feedback setting, since in Section 4 we already use noisy bandit feedback setting.}

%The follower needs to explore and manipulate during the online process. 




% $n_t(a,b)=\sum_{\tau=1}^{t} \mathbb{I}\left\{a_\tau= a \land b_\tau= b \right\}$, 
% $\hmu_{f,t}(a,b) = \frac{1}{n_t(a,b)}\sum_{\tau=1}^{t} r_{f,t}(a,b)\mathbb{I}\left\{a_\tau= a \land b_\tau= b \right\}$, $\hmu_{l,t}(a,b) = \frac{1}{n_t(a,b)}\sum_{\tau=1}^{t} r_{l,t}(a,b)\mathbb{I}\left\{a_\tau= a \land b_\tau= b \right\}$.

\subsection{Follower's manipulation strategy}
\setlength{\textfloatsep}{1mm}
\begin{algorithm}[ht]
\caption{FMUCB($t$): Follower's manipulation by UCB at round $t$}
\label{alg:UCB-FM}
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
\begin{algorithmic}[1]
\REQUIRE Candidate set $\mathcal{K}=\mathcal{A} \times \mathcal{B}$, $\tU_{l,t}(\cdot,\cdot)$, $\tU_{f,t}(\cdot,\cdot)$, $\hmu_{l,t}(\cdot,\cdot)$.
% \WHILE{$\max_{a\neq a^\prime} \mu_l(a,\mathcal{F}(a))>\mu_l(a^\prime,b^\prime)$}
 \STATE Candidate manipulation pair $(a^\prime,b^\prime)=\argmax_{(a,b)\in\mathcal{K}} \tU_{f,t}(a,b)$
 \STATE $\mathcal{F}=\{\mathcal{F}(a^\prime)=b^\prime\}\cup\{\mathcal{F}(a)=\argmin_{b\in\mathcal{B}} \tU_{l,t}(a,b): a\neq a^\prime\}$
 
 % \WHILE{$\max_{a\neq a^\prime} U_{l,t}^{-}(a,\mathcal{F}
 % (a))>U_{l,t}^{+}(a^\prime,b^\prime)$}
 \IF{$\max_{a\neq a^\prime} \tU_{l,t}(a,\mathcal{F}
 (a))\geq\hmu_{l,t}(a^\prime,b^\prime)$}
 % \STATE \yaolong{Need to add an exploration parameter, the primary algorithm can not guarantee convergence. Solution: use UCB exploration.}
 \STATE Eliminate $(a^\prime, b^\prime)$ from the candidate set: $\mathcal{K}\leftarrow \mathcal{K} \setminus \{(a^\prime,b^\prime)\}$
 \STATE Return to Line 1
\ENDIF
\ENSURE The response function $\mathcal{F}$
\end{algorithmic}
\end{algorithm}

We now study the more intricate setting of learning the best manipulation strategy with noisy side information. 
We first present the follower manipulation algorithm FMUCB in Algorithm~\ref{alg:UCB-FM}, a variant of FBM (Algorithm~\ref{alg:FBM}) that uses UCB to learn the best follower manipulation strategy with noisy side information. Algorithm~\ref{alg:UCB-FM} largely resembles Algorithm~\ref{alg:FBM}, and hence we omit the detailed description. Besides the idea inherited from FBM of manipulating the leader's learning, another key intuition of FMUCB is to design appropriate UCB terms, in place of the true reward terms $\mu_l(a,b)$ and $\mu_f(a,b)$ in FBM, that balance the trade-off between \textit{exploration} and \textit{manipulation}. 

In Line 1, the goal is to find the candidate manipulation pair that maximizes $\mu_f(\cdot,\cdot)$. Therefore, we maximize the upper confidence bound $\text{U}_{f,t}(a,b)$:
\begin{equation}
\text{U}_{f,t}(a,b)= \hmu_{f,t}(a,b)+\sqrt{\frac{2 \log (ABT/\delta)}{n_t(a,b)}}.  
\end{equation}
The suboptimal manipulation pair will not be selected after enough exploration, given the existence of the following suboptimality gap: for any action pair $(a_k, b_k)\in \mA \times \mB$, satisfying $\mu_f(a_k, b_k)<\mu_f(a_{fm},b_{fm})$,
%it must satisfy that $\mu_f(a_{fm}, b_{fm}) > \mu_f(a_k, b_k)$, and 
\[ 
\Delta_4 = \min_{(a_k, b_k)} \mu_f(a_{fm},b_{fm}) -\mu_f(a_k,b_k). 
\]
Conversely, in Line 2, the goal is to find the ``worst response'' $\twr(a) = \argmin_{b\in\mB}\mu_l(a,b)$ for each action $a\neq a_{fm}$. Therefore, we define the term $\text{U}_{l,t}(a,b)$ to be the lower confidence bound: 
\begin{equation}
\text{U}_{l,t}(a,b)= \hmu_{l,t}(a,b)-\sqrt{\frac{2 \log (ABT/\delta)}{n_t(a,b)}}.
\end{equation}
For each action $a$, the suboptimality gap against $\twr(a)$ is defined as 
$\Delta_{5,a} = \min_{b\neq \twr(a)} \mu_l(a, b) - \mu_l(a, \twr(a))$, and the minimum suboptimality gap for all $a\in\mA$ is 
\[\Delta_5 = \min_{a\in\mA} \Delta_{5, a}.\]
Given $\Delta_5$, a suboptimal response $b\neq\twr(a)$ will not be selected as the worst response after enough exploration. 

When verifying whether the current $\{\mF, (a^\prime, b^\prime)\}$ is a qualified manipulation (Line 3), we also use $\text{U}_{l,t}(a,\mathcal{F}(a))$, as it lower bounds $\mu_{l}(a,\mathcal{F}(a))$. This guarantees that the true best manipulation solution satisfies the constraint, since the following holds with high probability: for any action $a\neq a_{fm}$, 
\[ 
\text{U}_{l,t}(a,\twr(a)) \leq \mu_l(a,\twr(a)) < \mu_l(a_{fm},b_{fm}). 
\] 
This also eliminates infeasible solutions, since $\text{U}_{l,t}(a,\twr(a))$ approaches $\mu_{l}(a,\twr(a))$, and the following suboptimality gap exists: for any action pair $(a_k, b_k)\in \mA \times \mB$ satisfying $\mu_f(a_{fm},b_{fm}) < \mu_f(a_k, b_k)$,  
\[ 
\Delta_6 = \min_{(a_k, b_k)} \max_{a\neq a_k} \mu_l(a,\twr(a)) - \mu_l(a_{k},b_{k}). 
\] 
%\hp{why $\mu_f(a_{fm},b_{fm}) < \mu_f(a_k, b_k)$}\yl{Eliminate those $\mu_f(a_k, b_k)>\mu_f(a_{fm},b_{fm}) $ but do not satisfy the constraint.}
Hence, Algorithm~\ref{alg:UCB-FM} can find the follower's best manipulation with noisy side information. 



\begin{figure*}[htb]
\vspace{-3mm}
  \centering
  \begin{minipage}[t]{0.32\linewidth}
    \centering
    \includegraphics[width=\linewidth]{Img/1.1.png}
    \subcaption{Limited information}
    \label{fig:subfig1}
  \end{minipage}
  \begin{minipage}[t]{0.32\linewidth}
    \centering
    \includegraphics[width=\linewidth]{Img/2.1.png}
    \subcaption{Omniscient follower}
    \label{fig:subfig2}
  \end{minipage}
  \begin{minipage}[t]{0.32\linewidth}
    \centering
    \includegraphics[width=\linewidth]{Img/2.2.png}
    \subcaption{Noisy side information}
    \label{fig:subfig3}
  \end{minipage}
  \caption{Experiments for the limited information and side information settings. Each experiment is run 5 times to calculate the mean and std (standard deviation). The shaded area shows $\pm$std.}
  \label{fig:subfig}
  \vspace{-2mm}
\end{figure*}


\subsection{Last iterate convergence analysis}
We also provide theoretical guarantees for the sample efficiency and convergence of this algorithm. As a continuation of Section~\ref{sec:myopic_follower}, we analyze the performance of our algorithm against two kinds of leader algorithms---EXP3 and UCBE. 

\begin{theorem}[Last iterate convergence of EXP3-FMUCB with noisy side information]\label{thm:EXP3-FMUCB}
Suppose the leader uses EXP3 with parameters $\alpha = \mO\left(T^{-\frac{1}{3}}\right)$ and $\eta = \mO\left(T^{-\frac{1}{3}}\right)$, and the follower uses FMUCB. Let $T_{f,w}$ be the total number of rounds, out of $T$, in which the follower does not play the best manipulation strategy. Then, with probability at least $1-3\delta$,
\[
T_{f,w} \leq  \mO\left(\frac{ A^2 B }{\alpha \varepsilon^2} \log \frac{ABT}{\delta} \log\frac{1}{\delta}\right), \quad \text{and}
\]
\[
\begin{aligned}
&\mP\big[a_T \neq a_{fm} \big] \leq \alpha + \\
&A\exp\left( -\Delta_3 T^{\frac{2}{3}} + 2\sqrt{2A \log\frac{2}{\delta}} T^{\frac{1}{3}} + \frac{C_2 A^2 B}{\varepsilon^2}\log \frac{ABT}{\delta} \log\frac{1}{\delta}\right),
\end{aligned}
\]
where $\varepsilon = \min\{\Delta_4, \Delta_5, \Delta_6\}$, $C_2$ is a constant.
\end{theorem}
\begin{theorem}[Last iterate convergence of UCBE-FMUCB with noisy side information]\label{thm:UCBE-FMUCB}
Suppose the leader uses UCBE with $S_0 = \mO\left( \frac{B}{\varepsilon^2} \log\frac{ABT}{\delta}\right)$, where $\varepsilon = \min\{\Delta_4, \Delta_5, \Delta_6\}$, and the follower uses FMUCB. Let $T_{f,w}$ be the total number of rounds, out of $T$, in which the follower does not play the best manipulation strategy. Then, with probability at least $1-3\delta$,
\[
T_{f,w} \leq  \mO\left(\frac{ A B }{ \varepsilon^2} \log \frac{ABT}{\delta}\right),
\]
and for $T\geq  \mO\left( \frac{A S_0}{\Delta_3^2}\right)$, $\mathbb{P}\big[a_T \neq a_{fm} \big]\leq \frac{\delta}{T}$.
\end{theorem}
The proofs of Theorems~\ref{thm:EXP3-FMUCB} and~\ref{thm:UCBE-FMUCB} are in Appendix~\ref{proof:exp3fmucb} and Appendix~\ref{proof:ucbefmucb}, respectively. The two theorems present the sample complexity for learning the best manipulation strategy using FMUCB with noisy side information. They also guarantee that the game achieves last-iterate convergence to the best manipulation pair $(a_{fm},b_{fm})$ with the algorithms EXP3-FMUCB and UCBE-FMUCB. 














% In the noisy bandit feedback scenario, it's not easy to find the best manipulation strategy. A wrong recognition for manipulation pair can be categorized into two cases:

% (i) Wrong manipulation pair $(\ap, \bp)$ satisfies $\mu_f(a^\prime, b^\prime) \geq \mu_f(a_{fm},b_{fm})$, but exists $\hat{a} = \argmin_{\hat{a} \neq \ap} \mu_{l}(\hat{a},\text{wr}(\hat{a}))$, $\mu_l(\hat{a},\text{wr}(\hat{a})) \geq \mu_l(\ap,\bp)$, which means manipulation pair $(a,b)$ violates the constraints.
% So the reason for selecting this wrong manipulation pair using Algorithm~\ref{alg:UCB-FM} is that samples is not enough to do the judgement about whether $(a,b)$ satisfies the constraint or not. Insufficient exploration leads to this wrong recognization.

% So an essential gap for this situation is for any action pair $(a_k, b_k)\in \mA \times \mB$ satisfies $\mu_f(a_{fm},b_{fm}) < \mu_f(a_k, b_k)$, 
% \[
% \Delta_5 = \min_{(a_k, b_k)} \min_{a\neq a_k} \mu_l(a,\twr(a)) - \mu_l(a_{k},b_{k}).
% \]
% After enough exploration, $U_{l,t}(\hat{a},\text{wr}(\hat{a}))>\hmu(\ap,\bp)$ will hold due to the existence of $\Delta_4$, and $(\ap,\bp)$ will be eliminated by the constraint test (Line 3).


% (ii) Wrong manipulation pair $(\ap,\bp)$ satisfies $\mu_f(\ap,\bp) < \mu_f(a_{fm},b_{fm})$. The statement $U_{f,t}(a,b)>U_{f,t}(a_{fm},b_{fm})$ may happen with insufficient exploration, which may result in selecting this wrong manipulation pair. This wrong manipulation pair will not be selected after enough exploration  (Line 1) and the existence of the following gap:
% For any action pair $(a_k, b_k)\in \mA \times \mB$ it satisfies that $\mu_f(a_{fm}, b_{fm}) > \mu_f(a_k, b_k)$, 
% \[
% \Delta_6 = \min_{(a_k, b_k)} \mu_f(a_{fm},b_{fm}) -\mu_f(a_k,b_k).
% \]
% $\Delta_6$ represents the suboptimal manipulation gap.

% For the best manipulation pair $(a_{fm},b_{fm})$, we use an UCB bonus to let the true best manipulation pair pass the constraint test. Noticed that with probability at least $1-\frac{\delta}{ABT}$, for any action $a\neq a_{fm}$,
% \[
% U_{l,t}(a,\twr(a)) \leq \mu_l(a,\twr(a)) < \mu_l(a_{fm},b_{fm}).
% \]
% So when $(a^\prime, b^\prime) = (a_{fm},b_{fm})$, $(a^\prime, b^\prime)$ will not be eliminated under noisy bandit feedbacke setting in Line 3.
% \begin{theorem}[]
% When leader uses $\alpha$EXP3 with parameter $\alpha$, follower uses UCB-FM to play and update for $T$ round, let $T_{f,w}$ represents the total number of rounds that follower did not play the optimal manipulation strategy, with probability at least $1-\delta$,
% \[
% \mathbb{E}\left[T_{f,w}\right] \leq  \mO\left(\frac{ A^2 B }{\alpha \varepsilon^2} \log \frac{ABT}{\delta}\right).
% \]
% \end{theorem}