% !TeX root = ..\freeExp.tex
\section{Proofs}\label{sec:proofs}

\subsection{Preliminary Lemmas}
\begin{lemma}[{\citep[Lemma 3]{wang2020optimal}}]
    \label{lma:emp-mean-event}
    Let $k\in \mathcal{K}$, and \(c>0\). Let \(H\) be a random set of rounds such that for all \(t,\, \{t\in H\}\in \mathcal{F}_{t-1}\).
    Assume that there exists \((C_t)_{t\ge 0}\), a sequence of independent binary random variables such that for any \(t\ge 1\),
    \(C_t\) is \(\mathcal{F}_{t}\)-measurable and \(\P[C_t=1]\ge c\). Further assume for any \(t\in H\), arm \(k\) is selected if \(C_t=1\). Then,
    \[
        \sum_{t\ge 1}\P\left[\{ t\in H, \abs{\hat{\mu}_{t}(k) - \mu(k)} \ge \varepsilon \}\right] \le 2c^{-1}(2c^{-1} + \varepsilon^{-2}).
    \]
\end{lemma}

\begin{lemma}[{\citep[Lemma 6]{combes2015learning}}]
    \label{lma:kl-ucb-event}
    For any arm \(k\), we have \[
        \sum_{t\ge 1}\P[d_t(k) \le \mu(k)] \le 15,
    \]
    where \(d_t(k)\) is the KL-UCB index of arm \(k\) at time slot \(t\).
\end{lemma}

\begin{lemma}
    \label{lma:kl-div-ineq}
    The KL-divergence of two Gaussian random variables with means \(\mu_1, \mu_2\) and the same variance \(\sigma^2\) has the following expression,
    \begin{equation}
        \label{eq:kl-guassian-ineq}
        \kl(\mu_1, \mu_2) = \frac{(\mu_1 - \mu_2)^2}{2\sigma^2}.
    \end{equation}
\end{lemma}

\begin{lemma}[Abel Transformation]\label{lma:abel-trans}
    For two sequences \(\{a_n\}\) and \(\{b_n\}\) with \(n\in\mathbb{N}\), we have
    \[
        \sum_{n=0}^N a_n b_n = \sum_{n=0}^N a_n b_N - \sum_{n=0}^{N-1}\sum_{k=0}^n a_k (b_{n+1} - b_n).
    \]
\end{lemma}
\begin{proof}[Proof of Lemma~\ref{lma:abel-trans}]
    We use induction to prove this lemma.
    When \(N=0\), we have that \(a_0b_0 = a_0b_0\) holds.

    Suppose when \(N=m\,(\in\N)\) the above equation holds. We show when \(N=m+1\) the above equation holds as follows,
    \[
        \begin{split}
            \text{RHS} &= \sum_{n=0}^{m+1} a_n b_{m+1} - \sum_{n=0}^{m}\sum_{k=0}^n a_k (b_{n+1} - b_n)\\
            &=\sum_{n=0}^{m+1} a_n b_{m+1} - \sum_{n=0}^{m-1}\sum_{k=0}^n a_k (b_{n+1} - b_n) - \sum_{k=0}^m a_k b_{m+1} + \sum_{k=0}^m a_k b_{m}\\
            &= a_{m+1} b_{m+1} + \underbrace{\sum_{k=0}^m a_k b_{m} - \sum_{n=0}^{m-1}\sum_{k=0}^n a_k (b_{n+1} - b_n)}_{\text{use the supposition}}\\
            &=\sum_{n=0}^{m+1} a_nb_n\\
            &=\text{LHS}.
        \end{split}
    \]
\end{proof}


\subsection{Proof of Theorem~\ref{thm:regret-lower-bound} (Regret Lower Bound)}

Fix an arm \(k\) whose \(\bar{\Delta}(k)> 0\).
Recall that \(\bar{i}(k)\)
is the agent whose local optimal arm's reward mean is the closest to arm \(k\)'s.
For this agent \(\bar{i}(k)\),
we define \(\theta = (P_{\theta}(1),P_{\theta}(2),\dots, P_{\theta}(K))\) as an instance of
the \(K\) arms' reward distributions %for agent \(i\)
(arm specific reward plus agent specific reward).
Then, we consider another instance
\(\theta'=(P_{\theta'}(1), P_{\theta'}(2), \dots, P_{\theta'}(K))\),
whose distributions are the same as instance \(\theta\)'s,
except that the mean of arm \(k\)'s reward distribution \(P_{\theta'}(k)\)
is increased by \(\lambda\), i.e., \(\E[P_{\theta'}(k)] = \E[P_{\theta}(k)] +\lambda\),
where \(\bar{\Delta}(k) < \lambda < \min_{i\in\mathcal{M}\setminus\{\bar{i}(k)\}} \Delta^\brai(k)\).
Therefore, in instance \(\theta\), the arm \(k\) is suboptimal, while
in instance \(\theta'\), arm \(k\) becomes the optimal arm.
% \rev{in the reward shift setting, both instances \(\theta\)
%     and \(\theta'\) are the same.
%     But in the instance \(\theta'\),
%     the reward shift of agent \(i\) on arm \(k\) is increased by a \(\lambda\).
% }
% We keep all other arms' reward means
% the same as the \(\theta\) instance's.
% Fix a consistent policy \(\pi\).
Denote \(\P_{\theta,\pi}\) as the probability measure of \(T\)-round action-reward
histories induced by the interconnection of policy \(\pi\)
and the environment \(\theta\).
Denote \(\E_{\theta,\pi}[\text{R}_T(k)]\) as
\MATOBHR's regret of pulling the suboptimal arm \(k\)
under instance \(\theta\) and policy \(\pi\),
and \(N_T(k)\) as the total pulling times of
this suboptimal arm \(k\) in \(T\) time slots.
Then, we have \[
    \begin{split}
        &\quad \E_{\theta,\pi}[\text{R}_T(k)] + \E_{\theta',\pi}[\text{R}_T(k)] \\
        &\ge
        \frac{T\bar{\Delta}(k)}{2}\P_{\theta,\pi}\left(
        N_T(k) \ge \frac{T}{2}
        \right) +
        \frac{T(\lambda - \bar{\Delta}(k))}{2}\P_{\theta',\pi}\left(
        N_T(k) < \frac{T}{2}
        \right)\\
        &\ge \frac{T}{2}\min\left\{ \bar{\Delta}(k), \lambda - \bar{\Delta}(k) \right\}
        \left(
        \P_{\theta,\pi}\left(
        N_T(k) \ge \frac{T}{2}
        \right) + \P_{\theta',\pi}\left(
        N_T(k) < \frac{T}{2}
        \right)
        \right)\\
        &\overset{(a)}\ge \frac{T}{2}\min\left\{ \bar{\Delta}(k), \lambda - \bar{\Delta}(k) \right\}\exp\left( -\KL(\P_{\theta,\pi}, \P_{\theta', \pi}) \right)\\
        &\ge \frac{T}{2}\min\left\{ \bar{\Delta}(k), \lambda - \bar{\Delta}(k) \right\}\exp\left( -\E_{\theta,\pi}[N_T(k)]
        \KL(P_{\theta}(k), P_{\theta'}(k)) \right),
    \end{split}
\]
where the inequality (a) is due to the Bretagnolle--Huber inequality~\citep{bretagnolle1978estimation}
and the \(\KL\) represents the KL-divergence between two general probability distributions.

Rearranging the above inequality, we have
\[
    \begin{split}
        \frac{\E_{\theta,\pi}[N_T(k)]}{\log T} \ge
        \frac{1}{\KL(P_{\theta}(k), P_{\theta'}(k))} \left( 1 + \frac{\log \frac{\min\left\{ \bar{\Delta}(k), \lambda - \bar{\Delta}(k) \right\}}{2}}{\log T}
        - \frac{\log (\E_{\theta,\pi}[\text{R}_T(k)] + \E_{\theta',\pi}[\text{R}_T(k)])}{\log T} \right).
    \end{split}
\]

When \(T \to \infty\), the second term inside the bracket on the RHS converges to \(0\)
and the third term becomes arbitrarily small since \(\pi\) is a consistent policy.
So, we have \[
    \liminf_{T\to\infty}\frac{\E_{\theta,\pi}[N_T(k)]}{\log T} \ge\frac{1}{\KL(P_{\theta}(k), P_{\theta'}(k))}.
\]
Notice that the smallest cost of pulling the suboptimal arm once in instance \(\theta\) is \(\bar{\Delta}(k)\). We transform the pull times of arm \(k\) to regret costs of pulling arm \(k\) as follows, \[
    \begin{split}
        \liminf_{T\to\infty}\frac{\E_{\theta,\pi}[\text{R}_T(k)]}{\log T}
        \ge\frac{\bar{\Delta}(k)}{\KL(P_{\theta}(k), P_{\theta'}(k))}
        \overset{(a)}=\frac{\bar{\Delta}(k)}{\kl(\bar{\omega}(k),\bar{\omega}(k)+\bar{\Delta}(k))},
    \end{split}
\]
where \(\kl\) represents the KL-divergence between two Gaussian distributions with the same variance,
and the equation (a) is by choosing the reward distribution as Gaussian and letting \(\lambda \to \bar{\Delta}(k)\)
and the definition of \(\bar{\omega}(k)\) in~\eqref{eq:critial-omega}.
Lastly, we take a summation over all arms \(k\) whose suboptimality gap \(\bar{\Delta}(k) > 0\)
and obtain a regret lower bound as follows,
\[
    \begin{split}
        \liminf_{T\to\infty}\frac{\E_{\theta,\pi}[\RT]}{\log T}
        \ge
        \sum_{k:\bar{\Delta}(k) > 0}\frac{\bar{\Delta}(k)}{\kl(\bar{\omega}(k),\bar{\omega}(k)+\bar{\Delta}(k))}.
    \end{split}
\]



\subsection{Proof of Theorem~\ref{thm:free-exp-upper-bound}
    (Regret Upper Bound)}
% For the generality of this proof, we consider the case that agents' has a partial accessible arm set \(\mathcal{K}^\brai\)
% and denote \(K^\brai = \abs{\mathcal{K}^\brai}\).
% This covers the special case studied in~\cite{yang2022distributed}.
% If \(\mathcal{K}^\brai = \mathcal{K}\) for all agent \(i\),
% then it becomes the general setting.
Fix an agent \(i\).
Recall that \(I_t^\brai\) is the arm index with highest reward empirical mean at time slot \(t\) for agent \(i\),
and \(J_t^\brai\) is the arm that agent \(i\) pulls in time slot \(t\) in \FreeExp.
We first define two sets of time slots as follows,
\[
    \begin{split}
        \mathcal{A}^\brai & \coloneqq \{t\ge 1: I_t^\brai \neq k_*^\brai\},\\
        \mathcal{B}^\brai & \coloneqq\{t\ge 1: \abs{\hat{\omega}_t^\brai(I_t^\brai) - \omega^\brai(I_t^\brai)} \ge \delta\}= \{t\ge 1: \abs{\hat{\mu}_t(I_t^\brai) - \mu(I_t^\brai)} \ge \delta\},
    \end{split}
\]
where \(\mathcal{A}^\brai\) denotes a set of time slots in which the empirical optimal arm \(I_t^\brai\) is not the true optimal arm \(k_*^\brai\); \(\mathcal{B}^\brai\) denotes the set of time slots in which the empirical optimal arm \(I_t^\brai\)'s empirical mean is different from its true reward mean by at least a \(\delta\).


Denote \(\mathcal{T} = \{1,2,\dots, T\}\)
and \(\tilde{\mathcal{T}}\coloneqq \cup_{i\in\mathcal{M}}(\mathcal{A}^\brai \cup \mathcal{B}^\brai)\). We can decompose regret \(\ERT\) as
\begin{equation}\label{eq:decompose-I-II}
    \begin{split}
        \ERT & = \E\left[ \sum_{i\in\mathcal{M}} \text{R}_T^\brai \right]
        =\E\left[ \sum_{i\in\mathcal{M}}\sum_{t\in\mathcal{T}}\sum_{k\in\mathcal{K}^\brai} \Delta^\brai(k)\1{J_t^\brai = k} \right]\\
        &=\E\left[ \sum_{i\in\mathcal{M}}\sum_{t\in\tilde{\mathcal{T}}}\sum_{k\in\mathcal{K}^\brai} \Delta^\brai(k)\1{J_t^\brai = k}
            + \sum_{i\in\mathcal{M}}\sum_{t\in\mathcal{T}\setminus \tilde{\mathcal{T}}}\sum_{k\in\mathcal{K}^\brai} \Delta^\brai(k)\1{J_t^\brai = k} \right]\\
        &\overset{(a)}\le \underbrace{bM\E\left[\abs{\tilde{\mathcal{T}}}\right]}_{(I)}+ \underbrace{\E\left[ \sum_{i\in\mathcal{M}}\sum_{t\in\mathcal{T}\setminus \tilde{\mathcal{T}}}\sum_{k\in\mathcal{K}^\brai\setminus \mathcal{K}^\texttt{fr}} \Delta^\brai(k)\1{J_t^\brai = k}\right]}_{(II)},
    \end{split}
\end{equation}
where inequality (a) is due to that
(1) the first term is scaled by \(\Delta^\brai(k) < b\) for all \(k\in\mathcal{K}^\brai\);
(2) when \(t\in \mathcal{T}\setminus\tilde{\mathcal{T}}\) all free exploration arms (in \(\mathcal{K}^\texttt{fr}\)) are correctly identified, so these arms will not be explored with cost in the second term, and we can replace \(k\in\mathcal{K}^\brai\) with \(k\in\mathcal{K}^\brai\setminus\mathcal{K}^\texttt{fr}\).

Next, we provide Lemmas~\ref{lma:subset-cover} and~\ref{lma:set-card-upper-bounds} to show that the first term \((I)\)
is upper bounded by a constant, and Lemmas~\ref{lma:bound-arm-pulls} and~\ref{lma:bound-arm-regrets} to show
that the second term \((II)\) is upper bounded by a logarithmic term.
% {\color{blue}Xutong: (II) and (I) seems strange to me, maybe we can consider roman numerals or other notations like A, B?}

\paragraph{\bf Bounding term \((I)\)}
We define the following two sets,
\[
    \begin{split}
        \mathcal{C}^\brai & \coloneqq \{t\ge 1: d_t^\brai(k_*^\brai) < \omega^\brai (k_*^\brai)\},\\
        \mathcal{E}^\brai & \coloneqq \{t \in \mathcal{A}^\brai\setminus (\mathcal{B}^\brai\cup \mathcal{C}^\brai), \abs{\hat{\omega}_t^\brai(k_*^\brai) - \omega^\brai(k_*^\brai)} \ge \delta\}\\
        &= \{t \in \mathcal{A}^\brai\setminus (\mathcal{B}^\brai\cup \mathcal{C}^\brai), \abs{\hat{\mu}_t(k_*^\brai) - \mu(k_*^\brai)} \ge \delta\},
    \end{split}
\]
where \(\mathcal{C}^\brai\) denotes the set of time slots in which
the optimal arm \(k_*^\brai\)'s KL-UCB index is smaller than this optimal arm's true reward mean \(\omega^\brai(k_*^\brai)\);
\(\mathcal{E}^\brai\) denotes the subset of time slots of \(\mathcal{A}^\brai\setminus (\mathcal{B}^\brai\cup \mathcal{C}^\brai)\)
and in which the optimal arm \(k_*^\brai\)'s empirical mean is different from its  true reward mean by at least a \(\delta\).

% {\color{blue}Xutong: Is it possible to show Lemma 1, Lemma 2, and the usage of Lemma 1, Lemma 2 to bound (II); Then put the proofs of two lemmas at the end of the above derivation (or in the appendix)?}
Lemma~\ref{lma:subset-cover} shows the set \(\mathcal{A}^\brai \cup \mathcal{B}^\brai\)
can be covered by another set \(\mathcal{B}^\brai\cup \mathcal{C}^\brai \cup \mathcal{E}^\brai\).
Lemma~\ref{lma:set-card-upper-bounds} separately upper bounds the expected set cardinality of \(\mathcal{B}^\brai, \mathcal{C}^\brai, \mathcal{E}^\brai\).
We defer both lemmas' proof to the end of this subsection.

\begin{lemma}
    \label{lma:subset-cover}
    \(\mathcal{A}^\brai\cup \mathcal{B}^\brai\subseteq \mathcal{B}^\brai\cup \mathcal{C}^\brai\cup \mathcal{E}^\brai\). Thus,
    \(\E\left[ \abs{\mathcal{A}^\brai\cup \mathcal{B}^\brai} \right] \le \E\left[ \abs{\mathcal{B}^\brai} \right]
    + \E\left[ \abs{\mathcal{C}^\brai} \right]
    +  \E\left[ \abs{\mathcal{E}^\brai} \right]\).
\end{lemma}


\begin{lemma}\label{lma:set-card-upper-bounds}
    \(\E\left[ \abs{\mathcal{B}^\brai} \right] \le 4K^\brai(4+\delta^{-2}),
    \E\left[ \abs{\mathcal{C}^\brai} \right] \le 15,
    \E\left[ \abs{\mathcal{E}^\brai} \right] \le 4(K^\brai)^2(4K^\brai+\delta^{-2}).\)
\end{lemma}


Given Lemmas~\ref{lma:subset-cover} and~\ref{lma:set-card-upper-bounds}, we have
\(
\E\left[ \abs{\mathcal{A}^\brai\cup\mathcal{B}^\brai} \right]
\le 6(K^\brai)^2 (4K^\brai +\delta^{-2}).
\)
Then, we can upper bound the term \((I)\) as follows,
\begin{equation}\label{eq:bound-I}
    \begin{split}
        (I) &= bM\E\left[\abs{\tilde{\mathcal{T}}}\right]
        = bM\E\left[ \abs*{\cup_{i\in\mathcal{M}}(\mathcal{A}^\brai\cup \mathcal{B}^\brai)} \right]\\
        &\le  bM \sum_{i\in\mathcal{M}} \E\left[ \abs*{\mathcal{A}^\brai\cup \mathcal{B}^\brai} \right]
        \le bM^2 \max_{i\in\mathcal{M}}  \E\left[ \abs*{\mathcal{A}^\brai\cup \mathcal{B}^\brai} \right]\\
        &\le 6bM^2 K^2 (4K +\delta^{-2}).
    \end{split}
\end{equation}


\paragraph{\bf Bounding term \((II)\)}
One key challenge in upper bounding \((II)\) is that the pulls of an arm \(k\) are spread among agents with heterogeneous rewards, and the costs of pulling arm \(k\) can differ among these agents.
So, the common technique of bounding the number of pulls of arm \(k\) in the current bandits literature is not applicable to bounding these agents' total number of pulls of arm \(k\).
To address the challenge, we sort these agents according to their reward gaps \(\Delta^\brai(k)\) of this arm \(k\), bound the number of pulls of arm \(k\) among a sequence of nested agent subsets that gradually include agents with smaller reward gaps \(\Delta^\brai(k)\),
and finally apply an Abel transformation to bound these agents' total regret cost.

Fix an arm \(k\in\mathcal{K}\setminus\mathcal{K}^\texttt{fr}\) that cannot be freely explored.
Denote \(\mathcal{M}(k) \coloneqq
\{i\in \mathcal{M}: k\in\mathcal{K}^\brai\}\)
as the set of agents having access to arm \(k\)
and \(M(k) \coloneqq  \abs{\mathcal{M}(k)}\) is the number of such agents.
We consider an order \(\{i(k;1), i(k;2), \dots,
i(k;M(k))\}\) of those \(M(k)\) agents such that
\(\Delta^{(i(k;1))}(k) \ge \Delta^{(i(k;2))}(k) \ge \dots
\ge \Delta^{(i(k;M(k)))}(k)\).
With this order, we rearrange summations of \((II)\) as follows,
\[
    \begin{split}
        &(II) = \E\Biggl[ \sum_{i\in\mathcal{M}}\sum_{t\in\mathcal{T}\setminus \tilde{\mathcal{T}}}\sum_{k\in\mathcal{K}^\brai\setminus \mathcal{K}^\texttt{fr}} \Delta^\brai(k)\1{J_t^\brai = k}\Biggr]
        = \E\Biggl[ \sum_{i\in\mathcal{M}}\sum_{k\in\mathcal{K}^\brai\setminus\mathcal{K}^\texttt{fr}} \Delta^\brai(k) \sum_{t\in\mathcal{T}\setminus \tilde{\mathcal{T}}}\1{J_t^\brai = k} \Biggr]
        \\
        &\overset{(a)}=  \E\Biggl[\sum_{k\in\mathcal{K}\setminus\mathcal{K}^\texttt{fr}}
        \sum_{m=1}^{M(k)}\Delta^{(i(k;m))}(k)
        \sum_{t\in\mathcal{T}\setminus\tilde{\mathcal{T}}}
        \1{J_t^{(i(k;m))} = k} \Biggr]
        \overset{(b)}=  \E\Biggl[\sum_{k\in\mathcal{K}\setminus\mathcal{K}^\texttt{fr}}
        \sum_{m=1}^{M(k)}\Delta^{(i(k;m))}(k)
        n_T^{(i(k;m))}(k)\Biggr],
    \end{split}
\]
where (a) is because the arm \(k\in\mathcal{K}\setminus\mathcal{K}^\texttt{fr}\) is only pulled by agents
\(\{i(k;1), i(k;2), \dots,
i(k;M(k))\}\),
and  (b) is from a simplified definition \(n_T^{(i(k;m))}(k) \coloneqq \sum_{t\in\mathcal{T}\setminus\tilde{\mathcal{T}}}
\1{J_t^{(i(k;m))} = k}\).



In Lemma~\ref{lma:bound-arm-pulls}, we provide an intermediate result that bounds the number of times of pulling arm \(k\)
by agents \(\{i(k;1), i(k;2), \dots,
i(k;m)\}\) for any \(m \le M(k)\).
Lemma~\ref{lma:bound-arm-regrets}
is derived via an Abel transformation and based on Lemma~\ref{lma:bound-arm-pulls}.
We defer both lemmas' proof to the end of this subsection.


\begin{lemma}
    \label{lma:bound-arm-pulls}
    \(\displaystyle
    \E\left[ \sum_{\ell=1}^m n_T^{(i(k;\ell))} (k) \right]
    \le \frac{\log T + 4\log (\log T)}{\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)} + m(4 + 2\delta^{-2}).
    \)
\end{lemma}


\begin{lemma}\label{lma:bound-arm-regrets}
    \[
        \begin{split}
            \E\left[ \sum_{m=1}^{M(k)} n_T^{(i(k;m))}(k) \Delta^{(i(k;m))}(k)\right]
            \le (4+2\delta^{-2})\sum_{m=1}^{M(k)} \Delta_m+
            {\frac{1}{2(\sigma_1^2 + \sigma_2^2)} } \frac{(\Delta_m-2\delta)(\log T + 4\log(\log T))}{\kl(\bar{\omega}(k)+\delta, \bar{\omega}(k)+\bar{\Delta}(k)-\delta)}.
        \end{split}
    \]
\end{lemma}


In Lemma~\ref{lma:bound-arm-regrets},
we bound the regret cost of pulling arm \(k\in\mathcal{K}\setminus\mathcal{K}^\texttt{fr}\) when \(t\in\mathcal{T}\setminus\tilde{\mathcal{T}}\) by agents \(\{i(k;1), i(k;2), \dots, i(k;M(k))\}\).
Given Lemma~\ref{lma:bound-arm-regrets}, we can upper bound the term \((II)\) as follows,
\begin{equation}\label{eq:bound-II}
    \begin{split}
        &(II) =  \sum_{k\in\mathcal{K}\setminus\mathcal{K}^\texttt{fr}}\E\left[
        \sum_{m=1}^{M(k)}\Delta^{(i(k;m))}(k)
        n_T^{(i(k;m))}(k)\right] \\
        &\le
        \sum_{k:\bar{\Delta}(k) > 0}
        \frac{1}{2(\sigma_1^2 + \sigma_2^2)} \frac{(\Delta_m-2\delta)(\log T + 4\log(\log T))}{\kl(\mu(k)+\delta, \mu(k)+\bar{\Delta}(k)-\delta)} + (4+2\delta^{-2})\sum_{k:\bar{\Delta}(k) > 0}\sum_{m=1}^{M(k)} \Delta_m \\
        &\le
        \frac{1}{2(\sigma_1^2 + \sigma_2^2)}
        \sum_{k:\bar{\Delta}(k) > 0}
        \frac{(\Delta_m-2\delta)(\log T + 4\log(\log T))}{\kl(\mu(k)+\delta, \mu(k)+\bar{\Delta}(k)-\delta)}+bMK(4+2\delta^{-2}).
    \end{split}
\end{equation}


Finally, we obtain the regret bound by substituting~\eqref{eq:bound-I} and~\eqref{eq:bound-II} into~\eqref{eq:decompose-I-II} as follows,
\begin{equation}
    \label{eq:finite-regret-upper-bound}
    \begin{split}
        \ERT \le
        \frac{1}{2(\sigma_1^2 + \sigma_2^2)}
        \sum_{k:\bar{\Delta}(k) > 0}
        \frac{(\Delta_m-2\delta)(\log T + 4\log(\log T))}{\kl(\mu(k)+\delta, \mu(k)+\bar{\Delta}(k)-\delta)}
        +7bM^2 K^2(4K + \delta^{-2})
    \end{split}
\end{equation}

% The factor \(
% \frac{1}{2(\sigma_1^2 + \sigma_2^2)}
% \)
% can be upper bounded by a constant given the assumption that
% all arms' reward means are in \((a,b)\), where \(0<a< b<1\).

Letting \(T\to \infty, \delta \to 0\), we obtain the asymptotic regret upper bound.
\emph{The main proof of Theorem~\ref{thm:free-exp-upper-bound}
    is finished.}

In the rest of this section, we present the detailed proofs of Lemmas~\ref{lma:subset-cover}--\ref{lma:bound-arm-regrets}.


\begin{proof}[Proof of Lemma~\ref{lma:subset-cover}]
    Let \(t \in\mathcal{A}^\brai \setminus (\mathcal{B}^\brai \cup \mathcal{C}^\brai)\). We only need to show that this condition leads to \(t \in \mathcal{E}^\brai\).
    From the condition, we have
    \[
        \hat{\omega}^\brai_t(k_*^\brai) \overset{(a)}\le \hat{\omega}^\brai_t(I_t^\brai)
        \overset{(b)}\le \omega^\brai(I_t^\brai) + \delta
        \overset{(c)}\le \omega^\brai(k_*^\brai) - \delta,
    \]
    where the inequality (a) is due to the definition of \(I_t^\brai\) as the arm with the highest empirical mean,
    inequality (b) is due to \(t\not\in \mathcal{B}^\brai\),
    and inequality (c) is due to \(t\in\mathcal{A}^\brai\) (i.e., \(I_t^\brai \neq k_*^\brai\)) and the definition of  \( \delta <
    \frac{1}{4}{\min_{i\in\mathcal{M}, k_1\neq k_2\in \mathcal{K}}
        \abs{\omega^\brai (k_1) - \omega^\brai (k_2)}}\).

    Therefore, we have \(\mu(k_*^\brai) - \hat{\mu}_t(k_*^\brai) = \omega^\brai(k_*^\brai) - \hat{\omega}^\brai_t(k_*^\brai) \ge \delta\). That is, \(t\in\mathcal{E}^\brai\).
\end{proof}

\begin{proof}[Proof of Lemma~\ref{lma:set-card-upper-bounds}]

    To show \(\E\left[ \abs{\mathcal{B}^\brai} \right] \le 4K^\brai(4+\delta^{-2})\):
    for any arm \(k\in\mathcal{K}^\brai\), denote \(\mathcal{B}^\brai(k)\coloneqq \{t\ge 1: I_t^\brai=k, \abs{\hat{\mu}_t(k) - \mu(k)}\ge \delta\}\).
    Then, applying Lemma~\ref{lma:emp-mean-event} (in Appendix)
    via letting \(H = \{t\ge 1: I_t^\brai = k\}, C_t = \1{J_t^\brai = k}\)
    and \(\P(C_t = 1\vert H)\ge \frac{1}{2}\) (because the agent has a probability of \(1/2\) to pull the empirical optimal arm \(I_t^\brai\)),
    we have \(
    \sum_{t\ge 1}\P(t\in\mathcal{B}^\brai(k)) \le 4(4+\delta^{-2})
    \),
    that is, \(\E\left[ \abs{\mathcal{B}^\brai(k)} \right] \le 4(4+\delta^{-2})\).
    Applying union bound over all arms in \(\mathcal{K}^\brai\),
    we obtain \(\E\left[ \abs{\mathcal{B}^\brai} \right] \le 4K^\brai(4+\delta^{-2})\).

    To show \(\E\left[ \abs{\mathcal{C}^\brai} \right] \le 15\):
    this is from KL-UCB's property in Lemma~\ref{lma:kl-ucb-event} in Appendix.

    To show \(\E\left[ \abs{\mathcal{E}^\brai} \right] \le 4(K^\brai)^2(4K^\brai+\delta^{-2})\):
    for any arm \(k\in\mathcal{K}^\brai\),
    denote \(\mathcal{E}^\brai(k)\coloneqq \{t\ge 1:
    t\in\mathcal{A}^\brai\setminus(\mathcal{B}^\brai\cup \mathcal{C}^\brai),
    I_t^\brai=k, \abs{\hat{\mu}_t(k_*^\brai) - \mu(k_*^\brai)}\ge \delta\}\).
    Applying Lemma~\ref{lma:emp-mean-event} via
    letting \(H = \{t\ge 1:
    t\in\mathcal{A}^\brai\setminus(\mathcal{B}^\brai\cup \mathcal{C}^\brai),
    I_t^\brai = k\},
    C_t = \1{J_t^\brai = k_*^\brai}\)
    and thus \(\P(C_t = 1\vert H)\ge \frac{1}{2K^\brai}\) (which will be proven next),
    we then have \(
    \E\left[ \abs{\mathcal{E}^\brai(k)} \right] \le 4K^\brai(4K^\brai +\delta^{-2}).
    \)
    Then, with union bound over all arms in \(\mathcal{K}^\brai\), we derive the result.

    We now show \(\P(C_t = 1\vert H)\ge \frac{1}{2K^\brai}\).
    From the choice of \(H\), we have
    \[
        d_t^\brai(k_*^\brai) \overset{(a)}\ge \omega^\brai(k_*^\brai)
        \overset{(b)}\ge \omega^\brai(I_t^\brai) + \delta
        \overset{(c)}\ge \hat{\omega}^\brai_t(I_t^\brai),
    \]
    where inequality (a) is due to \(t\not\in \mathcal{C}^\brai\),
    inequality (b) is due to \(t\in \mathcal{A}^\brai\) and the definition of \(\delta\),
    and inequality (c) is due to \(t\not\in \mathcal{B}^\brai\).
    Therefore, \(d_t^\brai(k_*^\brai) > \hat{\omega}^\brai_t(I_t^\brai)\),
    which implies that the agent may explore this arm
    with a probability of at least \({1}/{2K^\brai}\)
    (cf. Algorithm~\ref{alg:free-exp}'s line~\ref{line:pull-arm-end}
    and if this arm is removed from \(\mathcal{D}^\brai_t\),
    then some other agents pull this arm
    with a probability of at least \(1/2\)).
\end{proof}



\begin{proof}[Proof of Lemma~\ref{lma:bound-arm-pulls}]
    Fix an arm \(k\in\mathcal{K}\setminus \mathcal{K}^\texttt{fr}\) and an integer \(m\in\{1,2,\dots, M(k)\}\).
    Denote a set of agent-time pairs \((i,t)\) as follows,
    \[
        \mathcal{G}(k;m)\coloneqq
        \{(i,t): i\in\{i(k;1),\dots,i(k;m)\},
        t\le T, t\not\in \cup_{\ell=1}^m(\mathcal{A}^{(i(k;\ell))} \cup \mathcal{B}^{(i(k;\ell))}),
        J_t^\brai = k\}.
    \]

    Let
    \(n_0 \coloneqq
    {(\log T + 4 \log (\log T))} /
    {\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)}\),
    and denote
    \(c_t \coloneqq \sum_{s=1}^t \sum_{\ell=1}^m \1{(i(k;\ell), s)\in\mathcal{G}(k;m)}\)
    as the number of agent-time pairs in \(\mathcal{G}(k;m)\)
    up to time slot \(t\).
    Define two subsets of \(\mathcal{G}(k;m)\) as follows,
    \[
        \begin{split}
            \mathcal{G}_1(k;m) &= \{(i,t)\in\mathcal{G}(k;m): \abs{\hat{\mu}_t(k) - \mu(k)} \ge \delta\},\\
            \mathcal{G}_2(k;m) &= \{(i,t)\in\mathcal{G}(k;m): c_t < n_0\}.
        \end{split}
    \]


    We first show that \(\mathcal{G}(k;m)\subseteq \mathcal{G}_1(k;m)\cup \mathcal{G}_2(k;m)\). Let \(t \in \mathcal{G}(k;m)\setminus (\mathcal{G}_1(k;m)\cup \mathcal{G}_2(k;m))\),
    from which we have \[
        \begin{split}
            &\qquad d_t^{(i(k;m))}(k)
            \overset{(a)}\ge \hat{\omega}^{(i(k;m))}_t(I_t^{(i(k;m))})
            \overset{(b)}{=} \hat{\omega}^{(i(k;m))}_t(k_*^{(i(k;m))})\\
            &\overset{(c)}{>} \omega^{(i(k;m))}(k_*^{(i(k;m))}) - \delta
            \overset{(d)}{>} \omega^{(i(k;m))}(k) + \delta
            \overset{(e)}{>} \hat{\omega}^{(i(k;m))}_t(k),
        \end{split}
    \]
    where the inequality (a) is due to the definition of KL-UCB \(d_t^\brai(k)\),
    the equation (b) is due to \(t\not \in \mathcal{A}^\brai\),
    the inequality (c) is due to \(t\not \in \mathcal{B}^\brai\),
    the inequality (d) is due to the definition of \(\delta\),
    the inequality (e) is due to \(t\not \in \mathcal{G}_1(k;m)\).
    Recall that \(n_t(k)\) is the number of times that arm \(k\) has been pulled up to time \(t\).
    Then, we have \[
        \begin{split}
            n_0\kl(\hat{\omega}^{(i(k;m))}_t(k), \omega^{(i(k;m))}(k_*^{(i(k;m))}) - \delta)
            &\overset{(a)}\le n_t(k)\kl(\hat{\omega}^{(i(k;m))}_t(k), \omega^{(i(k;m))}(k_*^{(i(k;m))}) - \delta)\\
            \overset{(b)}\le n_t(k)\kl(\hat{\omega}^{(i(k;m))}_t(k), d_t^{(i(k;m))}(k))
            &\overset{(c)}\le \log T + 4\log(\log T),
        \end{split}
    \]
    where the inequality (a) is due to \(t\not\in \mathcal{G}_2(k;m)\)
    and thus \(n_0\le c_t \le n_t(k)\),
    the inequality (b) is due to that \(\kl(x,y)\) is increasing for \(y\) when \(0<x<y<1\),
    the inequality (c) is due to the definition of \(d_t^{(i(k;m))}(k)\).

    We then substitute \(n_0\) into the above inequality and obtain \(
    \kl(\hat{\omega}^{(i(k;m))}_t(k), \omega^{(i(k;m))}(k_*^{(i(k;m))}) - \delta)
    \le \kl(\omega^{(i(k;m))}(k)+ \delta, \omega^{(i(k;m))}(k_*^{(i(k;m))}) - \delta).
    \)
    Noticing \(\kl(x,y)\) is decreasing for \(x\) when \(0<x<y<1\),
    the inequality further leads to
    \(\hat{\omega}^{(i(k;m))}_t(k) \ge \omega^{(i(k;m))}(k)+ \delta\),
    which contradicts \(t\not \in \mathcal{G}_1(k;m)\).
    From this contradiction, we have \(\mathcal{G}(k;m)
    \subseteq \mathcal{G}_1(k;m)\cup \mathcal{G}_2(k;m)\).

    Next, we upper bound \(\E\left[ \abs{\mathcal{G}_1(k;m)} \right]\)
    and \(\E\left[ \abs{\mathcal{G}_2(k;m)} \right]\).
    To bound \(\E\left[ \abs{\mathcal{G}_1(k;m)} \right]\), we apply Lemma~\ref{lma:emp-mean-event} for a fixed agent \(i\),
    let \(H = \{(j,t)\in \mathcal{G}_1(k;m): j=i\}, C_t = c = 1\),
    and obtain \(\E\left[
        \abs{\mathcal{G}_1(k;m)}\vert \text{agent }i
        \right]\le 4+2\delta^{-2}\).
    Summing up over all agents in \(\{i(k;1),\dots, i(k;m)\}\),
    we have \(\E\left[
        \abs{\mathcal{G}_1(k;m)}
        \right]\le m(4+2\delta^{-2})\).

    We bound \(\E\left[\abs{\mathcal{G}_2(k;m)} \right]\),
    via its definition as follows,
    \[
        \E\left[\abs{\mathcal{G}_2(k;m)} \right] \le n_0 =
        \frac{\log T + 4 \log (\log T)}
        {\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)}.
    \]
    Combining both bounds together, we have, for any \(m\le M(k)\) and arm \(k\),
    \[
        \E\left[ \abs{\mathcal{G}(k;m)} \right]
        \le \frac{\log T + 4\log (\log T)}
        {\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)}
        + m(4 + 2\delta^{-2}).
    \]
    Denote \(n_t^\brai(k)\) as the number of times that agent \(i\)
    pulls arm \(k\) in time slots \(\{s\le t: s \not\in\mathcal{A}^\brai\cup\mathcal{B}^\brai\}\).
    The above inequality can be rewritten as \[
        \E\left[ \sum_{\ell=1}^m n_T^{(i(k;\ell))} (k) \right]
        \le \frac{\log T + 4\log (\log T)}{\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)} + m(4 + 2\delta^{-2}).
    \]
\end{proof}




\begin{proof}[Proof of Lemma~\ref{lma:bound-arm-regrets}]

    For the fixed arm \(k\), we first simplify the notations by using
    \(n_m = n_T^{(i(k;m))}(k)\) and \(\Delta_m = \Delta^{(i(k;m))}(k)\) as follows,
    \[
        \E\left[ \sum_{m=1}^{M(k)} n_T^{(i(k;m))}(k) \Delta^{(i(k;m))}(k)\right]
        =  \sum_{m=1}^{M(k)} \E\left[n_T^{{(i(k;m))}}(k)\right] \Delta^{(i(k;m))}(k)
        =  \sum_{m=1}^{M(k)} \E\left[n_m\right]  \Delta_m.
    \]
    To simplify the result of Lemma~\ref{lma:bound-arm-pulls} as well, we denote \(
    A_m \coloneqq \frac{\log T + 4\log (\log T)}{\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)}
    \) and \(
    B_m \coloneqq m(4 + 2\delta^{-2})
    \). Then, Lemma~\ref{lma:bound-arm-pulls} becomes \(\sum_{\ell=1}^m\E[n_\ell]\le A_m + B_m\) for all integers \(m \le M(k)\).

    Next, we rewrite the summation and upper bound it as follows,
    \[
        \begin{split}
            \sum_{m=1}^{M(k)} \E\left[n_m\right]  \Delta_m
            &=(A_1+B_1)\Delta_1 + \underbrace{((A_1 + B_1) - \E[n_1])}_{>0, \text{ Lemma~\ref{lma:bound-arm-pulls}}}\underbrace{(\Delta_2 - \Delta_1)}_{<0} + ((A_2 + B_2) - (A_1 + B_1))\Delta_2 \\
            &\quad + \underbrace{((A_2 + B_2) - (\E[n_1] + \E[n_2]))}_{>0, \text{ Lemma~\ref{lma:bound-arm-pulls}}}\underbrace{(\Delta_3 - \Delta_2)}_{<0}+ ((A_3 + B_3) - (A_2 + B_2))\Delta_3 \\
            &\quad + \quad \vdots\\
            &\quad + \underbrace{\left( (A_{M(k)-1} + B_{M(k) - 1}) - \sum_{m=1}^{M(k) - 1} \E[n_m] \right)}_{>0, \text{ Lemma~\ref{lma:bound-arm-pulls}}}\underbrace{(\Delta_{M(k)} - \Delta_{M(k) - 1})}_{<0} \\
            &\quad + \Biggl( \underbrace{\sum_{m=1}^{M(k)} \E[n_m]}_{< A_{M(k)} + B_{M(k)}} - (A_{M(k)-1} + B_{M(k) - 1}) \Biggr)\Delta_{M(k)}\\
            &\le (A_1 + B_1)\Delta_1 + \sum_{m=1}^{M(k) - 1}((A_{m+1} + B_{m+1}) - (A_m + B_m) )\Delta_{m+1}\\
            &= A_1 \Delta_1 + \sum_{m=1}^{M(k) - 1} (A_{m+1} - A_m)\Delta_{m+1} + B_1\sum_{m=1}^{M(k)} \Delta_m.
        \end{split}
    \]

    One can bound \(\Delta_m\) as follows,
    \[
        \begin{split}
            \Delta_m
            &\overset{(a)}\le 2(\Delta_m - 2\delta)
            = 2((\omega^{(i(k;m))}(k_*^{(i(k;m))}) - \delta) - (\omega^{(i(k;m))}(k) + \delta))\\
            % &\le \sqrt{2}\cdot \sqrt{\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)}
            % \eqqcolon \sqrt{2} x_m,
            &\overset{(b)}= \sqrt{8(\sigma_1^2 + \sigma_2^2)\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)}
            \eqqcolon  x_m,
        \end{split}
    \]
    where the inequality (a) is due to \(\delta\)'s definition,
    the equation (b) is due to the property of the KL-divergence
    in Lemma~\ref{lma:kl-div-ineq}, and we define
    \(x_m \coloneqq
    \sqrt{8(\sigma_1^2 + \sigma_2^2)\kl(\omega^{(i(k;m))}(k)+\delta, \omega^{(i(k;m))}(k_*^{(i(k;m))})-\delta)}\)
    for simplicity.
    Since \(\Delta_m\) is non-increasing in \(m\), we have \(A_{m+1} \ge A_m\), so we can substitute \(\Delta_m \le x_m\) into the above inequality
    and further scale it up as follows,
    \[
        \begin{split}
            &\quad A_1 \Delta_1 + \sum_{m=1}^{M(k) - 1} (A_{m+1} - A_m)\Delta_{m+1} + B_1\sum_{m=1}^{M(k)} \Delta_m
            \le A_1 x_1 + \sum_{m=1}^{M(k) - 1} (A_{m+1} - A_m)x_{m+1} + B_1\sum_{m=1}^{M(k)} \Delta_m\\
            &\overset{(a)}=\sum_{m=1}^{M(k) - 1}A_m(x_m - x_{m+1}) + A_{M(k)}x_{M(k)}
            + B_1\sum_{m=1}^{M(k)} \Delta_m\\
            &=8(\sigma_1^2 + \sigma_2^2)
            \left( \sum_{m=1}^{M(k) - 1}\frac{\log T + 4\log (\log T)}{x_m^2}(x_m - x_{m+1}) + \frac{\log T + 4\log (\log T)}{x_{M(k)}^2}x_{M(k)} \right)
            + (4+2\delta^{-2})\sum_{m=1}^{M(k)} \Delta_m\\
            &\le 8(\sigma_1^2 + \sigma_2^2)(\log T + 4\log (\log T))
            \left( \int_{x_{M(k)}}^{x_1}\frac{1}{x^2}dx
            + \frac{1}{x_{M(k)}} \right)
            + (4+2\delta^{-2})\sum_{m=1}^{M(k)} \Delta_m\\
            &\le 16(\sigma_1^2 + \sigma_2^2)\frac{\log T + 4\log (\log T)}{x_{M(k)}}
            + (4+2\delta^{-2})\sum_{m=1}^{M(k)} \Delta_m\\
            &\overset{(b)}= \frac{16(\sigma_1^2 + \sigma_2^2)}{\sqrt{8(\sigma_1^2+\sigma_2^2)}}\frac{\sqrt{\kl(\bar{\omega}(k)+\delta, \bar{\omega}(k)+\bar{\Delta}(k)-\delta)}(\log T + 4\log (\log T))}{\kl(\bar{\omega}(k)+\delta, \bar{\omega}(k)+\bar{\Delta}(k)-\delta)}
            + (4+2\delta^{-2})\sum_{m=1}^{M(k)} \Delta_m\\
            &\overset{(c)}= \frac{4(\Delta_{M(k)}-2\delta)(\log T + 4\log(\log T))}{\kl(\bar{\omega}(k)+\delta, \bar{\omega}(k)+\bar{\Delta}(k)-\delta)}
            + (4+2\delta^{-2})\sum_{m=1}^{M(k)} \Delta_m,
        \end{split}
    \]
    where equation (a) applies the Abel transformation in Lemma~\ref{lma:abel-trans} in Appendix,
    equation (b) is by
    \(\omega^{(i(k;M(k)))}(k_*^{(i(k;M(k)))}) = \bar{\omega}(k) +\bar{\Delta}(k)\)
    where \(\bar{\omega}(k) = \omega^{(\bar{i}(k))}(k)\) and
    \(\bar{i}(k) = i(k;M(k))\),
    and equation (c) is due to the KL-divergence's property in Lemma~\ref{lma:kl-div-ineq}.
    The above inequality upper bounds the regret cost paying for the suboptimal arm \(k\).
\end{proof}