% !TeX root = ..\freeExp.tex
\section{Theoretical Results}\label{sec:analysis}

We present our theoretical results and discuss their significance in this section.
The rigorous proofs of these results are deferred to Appendix~\ref{sec:proofs}.
We first derive a regret lower bound in Theorem~\ref{thm:regret-lower-bound}
which reflects the impact of free exploration.

\begin{theorem}[Regret lower bound]
    \label{thm:regret-lower-bound}
    For any consistent policy \(\pi\) (i.e., for any bandit instance \(\bm{\nu}\) and any \(\alpha > 0\), the policy \(\pi\) always guarantees \(\E_{\bm{\nu},\pi}[\text{\normalfont R}_T] = O(T^\alpha)\)),
    the regret cost of addressing the \emph{\MATOBHR} model in \(T\) time slots
    is lower bounded by
    % \begin{equation}
    %     
    %     \liminf_{T\to\infty} \frac{\ERT}{\log T}
    %     \ge
    %     \sum_{k: \bar{\Delta}(k)>0}
    %     \frac{\bar{\Delta}(k)}{\kl(\mu(k),\mu(k)+\bar{\Delta}(k))}.
    % \end{equation}
    % \rev{
    \begin{equation}\label{eq:regret-lower-bound}
        \liminf_{T\to\infty} \frac{\ERT}{\log T}
        \ge \! \! \! \!
        \sum_{k: \bar{\Delta}(k)>0}  \! \!
        \frac{\bar{\Delta}(k)}{\kl(\bar{\omega}(k),\bar{\omega}(k)+\bar{\Delta}(k))},
    \end{equation}
    where \(\bar{\Delta}(k)\) defined in~\eqref{eq:gap} is the smallest reward gap of pulling arm \(k\)
    and \(\bar{\omega}(k)\) defined in~\eqref{eq:critial-omega} is the reward mean of pulling arm \(k\) by the agent who enjoys the smallest gap.
\end{theorem}

The proof of Theorem~\ref{thm:regret-lower-bound} leverages techniques similar to those used for classic stochastic bandits~\citep{lai1985asymptotically}.
Since \(\bar{\Delta}(k)=0\) for all free arms \(k\in\mathcal{K}^\texttt{fr}\) and \emph{vice versa}, the regret lower bound can be rewritten as
\begin{equation}\label{eq:regret-lower-bound-alter}
    \liminf_{T\to\infty} \frac{\ERT}{\log T}
    \ge  \! \!  \! \!
    \sum_{k\in\mathcal{K}\setminus\mathcal{K}^\texttt{fr}}  \! \!
    \frac{\bar{\Delta}(k)}{\kl(\bar{\omega}(k),\bar{\omega}(k)+\bar{\Delta}(k))}.
\end{equation}


\begin{remark}[Free arms have no contribution to the asymptotic regret lower bound]
    \label{rmk:lower-bound-free-exploration-observation}
    Free arms in \(\mathcal{K}^\texttt{fr}\)
    contribute at most sub-logarithmic costs to the regret lower bound.
    In fact, given our finite regret upper bound of \FreeExp next,
    free arms only contribute finite regret.
\end{remark}


% \begin{remark}[Comparison to~\cite{yang2022distributed}'s regret lower bound]
%     Authors in~\citep[Theorem 1]{yang2022distributed} proposed a regret lower bound for~\MATOBHR as follows,
%     \[
%         \liminf_{T\to\infty} \frac{\E[\RT]}{\log T}
%         \ge
%         \sum_{k\in \cup_{i\in\mathcal{M}} (\mathcal{K}^\brai\setminus \{k_*^\brai\})}
%         \frac{\bar{\Delta}(k)}{\kl(\mu(k),\mu(k)+\bar{\Delta}(k))}.
%     \]

%     Note that \(\mathcal{K}=\cup_{i\in\mathcal{M}}\mathcal{K}^\brai\) and 
%     \(\mathcal{K}^\texttt{fr}=\cup_{i\in\mathcal{M}} \{k_*^\brai\}\). 
%     So, we have \(\mathcal{K}\setminus\mathcal{K}^\texttt{fr}\subset \cup_{i\in\mathcal{M}} (\mathcal{K}^\brai\setminus \{k_*^\brai\})\).
%     For example, if an arm \(k\in\mathcal{K}^\texttt{fr}\) is also a suboptimal arm for another agent, 
%     then \(k\in \cup_{i\in\mathcal{M}} (\mathcal{K}^\brai\setminus \{k_*^\brai\})\)
%     and the arm \(k\) 
%     The summation range \(\mathcal{K}\setminus\mathcal{E}\)
%     of \cite{yang2022distributed}'s lower bound
%     may contain arms that can be freely explored (in \(\mathcal{K}^\texttt{fr}\)).
%     Therefore, their regret lower bound cannot reflex the true difficulty of~\MATOBHR.
%     To show \MATOBHR's true difficulty is neatly depicted by
%     our regret lower bound in~\eqref{eq:regret-lower-bound},
%     we devise the \FreeExp algorithm whose regret upper bound
%     tightly match our lower bound up to a constant factor in Sections~\ref{sec:algorithm} and~\ref{sec:regret-upper-bound}.
% \end{remark}




\begin{theorem}[Regret upper bound for \FreeExp (Algorithm~\ref{alg:free-exp})]\label{thm:free-exp-upper-bound}
    The \FreeExp algorithm's regret is upper bounded as follows,
    \begin{equation}\label{eq:finite-time-regret-upper-bound}
        \begin{split}
            &\ERT \le 7bM^2 K^2(4K + \delta^{-2})\\
            & +\sum_{k:\bar{\Delta}(k) > 0}
            \frac{4(\bar{\Delta}(k)-2\delta)(\log T + 4\log(\log T))}{\kl(\bar{\omega}(k)+\delta, \bar{\omega}(k)+\bar{\Delta}(k)-\delta)},
        \end{split}
    \end{equation}
    where \(0 < \delta <
    \frac{1}{4}{\min_{i\in\mathcal{M}, k_1\neq k_2\in \mathcal{K}}
        \abs{\omega^\brai (k_1) - \omega^\brai (k_2)}}\),
    \(\sigma_1^2\) and \(\sigma_2^2\) are the variances of the arm- and agent-specific Gaussian rewards, respectively,
    and \(b\) is an upper bound of arm-specific reward mean \(\mu(k)\) for all \(k\in\mathcal{K}\).\footnote{One can also obtain a near-optimal regret upper bound if the arm- and agent-specific rewards follow Bernoulli distributions.}

    If we let \(T\to \infty\) and \(\delta \to 0\)
    (e.g., \(\delta = (\log(\log T))^{-1}\)),
    the above finite-time regret upper bound has the following asymptotic form,
    \begin{equation}
        \label{eq:asymptotical-regret-upper-bound}
        \begin{split}
            \limsup_{T\to \infty}
            \frac{\ERT}{\log T} &\le O\left( \sum_{k: \bar{\Delta}(k) > 0}
            \frac{\bar{\Delta}(k)}{\kl(\bar{\omega}(k), \bar{\omega}(k)+\bar{\Delta}(k))}  \right).
        \end{split}
    \end{equation}
\end{theorem}


\noindent\textbf{Proof sketch and technical challenges.}
The proof of the regret upper bound in Theorem~\ref{thm:free-exp-upper-bound} consists of two steps: (i) bounding the regret cost of pulling free arms in \(\mathcal{K}^\texttt{fr}\),
and (ii) bounding the regret cost of pulling the other arms outside \(\mathcal{K}^\texttt{fr}\).
To bound (i), notice that for any free arm \(k\) in \(\mathcal{K}^\texttt{fr}\), there exist ``corresponding'' agent(s) that take arm \(k\) as their local optimal arm and can explore it at no cost. Hence, we only need to count the number of times that arm \(k\) is pulled by agents other than the ``corresponding'' one(s), which only happens when a ``corresponding'' agent's empirical optimal arm \(I_t^\brai\) is not its true local optimal arm \(k_*^\brai\). Such events occur only a finite number of times, even for a very large value of \(T\).
% Finite (independent of time) is the number of times of such events happening, as we prove.
The proof of (i) follows a logical flow similar to that of~\citep[Theorem~1]{wang2020distributed}. To prove (ii), however, we need to develop new techniques for addressing the heterogeneous rewards in \MATOBHR.
Note that in \MATOBHR the suboptimality reward gaps of pulling the same arm depend on the agents and thus are different.
Hence, one cannot bound the cost of pulling a suboptimal arm \(k\) by multiplying the number of times the suboptimal arm \(k\) is pulled by a single suboptimality reward gap, as is usually done in the bandit literature.
To address this challenge, we introduce two new techniques. First, we count the number of suboptimal arm pulls separately for each agent (see Lemma~\ref{lma:bound-arm-pulls} and its proof), and second, we apply an Abel transformation to sum up the regret costs of all agents for pulling arm \(k\), ordered by the magnitude of the arm's reward gaps \(\Delta^\brai(k)\) for these agents (see Lemma~\ref{lma:bound-arm-regrets} and its proof).



Similar to the alternative expression of the regret lower bound in~\eqref{eq:regret-lower-bound-alter},
this regret upper bound's summation range can also be expressed according to the free arms,
\begin{equation}\label{eq:regret-upper-bound-free-arm-set}
    \begin{split}
        &\quad \limsup_{T\to \infty}
        \frac{\ERT}{\log T}  \\
        &\le   O  \left(  \sum_{k\in\mathcal{K}\setminus\mathcal{K}^\texttt{fr}}
        {\bar{\Delta}(k)}/{\kl(\bar{\omega}(k), \bar{\omega}(k)+\bar{\Delta}(k))}  \right) .
    \end{split}
\end{equation}
% {\color{blue}Xutong: How about let $\bar{\mu}(k,\delta)=\mu(k)+\bar{\Delta}(k)-\delta$ to shrink the size of above inequality.}

\begin{remark}[Regret optimality of the \FreeExp algorithm]
    This regret upper bound in~\eqref{eq:asymptotical-regret-upper-bound}
    matches the regret lower bound in~\eqref{eq:regret-lower-bound}
    up to a constant factor, which implies that both bounds are nearly tight, and therefore the \FreeExp algorithm is near-optimal.
\end{remark}

\begin{remark}[Comparison to~\cite{yang2022distributed}'s regret bounds]
    \citet{yang2022distributed} proposed algorithms achieving
    regret upper bounds~\citep[Theorems 2 and 4]{yang2022distributed} for \texttt{AC-MA2B} as follows
    (adapted to our notations),\footnote{
        To express~\cite{yang2022distributed}'s result, we abuse the notation \(\bar{\Delta}(k)\) \emph{once},
        where \(\bar{\Delta}(k) \coloneqq \min_{i\in\mathcal{M}\setminus\mathcal{M}_*(k)} \Delta^\brai(k)\)---the smallest reward mean gap of arm \(k\) compared to the local optimal arms \emph{(excluding arm \(k\))} among agents having access to \(k\).
        The difference between this definition and the original one in~\eqref{eq:gap} is that for arm \(k\) in \(\mathcal{K}^\texttt{fr}\) this  \(\bar{\Delta}(k)\) is positive while the original one is zero.
    }
    \[
        \begin{split}
            &\limsup_{T\to \infty}\frac{\ERT}{\log T}\\
            &\qquad \le O\left(
            \sum_{k\in \cup_{i\in\mathcal{M}} (\mathcal{K}^\brai\setminus \{k_*^\brai\})}
            \frac{\bar{\Delta}(k)}{\kl(\bar{\omega}(k), \bar{\omega}(k)+\bar{\Delta}(k))} \right).
        \end{split}
    \]
    Note that \(\mathcal{K}=\cup_{i\in\mathcal{M}}\mathcal{K}^\brai\) and
    \(\mathcal{K}^\texttt{fr}=\cup_{i\in\mathcal{M}} \{k_*^\brai\}\).
    So, we have \(\mathcal{K}\setminus\mathcal{K}^\texttt{fr}\subset \cup_{i\in\mathcal{M}} (\mathcal{K}^\brai\setminus \{k_*^\brai\})\).
    For example, if an arm \(k\in\mathcal{K}^\texttt{fr}\) is also a suboptimal arm for another agent,
    then \(k\in \cup_{i\in\mathcal{M}} (\mathcal{K}^\brai\setminus \{k_*^\brai\})\) but \(k\not\in \mathcal{K}\setminus \mathcal{K}^\texttt{fr}\).
    In other words, the arm \(k\) contributes logarithmic regret costs to their upper bound but only contributes finite costs in ours.
    Therefore, their regret upper bound \emph{failed to capture the advantage of free exploration}
    and their algorithms did not utilize this appealing mechanism.
\end{remark}


\begin{remark}[Special cases with \(O(1)\) finite regret in \MATOBHR]
    The regret upper bound in~\eqref{eq:regret-upper-bound-free-arm-set} echoes Remark~\ref{rmk:lower-bound-free-exploration-observation} on the regret lower bound in that
    arms in \(\mathcal{K}^\texttt{fr}\)
    only cause finite \(O(1)\) costs in regret.
    Therefore, if every arm is locally optimal for some agent, i.e., \(\mathcal{K}\setminus\mathcal{K}^\texttt{fr}=\emptyset\)
    (e.g., the example in Table~\ref{tab:simple-example}),
    then the finite-time regret upper bound in~\eqref{eq:finite-time-regret-upper-bound}
    becomes \(O(1)\), i.e., a time horizon independent finite regret.
\end{remark}


\begin{remark}[Comparison to~\citet{baek2021fair}]\label{rmk:free-arm-constant-regret}
    Recall that the set of \textit{free arms} \(\mathcal{K}^\texttt{fr}\) defined in Eq.~\eqref{eq:free_arm_set} contains arms that can be freely explored.
    In our regret upper bound, we show that \texttt{FreeExp}'s regret cost due to pulling arms in \(\mathcal{K}^\texttt{fr}\) is \(O(1)\), while \citet{baek2021fair}'s regret bound was asymptotic with respect to \(\log T\), implying that \texttt{KL-UCB}'s regret due to pulling arms in \(\mathcal{K}^\texttt{fr}\) was \(o(\log T)\) (the analysis in \citet{baek2021fair} upper bounds the cost for arm set \(\mathcal{K}^\texttt{fr}\) by \(O(\log \log T)\)).
\end{remark}



\begin{remark}[Generalization to the homogeneous reward setting]
    If all agents' local arm sets are the same,
    then only one unique optimal arm can be freely explored (i.e., \(\abs{\mathcal{K}^\texttt{fr}}=1\))
    and all other arms would appear in the summation range in regret bounds~\eqref{eq:regret-lower-bound} and~\eqref{eq:asymptotical-regret-upper-bound}.
    Then, both the regret upper and lower bounds
    reduce to the ones in classic MABs in~\cite{lai1985asymptotically} (also the same as the optimal bounds of cooperative \MATOB).
    This observation highlights the \emph{``generality''} of our regret bounds
    and shows that \FreeExp also works
    for the homogeneous reward setting.
\end{remark}