% !TeX root = ..\FreeExp.tex

\begin{algorithm}[!t]
    \caption{The \FreeExp Algorithm (for Agent \(i\))}
    \label{alg:free-exp}
    \begin{algorithmic}[1]
        \STATE \textbf{Initialize:} \(d_t^\brai(k) = 0, \hat{\mu}_t(k) = 0, \hat{\omega}^\brai_t(k)\coloneqq \hat{\mu}_t(k) + \nu^\brai(k)\).
        \FOR {each time slot \(t\)}
        \STATE \(I_t^\brai\gets \argmax_{k\in\mathcal{K}^\brai} \hat{\omega}^\brai_t(k)\)
        \COMMENT{\texttt{identify the empirical optimal arm}}
        \label{line:empirical-optimal-arm}
        \STATE Send \(I_t^\brai\) to other agents and collect their \(I_t^\braj\)
        \STATE \(\mathcal{D}_t^\brai \gets
        \{k\in \mathcal{K}^\brai\setminus\{I_t^\brai\}:
        d_t^\brai(k) >  \hat{\omega}^\brai_t(I_t^\brai)\}\)
        \COMMENT{\texttt{choose arms with high KL-UCB}}
        \label{line:construct-exploration-arm-set}
        \STATE \(\mathcal{D}_t^\brai \gets \mathcal{D}_t^\brai\setminus \{I_t^{(j)}:\forall j\in\mathcal{M}\}\)
        \COMMENT{\texttt{take advantage of free exploration}}
        \label{line:remove-free-exploration-arms}
        \IF{\(\mathcal{D}_t^\brai = \emptyset\)}
        \label{line:pull-arm-begin}
        \STATE \(J_t^\brai \gets I_t^\brai\)
        \ELSE
        \STATE w.p.\ \(\frac{1}{2}\), \(J_t^\brai \gets I_t^\brai\)
        \STATE w.p.\ \(\frac{1}{2}\), \(J_t^\brai \gets\) an arm picked uniformly at random from \(\mathcal{D}_t^\brai\)
        \label{line:pull-arm-end}
        \ENDIF
        \STATE Pull arm \(J_t^\brai\) and receive observations \(X_t^\brai(J_t^\brai)\)
        \STATE Send observations \(X_t^\brai(J_t^\brai)-\nu^\brai(J_t^\brai)\) to other agents and also collect theirs \label{line:share-observation}
        \STATE Update \(\hat{\omega}_t^\brai(k)\) and \(d^\brai_t(k)\) for arm \(k\) and agent \(i\)
        \ENDFOR
    \end{algorithmic}
\end{algorithm}

\section{The \FreeExp Algorithm}\label{sec:algorithm}
In this section, we present the \FreeExp algorithm, which solves a multi-agent bandit problem in the \MATOBHR model.
Each agent runs its own instance of the \FreeExp algorithm and cooperates with the other agents.
In Section~\ref{sec:analysis}, we demonstrate that with \FreeExp, the reward heterogeneity not only does no harm, but in fact benefits the cooperative learning by the unique opportunity of free exploration.

\noindent{\bf High-level idea of \FreeExp: }
We now explain how \FreeExp implements the idea of free exploration to reduce regret.
The pivot of \FreeExp is the local optimal (free) arm of each agent, which is unknown in advance.
To address this for an agent \(i\), \FreeExp maintains a local optimal arm estimate \(I_t^\brai\) of the agent \(i\)
and an \textit{exploration arm set} \(\mathcal{D}_t^\brai\) containing arms that might be the ground truth local optimal arm and thus need further explorations.
To utilize free exploration, agent \(i\) periodically announces her estimated optimal arm \(I_t^\brai\) to others to discourage other agents from exploring this arm.

\begin{remark}
    We note that some prior works~\citep{combes2014unimodal,combes2015learning,wang2020optimal}, such as the \texttt{DPE2} algorithm in cooperative \MATOB~\citep{wang2020optimal}, also involved a pivot arm and an exploration arm set in the algorithm design.
    However, the technical usage of both components in those works is very different from ours.
    For example, \texttt{DPE2} estimates the pivot arm to gather all exploration responsibility to a single leader agent,
    while our usage is relegating/dispersing the free arms to the agents for which they are locally optimal.
\end{remark}

\noindent{\bf Local optimal arm estimate and construction of exploration arm set: }
Let \(n_t(k)\) and \(\hat{\mu}_t(k)\)
denote, respectively, the total number of times arm \(k\) is pulled up to time \(t\)
and the empirical mean of these \(n_t(k)\) reward observations of arm \(k\) among all \(M\) agents.
Denote by \(\hat{\omega}_t^\brai(k)\coloneqq \hat{\mu}_t(k) + \nu^\brai(k)\) the empirical reward mean of agent \(i\) pulling arm \(k\), which is based on all agents' observations of arm \(k\).
\FreeExp uses agent \(i\)'s \emph{empirical local optimal arm} \(I_t^\brai\) (the arm with the largest empirical reward mean \(\hat{\omega}_t^\brai(k)\) of agent \(i\) at time \(t\)) as an estimate of the pivot.
Given this empirical optimal arm as the pivot, the agent either pulls its own empirical optimal arm \(I_t^\brai\) for free exploration,
or explores other arms in \(\mathcal{D}_t^\brai\) to guarantee the correctness of this estimated pivot.
To improve the efficiency of exploring other arms, we construct the \textit{exploration arm set} \(\mathcal{D}_t^\brai\) for each agent $i$ using the KL-UCB index~\citep{cappe2013kullback}.
The index of arm \(k\) at time slot \(t\) is
\begin{equation} \label{eq:kl-ucb}
    \begin{split}
        d_t^\brai(k) &\coloneqq   \sup\{q\ge 0:\\
        & \, n_t(k) \kl(\hat{\omega}_t^\brai(k), q) \le \log t + 4\log (\log t)
        \},
    \end{split}
\end{equation}
where \(\kl(a,b)\) is the KL-divergence between two Gaussian distributions with means \(a\) and \(b\) and the same variance \(\sigma_1^2 + \sigma_2^2\).
The exploration arm set \(\mathcal{D}_t^\brai\) includes arms whose KL-UCB indexes \(d_t^\brai(k)\) are greater than the agent's highest empirical mean \(\hat{\omega}_t^\brai(I_t^\brai)\) (Line~\ref{line:construct-exploration-arm-set}) and
excludes arms that are empirically optimal for at least one agent (Line~\ref{line:remove-free-exploration-arms})---this discourages agent \(i\) from exploring other agents' local optimal arms.
Note that the agents only share the arm-specific reward with each other, i.e., each agent subtracts the agent-specific reward from the observed compound reward before sharing (Line~\ref{line:share-observation}).

\noindent{\bf Arm pulling policy: }
To guarantee the accuracy of the pivot estimation (i.e., the empirical optimal arm is correct with high probability), each agent needs to have enough observations for her empirically optimal arm.
To accomplish this,
\FreeExp implements an arm pulling policy (Lines~\ref{line:pull-arm-begin}--\ref{line:pull-arm-end}) as follows:
if exploration arm set \(\mathcal{D}_t^\brai\) is empty,
the agent \(i\) pulls the empirical optimal arm \(I_t^\brai\);
if exploration arm set \(\mathcal{D}_t^\brai\) is not empty, with probability \(1/2\), the agent picks an arm uniformly at random
from \(\mathcal{D}_t^\brai\) to explore;
and with probability \(1/2\), she pulls her empirical optimal arm---this encourages free exploration of the agent's empirical optimal arm.
This policy produces sufficient observations of this arm to guarantee fast correction if the current empirical optimal arm is not the correct one.
Let \(J_t^\brai\) denote the arm selected by agent \(i\) in time slot \(t\) under \FreeExp.
We present pseudocode for \FreeExp in Algorithm~\ref{alg:free-exp}.


\begin{remark}[\NoFreeExp Algorithm]\label{rmk:nofree-exp-algo}
    There is a counterpart algorithm of \FreeExp, which does not utilize free exploration, i.e., Algorithm~\ref{alg:free-exp} without Line~\ref{line:remove-free-exploration-arms}. We name it \NoFreeExp.
    Even without making use of free exploration, \NoFreeExp should have a better regret performance than known baselines, e.g., \texttt{CO-UCB},
    because \NoFreeExp is based on the KL-UCB algorithm, which is theoretically better than UCB-like algorithms~\citep{cappe2013kullback}.
\end{remark}

% \begin{remark}
% A similar algorithmic technique was also utilized in unimodal bandits~\cite{combes2014unimodal} (a specialized signal-agent bandit model for online pricing and bidding) and cooperative \MATOB~\cite{wang2020optimal}. 
% However, extending the algorithmic design to \MATOBHR to devise \FreeExp is non-trivial. 
% From the model aspect, the heterogeneous rewards of \MATOBHR are different from the homogeneous case in common cooperative \MATOB. 
% From the algorithmic aspect, \FreeExp needs to remove the free arms (in \(\mathcal{K}^\texttt{fr}\))  from the exploration arm set \(\mathcal{D}_t^\brai\) so to take advantage of the free exploration mechanism. 

% \end{remark}

% We now explain how \FreeExp implements the idea of free exploration. The high-level idea of \FreeExp is to partition the arms into two sets, those that are local optimal for at least one agent and those that are not. The responsibility for exploring arms in the first set is relegated to the agents for which they are locally optimal, incurring no regret;
% only exploration of arms in the second set incurs regret.
% To guarantee the arm set partition is correct with high probability, one should balance the exploration of arms in both sets.

% The pivot of this partition is the local optimal arm of each agent which, however, is unknown in advance.
% To address that, agents use their empirical local optimal arms \(I_t^\brai\) as estimates of the pivot.
% Given this empirical optimal arm, the agent either pulls its own empirical optimal arm for free exploration 
% or explores other arms to guarantee the correctness of this estimated pivot (and the partition as well).
% To improve the efficiency of exploring arms that is not empirical optimal, we introduce an \textit{exploration arm set} \(\mathcal{D}_t^\brai\) for each agent $i\in\mathcal{M}$ including arms whose KL-UCB indexes \(d_t^\brai(k)\) are greater than the empirical optimal arm \(I_t^\brai\)'s empirical mean \(\hat{\omega}_t^\brai(I_t^\brai)\) (Line~\ref{line:construct-exploration-arm-set}) and 
% excluding arms that are the empirically optimal for at least one agent (Line~\ref{line:remove-free-exploration-arms}) 
% --- discourage the agent exploring other agents' local optimal arms. 


% To make the pivot estimation work well (i.e., make sure the empirical optimal arm is correct with high probability), the algorithm needs to guarantee that each agent has enough observations for her empirically optimal arm. 
% To guarantee that,
% \FreeExp then implements an arm pulling policy (Lines~\ref{line:pull-arm-begin}-\ref{line:pull-arm-end}) as follows:
% if exploration arm set \(\mathcal{D}_t^\brai\) is empty,
% the agent \(i\) pulls the empirical optimal arm \(I_t^\brai\);
% if exploration arm set \(\mathcal{D}_t^\brai\) is not empty, with probability $1/2$, the agent, uniformly at random picks an arm
% from \(\mathcal{D}_t^\brai\) to explore; 
% and with probability \(1/2\), pulls her empirical optimal arm
% --- encourage free explorations of the agent's empirical optimal arm.
% This policy produces sufficient observations of this arm to guarantee fast correction if the current empirical optimal arm is not the correct one.
% We present pseudocode for \FreeExp in Algorithm~\ref{alg:free-exp}.


% \begin{remark}
% A similar algorithmic technique was also utilized in unimodal bandits~\cite{combes2014unimodal} (a specialized signal-agent bandit model for online pricing and bidding) and cooperative \MATOB~\cite{wang2020optimal}. 
% However, extending the algorithmic design to \MATOBHR to devise \FreeExp is non-trivial. 
% From the model aspect, the heterogeneous rewards of \MATOBHR are different from the homogeneous case in common cooperative \MATOB. 
% From the algorithmic aspect, \FreeExp needs to remove the free arms (in \(\mathcal{K}^\texttt{fr}\))  from the exploration arm set \(\mathcal{D}_t^\brai\) so to take advantage of the free exploration mechanism. 
% From the analysis aspect, the regret cost of pulling the same arm by different agents in the heterogeneous reward setting is different, 
% which is much more complex than the homogeneous reward case. 
% To address the challenge, we introduce two new techniques: (a) count the number of times of the suboptimal arm pulls according to two-dimension agent-time pairs (see Lemma~\ref{lma:bound-arm-pulls} and its proof) and (b) apply an Abel transformation to summing the regret costs of all agents on pulling an arm according to the order of magnitude of the arm's reward gaps for these agents (see Lemma~\ref{lma:bound-arm-regrets} and its proof).
% \end{remark}


% However, the local optimal arm of each agent is unknown in advance and the key challenge is how to assign each arm $k$ to be explored by the ``correct agents'' in \(\mathcal{M}_*(k)\) who can explore it with no regret. 
% To achieve this, \FreeExp has each agent periodically announce her empirically optimal arm to others to discourage their exploring this arm. 


% To implement this idea, we introduce an \textit{exploration arm set} \(\mathcal{D}_t^\brai\) for agent $i$ that excludes arms that are the empirically optimal for some other agents (line~\ref{line:remove-free-exploration-arms} of Algorithm~\ref{alg:free-exp}). 


% Then, in each round, agent $i$ pulls either an arm from  exploration arm set \(\mathcal{D}_t^\brai\) or her empirical local optimal arm \(I_t^\brai\). In addition, to make the above idea work well, the algorithm needs to guarantee that each agent have enough observations for the empirically optimal arm. 


% This is from two considerations: first,
% spending more pulls on the empirically optimal arm can speed up the learning process, since we can take it as a benchmark to update the exploration arm set; second, enough pulls guarantee the agent to make correct announcement to others, such that each agent is assigned with free arms efficiently.





% To achieve this, we build up the exploration arm set from which the agent frequently select an arm to pull.
% In our algorithm, with a probability of \(1/2\), the agent, at a guaranteed \(1/2\) probability, randomly picks an arm
% from \(\mathcal{D}_t^\brai\) to explore. details of algorithm showing why enough observations are guaranteed.
% Moreover,




% \rev{
%     Second, (free exploration) algorithm notifies other agents to keep them free from pulling the free arms (free exploration).
% }

% In this section, we devise an algorithm called \FreeExp (Free Exploration)
% for the \MATOBHR model.
% In Section~\ref{sec:analysis},
% we show \FreeExp is near-optimal:
% its regret upper bound matches the
% lower bound
% in Theorem~\ref{thm:regret-lower-bound} up to a constant factor.
% We begin with a na\"ive algorithm.

% \subsection{A Na\"ive Algorithm Utilizing Free Exploration}

% In order to design an optimal algorithm to solve \MATOBHR,
% one should make use of free explorations
% --- one agent should avoid pulling arms that are optimal for some other agents.
% However, the arms that are optimal for some agents
% are unknown a priori.
% To estimate the optimal arm, a straightforward idea
% is to pick the arm with highest empirical mean as an estimate.
% Together with the classic UCB algorithm, one can obtain the Algorithm~\ref{alg:naive-free-exp}:
% in each time slot,
% (i) agents identify its own empirical optimal arms and broadcast
% the arms to each other (line~\ref{algline:empirical-optimal-arm});
% (ii) agents pick the arm with highest UCB index
% among these that are not empirical optimal of any other agents (line~\ref{algline:pick-highest-ucb}).
% The formal notations are defer to the \FreeExp algorithms.

% \begin{algorithm}[H]
%     \caption{The Na\"ive Algorithm with Free Exploration (for Agent \(i\))}
%     \label{alg:naive-free-exp}
%     \begin{algorithmic}[1]
%         \STATE \textbf{Initialize:} \(d_t(k) = 0, \hat{\mu}_t(k) = 0, \hat{\omega}^\brai_t(k)\coloneqq \hat{\mu}_t(k) + \nu^\brai(k)\).
%         \FOR {each time slot \(t\)}
%         \STATE \(I_t^\brai\gets \argmax_{k\in\mathcal{K}^\brai} \hat{\omega}^\brai_t(k)\)
%         \COMMENT{identify the empirical opitmal arm}\label{algline:empirical-optimal-arm}
%         \STATE Send \(I_t^\brai\) to other agents and collect other agents' \(I_t^\braj,\,\forall j\neq i\)
%         \STATE \(\mathcal{E}_t^\brai\gets \{I_t^{(j)}:\forall j\in\mathcal{M},j\neq i\}\) \COMMENT{Collect other agents' free exploration arms}

%         \STATE \(J_t^\brai \gets \argmax_{k\in\mathcal{K}^\brai\setminus\mathcal{E}_t^\brai} \text{UCB}^\brai_t(k)\) \COMMENT{Select the arm with highest UCB index out of \(\mathcal{E}_t^\brai\)}\label{algline:pick-highest-ucb}
%         \STATE Pull arm \(J_t^\brai\) and receive the  observation \(X_t^\brai(J_t^\brai)\)
%         \STATE Send its observations to other agents and collect theirs
%         \STATE Update \(\hat{\omega}_t^\brai(k)\) and \(\text{UCB}^\brai_t(k)\) for all arm \(k\)
%         \ENDFOR
%     \end{algorithmic}
% \end{algorithm}

% However, this policy may perform badly.
% Because if an agent \(i\) overestimates an arm \(k\)'s reward mean and mistakenly identifies it as its optimal which is actually another agent \(j\)'s optimal arm, then both agents suffer additional regret --- agent \(i\) pulls arm \(k\) with cost which would have been free if pulled by agent \(j\);
% agent \(j\) also suffers cost due to keep pulling suboptimal arms since his true optimal arm is removed as a ``free arm'' of agent \(i\).
% Furthermore, this overestimation of arm \(k\)'s reward mean may last for a long time
% (thus cause large additional regret cost).
% Because agent \(i\) pulls arms with high UCB index which can be inconsistent with arms with the high empirical reward mean and, therefore, the new reward observations of arm \(k\) that can correct the estimation may be insufficient.

% To correct the overestimation of a suboptimal arm's reward mean faster
% and reduce the regret,
% a better algorithm can periodically pull the empirical optimal arm
% so as to constantly improve the arm's estimate accuracy.
% For another thing, incorporating the periodic pull of empirical optimal arm into UCB algorithm still cannot fully utilizing the free exploration mechanism.
% Because in the UCB algorithm, it takes \(O(\log T)\) time slots for an agent \(i\) to acquire enough samples and correctly identify its optimal arms \(k_*^\brai\).
% Nevertheless, after these \(O(\log T)\) time slots, other agents \(j\,(\neq i)\) would have already pay a significant cost on exploring the arm \(k_*^\brai\).
% % The cost before finding the free arms is already comparable to identify the free exploration arm with cost.
% To tackle this problem, we turn to the KL-UCB index which is a tighter optimistic reward mean estimate than UCB.
% % Pulling arms according to KL-UCB and periodically pull empirical optimal arms 


% \subsection{The Near-Optimal \FreeExp Algorithm}


% The proposed algorithm relies on traditional policies as the KL-UCB index to select an arm to pul

% To utilize free exploration,
% one agent needs to know other agents' local optimal arms,
% avoid pulling these arms,
% and enjoy other agents' free explorations on these arms.
% However, agents' local optimal arms are unknown a priori
% and can only be gradually estimated in the learning process.
% On the other hand,
% agents also need to tackle the exploration and exploitation dilemma.


% The \FreeExp algorithm
% periodically exploits the arm with highest empirical mean,
% and randomly explores arms with high KL-UCB indexes~\cite{cappe2013kullback}.
% This policy explicitly addresses the exploitation and exploitation dilemma.
% Under this policy,
% one can show that the empirical optimal arm
% is almost always the optimal arm
% except for a finite number of time slots (independent of time horizon \(T\)).
% Given this observation,
% one agent can take other agents' empirical optimal arms as a guide
% of free exploration.
% We note that a similar algorithmic technique has been utilized for unimodal bandits~\cite{combes2014unimodal} which is a specialized single agent MAB model and is very different from our multi-agent setting.

% Next, we illustrate the algorithm's details.
% Denote \(n_t(k)\) and \(\hat{\mu}_t(k)\)
% as the total number of times of pulling arm \(k\) up to time \(t\)
% and the empirical mean of these \(n_t(k)\) reward observations of arm \(k\) among all \(M\) agents.
% We define the KL-UCB index
% of arm \(k\) at time slot \(t\)
% as
% \(d_t^\brai(k)\coloneqq \sup\{q\ge 0:
% n_t(k) \kl(\hat{\omega}_t^\brai(k), q) \le \log t + 4\log (\log t)
% \}\), where \(\hat{\omega}_t^\brai(k)\coloneqq \hat{\mu}_t(k) + \nu^\brai(k)\).
% Denote \(I_t^\brai\) as the arm with highest empirical reward mean
% of agent \(i\) at time slot \(t\)
% (called \textit{empirical optimal arm}),
% \(J_t^\brai\) as the arm that agent \(i\) pulls in time slot \(t\),
% and \(\mathcal{D}_t^\brai\) as a set including arms to be explored
% by agent \(i\) (called \textit{exploration arm set}, defined next).

% \FreeExp (Algorithm~\ref{alg:free-exp}) contains two main components:
% (1) construct the exploration arm set \(\mathcal{D}_t^\brai\)
% (lines~\ref{line:empirical-optimal-arm}-\ref{line:remove-free-exploration-arms})
% and
% (2) balance exploration and exploitation in arm pulling
% (lines~\ref{line:pull-arm-begin}-\ref{line:pull-arm-end}).
% To construct \(\mathcal{D}_t^\brai\),
% we first pick arms in agent \(i\)'s local arm set \(\mathcal{K}^\brai\)
% whose KL-UCB indexes are greater than the empirical optimal arm \(I_t^\brai\)'s empirical mean \(\hat{\omega}_t^\brai(I_t^\brai)\);
% the arm \(I_t^\brai\) itself is excluded.
% Then, to take advantage of free explorations from other agents,
% we remove other agent's empirical optimal arms
% from this exploration arm set \(\mathcal{D}_t^\brai\).


% \begin{algorithm}[H]
%     \caption{The \FreeExp Algorithm for Heterogeneous Action Access (for Agent \(i\))}
%     \label{alg:free-exp}
%     \begin{algorithmic}[1]
%         \STATE \textbf{Initialize:} \(d_t(k) = 0, \hat{\mu}_t(k) = 0\)
%         \FOR {each time slot \(t\)}
%         \STATE \(I_t^\brai\gets \argmax_{k\in\mathcal{K}^\brai} \hat{\mu}_t(k)\)
%         \label{line:empirical-optimal-arm}
%         \COMMENT{identifiy the empirical opitmal arm}
%         \STATE \(\mathcal{D}_t^\brai \gets
%         \{k\in \mathcal{K}^\brai\setminus\{I_t^\brai\}:
%         d_t(k) > \hat{\mu}_t(I_t^\brai)\}\)
%         \COMMENT{choose arms with high KL-UCB indexes}
%         \STATE \(\mathcal{D}_t^\brai \gets \mathcal{D}_t^\brai\setminus \{I_t^{(j)}:\forall j\in\mathcal{M}\}\)
%         \label{line:remove-free-exploration-arms}
%         \COMMENT{take advantage of other agents' free exploration}
%         \IF{\(\mathcal{D}_t^\brai = \emptyset\)}
%         \label{line:pull-arm-begin}
%         \STATE Pull arm \(I_t^\brai\)
%         \Else
%         \STATE w.p. \(\frac{1}{2}\), pull arm \(I_t^\brai\)
%         \STATE w.p. \(\frac{1}{2}\), uniformly pick an arm from \(\mathcal{D}_t^\brai\) to pull
%         \label{line:pull-arm-end}
%         \ENDIF
%         \ENDFOR
%     \end{algorithmic}
% \end{algorithm}





% \subsection{\FreeExp for Action-Constraint Multi-Agent Multi-Armed Bandits}
