% !TeX root = ..\freeExp.tex

\section{Model and Notations}
\label{sec:model}
We first present the multi-agent multi-armed bandits with heterogeneous rewards problem (\MATOBHR) in Section~\ref{sec:sys_model} and its performance metric in Section~\ref{sec:metric}.
In Section~\ref{sec:not}, we introduce notations related to free exploration to facilitate our algorithm design and analysis.
% We discuss other variants of \MATOBHR in Appendix~\ref{app:hr-four-cases}.

\subsection{\MATOBHR: The Multi-agent Multi-armed Bandits with Heterogeneous Rewards}
\label{sec:sys_model}
In \MATOBHR, there are \(K\in\N^+\) arms and \(M\in\N^+\) agents, where the agents are indexed by \(\mathcal{M}\coloneqq \{1,2,\dots,M\}\).
Each arm \(k\in \mathcal{K}\,(\coloneqq \{1,2,\dots,K\})\)
is associated with a Gaussian reward random variable
with unknown mean \(\mu(k)\in (0,b)\) and variance \(\sigma_1^2\),
where \(b\) is positive and known.\footnote{If \(b\) is unknown, we can set it as an arbitrarily large constant.}
This is the \emph{arm-specific reward} representing the intrinsic value of the arm and it is independent of the preference of the agents.
In addition, each agent has its own private \emph{agent-specific reward} for each arm to capture its private preference for different arms.
The agent-specific reward of agent \(i\) for arm \(k\) is modelled by a Gaussian random variable with mean \(\nu^\brai(k)\) and variance \(\sigma_2^2\).
% More specifically, let \(\nu^\brai(k)\) be the mean of a Gaussian random variable with variance \(\sigma_2^2\), that represents the agent-specific reward of agent \(i\) for arm $k$.
The variances \(\sigma_1^2\) and \(\sigma_2^2\) are common for all arms and agents.
The agent- and arm-specific rewards are independent,
and both are also independent across arms \(\mathcal{K}\) and time  \(t=1,2,\ldots\).


By pulling an arm \(k\) at time \(t\), agent \(i\) observes a Gaussian reward \(X_t^\brai(k)\) with mean
\(\omega^\brai(k) \coloneqq \mu(k) + \nu^\brai(k)\) and variance \(\sigma_1^2+\sigma_2^2\). In this paper, we assume that the value of $\nu^\brai(k)$ is known only to agent $i$ and unknown to the other agents, for every agent $i\in \mathcal{M}$. Similar to the basic setting of stochastic bandits, the arm-specific reward means $\mu(k)$
% follows an underlying distribution, which 
are unknown to all agents.
We also assume, for each agent \(i\), that all mean rewards $\omega^\brai(k)\,(\forall k\in\mathcal{K})$ are different; hence each agent has a unique  optimal arm.

\begin{remark}[Agent's local arm set]
    \label{rmk:local-arm-set}
    Observe that \(\mu(k)\in (0,b)\). Consequently, if there exist two arms \(k_1, k_2\) such that \(\nu^\brai(k_1) \ge \nu^\brai(k_2) + b\) for agent \(i \in \mathcal{M}\), then
    \[
        \begin{split}
            &\omega^\brai(k_1) - \omega^\brai(k_2) \\
            &\hspace{40pt} = (\mu(k_1) + \nu^\brai(k_1))
            - (\mu(k_2) + \nu^\brai(k_2))\\
            &\hspace{40pt} \ge \mu(k_1) - \mu(k_2) + b > 0,
        \end{split}
    \]
    that is, for agent \(i\), the reward mean of arm \(k_1\) is higher than that of arm \(k_2\).
    Therefore, there is no need for agent \(i\) to pull arm \(k_2\).
    More generally, we define agent \(i\)'s \emph{local arm set} as
    \[
        \mathcal{K}^\brai \coloneqq
        \left\{
        k\in\mathcal{K}: \nu^\brai(k) + b > \max\nolimits_{\ell\in\mathcal{K}}\nu^\brai(\ell)
        \right\},
    \]
    and agent \(i\) only needs to explore arms in its local arm set.
\end{remark}

% \mo{talk about the application and relevant models and refer to Appendix B \& C.}

Another relevant model for reward heterogeneity is contextual bandits~\citep{li2010contextual}. We discuss it in Appendix~\ref{subapp:contextual-bandit-model}.
The \MATOBHR model finds applications in diverse domains, e.g., online advertising, online shortest path routing, online cloud and edge resource allocation, and personalized clinical trials; see the detailed application scenarios in Appendix~\ref{app:app}.



\subsection{Performance Metrics}\label{sec:metric}

Since rewards are heterogeneous across agents,  agents may have different optimal arms. The goal of each agent is to find its \textit{local} optimal arm, the one with the largest total reward, which is the sum of arm- and agent-specific rewards. Let $k_*^\brai$ be the local optimal arm of agent $i$, i.e., \(k_*^{(i)}\coloneqq \argmax_{k\in\mathcal{K}^{(i)}}\omega^\brai(k)\).
For an algorithm \(\mathcal{A}\), let \(J_t^\brai(\mathcal{A})\) be the arm pulled by agent \(i\) at time \(t\). The expected regret of agent $i$ under algorithm \(\mathcal{A}\) is the difference between the aggregate reward of pulling its local optimal arm and the aggregate reward of pulling arms in an online manner according to a bandit algorithm, i.e.,
\[
    \E[ \text{R}_T^{(i)} (\mathcal{A}) ] \coloneqq
    T\omega^\brai(k_*^\brai) - \E\left[ \sum\nolimits_{t=1}^T\omega^\brai(J_t^\brai(\mathcal{A})) \right],
\]
where the expectation is taken over the randomness of action sequence \(\{J_1^{(i)}(\mathcal{A}), J_2^{(i)}(\mathcal{A}), \dots\}\).

In the \MATOBHR model, agents can cooperate and share information to accelerate bandit learning. In particular, we assume that each agent can broadcast the arm-specific reward term (the observed reward minus the agent-specific reward mean, \(X_t^\brai(k) - \nu^\brai(k)\)) at no cost to all other agents, and the other agents immediately receive the broadcast observations. Note that this basic system model can be extended to include communication costs, or an underlying topology to govern communication between agents, or agent privacy, etc. We leave these extensions to future work and focus on presenting the key idea of free exploration in this paper.
The learning environment is a cooperative one; hence, we consider the \emph{aggregate regret} as the performance metric, which is simply the sum of the expected regrets of all $M$ agents, i.e.,
\begin{equation}
    \ERT \! \coloneqq \!
    \sum_{i=1}^M \! \left( \! T\omega^\brai(k_*^{(i)}) \! - \! \E\left[\! \sum_{t=1}^T\omega^\brai(J_t^\brai(\mathcal{A})) \!\right] \!\right).
\end{equation}

% !TeX root = ..\freeExp.tex
\subsection{Application Scenarios}
\label{app:app}
The setting of heterogeneous and known agent-specific reward means in \MATOBHR is practically relevant and finds applications in diverse domains.
The applications mentioned in~\citet{yang2022distributed} and~\citet{baek2021fair} can also be handled by \MATOBHR since their models are special cases of \MATOBHR.
In the following, we present four motivating application scenarios that \MATOBHR could model. We note that we focus on motivating the arm- and agent-specific rewards. Detailed modeling of each application may require additional effort, which is beyond the scope of this paper.

\paragraph{Online Advertising in Social Networks: } Online advertising is a classic example of the MAB problem~\citep{tang2014ensemble,mahadik2020fast}. Consider a scenario where there are multiple bandit agents that select ads to be placed on a social platform. Each agent is responsible for a cluster of users with similar interests. The cluster may be constructed based on different criteria, e.g., location, age, etc. Indeed, the popularity of products can differ across different locations or age groups.
But the ads (arms) could be selected from a shared pool of available ads. In this scenario, the agent is aware of the personal preferences of users in its cluster, i.e., the agent-specific reward is known. However, the agents need to learn the potential value of ads as well; hence, arm-specific rewards are unknown. Since the learning agents all belong to the same social platform advertising engine, they can cooperate to share arm-specific observations and improve learning performance.



\paragraph{Online Shortest Path Routing in Wireless Networks: }
Another example is the problem of finding shortest paths in a multi-hop wireless network. Consider a scenario in which multiple learning agents try to learn the shortest paths for different communication sessions. In this scenario, bandit algorithms can be implemented to learn the shortest routing paths~\citep{He2013online,zou2014online,talebi2017stochastic}.
The cost (or latency) of a certain path (arm) depends on the physical condition of the path itself, representing an arm-specific cost unknown to the learning agents. Further, the session of each agent might have its local physical conditions, e.g., distance and the hardware spec of the mobile device, which is known only to the agent and impacts the overall cost of each path. In this scenario, the former is an arm-specific cost, which is homogeneous and unknown among all agents, while the latter varies across agents, and its mean is privately known to each agent only.

\paragraph{Online Cloud and Edge Resource Allocation: }
In prior literature, the MAB framework has been used for workload allocation into a pool of cloud/edge servers~\citep{talebi2018learning,johari2017matching,lattimore2014optimal,dagan2018better}. In this scenario, the cloud provider may categorize the compute jobs into multiple types, e.g., ML training workload, video processing, financial analytics, etc., and create a learning agent for finding the best server type for them. In this scenario, the arm-specific reward captures the hardware spec of the servers, and the agent-specific reward captures the job-specific hardware requirement of the workload, e.g., video processing is memory-intensive, while finance workload is compute-intensive.
% \textcolor{blue}{the agent-specific cost captures the job-specific power consumption of the workload, e.g., video processing is power-consuming while finance workload is power-friendly.}
In edge scenarios where the workload could be run in multiple locations, the agent-specific reward could be represented as the cost of moving the workload to different locations as well, which is known and heterogeneous for different agents.


\paragraph{Personalized Medicine and Clinical Trial:}
A classic MAB application is the clinical trial~\citep{lai1987adaptive,villar2015multi,aziz2021multi}.
Consider a scenario where patients have different covariates, e.g., age, gender, genomic features, and medical history, and, therefore, should be categorized into several heterogeneous groups, and the doctor should create personalized agents (drug application policies) for every group.
In this scenario, the effectiveness of a treatment for a certain patient group depends not only on the treatment itself but also on the patient group's covariates.
For example, the effectiveness of a treatment that disturbs patients' blood glucose concentrations may be discounted on diabetics.
In this scenario, the arm-specific reward captures the basic effectiveness of a treatment or medicine on a disease,
and the agent-specific reward (or cost) captures the discounted or additional effectiveness due to the patient group features. The latter is known to (or can be well evaluated by) an expert.






% \subsection{Application Scenarios}
% \label{sec:app}
% The heterogeneous and known agent-specific reward means for \MATOBHR is a practically relevant setting and can find applications in diverse domains. In the following, we present three motivating application scenarios that \MATOBHR could model. We note that we focus on motivating the arm- and agent-specific rewards. Detailed modeling of each application may require additional effort, which is beyond the scope of this paper.

% \noindent{\bf Online Advertising in Social Networks: } Online advertising is a classic example of the MAB problem~\citep{tang2014ensemble,mahadik2020fast}. Consider a scenario where there are multiple bandit agents that select ads to be placed on a social platform. Each agent is responsible for a cluster of users with similar interests. The cluster may be constructed based on different criteria, e.g., location, age, etc. Indeed, the popularity of products can differ across different locations or age groups.
% But the ads (arms) could be selected from a shared pool of available ads. In this scenario, the agent is aware of the personal preferences of users in its cluster, i.e., the agent-specific reward is known. However, the agents need to learn the potential value of ads as well; hence, arm-specific rewards are unknown. Since the learning agents all belong to the same social platform advertising engine, they can cooperate to share arm-specific observations and improve learning performance.
% We note that one may think that contextual bandits, e.g., ~\citep{li2010contextual, slivkins2011contextual}, can capture the above application scenario, i.e., modelling multiple agents as multiple contexts.
% However, in contextual bandits,
% there is only one context in each time slot,
% % the arrival of contexts, e.g., stochastic or adversarial,
% % depend on the environment,
% while, in our scenario, \emph{all} agents (contexts) can select arms in each time slot.
% % is not something the algorithm can control. , which cannot be modeled as different contexts whose sequential arrivals
% % , e.g., stochastic or adversarial,.


% \noindent{\bf Online Shortest Path Routing in Wireless Networks: }
% Another example is the problem of finding shortest paths in a multi-hop wireless network. Consider a scenario in which multiple learning agents try to learn the shortest paths for different communication sessions. In this scenario, bandit algorithms can be implemented to learn the shortest routing paths~\citep{He2013online,zou2014online,talebi2017stochastic}.
% The cost (or latency) of a certain path (arm) depends on the physical condition of the path itself, representing an arm-specific cost unknown to the learning agents. Further, the session of each agent might have its local physical conditions, e.g., distance and the hardware spec of the mobile device, which is known only to the agent and impacts the overall cost of each path. In this scenario, the former is an arm-specific cost, which is homogeneous and unknown among all agents, while the latter varies across agents and whose mean is privately known to each agent only.

% \noindent{\bf Online Cloud and Edge Resource Allocation: }
% In prior literature, the MAB framework has been used for workload allocation into a pool of cloud/edge servers~\citep{talebi2018learning,johari2017matching,lattimore2014optimal,dagan2018better}. In this scenario, the cloud provider may categorize the compute jobs into multiple types, e.g., ML training workload, video processing, financial analytics, etc., and create a learning agent for finding the best server type for them. In this scenario, the arm-specific reward captures the hardware spec of the servers, and the agent-specific reward captures the job-specific hardware requirement of the workload, e.g., video processing is memory-intensive, while finance workload is compute-intensive.
% % \textcolor{blue}{the agent-specific cost captures the job-specific power consumption of the workload, e.g., video processing is power-consuming while finance workload is power-friendly.}
% In edge scenarios where the workload could be run in multiple locations, the agent-specific reward could be represented as the cost of moving the workload to different locations as well, which is known and heterogeneous for different agents.



\subsection{Notations Related to Free Exploration}
\label{sec:not}
To ease the presentation of \FreeExp and its analysis, we introduce some key notations relevant to free exploration.
In \MATOBHR, arms that are locally optimal for at least one agent can be freely explored. Then, in a cooperative environment, other agents for whom these arms are suboptimal can benefit from the freely explored observations of these arms.

\begin{definition}[Set of free arms]
    \label{def:free}
    We define the set of free arms $\mathcal{K}^\texttt{\emph{fr}}$ as
    \begin{equation}
        \label{eq:free_arm_set}
        \mathcal{K}^\texttt{\emph{fr}} \coloneqq \{k\in\mathcal{K}: \mathcal{M}_*(k)\neq \emptyset\},
    \end{equation}
    where
    \(
    \mathcal{M}_*(k)\coloneqq \{i\in \mathcal{M}: k\in\mathcal{K}^\brai, k=k_*^\brai\}
    \)
    is the set of agents with arm \(k\) as their local optimal arm. Any arm \(k\in\mathcal{K}^\texttt{\emph{fr}}\) can be freely explored without incurring regret by any agent in $\mathcal{M}_*(k)$. In the rest of this paper, we refer to the arms in $\mathcal{K}^\texttt{\emph{fr}}$ as free arms.
\end{definition}

% \mo{suggestions for notation change: not sure why using $(i)$ instead of $i$. Here are some suggestions: $\mathcal{K}^\texttt{fr} \rightarrow \mathcal{K}^\texttt{fr}$; $\mathcal{M}_*(k) \rightarrow \mathcal{M}_k^*$; $k_*\brai \rightarrow k^*_i$; generally, why not putting agent index $i$ as subscript? and arm $k$ in parenthesis, and for the times you have $t$, you may use $\omega_i(k_i^t)$; not sure how these suggestions are easy and makes sense to apply, I leave the final decision to you.  }

% We illustrate the free exploration mechanism in the heterogeneous agents system.
% Denote
% For agent \(i\), pulling its own local optimal arm \(k_*^{(i)}\) has zero regret cost,
% while for some other agent \(j\)
% who takes arm \(k_*^{(i)}\) as its local suboptimal
% (i.e., \(k_*^\brai \in \mathcal{K}^\braj, \mu(k_*^\brai) < \mu(k_*^\braj)\)),
% pulling this arm causes positive regret costs.
% These positive costs --- in the hindsight ---
% can be avoided if agent \(j\) knows
% that agent \(i\) can explore the arm \(k_*^\brai\)
% free of cost.

% Later on, we first derive a lower bound that is independent of
% arms in free exploration arm set \(\mathcal{K}^\texttt{fr}\),
% which reflexes the free exploration's
% impact on a heterogeneous multi-agent system (Section~\ref{subsec:regrete-lower-bound}),
% and then devise a algorithm taking advantage of
% free exploration in an online manner (Section~\ref{sec:algorithm}).


Recall that in the classic MAB, the difficulty of distinguishing a suboptimal arm $k$ from the optimal arm depends on \(\Delta(k)\)---the reward mean gap between arm $k$ and the optimal arm $k^*$. In \MATOBHR, the notion of optimality gap needs to be redefined since agents may have different local optimal arms. Specifically, we define the suboptimality gap of each arm $k$ as the smallest gap, over all agents, between arm $k$ and the agent's local optimal arm. A formal definition is given below.
\begin{definition}[Suboptimality gap]
    The suboptimality gap of arm $k$ is defined as
    \begin{equation}
        \label{eq:gap}
        \bar{\Delta}(k) \coloneqq \min_{i\in\mathcal{M}}\Delta^\brai(k),
    \end{equation}
    where
    \(
    \Delta^\brai(k) \coloneqq \omega^\brai(k_*^\brai) - \omega^\brai(k)
    \)
    is the gap between the mean rewards of arm $k$ and \(k_*^\brai\)---the local optimal arm of agent $i$.
\end{definition}
All free arms have zero suboptimality gaps, i.e., \( \bar{\Delta}(k) = 0,\, \forall k \in \mathcal{K}^\texttt{fr}\).
Let \(\bar{i}(k) \in \argmin_{i\in\mathcal{M}}\Delta^\brai(k)\)
be an agent with the smallest reward gap for arm \(k\) (ties can be broken arbitrarily).
Then, \(\bar{\Delta}(k)\) can be rewritten as
\(\bar{\Delta}(k) = \omega^{(\bar{i}(k))}(k_*^{(\bar{i}(k))}) -
\omega^{(\bar{i}(k))}(k)\),
where for simplicity,
we denote \(\omega^{(\bar{i}(k))}(k)\) as \(\bar{\omega}(k)\), i.e.,
% the second term in RHS is denoted as 
\begin{equation}\label{eq:critial-omega}
    \bar{\omega}(k) \coloneqq
    \omega^{(\bar{i}(k))}(k) = \mu(k) + \nu^{(\bar{i}(k))}(k).
\end{equation}

