    \documentclass[11pt]{article}
\usepackage{natbib}
\usepackage{fullpage}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols

\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{comment}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{mathtools}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{tikz}
% \usepackage{algpseudocode}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{algorithmic}
% \usepackage{algorithmicx}
\usepackage{wrapfig}  

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{dsfont}

% Editorial margin flags: \fix prints "FIX" and \new prints "NEW" as a margin note.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

% Theorem-like environments (amsthm). No shared-counter argument is given,
% so every environment below is numbered by its own independent counter.
\newtheorem{assumption}{Assumption}
\newtheorem{example}{Example}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{proposition}{Proposition}
\newtheorem{regret}{Regret}
\newtheorem{gap}{Gap}


% ---- Notation shortcuts ------------------------------------------------
% Bold Greek symbols. \boldsymbol is provided by amsmath (loaded above);
% \bm is not used here because the bm package is not loaded in this preamble.
\def \bvarphi {\mathrm{\boldsymbol{\varphi}}}
\def \btheta {\boldsymbol{\theta}}
\def \bsigma {\boldsymbol{\Sigma}}
% Sans-serif T (e.g. for a transpose superscript).
\def \mt {\mathsf{T}}
% NOTE(review): \bV, \bx, \bA, \bb and \bn expand to \mV, \vx, \sA, \vb and
% \vn, which are not defined anywhere in this preamble; using them will raise
% "Undefined control sequence" unless those macros are supplied elsewhere.
\def \bV {\displaystyle\mV}
\def \bx {\displaystyle\vx}
\def \bA {\displaystyle\sA}
\def \bT {\mathcal{T}}
\def \bI {\bold{I}}
\def \bb {\displaystyle\vb}
% Calligraphic sets. \bC, \bD and \bR are defined exactly once here.
\def \bD {\mathcal{D}}
\def \bR {\mathcal{R}}
\def \E {\mathcal{E}}
\def \bC {\mathcal{C}}
\def \bbC {\mathcal{C}}
\def \A {\mathcal{A}}
\def \bP {\mathcal{P}}
\def \bn {\displaystyle\vn}
\def \bc {\mathcal{C}}
\def \ba {\bold{a}}
% Indicator and expectation symbols (dsfont).
\def \bone {\mathds{1}}
\def \bE {\mathds{E}}
\def \bB {\mathbb{B}}
\def \bN {\mathcal{N}}
\def \bbN {N}
\def \M {\mathcal{M}}
% Bold vectors: arm feature \x and model parameter \t.
\def \x {\mathbf{x}}
% \mathbf has no effect on lowercase Greek letters, so \boldsymbol is used to
% obtain a bold theta. NOTE(review): this overrides LaTeX's text tie accent \t.
\def \t {\boldsymbol{\theta}}
% NOTE(review): \R is calligraphic R, not the real numbers; the body writes
% both $R^d$ and $\R^d$ for Euclidean space -- confirm the intended symbol.
\def \R {\mathcal{R}}
\def \I {\mathcal{I}}

\title{Pure Exploration Distributed Linear Bandits in the Synchronous and Asynchronous Environment}

\author{Zichen Wang\\ 
\and Chuanhao Li\\ 
\and Huazheng Wang\\
}
\date{}
\expandafter{}
\begin{document}
\maketitle

\begin{abstract}
\end{abstract}

\section{Introduction}

\section{Preliminary}

\subsection{Federated linear bandits}

\paragraph{Notations}
In this paper, we let $\vert A \vert = \{1,...,A\}$, $\Vert \x \Vert$ denotes the Euclidean norm, $\Vert \x \Vert_{A} = \sqrt{\x^\top A \x}$ denotes the matrix norm, $\bI\in R^{d\times d}$ denotes the identity matrix, $\bold{0}\in R^d$ denotes the zero vector or matrix, $\text{det}(A)$ denotes the determinant of the matrix $A$ and $A^\top$ denotes the transposed of $A$.

\paragraph{Basic setting}
We consider the distributed linear bandits as follows. The distributed linear bandits consist of an agent set $\M = \{m\}_{m=1}^M$ with $M$ agents, a central server, and an environment $\A =\{\x_i \in R^d\}_{i=1}^K$ with $K$ arms. In round $t$, if an agent $m \in \M$ pulls an arm $\x_{m,i,t} \in \A$, it receives the reward $r_{m,i,t} = \x_{m,i,t}^\top \t^* + \eta_{m,i,t}$, where $\t^*\in R^d$ is the global model parameter and $\eta_{m,i,t} \in R$ is zero-mean $\sigma$-sub-Gaussian noise. Without loss of generality, we suppose $\Vert\x_i\Vert \le 1$ and $\Vert\t^*\Vert \le 1$. As in other work on pure exploration, we assume that the best arm $\x^* = \arg\max_{\x_i\in\A} \x_i^\top\t^*$ is unique. 

\paragraph{Learning objective of the pure exploration distributed linear bandit}
We focus on the fixed-confidence $(\epsilon,\delta)$-best arm identification problem. The goal of the bandit algorithm is to find an estimated best arm $\hat\x^* \in \A$ which satisfies
\begin{align}\label{1}
    P(\Delta(\x^{*}, \hat\x^{*}) \ge \epsilon) \le \delta
\end{align}
with minimum sample complexity, where $\Delta(\x^{*}, \hat\x^{*}) = y(\x^*, \hat\x^*)^\top \t^*$ and $y(\x^*, \hat\x^*) = \x^* - \hat\x^*$. The sample complexity, denoted by $S$, is the total number of interactions between the agents and the environment. 

\subsection{Communication model}
In this paper, we consider a star shape communication framework, where there is a communication channel between every agent and the server. The agents can not directly communicate with each other, they can only upload information to the server and download information from the server. We define the communication complexity $\bC_\tau$ as the number of bits transferred between the agents and server in $\tau$ rounds. Due to the real number being transferred in binary form in the information channel, the number of bits for each real number representation is only logarithmic w.r.t. instance scale. For instance, if an agent wants to upload a real number $\tau$, the instantaneous communication complexity is counted as $\log(\tau)$. Furthermore, we suppose there is no latency in the communication channel.

\paragraph{Synchronous setting} The synchronous setting supposes agents are learning collaboratively and concurrently to accelerate the learning speed. Every agent should be active in each round, i.e., should pull an arm in $\A$. In this setting, the bandit algorithm targets to accelerate the learning speed by distributed learning. The bandit algorithm would find the estimated best arm with sample complexity $\tau = M\hat{\tau}$, where $\hat{\tau}$ is the average stopping time of each agent. 

\paragraph{Asynchronous setting} The asynchronous setting considers the case that the full participation of the agents and global synchronization mandated by the server is unimplementable. In each round, there is only one active agent $m_t$ that interacts with the environment. Besides, every agent can choose when to communicate with the server, which can be independent of other agents' communication. Moreover, the bandit algorithm would find the estimated best arm with sample complexity $\tau$. 

\paragraph{Switching cost} Our algorithm can achieve a low switching cost.

\section{The Proposed Synchronous Algorithm}
We propose a pure exploration algorithm for the synchronous distributed linear bandits. We call the algorithm Distributed Synchronous Linear Pure Exploration (Dis-S-LinPE), and it is inspired by the LinGapE and UGapE. We first illustrate the confidence bounds related to the pure exploration problem.

\paragraph{Different confidence bounds}
For any fixed sequence $\x_{a_\bold{n}} = (\x_{a_1},...,\x_{a_n})\in\A$ and the following least square estimator
\begin{align}
    V_{\bold{n}} = \sum_{i=1}^n \x_{a_i}\x_{a_i}^\top,\quad b_{\bold{n}} = \sum_{i=1}^n \x_{a_i}r_{a_i},\quad \hat{\t}_\bold{n} = V_{\bold{n}}^{-1}b_{\bold{n}}.
\end{align}
Then, by Azuma's inequality (Soare), the following inequality 
\begin{align}\label{3}
    \vert \x_i^\top(\t^* - \hat\t_\bold{n}) \vert \le 2\sigma \Vert \x_i \Vert_{V_{\bold{n}}^{-1}}\sqrt{2\log\frac{6n^2K}{\delta\pi^2}},\ \forall \x_i\in\A
\end{align}
holds with probability at least $1-\delta$. If the selected sequence $\x_\bold{t} = (\x_{a_1},...,\x_{a_t})\in\A$ is adaptively determined on the past observations, then the regularized least-square estimator should be considered
\begin{align}
    V_{\bold{t}} = \lambda\bI + \sum_{i=1}^t \x_{a_i}\x_{a_i}^\top,\quad b_{\bold{t}} = \sum_{i=1}^t \x_{a_i}r_{a_i},\quad \hat{\t}_\bold{t} = V_{\bold{t}}^{-1}b_{\bold{t}}.
\end{align}
where $\lambda$ is the regularized parameter. Besides, with the confidence bound of Yasin, the following inequality 
\begin{align}
    \vert \x_i^\top(\t^* - \hat\t_\bold{t}) \vert \le \Vert \x_i \Vert_{V_{\bold{t}}^{-1}}\bigg(\sigma\sqrt{d\log\frac{1+t/\lambda}{\delta}} + \lambda^{1/2}\bigg),\ \forall \x_i\in\A
\end{align}
holds with probability at least $1-\delta$.

\paragraph{How do previous studies designed arm selection strategy}
The key problem of the pure exploration is to quickly decrease the matrix norm of $\Vert y(\x_i,\x_j)\Vert_{V^{-1}}$, $\forall \x_i,\x_j\in\A$, which can be deemed as reducing the uncertainty of the expected reward gap between arm $\x_i$ and $\x_j$. Most of the previous works decreased the matrix norm by experimental design, for instance, Soare et al. employed the $\mathcal{X}\mathcal{Y}$-static allocation to select the sequence 
\begin{align}\label{6}
    \x_{a_\bold{n}} = \arg \min_{\x_{a_{\bold{n}}}\in\A} \max_{\x_i,\x_j\in\A}\Vert y(\x_i,\x_j)\Vert_{V_\bold{n}^{-1}}.
\end{align}
This design only relies on the knowledge of the $\A$ and does not utilize the knowledge of the previous observation, which satisfies the prerequisite of the inequality (\ref{3}). Yihan Du et al. studied the pure exploration problem in collaborative kernelized bandit, which also uses an experimental design similar to (\ref{6}). However, there exist three main drawbacks of this kind of design in the distributed communication framework. First, this kind of design needs to solve some minimax optimizations similar to (\ref{6}), which may generate a large computational burden. Second, this kind of design can hardly be implemented in the asynchronous setting, we will demonstrate this point in Section 4. Third, this design treats $y(\x_i,\x_j)$, $\forall \x_i,x_j\in\A$ equally. Since the goal of the algorithm is to distinguish the best arm and other suboptimal arms, we only need to focus on $y(\x^*,\x^\prime)$, where $\x^\prime = \arg\max_{\x_i \not = \x^*}\x_i^\top\t^*$.

\paragraph{Dis-S-LinPE}
To control the amount of communication, Dis-S-LinPE runs in episodes of length $L \in N$. Assume $\tau = L(\E-1)$, where $\E$ is the total number of episodes the algorithm spends until the stopping time. The algorithm first initializes $V_{1} = \lambda\bI$, $b_{1} = \bold{0}$ and $T_{1}(\x_i) = 0$, $\forall\ \x_i\in\A$, where $V_{1} \in R^{d\times d}$ and $b_1$ are used to compute $\hat{\t}_1$, and $T_{e}(\x_i)$ denotes the total number of times arm $\x_i$ has been pulled in the first $e-1$ episodes. At the beginning of an episode $e$, the server sends $V_{e}$, $b_{e}$ and $T_{e}(\x_i)$ to all agents. Agents utilize this knowledge to compute $\hat{\t}_{e}$, and select the arm with the largest estimated reward $\x_{i,e}$ and the most ambiguous arm $\x_{j,e} = \arg\max_{\x_j\in\A} \hat{\Delta}_e(\x_j,\x_{i,e}) + \alpha_e(\x_j,\x_{i,e})$. The notation $\hat{\Delta}_e(\x_j,\x_{i,e}) = y(\x_{j}, \x_{i,e})^\top\hat{\t}_e$ denotes the estimated reward gap between arm $\x_{j}$ and $\x_{i,e}$, and $\alpha_e$ denotes the high-probability bound which will be provided in Lemma \ref{lemma1}. Then, all agents would pull the most informative arm $\x_e^*$ to derive the knowledge of the reward gap between $\x_{i,e}$ and $\x_{j,e}$. All the agents would pull $\x_e^*$ for $L$ times in episode $e$, and derive reward $r_{e,m,l},\ \forall m\in\M,\ \forall l\in \vert L \vert$. At the end of the episode, the agents would upload $V^{loc}_{e,m} = L\x_e^*\x_e^{*\top}$, $b^{loc}_{e,m} = \sum_{l=1}^L \x_e^* r_{e,m,l}$, $T^{loc}_{e,m} (\x_e^*) = L$ and $T^{loc}_{e,m}(\x_i) = 0,\ \forall \x_i \not = \x_e^*$ to the server. The server would first update $V_{e + 1} = V_{e} + \sum_{m=1}^MV^{loc}_{e,m}$ and $b_{e + 1} = b_{e} + \sum_{m=1}^Mb^{loc}_{e,m}$ and the estimated model parameter $\hat\t_{e+1} = V_{e+1}^{-1}b_{e+1}$. Then, it would check the breaking condition; if the breaking condition is reached, the algorithm stops exploration and returns the estimated best arm $\hat{\x}^*$. 
Otherwise, the server renews $T_{e + 1}(\x_i) = T_{e}(\x_i) + \sum_{m=1}^MT^{loc}_{e,m}(\x_i),\ \forall \x_i\in\A$ and begins the next episode. The specific details of the Dis-S-LinPE are shown in the Algorithm \ref{alg1}.

 \begin{algorithm}[t]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
	\caption{ (\texttt{Dis-S-LinPE}) }
	\label{alg1}
	\begin{algorithmic}[1]
            \STATE \textbf{Inputs:} Arm set $\A$, client set $\M$, episode length $L$, regularization parameter $\lambda$ and $(\delta,\epsilon)$
            \STATE Set $V_{1} = \lambda\bI$, $b_{1} = \bold{0}$ and $T_{1}(\x_i) = 0$, $\forall\ \x_i\in\A$
            \STATE \textbf{for} $i = 1:K$ \textbf{do}
            \STATE \quad Agent $1$ pulls arm $\x_i$ and receives reward $r_{i} = \x_i^\top\t^* + \eta_i$ 
            \STATE Agent $1$ uploads $\sum_{i=1}^K\x_i\x_i^\top$ and  $\sum_{i=1}^K \x_i r_i$ to the server
            \STATE Server updates $V_1 = \lambda\bI + \sum_{i=1}^K\x_i\x_i^\top$, $b_1 = \sum_{i=1}^K \x_i r_i$ and $T_1(\x_i) = 1,\ \forall \x_i \in \A$
            \STATE \textbf{for} $e = 1:\infty$ \textbf{do}
            \STATE \quad \textbf{for} $m=1:M$ \textbf{do}
            \STATE \quad \quad Server sends $V_e$, $b_e$ and $T_e(\x_i), \forall \x_i\in \A$ to agent $m$
            \STATE \quad \quad Agent $m$ sets $\hat{\t}_e = V_e^{-1}b_e$ \STATE\quad\quad Agent $m$ sets $\x_{i,e} = \arg\max_{\x_i\in\A} \x_i^\top\hat\t_e$ and  $\x_{j,e} = \arg\max_{\x_j\in\A} \hat{\Delta}_e(\x_{j},\x_{i,e}) + \alpha_e(\x_j,\x_{i,e})$
            \STATE \quad \quad Agent $m$ sets $V_{e,m}^{loc} = \bold{0}$, $b_{e,m}^{loc} = \bold{0}$ and $T_{e,m}^{loc}(\x_i) = 0,\ \forall \x_i \in \A$
            \STATE \quad Set $B(e) = \max_{\x_j\in\A} \big[\hat{\Delta}_{e}(\x_{j},\x_{i,e}) + \alpha_{e}(\x_{j},\x_{i,e})\big]$
            \STATE \quad \textbf{if} $B(e) \le \epsilon$ \textbf{then}
            \STATE \quad \quad Server returns $\x_{i,e}$ as the estimated best arm $\hat\x^*$ and break
            \STATE \quad \textbf{for} $l = 1:L$ \textbf{do}
            \STATE \quad \quad \textbf{for} $m = 1:M$ \textbf{do}
            \STATE \quad\quad\quad Agent $m$ pulls the most informative arm $\x_e^*$ by (\ref{select}) and derive $r_{e,m,l} = \x_e^{*\top}\t^* + \eta_{e,m,l}$
            \STATE \quad \quad \quad Agent $m$ sets $V^{loc}_{e,m} = V^{loc}_{e,m} + \x^*_e\x^{*\top}_e$, $b_{e,m}^{loc} = b_{e,m}^{loc} + \x_e^*r_{e,m,l}$ and $T_{e,m}^{loc}(\x_e^*) = T_{e,m}^{loc}(\x_e^*) + 1$
            \STATE \quad \textbf{for} $m=1:M$ \textbf{do}
            \STATE \quad \quad Agent $m$ sends $V^{loc}_{e,m}$, $b_{e,m}^{loc}$ and $T_{e,m}^{loc}(\x_i),\ \forall \x_i\in\A$ to the server
            \STATE \quad Server computes $V_{e+1} = V_{e} + \sum_{m=1}^M V^{loc}_{e,m}$, $b_{e+1} = b_e + \sum_{m=1}^M b_{e,m}^{loc}$ and $T_{e + 1}(\x_i) = T_{e}(\x_i) + \sum_{m=1}^MT^{loc}_{e,m}(\x_i),\ \forall \x_i\in\A$ 
	\end{algorithmic}  
\end{algorithm}

\paragraph{Arm selection strategy}
We now present our arm selection strategy, which is motivated by Liyuan Xu et al.  In each episode, agents should pull arm $\x_e^*$ to decrease $\Vert y(\x_{i,e}, \x_{j,e}) \Vert_{V_e^{-1}}$ and $\alpha_e$. The arm $\x_e^*$ is selected as
\begin{align} \label{select}
    \x_e^* = \arg\min_{ \x_a\in\A} T_e(\x_a) / p_a^*(y(\x_{i,e}, \x_{j,e}))
\end{align}
where 
\begin{align}
    p_a^*(y(\x_{i,e}, \x_{j,e})) = \frac{\vert w_a^*(y(\x_{i,e}, \x_{j,e})) \vert}{\sum_{s=1}^K \vert w_s^*(y(\x_{i,e}, \x_{j,e})) \vert}
\end{align}
and $w_i^*(y(\x_{i,e}, \x_{j,e}))$ is the solution of the linear programming
\begin{align}\label{9}
    \arg\min_{w_i} \sum_{i=1}^K\vert 
w_i \vert \quad\quad s.t.\ y(\x_{i,e},\x_{j,e}) = \sum_{i=1}^K w_i\x_i.
\end{align}
The linear programming in (\ref{9}) is far more simple to solve.

\begin{lemma} \label{suplemma1} The arm $\hat{\x}^*$ returned by Algorithm \ref{alg1} satisfies the $(\epsilon,\delta)$-condition in (\ref{1}).
\end{lemma}

\begin{proof}[Proof of Lemma \ref{suplemma1}] Suppose the stopping episode is $ \E $. According to the breaking condition of Algorithm \ref{alg1} (the test $B(e) \le \epsilon$ in lines 14--15), we have 
\begin{align}\label{14}
\epsilon \ge \max_{\x_{j}\in\A} \hat\Delta_{\E} (\x_j,\x_{i,\E}) + \alpha_{\E} = B(\E).
\end{align}
Due to 
\begin{align}
    \max_{\x_{j}\in\A} \hat\Delta_{\E} (\x_j,\x_{i,\E}) + \alpha_{\E} \ge \hat\Delta_{\E} (\x^*,\x_{i,\E}) + \alpha_{\E}
\end{align}
where $\hat{\x}^* = \x_{i,\E}$ is the estimated best arm. Besides, on the event $\I_s$, we have $\Delta(\x^*,\hat\x^*) - \hat{\Delta}_\E(\x^*,\hat\x^*) \le \alpha_\E$. Combining these with (\ref{14}), we have
\begin{align}
    \epsilon \ge \hat\Delta_{\E} (\x^*,\hat\x^*) + \alpha_{\E} \ge \Delta(\x^*,\hat\x^*).
\end{align}
Furthermore, due to the event $\I_s$ would happen with probability at least $1-\delta$, the $(\epsilon,\delta)$-condition (\ref{1}) is satisfied and here we finish the proof.
\end{proof}

\begin{lemma}\label{lemma1} 
Follows the results of Yasin, we set $\alpha_e(\x_{j,e},\x_{i,e}) = \Vert y(\x_{j,e},\x_{i,e}) \Vert_{V_e^{-1}} \beta_e$, where $\beta_e$ is 
\begin{align}
    \beta_e = \sigma \sqrt{d\log\frac{1 + \sum_{i=1}^K T_e(\x_i)/\lambda}{\delta}} + \lambda^{1/2} = \sigma \sqrt{d\log\frac{1 + ((e-1)L + K)M/\lambda}{\delta}} + \lambda^{1/2}.
\end{align}
Then, define event $\I_s = \{\forall e \in \vert\E\vert,\ \forall \x_i,\x_j\in\A,\ \vert \Delta(\x_i,\x_j) - \hat{\Delta}_e(\x_i,\x_j) \vert \le \alpha_e \}$. Event $\I_s$ would happen with probability at least $1-\delta$.
\end{lemma}

\begin{proof}[Proof of Lemma \ref{lemma1}] 

Recall Yasin's bound, which states that
\begin{align}\label{11}
    \vert \x_i^\top(\t^* - \hat\t_\bold{t}) \vert \le \Vert \x_i \Vert_{V_{\bold{t}}^{-1}}\bigg(\sigma\sqrt{d\log\frac{1+t/\lambda}{\delta}} + \lambda^{1/2}\bigg),\ \forall \x_i\in\A
\end{align}
holds with probability at least $1-\delta$, where $V_\bold{t}$ contains all the data from round $1$ to round $t$. In Algorithm \ref{alg1}, every agent uploads its local data ($V_{e,m}^{loc}$ and $b_{e,m}^{loc}$) at the end of the episode. At that instant, the server has all the data, and it sends them to all the agents when the next episode begins. Hence, at the beginning of an arbitrary episode $e\in\vert\E\vert$, $V_e$ and $b_e$ contain all the data collected by all agents from episode $1$ to episode $e-1$. Moreover, in an episode, one agent samples the environment at most $L$ times and there are $M$ agents. Hence, we can rewrite the $t$ in (\ref{11}) as $((e-1)L + K)M$, $\hat{\t}_\bold{t}$ as $\hat{\t}_e$ and $V_\bold{t}$ as $V_e$, and the inequality still holds, i.e.,
\begin{align}
    \vert \x_i^\top(\t^* - \hat\t_e) \vert \le \Vert \x_i \Vert_{V_{e}^{-1}}\bigg(\sigma\sqrt{d\log\frac{1+((e-1)L + K)M/\lambda}{\delta}} + \lambda^{1/2}\bigg),\ \forall \x_i\in\A.
\end{align}
According to $\Delta(\x_i,\x_j) = y(\x_i,\x_j)^\top \t^*$ and $\hat{\Delta}_e(\x_i,\x_j) = y(\x_i,\x_j)^\top \hat{\t}_e$, we have for every $e\in\E$, 
\begin{align}
    \begin{split}
    \vert \Delta(\x_i,\x_j) - \hat{\Delta}_e(\x_i,\x_j) \vert &= \vert y(\x_i,\x_j)^\top \t^* - y(\x_i,\x_j)^\top \hat{\t}_e \vert\\
    & = \vert y(\x_i,\x_j)^\top (\t^* - \hat\t_e) \vert\\
    &\le \Vert y(\x_i,\x_j)\Vert_{V_e^{-1}} \Vert \t^* - \hat\t_e \Vert_{V_e}\\
    &\le \Vert y(\x_i,\x_j)\Vert_{V_e^{-1}} \bigg(\sigma \sqrt{d\log\frac{1 + ((e-1)L + K)M/\lambda}{\delta}} + \lambda^{1/2}\bigg)
    \end{split}
\end{align}
holds with probability at least $1-\delta$.
\end{proof}

\begin{lemma} \label{lemma2} The matrix norm $\Vert y(\x_{i}, \x_{j}) \Vert_{V_e^{-1}}$ can be bounded by
\begin{align}
    \Vert y(\x_{i}, \x_{j}) \Vert_{V_e^{-1}} \le \sqrt{\frac{\rho(y(\x_{i}, \x_{j}))}{T_e(\x_i,\x_j)}},\ \forall \x_i,\x_j \in \A
\end{align}
where
\begin{align}\label{18}
    T_e(\x_i,\x_j) = \min_{\x_a\in \A,\ p_a^*(y(\x_i,\x_j)) > 0}T_e(\x_a)/p_a^*(y(\x_i,\x_j))
\end{align}
and
\begin{align}
    \rho(y(\x_{i}, \x_{j})) = \sum_{i=1}^K \vert w^*_i(y(\x_i,\x_j)) \vert.
\end{align}
\end{lemma}

\begin{proof}[Proof of Lemma \ref{lemma2}] We can decompose the covariance matrix $V_e$ as
\begin{align}
    V_e  = \lambda\bI + \sum_{i=1}^K
    T_e(\x_i) \x_i\x_i^\top.
\end{align}
We set $V_e^{\prime} = \lambda\bI + \sum_{s=1}^K T_e(\x_i,\x_j)p_s^*(y(\x_i,\x_j)) \x_s\x_s^\top$. From (\ref{18}), we have 
\begin{align}
    T_e(\x_i,\x_j)p_s^*(y(\x_i,\x_j)) \le T_e(\x_s), \forall \x_s\in\A
\end{align}
which implies $V_e^\prime \preceq V_e$ and 
\begin{align}
    y(\x_i,\x_j)^\top V_e^{-1} y(\x_i,\x_j) \le y(\x_i,\x_j)^\top V_e^{\prime-1} y(\x_i,\x_j),\ \x_i,\x_j\in\A.
\end{align}
We then bound $y(\x_i,\x_j)^\top V_e^{\prime-1} y(\x_i,\x_j)$, according to the KKT condition of (\ref{9}), we have the following formulas
\begin{align}
\begin{split}
    & w_s^*(y(\x_i,\x_j)) = \frac{1}{2} p_s^*(y(\x_i,\x_j)) \x_s^\top\varepsilon,\ \forall s\in\vert K \vert \\
    & y(\x_i,\x_j) = \frac{1}{2}\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s \x_s^\top \varepsilon,\ \forall \x_i,\x_j\in\A
\end{split}
\end{align}
where $\varepsilon\in\R^d$ corresponds to the Lagrange multiplier. We can rewrite $y(\x_i,\x_j)^\top V_e^{\prime-1} y(\x_i,\x_j)$ as
\begin{align}\label{24}
    y(\x_i,\x_j)^\top V_e^{\prime-1} y(\x_i,\x_j) = \frac{1}{4} \bigg(\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s \x_s^\top \varepsilon\bigg)^\top V_e^{\prime-1} \bigg(\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s \x_s^\top \varepsilon\bigg).
\end{align}
Besides, we can rewrite $\rho(y(\x_i,\x_j))$ as 
\begin{align}\label{25}
    \rho(y(\x_i,\x_j)) = \sum_{s=1}^K \frac{w_s^{*2}(y(\x_i,\x_j))}{p_s^*(y(\x_i,\x_j))} = \frac{1}{4} \varepsilon^\top \bigg( \sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top \bigg) \varepsilon
\end{align}
In the light of (\ref{24}) and (\ref{25}), and recalling that $V_e^{\prime} = \lambda\bI + T_e(\x_i,\x_j)\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top$, we can bound $ y(\x_i,\x_j)^\top V_e^{\prime-1} y(\x_i,\x_j) - \rho(y(\x_i,\x_j))/T_e(\x_i,\x_j)$ by $0$:
\begin{align}
    \begin{split}
    \nonumber
&y(\x_i,\x_j)^\top V_e^{\prime-1} y(\x_i,\x_j) - \frac{\rho(y(\x_i,\x_j))}{T_e(\x_i,\x_j)}\\ =& \frac{1}{4}\varepsilon^\top \bigg( \bigg(\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top\bigg) - \frac{V_e^{\prime}}{T_e(\x_i,\x_j)} \bigg)V_e^{\prime-1}\bigg(\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top\bigg) \varepsilon\\
=& \frac{1}{4}\varepsilon^\top \bigg( \bigg(\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top\bigg) - \frac{\lambda\bI}{T_e(\x_i,\x_j)} - \bigg(\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top\bigg) \bigg)V_e^{\prime-1}\bigg(\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top\bigg) \varepsilon\\
=& - \frac{\lambda}{4T_e(\x_i,\x_j)}\varepsilon^\top V_e^{\prime-1}\bigg(\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top\bigg) \varepsilon\\
\le & 0.
    \end{split}
\end{align}
The last inequality holds because $V_e^{\prime-1}$ and $\sum_{s=1}^K p_s^*(y(\x_i,\x_j))\x_s\x_s^\top$ commute ($V_e^\prime$ is $\lambda\bI$ plus a multiple of the latter) and are both positive semidefinite, so their product is positive semidefinite. Here we finish the proof of Lemma \ref{lemma2}.
\end{proof}

\begin{lemma}\label{suplemma2}
    Under event $\I_s$, for every $\x_{i,e}$ and $\x_{j,e}$ $\in\A$, $B(e)$ can be bounded as follows
    \begin{align}\label{27}
    B(e) \le -\max(\Delta(\x^*,\x_{i,e}),\Delta(\x^*,\x_{j,e})) + 3\alpha_e(\x_{j,e},\x_{i,e}).
    \end{align}
\end{lemma}

\begin{proof}[Proof of Lemma \ref{suplemma2}] According to the definition of the event $\I_s$, consider the case when $\x_{i,e} = \x^*$, we have
\begin{align}
\begin{split}\label{28}
    B(e) &=  \hat{\Delta}_e(\x_{j,e},\x_{i,e}) + \alpha_e(\x_{j,e},\x_{i,e})\\& \le  \Delta(\x_{j,e},\x_{i,e}) + 2\alpha_e(\x_{j,e},\x_{i,e})\\&
    = - \max(\Delta(\x^*,\x_{i,e}),\Delta(\x^*,\x_{j,e})) + 2\alpha_e(\x_{j,e},\x_{i,e})
\end{split}
\end{align}
where the third step owing to $\Delta(\x_{j,e},\x_{i,e}) = - \Delta(\x^*,\x_{j,e}) = - \max(\Delta(\x^*,\x_{i,e}),\Delta(\x^*,\x_{j,e}))$. 

Consider the case when $\x_{j,e} = \x^*$, we have
\begin{align}
\begin{split}\label{29}
    B(e) &=  \hat{\Delta}_e(\x_{j,e},\x_{i,e}) + \alpha_e(\x_{j,e},\x_{i,e})\\& \le  \Delta(\x_{j,e},\x_{i,e}) + 2\alpha_e(\x_{j,e},\x_{i,e})\\&
    = - \max(\Delta(\x^*,\x_{i,e}),\Delta(\x^*,\x_{j,e})) + 2\alpha_e(\x_{j,e},\x_{i,e})
\end{split}
\end{align}
where the third step owing to $\Delta(\x_{j,e},\x_{i,e}) = - \Delta(\x^*,\x_{i,e}) = - \max(\Delta(\x^*,\x_{i,e}),\Delta(\x^*,\x_{j,e}))$. 

Consider the case when $\x_{i,e} \not= \x^*$ and $\x_{j,e} \not= \x^*$, then we can derive
\begin{align}
\begin{split}\label{30}
    B(e) &= \hat{\Delta}_e(\x_{j,e},\x_{i,e}) + \alpha_e(\x_{j,e},\x_{i,e})\\
    &\le \Delta(\x_{j,e},\x_{i,e}) + 2\alpha_e(\x_{j,e},\x_{i,e})\\
    &= \Delta(\x_{j,e},\x^*) + \Delta(\x^*,\x_{i,e}) + 2\alpha_e(\x_{j,e},\x_{i,e})\\
    & \le \Delta(\x_{j,e},\x^*) + \hat{\Delta}_e(\x^*,\x_{i,e}) + 3\alpha_e(\x_{j,e},\x_{i,e})\\
    & \le -\Delta(\x^*,\x_{j,e}) + 3\alpha_e(\x_{j,e},\x_{i,e})
\end{split}
\end{align}
where the fourth step is due to the definition of the event $\I_s$, the fifth step is due to the definition of the $\x_{i,e}$. Similar to (\ref{30}), we also can show
\begin{align}
\begin{split}\label{31}
     B(e) &= \hat{\Delta}_e(\x_{j,e},\x_{i,e}) + \alpha_e(\x_{j,e},\x_{i,e})\\
    &\le  2\alpha_e(\x_{j,e},\x_{i,e})\\
    & \le -\Delta(\x^*,\x_{i,e}) +  \hat\Delta_e(\x^*,\x_{i,e}) + \alpha_e(\x^*,\x_{i,e}) + 2\alpha_e(\x_{j,e},\x_{i,e})\\
    & \le -\Delta(\x^*,\x_{i,e}) +  \hat\Delta_e(\x_{j,e},\x_{i,e}) + \alpha_e(\x_{j,e},\x_{i,e}) + 2\alpha_e(\x_{j,e},\x_{i,e})\\
    &\le  -\Delta(\x^*,\x_{i,e}) + 3\alpha_e(\x_{j,e},\x_{i,e})
\end{split}
\end{align}
The third inequality is due to the definition of the event $\I_s$ and the fourth inequality is due to the definition of $\x_{j,e}$. Combining (\ref{28}), (\ref{29}), (\ref{30}) and (\ref{31}), we finish the proof of (\ref{27}).
\end{proof}

With Lemma \ref{lemma1}, Lemma \ref{lemma2} and Lemma \ref{suplemma2}, we can bound the sample complexity and communication complexity of the Dis-S-LinPE.

\begin{theorem} [Sample complexity and communication complexity of the Dis-S-LinPE] \label{theorem1} 
\end{theorem}

\begin{proof}[Proof of Theorem \ref{theorem1}] Combine the Lemma \ref{lemma1} and Lemma \ref{lemma2}, we can bound the stopping time $\tau$ and thus the sample complexity $\tau M = M (\E-1) L + KM = \sum_{i}^K T_{\E}(\x_i) + KM$. We first need to derive the upper bound of $T_{\E}(\x_s)$, $\forall \x_s\in\A$.

Let $e^\prime$ be the last episode that $\x_s = \x_{e^\prime}^*$ and $e^\prime \not = \E,\E-1$, where $e^\prime \le \E - 1$. Then, with the result of the Lemma \ref{lemma1}, Lemma \ref{suplemma2} and the Algorithm \ref{alg1}, we have
    \begin{align}\label{eq:eps-lt-B}
    \begin{split}
    \epsilon &< B(e^\prime)\\& \le -\max(\Delta(\x^*,\x_{i,e^\prime}),\Delta(\x^*,\x_{j,e^\prime})) + 3\alpha_{e^\prime}(\x_{j,e^\prime},\x_{i,e^\prime})\\
    &= - \max(\Delta(\x^*,\x_{i,e^\prime}),\Delta(\x^*,\x_{j,e^\prime})) + 3\Vert y(\x_{j,e^\prime}, \x_{i,e^\prime})\Vert_{V_{e^\prime}^{-1}}\beta_{e^\prime}
    \end{split}
    \end{align}
In the light of the Lemma \ref{lemma2}, (\ref{eq:eps-lt-B}) can be further bounded by
\begin{align}
  \begin{split}
\epsilon < - \max(\Delta(\x^*,\x_{i,e^\prime}),\Delta(\x^*,\x_{j,e^\prime})) + 3\sqrt{\frac{\rho(y(\x_{i,e^\prime}, \x_{j,e^\prime}))}{T_{e^\prime}(\x_{i,e^\prime},\x_{j,e^\prime})}}\beta_{e^\prime}.
  \end{split}  
\end{align}
This implies
\begin{align}
    T_{e^\prime}(\x_{i,e^\prime},\x_{j,e^\prime}) <  \frac{3\rho(y(\x_{i,e^\prime}, \x_{j,e^\prime}))}{\max\Big(\frac{\epsilon+\Delta(\x^*,\x_{i,e^\prime})}{3},\frac{\epsilon+\Delta(\x^*,\x_{j,e^\prime})}{3}\Big)^2}\beta_{e^\prime}^2.
\end{align}
According to (\ref{18}), we have 
\begin{align}
\begin{split}
    T_{\E-1}(\x_s) &\le p_s^*(y(\x_{i,e^\prime},\x_{j,e^\prime})) T_{e^\prime}(\x_{i,e^\prime},\x_{j,e^\prime})\\&<  \frac{3p_s^*(y(\x_{i,e^\prime},\x_{j,e^\prime}))\rho(y(\x_{i,e^\prime}, \x_{j,e^\prime}))}{\max\Big(\frac{\epsilon+\Delta(\x^*,\x_{i,e^\prime})}{3},\frac{\epsilon+\Delta(\x^*,\x_{j,e^\prime})}{3}\Big)^2}\beta_{e^\prime}^2\\
    &\le  \max_{\x_i,\x_j\in\A}\frac{3p_s^*(y(\x_{i},\x_{j}))\rho(y(\x_{i}, \x_{j}))}{\max\Big(\frac{\epsilon+\Delta(\x^*,\x_{i})}{3},\frac{\epsilon+\Delta(\x^*,\x_{j})}{3}\Big)^2}\beta_{\E-1}^2
\end{split}
\end{align}

For $\x_s = \x_{e^\prime}^*$ and $e^\prime = \E - 1$, we have
\begin{align}
\begin{split}
      T_{\E-1}(\x_s) & = T_{\E-2}(\x_s) + ML \\&\le p_s^*(y(\x_{i,\E-2},\x_{j,\E-2})) T_{\E-2}(\x_{i,\E-2},\x_{j,\E-2}) + ML\\ 
    &\le  \max_{\x_i,\x_j\in\A}\frac{3p_s^*(y(\x_{i},\x_{j}))\rho(y(\x_{i}, \x_{j}))}{\max\Big(\frac{\epsilon+\Delta(\x^*,\x_{i})}{3},\frac{\epsilon+\Delta(\x^*,\x_{j})}{3}\Big)^2}\beta_{\E-1}^2 + ML
\end{split}
\end{align}
The first equality is due to $T_{\E-2,m}^{loc} (\x_s) = L$, $\forall m\in\M$ and $T_{\E-1}(\x_s) = T_{\E-2}(\x_s) + \sum_{m=1}^M T_{\E-2,m}^{loc} (\x_s)$.

Since at the episode $\E$ agent would break the loop before exploration, thus, $\tau = (\E-1) L$ and $\tau = \sum_{s=1}^K T_{\E-1}(\x_s)$. We can finally bound $\tau$ by 
\begin{align}\label{36}
    \tau \le H_{\epsilon}\beta_{\E-1}^2 + ML,
\end{align}
where 
\begin{align}
    H_{\epsilon} = \sum_{s=1}^K\max_{\x_i,\x_j\in\A}\frac{3p_s^*(y(\x_{i},\x_{j}))\rho(y(\x_{i}, \x_{j}))}{\max\Big(\frac{\epsilon+\Delta(\x^*,\x_{i})}{3},\frac{\epsilon+\Delta(\x^*,\x_{j})}{3}\Big)^2}.
\end{align}
With this result, we can derive the average sample complexity of each agent can be bounded by
\begin{align}
    \hat\tau \le \frac{H_{\epsilon}\beta_{\E-1}^2}{M} + L,
\end{align}
this implies the larger the $M$, the smaller the average sample complexity. Here finish the proof of Theorem \ref{theorem1}.
\end{proof} 

We will use two corollaries to further analyze the relationship between the average sample complexity and $L$.

\begin{corollary}\label{corollary1}
Let $\beta_e$ be
\begin{align}\label{38}
    \beta_e = R\sqrt{2\log\frac{K^2\text{det}(V_e)^{1/2}\text{det}(\lambda\bI)^{-1/2}}{\delta}} + \lambda^{1/2} 
\end{align}
\end{corollary}

\begin{proof}[Proof of Corollary \ref{corollary1}] With Lemma 10 of Yasin, we can first decompose (\ref{38}) as
\begin{align}
\begin{split}\label{40}
    \beta_{\E-1} &= R\sqrt{2\log\frac{K^2\text{det}(V_{\E-1})^{1/2}\text{det}(\lambda\bI)^{-1/2}}{\delta}} + \lambda^{1/2} \\
    &\le R\sqrt{2\log\frac{K^2}{\delta} + d\log\bigg(1 + \frac{(\E -1)LM + KM}{\lambda d}\bigg)} + \lambda^{1/2}\\
    &\le 2R\sqrt{2\log\frac{K^2}{\delta} + d\log\bigg(1 + \frac{(\E -1)LM +KM}{\lambda d}\bigg)} 
\end{split}
\end{align}

In the light of (\ref{36}) and (\ref{40}), we can derive 
\begin{align}
    \tau = (\E-1)LM \le 4R^2 H_\epsilon \bigg( 2\log\frac{K^2}{\delta} + d\log\bigg(1 + \frac{(\E -1)LM + KM}{\lambda d}\bigg) \bigg) + ML
\end{align}

Let $\E^\prime$ satisfy
\begin{align}
\begin{split}
  (\E-1)LM &= 4R^2 H_\epsilon \bigg( 2\log\frac{K^2}{\delta} + d\log\bigg(1 + \frac{(\E^\prime -1)LM +KM}{\lambda d}\bigg) \bigg) + ML\\
   \E & = \frac{4R^2H_\epsilon}{LM} \bigg( 2\log\frac{K^2}{\delta} + d\log\bigg(1 + \frac{(\E^\prime -1)LM + KM}{\lambda d}\bigg) \bigg) + 2
\end{split}
\end{align}
which implies $\E^\prime \le \E$. We set 
\begin{align}
    \Theta = \frac{8R^2H_\epsilon}{LM}\log\frac{K^2}{\delta} + 2
\end{align}
then we can bound $\E^\prime$ by
\begin{align}
    \begin{split}
        \E^\prime \le \E = \Theta + \frac{4R^2H_\epsilon d}{LM} \log\bigg(1 + \frac{(\E^\prime -1)LM + KM}{\lambda d}\bigg)
    \end{split}
\end{align}
Owing to $\log(x + 1)<\sqrt{x}$ for every $x > 0$, we can further get
\begin{align}\label{45}
\begin{split}
    \E^\prime &\le \Theta + \frac{4R^2H_\epsilon}{LM} \sqrt{ \frac{((\E^\prime -1)LM + KM)d}{\lambda }}\\
    (\E^\prime - 1)^2 & \le \Theta^2 + \frac{16 R^4 H_\epsilon^2(\E^\prime - 1)d}{LM\lambda} +  \frac{8\Theta R^2H_\epsilon}{LM} \sqrt{ \frac{(\E^\prime -1)LMd}{\lambda }}
\end{split}
\end{align}
By solving (\ref{45}), we should
\end{proof}

\begin{remark} Most of the previously studied algorithms for pure exploration distributed bandits and collaborative bandits are falling into the category of elimination based algorithms. They utilized the communication protocol similar to our Dis-S-linPE, i.e., divide the time horizon (or stopping time) into episodes, the communication would happen at the beginning and the end of every episode. However, they all utilized an optimal design for arm selection, which designed the sequence of arm selection before observing the rewards. This implies the length of their exploration episode should just be the length of their exploration sequence. In our algorithm, we can freely determine the episode length $L$, which implies our algorithm has a higher flexibility. Intuitively, the larger the $L$, the smaller the communication complexity and the larger the sample complexity. Hence, the user can choose a suitable $L$ to balance the sample complexity and communication complexity trade-off. 
\end{remark}

\section{The Proposed Asynchronous Algorithm}

In this section, we study the pure exploration problem in the asynchronous environment. Existing pure exploration algorithms for distributed or collaborative bandits all focus on the synchronous setting and design their exploration strategy by experimental design. This restricts their communication protocols to a fixed-round scheme, in which all the agents download exploration information from the server at the beginning of an episode and upload their exploration results to the server at the end of the episode (similar to Algorithm~\ref{alg1}). Besides, this kind of exploration strategy must be determined before the rewards are observed, which can hardly be implemented by the agents in the asynchronous setting, because the active agent in each round is unknown to the server and no agent knows the other agents' historical actions. To address these problems, we propose the Distributed Asynchronous Linear Pure Exploration (Dis-A-LinPE) algorithm. In Dis-A-LinPE, every agent designs its exploration strategy based on the data it has and communicates with the server independently of other agents.


 \begin{algorithm}[t]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
	\caption{ (\texttt{Dis-A-LinPE}) }
	\label{alg2}
	\begin{algorithmic}[1]
            \STATE \textbf{Inputs:} Arm set $\A$, client set $\M$, regularization parameter $\lambda$ and $(\delta,\epsilon)$
            \STATE \textbf{for} $t = 1:K$ \textbf{do}
            \STATE \quad Agent $m_t$ pulls arm $\x_{t}$ and receives reward $r_{t} = \x_t^\top\t^* + \eta_t$ 
            \STATE \quad Agent $m_t$ uploads $r_t$ to the server
            \STATE Server sets $V_{K+1}^{ser} = \lambda\bI + \sum_{i=1}^K\x_i\x_i^\top$, $b_{K+1}^{ser} = \sum_{i=1}^K \x_i r_i$ and $T_{K+1}^{ser}(\x_i) = 1,\ \forall \x_i \in \A$
            \STATE \textbf{for} $m=1:M$ \textbf{do}
            \STATE \quad Agent $m$ downloads information from the server and sets $V_{m,K+1} = V_{K+1}^{ser}$, $V_{m,K+1}^{loc} = \bold{0}$, $b_{m,K+1} = b_{K+1}^{ser}$, $b_{m,K+1}^{loc} = \bold{0}$ and $T_{m,K+1}(\x_i) = T^{ser}_{K+1}(\x_i)$, $T_{m,K+1}^{loc}(\x_i) = 0$, $\forall\x_i\in\A$
            \STATE \textbf{for} $t = K+1:\infty$ \textbf{do}
            \STATE \quad Agent $m_t$ sets $\hat{\t}_{m_t,t} = V_{m_t,t}^{-1}b_{m_t,t}$
            \STATE \quad Agent $m_t$ sets $\x_{m_t,i,t} = \arg\max_{\x_i\in\A} \x_i^\top\hat\t_{m_t,t}$, $\x_{m_t,j,t} = \arg\max_{\x_j\in\A} \hat{\Delta}_{m_t,t}(\x_{j},\x_{m_t,i,t}) + \alpha_{m_t,t}$
            \STATE \quad Agent $m_t$ selects the most informative arm $\x_{m_t,t}^*$ by (\ref{select2}) and receive reward $r_{m_t,t}$
            \STATE \quad Agent $m_t$ sets $V_{m_t,t}^{loc} = V_{m_t,t-1}^{loc} + \x_{m_t,t}^*\x_{m_t,t}^{*\top}$, $b_{m_t,t}^{loc} = b_{m_t,t-1}^{loc} + r_{m_t,t}\x_{m_t,t}^*$ and $T_{m_t,t}^{loc}(\x_{m_t,t}^*) = T_{m_t,t-1}^{loc}(\x_{m_t,t}^*)+1$
            \STATE \quad \textbf{if} $\text{det}(V_{m_t,t} + V_{m_t,t}^{loc}) > (1+\gamma)\text{det}(V_{m_t,t})$ \textbf{or} $\sum_{i=1}^K (T_{m_t,t}(\x_i) + T^{loc}_{m_t,t}(\x_i)) > (1+\gamma) \sum_{i=1}^K T_{m_t,t}(\x_i)$ \textbf{then}
            \STATE \quad\quad Agent $m_t$ sends $V^{loc}_{m_t,t}$, $b_{m_t,t}^{loc}$ and $T_{m_t,t}^{loc}(\x_i),\ \forall \x_i\in\A$ to the server
            \STATE \quad\quad Server sets $V^{ser}_{t+1} = V^{ser}_{t} + V^{loc}_{m_t,t}$, $b^{ser}_{t+1} = b^{ser}_{t} + b_{m_t,t}^{loc}$ and $T_{t+1}^{ser}(\x_i) = T_{t}^{ser}(\x_i) + T_{m_t,t}^{loc}(\x_i)$
            \STATE \quad \quad Server sets $\hat{\t}^{ser}_{t} = V_{t+1}^{ser-1}b^{ser}_{t+1}$ 
            \STATE \quad \quad Server sets $\x_{i,t}^{ser} = \arg\max_{\x_i\in\A} \x_i^\top\hat\t^{ser}_{t}$ 
            
            \STATE \quad \quad Server sets $B(t) = \max_{\x_j\in\A} \big( \hat{\Delta}_{t}^{ser}(\x_{j},\x_{i,t}^{ser}) + \alpha_{t}^{ser}(\x_{i,t}^{ser},\x_{j}) \big)$
            \STATE \quad \quad \textbf{if} $B(t) \le \epsilon$ \textbf{then}
            \STATE \quad \quad \quad Server returns $\x_{i,t}^{ser}$ as the estimated best arm $\hat\x^*$ and break the loop
             \STATE \quad \quad Server sends $V^{ser}_{t+1}$, $b^{ser}_{t+1}$ and $T_{t+1}^{ser}(\x_i)$, $\forall \x_i\in\A$, back to agent $m_t$
            \STATE \quad \quad Agent $m_t$ sets $V_{m_t,t+1} = V^{ser}_{t+1}$, $ b_{m_t,t+1} = b^{ser}_{t+1}$ and $ T_{m_t,t+1}(\x_i) = T_{t+1}^{ser}(\x_i)$, $\forall \x_i\in\A$
            \STATE \quad \quad Agent $m_t$ sets $V_{m_t,t}^{loc} = \bold{0}$, $b_{m_t,t}^{loc} = \bold{0}$ and $T_{m_t,t}^{loc}(\x_i) = 0$, $\forall \x_i\in \A$
            \STATE \quad \textbf{else}
            \STATE \quad \quad Agent $m_t$ sets $V_{m_t,t+1} = V_{m_t,t}$, $ b_{m_t,t+1} = b_{m_t,t}$ and $ T_{m_t,t+1}(\x_i) = T_{m_t,t}(\x_i)$, $\forall \x_i\in\A$
            \STATE \quad\quad Agent $m_t$ keeps $V^{loc}_{m_t,t}$, $b^{loc}_{m_t,t}$ and $T^{loc}_{m_t,t}(\x_i)$, $\forall \x_i\in\A$, unchanged
	\end{algorithmic}  
\end{algorithm}

\paragraph{Dis-A-LinPE}
The main difference between Dis-S-LinPE and Dis-A-LinPE is that the communication in Dis-A-LinPE is triggered by the communication event (line 13 of Algorithm~\ref{alg2}). In each round $t$, the active agent $m_t$ selects $\x_{m_t,i,t}$, $\x_{m_t,j,t}$ and the most informative arm $\x^*_{m_t,t}$, similarly to Algorithm~\ref{alg1}. Then, the active agent updates its local data $V^{loc}_{m_t,t}$, $b^{loc}_{m_t,t}$ and $T^{loc}_{m_t,t}(\x_i)$, $\forall \x_i\in\A$. If the communication event is triggered, $m_t$ uploads $V^{loc}_{m_t,t}$, $b^{loc}_{m_t,t}$ and $T^{loc}_{m_t,t}(\x_i)$, $\forall \x_i\in\A$ to the server and downloads $V^{ser}_{t+1}$, $b^{ser}_{t+1}$ and $T^{ser}_{t+1}(\x_i)$, $\forall \x_i\in\A$ from the server. Otherwise, no communication happens, and agent $m_t$ and the server keep their current data unchanged (lines 24--26 of Algorithm~\ref{alg2}). The algorithm runs until the stopping condition is met (lines 19--20 of Algorithm~\ref{alg2}).

\paragraph{Global and local data}
We now provide new notations to illustrate the relations between the global information and the local information. The following matrix and vector denote the global information
\begin{align}
    V_{t}^{all} = \lambda\bI + \sum_{s=1}^t \x_{m_s,s}^* \x_{m_s,s}^{*\top},\quad b_{t}^{all} = \sum_{s=1}^t \x_{m_s,s}^* r_{m_s,s},\quad T^{all}_t(\x_i) = \sum_{s=1}^t\bone\{\x_i = \x_{m_s,s}^*\},\ \forall \x_i\in\A.
\end{align}
We also define the data that has been uploaded to the server as
\begin{align}
    \begin{split}
    &V_{m,t}^{up} = \sum_{s=1}^{N_m(t)} \bone\{m_s = m\} \x_{m_s,s}^* \x_{m_s,s}^{*\top},\quad b_{m,t}^{up} = \sum_{s=1}^{N_m(t)} \bone\{m_s = m\} \x_{m_s,s}^* r_{m_s,s}\\&
    T^{up}_{m,t}(\x_i) = \sum_{s=1}^{N_m(t)}\bone\{m_s = m,\x_i = \x_{m_s,s}^*\},\ \forall \x_i\in\A,
    \end{split}
\end{align}
where $N_m(t)$ denotes the last round in which agent $m$ communicated with the server. Similarly, the data that has not yet been uploaded to the server is given by
\begin{align}
    \begin{split}
         &V_{m,t}^{loc} = \sum_{s=N_m(t)+1}^{t} \bone\{m_s = m\} \x_{m_s,s}^* \x_{m_s,s}^{*\top},\quad b_{m,t}^{loc} = \sum_{s=N_m(t)+1}^{t} \bone\{m_s = m\} \x_{m_s,s}^* r_{m_s,s}\\&
    T^{loc}_{m,t}(\x_i) = \sum_{s=N_m(t)+1}^{t}\bone\{m_s = m,\x_i = \x_{m_s,s}^*\},\ \forall \x_i\in\A.  
    \end{split}
\end{align}

\paragraph{Arm selection strategy}
We now describe our arm selection strategy in the asynchronous setting. As in the synchronous setting, the active agent $m_t$ should pull the arm $\x_{m_t,t}^*$ that decreases $\Vert y(\x_{m_t,i,t}, \x_{m_t,j,t}) \Vert_{V_{m_t,t}^{-1}}$ and $\alpha_{m_t,t}$. Specifically, $\x_{m_t,t}^*$ is selected as
\begin{align} \label{select2}
    \x_{m_t,t}^* = \arg\min_{ \x_a\in\A} T_{m_t,t}(\x_a) / p_a^*(y(\x_{m_t,i,t}, \x_{m_t,j,t}))
\end{align}
where 
\begin{align}
    p_a^*(y(\x_{m_t,i,t}, \x_{m_t,j,t})) = \frac{\vert w_a^*(y(\x_{m_t,i,t}, \x_{m_t,j,t})) \vert}{\sum_{k=1}^K \vert w_k^*(y(\x_{m_t,i,t}, \x_{m_t,j,t})) \vert}
\end{align}
and $w_i^*(y(\x_{m_t,i,t}, \x_{m_t,j,t}))$ is the solution of the linear program
\begin{align}
    \arg\min_{w_i} \sum_{i=1}^K\vert 
w_i \vert \quad\quad s.t.\ y(\x_{m_t,i,t},\x_{m_t,j,t}) = \sum_{i=1}^K w_i\x_i.
\end{align}

\begin{lemma}\label{lemma3} To bound the $\Vert\hat{\t}_{m_t,t} - \t^*  \Vert_{V_{m_t,t}^{-1}}$ with $a_{m_t,t}$, in this lemma, we should provide the relationship between the $V_{m,t}$ and $V^{all}_t$ and $T_{m,t}(\x_i)$. 
\end{lemma}

\begin{lemma} \label{lemma4}
    Note that we need to design the confidence similar to (\ref{12}), however, the active agent can not derive the knowledge of the $T_{t}^{all}(\x_i)$, hence, we should evaluate the relationship between $T_{m_t,t}(\x_i)$ and $T_t^{all}(\x_i)$, $\forall \x_i\in\A$.
\end{lemma}

\begin{lemma} We design $\alpha_{m_t,t} = \Vert y(\x_{m_t,j,t},\x_{m_t,i,t}) \Vert_{V_{m_t,t}^{-1}} \beta_{m_t,t}$. We know
\begin{align}\label{12}
\Vert\hat{\t}_{t}^{all} - \t^*  \Vert_{V_{t}^{all-1}} \le \sigma \sqrt{d\log\frac{1 + \sum_{m=1}^M\sum_{i=1}^K T^{all}_{m}(i)/\lambda}{\delta}} + \lambda^{1/2}
\end{align}
With the knowledge of the Lemma \ref{lemma3}, Lemma \ref{lemma4} and (\ref{12}), we have full knowledge to design $\beta_{m_t,t}$ to bound the $\Vert\hat{\t}_{m_t,t} - \t^*  \Vert_{V_{m_t,t}^{-1}}$. Then, define event $\I_a = \{\forall t \in \vert\tau\vert,\ \forall \x_i,\x_j\in\A,\ \vert \Delta(\x_i,\x_j) - \hat{\Delta}_{m_t,t}(\x_i,\x_j) \vert \le \alpha_{m_t,t} \}$. Event $\I_a$ would happen with probability at least $1-\delta$.
\end{lemma}

\begin{lemma} To bound the matrix norm $\Vert y(\x_{i}, \x_{j}) \Vert_{V_{m_t,t}^{-1}}$, we should first distinguish the ingredient of  the covariance matrix $V_{m_t,t}$. Obviously, 
\begin{align}
    V_{m_t,t} = \lambda\bI + \sum_{i=1}^K T_{m_t,t}(\x_i) \x_i\x_i^\top
\end{align}
\end{lemma}

\begin{lemma} Similar to Lemma \ref{lemma2}, the matrix norm $\Vert y(\x_{i}, \x_{j}) \Vert_{V_{m_t,t}^{-1}}$ can be bounded by
\begin{align}
    \Vert y(\x_{i}, \x_{j}) \Vert_{V_{m_t,t}^{-1}} \le \sqrt{\frac{\rho(y(\x_{i}, \x_{j}))}{T_{m_t,t}(\x_i,\x_j)}},\ \forall \x_i,\x_j \in \A.
\end{align}
\end{lemma}


\begin{theorem} With the above lemmas, we can bound the sample complexity $\tau$. The communication complexity is related to the sample complexity (the analysis may be very similar to that of He et al.), so we can bound it as well. 
\end{theorem}

\paragraph{Key idea, for simple analysis and low switching cost}
Suppose $t_{m,1}$ and $t_{m,2}$ are two consecutive rounds in which agent $m$ communicates with the server, so that agent $m$ uploads $T_{m,t_{m,2}}^{loc}(k)$, $\forall k\in\A$, to the server in round $t_{m,2}$. It is easy to see that, from round $t_{m,1}$ to $t_{m,2}$, agent $m$ pulls only one fixed arm whenever it is active, because agent $m$ keeps the same exploration strategy from round $t_{m,1}$ to $t_{m,2}$. This has two benefits: (1) it simplifies the analysis of the sample complexity; (2) it gives the agents a low switching cost.

\begin{lemma} [The communication complexity of the hybrid event triggered strategy] 
The triggered number of the first event can be bounded by 
\begin{align}
    C_1(\tau) \le 2(M + 1/M^2) d\log(1+\frac{T L^2}{\lambda d}).
\end{align}
The triggered number of the second event can be bounded by 
\begin{align}
    C_2(\tau) \le 2(M + 1/M^2)\log(T).
\end{align}
Hence, the total triggered number can be bounded by
\begin{align}
    C(\tau) \le C_1(\tau) + C_2(\tau) \le 2(M + 1/M^2) d\log\bigg(\Big(1+\frac{T L^2}{\lambda d}\Big) T^{1/d}\bigg).
\end{align}
\end{lemma}

\paragraph{New event triggered strategy, hybrid event triggered strategy.}
Our event-triggered strategy has two triggering conditions, which guarantee $\sum_{i=1}^K T^{ser}_t(i) \ge (1/\gamma) \sum_{i=1}^K T_{m,t}^{loc}(i)$ and $V_{t}^{ser} \succeq (1/\gamma) V_{m,t}^{loc}$. We can bound the number of times each event is triggered separately, and simply add the two bounds to obtain a bound on the total number of triggers. The first event can be bounded by the result of He et al., and the second event can be bounded by our results.

\paragraph{How to bound $\sum_{k=1}^K T^{all}_\tau(k)$? With this knowledge we can bound $\sum_{k=1}^K T^{all}_\tau(k) = \tau$.} The key idea to bound $\sum_{k=1}^K T^{all}_\tau(k)$ is to bound the $\sum_{k=1}^KT^{ser}_\tau(k)$ first, then we can utilize $\sum_{k=1}^K T^{ser}_\tau(k)$ to bound $\sum_{k=1}^K T^{all}_\tau(k)$. This is because $\sum_{k=1}^KT^{all}_\tau(k) = \sum_{k=1}^K T^{ser}_\tau(k) + \sum_{m=1}^M\sum_{k=1}^K T_{m,\tau}^{loc}(k)$, and $(1/\gamma) \sum_{k=1}^K T_{m,\tau}^{loc}(k) \le \sum_{k=1}^KT^{ser}_\tau(k)$, $\forall m\in\M$. When we set $\gamma = 1/M^2$, we have $\sum_{k=1}^K T^{all}_\tau(k) \le (1 + 1/M)\sum_{k=1}^K T^{ser}_\tau(k)$.

\paragraph{How to bound $\sum_{k=1}^K T^{ser}_\tau(k)$?} We need to bound $T_\tau^{ser}(k)$ arm by arm. The strategy for bounding a single $T_\tau^{ser}(k)$ is as follows. Let $t_k$ be the last round in which an agent $m_{t_k}$ downloads information from the server such that arm $k$ is pulled the next time $m_{t_k}$ is active. This implies that we can bound $T_{m_{t_k},t_k + 1}(k)$ by the standard argument used for UGapE or LinGapE. Then, we can use $T_{m_{t_k},t_k + 1}(k)$ to bound $T^{ser}_{\tau}(k)$, because $T_{t_k + 1}^{ser}(k) = T_{m_{t_k},t_k + 1}(k)$ and, by the definition of $t_k$, the server receives at most one $T^{loc}_{m,t} > 0$ from each agent $m\in\M$ afterwards. Hence, with $\gamma = 1/M^2$, we can bound $ T^{ser}_{\tau}(k) \le 
 T_{t_k + 1}^{ser}(k) + M \gamma T^{ser}_{\tau}(k) = T_{m_{t_k},t_k + 1}(k) + (1/M)T^{ser}_{\tau}(k)$. Since $T_{m_{t_k},t_k + 1}(k)$ can be bounded by the previous technique, we can bound $T_\tau^{ser} (k)$. Combining this with the previous discussion, we can finally bound $\sum_{k=1}^K T_{\tau}^{all}(k) = \tau $.

\begin{remark} [The motivation of the Dis-A-LinPE] In real-world practice, the $(\epsilon,\delta)$-best arm identification problem may require a very large sample complexity, especially when the number of arms $K$ is large, $\epsilon = 0$ and $\delta \approx 0$. Hence, a single agent may not have enough sampling time to solve the whole problem. Besides, in the synchronous setting, some agents may be unable to interact with the environment or communicate with the server at every time instance. In reality, clients often have different response times and are even occasionally unavailable, due to differences in their computational and communication capacities. Hence, we propose Dis-A-LinPE, which has the following properties:

\begin{enumerate}
  \item The communication between an agent and the server in Dis-A-LinPE is independent of other agents.
  \item There is only one active agent in each round, which accounts for the case where agents are unavailable.
  \item The sample complexity of the Dis-A-LinPE would not be a bit larger than the single agent setting.
\end{enumerate}

\end{remark}

\begin{remark}
Recently, Du et al.\ have provided an algorithm for the $(\epsilon,\delta)$-pure exploration problem of the collaborative kernel bandit. We here illustrate the differences between their algorithms and ours. Compared with their algorithms, our algorithms have the following advantages:

\begin{enumerate}
  \item Their algorithms considered the P2P communication network, while our algorithms consider the star shape communication network.
  \item Their algorithm needs to solve a minimax optimization problem by kernelized gradient descent to derive the optimal allocation, which may generate a large communication burden and is hard to implement in a distributed communication framework. Our algorithm only needs to solve a simple linear program.
  \item Their algorithms can only work in the synchronous setting owing to their static exploration nature, while our algorithms consider both the synchronous and asynchronous settings.
  \item In synchronous setting, their exploration episode is totally fixed, while our Algorithm \ref{alg1} can arbitrarily adjust the $L$ to balance the trade-off between sample complexity and communication.
\end{enumerate}

Their algorithms have the following advantages:

\begin{enumerate}
  \item They studied the kernelized setting, which is more general than our linear setting.
  \item In their setting, different agents face individual problems, while in our setting, all agents cooperatively solve the same problem.
  \item They consider both the fixed budget and fixed confidence problem, while we only consider the fixed confidence problem.
\end{enumerate}

\end{remark}

\section{Asynchronous pure exploration, tabular case}
The pure exploration algorithm in the tabular case is similar to the algorithm in the linear case. The key point in bounding the sample complexity of this problem is to establish the relationship between $\sum_{i=1}^K T^{all}_\tau(i)$ and $\sum_{i=1}^KT_{m,\tau}(i)$, for every $m\in\M$. We first bound $\sum_{i=1}^K T_{m,\tau}(i)$, then we bound $\sum_{i=1}^KT^{all}_{\tau}(i) = \tau$ and finish the proof. The event-triggered strategy is used to bound the communication complexity and to establish the relationship between the local data and the global data. 

\paragraph{Dis-A-TarPE} This algorithm is relatively straightforward. At the beginning of the algorithm, the whole system runs a warm-up step. The first $K$ active agents ($m_1,\dots,m_K$) download the exploration strategy from the server, pull arms $1,\dots,K$ respectively, and upload the observed rewards to the server. Then the server sends $\hat{\mu}_{m,K+1}(i)$ and $T_{m,K+1}(i)$, $\forall i\in\A$, $m\in\M$ to all the agents. For round $t>K$, the bandit algorithm runs as follows. At round $t$, an agent $m_t$ becomes active and identifies an estimated best arm $i_{m_t,t} = \arg\max_{i\in\A} \hat{\mu}_{m_t,t}(i) + \beta_{m_t,t}(i)$ and a most ambiguous arm $j_{m_t,t} = \arg\max_{j\not = i} \hat{\mu}_{m_t,t}(j,i_{m_t,t}) + \beta_{m_t,t}(j,i_{m_t,t})$, where $\hat{\mu}_{m_t,t}(j,i_{m_t,t}) = \hat{\mu}_{m_t,t}(j) - \hat{\mu}_{m_t,t}(i_{m_t,t})$. The agent pulls arm $\arg\max_{k\in\{i_{m_t,t},j_{m_t,t}\}} \beta_{m_t,t}(k)$ and checks the communication condition. If the condition is satisfied, agent $m_t$ uploads $\mu^{loc}_{m_t,t}(i)$ and $T^{loc}_{m_t,t}(i)$, $\forall i\in\A$
to the server. The server computes the index $B(t)$. If $B(t) \le \epsilon$, the server sets $\hat{i}^* = i_{t}^{ser}$ and breaks the loop. Otherwise, agent $m_t$ downloads $\hat{\mu}^{ser}_{t+1}(i)$ and $T^{ser}_{t+1}(i)$, $\forall i\in\A$ from the server. Agent $m_t$ uses the downloaded data to update its exploration strategy and begins the next episode. The details of this algorithm are given in Algorithm~\ref{alg3}.

\begin{lemma}
    Utilizing the Hoeffding inequality, we can establish the local upper confidence bound $\beta_{m_t,t}(k)$ to let event
    \begin{align}
       I_a = \{\forall k\in\A,\forall t \in \vert \tau \vert,\ \vert \hat{\mu}_{m_t,t}(k) - \mu(k) \vert \le \beta_{m_t,t}(k)\}
    \end{align}
    holds with probability at least $1-\delta$.
\end{lemma}

\begin{lemma}\label{lemma12} If Algorithm~\ref{alg3} selects the arm as in line 11, we can bound the index $B(t)$ as
\begin{align}
    B(t) \le \min(0,-\max(\Delta(i^*,i_{m_t,t}), \Delta(i^*,j_{m_t,t})) + \beta_{m_t,t}(i_{m_t,t}) + \beta_{m_t,t}(j_{m_t,t}) ) + 2\beta_{m_t,t}(i_{m_t,t}) + 2\beta_{m_t,t}(j_{m_t,t}).
\end{align}
\end{lemma}

Lemma \ref{lemma12} is useful in bounding $T_{m_{t_k},t_k + 1}(k)$ and $T_\tau^{all} (k)$.

\begin{lemma} [The communication complexity of the single event triggered strategy] 
The triggered number  can be bounded by
\begin{align}
    C(\tau) \le 2(M + 1/M^2) \log T.
\end{align}
\end{lemma}

\begin{lemma} [New upper confidence bound in tabular case] In fully communication setting (single agent setting), for each $t\in\vert \tau \vert$, the $\hat\mu_{m_t,t}(k)$ satisfies $\vert\mu(k) - \hat\mu_{m_t,t}(k)\vert \le \beta_{m_t,t}(k)$, $\forall k\in\A$ with probability at least $1-\delta$. Where 
\begin{align}
    \beta_{m_t,t}(k) = \sqrt{\frac{\log\frac{4K(\sum_{s=1}^KT_{m_t,t}(s) -1)^3}{\delta}}{T_{m_t,t}(k)}}.
\end{align}
However, in intermittent communication setting, we should design new upper confidence bound to bound $\vert\mu(k) - \hat\mu_{m_t,t}(k)\vert \le \beta_{m_t,t}(k)$, $\forall k\in\A$ with high probability.
\end{lemma}

\paragraph{How to bound $\sum_{k=1}^K T^{all}_\tau(k)$? With this knowledge we can bound $\sum_{k=1}^K T^{all}_\tau(k) = \tau$.} The key idea to bound $\sum_{k=1}^K T^{all}_\tau(k)$ is to bound the $\sum_{k=1}^KT^{ser}_\tau(k)$ first, then we can utilize $\sum_{k=1}^K T^{ser}_\tau(k)$ to bound $\sum_{k=1}^K T^{all}_\tau(k)$. This is because $\sum_{k=1}^KT^{all}_\tau(k) = \sum_{k=1}^K T^{ser}_\tau(k) + \sum_{m=1}^M\sum_{k=1}^K T_{m,\tau}^{loc}(k)$, and $(1/\gamma) \sum_{k=1}^K T_{m,\tau}^{loc}(k) \le \sum_{k=1}^KT^{ser}_\tau(k)$, $\forall m\in\M$. When we set $\gamma = 1/M^2$, we have $\sum_{k=1}^K T^{all}_\tau(k) \le (1 + 1/M)\sum_{k=1}^K T^{ser}_\tau(k)$.

\paragraph{How to bound $\sum_{k=1}^K T^{ser}_\tau(k)$?} We need to bound $T_\tau^{ser}(k)$ arm by arm. The strategy for bounding a single $T_\tau^{ser}(k)$ is as follows. Let $t_k$ be the last round in which an agent $m_{t_k}$ downloads information from the server such that arm $k$ is pulled the next time $m_{t_k}$ is active. This implies that we can bound $T_{m_{t_k},t_k + 1}(k)$ by the standard argument used for UGapE or LinGapE. Then, we can use $T_{m_{t_k},t_k + 1}(k)$ to bound $T^{ser}_{\tau}(k)$, because $T_{t_k + 1}^{ser}(k) = T_{m_{t_k},t_k + 1}(k)$ and, by the definition of $t_k$, the server receives at most one $T^{loc}_{m,t} > 0$ from each agent $m\in\M$ afterwards. Hence, with $\gamma = 1/M^2$, we can bound $ T^{ser}_{\tau}(k) \le 
 T_{t_k + 1}^{ser}(k) + M \gamma T^{ser}_{\tau}(k) = T_{m_{t_k},t_k + 1}(k) + (1/M)T^{ser}_{\tau}(k)$. Since $T_{m_{t_k},t_k + 1}(k)$ can be bounded by the previous technique, we can bound $T_\tau^{ser} (k)$. Combining this with the previous discussion, we can finally bound $\sum_{k=1}^K T_{\tau}^{all}(k) = \tau $.

 \paragraph{The difference between Dis-A-TarPE and Dis-A-LinPE?} Technical differences are listed as follows:
 \begin{enumerate}
  \item In Dis-A-TarPE, we just utilize the single event triggered strategy. In Dis-A-LinPE, we utilize the hybrid event triggered strategy to establish the relationship between multi-local data and global data.
  \item In the tabular case, we can establish the upper confidence bound by directly applying Hoeffding's inequality. In the linear case, we must establish the upper confidence bound via the self-normalized bound of Abbasi-Yadkori et al., which requires a well-defined filtration. However, we cannot directly obtain such a filtration. Hence, the evaluation strategy of He et al.\ should be employed to establish the local bound for $\hat{\t}_{m_t,t}$.
  \item Arm selection strategy is different. In linear case, we should obtain the selecting strategy by a linear programming. However, in tabular case, we can select the arm directly. 
  \item The transmitted data are different: matrices and vectors in the linear case versus scalars in the tabular case.
\end{enumerate}

 \begin{algorithm}[t]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
	\caption{ (\texttt{Dis-A-TarPE}) }
	\label{alg3}
	\begin{algorithmic}[1]
            \STATE \textbf{Inputs:} Arm set $\A$, client set $\M$ and $(\delta,\epsilon)$
            \STATE \textbf{for} $t = 1:K$ \textbf{do}
            \STATE \quad Agent $m_t$ pulls arm $t$ and receives reward $r_{t} = \mu(t) + \eta_t$ 
            \STATE \quad Agent $m_t$ uploads $r_t$ to the server
            \STATE Server sets $\hat{\mu}_{K+1}^{ser}(i) = r_i$ and $T_{K+1}^{ser}(i) = 1,\ \forall i \in \A$
            \STATE \textbf{for} $m=1:M$ \textbf{do}
            \STATE \quad Agent $m$ downloads information from the server and sets $\hat{\mu}_{m,K+1}(i) = \hat{\mu}_{K+1}^{ser}(i)$, $\mu_{m,K+1}^{loc}(i) = 0$ and $T_{m,K+1}(i) = T^{ser}_{K+1}(i)$, $T_{m,K+1}^{loc}(i) = 0$, $\forall i\in\A$
            \STATE \textbf{for} $t = K+1:\infty$ \textbf{do}
            \STATE \quad Agent $m_t$ sets $i_{m_t,t} = \arg\max_{i\in\A} \hat{\mu}_{m_t,t}(i) + \beta_{m_t,t}(i)$ 
            \STATE \quad Agent $m_t$ sets $j_{m_t,t} = \arg\max_{j\not = i} \hat{\mu}_{m_t,t}(j,i_{m_t,t}) + \beta_{m_t,t}(i_{m_t,t}) + \beta_{m_t,t}(j)$
            \STATE \quad Agent $m_t$ pulls arm $k_{m_t,t} = \arg\max_{s\in\{i_{m_t,t},j_{m_t,t}\}}\beta_{m_t,t}(s)$ and receives reward $r_{m_t,t}$
            \STATE \quad Agent $m_t$ sets $\mu_{m_t,t}^{loc}(k_{m_t,t}) = \mu_{m_t,t-1}^{loc}(k_{m_t,t}) + r_{m_t,t}$ and $T_{m_t,t}^{loc}(k_{m_t,t}) = T_{m_t,t-1}^{loc}(k_{m_t,t})+1$
            \STATE \quad \textbf{if} $\sum_{i=1}^K(T_{m_t,t}(i) + T_{m_t,t}^{loc}(i)) > (1+\gamma)\sum_{i=1}^KT_{m_t,t}(i)$ \textbf{then}
            \STATE \quad\quad Agent $m_t$ sends $\mu_{m_t,t}^{loc}(i)$ and $T_{m_t,t}^{loc}(i)$, $\forall i\in\A$ to the server
            \STATE \quad\quad Server sets  $T_{t+1}^{ser}(i) = T_{t}^{ser}(i) + T_{m_t,t}^{loc}(i)$ and $\hat{\mu}^{ser}_{t+1}(i) = (\hat{\mu}^{ser}_{t}(i)T_{t}^{ser}(i) + \mu_{m_t,t}^{loc}(i))/ T_{t+1}^{ser}(i)$, $\forall i\in \A$
            \STATE \quad\quad Server sets $i^{ser}_t = \arg\max_{i\in\A} \hat{\mu}^{ser}_{t+1}(i) + \beta^{ser}_{t+1}(i)$ 
            \STATE \quad \quad Server sets  $B(t) = \max_{j \not = i^{ser}_{t}} \hat{\mu}^{ser}_{t+1}(j,i^{ser}_{t}) + \beta^{ser}_{t+1}(i^{ser}_{t}) + \beta^{ser}_{t+1}(j)$
            \STATE \quad \quad \textbf{if} $B(t) \le \epsilon$ \textbf{then}
            \STATE \quad \quad \quad Server returns $i_{t}^{ser}$ as the estimated best arm $\hat i^*$ and break
            \STATE \quad \quad Server sends $T^{ser}_{t+1}(i)$ and $\hat{\mu}^{ser}_{t+1}(i)$, $\forall i\in\A$ back to agent $m_t$
            \STATE \quad \quad Agent $m_t$ sets $T_{m_t,t+1}(i) = T^{ser}_{t+1}(i)$, $ \hat{\mu}_{m_t,t+1}(i) = \hat{\mu}^{ser}_{t+1}(i)$, $\forall i\in\A$
            \STATE \quad \quad Agent $m_t$ sets $T_{m_t,t}^{loc}(i) = 0$ and $\mu_{m_t,t}^{loc}(i) = 0$, $\forall i\in \A$
            \STATE \quad \textbf{else}
            \STATE \quad \quad Agent $m_t$ sets $T_{m_t,t+1}(i) = T_{m_t,t}(i)$, $ \hat{\mu}_{m_t,t+1}(i) = \hat{\mu}_{m_t,t}(i)$, $\forall i\in\A$
            \STATE \quad \quad Agent $m_t$ keeps $T_{m_t,t}^{loc}(i)$ and $\mu_{m_t,t}^{loc}(i)$, $\forall i\in \A$, unchanged
	\end{algorithmic}  
\end{algorithm}

\section{Experiment}

\section{Related Work}

\section{Conclusion}

\end{document}

