\documentclass[10pt]{article}
\usepackage{natbib}
\usepackage{fullpage}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols

\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{comment}
\usepackage{graphicx}
\usepackage{subfigure}
\usepackage{booktabs} % for professional tables
\usepackage{mathtools}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{tikz}
% \usepackage{algpseudocode}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{algorithmic}
% \usepackage{algorithmicx}
\usepackage{wrapfig}  

\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{dsfont}

% Margin-note helpers: print "FIX" / "NEW" in the margin next to the call site.
\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

% Theorem-like environments (amsthm). No shared-counter argument is given,
% so each environment is numbered by its own independent counter.
\newtheorem{assumption}{Assumption}
\newtheorem{example}{Example}
\newtheorem{definition}{Definition}
\newtheorem{remark}{Remark}
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{lemma}{Lemma}
\newtheorem{claim}{Claim}
\newtheorem{proposition}{Proposition}
\newtheorem{regret}{Regret}
\newtheorem{gap}{Gap}


% ---------------------------------------------------------------------------
% Notation shorthands used throughout the paper.
% Bold Greek letters use \boldsymbol (provided by amsmath), because the
% preamble does not load the bm package and \bm would therefore be undefined.
% ---------------------------------------------------------------------------
\def \bvarphi {\mathrm{\boldsymbol{\varphi}}}
\def \btheta {\boldsymbol{\theta}}
\def \bsigma {\boldsymbol{\Sigma}}
\def \mt {\mathsf{T}}
% NOTE(review): the five shorthands below expand to \mV, \vx, \sA, \vb and \vn,
% none of which is defined in this preamble -- confirm they are provided
% elsewhere before using these shorthands.
\def \bV {\displaystyle\mV}
\def \bx {\displaystyle\vx}
\def \bA {\displaystyle\sA}
\def \bb {\displaystyle\vb}
\def \bn {\displaystyle\vn}
% Calligraphic letters (sets, events, probability).
\def \bT {\mathcal{T}}
\def \bD {\mathcal{D}}
\def \bR {\mathcal{R}}
\def \E {\mathcal{E}}
\def \bC {\mathcal{C}}
\def \bbC {\mathcal{C}}
\def \A {\mathcal{A}}
\def \bP {\mathcal{P}}
\def \bc {\mathcal{C}}
\def \bN {\mathcal{N}}
\def \M {\mathcal{M}}
\def \R {\mathcal{R}}
\def \I {\mathcal{I}}
% Bold upright letters (\mathbf replaces the obsolete amsfonts alias \bold).
\def \bI {\mathbf{I}}
\def \ba {\mathbf{a}}
\def \x {\mathbf{x}}
% Indicator / expectation / blackboard symbols.
\def \bone {\mathds{1}}
\def \bE {\mathds{E}}
\def \bB {\mathbb{B}}
\def \bbN {N}
% Bold model parameter. \mathbf does not embolden Greek letters, so
% \boldsymbol is required. This intentionally overrides the text tie accent \t.
\def \t {\boldsymbol{\theta}}

\title{\textbf{Pure Exploration Distributed Bandits in the Asynchronous Environment}}

\author{Zichen Wang\\ 
\and Chuanhao Li\\ 
\and Huazheng Wang\\
}
\date{}
\expandafter{}
\begin{document}
\maketitle

\begin{abstract}
\end{abstract}

\section{Introduction}

\section{Preliminary}

\subsection{Distributed bandits}

\paragraph{Notations}
In this paper, we let $\vert A \vert = \{1,\dots,A\}$, $\Vert \x \Vert$ denotes the Euclidean norm, $\Vert \x \Vert_{A} = \sqrt{\x^\top A \x}$ denotes the matrix norm, $\bI\in \mathbb{R}^{d\times d}$ denotes the identity matrix, $\bold{0}$ denotes the $d$-dimensional zero vector or the $d\times d$ zero matrix, $\det(A)$ denotes the determinant of the matrix $A$ and $A^\top$ denotes the transpose of $A$.

\paragraph{Tabular case}
We consider the distributed tabular bandits as follows. The distributed tabular bandits consist of an agent set $\M = \{m\}_{m=1}^M$ with $M$ agents, a central server and an environment $\A =\{ k \}_{k=1}^K$ with $K$ arms. In round $t$, if an agent $m_t \in \M$ pulls an arm $k \in \A$, it receives reward $r_{m_t,t}= \mu(k) + \eta_{m_t,t}$, where $\mu(k)$ is the expected reward of arm $k$ and $\eta_{m_t,t} \in \mathbb{R}$ is a zero-mean $\sigma$-sub-Gaussian noise. Similar to other papers that study pure exploration, we assume that the best arm $k^* = \arg\max_{k\in\A} \mu(k)$ is unique.

\paragraph{Linear case}
Different from the tabular case, in the linear case, every arm $k$ is associated with a context $\x_k \in \mathbb{R}^d$. In round $t$, if an agent $m_t \in \M$ pulls an arm $k \in \A$, it receives reward $r_{m_t,t} = \x_{k}^\top \t^* + \eta_{m_t,t}$, where $\t^*\in \mathbb{R}^d$ is the global model parameter. Without loss of generality, we suppose $\Vert\x_k\Vert \le 1$, $\forall k\in\A$, $\Vert\t^*\Vert \le 1$ and that the best arm $k^* = \arg\max_{k\in\A} \x_k^\top\t^*$ is unique.

\paragraph{Learning objective of the pure exploration distributed linear bandit}
We focus on the fixed-confidence $(\epsilon,\delta)$-best arm identification problem. The goal of the bandit algorithm is to find an estimated best arm $\hat k^* \in \A$ which satisfies
\begin{align}\label{1}
    \bP(\Delta(k^{*}, \hat k^{*}) \ge \epsilon) \le \delta
\end{align}
with minimum sample complexity, where $\Delta(k^{*}, \hat k^{*}) = \mu(k^*) - \mu(\hat k^*)$ in the tabular case, $\Delta(k^{*}, \hat k^{*}) = y(k^*,\hat{k}^*)^\top \t^*$ in the linear case and $y(k^*,\hat{k}^*) = \x_{k^*} - \x_{\hat{k}^*}$. The sample complexity, denoted by $\tau$, is the total number of interactions between the agents and the environment.

\subsection{Communication model}
In this paper, we consider a star shape communication framework, where there is a communication channel between every agent and the server. The agents can not directly communicate with each other, they can only upload information to the server and download information from the server. We define the communication cost $\bC(\tau)$ as the number of bits transferred between the agents and server in $\tau$ rounds. Due to the real number being transferred in binary form in the information channel, the number of bits for each real number representation is only logarithmic w.r.t. instance scale. For instance, if an agent wants to upload a real number $\tau$, the instantaneous communication cost is $\log(\tau)$. Furthermore, we suppose there is no latency in the communication channel. 

\paragraph{Asynchronous setting} 
The asynchronous setting studied in this paper considers the case that the full participation of the agents and global synchronization mandated by the server can hardly be implemented in each round. In our setting, we suppose there is only one active agent $m_t$ that interacts with the environment in each round $t$. Besides, except for the warm-up step, every agent can choose when to communicate with the server, which can be independent of other agents' communication. 

\section{The Proposed Asynchronous Algorithm For Tabular Case}
In this section, we study the pure exploration problem in the asynchronous environment. Existing pure exploration algorithms in distributed or collaborative bandits all focus on the synchronous setting and design their exploration strategy by experimental design. In this kind of exploration strategy, the exploration sequences must be predetermined before observing the results. This restricts their communication protocols to a fixed-episode manner. This kind of communication protocol divides the time horizon $\tau$ into multiple episodes: at the beginning of each episode, all the agents download information from the server, and at the end of the episode, all the agents upload their exploration results to the server. However, this category of communication strategy can hardly be implemented in the asynchronous environment, because the active agent in every round is unknown to the server and no agent knows the other agents' historical actions. To address the aforementioned open problems, we propose the Distributed Asynchronous Tabular Pure Exploration (DisATarPE) algorithm. In DisATarPE, every agent designs its exploration strategy based on the data it has and communicates with the server almost independently of other agents.

\paragraph{DisATarPE} At the beginning of the algorithm, the whole system would start a warm-up step. The first $K$ active agents ($m_1 \sim m_K$) would download the exploration strategy from the server, pull the arm from $1$ to $K$ respectively, and upload the collected results to the server. Then the server would send $\hat{\mu}^{ser}_{K+1}(k)$ and $T^{ser}_{K+1}(k)$, $\forall k\in\A$, to all the agents and the agents update $\hat{\mu}_{m,K+1}(k)$ and $T_{m,K+1}(k)$, $\forall k\in\A$. For round $t>K$, the bandit algorithm would run as follows. At round $t$, an agent $m_t$ would be active and identify an estimated best arm $i_{m_t,t} = \arg\max_{i\in\A} \hat{\mu}_{m_t,t}(i)$ and a most ambiguous arm $j_{m_t,t} = \arg\max_{j\not = i_{m_t,t}} \hat{\Delta}_{m_t,t}(j,i_{m_t,t}) + \beta_{m_t,t}(i_{m_t,t},j)$, where $\hat{\Delta}_{m_t,t}(j,i) = \hat{\mu}_{m_t,t}(j) - \hat{\mu}_{m_t,t}(i)$ and $\beta_{m_t,t}(i,j) = \beta_{m_t,t}(i) + \beta_{m_t,t}(j)$. The agent would pull arm $k_{m_t,t} = \arg\max_{k\in\{i_{m_t,t},j_{m_t,t}\}} \beta_{m_t,t}(k)$ and receive reward $r_{m_t,t}$. Then $m_t$ would check the communication condition. If the condition is satisfied, agent $m_t$ would upload $\mu^{loc}_{m_t,t}(k)$ and $T^{loc}_{m_t,t}(k)$, $\forall k\in\A$ to the server. The server would update $\hat{\mu}^{ser}_{t}(k)$ and $T^{ser}_{t}(k)$, $\forall k\in\A$. Then it would compute index $B(t)$. If $B(t) \le \epsilon$, the server would set $\hat{k}^* = i_{t}^{ser}$  and break the loop. Else, agent $m_t$ would download $\hat{\mu}^{ser}_{t}(k)$ and $T^{ser}_{t}(k)$, $\forall k\in\A$ from the server. The agent $m_t$ would utilize the downloaded data to update its local data and exploration strategy. The specific details of this algorithm are shown in Algorithm \ref{alg3}.

\begin{remark}
    The communication protocol of DisATarPE needs only one synchronous round at the end of the warm-up step (i.e., line 6$\sim$7 of Algorithm \ref{alg3}). Besides, in the first $K$ rounds, the first $K$ active agents should sequentially upload their collected rewards to the server. When $t\ge K+1$, all the agents can communicate with the server independently of each other.
\end{remark}

 \begin{algorithm}[t]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
	\caption{ (\texttt{DisATarPE}) }
	\label{alg3}
	\begin{algorithmic}[1]
            \STATE \textbf{Inputs:} Arm set $\A$, client set $\M$ and $(\delta,\epsilon)$
            \STATE \textbf{for} $t = 1:K$ \textbf{do}
            \STATE \quad Agent $m_t$ pulls arm $k_{m_t,t} = t$ and receives reward $r_{m_t,t} = \mu(k_{m_t,t}) + \eta_{m_t,t}$ 
            \STATE \quad Agent $m_t$ uploads $r_{m_t,t}$ to the server
            \STATE Server sets $\hat{\mu}_{K+1}^{ser}(k) = r_{m_k,k}$ and $T_{K+1}^{ser}(k) = 1,\ \forall k \in \A$
            \STATE \textbf{for} $m=1:M$ \textbf{do}
            \STATE \quad Agent $m$ downloads information from the server and sets $\hat{\mu}_{m,K+1}(k) = \hat{\mu}_{K+1}^{ser}(k)$, $\mu_{m,K+1}^{loc}(k) = 0$, $T_{m,K+1}(k) = T^{ser}_{K+1}(k)$ and $T_{m,K+1}^{loc}(k) = 0$, $\forall k\in\A$
            \STATE \textbf{for} $t = K+1:\infty$ \textbf{do}
            \STATE \quad Agent $m_t$ sets $i_{m_t,t} = \arg\max_{i\in\A} \hat{\mu}_{m_t,t}(i) $ 
            \STATE \quad Agent $m_t$ sets $j_{m_t,t} = \arg\max_{j\not = i} \hat{\Delta}_{m_t,t}(j,i_{m_t,t}) + \beta_{m_t,t}(i_{m_t,t},j)$
            \STATE \quad Agent $m_t$ pull arm $k_{m_t,t} = \arg\max_{s\in\{{i_{m_t,t},j_{m_t,t}}\}}\beta_{m_t,t}(s)$ and receive reward $r_{m_t,t}$
            \STATE \quad Agent $m_t$ sets $\mu_{m_t,t}^{loc}(k_{m_t,t}) = \mu_{m_t,t-1}^{loc}(k_{m_t,t}) + r_{m_t,t}$ and $T_{m_t,t}^{loc}(k_{m_t,t}) = T_{m_t,t-1}^{loc}(k_{m_t,t})+1$
            \STATE \quad \textbf{if} $\sum_{k=1}^K(T_{m_t,t}(k) + T_{m_t,t}^{loc}(k)) > (1+\gamma)\sum_{k=1}^KT_{m_t,t}(k)$ \textbf{then}
            \STATE \quad\quad Agent $m_t$ sends $\mu_{m_t,t}^{loc}(k)$ and $T_{m_t,t}^{loc}(k)$, $\forall k\in\A$ to the server
            \STATE \quad\quad Server sets $\hat{\mu}^{ser}_{t}(k) = (\hat{\mu}^{ser}_{t}(k)T_{t}^{ser}(k) + \mu_{m_t,t}^{loc}(k))/ (T_{t}^{ser}(k) + T_{m_t,t}^{loc}(k))$ and then $T_{t}^{ser}(k) = T_{t}^{ser}(k) + T_{m_t,t}^{loc}(k)$, $\forall k\in \A$
            \STATE \quad\quad Server sets $i^{ser}_t = \arg\max_{i\in\A} \hat{\mu}^{ser}_{t}(i)$ and  $j^{ser}_t = \arg\max_{j \not = i^{ser}_t} \hat{\Delta}^{ser}_{t}(j,i^{ser}_{t}) + \beta^{ser}_{t}(i^{ser}_{t},j)$
            \STATE \quad \quad Server sets  $B(t) =  \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t)$
            \STATE \quad \quad \textbf{if} $B(t) \le \epsilon$ \textbf{then}
            \STATE \quad \quad \quad Server returns $i_{t}^{ser}$ as the estimated best arm $\hat k^*$ and break
            \STATE \quad \quad Server sends $T^{ser}_{t}(k)$ and $\hat{\mu}^{ser}_{t}(k)$, $\forall k\in\A$ back to agent $m_t$
            \STATE \quad \quad Agent $m_t$ sets $T_{m_t,t+1}(k) = T^{ser}_{t}(k)$, $ \hat{\mu}_{m_t,t+1}(k) = \hat{\mu}^{ser}_{t}(k)$, $\forall k\in\A$
            \STATE \quad \quad Agent $m_t$ sets $T_{m_t,t}^{loc}(k) = 0$ and $\mu_{m_t,t}^{loc}(k) = 0$, $\forall k\in \A$
            \STATE \quad \textbf{else}
            \STATE \quad \quad Agent $m_t$ sets $T_{m_t,t+1}(k) = T_{m_t,t}(k)$ and $ \hat{\mu}_{m_t,t+1}(k) = \hat{\mu}_{m_t,t}(k)$, $\forall k\in\A$
            \STATE \quad Agent $m\not = m_t$ sets $T_{m,t+1}(k) = T_{m,t}(k)$ and $ \hat{\mu}_{m,t+1}(k) = \hat{\mu}_{m,t}(k)$, $\forall k\in\A$
	\end{algorithmic}  
\end{algorithm}

\begin{remark}\label{remark2}
    DisATarPE enjoys a low switching cost. The exploration strategy of agents in DisATarPE is not updated in every round. Suppose $t_1$ and $t_2$ are two consecutive communication rounds of agent $m$; then $\hat{\mu}_{m,t}(k)$ and $\beta_{m,t}(k)$, $\forall k\in\A$, remain unchanged from $t_{1}+1$ to $t_2$. This implies that $k_{m,t}$, $t \in [t_{1} + 1,t_2)$, also remains unchanged. Intuitively, after an agent $m$ downloads information from the server, it updates its exploration strategy with its data, continually explores the most informative arm (i.e., $k_{m,t}$), and communicates with the server when the amount of collected information is large enough.
\end{remark}

\begin{remark} [Global and local data] \label{remark3} From Algorithm \ref{alg3}, we can easily derive that the total number of pulls of arm $k$ up to round $t$ satisfies $T^{all}_t(k) = T^{ser}_t(k) + \sum_{m=1}^M T^{loc}_{m,t}(k)$. This is because $T^{ser}_t(k)$ contains all the data that has been uploaded to the server and $\sum_{m=1}^M T^{loc}_{m,t}(k)$ contains all the data that has not yet been uploaded to the server. Combining these two parts yields the global data. Besides, since $T_{m,t}(k)$ is downloaded from the server in some round $< t$, we have $T_{m,t}(k) \le T^{ser}_t(k)$.
\end{remark}

\paragraph{Proof sketch} In the whole proof, we should focus on three main points, i.e., 1) the sample complexity $\tau$; 2) the communication cost $C(\tau)$; 3) the estimated best arm satisfies (\ref{1}). For the first point, we first establish the correlation between $\sum_{k=1}^KT^{ser}_t(k)$ and $\sum_{k=1}^KT^{loc}_{m,t}(k)$. Then, we bound $T^{ser}_{\tau}(k)$, $\forall k\in\A$ by the definition of the upper confidence bound and arm selection strategy (line 11 of Algorithm \ref{alg3}). Finally, utilizing the knowledge of Remark \ref{remark3}, we can bound $T^{all}_\tau(k)$, $\forall k\in\A$, and $\tau = \sum_{k=1}^K T^{all}_\tau(k)$. For the second point, due to the agents communicating with the server when the event (line 13 of Algorithm \ref{alg3}) happens, we can utilize the property of the event and bound the total communication number. Finally, for the third point, we can utilize the definition of the upper confidence bound to show: if $B(\tau) \le \epsilon$, then $\mu(k^*) - \mu(\hat{k}^*) \le \epsilon$ with probability at least $1-\delta$.

\begin{lemma} \label{lemmarela1} Under the event-triggered strategy, $\forall t\in [K+1,\tau]$, we have
\begin{align}
    \sum_{k=1}^K T^{ser}_t(k) \ge 1/\gamma \sum_{k=1}^K T^{loc}_{m_t,t}(k),\ \forall m_t\in\M.
\end{align}
\end{lemma}

\begin{proof} [Proof of Lemma \ref{lemmarela1}] 
For every round $t \in [K+1,\tau]$, if the agent $m_t$ communicates with the server at round $t$, we have
\begin{align}
     \sum_{k=1}^K T^{ser}_t(k) \ge 1/\gamma \sum_{k=1}^K T^{loc}_{m_t,t}(k) = 0
\end{align}
by line 22 of the Algorithm \ref{alg3}.

Else, according to the triggered condition of Algorithm \ref{alg3}, we have 
\begin{align}
    \begin{split}
    \sum_{k=1}^K(T_{m_t,t}(k) + T_{m_t,t}^{loc}(k)) &\le (1+\gamma)\sum_{k=1}^KT_{m_t,t}(k)\\
    \sum_{k=1}^K T_{m_t,t}^{loc}(k)&\le \gamma\sum_{k=1}^KT_{m_t,t}(k)
    \end{split}
\end{align}
With the fact that $\sum_{k=1}^KT_{m_t,t}(k) \le \sum_{k=1}^KT^{ser}_{t}(k)$, we can finally get
\begin{align}
    \sum_{k=1}^K T_{m_t,t}^{loc}(k) \le \gamma \sum_{k=1}^KT^{ser}_{t}(k).
\end{align}
Here we finish the proof.
\end{proof}

\begin{lemma}\label{lemmaprobabilitybound}
    Utilizing the Hoeffding inequality, we can establish the upper confidence bounds $\beta_{m_t,t}(k)$, $\beta^{ser}_{t}(k)$  and the event
    \begin{align}
       \I_{a1} = \Big\{\forall k\in\A,\forall t \in [K+1,\tau],\ \vert \hat{\mu}_{m_t,t}(k) - \mu(k) \vert \le \beta_{m_t,t}(k),\ \vert \hat{\mu}_{t}^{ser}(k) - \mu(k) \vert \le \beta_{t}^{ser}(k)\Big\}.
    \end{align}
    With 
    \begin{align}\label{7}
    \beta_{m_t,t}(k) = \sigma\sqrt{\frac{2\log(\frac{4K((1+M\gamma)\sum_{k=1}^K T_{m_t,t}(k))^3}{\delta})}{T_{m_t,t}(k)}}\ \text{and}\ \beta_{t}^{ser}(k) = \sigma \sqrt{\frac{2\log(\frac{4K((1+M\gamma)\sum_{k=1}^K T^{ser}_{t}(k))^3}{\delta})}{T^{ser}_{t}(k)}},
    \end{align}
    we have $\bP(\I_{a1}) \ge 1-\delta$.
\end{lemma}

\begin{proof} [Proof of Lemma \ref{lemmaprobabilitybound}]
We here first introduce the Hoeffding inequality. Suppose $X_1,X_2,...,X_n$ is drawn from a $\sigma$-sub-Gaussian distribution and $\bar{X} = (1/n) \sum_{s=1}^n X_s$ is the mean, then
\begin{align}
    \bP(\vert E[X] - \bar{X}\vert \ge a) \le e^{-a^2n/(2\sigma^2)}.
\end{align}

Since $\hat{\mu}_{m_t,t}(k)$ and $T_{m_t,t}(k)$, $\forall k\in\A,\ t\in[K+1,\tau]$, are all downloaded from the server in some round $<t$, for every $t_1\in[K+1,\tau]$ there exists a $t_2\in[K+1,\tau]$ which satisfies 
\begin{align}
    \beta_{m_{t_1},t_1}(k) = \beta^{ser}_{t_2}(k)\ \text{and}\ \hat{\mu}_{m_{t_1},t_1}(k) = \hat{\mu}^{ser}_{t_2}(k),\ \forall k\in\A.
\end{align}
Hence, we can further get 
\begin{align}\label{10}
    \bP(\I_{a1}) = \bP\Big(\forall k\in\A,\forall t \in [K+1,\tau],\ \vert \hat{\mu}_{t}^{ser}(k) - \mu(k) \vert \le \beta_{t}^{ser}(k)\Big).
\end{align}
Define $\I_{a1}^c$ as the complement of the event $\I_{a1}$. Utilizing the union bound, we can decompose (\ref{10}) as 
\begin{align}\label{union1}
    \bP(\I_{a1}^c) \le \sum_{k=1}^K \sum_{t=K+1}^\tau \bP\Big(\vert \hat{\mu}_{t}^{ser}(k) - \mu(k) \vert \ge \beta_{t}^{ser}(k)\Big).
\end{align}
With the help of the Hoeffding inequality, we can derive
\begin{align}
\begin{split}\label{union2}
    \bP\Big(\vert \hat{\mu}_{t}^{ser}(k) - \mu(k) \vert \ge \beta_{t}^{ser}(k)\Big) &\le  e^{\frac{-\beta_{t}^{ser}(k)^2 T^{ser}_{t}(k)}{2\sigma^2}}\\
    &\le  e^{-\log(\frac{4K((1+M\gamma)\sum_{k=1}^K T^{ser}_{t}(k) )^3}{\delta})}\\
    &\le \frac{\delta}{4K((1+M\gamma)\sum_{k=1}^K T^{ser}_{t}(k) )^3}\\
    &\le \frac{\delta}{4K t^3}.
\end{split}
\end{align}
The last inequality holds because $t = \sum_{k=1}^K T^{ser}_{t}(k) + \sum_{m=1}^M \sum_{k=1}^K T^{loc}_{m,t}(k) \le (1+M\gamma)\sum_{k=1}^K T^{ser}_{t}(k) $ (Lemma \ref{lemmarela1}). Substituting the last term of (\ref{union2}) into (\ref{union1}), we can finally bound
\begin{align}
\begin{split}
    \sum_{k=1}^K \sum_{t=K+1}^\tau \bP\Big(\vert \hat{\mu}_{t}^{ser}(k) - \mu(k) \vert \ge \beta_{t}^{ser}(k)\Big) \le \delta
\end{split}
\end{align}
and $\bP(\I_{a1}) = 1 -\bP(\I_{a1}^c) \ge 1-\delta$. Here we finish the proof.
\end{proof}

\begin{lemma} [The communication complexity] \label{lemmacommunication1}
The total communication cost of DisATarPE  can be bounded by
\begin{align}
    C(\tau) \le 2(M + 1/\gamma) \log_2 \tau.
\end{align}
\end{lemma}

\begin{proof} [Proof of Lemma \ref{lemmacommunication1}] The proof of this lemma consists of two parts. In the first part, we divide the sample complexity $\tau$ into at most $\log_2\tau$ episodes; in the second part, we analyze the upper bound of the communication number of all agents in each episode. We define
\begin{align}
    T_i = \min\Big\{ t\in[K+1,\tau],\ \sum_{k=1}^K T^{ser}_{t}(k) \ge 2^i \Big\}.
\end{align}
and divide the set of all rounds into episodes $\{ T_i,T_{i}+1,...,T_{i+1}-1 \}$, $\forall i \ge 0$. According to the definition, we have $\sum_{k=1}^K T^{ser}_{t}(k) \le \tau$ for all $t \le \tau$, and thus 
\begin{align}
    \max\{ i\ge0\ \vert\ T_i\not = \emptyset \} \le \log_2\sum_{k=1}^K T^{ser}_{\tau}(k) \le \log_2\tau.
\end{align}

We then prove $\forall i \ge 0 $ and $ T_i \not = \emptyset$, from round $T_i$ to $T_{i+1} - 1$, the communication number can be bounded by $2(M + 1/\gamma)$. We first define the communication number of agent $m$ from $T_i$ to $T_{i+1}-1$ is $N_m$, the sequence of communication round of agent $m$ from round $T_i$ to $T_{i+1}-1$ as $t^m_{1},...,t^m_{N_m}$, the sequence of all communication rounds from round $T_i$ to $T_{i+1}-1$ as $t_{i,1},...,t_{i,L}$, and the communication number of all agents is $L$. According to the communication condition (line 13 of Algorithm \ref{alg3}), we have
\begin{align}
\begin{split}
\sum_{k=1}^K(T_{m,t^m_j}(k) + T_{m,t^m_j}^{loc}(k)) &> (1+\gamma)\sum_{k=1}^K T_{m,t^m_j}(k) \\
\sum_{k=1}^KT_{m,t^m_j}^{loc}(k) &> \gamma \sum_{k=1}^K T_{m,t^m_j}(k)
\end{split}
\end{align}
Then, $\forall j\in \vert N_m \vert,\ j\ge2$, we have 
\begin{align}\label{loc-T_i}
\sum_{k=1}^KT_{m,t^m_j}^{loc}(k) &> \gamma \sum_{k=1}^K T_{m,t^m_j}(k) \ge  \gamma \sum_{k=1}^K T^{ser}_{T_i}(k).
\end{align}
The inequality holds due to $T_{m,t^m_j} = T^{ser}_{t^m_{j-1} + 1}$ and $t^m_{j-1} + 1 \ge T_i$. The above inequality implies $\forall t_{i,l} \ge t_2^{m_{t_{i,2}}}$
\begin{align}
\begin{split}
\sum_{k=1}^K (T^{ser}_{t_{i,l+1}}(k) - T^{ser}_{t_{i,l}}(k)) & \ge  \sum_{k=1}^K( T^{ser}_{t_{i,l}}(k) + T_{m_{t_{i,l}},t_{i,l}}^{loc}(k)) - \sum_{k=1}^K T^{ser}_{t_{i,l}}(k)\\
& = \sum_{k=1}^K T_{m_{t_{i,l}},t_{i,l}}^{loc}(k)\\
& > \gamma\sum_{k=1}^K T_{m_{t_{i,l}},t_{i,l}}(k)\\
& \ge \gamma\sum_{k=1}^K T_{T_i}^{ser}(k).
\end{split}
\end{align}
Finally we can bound $L = \sum_{m=1}^M N_m$
\begin{align}\label{9}
    \begin{split}
       \sum_{k=1}^K (T_{T_{i+1} - 1 }^{ser}(k) -  T_{T_{i}}^{ser}(k)) &= \sum_{l=1}^{L-1} \sum_{k=1}^K (T^{ser}_{t_{i,l+1}}(k) - T^{ser}_{t_{i,l}}(k))\\
       &\ge \gamma \sum_{m=1}^M (N_m - 1) \sum_{k=1}^K T_{T_i}^{ser}(k)
    \end{split}
\end{align}
The last inequality holds owing to (\ref{loc-T_i}). With the definition of the episode, $\sum_{k=1}^K T_{T_{i+1} - 1}^{ser}(k) \le 2 \sum_{k=1}^K T_{T_{i}}^{ser}(k)$, we can rewrite equation (\ref{9}) as
\begin{align} 
      M + 1/\gamma \ge \sum_{m=1}^M N_m  
\end{align}
Since one communication includes one upload and one download, the communication cost in one episode is at most $2(M+1/\gamma)$. We can then bound the total communication cost
\begin{align}
   C(\tau) \le 2(M+1/\gamma) \log_2(\tau).
\end{align}
Here we finish the proof.
\end{proof}

\begin{lemma}\label{lemmabound1} We define $k^{ser}_t$ in round $t\in[K+1,\tau]$ as $k_{t}^{ser} = \arg\max_{k\in\{i_t^{ser},j_t^{ser}\} }\beta^{ser}_t(k)$. We can bound the index $B(t)$ as
\begin{align}\label{12}
    B(t) \le \min\Big(0,-\Delta(k^*,k_t^{ser}) + 4\beta_{t}^{ser}(k_{t}^{ser}) \Big) + 2\beta_{t}^{ser}(k_{t}^{ser}).
\end{align}
\end{lemma}

\begin{proof} [Proof of Lemma \ref{lemmabound1}]
    According to the definition of the event $\I_{a1}$ and $k^{ser}_t$, consider the case when $i_{t}^{ser} = k^*$, we have
\begin{align}
\begin{split}\label{13}
    B(t) &= \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \beta^{ser}_{t}(i^{ser}_{t},j^{ser}_{t}) \\&\le \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + 2\beta^{ser}_{t}(k^{ser}_{t}) \\& \le  \Delta(j^{ser}_t,i^{ser}_t) + 4\beta^{ser}_{t}(k^{ser}_{t})
    \\&
    = - \Delta(k^*,j^{ser}_t) + 4\beta^{ser}_{t}(k^{ser}_{t})
\end{split}
\end{align}

Consider the case when $j^{ser}_t = k^*$, we have
\begin{align}
\begin{split}\label{14}
   B(t) &\le \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + 2\beta^{ser}_{t}(k^{ser}_{t})
     \\& \le  -\hat{\Delta}^{ser}_t(j^{ser}_t,i^{ser}_{t}) + 2\beta^{ser}_{t}(k^{ser}_{t})
   \\& \le  -\Delta(j^{ser}_t,i^{ser}_{t}) + 4\beta^{ser}_{t}(k^{ser}_{t})
    \\&
    = - \Delta(k^*,i^{ser}_t) + 4\beta^{ser}_{t}(k^{ser}_{t})
\end{split}
\end{align}
where the second step owing to the definition of the $i_{t}^{ser}$, $\hat{\Delta}^{ser}_t(j^{ser}_t,i^{ser}_{t})\le 0$. 

Combine (\ref{13}) and (\ref{14}), it yields
\begin{align}\label{17}
B(t) \le -\Delta(k^*,k^{ser}_t) + 4\beta_{t}^{ser}(k_{t}^{ser}).
\end{align}
when $i^{ser}_t = k^*$ or $j^{ser}_t = k^*$. Furthermore, due to $B(t)  = \hat{\Delta}^{ser}_t(j^{ser}_t,i^{ser}_t) + \beta^{ser}_t(j^{ser}_t,i^{ser}_t)$ and $\hat{\Delta}^{ser}_t(j^{ser}_t,i^{ser}_t) \le 0$, we can derive $B(t) \le \beta^{ser}_t(j^{ser}_t,i^{ser}_t) \le 2\beta^{ser}_t(k^{ser}_t)$. In the light of (\ref{17}), we can finally get
\begin{align}\label{20}
        B(t) \le \min\Big(0, -\Delta(k^*,k^{ser}_t) + 2\beta_{t}^{ser}(k_{t}^{ser}) \Big) + 2\beta_{t}^{ser}(k_{t}^{ser}).
\end{align}

Consider the case when $i_{t}^{ser} \not= k^*$ and $j_{t}^{ser} \not= k^*$, then we can derive
\begin{align}
\begin{split}\label{15}
    B(t) &= \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t)\\ & \le \Delta(j^{ser}_t,k^*) + \Delta(k^*,i^{ser}_t) + 2\beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t) \\& \le \Delta(j^{ser}_t,k^*) + 3\beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t) \\
    &\le \Delta(j^{ser}_t,k^*) + 6\beta^{ser}_{t}(k^{ser}_{t})\\
    & = -\Delta(k^*,j^{ser}_t) +  6\beta^{ser}_{t}(k^{ser}_{t})
\end{split}
\end{align}
where the third step holds owing to 
\begin{align}
\begin{split}
\beta_t^{ser}(i_t^{ser},j^{ser}_t) &\ge \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t)\\ 
&\ge \hat{\Delta}^{ser}_{t}(k^*,i^{ser}_{t}) + \beta^{ser}_{t}(i^{ser}_{t},k^*)\\
&\ge \Delta(k^*,i^{ser}_{t})
\end{split}
\end{align}
Similar to (\ref{15}), we also can show
\begin{align}
\begin{split}\label{16}
    B(t) &= \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t) \\& \le   \beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t)\\
    &\le  
 - \Delta(k^*,i^{ser}_t) + \hat{\Delta}^{ser}_t(k^*,i^{ser}_t) + \beta^{ser}_t(k^*,i^{ser}_t) + \beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t)\\
    & \le  - \Delta(k^*,i^{ser}_t) + \hat{\Delta}^{ser}_t(j^{ser}_t,i^{ser}_t) + \beta^{ser}_t(j^{ser}_t,i^{ser}_t) + \beta^{ser}_{t}(i^{ser}_{t},j^{ser}_t)
    \\
    & \le -\Delta(k^*,i^{ser}_t) +  4\beta^{ser}_{t}(k^{ser}_{t})
\end{split}
\end{align}
The third inequality is due to the definition of the event $\I_{a1}$ and the fourth inequality is due to the definition of $j_{t}^{ser}$ as the maximizer over $j \not= i_{t}^{ser}$. 

Combine (\ref{15}) and (\ref{16}), it yields
\begin{align}\label{18}
B(t) \le - \Delta(k^*,k_{t}^{ser}) + 6\beta_{t}^{ser}(k_{t}^{ser}).
\end{align}
when $i^{ser}_t \not = k^*$ and $j^{ser}_t \not = k^*$. In the light of (\ref{18}), we can finally get
\begin{align}\label{24}
        B(t) \le \min\Big(0,-\Delta(k^*,k^{ser}_t) + 4\beta_{t}^{ser}(k_{t}^{ser}) \Big) + 2\beta_{t}^{ser}(k_{t}^{ser}).
\end{align}
Combine the (\ref{20}) and (\ref{24}), then we can finish the proof.
\end{proof}

\begin{lemma} \label{serverlemma1} At the terminated round $\tau$, we can bound 
\begin{align}
    T_\tau^{ser}(k) \le \frac{1}{1 - M\gamma}\frac{2\sigma^2\log(4K(1+\gamma M)\tau^3/\delta)}{\max\big(\frac{\Delta(k^*,k) + \epsilon}{2},\epsilon\big)^2},\ \forall k\in\A
\end{align}
\end{lemma}

\begin{proof} [Proof of Lemma \ref{serverlemma1}] Suppose agent $m$ communicates in round $t_1$ and $t_2$ (we highlight this knowledge in Remark \ref{remark2}), then from round $t \in [t_1 + 1,t_2]$, owing to $\hat{\mu}_{m,t}(k)$ and $T_{m,t}(k)$ remain unchanged, $k_{m,t}$ would not change either. We define at round $t_k$, an agent $m$ communicates with the server and sets $k_{m,t_k 
+ 1} = k$, $\forall k\in\A$. And from round $t \in [t_k + 1,\tau]$, none of the agent $m_t$ would set $k_{m_t,t} = k$ after communicating with the server. This implies
\begin{align}
\begin{split}
    T_\tau^{ser}(k) \le& T^{ser}_{t_k}(k) + (M\gamma) T_\tau^{ser}(k) \\ =& T_{m,t_k+1}(k) + (M\gamma) T_\tau^{ser}(k).
\end{split}
\end{align}
The inequality holds due to for $t\in[t_k + 1,\tau]$, $\forall m\in\M$ would upload $T_{m,t}^{loc}(k)>0$ to the server at most one time and $T_{m,t}^{loc}(k) \le M\gamma \sum_{k=1}^K T^{ser}_\tau(k)$ according to the Lemma \ref{lemmarela1}.

We would further bound $T^{ser}_{t_k}(k) = T_{m,t_k+1}(k)$. Since agent $m$ sets $k_{m,t_k + 1} = k$, with Lemma \ref{lemmabound1}, we can derive
\begin{align}
\begin{split}\label{27}
    \epsilon &\le B(t_k)\\
             &\le \min\Big(0,-\Delta(k^*,k) + 2\beta_{t_k}^{ser}(k) \Big) + 2\beta_{t_k}^{ser}(k).
\end{split}
\end{align}
 Then, substituting (\ref{7}) into (\ref{27}), we can get 
\begin{align}\label{28}
    T_{m,t_k+1}(k) = T^{ser}_{t_k}(k) \le \frac{2\sigma^2\log(4K(1+M\gamma)(\sum_{s=1}^K T^{ser}_{t_k}(s))^3/\delta)}{\max\big(\frac{\Delta(k^*,k) + \epsilon}{2},\epsilon\big)^2}
\end{align}
We can finally bound $T^{ser}_\tau(k)$, i.e.
\begin{align}
\begin{split}
    T^{ser}_\tau(k) &\le \frac{2\sigma^2\log(4K(1+M\gamma)(\sum_{s=1}^K T^{ser}_{t_k}(s))^3/\delta)}{\max\big(\frac{\Delta(k^*,k) + \epsilon}{2},\epsilon\big)^2} + M\gamma T^{ser}_\tau(k)\\
    &\le \frac{1}{1 - M\gamma}\frac{2\sigma^2\log(4K(1+M\gamma)\tau^3/\delta)}{\max\big(\frac{\Delta(k^*,k) + \epsilon}{2},\epsilon\big)^2}.
\end{split}
\end{align}
Here we finish the proof.
\end{proof}

\begin{theorem} \label{theorem1} With $\gamma = 1/M^2$ and $M \ge 2$, DisATarPE (Algorithm \ref{alg3}) achieves goal (\ref{1}) with sample complexity
\begin{align}
        \tau \le \frac{M+1}{M-1}H_{\epsilon}\log\bigg(\frac{4K(1+1/M)\tau^3}{\delta}\bigg)
\end{align}
where 
\begin{align}
        H_\epsilon = \sum_{k=1}^K \frac{2\sigma^2}{\max\big(\frac{\Delta(k^*,k) + \epsilon}{2},\epsilon\big)^2}.
\end{align}
The communication cost can be bounded by
\begin{align}\label{33}
   C(\tau) \le 2(M + M^2) \log_2 \tau.
\end{align}
\end{theorem}

\begin{proof}[Proof of Theorem \ref{theorem1}]  According to the breaking condition of the Algorithm \ref{alg3} (line 18$\sim$19), we have 
\begin{align}\label{32}
\epsilon \ge \hat\Delta^{ser}_{\tau} (j^{ser}_\tau,i^{ser}_\tau) + \beta^{ser}_{\tau}(i^{ser}_\tau,j^{ser}_\tau) = B(\tau).
\end{align}
By the definition of $j^{ser}_\tau$ as the maximizer, we also have
\begin{align}
    \hat\Delta^{ser}_{\tau} (j^{ser}_\tau,i^{ser}_\tau) + \beta^{ser}_{\tau}(i^{ser}_\tau,j^{ser}_\tau) \ge \hat\Delta^{ser}_{\tau} (k^*,i^{ser}_\tau) + \beta^{ser}_{\tau}(i^{ser}_\tau,k^*).
\end{align}
Recall that $\hat{k}^* = i^{ser}_{\tau}$ is the estimated best arm. Besides, according to the definition of event $\I_{a1}$, we can derive
\begin{align}
    \epsilon \ge \hat\Delta^{ser}_{\tau} (k^*,\hat k^*) + \beta^{ser}_{\tau}(\hat k^*,k^*) \ge \Delta(k^*,\hat k^*).
\end{align}
Furthermore, due to the event $\I_{a1}$ would happen with probability at least $1-\delta$, the $(\epsilon,\delta)$-condition (\ref{1}) is satisfied.

With the Lemma \ref{serverlemma1}, we can bound 
\begin{align}\label{37}
\begin{split}
    \tau & = \sum_{k=1}^K T^{ser}_\tau(k) + \sum_{m=1}^M \sum_{k=1}^K T_{m,\tau}^{loc}(k)
    \\& \le (1+1/M) \sum_{k=1}^K T^{ser}_\tau(k)\\
    &\le \sum_{k=1}^{K}\frac{M + 1}{M - 1}\frac{2\sigma^2\log(4K(1+1/M)\tau^3/\delta)}{\max\big(\frac{\Delta(k^*,k) + \epsilon}{2},\epsilon\big)^2}\\
    &\le \frac{M+1}{M-1}H_{\epsilon}\log\bigg(\frac{4K(1+1/M)\tau^3}{\delta}\bigg).
\end{split}
\end{align}
Combine with the result of Lemma \ref{lemmacommunication1}, here we finish the proof of Theorem \ref{theorem1}.
\end{proof}

\begin{corollary} \label{corollary1}
 The DisATarPE (Algorithm \ref{alg3}) can achieve goal (\ref{1}) with sample complexity
\begin{align}
        \tau = O\Big( \frac{M+1}{M-1} H_\epsilon \log\Big( \frac{H_\epsilon}{\delta} \Big) \Big)
\end{align}
The communication cost satisfies
\begin{align}
   C(\tau) = O\Big(2(M + M^2) \log_2 (H_\epsilon)\Big).
\end{align}
\end{corollary}

\begin{proof} [Proof of Corollary \ref{corollary1}]Let $\tau^\prime$ be a parameter satisfies
\begin{align}
    \tau^\prime \le \tau = \frac{M+1}{M-1}H_{\epsilon}\log\bigg(\frac{4K(1+1/M)\tau^{\prime3}}{\delta}\bigg)
\end{align}
Due to the fact that $\sqrt{x} \ge \log(x)$ holds when $x>0$, we have
\begin{align}
\begin{split}\label{38}
    \tau^\prime &\le \frac{M+1}{M-1}H_{\epsilon}\log\bigg(\Big(\frac{4K(1+1/M)\tau^{\prime}}{\delta}\Big)^3\bigg)\\
    &\le 3\frac{M+1}{M-1}H_{\epsilon}\sqrt{\bigg(\frac{4K(1+1/M)\tau^\prime}{\delta}\bigg)}\\
    &\le \Big(3\frac{M+1}{M-1}H_{\epsilon}\Big)^2\bigg(\frac{4K(1+1/M)}{\delta}\bigg).\\
\end{split}
\end{align}
We define the last term of (\ref{38}) as $P$. Then based on (\ref{37}) and (\ref{38}), we can finally get
\begin{align}
\begin{split}\label{39}
    \tau \le& \frac{M+1}{M-1}H_{\epsilon}\log\bigg(\frac{4K(1+1/M)P^{3}}{\delta}\bigg)\\
    = & O\Big(\frac{M+1}{M-1}H_\epsilon\log\Big(\frac{H_\epsilon}{\delta}\Big)\Big)
\end{split}
\end{align}

In the light of (\ref{33}) and (\ref{39}), we can bound the communication complexity
\begin{align}
   C(\tau) = O\Big(2(M + M^2) \log_2 (H_\epsilon)\Big).
\end{align}
Here we finish the proof of the Corollary \ref{corollary1}.
\end{proof}

\section{The Proposed Asynchronous Algorithm For Linear Case}

In this section, we study a harder problem: finding the best arm of the linear bandit in the asynchronous environment. Technical differences between the linear case and tabular case are listed as follows:
 \begin{enumerate}
  \item In Dis-A-TarPE, we just utilize the single event-triggered strategy. In Dis-A-LinPE, since we need to establish the correlation between $V^{ser}_t$ and $V_{m_t,t}$, as well as between $T^{ser}_t(k)$ and $T_{m_t,t}(k)$, we propose a hybrid event-triggered strategy.
  \item In the tabular case, we can establish the upper confidence bound by directly utilizing the Hoeffding inequality. In the linear case, since the exploration sequence is selected based on past observations, we should establish the upper confidence bound based on Yasin's bound, which requires a well-defined filtration. However, we cannot directly obtain a well-defined filtration. Hence, the evaluation strategy from He et al. should be employed to establish the local bound of $\hat{\t}_{m_t,t}$ and $\hat{\t}^{ser}_t$.
  \item Arm selection strategies are different. In the linear case, we should obtain the selecting strategy by linear programming. However, in tabular cases, we can select the arm directly (line 12 of Algorithm \ref{alg3}). 
\end{enumerate}


 \begin{algorithm}[t]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
	\caption{ (\texttt{DisALinPE}) }
	\label{alg2}
	\begin{algorithmic}[1]
            \STATE \textbf{Inputs:} Arm set $\A$, client set $\M$, regularization parameter $\lambda$ and $(\delta,\epsilon)$
            \STATE \textbf{for} $t = 1:K$ \textbf{do}
            \STATE \quad Agent $m_t$ pulls arm $\x_{m_t,t} = \x_t$ and receives reward $r_{m_t,t} = \x_{m_t,t}^\top\t^* + \eta_{m_t,t}$ 
            \STATE \quad Agent $m_t$ uploads $r_{m_t,t}$ to the server
            \STATE Server sets $V_{K+1}^{ser} = \lambda\bI + \sum_{t=1}^K\x_{m_t,t}\x_{m_t,t}^\top$, $b_{K+1}^{ser} = \sum^K_{t=1} \x_{m_t,t} r_{m_t,t}$ and $T_{K+1}^{ser}(k) = 1,\ \forall k \in \A$
            \STATE \textbf{for} $m=1:M$ \textbf{do}
            \STATE \quad Agent $m$ downloads information from the server and sets $V_{m,K+1} = V_{K+1}^{ser}$, $V_{m,K+1}^{loc} = \bold{0}$, $b_{m,K+1} = b_{K+1}^{ser}$, $b_{m,K+1}^{loc} = \bold{0}$, $T_{m,K+1}(k) = T^{ser}_{K+1}(k)$ and $T_{m,K+1}^{loc}(k) = 0$, $\forall k\in\A$
            \STATE \textbf{for} $t = K+1:\infty$ \textbf{do}
            \STATE \quad Agent $m_t$ sets $\hat{\t}_{m_t,t} = V_{m_t,t}^{-1}b_{m_t,t}$
            \STATE \quad Agent $m_t$ sets $i_{m_t,t} = \arg\max_{k\in\A} \x_k^\top\hat\t_{m_t,t}$ and $j_{m_t,t} = \arg\max_{k\in\A} \hat{\Delta}_{m_t,t}(k,i_{m_t,t}) + \alpha_{m_t,t}(i_{m_t,t},k)$
            \STATE \quad Agent $m_t$ selects the most informative arm $k_{m_t,t}$ by (\ref{select2}) and receives reward $r_{m_t,t}$
            \STATE \quad Agent $m_t$ sets $V_{m_t,t}^{loc} = V_{m_t,t-1}^{loc} + \x_{m_t,t}\x_{m_t,t}^{\top}$, $b_{m_t,t}^{loc} = b_{m_t,t-1}^{loc} + r_{m_t,t}\x_{m_t,t}$ and $T_{m_t,t}^{loc}(\x_{m_t,t}) = T_{m_t,t-1}^{loc}(\x_{m_t,t})+1$
            \STATE \quad \textbf{if} $\text{det}(V_{m_t,t} + V_{m_t,t}^{loc}) > (1+\gamma)\text{det}(V_{m_t,t})$ \textbf{or} $\sum_{k=1}^K (T_{m_t,t}(k) + T^{loc}_{m_t,t}(k)) > (1+\gamma) \sum_{k=1}^K T_{m_t,t}(k)$ \textbf{then}
            \STATE \quad\quad Agent $m_t$ sends $V^{loc}_{m_t,t}$, $b_{m_t,t}^{loc}$ and $T_{m_t,t}^{loc}(k),\ \forall k\in\A$ to the server
            \STATE \quad\quad Server sets $V^{ser}_{t} = V^{ser}_{t} + V^{loc}_{m_t,t}$, $b^{ser}_{t} = b^{ser}_{t} + b_{m_t,t}^{loc}$ and $T_{t}^{ser}(k) = T_{t}^{ser}(k) + T_{m_t,t}^{loc}(k)$, $\forall k\in\A$
            \STATE \quad \quad Server sets $\hat{\t}^{ser}_{t} = {V_{t}^{ser}}^{-1}b^{ser}_{t}$ 
            \STATE \quad \quad Server sets $i^{ser}_t = \arg\max_{\x_k\in\A} \x_k^\top\hat\t^{ser}_t$ 
            \STATE \quad \quad Server sets $j^{ser}_t = \arg\max_{k\in\A} \hat{\Delta}_{t}^{ser}(k,i^{ser}_t) + \alpha_{t}^{ser}(i^{ser}_t,k)$
            \STATE \quad \quad Server sets $B(t) =  \hat{\Delta}_{t}^{ser}(j^{ser}_t,i^{ser}_t) + \alpha_{t}^{ser}(i^{ser}_t,j^{ser}_t)$
            \STATE \quad \quad \textbf{if} $B(t) \le \epsilon$ \textbf{then}
            \STATE \quad \quad \quad Server returns $i_t^{ser}$ as the estimated best arm $\hat k^*$ and break the loop
             \STATE \quad \quad Server sends $V^{ser}_{t}$, $b^{ser}_{t}$ and $T_{t}^{ser}(k)$, $\forall k\in\A$ back to agent $m_t$
            \STATE \quad \quad Agent $m_t$ sets $V_{m_t,t+1} = V^{ser}_{t}$, $ b_{m_t,t+1} = b^{ser}_{t}$ and $ T_{m_t,t+1}(k) = T_{t}^{ser}(k)$, $\forall k\in\A$
            \STATE \quad \quad Agent $m_t$ sets $V_{m_t,t}^{loc} = \bold{0}$, $b_{m_t,t}^{loc} = \bold{0}$ and $T_{m_t,t}^{loc}(k) = 0$, $\forall k\in \A$
            \STATE \quad \textbf{else}
            \STATE \quad \quad Agent $m_t$ sets $V_{m_t,t+1} = V_{m_t,t}$, $ b_{m_t,t+1} = b_{m_t,t}$ and $ T_{m_t,t+1}(k) = T_{m_t,t}(k)$, $\forall k\in\A$
            \STATE \quad Every inactive agent $m \neq m_t$ sets $V_{m,t+1} = V_{m,t}$, $ b_{m,t+1} = b_{m,t}$ and $ T_{m,t+1}(k) = T_{m,t}(k)$, $\forall k\in\A$
	\end{algorithmic}  
\end{algorithm}

\paragraph{DisALinPE}
At the beginning, the DisALinPE would start a warm-up step similar to Algorithm \ref{alg3}.
In each round $t$, the active agent $m_t$ selects $i_{m_t,t}$ (context is $\x^i_{m_t,t}$) and $j_{m_t,t}$ (context is $\x^j_{m_t,t}$) following a strategy similar to that of Algorithm \ref{alg3}. The upper confidence bounds are defined as $\alpha_{m_t,t}(i_{m_t,t},j_{m_t,t}) = \Vert y(i_{m_t,t},j_{m_t,t})\Vert_{V_{m_t,t}^{-1}} \beta_{m_t,t}$ and $\alpha^{ser}_{t}(i^{ser}_t,j^{ser}_t) = \Vert y(i^{ser}_t,j^{ser}_t)\Vert_{{V_{t}^{ser}}^{-1}} \beta^{ser}_{t}$, where the definitions of $\beta_{m_t,t}$ and $\beta^{ser}_{t}$ are provided in the following section. The agent $m_t$ pulls the most informative arm $k_{m_t,t}$ (context is $\x_{m_t,t}$) and receives reward $r_{m_t,t}$. The active agent then updates the local data $V^{loc}_{m_t,t}$, $b^{loc}_{m_t,t}$ and $T^{loc}_{m_t,t}(k)$, $\forall k\in\A$. DisALinPE utilizes a hybrid event-triggered strategy to support communication. If at least one of the two communication events is triggered, the agent uploads $V^{loc}_{m_t,t}$, $b^{loc}_{m_t,t}$ and $T^{loc}_{m_t,t}(k)$, $\forall k\in\A$ to the server. The server updates $V^{ser}_{t}$, $b^{ser}_{t}$ and $T^{ser}_{t}(k)$, $\forall k\in\A$, and sets $i^{ser}_t$ (context is $\x^{ser}_{i,t}$), $j^{ser}_t$ (context is $\x^{ser}_{j,t}$) and $B(t)$. If $B(t) > \epsilon$, the server returns $V^{ser}_{t}$, $b^{ser}_{t}$ and $T^{ser}_{t}(k)$, $\forall k\in\A$ to agent $m_t$. DisALinPE repeats the above steps until $B(\tau) \le \epsilon$. More details of DisALinPE can be found in Algorithm \ref{alg2}.

\paragraph{Arm selection strategy} [Liyuan Xu et al.]
We here provide the arm selection strategy of Algorithm \ref{alg2}. The active agent $m_t$ pulls $k_{m_t,t}$ so as to decrease the matrix norm $\Vert y(i_{m_t,t}, j_{m_t,t}) \Vert_{V_{m_t,t}^{-1}}$ and hence $\alpha_{m_t,t}(i_{m_t,t}, j_{m_t,t})$. Specifically, $k_{m_t,t}$ is selected as
\begin{align} \label{select2}
    k_{m_t,t} = \arg\min_{k\in\A} \frac{T_{m_t,t}(k)}{p_k^*(y(i_{m_t,t}, j_{m_t,t}))}
\end{align}
where 
\begin{align}
    p_k^*(y(i_{m_t,t},j_{m_t,t})) = \frac{\vert w_k^*(y(i_{m_t,t}, j_{m_t,t})) \vert}{\sum_{s=1}^K \vert w_s^*(y(i_{m_t,t}, j_{m_t,t})) \vert}
\end{align}
and $w_k^*(y(i_{m_t,t}, j_{m_t,t}))$ is the solution of the linear programming
\begin{align}
    \arg\min_{w_k} \sum_{k=1}^K\vert 
w_k \vert \quad\quad s.t.\ y(i_{m_t,t}, j_{m_t,t}) = \sum_{k=1}^K w_k\x_k.
\end{align}

\paragraph{Global and local data}
Since the data in the linear case is more complicated than the data in the tabular case, we here provide new notation to illustrate the relations between the global information and the local information. The following matrix, vector and counters denote the global information
\begin{align}
    V_{t}^{all} = \lambda\bI + \sum_{s=1}^t \x_{m_s,s} \x_{m_s,s}^{\top},\quad b_{t}^{all} = \sum_{s=1}^t \x_{m_s,s}r_{m_s,s},\quad T^{all}_t(k) = \sum_{s=1}^t\bone\{\x_k = \x_{m_s,s}\},\ \forall k\in\A.
\end{align}
We also define the data that has been uploaded to the server as
\begin{align}
    \begin{split}
    &V_{m,t}^{up} = \sum_{s=1}^{N_{m,t}} \bone\{m_s = m\} \x_{m_s,s} \x_{m_s,s}^{\top},\quad b_{m,t}^{up} = \sum_{s=1}^{N_{m,t}} \bone\{m_s = m\} \x_{m_s,s} r_{m_s,s}\\&
    T^{up}_{m,t}(k) = \sum_{s=1}^{N_{m,t}}\bone\{m_s = m,\x_k = \x_{m_s,s} \},\ \forall k\in\A,
    \end{split}
\end{align}
where $N_{m,t}$ denotes the last round, up to round $t$, in which agent $m$ communicated with the server. Similarly, the data that has not yet been uploaded to the server is given by
\begin{align}
    \begin{split}
         &V_{m,t}^{loc} = \sum_{s=N_{m,t}+1}^{t} \bone\{m_s = m\} \x_{m_s,s} \x_{m_s,s}^{\top},\quad b_{m,t}^{loc} = \sum_{s=N_{m,t}+1}^{t} \bone\{m_s = m\} \x_{m_s,s} r_{m_s,s}\\&
    T^{loc}_{m,t}(k) = \sum_{s=N_{m,t}+1}^{t}\bone\{m_s = m,\x_k = \x_{m_s,s}\},\ \forall k\in\A.  
    \end{split}
\end{align}

\begin{lemma}\label{lemmarela2} Under the communication strategy of line 13 in Algorithm \ref{alg2}, we can derive
\begin{align}\label{54}
    V^{ser}_t \succeq (1/\gamma)V^{loc}_{m,t},\ \forall m\in\M,\ t\in[K+1,\tau].
\end{align}
Furthermore, following the results of Lemma \ref{lemmarela1}, we can also derive
\begin{align}\label{55}
    T^{ser}_t(k) \ge (1/\gamma)T^{loc}_{m,t}(k),\ \forall m\in\M,\ t\in[K+1,\tau], k\in\A.
\end{align}
\end{lemma}

\begin{proof} [Proof of Lemma \ref{lemmarela2}] The proof of (\ref{54}) is similar to the proof in He et al. Suppose the last round in which agent $m$ communicated with the server is $t_1$. If $t = t_1$, the local data of agent $m$ has just been reset, so we trivially have
\begin{align}
    V^{ser}_t \succ \bold{0} = V^{loc}_{m,t}.
\end{align}
Otherwise, since the first triggering condition has not been met after round $t_1$, we have
\begin{align}
    \text{det}(V_{m,t} + V^{loc}_{m,t}) \le (1+\gamma) \text{det}(V_{m,t}).
\end{align}
According to the lemma of Yasin et al., we have 
\begin{align}
    \frac{\Vert\x\Vert^2_A}{\Vert\x\Vert^2_B} \le \frac{\text{det}(A)}{\text{det}(B)} 
\end{align}
where $A\succ B$ are positive definite matrices. This implies
\begin{align}
    \frac{\Vert\x\Vert^2_{V_{m,t}} + \Vert\x\Vert^2_{V^{loc}_{m,t}}}{\Vert\x\Vert^2_{V_{m,t}}} = \frac{\Vert\x\Vert^2_{V_{m,t} + V^{loc}_{m,t}}}{\Vert\x\Vert^2_{V_{m,t}}} \le \frac{\text{det}(V_{m,t} + V^{loc}_{m,t})}{\text{det}(V_{m,t})} \le 1 + \gamma
\end{align}
and 
\begin{align}
    V_{m,t} \succeq (1/\gamma) V^{loc}_{m,t}.
\end{align}
With the fact that $V^{ser}_t \succeq V_{m,t}$, we can finish the proof of (\ref{54}). 

The proof of (\ref{55}) is similar to the proof of Lemma \ref{lemmarela1}. Combine with the two results and we can finish the whole proof.
\end{proof}

\begin{lemma}\label{lemmaevent2} In this lemma, we should provide $\beta_{m_t,t}$ to bound $\Vert\hat{\t}_{m_t,t} - \t^*  \Vert_{V_{m_t,t}}$ and $\beta^{ser}_{t}$ to bound $\Vert\hat{\t}^{ser}_{t} - \t^*  \Vert_{V_{t}^{ser}}$. Then we define event
  \begin{align}
       \I_{a2} = \Big\{\forall i,j\in\A,\forall t \in [K+1,\tau],\ \vert \hat\Delta_{m_t,t}(i,j) - \Delta(i,j) \vert \le \alpha_{m_t,t}(i,j),\  \vert \hat\Delta^{ser}_{t}(i,j) - \Delta(i,j) \vert \le \alpha^{ser}_{t}(i,j) \Big\}
    \end{align}
and $\bP(\I_{a2}) \ge 1-\delta$.
\end{lemma}

\begin{proof} [Proof of Lemma \ref{lemmaevent2}]
    
\end{proof}

\begin{lemma} [The communication cost of the hybrid event triggered strategy in Algorithm \ref{alg2}] \label{lemmacommunication2}
Till round $\tau$, the triggered number of the first event can be bounded by 
\begin{align}
    (M + 1/\gamma) d\log(1+\frac{\tau}{\lambda d})
\end{align}
and the triggered number of the second event can be bounded by 
\begin{align}\label{58}
     (M + 1/\gamma)\log(\tau).
\end{align}
Hence, the total communication cost can be bounded by
\begin{align}
    C(\tau) \le 2(M + 1/\gamma) d\log\bigg(1+\frac{\tau}{\lambda d} + \tau^{1/d}\bigg).
\end{align}
\end{lemma}

\begin{proof} [Proof of Lemma \ref{lemmacommunication2}]
    The triggered number of the second event can be bounded by Lemma \ref{lemmacommunication1}. Besides, we can bound the triggered number of the first event by a technique similar to He et al. The proof of the first bound consists of two parts: first, we divide the $\tau$ rounds into at most $d\log_2(1 + \tau/(\lambda d))$ episodes; then we analyze the upper bound of the triggered number in each episode. We define
\begin{align}
    T_i = \min\Big\{ t\in[\tau],\ \text{det}(V^{ser}_t) \ge 2^i\lambda^d \Big\},
\end{align}
and divide the set of all rounds into episodes $\{ T_i,T_{i}+1,...,T_{i+1}-1 \}$, $\forall i \ge 0$. We can bound 
\begin{align}
    \text{det}(V^{ser}_\tau) \le \lambda^d \Big(1 + \frac{\tau}{\lambda d}\Big)^d.
\end{align}
We then can bound the number of episodes 
\begin{align}
    \max\{ i\ge0\ \vert\ T_i\not = \emptyset \} 
 \le \log_2\big(\text{det}(V^{ser}_\tau)/\lambda^d\big) \le d\log_2 \Big(1 + \frac{\tau}{\lambda d}\Big).
\end{align}

We then prove that, $\forall i \ge 0 $ with $ T_i \not = \emptyset$, the number of times the first event is triggered from round $T_i$ to $T_{i+1} - 1$ is bounded by $M + 1/\gamma$. Let $N_m$ denote the number of times agent $m$ triggers the first event in rounds $T_i$ to $T_{i+1}-1$, let $t^m_{1},...,t^m_{N_m}$ denote the rounds in which agent $m$ triggers it, let $L$ denote the total number of times the first event is triggered in rounds $T_i$ to $T_{i+1}-1$, and let $t_{i,1},...,t_{i,L}$ denote the corresponding rounds. According to the definition of the first event, for every round $t$ in which it is triggered we have
\begin{align}
\begin{split}
\text{det}(V_{m_t,t} + V_{m_t,t}^{loc}) > (1+\gamma)\text{det}(V_{m_t,t})
\end{split}
\end{align}
Then, $\forall j\in [N_m],\ j\ge2$, we have 
\begin{align}
\text{det}(V_{T_i}^{ser} + V_{m,t^m_j}^{loc}) > (1+\gamma)\text{det}(V^{ser}_{T_i}).
\end{align}
The inequality holds since, $\forall j\in [N_m],\ j\ge2$, $V^{ser}_{T_i} \preceq V_{m,t^m_j}$ and 
\begin{align}
    \text{det}(A + B + C) + \text{det}(A) \ge \text{det}(A + B) + \text{det}(A + C)
\end{align}
where $A$, $B$ and $C$ are arbitrary positive definitive matrices. 
The above inequality implies $\forall t_{i,l} \ge t_2^{m_{t_{i,l}}}$
\begin{align}
\begin{split}
\text{det} (V^{ser}_{t_{i,l+1}}) - \text{det}(V^{ser}_{t_{i,l}}) & \ge \text{det}( V^{ser}_{t_{i,l}} + V_{m_{t_{i,l}},t_{i,l}}^{loc}) - \text{det}( V^{ser}_{t_{i,l}})\\
& \ge \text{det}( V^{ser}_{T_i} + V_{m_{t_{i,l}},t_{i,l}}^{loc}) - \text{det}( V^{ser}_{T_i})\\
& \ge \gamma\text{det}(V^{ser}_{T_i}).
\end{split}
\end{align}
Finally we can bound $L = \sum_{m=1}^M N_m$
\begin{align}
    \begin{split}
       \text{det} (V_{T_{i+1} - 1 }^{ser}) - \text{det}( V_{T_{i}}^{ser}) &\ge \sum_{l=1}^{L-1} \Big( \text{det}(V^{ser}_{t_{i,l+1}}) - \text{det}(V^{ser}_{t_{i,l}}) \Big)\\
       &\ge \gamma \sum_{m=1}^M (N_m - 1) \text{det}(V_{T_i}^{ser})
    \end{split}
\end{align}
Due to the definition of the episode, it has $2\text{det}(V_{T_i}^{ser}) \ge \text{det}(V_{T_{i+1} - 1}^{ser})$. We can rewrite equation (\ref{9}) as
\begin{align} 
      M + 1/\gamma \ge \sum_{m=1}^M N_m  
\end{align}
 We can then bound the total triggered number of the first event
\begin{align}
   C_1(\tau) \le (M+1/\gamma)\, d\log\Big(1+ \frac{\tau}{\lambda d}\Big).
\end{align}
Since communication happens whenever at least one of the two events is triggered, the total number of communication rounds is at most the sum of the triggered numbers of the two events. Hence, the total communication number can be bounded by
\begin{align}
    (M + 1/\gamma) d\log\bigg(1+\frac{\tau}{\lambda d} + \tau^{1/d}\bigg).
\end{align}
Furthermore, since one communication includes one upload and one download, the total communication cost can be bounded by
\begin{align}
   C(\tau) \le 2(M + 1/\gamma) d\log\bigg(1+\frac{\tau}{\lambda d} + \tau^{1/d}\bigg).
\end{align}
Here we finish the proof.
\end{proof}

\begin{lemma} \label{lemmamatrixbound} For all $t\in [K+1,\tau]$, The matrix norm $\Vert y(i,j) \Vert_{V_{m_t,t}^{-1}}$ can be bounded by
\begin{align}\label{72}
    \Vert y(i,j) \Vert_{V_{m_t,t}^{-1}} \le \sqrt{\frac{\rho(y(i, j))}{T_{m_t,t}(i, j)}},\ \text{and}\ \Vert y(i,j) \Vert_{V^{ser^{-1}}_{t}} \le \sqrt{\frac{\rho(y(i, j))}{T^{ser}_{t}(i, j)}},\ \forall i,j \in \A,
\end{align}
where
\begin{align}
\begin{split}
    & T_{m_t,t}(i,j) = \min_{k\in \A,\ p_k^*(y(i,j)) > 0} T_{m_t,t}(k)/p_k^*(y(i,j))\\
    & T^{ser}_{t}(i,j) = \min_{k\in \A,\ p_k^*(y(i,j)) > 0} T^{ser}_{t}(k)/p_k^*(y(i,j))
\end{split}
\end{align}
and
\begin{align}
    \rho(y(i, j)) = \sum_{k=1}^K \vert w^*_k(y(i,j)) \vert.
\end{align}
\end{lemma}

\begin{proof}[Proof of Lemma \ref{lemmamatrixbound}] This proof is similar to the proof of Liyuan Xu et al. First, note that $V_{m_t,t}$ and $T_{m_t,t}(k)$, $\forall k\in\A,\ t\in[K+1,\tau]$, are all downloaded from the server. This implies that, $\forall t_1\in[K+1,\tau]$, there exists a $t_2\in[K+1,\tau]$ which satisfies 
\begin{align}
    V_{m_{t_1},t_1} = V^{ser}_{t_2}\ \text{and}\ T_{m_{t_1},t_1}(k) = T^{ser}_{t_2}(k),\ \forall k\in\A.
\end{align}
This implies we only need to prove the second inequality of (\ref{72}). We can decompose the covariance matrix $V^{ser}_{t}$ as
\begin{align}
    V^{ser}_{t}  = \lambda\bI + \sum_{k=1}^K
    T^{ser}_{t}(k) \x_k\x_k^\top.
\end{align}
We set $\tilde{V}_{t}^{ser} = \lambda\bI + \sum_{k=1}^K T^{ser}_t(i,j)p_k^*(y(i,j)) \x_k\x_k^\top$. From (\ref{18}), we have 
\begin{align}
    T^{ser}_t(i,j)p_k^*(y(i,j)) \le T_t^{ser}(k),\ \forall k\in\A
\end{align}
which implies $\tilde{V}_{t}^{ser} \preceq V_{t}^{ser}$ and 
\begin{align}
    y(i,j)^\top V_t^{ser^{-1}} y(i,j) \le y(i,j)^\top \tilde{V}_t^{ser^{-1}} y(i,j),\ i,j \in \A.
\end{align}
We then bound $y(i,j)^\top \tilde{V}_t^{ser^{-1}} y(i,j)$, according to the KKT condition of (\ref{9}), we have the following formulas
\begin{align}
\begin{split}
    & w_k^*(y(i,j)) = \frac{1}{2} p_k^*(y(i,j)) \x_k^\top\varepsilon,\ \forall k\in\A \\
    & y(i,j) = \frac{1}{2}\sum_{k=1}^K p_k^*(y(i,j))\x_k \x_k^\top \varepsilon,\ \forall i,j\in\A
\end{split}
\end{align}
where $\varepsilon\in\R^d$ corresponds to the Lagrange multiplier. We can rewrite $y(i,j)^\top \tilde{V}_t^{ser^{-1}} y(i,j)$ as
\begin{align}\label{24}
    y(i,j)^\top \tilde{V}_t^{ser^{-1}} y(i,j) = \frac{1}{4} \bigg(\sum_{k=1}^K p_k^*(y(i,j))\x_k \x_k^\top \varepsilon\bigg)^\top \tilde{V}_t^{ser^{-1}} \bigg(\sum_{k=1}^K p_k^*(y(i,j))\x_k \x_k^\top \varepsilon\bigg).
\end{align}
Besides, we can rewrite $\rho(y(i,j))$ as 
\begin{align}\label{25}
    \rho(y(i,j)) = \sum_{k=1}^K \frac{w_k^{*2}(y(i,j))}{p_k^*(y(i,j))} = \frac{1}{4} \varepsilon^\top \bigg( \sum_{k=1}^K p_k^*(y(i,j))\x_k\x_k^\top \bigg) \varepsilon
\end{align}
In the light of (\ref{24}) and (\ref{25}), we can bound $ y(i,j)^\top \tilde{V}_t^{ser^{-1}} y(i,j) - \rho(y(i,j))/T_t^{ser}(i,j)$ from above by $0$
\begin{align}
    \begin{split}
    \nonumber
&y(i,j)^\top \tilde{V}_t^{ser^{-1}} y(i,j) - \frac{\rho(y(i,j))}{T_t^{ser}(i,j)}\\ =& \frac{1}{4}\varepsilon^\top \bigg( \bigg(\sum_{k=1}^K p_k^*(y(i,j))\x_k\x_k^\top\bigg) - \frac{\tilde{V}_t^{ser}}{T^{ser}_t(i,j)} \bigg)\tilde{V}_t^{ser^{-1}}\bigg(\sum_{k=1}^K p_k^*(y(i,j))\x_k\x_k^\top\bigg) \varepsilon\\
=& - \frac{\lambda}{4T^{ser}_t(i,j)}\varepsilon^\top \tilde{V}_t^{ser^{-1}}\bigg(\sum_{k=1}^K p_k^*(y(i,j))\x_k\x_k^\top\bigg) \varepsilon\\
\le & 0.
    \end{split}
\end{align}
The second equality holds due to  the definition of the $\tilde{V}_t^{ser}$, the last inequality holds due to the definition of the positive definite matrix. Here we finish the proof.
\end{proof}

\begin{lemma}\label{lemmabound2}
    Under event $\I_{a2}$, $\forall t\in[K+1,\tau]$, $B(t)$ can be bounded as follows
    \begin{align}\label{27}
    B(t) \le \min\Big(0,-\max\Big(\Delta(k^*,i^{ser}_t),\Delta(k^*,j^{ser}_t)\Big) +  2\alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})\Big) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t}).
    \end{align}
\end{lemma}

\begin{proof}[Proof of Lemma \ref{lemmabound2}] 
    This proof is similar to the Proof of Lemma \ref{lemmabound1}. According to the definition of the event $\I_{a2}$, consider the case when $i_{t}^{ser} = k^*$, we have
\begin{align}
\begin{split}\label{83}
    B(t) &= \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \alpha^{ser}_{t}(i^{ser}_{t},j^{ser}_{t}) \\& \le  \Delta(j^{ser}_t,i^{ser}_t) + 2\alpha^{ser}_{t}(i^{ser}_{t},j^{ser}_{t})
    \\&
    = - \Delta(k^*,j^{ser}_t) + 2\alpha^{ser}_{t}(i^{ser}_{t},j^{ser}_{t})
\end{split}
\end{align}

Consider the case when $j^{ser}_t = k^*$, we have
\begin{align}
\begin{split}\label{84}
   B(t) & \le  -\hat{\Delta}^{ser}_t(j^{ser}_t,i^{ser}_{t}) + \alpha^{ser}_{t}(i^{ser}_{t},j^{ser}_{t})
   \\& \le  -\Delta(j^{ser}_t,i^{ser}_{t}) + 2\alpha^{ser}_{t}(i^{ser}_{t},j^{ser}_{t})
    \\&
    = - \Delta(k^*,i^{ser}_t) + 2\alpha^{ser}_{t}(i^{ser}_{t},j^{ser}_{t})
\end{split}
\end{align}
where the first step holds owing to the definition of $i_{t}^{ser}$, which gives $\hat{\Delta}^{ser}_t(j^{ser}_t,i^{ser}_{t})\le 0$. 

Combine (\ref{83}) and (\ref{84}), it yields
\begin{align}\label{85}
  B(t) \le \min\Big(0,-\max\Big(\Delta(k^*,i^{ser}_t),\Delta(k^*,j^{ser}_t)\Big) +  \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})\Big) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t}).
\end{align}

Consider the case when $i_{t}^{ser} \not= k^*$ and $j_{t}^{ser} \not= k^*$, then we can derive
\begin{align}
\begin{split}
    B(t) &= \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})\\ & \le \Delta(j^{ser}_t,k^*) + \Delta(k^*,i^{ser}_t) + 2\alpha^{ser}_t(i^{ser}_t,j^{ser}_{t}) \\& \le \Delta(j^{ser}_t,k^*) + 3\alpha^{ser}_t(i^{ser}_t,j^{ser}_{t}) \\
    & = -\Delta(k^*,j^{ser}_t) + 3\alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})
\end{split}
\end{align}
where the third step holds owing to 
\begin{align}
\begin{split}
\alpha_t^{ser}(i_t^{ser},j^{ser}_t) &\ge \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})\\ 
&\ge \hat{\Delta}^{ser}_{t}(k^*,i^{ser}_{t}) + \alpha^{ser}_t(i^{ser}_t,k^*)\\
&\ge \Delta(k^*,i^{ser}_{t})
\end{split}
\end{align} We also can show
\begin{align}
\begin{split}\label{89}
    B(t) &= \hat{\Delta}^{ser}_{t}(j^{ser}_t,i^{ser}_{t}) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t}) \\& \le  \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})\\
    &\le  
 - \Delta(k^*,i^{ser}_t) + \hat{\Delta}^{ser}_t(k^*,i^{ser}_t) + \alpha^{ser}_t(i^{ser}_t,k^*) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})\\
    & \le  - \Delta(k^*,i^{ser}_t) + \hat{\Delta}^{ser}_t(j^{ser}_t,i^{ser}_t) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t}) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})
    \\
    & \le -\Delta(k^*,i^{ser}_t) +  2\alpha^{ser}_t(i^{ser}_t,j^{ser}_{t})
\end{split}
\end{align}
The third inequality is due to the definition of the event $\I_{a2}$ and the fourth inequality is due to the definition of $j_{t}^{ser}$. 

Combine (\ref{85}) and (\ref{89}), it yields
\begin{align}\label{90}
        B(t) \le \min\Big(0,-\max\Big(\Delta(k^*,i^{ser}_t),\Delta(k^*,j^{ser}_t)\Big) + 2\alpha^{ser}_t(i^{ser}_t,j^{ser}_{t}) \Big) + \alpha^{ser}_t(i^{ser}_t,j^{ser}_{t}).
\end{align}
Combine the (\ref{85}) and (\ref{90}), then we can finish the proof.
\end{proof}

\begin{lemma} \label{serverlemma2} At the terminated round $\tau$, we can bound 
\begin{align}
    T_\tau^{ser}(k) \le \frac{1}{1 - M\gamma}\max_{i,j\in\A}\frac{\rho(y(i,j))p^*_k(y(i,j))}{\max\big(\frac{\Delta(k^*,i) + \epsilon}{3}, \frac{\Delta(k^*,j) + \epsilon}{3}, \epsilon\big)^2} \beta^{ser^2}_\tau,\ \forall k\in\A.
\end{align}
\end{lemma}

\begin{proof} [Proof of Lemma \ref{serverlemma2}] The difference between this proof and that of Lemma \ref{serverlemma1} is that Algorithm \ref{alg2} employs a different arm selection strategy. Suppose agent $m$ communicates in rounds $t_1$ and $t_2$; then for every round $t \in [t_1 + 1,t_2]$, since $\hat{\t}_{m,t}$ and $T_{m,t}(k)$ remain unchanged, $k_{m,t}$ does not change either. For every $k\in\A$, let $t_k$ be the last round in which some agent $m$ communicates with the server and then sets $k_{m,t_k + 1} = k$; that is, in rounds $t \in [t_k + 1,\tau]$, no agent $m_t$ sets $k_{m_t,t} = k$ after communicating with the server. This implies
\begin{align}
\begin{split}
    T_\tau^{ser}(k) \le& T^{ser}_{t_k}(k) + (M\gamma) T_\tau^{ser}(k) \\ =& T_{m,t_k+1}(k) + (M\gamma) T_\tau^{ser}(k).
\end{split}
\end{align}
The inequality holds since, for $t\in[t_k + 1,\tau]$, every agent $m\in\M$ uploads $T_{m,t}^{loc}(k)>0$ to the server at most once, and $T_{m,t}^{loc}(k) \le \gamma T^{ser}_\tau(k)$ according to Lemma \ref{lemmarela2}; summing over the $M$ agents gives the factor $M\gamma$.

 With Lemma \ref{lemmabound2}, we can derive
\begin{align}
\begin{split}\label{100}
    \epsilon &\le B(t_k)\\
             &\le \min\Big(0,-\max(\Delta(k^*,i_{m,t_k + 1}),\Delta(k^*,j_{m,t_k + 1})) + 2\alpha_{m,t_k + 1}(i_{m,t_k + 1},j_{m,t_k + 1}) \Big) + \alpha_{m,t_k + 1}(i_{m,t_k + 1},j_{m,t_k + 1}).
\end{split}
\end{align}
We further bound $T^{ser}_{t_k}(k) = T_{m,t_k+1}(k)$. Recalling the arm selection strategy of Algorithm \ref{alg2}, when $k$ is chosen by agent $m$ in round $t_k + 1$, we have
\begin{align}\label{101}
    T_{m,t_k+1}(i_{m,t_k + 1},j_{m,t_k + 1}) = T_{m,t_k+1}(k)/p^*_k(y(i_{m,t_k + 1},j_{m,t_k + 1})).
\end{align}
Substituting (\ref{72}) and (\ref{100}) into (\ref{101}), we can derive
\begin{align}
\begin{split}
  T^{ser}_{t_k}(k) = T_{m,t_k + 1}(k) & \le \frac{\rho(y(i_{m,t_k + 1},j_{m,t_k + 1}))p^*_k(y(i_{m,t_k + 1},j_{m,t_k + 1}))}{\max\big(\frac{\Delta(k^*,i_{m,t_k + 1}) + \epsilon}{3}, \frac{\Delta(k^*,j_{m,t_k + 1}) + \epsilon}{3}, \epsilon\big)^2} \beta^2_{m,t_k+1}\\ & \le \max_{i,j\in\A} \frac{\rho(y(i,j))p^*_k(y(i,j))}{\max\big(\frac{\Delta(k^*,i) + \epsilon}{3}, \frac{\Delta(k^*,j) + \epsilon}{3}, \epsilon\big)^2} \beta^{ser^2}_{\tau}
\end{split}
\end{align}
We can finally bound $T^{ser}_\tau(k)$, i.e.
\begin{align}
\begin{split}
    T^{ser}_\tau(k) &\le \max_{i,j\in\A} \frac{\rho(y(i,j))p^*_k(y(i,j))}{\max\big(\frac{\Delta(k^*,i) + \epsilon}{3}, \frac{\Delta(k^*,j) + \epsilon}{3}, \epsilon\big)^2} \beta^{ser^2}_{\tau} + M\gamma T^{ser}_\tau(k)\\
    &\le \frac{1}{1 - M\gamma} \max_{i,j\in\A} \frac{\rho(y(i,j))p^*_k(y(i,j))}{\max\big(\frac{\Delta(k^*,i) + \epsilon}{3}, \frac{\Delta(k^*,j) + \epsilon}{3}, \epsilon\big)^2} \beta^{ser^2}_{\tau}.
\end{split}
\end{align}
Here we finish the proof.
\end{proof}

\begin{theorem} \label{theorem2} With $\gamma = 1/M^2$, DisALinPE (Algorithm \ref{alg2}) can achieve goal (\ref{1}) with sample complexity
\begin{align}
        \tau \le \frac{M+1}{M-1} 
        H_\epsilon \beta^{ser^2}_{\tau}.
\end{align}
where 
\begin{align}
        H_\epsilon =  \sum_{k=1}^K \max_{i,j\in\A} \frac{\rho(y(i,j))p^*_k(y(i,j))}{\max\big(\frac{\Delta(k^*,i) + \epsilon}{3}, \frac{\Delta(k^*,j) + \epsilon}{3}, \epsilon\big)^2}.
\end{align}
The communication cost can be bounded by
\begin{align}
      C(\tau) \le 2(M + 1/\gamma) d\log\bigg(1+\frac{\tau}{\lambda d} + \tau^{1/d}\bigg).
\end{align}
\end{theorem}

\begin{proof}[Proof of Theorem \ref{theorem2}]  According to the breaking condition of the Algorithm \ref{alg2} (line 19$\sim$20), we have 
\begin{align}
\epsilon \ge \hat\Delta^{ser}_{\tau} (j^{ser}_\tau,i^{ser}_\tau) + \alpha^{ser}_{\tau}(i^{ser}_\tau,j^{ser}_\tau) = B(\tau).
\end{align}
Due to 
\begin{align}
    \hat\Delta^{ser}_{\tau} (j^{ser}_\tau,i^{ser}_\tau) + \alpha^{ser}_{\tau}(i^{ser}_\tau,j^{ser}_\tau) \ge \hat\Delta^{ser}_{\tau} (k^*,i^{ser}_\tau) + \alpha^{ser}_{\tau}(i^{ser}_\tau,k^*).
\end{align}
Recall that $\hat{k}^* = i^{ser}_{\tau}$ is the estimated best arm. Besides, according to the definition of event $\I_{a2}$, we can derive
\begin{align}
    \epsilon \ge \hat\Delta^{ser}_{\tau} (k^*,\hat k^*) + \alpha^{ser}_{\tau}(\hat k^*,k^*) \ge \Delta(k^*,\hat k^*).
\end{align}
Furthermore, due to the event $\I_{a2}$ would happen with probability at least $1-\delta$, the $(\epsilon,\delta)$-condition (\ref{1}) is satisfied.

With the Lemma \ref{serverlemma2}, we can bound 
\begin{align}\label{eq:tau-bound-linear}
\begin{split}
    \tau & = \sum_{k=1}^K T^{ser}_\tau(k) + \sum_{m=1}^M \sum_{k=1}^K T_{m,\tau}^{loc}(k)
    \\& \le (1+1/M) \sum_{k=1}^K T^{ser}_\tau(k)\\
    &\le \sum_{k=1}^{K}\frac{M + 1}{M - 1}\max_{i,j\in\A} \frac{\rho(y(i,j))p^*_k(y(i,j))}{\max\big(\frac{\Delta(k^*,i) + \epsilon}{3}, \frac{\Delta(k^*,j) + \epsilon}{3}, \epsilon\big)^2} \beta^{ser^2}_{\tau}\\
    &\le \frac{M+1}{M-1}H_{\epsilon}\beta^{ser^2}_{\tau}.
\end{split}
\end{align}
Combine with the result of Lemma \ref{lemmacommunication2}, here we finish the proof of Theorem \ref{theorem2}.
\end{proof}

\begin{remark}
Recently, Yihan Du et al. have provided an algorithm for the $(\epsilon,\delta)$-pure exploration problem of the collaborative kernel bandit. We here illustrate the differences between their algorithms and ours. Compared with their algorithms, our algorithms have the following advantages

\begin{enumerate}
  \item Their algorithms considered the P2P communication network, while our algorithms consider the star shape communication network.
  \item Their algorithm needs to solve a minimax optimization by kernelized gradient descent to derive the optimal allocation, which may generate a large communication burden and is hard to implement in the distributed communication framework. Our algorithm only needs to solve a simple linear program.
  \item Their algorithms can only work in the synchronous setting owing to their static exploration nature, while our algorithms consider both the synchronous and asynchronous settings.
  \item In synchronous setting, their exploration episode is totally fixed, while our Algorithm \ref{alg1} can arbitrarily adjust the $L$ to balance the trade-off between sample complexity and communication.
\end{enumerate}

Their algorithms have the following advantages

\begin{enumerate}
  \item They studied the kernelized setting, which is more general than our linear setting.
  \item In their setting, different agents face individual problems, while in our setting, all agents cooperatively solve the same problem.
  \item They consider both the fixed budget and fixed confidence problem, while we only consider the fixed confidence problem.
\end{enumerate}

\end{remark}

\section{Experiment}

\section{Related Work}

\section{Conclusion}

\end{document}

