\section{Appendix / symbol explanation}
For all symbols in this article, we give explanations of them in Table~\ref{symbol}.
\begin{table*}[h]
\centering
\begin{tabular}{|c|c|}
\hline
\textbf{Symbol/Term} & \textbf{Definition} \\ 
\hline
$\mathcal{G}(\mathcal{N},\mathcal{E},\mathcal{X})$ & A graph to describe a multi-agent system\\
\hline
$\mathcal{N} = \{1, \dots, N\}$ & Set of agents in a multi-agent system \\ 
\hline
$\mathcal{E}\subset \mathcal{N}\times \mathcal{N}$ & The edge set in graph $\mathcal{G}$ \\
\hline
$\mathcal{X}=[a_{i,j}]_{N\times N}$ & The weight matrix to describe the relations between agents \\
\hline
$\mathcal{N}_j$ & Neighborhood of agent $j$, excluding agent $j$ \\ 
\hline
$\bm{W} = [\omega_{a,b}]_{N \times N}$ & Communication matrix \\ 
\hline
$D$ & The diameter of graph $\mathcal{G}$ \\ 
\hline
$\lambda_2$ & The second largest eigenvalue of $\bm{W}$ \\ 
\hline
$Q$ & A symbol to describe whether the graph is balanced \\ 
\hline
$\mathcal{K} = \{1, \dots, K\}$ & Set of arms in a multi-armed bandit (MAB) problem \\ 
\hline
$T$ & Total number of time slots \\ 
\hline
$A_j(t)$ & Arm chosen by agent $j$ at time slot $t$ \\ 
\hline
$X_{A_j(t), j}(t)$ & Random reward received by agent $j$ after pulling arm $A_j(t)$ at time slot $t$ \\ 
\hline
$X_{i,j}(t)$ & Random reward of arm $i$ observed by agent $j$ at time slot $t$ \\ 
\hline
$\mu_{i,j}$ & Mean reward of arm $i$ observed by agent $j$, bounded in $[0,1]$ \\ 
\hline
$X_{i}(t)$ & Global reward of arm $i$ at time slot $t$ \\ 
\hline
$\mu_i$ & Global mean reward of arm $i$ \\ 
\hline
$\tilde{\mu}_{i,j}(t)$ & The global estimate of agent $j$ of arm $i$ at time slot $t$ \\ 
\hline
$i^{\star}$ & The unique optimal arm with the largest global mean reward \\ 
\hline
$\mathcal{S}_j(t)$ & The candidate arm set of agent $j$ at time slot $t$ \\ 
\hline
$\mathcal{B}_j(t)$ & The elimination arm set of agent $j$ at time slot $t$ \\ 
\hline
$t_i$ & The time label attached to arm $i$ \\ 
\hline
$\Delta_i = \mu_{i^{\star}} - \mu_i$ & Reward gap between the optimal arm and arm $i$ \\ 
\hline
$\tau_{i,j}(t)$ & The sample count of agent $j$ on arm $i$ until time slot $t$ \\ 
\hline
$\tau_i(t)$ & The global sample count on arm $i$ \\ 
\hline
$\delta$ & The violation probability of confidence interval \\ 
\hline
$U_{i,j}(t,\delta)$ & The radius of confidence interval \\ 
\hline
$\texttt{UCB}_{i,j}$ & The upper confidence bound of agent $j$ on arm $i$ \\ 
\hline
$\texttt{LCB}_{i,j}$ & The lower confidence bound of agent $j$ on arm $i$ \\ 
\hline
$\mathbb{E}[{R^T}(\mathcal{A})]$ & Expected group regret for a distributed algorithm $\mathcal{A}$ \\ 
\hline
$\mathbb{E}[{R_{j}^T}(\mathcal{A})]$ & Expected individual regret for agent $j$ in a distributed algorithm $\mathcal{A}$ \\ 
\hline
\end{tabular}
\caption{Summary of symbols and definitions}
\label{symbol}
\end{table*}






\section{Appendix / some knowledge of graphs}\label{appendix: multi-agent system}
Throughout this paper, we consider FMAB problems with $N$ agents operating in a time-invariant network. The network is represented by a communication graph $\mathcal{G}(\mathcal{N}, \mathcal{E}, \mathcal{X})$, which consists of three components:

\begin{enumerate}
    \item $\mathcal{N} = \{1, \ldots, N\}$ is the set of agents in the network, corresponding to the number of agents in the distributed system.
    \item $\mathcal{E} \subset \mathcal{N} \times \mathcal{N}$ is the edge set, which determines the connectivity between agents. 
    \item $\mathcal{X} = [a_{j,j^{\prime}}]_{N \times N}$ is the adjacency matrix of the graph $\mathcal{G}$, where $a_{j,j^{\prime}}$ denotes the weight of the edge between agents $j$ and $j^{\prime}$. 
\end{enumerate}

Notably, the adjacency matrix represents the importance of one agent to its neighbors and encodes neighborhood information in $\mathcal{G}$. Specifically, $a_{j,j^{\prime}}$ is the weight from agent $j^{\prime}$ to agent $j$. Since the graph is directed, $a_{j,j^{\prime}}$ and $a_{j^{\prime},j}$ are not necessarily equal.

The graph has no self-loops, meaning that $a_{j,j} = 0$ for all $j \in \mathcal{N}$. An edge between agents $j$ and $j^{\prime}$ exists if and only if $a_{j,j^{\prime}} \neq 0$, i.e., $(j,j^{\prime}) \in \mathcal{E}$. 

For each agent $j$, its neighborhood is denoted as $\mathcal{N}_j = \{ j^{\prime} \mid j^{\prime} \in \mathcal{N}, a_{j,j^{\prime}} \neq 0, j^{\prime} \neq j \}$. Finally, we define the diameter of the graph $\mathcal{G}$ as $D$, which represents the longest distance between any two agents in the network.

For graph $\mathcal{G}$, its corresponding Laplacian matrix $\mathcal{L}$ is defined as follows
\begin{equation*}
    \mathcal{L}_{j,j^{\prime}}=\begin{cases}
    -a_{j,j^{\prime}}, & j\neq j^{\prime},\\
    \sum_{k=1}^N a_{j,k}, & j=j^{\prime}.
    \end{cases}
\end{equation*}

The maximum degree of graph $\mathcal{G}$ is defined as $\epsilon=\max_j(\sum_{j^{\prime}\neq j}a_{j,j^{\prime}})$. Then, for any constant $\beta\in(0,1/\epsilon]$, the Perron matrix $\bm{W}=\bm{I}-\beta\mathcal{L}$ can be obtained. The Perron matrix $\bm{W}=[\omega_{i,j}]_{N\times N}$ is a doubly stochastic matrix, i.e., every row and every column of $\bm{W}$ sums to $1$. In the multi-agent bandit setting, it is widely used to solve the consensus problem \citep{olfati2007consensus}. 

\section{Appendix / preliminaries of the problem}
\begin{lemma}\label{lemma1} \citep{yan2012distributed}
Let $W$ be an irreducible, doubly stochastic matrix with strictly positive diagonal entries. Then, there exists a positive constant $Q$ such that
$$ \sum_{j=1}^N \left| \omega^k_{i,j} - \frac{1}{N} \right| < Q \lambda_2^k, $$
where $\omega^k_{i,j}$ represents the element in the $i$-th row and $j$-th column of the matrix ${W}^k$, $k$ represents the iteration step, and $\lambda_2$ is the second largest eigenvalue of matrix ${W}$. $Q$ is equal to 1 if the graph $\mathcal{G}$ is balanced; otherwise, $Q = \sqrt{N}$.
\end{lemma}

\begin{lemma}\label{lemma3}  \citep{lattimore2020bandit}
Suppose that $X_i$ is $\sigma_i^2$ sub-Gaussian and $X_i$ are all independent for $i\in\{1,\dots,N\}$. Then $\frac{1}{N}\sum_{i=1}^NX_i$ is $\frac{\sum^N_{i=1}\sigma_i^2}{N^2}$ sub-Gaussian.
\end{lemma}

\begin{lemma}\label{lemma2} \citep{molloy2002graph}
Assume that $X(t)-\mu$, $t=1,\dots,n$, are independent $\sigma^2$ sub-Gaussian random variables. Then for any $\epsilon\geq0$,
\begin{equation*}
\begin{split}
    &\mathbb{P}(\hat{\mu}\geq\mu+\epsilon)\leq \exp(-\frac{n\epsilon^2}{2\sigma^2}),\\
    &\mathbb{P}(\hat{\mu}\leq\mu-\epsilon)\leq \exp(-\frac{n\epsilon^2}{2\sigma^2}),\\
\end{split}
\end{equation*}
where $\hat{\mu}=\frac{1}{n}\sum^n_{t=1}X(t)$ and $n$ is the sample count. 
\end{lemma}

\begin{lemma}\label{lemma5} \citep{dubhashi2009concentration}
If a random variable $X$ has a finite mean and $a\leq X\leq b$ almost surely, then $X$ is $\frac{1}{4}(b-a)^2$ sub-Gaussian.
\end{lemma}

\begin{lemma}\label{lemma6}\citep{gross2003handbook}
For a strongly connected graph $\mathcal{G}$ with $N$ nodes and diameter $D$, the second largest eigenvalue of Perron matrix $W$ is bounded by
    \begin{equation*}
        \lambda_2\le 1-\frac{\beta}{ND},
    \end{equation*}
with $\beta\in(0,1/\epsilon]$ and $\epsilon=\max_j(\sum_{j^{\prime}\neq j}a_{j,j^{\prime}})$.
\end{lemma}


\begin{lemma}\label{lemma7}\cite{lattimore2020bandit}
In FMAB problems, let $\mathcal{D}_j(i)$ denote the event that agent $j$ eliminates the optimal arm $i^\star$ in favor of some suboptimal arm $i$. Then, the probability of this event is bounded by
$$ \mathbb{P}(\mathcal{D}_j(i)) \le \delta. $$

Assuming the violation probability is $\delta = \frac{1}{T^2}$, the regrets incurred from the erroneous elimination of the optimal arm are of $O(1)$ order.
\end{lemma}

\section{Appendix / missed proofs}\label{appendix: missed proofs}
\subsection{Proof of Lemma~\ref{lemma: distributed estimation}}\label{appendix: proof of lemma 1}
\begin{proof}
To clearly illustrate the relationship between the sampling count and the global estimate, let $X_{i,j}^{\tau_{i,j}(t)}$ represent the reward, and $\tilde{\mu}_{i,j}^{\tau_{i,j}(t)}$ the global estimate, both for agent $j$ when pulling arm $i$ at the $\tau_{i,j}(t)$-th sample. Here, $\tau_{i,j}(t)$ denotes the number of times agent $j$ has pulled arm $i$ up to time slot $t$. Benefiting from the design of Algorithm~\ref{alg: DAEE}, all agents sample arm $i$ at the same frequency. In the proof, we focus on arm $i$ and use $\tau$ to represent the sampling count $\tau_{i,j}$ for simplicity.

For the sake of computation, stack the value of $\tilde{\mu}_{i,j}^{\tau}$ and $X_{i,j}^{\tau}$ into vectors as follows
\begin{equation*}\label{z3}
    \begin{split}
        & \bm{\tilde{\mu}_{i}^{\tau}}:=[ \tilde{\mu}_{i,1}^{\tau},\dots, \tilde{\mu}_{i,N}^{\tau}]^T,\\
        & \bm{X_{i}^{\tau}}:=[X_{i,1}^{\tau},\dots,X_{i,N}^{\tau}]^T.\\
    \end{split}
\end{equation*}

Stacking all global estimates $\tilde{\mu}_{i,j}^{\tau}$, equation \eqref{iteration} can be rewritten as
\begin{equation}\label{z4}
     \bm{\tilde{\mu}_{i}^{\tau}}=(1-\sigma_i(\tau))\bm{W}\bm{\tilde{\mu}_{i}^{\tau-1}}+\sigma_i(\tau)\bm{X_{i}^{\tau}}.
\end{equation}

Substituting $\sigma_i(\tau)=\frac{1}{\tau+1}$ into \eqref{z4} and iterating it, we have
\begin{equation*}\label{z5}
    \bm{\tilde{\mu}_{i}^{\tau}}=\frac{1}{\tau+1}\bm{W}^{\tau}\bm{\tilde{\mu}_{i}^{0}}+\frac{1}{\tau+1}\sum^{\tau}_{k=1}\bm{W}^{\tau-k}\bm{X_{i}^k}.
\end{equation*}

In Algorithm~\ref{alg: DAEE}, when $t=0$, there is no communication between agents. Hence, we denote $\tilde{\mu}_{i,j}^0=X_{i,j}^0$ (Line~\ref{DRRB_2}, Algorithm~\ref{alg: DAEE}). Then, the above equation could be rewritten as
\begin{equation*}\label{z5_1}
    \bm{\tilde{\mu}_{i}^{\tau}}=\frac{1}{\tau+1}\sum^{\tau}_{k=0}\bm{W}^{\tau-k}\bm{X_{i}^k}.
\end{equation*}

Extracting the $j$-th element of $\bm{\tilde{\mu}_{i}^{\tau}}$, the global estimate $\tilde{\mu}_{i,j}^{\tau}$ of agent $j$ for arm $i$ is as follows
\begin{equation*}\label{z6}
    \tilde{\mu}_{i,j}^{\tau}=\frac{1}{\tau+1}\sum^{\tau}_{k=0}\sum_{j^{\prime}=1}^N\omega^{\tau-k}_{j,j^{\prime}}X_{i,j^{\prime}}^k.
\end{equation*}

Meanwhile, the global estimate under the full information communication is written as
\begin{equation}\label{z2}
  \hat{\mu}_{i}^{\tau}=\frac{1}{N}\sum_{j=1}^N\bar{X}_{i,j}^{\tau},
\end{equation}
where\[
\bar{X}_{i,j}^{\tau}=\frac{1}{\tau+1}\sum_{k=0}^{\tau}X_{i,j}^k.
\]

Subtracting $\hat{\mu}_{i}^{\tau}$ from $\tilde{\mu}_{i,j}^{\tau}$, we have
\begin{equation*}\label{z7}
\begin{split}
    \tilde{\mu}_{i,j}^{\tau}-\hat{\mu}_{i}^{\tau}&=\frac{1}{\tau+1}\sum^{\tau}_{k=0}\sum_{j^{\prime}=1}^N\omega^{\tau-k}_{j,j^{\prime}}X_{i,j^{\prime}}^k-\frac{1}{\tau+1}\sum_{k=0}^{\tau}\sum_{j^{\prime}=1}^N\frac{1}{N}X_{i,j^{\prime}}^k\\
     &=\frac{1}{\tau+1}\sum^{\tau}_{k=0}\sum_{j^{\prime}=1}^N(\omega^{\tau-k}_{j,j^{\prime}}-\frac{1}{N})X_{i,j^{\prime}}^k.\\
\end{split}
\end{equation*}

Then, substituting Lemma~\ref{lemma1} into $\tilde{\mu}_{i,j}^{\tau}-\hat{\mu}_{i}^{\tau}$, we have
\begin{equation}\label{z8}
    \begin{split}
         \lvert\tilde{\mu}_{i,j}^{\tau}-\hat{\mu}_{i}^{\tau}\rvert&<\frac{1}{\tau+1}\sum^{\tau}_{k=0}Q\lambda_2^{\tau-k}=\frac{1-\lambda_2^{\tau+1}}{1-\lambda_2}\cdot\frac{Q}{\tau+1}\leq\frac{Q}{(1-\lambda_2)(\tau+1)},\\
    \end{split}
\end{equation}
where the constant $Q$ depends on the matrix $\bm{W}$. When the doubly stochastic matrix $\bm{W}$ is symmetric, we have $Q=1$. Otherwise, we have $Q=\sqrt{N}$. The details are also shown in Lemma 8 in \cite{zhu2023distributed}.

The goal is to obtain an unbiased estimation $\tilde{\mu}_{i,j}^{\tau}$ on the global mean $\mu_i$. To achieve the goal, we could divide the problem into two parts: $\tilde{\mu}_{i,j}^{\tau}-\hat{\mu}_{i}^{\tau}$ and $\hat{\mu}_{i}^{\tau}-\mu_i$. According to the triangle inequality, we have
\begin{equation}\label{z16}
    \lvert\tilde{\mu}_{i,j}^{\tau}-\mu_i\rvert\leq \lvert\tilde{\mu}_{i,j}^{\tau}-\hat{\mu}_{i}^{\tau}\rvert+\lvert\hat{\mu}_{i}^{\tau}-\mu_i\rvert.
\end{equation}

Equation~\eqref{z8} already bounds the first term $\lvert\tilde{\mu}_{i,j}^{\tau}-\hat{\mu}_{i}^{\tau}\rvert$ in \eqref{z16}. It remains to bound the second term $\lvert\hat{\mu}_{i}^{\tau}-\mu_{i}\rvert$. According to the definition of $\hat{\mu}_{i}^{\tau}$ in \eqref{z2}, $\hat{\mu}_{i}^{\tau}$ can be rewritten as
\begin{equation*}\label{z10}
    \begin{split}
        &\hat{\mu}_{i}^{\tau}=\frac{1}{N}\sum^N_{j=1}\bar{X}_{i,j}^{\tau}=\frac{1}{N(\tau+1)}\sum^N_{j=1}\sum^{\tau}_{k=0}X_{i,j}^k,\\
    \end{split}
\end{equation*}
 
For arm $i$, the global reward $X_{i}^{\tau}=\frac{1}{N}\sum^N_{j=1}X_{i,j}^{\tau}$ is a linear combination of the local rewards $X_{i,j}^{\tau}$, $j\in\mathcal{N}$.
According to Lemma~\ref{lemma5}, the $[0,1]$-valued variable $X_{i,j}^{\tau}$ is a $\frac{1}{4}$ sub-Gaussian variable. Since $X_{i,j}^{\tau}$, $j\in\mathcal{N}$, are all independent sub-Gaussian variables, we deduce from Lemma~\ref{lemma3} that $X_{i}^{\tau}$ is a $\frac{1}{4N}$ sub-Gaussian variable. 

Assume that arm $i$ is pulled by $N$ agents for $\tau$ times, then it follows from Lemma~\ref{lemma2} that
\begin{equation}\label{0119_1}
\begin{split}
    &\mathbb{P}(\hat{\mu}_{i}^{\tau}\geq\mu_i+\varepsilon)\leq\exp\left(\frac{-\tau\varepsilon^2}{2\sigma^2}\right),\\
    &\mathbb{P}(\hat{\mu}_{i}^{\tau}\leq\mu_i-\varepsilon)\leq\exp\left(\frac{-\tau\varepsilon^2}{2\sigma^2}\right).\\
\end{split}
\end{equation}
where $\varepsilon=\sqrt{\frac{\log\delta^{-1}}{2N\tau}}$ and $\sigma^2=\frac{\sum^N_{j=1}\sigma_j^2}{N^2}=\frac{1}{4N}$ with $\sigma_j^2=\frac{1}{4}$ for all $j\in\mathcal{N}$. Substituting these values into \eqref{0119_1}, we have
\begin{equation}\label{518}
\begin{split}
    &\mathbb{P}\left(\hat{\mu}_{i}^{\tau}\geq\mu_i+\sqrt{\frac{\log\delta^{-1}}{2N\tau}}\right)\leq\delta,\\
    &\mathbb{P}\left(\hat{\mu}_{i}^{\tau}\leq\mu_i-\sqrt{\frac{\log\delta^{-1}}{2N\tau}}\right)\leq\delta.\\
\end{split}
\end{equation}

Furthermore, combining the two inequalities in \eqref{518} yields
\begin{equation*}\label{z12}
    \mathbb{P}\left(\mu_i-\sqrt{\frac{\log\delta^{-1}}{2N\tau}}\leq\hat{\mu}_{i}^{\tau}\leq\mu_i+\sqrt{\frac{\log\delta^{-1}}{2N\tau}}\right)\geq 1-2\delta.
\end{equation*}

With probability at least $1-2\delta$, equation \eqref{z16} can be written as
\begin{equation}\label{z17}
    \lvert\tilde{\mu}_{i,j}^{\tau}-\mu_i\rvert\leq \sqrt{\frac{\log\delta^{-1}}{2N\tau}}+\frac{Q}{(1-\lambda_2)(\tau+1)}.
\end{equation}
\end{proof}

\subsection{Proof of Lemma~\ref{lemma6}}
\begin{proof}    
Since the Laplacian matrix and the Perron matrix are related by $\bm{W}=\bm{I}-\beta\bm{\mathcal{L}}$, every eigenvalue of $\bm{W}$ has the form $1-\beta\hat{\lambda}$, where $\hat{\lambda}$ is an eigenvalue of $\bm{\mathcal{L}}$. In particular, the second largest eigenvalue of $\bm{W}$ is determined by the smallest nonzero eigenvalue $\hat{\lambda}_2$ of $\bm{\mathcal{L}}$, which is the algebraic connectivity of graph $\mathcal{G}$. From reference \citep{gross2003handbook}, $\hat{\lambda}_2$ is bounded by $\hat{\lambda}_2\ge\frac{1}{ND}$. Then, one can deduce that the second largest eigenvalue $\lambda_2$ of the Perron matrix $\bm{W}$ satisfies
\begin{equation*}
    \lambda_2\le 1-\beta\hat{\lambda}_2\le1-\frac{\beta}{ND}.
\end{equation*}
\end{proof}

\subsection{Proof of Lemma~\ref{lemma7}}
\begin{proof}
According to the definition of arm elimination, the emergence of event $\mathcal{D}_j(i)$ implies equation~\eqref{criterion}. Based on Lemma~\ref{lemma2}, the probability of event $\mathcal{D}_j(i)$ is as follows
$$  \mathbb{P}(\mathcal{D}_j(i)) \le \exp\left(\frac{-4\tau_{i,j} U_{i,j}^2(t,\delta)}{2 \times \frac{1}{4N}}\right) \le \exp\left(-2N\tau_{i,j} \frac{\log \delta^{-4}}{2N\tau_{i,j}} \right) \le \delta^4\le \delta. $$

In Theorem~\ref{upper bound analysis}, the violation probability of the confidence interval is $\delta = \frac{1}{T^2}$. The expected regret caused by erroneous elimination is bounded by 
$$\sum_{j=1}^N\sum_{t=1}^T\sum_{i:\Delta_i>0}\Delta_i  \mathbb{P}(\mathcal{D}_j(i))\le NTK\delta\le \frac{NK}{T},$$
which is of order $O(1)$.
\end{proof}

\subsection{Proof of Theorem~\ref{upper bound analysis}}\label{appendix: proof of theorem 1}
\begin{proof}
\textbf{Step 1: Bound the sample count of each arm \(i\) for agent \(j\)}

Recall that for all arms $i \in \mathcal{K} \setminus \{i^{\star}\}$, $\Delta_i > 0$. For the two regrets defined in equations \eqref{group regret} and \eqref{individual regret}, the practical sample count $\tau_{i,j}(T)$ is of primary interest. However, in the case of \texttt{DRRB-bandit}, $\tau_{i,j}(t)$ is difficult to determine directly because additional sample counts may exist, even if the arm does not satisfy the condition in equation~\eqref{iteration}. This is because agents with strong learning capabilities need to maintain the same update of the candidate arm set as other agents, which could lead to an increase in regrets.

To address this, we introduce an auxiliary variable, $\hat{\tau}_{i,j}(t)$, representing the theoretical sample count for agent $j$ when sampling arm $i$. This can be computed using equation~\eqref{iteration}. The variable $\hat{\tau}_{i,j}(t)$ corresponds to the sample count when arm $i$ has been included in the set $\mathcal{B}_j$, while $\tau_{i,j}(t)$ represents the time when arm $i$ is excluded from the set $\mathcal{S}_j$. Therefore, when computing the upper bound of the sample count, we can replace $\tau_{i,j}(t)$ with $\hat{\tau}_{i,j}(t)$ as described in equation~\eqref{iteration}. 
For the two sample counts mentioned above, the relationship between them is 
\begin{equation}\label{521_1}
    \hat{\tau}_{i,j}(t)\leq\tau_{i,j}(t)\leq\hat{\tau}_{i,j}(t)+D.
\end{equation}

According to the explanation above, the sample counts of all arms in candidate set $\mathcal{S}_j$ are the same for all agents in agent set $\mathcal{N}$. Since $\tilde{\mu}_{i,j}$ estimates $\mu_i$, the reward gap $\Delta_i$ for each agent $j\in\mathcal{N}$ is related to $U_{i,j}(t,\delta)$. Equation \eqref{criterion} is equal to 
\begin{equation}\label{z63}
2U_{i,j}(t,\delta) \geq\tilde{\mu}_{i^{\max},j}(t)-\tilde{\mu}_{i,j}(t)
\overset{(a)}\ge \Delta_i - 2U_{i,j}(t,\delta),
\end{equation}
where inequality (a) is from $\tilde{\mu}_{i^{\max},j}(t) \ge \tilde{\mu}_{i^{\star},j}(t) \geq \mu_{i^{\star}}-U_{i,j}(t,\delta)$ and $\tilde{\mu}_{i,j}(t)\leq \mu_{i}+U_{i,j}(t,\delta)$. 

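Let $A_{i,j,t}$ denote the event that the confidence bound \eqref{z17} holds for agent $j$ and arm $i$ at time slot $t$, so that $\mathbb{P}(\neg A_{i,j,t})\le 2\delta$. By the union bound, we have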
\[
\begin{split}
    \mathbb{P}\left( \bigcap_{i,j,t} A_{i,j,t}\right)
    = 1 - \mathbb{P}\left( \bigcup_{i,j,t} \neg A_{i,j,t}\right)\ge 1 - \sum_{i,j,t}\mathbb{P}\left(  \neg A_{i,j,t}\right)\ge 1 - 2tNK\delta. 
\end{split}
\]

Replacing $\tau_{i,j}(t)$ with $\hat{\tau}_{i,j}(t)$ in functions $U_{i,j}(t,\delta)$ and $\sigma_i(t)$, equation \eqref{z63} can be written as
\begin{equation}\label{equ1}
    4\left( \sqrt{\frac{\log \delta^{-1}}{2N\hat{\tau}_{i,j}(t)}}+\frac{Q}{(1-\lambda_2)(1+\hat{\tau}_{i,j}(t))}\right)\geq\Delta_i,
\end{equation}
i.e.,
\[
\begin{split}
    &\sqrt{\frac{\log \delta^{-1}}{2N\hat{\tau}_{i,j}(t)}}+\frac{Q}{(1-\lambda_2)(1+\hat{\tau}_{i,j}(t))}\ge\frac{\Delta_i}{4},\\
    &\frac{\log \delta^{-1}}{2N\hat{\tau}_{i,j}(t)}\ge (\frac{\Delta_i}{4}-\frac{Q}{(1-\lambda_2)(1+\hat{\tau}_{i,j}(t))})^2,\\
    &\frac{\log \delta^{-1}}{2N\hat{\tau}_{i,j}(t)}\ge \frac{\Delta_i^2}{16}-\frac{Q\Delta_i}{2(1-\lambda_2)\hat{\tau}_{i,j}(t)},\\
    &\hat{\tau}_{i,j}(t)\leq \frac{8\log \delta^{-1}}{N\Delta_i^2}+\frac{8Q}{(1-\lambda_2)\Delta_i}.\\
\end{split}
\]

Define $\tau^{\star}$ as the maximum satisfying equation \eqref{equ1}, then the maximum theoretic sample count $\hat{\tau}_{i,j}$ of agent $j$ pulling arm $i$ is
\begin{equation}\label{equ2}
\begin{split}
    \hat{\tau}_{i,j}&\le \lceil\tau^{\star}\rceil\le \tau^{\star}+1\le \frac{8\log \delta^{-1}}{N\Delta_i^2}+\frac{8Q}{(1-\lambda_2)\Delta_i}+1,\\
\end{split}
\end{equation}
with probability at least $1 - 2tNK\delta$.

\textbf{Step 2: Bound the individual regret of agent \(j\)}

In the design of Algorithm \ref{alg: DAEE}, agents refrain from using the most recent information to ensure consensus on the candidate set. There is a time delay in updating the candidate set because agent $j$ requires a few rounds to ensure that all other agents receive the information. The diameter of the communication graph $\mathcal{G}$ is denoted as $D$, which means that any agent can obtain the information initially learned by agent $j$ after at most $D$ communication rounds. Therefore, the practical sample count $\tau_{i,j}(t)$ is bounded by:
\begin{equation}\label{equ3}
    \hat{\tau}_{i,j}\le\tau_{i,j}(T)\le\hat{\tau}_{i,j}+D.
\end{equation}

Therefore, the cumulative regret for agent $j$ could be decomposed as follows
\begin{equation}\label{equ4}
\begin{split}
    \mathbb{E}[{R_{j}^T}(\mathcal{A})]&=T\mu_{i^{\star}}-\sum^T_{t=1}\mathbb{E}[X_{A_j(t)}(t)]=T\mu_{i^{\star}}-\sum_{i=1}^{K}\mu_i\tau_{i,j}(T)\\
    &=\underset{i:\Delta_i>0}{\sum}\Delta_i\tau_{i,j}(T)\leq\underset{i:\Delta_i>0}{\sum}\Delta_i(\hat{\tau}_{i,j} + D).
\end{split}
\end{equation}

The regret consists of two parts: large probability events (The optimal arm persists in the candidate arm set) and small probability events (The optimal arm is eliminated). The regret caused by large probability events is bounded by the sample counts, while the regret caused by small probability events is bounded by Lemma~\ref{lemma7}. According to the equation~\eqref{equ4}, the total regret is bounded by
\begin{equation*}
\begin{split}
     \mathbb{E}[{R_{j}^T}(\mathcal{A})]&=\mathbb{E}[{R_{j}^T}(\text{large probability events})]+\mathbb{E}[{R_{j}^T}(\text{small probability events})]\\
     &\leq\underset{i:\Delta_i>0}{\sum}\Delta_i(\hat{\tau}_{i,j}+D)+1\leq \underset{i:\Delta_i>0}{\sum}\frac{16\log T}{N\Delta_i}+\underset{i:\Delta_i>0}{\sum}(D+1)\Delta_i+\frac{8KQ}{1-\lambda_2}+1.
\end{split}    
\end{equation*}

According to \eqref{group regret}, the group regret $R^T(\mathcal{A})$ could be rewritten as follows
\begin{equation*}
\begin{split}
        \mathbb{E}[{R^T}(\mathcal{A})]&=\mathbb{E}[{R^T}(\text{large probability events})]+\mathbb{E}[{R^T}(\text{small probability events})]\\
        &=NT\mu_{i^{\star}}-\sum^T_{t=1}\sum^N_{j=1}\mathbb{E}[X_{A_j(t),j}(t)]+1\\
        &=NT\mu_{i^{\star}}-\sum_{i=1}^{K}\sum^N_{j=1}\mu_{i,j}\tau_{i,j}(T)+1\\
        &\overset{(a)}\leq\sum_{i:\Delta_i>0}\sum^N_{j=1}(\mu_{i^{\star},j}-\mu_{i,j})(\hat{\tau}_{i,j}+D)+1\\
        &\overset{(b)}=\sum_{i:\Delta_i>0}N(\mu_{i^{\star}}-\mu_{i})(\hat{\tau}_{i,j}+D)+1\\
        &=\sum_{i:\Delta_i>0}N\Delta_{i}(\hat{\tau}_{i,j}+D)+1\\
        &\overset{(c)}\leq\underset{i:\Delta_i>0}{\sum}\frac{16\log T}{\Delta_i}+\underset{i:\Delta_i>0}{\sum}N(D+1)\Delta_i+\frac{8KNQ}{1-\lambda_2}+1,
\end{split}
\end{equation*}
where inequality (a) arises due to the time delay discussed earlier, while equality (b) holds because each agent pulls each arm at the same time, i.e., $\hat{\tau}_{i,1}=\hat{\tau}_{i,2}=\cdots=\hat{\tau}_{i,N}$. Inequality (c) follows primarily from equation \eqref{equ2}.
\end{proof}

\subsection{Proof of Corollary~\ref{corollary1}}
\begin{proof}
According to Lemma~\ref{lemma6}, we have \[\lambda_2\le1-\frac{\beta}{ND},\] where $\beta\in(0,1/\epsilon]$ is a given parameter corresponding to graph $\mathcal{G}$ and $\epsilon$ represents the maximum degree over all agents, which is bounded by $\epsilon=\max_j(\sum_{j^{\prime}\neq j}a_{j,j^{\prime}})\le N$.

In this paper, define $\beta$ as $\beta=1/\epsilon\ge\frac{1}{N}$. Then, one can deduce that $\lambda_2$ is bounded by $\lambda_2\le1-\frac{1}{N^2D}$ and the upper bound of $\frac{1}{1-\lambda_2}$ is
\begin{equation*}
    \frac{1}{1-\lambda_2}\le N^2D.
\end{equation*} 

Then, the individual regret is bounded by
\begin{equation*}
\begin{split}
    \mathbb{E}[{R_{j}^T}(\mathcal{A})]
    &\leq\sum_{i:\Delta_i>0}\frac{16\log T}{N\Delta_{i}}+\sum_{i:\Delta_i>0}(D+1)\Delta_i+8KQDN^2+1.
\end{split}
\end{equation*}

The group regret is bounded by
\[
    \begin{split}
    \mathbb{E}[R^T(\mathcal{A})]
    &\leq\sum_{i:\Delta_i>0}\frac{16\log T}{\Delta_i}+\sum_{i:\Delta_i>0}N(D+1)\Delta_i+8KQDN^3+1.
    \end{split}
\]
\end{proof}

\subsection{Proof of Corollary~\ref{corollary2}}
The individual regret $R^T_j$ could be decomposed into 
\[
\begin{split}
    \mathbb{E}[R^T_j(\mathcal{A})]&=\mathbb{E}[{R_j^T}(\text{large probability events})]+\mathbb{E}[{R_j^T}(\text{small probability events})]\\
    &\le\sum_{i:\Delta_i>0}\Delta_i\tau_{i,j}(T)+1\\
    &=\sum_{i:\Delta_i \ge\Delta}\Delta_i\tau_{i,j}(T) + \sum_{i:\Delta_i < \Delta}\Delta_i\tau_{i,j}(T)+1\\
    &\le\sum_{i:\Delta_i \ge\Delta}\Delta_i\tau_{i,j}(T)+\Delta T+1\\
    &\le \sum_{i:\Delta_i \ge\Delta}\frac{16\log T}{N\Delta_i}+\sum_{i:\Delta_i \ge\Delta}(D+1)\Delta_i+\frac{8QK}{1-\lambda_2}+\Delta T+1\\
    &\le \frac{16K\log T}{N\Delta}+\Delta T+K(D+1)+\frac{8QK}{1-\lambda_2}+1\\
    &\le 2\sqrt{\frac{16KT\log T}{N}}+K(D+1)+\frac{8QK}{1-\lambda_2}+1,
\end{split}
\]
where $\Delta=\sqrt{\frac{16K\log T}{NT}}$. The group regret could also be transformed into
\[
\begin{split}
    \mathbb{E}[R^T(\mathcal{A})]&=\mathbb{E}[{R^T}(\text{large probability events})]+\mathbb{E}[{R^T}(\text{small probability events})]\\
    &\le\sum^K_{i=1}\sum^N_{j=1}(\mu_{i^{\star},j}-\mu_{i,j})\tau_{i,j}(T)+1\\
    &\overset{(a)}=\sum_{i:\Delta_i>0}N\Delta_i\tau_{i,j}(T)+1\\
    &\le \sum_{i:\Delta_i\ge\Delta}N\Delta_i\tau_{i,j}(T)+\sum_{i:\Delta_i<\Delta}N\Delta_i\tau_{i,j}(T)+1\\
    &\le \sum_{i:\Delta_i\ge\Delta}N\Delta_i{\tau}_{i,j}(T)+NT\Delta+1\\
    &\le \sum_{i:\Delta_i\ge\Delta}N\Delta_i(\frac{16\log T}{N\Delta_i^2}+\frac{8Q}{(1-\lambda_2)\Delta_i}+1+D)+NT\Delta+1\\
    &\le \frac{16K\log T}{\Delta}+NT\Delta+\frac{8KNQ}{1-\lambda_2}+KN(D+1)+1\\
    &\le 8\sqrt{KNT\log T}+\frac{8KNQ}{1-\lambda_2}+KN(D+1)+1,
\end{split}
\]
where equation (a) holds because all agents sample arms synchronously.

\subsection{Proof of Corollary~\ref{theorem: communication cost}}
In the proof of Theorem~\ref{upper bound analysis}, one can deduce that the suboptimal arm $i$ is sampled by agent $j$ at most $\frac{8\log\delta^{-1}}{N\Delta_i^2} + \frac{8Q}{(1-\lambda_2)\Delta_i} + D + 1$ times. In Theorem~\ref{upper bound analysis}, the violation probability is denoted by $\delta = \frac{1}{T^2}$, then the sample count is bounded by
$$ \tau_{i,j} \le \frac{16\log T}{N\Delta_i^2} + \frac{8Q}{(1-\lambda_2)\Delta_i} + D + 1. $$

In each round, \texttt{DRRB-bandit} collects information about all arms and communicates it with other agents in a single batch. Therefore, to determine the maximum number of communications, it suffices to consider the number of samples of the arm that remains in the candidate set for the second longest period, which yields
$$ C^T(\mathcal{A}) \le \frac{16K\log T}{\Delta_{\min}^2} + \frac{8KNQ}{(1-\lambda_2)\Delta_{\min}} + KN(D + 1). $$

\subsection{Proof of Theorem~\ref{lower bound analysis}}
\begin{proof}
Assume that $\sum_{j^{\prime}\neq j}\mu_{i,j^{\prime}}$ is the same for all arms $i\in\mathcal{K}$; under this assumption, the problem reduces to a single-agent regret minimization problem, where only agent $j$'s observations matter. This assumption can be generalized to various cases, which could result in the same lower bound of the regret. That is, agent $j$ only needs to perform regret minimization according to its own local observation. Therefore, the problem inherits the regret of classical multi-armed bandits. 

According to the assumption above, define two reward distributions on arm $i$ as follows
\begin{equation*}
\begin{split}
    &\nu_{j}=(P_{1,j},\dots,P_{i,j},\dots,P_{K,j}),\\
    &\nu_j^{\prime}=(P_{1,j}^{\prime},\dots,P_{i,j}^{\prime},\dots,P^{\prime}_{K,j}),\\
\end{split}
\end{equation*}
where $P_{k,j}=P_{k,j}^{\prime}$ for all $k\neq i$.

Let $\mathcal{M}$ be a set of distributions with finite means, and let $\mu: \mathcal{M}\rightarrow \mathbb{R}$ be the function that maps $P_{i,j}\in \mathcal{M}$ to its mean. Let $\mu_{i^{\star},j}\in\mathbb{R}$ and $P_{i,j}\in \mathcal{M}$ have $\mu(P_{i,j})<\mu_{i^{\star},j}$ and define
\[
\begin{split}
    d_{i,j}&=d_{\inf}(P_{i,j},\mu_{i^{\star},j},\mathcal{M})=\inf_{P_{i,j}^{\prime}\in\mathcal{M}}\{D(P_{i,j},P_{i,j}^{\prime}):\mu(P_{i,j}^{\prime})>\mu_{i^{\star},j}\},
\end{split}
\]
where $D(P_{i,j},P_{i,j}^{\prime})$ is the relative entropy between $P_{i,j}$ and $P_{i,j}^{\prime}$. For arm $i$ and arbitrary $\epsilon>0$, there exists $P^{\prime}_{i,j}\in\mathcal{M}$ with $\mu(P^{\prime}_{i,j})>\mu_{i^{\star},j}$ such that $D(P_{i,j},P^{\prime}_{i,j})\le d_{i,j}+\epsilon$. 

According to Lemma 15.1 in reference \cite{lattimore2020bandit}, the divergence between $\nu_j$ and $\nu_j^{\prime}$ is decomposed into 
\begin{equation*}
\begin{split}
    D(\mathbb{P}_{\nu_j},\mathbb{P}_{\nu_j^{\prime}})&=\sum_{k=1}^K\mathbb{E}[\tau_{k,j}(T)]D(P_{k,j},P^{\prime}_{k,j})\overset{(a)}{=}\mathbb{E}[\tau_{i,j}(T)]D(P_{i,j},P^{\prime}_{i,j})\le\mathbb{E}[\tau_{i,j}(T)](d_{i,j}+\epsilon),
\end{split}
\end{equation*}
where equation (a) is obtained based on $D(P_{k,j},P_{k,j}^{\prime})=0$ if $k\neq i$.

According to Bretagnolle–Huber inequality (Theorem 14.2 in \cite{lattimore2020bandit}), for any event $A_{i,j}$ (agent $j$ pulls arm $i$), we have
\begin{equation*}
\begin{split}
    \mathbb{P}_{\nu_j}(A_{i,j})+\mathbb{P}_{\nu_j^{\prime}}(A^c_{i,j})&\ge \frac{1}{2}\exp(-D(\mathbb{P}_{\nu_j},\mathbb{P}_{\nu_j^{\prime}}))\\
    &\ge \frac{1}{2}\exp(-\mathbb{E}_{\nu_j}[\tau_{i,j}(T)](d_{i,j}+\epsilon)).
\end{split}
\end{equation*}

Choose $A_{i,j}=\{\tau_{i,j}(T)>T/2\}$, and let ${R}_j^T={R}_j^T(\mathcal{A},\nu_j)$ and ${R}^T_{j^{\prime}}={R}_{j^{\prime}}^T(\mathcal{A},\nu_j^{\prime})$. Then
\begin{equation*}
\begin{split}
        {R}_j^T+{R}^T_{j^{\prime}}&\ge \frac{T}{2}(\mathbb{P}_{\nu_j}(A_{i,j})\Delta_i+\mathbb{P}_{\nu_j^{\prime}}(A_{i,j}^c)(\mu_i^{\prime}-\mu_{i^{\star}}))\\
        &\ge \frac{T}{2}\min\{\Delta_i,\mu_i^{\prime}-\mu_{i^{\star}}\}(\mathbb{P}_{\nu_j}(A_{i,j})+\mathbb{P}_{\nu_j^{\prime}}(A^c_{i,j}))\\
        &\ge \frac{T}{4}\min\{\Delta_i,\mu_i^{\prime}-\mu_{i^{\star}}\}\exp(-\mathbb{E}_{\nu_j}[\tau_{i,j}(T)](d_{i,j}+\epsilon)).
\end{split}
\end{equation*}

Rearranging and taking the limit inferior leads to
\begin{equation*}
\begin{split}
    \liminf_{T\rightarrow\infty}\frac{\mathbb{E}[\tau_{i,j}(T)]}{\log T}&\ge \frac{1}{d_{i,j}+\epsilon}\liminf_{T\rightarrow\infty}\frac{\log\frac{T\min\{\Delta_i,\mu_i^{\prime}-\mu_{i^{\star}}\}}{4({R}_j^T+{R}^T_{j^{\prime}})}}{\log T}\\
    &\ge \frac{1}{d_{i,j}+\epsilon}(1-\limsup_{T\rightarrow\infty}\frac{\log ({R}_j^T+{R}^T_{j^{\prime}})}{\log T})\\
    &=\frac{1}{d_{i,j}+\epsilon},
\end{split}
\end{equation*}
where the last equality follows from the definition of consistency, which says that for any $p > 0$, there exists a constant $C_p$ such that for sufficiently large $T$, ${R}_j^T+{R}^T_{j^{\prime}}\le C_pT^p$, which implies that
\begin{equation*}
    \limsup_{T\rightarrow\infty}\frac{\log ({R}_j^T+{R}^T_{j^{\prime}})}{\log T}\le p.
\end{equation*}

Since $p>0$ is arbitrary and $\epsilon>0$ can be taken arbitrarily close to zero, we have
\begin{equation}\label{lower bound of tau}
    \liminf_{T\rightarrow\infty}\frac{\mathbb{E}[\tau_{i,j}(T)]}{\log T}\ge \frac{1}{d_{i,j}}\overset{(a)}{=}\frac{2}{N^2\Delta_i^2},
\end{equation}
where equality (a) is obtained from Table 16.1 given in \cite{lattimore2020bandit}, which gives $d_{i,j}=\frac{(\mu_{i,j}-\mu_{i^{\star},j})^2}{2}$. Considering that $\sum_{j^{\prime}\neq j}\mu_{i,j^{\prime}}$ is the same for all arms $i\in\mathcal{K}$ and $\mu_{i}=\frac{1}{N}\sum_{j=1}^N\mu_{i,j}$, we have $\mu_{i^{\star},j}-\mu_{i,j}=N(\mu_{i^{\star}}-\mu_i)=N\Delta_i$.

The individual regret of the problem is lower bounded by
\[
\begin{split}
    \liminf_{T\to\infty} \frac{{R}^T_j(\mathcal{A})}{\log T} &\ge \liminf_{T\to\infty} \sum_{i:\Delta_i>0}\frac{\Delta_i \mathbb{E}[\tau_{i,j}(T)]}{\log T} \ge \sum_{i: \Delta_i>0}\frac{\Delta_i}{d_{i,j}}\ge \sum_{i: \Delta_i>0}\frac{2}{N^2\Delta_i}.
\end{split}
\]

For the group regret, we consider it as the sum of all agents' individual regret, and the lower bound of group regret could be written as
\begin{equation*}
\begin{split}
    \liminf_{T\to\infty} \frac{\mathbb{E}[{R^T(\mathcal{A})}]}{\log T}&= \liminf_{T\to\infty} \frac{\mathbb{E}[{\sum^N_{j=1}R_j^T(\mathcal{A})}]}{\log T}\ge\sum_{i: \Delta_i>0}\sum_{j=1}^N\frac{2}{N^2\Delta_i}\ge\sum_{i: \Delta_i>0}\frac{2}{N\Delta_i}.
\end{split}
\end{equation*}
\end{proof}

\subsection{Proof of Theorem~\ref{lower bound analysis2}}
\begin{proof}
In round-robin-based bandit algorithms, assume all agents sample the same arm at each time slot. Then, the global reward of each arm is associated with a $\frac{1}{\sqrt{N}}$-Gaussian distribution, which follows from Lemma~\ref{lemma3}.

Let $\mathcal{M}$ be a set of distributions with finite means, and let $\mu: \mathcal{M}\rightarrow \mathbb{R}$ be the function that maps $P\in \mathcal{M}$ to its mean. Let $\mu_{i^{\star}}\in\mathbb{R}$ and $P\in \mathcal{M}$ have $\mu(P)<\mu_{i^{\star}}$ and define
\[
    d_i=d_{\inf}(P,\mu_{i^{\star}},\mathcal{M})=\inf_{P^{\prime}\in\mathcal{M}}\{D(P,P^{\prime}):\mu(P^{\prime})>\mu_{i^{\star}}\},
\]
where $D(P,P^{\prime})$ is the relative entropy between $P$ and $P^{\prime}$.

Define two reward distributions as follows
\begin{equation*}
\begin{split}
    &\nu=(P_1,\dots,P_i,\dots,P_K),\\
    &\nu^{\prime}=(P_1,\dots,P_i^{\prime},\dots,P_K).\\
\end{split}
\end{equation*}

Let all arms except arm $i$ be the same in the two distributions. For arm $i$, let $\epsilon>0$ be arbitrary and choose $P^{\prime}_i\in\mathcal{M}$ such that $D(P_i,P^{\prime}_i)\le d_i+\epsilon$ and $\mu(P^{\prime}_i)>\mu_{i^{\star}}$.

According to Lemma 15.1 in reference \cite{lattimore2020bandit}, the divergence between $\nu$ and $\nu^{\prime}$ is decomposed into 
\begin{equation*}
    D(\mathbb{P}_{\nu},\mathbb{P}_{\nu^{\prime}})=\sum_{k=1}^K\mathbb{E}[\tau_{k,j}(T)]D(P_k,P^{\prime}_k)\overset{(a)}{=}\mathbb{E}[\tau_{i,j}(T)]D(P_i,P^{\prime}_i)\le\mathbb{E}[\tau_{i,j}(T)](d_i+\epsilon),
\end{equation*}
where equation (a) is obtained based on $D(P_k,P_k^{\prime})=0$ if $k\neq i$, and the inequality follows from the choice of $P_i^{\prime}$.

According to the Bretagnolle--Huber inequality (Theorem~14.2 in~\cite{lattimore2020bandit}), for any event $A$, we have
\begin{equation*}
\begin{split}
    \mathbb{P}_{\nu}(A)+\mathbb{P}_{\nu^{\prime}}(A^c)&\ge \frac{1}{2}\exp(-D(\mathbb{P}_{\nu},\mathbb{P}_{\nu^{\prime}}))\ge \frac{1}{2}\exp(-\mathbb{E}[\tau_{i,j}(T)](d_i+\epsilon)).
\end{split}
\end{equation*}

Choose $A=\{\tau_{i,j}(T)>T/2\}$, and let ${R}_T={R}_T(\mathcal{A},\nu)$ and ${R}_T^{\prime}={R}_T(\mathcal{A},\nu^{\prime})$. Then
\begin{equation*}
\begin{split}
        {R}_T+{R}_T^{\prime}&\ge \frac{T}{2}(\mathbb{P}_{\nu}(A)\Delta_i+\mathbb{P}_{\nu^{\prime}}(A^c)(\mu_i^{\prime}-\mu_{i^{\star}}))\\
        &\ge \frac{T}{2}\min\{\Delta_i,\mu_i^{\prime}-\mu_{i^{\star}}\}(\mathbb{P}_{\nu}(A)+\mathbb{P}_{\nu^{\prime}}(A^c))\\
        &\ge \frac{T}{4}\min\{\Delta_i,\mu_i^{\prime}-\mu_{i^{\star}}\}\exp(-\mathbb{E}[\tau_{i,j}(T)](d_i+\epsilon)).
\end{split}
\end{equation*}

Rearranging and taking the limit inferior leads to
\begin{equation*}
\begin{split}
    \liminf_{T\rightarrow\infty}\frac{\mathbb{E}[\tau_{i,j}(T)]}{\log T}&\ge \frac{1}{d_i+\epsilon}\liminf_{T\rightarrow\infty}\frac{\log\frac{T\min\{\Delta_i,\mu_i^{\prime}-\mu_{i^{\star}}\}}{4({R}_T+{R}_T^{\prime})}}{\log T}\\
    &\ge \frac{1}{d_i+\epsilon}(1-\limsup_{T\rightarrow\infty}\frac{\log ({R}_T+{R}_T^{\prime})}{\log T})\\
    &=\frac{1}{d_i+\epsilon},
\end{split}
\end{equation*}
where the last equality follows from the definition of consistency, which says that for any $p > 0$, there exists a constant $C_p$ such that for sufficiently large $T$, ${R}_T+{R}_T^{\prime}\le C_pT^p$, which implies that
\begin{equation*}
    \limsup_{T\rightarrow\infty}\frac{\log ({R}_T+{R}_T^{\prime})}{\log T}\le p.
\end{equation*}

Since $p>0$ is arbitrary and $\epsilon>0$ can be taken arbitrarily small, we have
\begin{equation*}
    \liminf_{T\rightarrow\infty}\frac{\mathbb{E}[\tau_{i,j}(T)]}{\log T}\ge \frac{1}{d_i}.
\end{equation*}

According to Table 16.1 given in \cite{lattimore2020bandit}, we have $d_i=\frac{N\Delta_i^2}{2}$. The individual regret of the problem is lower bounded by
\[
\begin{split}
    \liminf_{T\to\infty} \frac{{R_j^T(\mathcal{A})}}{\log T} &\ge \liminf_{T\to\infty}\sum_{i:\Delta_i>0} \frac{ \Delta_i \mathbb{E}[\tau_{i,j}(T)]}{\log T}\ge \sum_{i: \Delta_i>0}\frac{\Delta_i}{d_i}\ge \sum_{i: \Delta_i>0}\frac{2}{N\Delta_i}.
\end{split}
\]

According to the definition of group regret ${R}^T(\mathcal{A})$ in equation~\eqref{group regret}, the group regret can be written as
\begin{equation*}
\begin{split}
    {R^T(\mathcal{A})}&=NT\mu_{i^{\star}}-\sum_{i=1}^K\sum^T_{t=1}\sum^N_{j=1}\mathbb{I}\{A_j(t)=i\}\mathbb{E}[X_{i,j}(t)]\\
        &=NT\mu_{i^{\star}}-\sum_{i=1}^K\sum^T_{t=1}\sum^N_{j=1}\mathbb{I}\{A_j(t)=i\}\mu_{i,j}\\
        &=NT\mu_{i^{\star}}-\sum_{i=1}^K\sum^N_{j=1}\mu_{i,j}\tau_{i,j}(T)\\
        &=\sum_{i: \Delta_i>0}\sum^N_{j=1}(\mu_{i^{\star},j}-\mu_{i,j})\tau_{i,j}(T)\\
        &=\sum_{i: \Delta_i>0}N\Delta_i\tau_{i,j}(T).
\end{split}
\end{equation*}

Combining this decomposition with the lower bound on $\mathbb{E}[\tau_{i,j}(T)]$ derived above and $d_i=\frac{N\Delta_i^2}{2}$, we have
\begin{equation*}
\begin{split}
    \liminf_{T\to\infty} \frac{\mathbb{E}[{R^T(\mathcal{A})}]}{\log T} &\ge \liminf_{T\to\infty} \sum_{i: \Delta_i>0}\frac{N\Delta_i\mathbb{E}[\tau_{i,j}(T)]}{\log T}\ge\sum_{i: \Delta_i>0}\frac{2N\Delta_i}{N\Delta_i^2}=\sum_{i: \Delta_i>0}\frac{2}{\Delta_i}.
\end{split}
\end{equation*}
\end{proof}

\section{Additional Experiment}\label{appendix: simulation}
Our algorithm (\texttt{DRRB-bandit}) also achieves results comparable to those of the optimal homogeneous bandit algorithm (\texttt{DPE2} \citep{wang2020optimal}), as shown in Figure~\ref{fig2}. \texttt{DPE2} relies on a leader that uniformly allocates resources and tasks, whereas \texttt{DRRB-bandit} relies entirely on fully distributed communication. Hence, \texttt{DRRB-bandit} incurs more regret than \texttt{DPE2} but still outperforms the other heterogeneous bandit algorithms.
    \begin{figure}[htb]
    \centering
    \begin{subfigure}{0.9\linewidth}
        \centerline{\includegraphics[width=\columnwidth]{add_graph/6.png}}
    \end{subfigure}
    \begin{subfigure}{0.48\linewidth}
        \centerline{\includegraphics[width=\columnwidth]{add_graph/8.png}}
        \caption{Individual regrets}
        \label{original22}
    \end{subfigure}
    \begin{subfigure}{0.48\linewidth}
        \centerline{\includegraphics[width=\columnwidth]{add_graph/9.png}}
        \caption{Group regrets}
        \label{original2}
    \end{subfigure}\\
    \begin{subfigure}{0.48\linewidth}
        \centerline{\includegraphics[width=\columnwidth]{add_graph/7.png}}
        \caption{Regrets with different numbers of arms}
        \label{Vary arm number2}
    \end{subfigure}
    \begin{subfigure}{0.48\linewidth}
        \centerline{\includegraphics[width=\columnwidth]{add_graph/10.png}}
        \caption{Regrets with different numbers of agents}
        \label{Vary agent number2}
    \end{subfigure}
    \caption{Performance comparison in the homogeneous setting.\label{fig2}}
    \end{figure}
