\section{ASYNCHRONOUS ALGORITHM FOR FEDERATED LINEAR BANDITS}

In this section, we further consider the pure exploration problem of federated linear bandits. We propose an algorithm called Federated Asynchronous Linear Pure Exploration (\texttt{FALinPE}), and its description is given in Algorithm \ref{alg2}.

\begin{algorithm*}[t]
\renewcommand{\algorithmicrequire}{\textbf{Input:}}
\renewcommand{\algorithmicensure}{\textbf{Output:}}
	\caption{Federated Asynchronous Linear Pure Exploration (\texttt{FALinPE}) }
 \label{alg2}
	\begin{algorithmic}[1]
            \STATE \textbf{Inputs:} Arm set $\A$, agent set $\M$, regularization parameter $\lambda>0$, triggered parameter  $\gamma_1,\gamma_2$, and $(\delta,\epsilon)$
            \STATE \textbf{Initialization:}
            \STATE From round $1$ to $K$, sequentially pull arms $1$ to $K$ and receive reward $r_{t}$, $\forall t\in\{1,\dots,K\}$
            \STATE Server sets $\V_{ser,K} = \lambda\bI + \sum_{t=1}^K\x_{t}\x_{t}^\top$, $\b_{ser,K} = \sum^K_{t=1} \x_{t} r_{t}$, $T_{ser,K}(k) = 1,\ \forall k \in \A$\COMMENT{{\color{blue}Server initialization}}
            \FOR {$m=1:M$} 
            \STATE  Agent $m$ sets $\V_{m,K+1} = \lambda\bI + \sum_{t=1}^K\x_{t}\x_{t}^\top$, $\b_{m,K+1} = \sum^K_{t=1} \x_{t} r_{t}$, $T_{m,K+1}(k) = 1$, $\V_{m,K}^{loc} = \bold{0}$, $\b_{m,K}^{loc} = \bold{0}$ and $T^{loc}_{m,K}(k) = 0$, $\forall k\in\A$ \COMMENT{{\color{blue}Agents initialization}}
            \ENDFOR
            \FOR {$t = K+1:\infty$} 
            \STATE Agent $m_t$ sets $\hat{\t}_{m_t,t}$, $i_{m_t,t}$ and $j_{m_t,t}$ by (\ref{alg2eq1}), pulls $k_{m_t,t}$ by (\ref{select2}) or (\ref{greedy}) and receives $r_{m_t,t}$ \COMMENT{{\color{blue}Sampling rule}}
            \STATE Agent $m_t$ updates $\V_{m_t,t}^{loc}$, $\b_{m_t,t}^{loc}$ and $T_{m_t,t}^{loc}(k_{m_t,t})$ based on (\ref{alg2eq2})
            \IF {$\frac{\text{det}(\V_{m_t,t} + \V_{m_t,t}^{loc})}{\text{det}(\V_{m_t,t})} > (1+\gamma_1)$ \textbf{or} $\frac{\sum_{k=1}^K (T_{m_t,t}(k) + T^{loc}_{m_t,t}(k))}{\sum_{k=1}^K T_{m_t,t}(k)} > (1+\gamma_2)$} 
            \STATE \textbf{\textbf{[Agent $m_t$ $\rightarrow$ Server]}} Send $\V^{loc}_{m_t,t}$, $\b_{m_t,t}^{loc}$ and $T_{m_t,t}^{loc}(k),\ \forall k\in\A$ to the server \COMMENT{{\color{blue}Upload data to server}}
            \STATE  Server sets $\V_{ser,t}$, $\b_{ser,t}$, $T_{ser,t}(k)$, $\forall k\in\A$, $\hat{\t}_{ser,t}$, 
            $i_{ser,t}$, $j_{ser,t}$ and $B(t)$ based on (\ref{alg2eq3}) and (\ref{alg2eq4})
            \IF {$B(t) \le \epsilon$} 
            \STATE Server returns $i_{ser,t}$ as the estimated best arm $\hat k^*$ and breaks the loop \COMMENT{{\color{blue}Stopping rule and decision rule}}
            \ENDIF
            \STATE \textbf{[Server $\rightarrow$ Agent $m_t$]} Send $\V_{ser,t}$, $\b_{ser,t}$ and $T_{ser,t}(k)$, $\forall k\in\A$ to agent $m_t$ \COMMENT{{\color{blue}Download data}}
            \STATE Agent $m_t$ sets $\V_{m_t,t+1} = \V_{ser,t}$, $ \b_{m_t,t+1} = \b_{ser,t}$ and $ T_{m_t,t+1}(k) = T_{ser,t}(k)$, $\forall k\in\A$
            \STATE Agent $m_t$ sets $\V_{m_t,t}^{loc} = \bold{0}$, $\b_{m_t,t}^{loc} = \bold{0}$ and $T_{m_t,t}^{loc}(k) = 0$, $\forall k\in \A$
            \ELSE
            \STATE Agent $m_t$ sets $\V_{m_t,t+1} = \V_{m_t,t}$, $ \b_{m_t,t+1} = \b_{m_t,t}$ and $ T_{m_t,t+1}(k) = T_{m_t,t}(k)$, $\forall k\in\A$
            \ENDIF
            \STATE Inactive agent $m \not = m_t$ sets $\V_{m,t+1} = \V_{m,t}$, $ \b_{m,t+1} = \b_{m,t}$ and $ T_{m,t+1}(k) = T_{m,t}(k)$, $\forall k\in\A$
            \ENDFOR
	\end{algorithmic}  
\end{algorithm*}

\paragraph{FALinPE algorithm }
Similar to Algorithm \ref{alg3}, \texttt{FALinPE} starts with an initialization step (line 2-7), where each arm is pulled once. Then in each round $t\ge K+1$, the active agent $m_t$ sets its estimated model parameter $\hat{\t}_{m_t,t}$, empirical best arm $i_{m_t,t}$ and most ambiguous arm $j_{m_t,t}$ as
\begin{align}\label{alg2eq1}
\begin{split}
    \hat{\t}_{m_t,t} =& \V_{m_t,t}^{-1}\b_{m_t,t},\quad i_{m_t,t} = \arg\max_{k\in\A} \x_k^\top\hat\t_{m_t,t},\\  j_{m_t,t} =& \arg\max_{k \in \A/\{i_{m_t,t}\}} \hat{\Delta}_{m_t,t}(k,i_{m_t,t})\\& + \alpha^L_{m_t,t}(i_{m_t,t},k)
\end{split}
\end{align}
and pulls the most informative arm $k_{m_t,t}$ (whose context is denoted as $\x_{m_t,t}$). The exploration bonuses of pair $(i,j)$ in the linear case are defined as $\alpha^L_{m_t,t}(i,j) = \Vert \y(i,j)\Vert_{\V_{m_t,t}^{-1}} C_{m_t,t}$ and $\alpha^L_{ser,t}(i,j) = \Vert \y(i,j)\Vert_{\V_{ser,t}^{-1}} C_{ser,t}$, where the definitions of the scalars $C_{m_t,t}$ and $C_{ser,t}$ are provided in Theorem \ref{theorem2}. Besides, the estimated reward gaps between arms $i$ and $j$ are defined as $\hat{\Delta}_{m_t,t}(i,j) = \y(i,j)^\top\hat{\t}_{m_t,t}$ and $\hat{\Delta}_{ser,t}(i,j) = \y(i,j)^\top\hat{\t}_{ser,t}$. Agent $m_t$ would update its covariance matrix $\V_{m_t,t}^{loc}$, $\b_{m_t,t}^{loc}$ and $T_{m_t,t}^{loc}(k_{m_t,t})$ as
\begin{align}\label{alg2eq2}
    \begin{split}
    &\V_{m_t,t}^{loc} = \V_{m_t,t-1}^{loc} + \x_{m_t,t}\x_{m_t,t}^{\top},\\& \b_{m_t,t}^{loc} = \b_{m_t,t-1}^{loc} + r_{m_t,t}\x_{m_t,t},\\ &T_{m_t,t}^{loc}(k_{m_t,t}) = T_{m_t,t-1}^{loc}(k_{m_t,t})+1.
    \end{split}
\end{align}
\texttt{FALinPE} utilizes a hybrid event-triggered strategy to control the size of the exploration bonuses $\alpha^L_{ser,t}(i,j)$ and $\alpha^L_{m_t,t}(i,j)$, and the observation numbers $\sum_{k=1}^KT_{ser,t}(k)$ and $\sum_{k=1}^KT_{m_t,t}(k)$. If at least one of the two events is triggered, then agent $m_t$ would upload its collected data $\V^{loc}_{m_t,t}$, $\b^{loc}_{m_t,t}$ and $T^{loc}_{m_t,t}(k)$, $\forall k\in\A$ to the server. The server would update its collected data and estimation
\begin{align}\label{alg2eq3}
\begin{split}
   & \V_{ser,t} = \V_{ser,t-1} + \V^{loc}_{m_t,t},\\&  \b_{ser,t} = \b_{ser,t-1} + \b_{m_t,t}^{loc},\\& T_{ser,t}(k) = T_{ser,t-1}(k) + T_{m_t,t}^{loc}(k),\ \forall k\in\A,\\&\hat{\t}_{ser,t} = \V_{ser,t}^{-1}\b_{ser,t}
\end{split}
\end{align}
and set $i_{ser,t}$, $j_{ser,t}$, and the breaking index $B(t)$ as
\begin{align}\label{alg2eq4}
    \begin{split}
        &i_{ser,t} = \arg\max_{k\in\A} \x_k^\top\hat\t_{ser,t},\\& j_{ser,t} = \arg\max_{k \in \A/\{i_{ser,t}\}} \hat{\Delta}_{ser,t}(k,i_{ser,t}) + \alpha^L_{ser,t}(i_{ser,t},k),\\& B(t) =  \hat{\Delta}_{ser,t}(j_{ser,t},i_{ser,t}) + \alpha^L_{ser,t}(i_{ser,t},j_{ser,t}).
    \end{split}
\end{align}
If the breaking index $B(t) > \epsilon$, the server would return $\V_{ser,t}$, $\b_{ser,t}$ and $T_{ser,t}(k)$, $\forall k\in\A$ to agent $m_t$. \texttt{FALinPE} would repeat the above steps until $B(\tau) \le \epsilon$.

\paragraph{Design of communication events of \texttt{FALinPE}} Similar to \texttt{FAMABPE}, \texttt{FALinPE} also enjoys a low switching cost (i.e., $1/2\C(\tau)$). Besides, the hybrid event-triggered strategy can simultaneously control the size of $\V^{loc}_{m,t}$, $\sum_{k=1}^KT^{loc}_{m,t}(k)$ and the exploration bonuses. Note that \citet{Min2021LearningSS} also utilize a hybrid event-triggered communication protocol to achieve a similar goal, but for learning stochastic shortest path with linear function approximation. The exploration bonuses in the linear case are not only related to $t$ but also related to the covariance matrices. Therefore, different from the communication protocol in the MAB, the event-triggered communication protocol in the linear bandits is additionally required to keep $\V_{m_t,t}$ and $\V_{ser,t}$ in a desired proportion to the global covariance matrix $\lambda\bI + \sum_{s=1}^t \x_{m_s,s}\x_{m_s,s}^\top$.


\paragraph{Arm selection strategy } To minimize the sample complexity $\tau$, we hope every agent can pull the most informative arm $k_{m_t,t}$ to reduce the exploration bonus $\alpha^L_{m_t,t}(i_{m_t,t}, j_{m_t,t})$ as fast as possible. The arm selection strategy of Algorithm \ref{alg2} ensures that the active agent $m_t$ pulls the arm $k_{m_t,t}$ that most decreases the matrix norm $\Vert \y(i_{m_t,t}, j_{m_t,t}) \Vert_{\V_{m_t,t}^{-1}}$ (and also $\alpha^L_{m_t,t}(i_{m_t,t}, j_{m_t,t})$). Different from the MAB, in the linear case we cannot directly find $k_{m_t,t}$ and need to derive it from a linear program \citep{Xu2017AFA}, which yields
\begin{align} \label{select2}
    k_{m_t,t} = \arg\min_{k\in\A} \frac{T_{m_t,t}(k)}{p_k^*(\y(i_{m_t,t}, j_{m_t,t}))},
\end{align}
where $p^*(\cdot)$ is defined as follows
\begin{align}
    p_k^*(\y(i_{m_t,t},j_{m_t,t})) = \frac{\vert w_k^*(\y(i_{m_t,t}, j_{m_t,t})) \vert}{\sum_{s=1}^K \vert w_s^*(\y(i_{m_t,t}, j_{m_t,t})) \vert}
\end{align}
and 
\begin{align}
\begin{split}
\label{programming}
 \bold{w}^*(\y(i_{m_t,t},&j_{m_t,t})) =  \arg\min_{\bold{w} \in \R^K} \sum_{k=1}^K\vert 
w_k \vert\\& \text{s.t.}\quad \y(i_{m_t,t}, j_{m_t,t}) = \sum_{k=1}^K w_k\x_k.
\end{split}
\end{align}
The notation $w_k^*(\y(i_{m_t,t}, j_{m_t,t}))$ denotes the $k$-th element of the vector $\bold{w}^*(\y(i_{m_t,t}, j_{m_t,t}))$. Besides, the optimal value of program (\ref{programming}) is denoted as $\rho(\y(i,j)) = \sum_{k=1}^K \vert w^*_k(\y(i,j)) \vert$, $\forall i,j\in\A$. However, solving this program is computationally expensive, so we also propose to select the arm greedily, similar to \citet{Xu2017AFA},
\begin{align}\label{greedy}
\begin{split}
    k_{m_t,t} =& \arg\max_{k \in \A} \y(i_{m_t,t}, j_{m_t,t})^\top\\&\times(\V_{m_t,t} + \x_k\x_k^\top)^{-1} \y(i_{m_t,t}, j_{m_t,t}).
\end{split}
\end{align}
Although we did not analyze the theoretical property of the greedy arm selection strategy, in the experiment section, we empirically validate that it performs well.

\begin{theorem} \label{theorem2} With $0 < \lambda \le \sigma^2\big(\sqrt{1 + \gamma_1 M} + \sqrt{2\gamma_1}M\big)^2\log(2/\delta)$, $\gamma_1 = 1/M^2$, $\gamma_2 = 1/(2MK)$, arm selection strategy (\ref{select2}) and
\begin{align}
\begin{split}
\nonumber
    &C_{m_t,t} = \sqrt{\lambda} + \Big(\sqrt{2\gamma_1}M + \sqrt{1 + \gamma_1 M}\Big) \\&\times\bigg(\sigma\sqrt{d\log\bigg(\frac{2}{\delta}\bigg(1+\frac{(1+\gamma_2 M) \sum_{k=1}^K T_{m_t,t}(k)}{\min(\gamma_1,1)\lambda}\bigg)\bigg)} \bigg)\\
   &C_{ser,t} = \sqrt{\lambda} + \Big(\sqrt{2\gamma_1}M + \sqrt{1 + \gamma_1 M}\Big) \\&\times\bigg(\sigma\sqrt{d\log\bigg(\frac{2}{\delta}\bigg(1+\frac{(1+\gamma_2 M) \sum_{k=1}^K T_{ser,t}(k)}{\min(\gamma_1,1)\lambda}\bigg)\bigg)} \bigg),
\end{split}
\end{align}
the estimated best arm $\hat k^*$ of \texttt{FALinPE} can satisfy condition (\ref{1}) and with probability at least $1-\delta$ the sample complexity can be bounded by
\begin{align}
\begin{split}
\nonumber
            \tau \le& \frac{M + 1/(2K)}{M - 1/2}   \bigg(\sqrt{2} +  \sqrt{ 1 + \frac{1}{M}}\bigg)^2  H^L_\epsilon 4 \sigma^2 d \\&\times \log\bigg(1 + \frac{(1 + 1/(2K)) \Lambda^2}{\lambda/M^{2}}\bigg) + \Gamma,
\end{split}
\end{align}
where 
\begin{align}
\nonumber
        H^L_\epsilon =  \sum_{k=1}^K \max_{i,j\in\A} \frac{\rho(\y(i,j))p^*_k(\y(i,j))}{\max\big(\frac{\Delta(k^*,i) + \epsilon}{3}, \frac{\Delta(k^*,j) + \epsilon}{3}, \epsilon\big)^2}
\end{align}
is the problem complexity in the linear bandits \citep{Xu2017AFA}, 
\begin{align}
\begin{split}
\nonumber
 \Lambda =& \frac{M + 1/(2K)}{M - 1/2} \bigg(\sqrt{2} +  \sqrt{ 1 + \frac{1}{M}}\bigg)^2 H^L_\epsilon 4 \sigma^2 d\\&\times \sqrt{1 + \frac{1 + 1/(2K) }{\lambda/M^{2}}} + \Gamma 
\end{split}
\end{align}
and
\begin{align}
\nonumber
 \Gamma = \frac{M + 1/(2K)}{M - 1/2} \bigg(\sqrt{2} +  \sqrt{ 1 + \frac{1}{M}}\bigg)^2 H^L_\epsilon 4 \sigma^2 d \log\Big(\frac{2}{\delta}\Big).
\end{align}
The communication cost satisfies $\C(\tau) = \tilde{O}\big( \max(M^{2}d,MK) \big)$.
\end{theorem}

\begin{remark}
As mentioned in \citep{Xu2017AFA}, the sample complexity of LinGapE run by a single agent (i.e., $\tau = \tilde{O}(H_\epsilon^L d)$) can match the lower bound in \citet{Soare2014BestArmII} up to a constant factor. As shown in Theorem \ref{theorem2}, the sample complexity of \texttt{FALinPE} also satisfies $\tau = \tilde{O}(H_\epsilon^L d)$ when we select the proper $\lambda$, $\gamma_1$ and $\gamma_2$. Besides, the communication cost of \texttt{FALinPE} satisfies $\C(\tau) = \tilde{O}(dM^2)$ when $M \ge K/d$, which is the same as the communication cost of Async-LinUCB \citep{Li2021AsynchronousUC} and FedLinUCB \citep{He2022ASA} (both are $\tilde{O}(dM^2)$) in the regret minimization setting. It is worth noting that in our and \cite{He2022ASA}'s setting, the communication between the active agent and the server is independent of the offline agents, while in \cite{Li2021AsynchronousUC}'s setting, the algorithm requires a global download section. In addition, the guarantee in \cite{Li2021AsynchronousUC} relies on a stringent regularity assumption on the contexts, while ours and \cite{He2022ASA}'s do not. We claim that \texttt{FALinPE} can
achieve a near-optimal sample complexity and efficient communication cost.
\end{remark}

