\section{Preliminaries}\label{sec:preli_fa}






\subsection{An optimization perspective on federated averaging}
\label{fl_ag}
%\vspace{-2mm}
Federated averaging (FedAvg) is a standard algorithm in federated learning and is typically formulated into a distributed optimization framework as follows
\begin{align}\label{optim_perspective}
    \min_{\theta} \ell(\theta):=\frac{\sum_{c=1}^N \ell^c(\theta)}{\sum_{c=1}^N n_c},\quad \ell^c(\theta):= \sum_{i=1}^{n_c} l(\theta; x_{c, i}),
\end{align}
where $\theta\in\mathbb{R}^d$, $N$ is the number of clients, $n_c$ is the number of data points in the $c$-th client, and $l(\theta;x_{c,i})$ is a certain loss function based on $\theta$ and the data point $x_{c,i}$.

\Wei{take care the update here in the next revision.}
One iterate of the FedAvg algorithm requires the following three steps:
\begin{itemize}
    \item \emph{Broadcast}: The center server \emph{broadcasts} the latest model, $\theta_k$, to all local clients.
    \item \emph{Local updates}: For any $c\in [N]$, the $c$-th client first sets $\beta_k^c=\theta_k$ and then conducts $K\geq 1$ local steps:
\begin{align*}
     \beta_{k+1}^c=\beta_k^c-\eta\nabla \tilde \ell^c(\beta_k^c),
\end{align*}
where $\eta$ is the learning rate and $\nabla \tilde \ell^c$ is the unbiased estimate of the exact gradient $\nabla \ell^c$.
    % \item \emph{Synchronization}: The local models are aggregated into a unique model $\theta_{k+K}:=\sum_{c=1}^N p_c \beta_{k+K}^c$ and sent to the center server. 
    \item \emph{Synchronization}: The local models are sent to the center server and then aggregated into a unique model $\theta_{k+K}:=\sum_{c=1}^N p_c \beta_{k+K}^c$, where $p_c>0$ is the weight of the $c$-th client such that $\sum_{c=1}^N p_c=1$.
\end{itemize}

From the optimization perspective, \cite{lhy+20} proved the convergence of the FedAvg algorithm on non-i.i.d data and showed that a larger number of local steps $K$ and a higher degree of data heterogeneity slow down the convergence. Notably, Eq.~\eqref{optim_perspective} can be interpreted as maximizing the likelihood function, which is a special case of maximum a posteriori estimation (MAP) given a uniform prior.
%\vspace{-2mm}
\subsection{Stochastic gradient Langevin dynamics}
%\vspace{-2mm}
Posterior inference offers the exact uncertainty quantification ability of the predictions. A popular method for posterior inference with large datasets is the stochastic gradient Langevin dynamics (SGLD)~\cite{Welling11}, which injects additional noise into the stochastic gradient and adapts an optimization algorithm to a sampling one
\begin{align*}
     \theta_{k+1}=\theta_k-\eta\nabla \tilde f(\theta_k)+\sqrt{2\tau \eta}\xi_k,
\end{align*}
where $\tau$ is the temperature and $\xi_k$ is a standard Gaussian vector. $f(\theta):=\sum_{c=1}^N \ell^c(\theta)$ is an energy function. $\nabla \tilde f(\theta)$ is an unbiased estimate of $\nabla f(\theta)$. In the long-time limit, a well-known result is that $\theta_k$ converges weakly to the distribution $\pi(\theta)\propto \exp(-{f(\theta)}/{\tau})$ \cite{Teh16} as $\eta\rightarrow 0$.

\section{Posterior inference via federated averaging Langevin dynamics}\label{sec:posterior_inference}

The increasing concern for uncertainty estimation in federated learning motivates us to consider the simulation of the distribution $\pi(\theta)\propto \exp(-{f(\theta)}/{\tau} )$ with distributed clients.



\paragraph{Problem formulation} We propose the federated averaging Langevin dynamics (FA-LD) based on the FedAvg framework in Section~\ref{fl_ag}. We follow the same \emph{broadcast} step and \emph{synchronization} step but propose to inject random noise for \emph{local updates}. In particular, we consider the following scheme: for any $c\in [N]$, the $c$-th client first sets $\theta_k^c=\theta_k$ and then conducts $K\geq 1$ local steps:
\begin{align}\label{local_independent_noise}
     \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau} \Xi_k^c,
\end{align}
where $\nabla f^c(\theta)=\frac{1}{p_c} \nabla \ell^c(\theta)$. $\nabla \tilde f^c(\theta)$ is the unbiased estimate of $\nabla f^c(\theta)$ and $\Xi_k^{c}$ is an independent Gaussian vector to be defined later.



% achieving the targeting stationary distribution $\pi(\theta)\propto e^{-\frac{f(\theta)}{\tau}}$ requires a larger temperature $\tau/p_c$ for the $c$-th local client.


% Following the synchronization step \cite{lhy+20}, we have
% \begin{equation}
% \label{virtual_seq_main}
% \theta_k=\sum_{c=1}^N p_c \theta_k^c,\quad \nabla f(\theta_k)=\sum_{c=1}^N p_c \nabla f^c(\theta_k^c), \quad \nabla\tilde f(\theta_k)=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c),
% \end{equation}
% where both $\beta_k$ and $\theta_k$ are \emph{both inaccessible when $k \text{ mod } K\neq 0$}. We also define $\xi_k=\sum_{c=1}^N \sqrt{p_c} \xi_k^c$, it follows that $\xi_k$ is still a standard $d$-dimensional Gaussian vector given $\sum_{c=1}^N p_c=1$. 

Taking the weighted sum of Eq.~\eqref{local_independent_noise} over clients $c=1$ to $N$ with weights $p_c$, we have the aggregated stochastic process as follows
\begin{align*}
    \beta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k,
\end{align*}
where
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% One-column version %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% \begin{align}\label{decomposition}
%     \beta_k=\sum_{c=1}^N p_c \beta_k^c,\quad \theta_k=\sum_{c=1}^N p_c \theta_k^c,\quad \nabla\tilde f(\theta_k)=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c),\quad\xi_k=\sum_{c=1}^N p_c \Xi_k^c.
% \end{align}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Two-column version %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{align}\label{decomposition}
    \beta_k&=\sum_{c=1}^N p_c \beta_k^c,\quad \theta_k=\sum_{c=1}^N p_c \theta_k^c,\notag\\
    \nabla\tilde f(\theta_k)&=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c),\quad\xi_k=\sum_{c=1}^N p_c \Xi_k^c.
\end{align}

By the nature of the \emph{synchronization} step, we always have $\beta_{k+1}=\theta_{k+1}$ whether $k+1 \text{ mod } K=0$ or not. In what follows, we can write
\begin{equation}
\label{fed_avg_langevin_dynamics_main}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k,
\end{equation}
which resembles the SGLD algorithm except that the construction of stochastic gradients is different and $\theta_k$ is \emph{not accessible when $k\text{ mod } K\neq 0$}. Since our target is to simulate from $\pi(\theta)\propto \exp( - f(\theta)/\tau )$, we expect that $\xi_k$ is a standard Gaussian vector. Since the variance of independent Gaussian variables is additive, it is natural to set $\Xi_k^c=\xi_k^c/\sqrt{p_c}$, where each $\xi_k^c$ is an independent standard Gaussian vector, so that $\xi_k=\sum_{c=1}^N p_c \Xi_k^c=\sum_{c=1}^N \sqrt{p_c} \xi_k^c$ is also a standard Gaussian vector given $\sum_{c=1}^N p_c=1$. We present the resulting algorithm in Algorithm~\ref{alg:alg_main_paper_text_independent_noise}.

\begin{algorithm*}[h]\caption{Federated averaging Langevin dynamics algorithm (FA-LD), informal version of Algorithm \ref{alg:alg_main_text_independent_noise}. $\eta_k$ is the learning rate at iteration $k$. $\tau$ is the temperature. Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the intermediate result of a one-step SGLD update from $\theta_k^c$ by $\beta_{k+1}^c$. $\xi_k^c$ is an independent standard $d$-dimensional Gaussian vector at iteration $k$ for each client $c\in[N]$. A global synchronization is conducted every $K$ steps.}\label{alg:alg_main_paper_text_independent_noise}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client_main_paper}
    \beta_{k+1}^c=\theta_k^c-\eta_k\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta_k\tau/p_c}\xi_k^c,
\end{equation}
\State
\begin{equation}  
\label{synchronization_main_paper}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{ if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}



\begin{algorithm*}[h]\caption{Hybrid federated averaging Langevin dynamics algorithm (hFA-LD), informal version of Algorithm \ref{alg:alg_main_text_different_seeds}. $\dot{\xi}_k$ is a $d$-dimensional Gaussian vector shared by all the clients; $\xi_k^c$ is an independent standard $d$-dimensional Gaussian vector at iteration $k$ for each client $c\in[N]$. $\rho$ denotes the correlation coefficient.}\label{alg:alg_main_paper_text_different_seeds}
\begin{algorithmic}[1]
\State \begin{equation*}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot\xi_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation*}
\State
\begin{equation*}  
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{ if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation*} 
\end{algorithmic}
\end{algorithm*}

We observe that the local process in Eq.~\eqref{local_client_main_paper} maintains a temperature $\tau/p_c>\tau$ to converge to the stationary distribution $\pi$. Such a mechanism may limit the disclosure of individual data and shows a potential to ensure a higher level of privacy.   

% \paragraph{Extension to correlated noise}

% \paragraph{Partial device participation}


\section{Convergence analysis}\label{sec:convergence}

% We define the 2-Wasserstein distance between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
% \begin{align*}
%     W_2(\mu, \nu):=\inf_{\gamma^2\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \gamma^2(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
% \end{align*}
% where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$, where $\mathcal{L}(\cdot)$ denotes a distribution of a random variable.

% \Wei{W2 distribution}

In this section, we show that FA-LD converges to the stationary distribution $\pi(\theta)$ in the 2-Wasserstein ($W_2$) distance at a rate of $O({1}/{\sqrt{T_{\epsilon}}})$ for strongly log-concave and smooth density. The $W_2$ distance is defined between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
\begin{align*}
% \small
    W_2(\mu, \nu):=\inf_{\gamma^2\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \gamma^2(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
\end{align*}
where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$. Note that $\mathcal{L}(\cdot)$ denotes a distribution of a random variable. Such a distance is more appealing than the total variation or the Kullback–Leibler divergence in statistical machine learning applications for providing the estimates of the first and second order moments.





\subsection{Notation and assumptions}

We make standard assumptions on the smoothness and convexity of the functions $f^1, f^2,\cdots, f^N$, which naturally yields appealing tail properties of the stationary measure $\pi$. Thus, we no longer require a restrictive assumption on the bounded gradient in $\ell_2$ norm as in \cite{Koloskova19, yyz19, lhy+20}. In addition, to control the distance between $\nabla f^c$ and $\nabla \tilde f^c$, we also assume a bounded variance of the stochastic gradient in assumption \ref{def:variance_main}.


\begin{assumption}[Smoothness]\label{def:smooth_main} For each $c\in [N]$, we say $f^c$ is $L$-smooth if for some $L>0$ 
\begin{align*}
% \small
f^c(y)\leq f^c(x)+\langle \nabla f^c(x),y-x \rangle+\frac{L}{2}\| y-x \|^2_2 \quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

\begin{assumption}[Strongly convex]\label{def:strong_convex_main}
For each $c\in [N]$, $f^c$ is $m$-strongly convex if for some $m>0$
\begin{align*}
% \small
f^c(x)\geq f^c(y)+\langle \nabla f^c(y),x-y \rangle + \frac{m}{2} \| y-x \|_2^2 \quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

\begin{assumption}[Bounded variance, informal version of Assumption \ref{def:variance}]\label{def:variance_main}
For each $c\in [N]$, the variance of noise in the stochastic gradient $\nabla \tilde f^c(x)$ in each client is upper bounded such that 
\begin{align*}
\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2] \leq \sigma^2 d,\quad \forall x\in \R^d.
\end{align*}
\end{assumption}

% The bounded variance is a rather standard assumption and has been widely used in \cite{ccbj18, dk19, lhy+20}. Extension of bounded variance to unbounded cases is quite straightforward and has been adopted in assumption A.4 stated in \cite{Maxim17}. 


\paragraph{Quality of non-i.i.d data} Denote by $\theta_*$ the global minimum of $f$. Next, we quantify the degree of the non-i.i.d data by $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2$, which is a non-negative constant and yields a larger scale if the data is less identically distributed.

%\Wei{Zhao, do we need to change the index? like assumption 1 instead of 4.1?} \Zhao{Using 4.1 and 3.2 such numbering is easy for reviewer to find statement, since it's easy to find section number. When I was reviewing paper, it is very hard to find statements if everything listed by 1,2,3.} \Wei{OK}


\subsection{Proof sketch}

The proof hinges on showing the one-step result in the $W_2$ distance. To facilitate the analysis, we first define an auxiliary continuous-time process $(\bar\theta_t)_{t\geq 0}$ without communication concerns
\begin{align}
\label{continuous_dynamics_main}
\d \bar\theta_t = - \nabla f(\bar\theta_t) \d t + \sqrt{2\tau} \d \overline{W}_t,
\end{align}
where $\bar\theta_t=\sum_{c=1}^N p_c \bar\theta_t^c$, $\nabla f(\bar\theta_t)=\sum_{c=1}^N p_c \nabla f^c(\bar\theta_t^c)$, $\bar\theta_t^c$ is the continuous-time variable at client $c$, and $\overline{W}$ is a $d$-dimensional Brownian motion. The continuous-time algorithm is known to converge to the stationary distribution $\pi(\theta)\propto e^{-\frac{f(\theta)}{\tau}}$, where $f(\theta)=\sum_{c=1}^N p_c  f^c(\theta)$. Assume that $\bar\theta_0$ simulates from the stationary distribution $\pi$, then it follows that $\bar\theta_t\sim\pi$ for any $t\geq 0$.

\subsubsection{Dominated contraction in federated learning}


The first target is to show a certain contraction property of $\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2$ based on distributed clients with infrequent communications. Consider a standard decomposition 
\begin{align*}
    &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2\notag\\
    &=\lrn{\beta-\theta}_2^2 -2\eta \underbrace{\langle \beta-\theta, \nabla f(\beta)-\nabla f(\theta)\rangle}_{\mathcal{I}}+\eta^2 \lrn{\nabla f(\beta)-\nabla f(\theta)}_2^2.
\end{align*}

Using Eq.\eqref{decomposition}, we decompose $\mathcal{I}$ and apply Jensen's inequality to obtain a lower bound of $\mathcal{I}$. In what follows, we have the following lemma. 
\begin{lemma}[Dominated contraction property, informal version of Lemma \ref{contraction}]
\label{contraction_main}
Assume assumptions \ref{def:smooth_main} and \ref{def:strong_convex_main} hold. For any learning rate $\eta \in (0, \frac{1}{L+m}]$, any $\{\theta^c\}_{c=1}^N, \{\beta^c\}_{c=1}^N\in\mathbb{R}^d$, % simulated from Eq.\eqref{fed_avg_langevin_dynamics} and Eq.\eqref{continuous_dynamics}, respectively, 
we have
\begin{align*}
% \small
    &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2\leq (1-\eta m) \cdot \|\beta-\theta \|_2^2+4\eta L\sum_{c=1}^N p_c \cdot \underbrace{( \| \beta^c-\beta \|_2^2 + \|\theta^c-\theta \|_2^2 )}_{\text{divergence term}},
\end{align*}
\end{lemma}
where $\beta=\sum_{c=1}^N p_c \beta^c$, $\theta=\sum_{c=1}^N p_c \theta^c$, $\nabla f(\theta)=\sum_{c=1}^N p_c \nabla f^c(\theta^c)$, and $\nabla f(\beta)=\sum_{c=1}^N p_c \nabla f^c(\beta^c)$. It implies that as long as the local parameters $\theta^c,\beta^c$ and the global parameters $\theta,\beta$ do not differ from each other too much, we can guarantee the desired convergence. In a special case when the communication is conducted at every iteration, the divergence term disappears and recovers the standard contraction \cite{dk19}.


\subsubsection{Bounding divergence}


The following result shows that given a finite number of local steps $K$, the divergence between $\theta^c$ in local client and $\theta$ in the center is bounded in $\ell_2$ norm. Notably, since the Brownian motion leads to a lower order term $O(\eta)$ instead of $O(\eta^2)$, a na\"{i}ve proof framework such as \cite{lhy+20} may lead to a crude upper bound for the final convergence.  

\begin{lemma}[Bounded divergence, informal version of Lemma \ref{divergence}]\label{divergence_main}
Assume assumptions  \ref{def:smooth_main}, \ref{def:strong_convex_main}, and \ref{def:variance_main} hold. For any $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and some constant $\mathcal{D}$, we have the $\ell_2$ upper bound of the divergence between local clients and the center as follows
\begin{align*}
    \sum_{c=1}^N p_c\E{\|\theta_k^c-\theta_k \|_2^2}&\leq O(K^2\eta^2 d) +O(K\eta d).\notag
\end{align*}
\end{lemma}

The result also relies on showing a uniform upper bound in $\ell_2$ norm, which avoids making extra bounded gradient assumptions.



\subsubsection{Coupling to the stationary process}

Note that $\bar\theta_t$ is initialized from the stationary distribution $\pi$. 
The solution to the continuous-time process Eq.\eqref{continuous_dynamics_main} follows:
\begin{align}
\label{solution_continuous_dynamics_main}
    \bar\theta_t=\bar\theta_0 -\int_0^t \nabla f(\bar\theta_s)\d s + \sqrt{2\tau}\cdot\overline{W}_t, \qquad \forall t\geq 0.
\end{align}


Setting $t\rightarrow(k+1)\eta$ and $\bar\theta_0\rightarrow\bar\theta_{k\eta}$ in Eq.~\eqref{solution_continuous_dynamics_main} and considering a \emph{synchronous coupling} such that $\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta}:=\sqrt{\eta}\xi_k$ is used to cancel the noise terms, we have
\begin{align}
\label{continuous_one_step_main}
    \bar\theta_{(k+1)\eta}=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\tau\eta}\xi_k.
\end{align}

Subtracting Eq.\eqref{fed_avg_langevin_dynamics_main} from Eq.\eqref{continuous_one_step_main} and taking square and expectation on both sides yield that
\begin{align*}
    \E{\|\bar\theta_{(k+1)\eta}-\theta_{k+1} \|^2_2}&\leq  (1-{\eta m}/{2} ) \cdot \E{\|\bar\theta_{k\eta}-\theta_k\|_2^2}+\text{divergence term} + \text{time error}.
\end{align*}

Eventually, we arrive at the one-step error bound for establishing the convergence results.

\begin{lemma}[One step update, informal version of Lemma \ref{one_step_Dalalyan}]\label{one_step_Dalalyan_main}

Assume assumptions \ref{def:smooth_main}, \ref{def:strong_convex_main}, and \ref{def:variance_main} hold. Consider Algorithm \ref{alg:alg_main_paper_text_independent_noise} with any learning rate $\eta \in (0 , \frac{1}{2L})$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, where $\theta_*$ is the global minimum for the function $f$. Then
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-{\eta m}/{2}) \cdot W^2_2(\mu_{k}, \pi)+ O(\eta^2 d(K^2+\kappa)),
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$ and $\kappa=L/m$ is the condition number.
\end{lemma}

\subsection{Full device participation}

\subsubsection{Convergence based on independent noise}
\label{ind_converge}
When the synchronization step is conducted at every iteration $k$, the FA-LD algorithm is essentially the standard SGLD algorithm \cite{Welling11}. Theoretical analysis based on the 2-Wasserstein distance has been established in \cite{dm+16, Dalalyan17, dk19}. However, in scenarios of $K> 1$ with distributed clients, a divergence between the global variable $\theta_k$ and local variable $\theta^c_k$ appears and unavoidably affects the performance. The upper bound on the sampling error is presented as follows.

\begin{theorem}[Main result, informal version of Theorem \ref{main_theorem}]\label{main_paper_theorem} Assume assumptions \ref{def:smooth_main}, \ref{def:strong_convex_main}, and \ref{def:variance_main} hold. Consider Algorithm \ref{alg:alg_main_paper_text_independent_noise} with a fixed learning rate $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have \footnote[2]{For ease of presentation, we report the result based on $K^2$ instead of $(K-1)^2$. The upper bound based on $(K-1)^2$ is detailed in the supplementary file.}
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1- {\eta m}/{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)+30\kappa\sqrt{\eta m d } \cdot \sqrt{(K^2+\kappa)H_0},\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$ at iteration $k$, $K$ denotes the number of local updates, $\kappa :=L/m$, $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2$, and $H_{0} := \mathcal{D}^2+\max_{c\in[N]}\frac{\tau}{mp_c} +\frac{\gamma^2}{m^2 d}+\frac{\sigma^2}{m^2}$.




\end{theorem}




We observe that the 
%parameter 
initialization, the scale of the injected noise, the heterogeneity of the data, and the noise in the stochastic gradient all affect the convergence. Similar to the result of \cite{lhy+20}, FA-LD with $K$-local steps resembles the behaviour of one-step SGLD with a large learning rate. %Parallel to our work, \cite{qlsd} showed that the convergence does not rely on 

\textbf{Optimal choice of $K$.} To ensure that the algorithm achieves the precision $\epsilon$ based on the total number of steps $T_{\epsilon}$ and the learning rate $\eta$, we can set 
\begin{align*}
    &30\kappa\sqrt{\eta m d} \cdot  \sqrt{(K^2+\kappa)H_0}  \leq {\epsilon}/{2},\quad \exp\big(-\frac{\eta m}{4} T_{\epsilon} \big) \cdot  \sqrt{2d} (\mathcal{D} +  \sqrt{\tau/m}  ) \leq {\epsilon}/{2}.
\end{align*}
This readily leads to 
\begin{align*}
    \eta m\leq  O\bigg(\frac{\epsilon^2}{d\kappa^2  {(K^2+\kappa)H_0}}\bigg),\quad T_{\epsilon}\geq \Omega\bigg(\frac{\log({d}/{\epsilon})}{m\eta}\bigg).
\end{align*}

Plugging into the upper bound of $\eta m$, it implies that to reach the precision $\epsilon$, it suffices to set %\Zhao{In the following quantity, after you re-define $H_0$, then $m$ should be gone}\Wei{Done!}
\begin{align}\label{def_T_main}
    T_{\epsilon}=\Omega( \epsilon^{-2}  d\kappa^2 {(K^2+\kappa)H_0} \cdot \log({d}/{\epsilon}) ).
\end{align}
It's obvious that $H_0 = \Omega(\mathcal{D}^2)=\Omega(1)$, thus we can conclude that the number of communication rounds is around the order 
\begin{align*}
    \frac{T_{\epsilon}}{K}=\Omega\bigg( K+\frac{\kappa}{K}\bigg),
\end{align*}
where the value of $\frac{T_{\epsilon}}{K}$ first decreases and then increases with respect to $K$, indicating that setting $K$ either too large or too small may lead to high communication costs and hurt the performance. Ideally, $K$ should be selected in the scale of $\Omega(\sqrt{\kappa})$. Combining the definition of $T_{\epsilon}$ in Eq.~\eqref{def_T_main}, this suggests an interesting result that the optimal $K$ for FA-LD should be in the order of $O(\sqrt{T_{\epsilon}})$. Similar results have been achieved by \cite{Stich19, lhy+20}.


\subsubsection{Convergence guarantees via varying learning rates}

\begin{theorem}[Informal version of Theorem \ref{main_theorem_decay}]\label{main_paper_theorem_decay} Assume assumptions \ref{def:smooth_main}, \ref{def:strong_convex_main}, and \ref{def:variance_main} hold. Consider Algorithm \ref{alg:alg_main_paper_text_independent_noise} with an initialization satisfying $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and the varying learning rate following
\begin{align*}
    \eta_{k}=\frac{1}{2L+(1/12)m k},\qquad k=1,2,\cdots.
\end{align*}
Then for any $k\geq 0$, we have %\Zhao{I don't see any problem for replacing $K-1$ by $K$. Perhaps I miss something here.}
\begin{align*}
    W_2(\mu_{k}, \pi)\leq 45\kappa\sqrt{ (K^2+\kappa)H_0}\cdot\big(\eta_k m d\big)^{1/2}, \qquad \forall k \geq 0.
\end{align*}
\end{theorem}



Note that the above result implies that to achieve the precision $\epsilon$, we require
\begin{align*}
     W_2(\mu_{k}, \pi)\leq 45\kappa\sqrt{ (K^2+\kappa)H_0} \cdot \bigg(\frac{md}{2L+(1/12){mk}}\bigg)^{1/2}\leq \epsilon.
\end{align*}
We therefore require ${\Omega} ( \epsilon^{-2} d )$ iterations to achieve the precision $\epsilon$, which improves the $\Omega( \epsilon^{-2} d \log( {d}/{\epsilon} ))$ rate for FA-LD with a fixed learning rate by a $O(\log(d/\epsilon))$ factor.



\subsubsection{Privacy-accuracy trade-off via correlated noises}


Note that Algorithm \ref{alg:alg_main_paper_text_independent_noise} requires all the local clients to generate the independent noise $\xi^c_k$. Such a mechanism enjoys the convenience of the implementation and yields a potential to protect the privacy of data and alleviates the security issue. However, the large-scale noise inevitably slows down the convergence. To handle this issue, the independent noise can be generalized to correlated noise based on a correlation coefficient $\rho$ between different clients. Specifically, we replace Eq.~\eqref{local_client_main_paper} with 
\begin{equation}\label{local_client_diff_seeds_main_paper}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot{\xi}_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation}
where $\dot{\xi}_k$ is a $d$-dimensional standard Gaussian vector shared by all the clients at iteration $k$ and $\dot\xi_k$ is independent of $\xi_k^c$ for any $c\in[N]$. Following the synchronization step based on Eq.~\eqref{synchronization_main_paper}, we have
\begin{equation}
\label{fed_avg_langevin_dynamics_pp_main_paper}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k,
\end{equation}
where $\xi_k=\rho \dot\xi_k + \sqrt{1-\rho^2}\sum_{c=1}^N \sqrt{p_c}\xi_k^c$. Since the variance of i.i.d variables is additive, it is clear that $\xi_k$ follows the standard $d$-dimensional Gaussian distribution. The inclusion of the correlated noise implicitly reduces the temperature for each client and naturally yields a trade-off between federation and accuracy. We refer to the algorithm with correlated noise as the hybrid federated averaging Langevin dynamics (hFA-LD) and present it in Algorithm \ref{alg:alg_main_paper_text_different_seeds}.




Since the inclusion of correlated noise does not affect the iterate of Eq.~\eqref{fed_avg_langevin_dynamics_pp_main_paper}, the properties of the algorithm remain the same except that the scale of the temperature $\tau$ and the efficacy of federation are changed. Based on a target correlation coefficient $\rho\geq 0$, Eq.~\eqref{local_client_diff_seeds_main_paper} is equivalent to applying a temperature $T_{c,\rho}=\tau(\rho^2+(1-\rho^2)/p_c)$. In particular, setting $\rho=0$ leads to $T_{c, 0}=\tau/p_c$, which exactly recovers Algorithm \ref{alg:alg_main_paper_text_independent_noise}; however, setting $\rho=1$ leads to $T_{c, 1}=\tau$, where the variance of the injected noise in local clients is reduced by a factor of $1/p_c$. Now we adjust the analysis as follows.
\begin{theorem}[Informal version of Theorem \ref{correlated_noise_supp}]\label{correlated_noise_main} Assume assumptions \ref{def:smooth_main}, \ref{def:strong_convex_main}, and \ref{def:variance_main} hold.  Consider Algorithm \ref{alg:alg_main_paper_text_different_seeds} with a correlation coefficient $\rho\in[0, 1]$, $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  (1-{\eta m}/{4} )^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)+30\kappa\sqrt{\eta m d } \cdot \sqrt{(K^2+\kappa)H_{\rho}},\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $H_{\rho}: = { \mathcal{D}^2}+\frac{1}{m}\max_{c\in[N]} T_{c,\rho} +{\frac{\gamma^2}{m^2d}}+{\frac{\sigma^2}{m^2}}$.
\end{theorem}

Such a mechanism leads to a trade-off between the efficacy of federation and accuracy and motivates us to exploit the optimal $\rho$ under the differential-privacy theories \cite{mama15}.

\subsection{Partial device participation}


Full device participation enjoys appealing convergence properties. However, it suffers from the straggler's effect in real-world applications, where the communication is limited by the slowest device. Partial device participation handles this issue by only allowing a small portion of devices in each communication and greatly increases the communication efficiency %fault-tolerant capability 
in a federated network. 

The first device-sampling scheme \text{I} \cite{LS20} selects a total of $S$ devices, where the $c$-th device is selected with a probability $p_c$. The first theoretical justification for convex optimization has been proposed by \cite{lhy+20}. %However, to our best knowledge, the convergence analysis of sampling algorithm was missing in the federated-learning literature. 


\paragraph{(Scheme \text{I}: with replacement).}
Assume $\mathcal{S}_k=\{n_1, n_2, \cdots, n_S\}$, where $n_j\in [N]$ is a random number that takes a value of $c$ with a probability $p_c$ for any $j\in\{1,2,\cdots, S\}$. The synchronization step follows that $\theta_{k}=\frac{1}{S}\sum_{c\in \mathcal{S}_k}\theta_{k}^c$.

Another strategy is to uniformly select $S$ devices without replacement. We follow  \cite{lhy+20} and assume $S$ indices are selected uniformly without replacement. In addition, the convergence also requires an additional assumption on balanced data \cite{lhy+20}. 
\paragraph{(Scheme \text{II}: without replacement).}  Assume $\mathcal{S}_k=\{n_1, n_2, \cdots, n_S\}$, where $n_j\in [N]$ is a random number that takes a value of $c$ with a probability $\frac{1}{N}$ for any $j\in\{1,2,\cdots, S\}$. Assume the data is balanced such that $p_1=\cdots=p_N=\frac{1}{N}$. The synchronization step follows that $\theta_{k}=\frac{N}{S}\sum_{c\in \mathcal{S}_k} p_c\theta_{k}^c=\frac{1}{S}\sum_{c\in \mathcal{S}_k} \theta_{k}^c$.





\begin{algorithm*}[h]\caption{Hybrid federated averaging Langevin dynamics algorithm (hFA-LD) with partial device participation, informal version of Algorithm \ref{alg:alg_main_text_partial}. $\mathcal{S}_k$ is sampled according to a device-sampling rule based on scheme \text{I} or \text{II}.}\label{alg:alg_main_text_partial_main}
\begin{algorithmic}[1]
\State \begin{equation*}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot\xi_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation*}
\State
\begin{equation*}  
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad\quad\text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c\in \mathcal{S}_{k+1}} \frac{1}{S} \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation*} 
\end{algorithmic}
\end{algorithm*}

\begin{theorem}[Informal version of Theorem \ref{theorem_partial}]\label{thm:partial_II}
Assume assumptions \ref{def:smooth_main}, \ref{def:strong_convex_main}, and \ref{def:variance_main} hold. Consider Algorithm \ref{alg:alg_main_text_partial_main} with a hyperparameter $\rho\in[0, 1]$, a fixed %\Zhao{You can say fixed, but cannot say constant.} 
learning rate $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  (1-{\eta m}/{4} )^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)\notag\\
    &\qquad+30\kappa\sqrt{\eta m d } \cdot \sqrt{ H_{\rho}(K^2+\kappa)}+O\bigg(\sqrt{\frac{d}{S}(\rho^2+N(1-\rho^2)) C_S}\bigg),
\end{align*}
where $C_S=1$ for {Scheme I} and $C_S=\frac{N-S}{N-1}$ for {Scheme II}. 
\end{theorem}
% \begin{remark}
% If $S=1$, our Scheme \text{II} matches the result in the Scheme \text{I}. If $S=N$, then our Scheme II recovers the result in the full device setting. If $S= N - o(N)$, then our Scheme II bound is better than scheme I.
% \end{remark}

We observe that partial device participation leads to an extra bias regardless of the scale of $\eta$. To reduce such a bias, we suggest considering highly correlated injected noise, such as $\rho=1$, to reduce the impact of the injected noise. By setting $O(\sqrt{{d}/{S}})\leq {\epsilon}/{3}$ and following a similar learning rate as in Section~\ref{ind_converge}, we can achieve the precision $\epsilon$ within $\Omega( \epsilon^{-2} d \log( {d}/{\epsilon} ))$ iterations given a large number of devices satisfying $S = \Omega( \epsilon^{-2} d )$.


The device-sampling scheme \text{I} provides a viable solution to handle the straggler's effect in full device participation and greatly accelerates the communication efficiency. In addition, scheme \text{I} is rather robust to the data heterogeneity and doesn't require the data to be balanced. In other words, this device-sampling scheme is more preferred if a system is free to activate any devices at any time.

In more practical cases where a system can only operate based on the first $S$ messages for the local updates, the device-sampling scheme \text{II} provides a concrete treatment to tackle this issue. Given balanced data across different clients and uniformly sampled devices, we can achieve a reasonable approximation. If $S=1$, our Scheme \text{II} matches the result in the Scheme \text{I}. If $S=N$, then our Scheme II recovers the result in the full device setting. If $S= N - o(N)$, then our Scheme II bound is better than scheme I.


