\section{Our Algorithm}

Let $N$ denote the number of clients. Let $T$ denote the number of global steps. Let $K$ denote the number of local steps. For each $c \in [N]:=\{1,2,\cdots, N\}$, we use $f^c$ and $\nabla f^c$ to denote the loss function of client $c$ and its gradient, respectively. For the stochastic gradient oracle, we denote by $\nabla \tilde f^c(\cdot)$ the \emph{unbiased} estimate of the exact gradient $\nabla f^c$ of client $c$. In addition, we denote $p_c$ as the weight of the $c$-th client such that $p_c\geq 0$ and $\sum_{c=1}^N p_c=1$. Finally, $\xi_k^c$ denotes an independent standard $d$-dimensional Gaussian vector at iteration $k$ for each client $c\in[N]$.

\begin{algorithm*}[h]\caption{Federated Averaging Langevin dynamics Algorithm (FA-LD). Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. A global synchronization is conducted every $K$ steps.}\label{alg:alg_main_text_independent_noise}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau/p_c}\xi_k^c,
\end{equation}
\State
\begin{equation}  
\label{synchronization}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}

Inspired by \cite{lhy+19}, we define two virtual sequences 
\begin{equation}
\label{virtual_seq}
\beta_k=\sum_{c=1}^N p_c \beta_k^c, \qquad \theta_k=\sum_{c=1}^N p_c \theta_k^c,
\end{equation}
which are \emph{both inaccessible when $k \text{ mod } K\neq 0$}. For the gradients and injected noise, we also define 
\begin{equation}
\label{sum_grad}
\nabla f(\theta_k)=\sum_{c=1}^N p_c \nabla f^c(\theta_k^c), \qquad \nabla\tilde f(\theta_k)=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c), \qquad \xi_k=\sum_{c=1}^N \sqrt{p_c} \xi_k^c.
\end{equation}



In what follows, it is clear that $\E{\nabla \tilde f(\theta)}=\sum_{c=1}^N p_c \E{\nabla \tilde f^c(\theta^c)}=\nabla f(\theta)$ for any $\theta\in\R^d$ and $\sum_{c=1}^N\sqrt{1/p_c}p_c\xi_k^c=\xi_k$. Taking the $p_c$-weighted sum of Eq.\eqref{local_client} over the clients $c=1$ to $N$ and combining Eq.\eqref{virtual_seq} and Eq.\eqref{sum_grad}, we have
\begin{align}
\label{fed_avg_langevin_dynamics_preliminary}
    \beta_{k+1}&=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k.
\end{align}
Moreover, we always have $\beta_k=\theta_k$ whether $k \text{ mod } K=0$ or not by Eq.\eqref{synchronization} and Eq.\eqref{virtual_seq}. Hence, we can write
\begin{equation}
\label{fed_avg_langevin_dynamics}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k,
\end{equation}
which resembles the SGLD algorithm \cite{Welling11} except that the construction of stochastic gradients is different and $\theta_k$ is \emph{not accessible when $k\text{ mod } K\neq 0$}. To facilitate the analysis, we also define an auxiliary continuous-time process $(\bar\theta_t)_{t\geq 0}$ 
\begin{align}
\label{continuous_dynamics}
\d \bar\theta_t = - \nabla f(\bar\theta_t) \d t + \sqrt{2\tau} \d \overline{W}_t,
\end{align}
where $\bar\theta_t=\sum_{c=1}^N p_c \bar\theta_t^c$, $\nabla f(\bar\theta_t)=\sum_{c=1}^N p_c \nabla f^c(\bar\theta_t^c)$, $\bar\theta_t^c$ is the continuous-time variable at client $c$, and $\overline{W}$ is a $d$-dimensional Brownian motion. The continuous-time algorithm is referred to as Federated Averaging Langevin diffusion and is described as
\begin{align*}\label{local_client_continuous}
    \d \bar\beta_{t}^c &=-\nabla f^c(\bar\theta_t^c)\d t+\sqrt{2\tau/p_c}\d \overline{W}_t \notag\\
    \quad\bar\theta_{t}^c&=\sum_{c=1}^N p_c \bar\beta_{t}^c.
\end{align*}
Since the synchronization step is conducted at every time step $t$, the Federated Averaging Langevin diffusion performs the same as the standard Langevin diffusion with the temperature $\tau$ and converges to the stationary distribution $\pi(\theta)\propto e^{-\frac{f(\theta)}{\tau}}$. Assume that $\bar\theta_0$ is drawn from the stationary distribution $\pi$; then it follows that $\bar\theta_t\sim\pi$ for any $t\geq 0$.


% \begin{algorithm*}[h]\caption{Federated Averaging Langevin diffusion. Denote by $\bar\theta_t^c$ the model parameter in the $c$-th client at time $t$. Denote the Langevin diffusion update from $\bar\theta_t^c$ by $\bar\beta_t^c$. The global synchronization is conducted at any time step $t$.}\label{alg:alg_main_continuous_text_same_seed}
% \begin{algorithmic}[1]
% \State \begin{equation}\label{local_client_continuous}
%     \d \bar\beta_{t}^c =\theta_t^c-\nabla f^c(\bar\theta_t^c)\d t+\sqrt{2\tau}\d \overline{W}_t,
% \end{equation}
% \State
% \begin{equation}  
% \label{synchronization_diffusion}
% \bar\theta_{t}^c=\sum_{c=1}^N p_c \bar\beta_{t}^c.
% \end{equation} 
% \end{algorithmic}
% \end{algorithm*}




\iffalse
\subsection{Our plan}

What are the assumptions do we need ..
\begin{itemize}
    \item shusen wang's paper
\end{itemize}

We need to prove a new version of Lemma 3 in page 12 in \cite{lhy+19}.
\begin{lemma}[Lemma 3 in page 12 in \cite{lhy+19}]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2 
\end{align*}
\end{lemma}

We need to generalize the above lemma to something as follows:
\begin{lemma}[Our version]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2  + \| \mathrm{noise} \|^2
\end{align*} 
\end{lemma}

Using Shusen's assumption, we can show Dala's paper \cite{dk19} page 7 contions are holding.

\fi


% \begin{table}[]
%     \centering
%     \begin{tabular}{|l|l|l|l|l|l|} \hline
%         {\bf Notations} & {\bf Ours} & \cite{dk19} & \cite{lhy+19} & \cite{ccbj18} \\ \hline
%         Function & $f$ & $f$ & $F$ & $f$ \\ \hline
%         Parameter &  & $\theta$ & $w$ & $x$ \\ \hline
%         Dimension & $d$ & $p$ & Never & $d$ \\ \hline
%         Smooth & $L$ & $M$ & $L$ & $L$ \\ \hline
%         Strongly convex & $m$ & $m$ & $\mu$ & $m$ \\ \hline
%         Global & & $K$ & $T$ & \\ \hline
%         Local steps & & $1$ & $K$ & \\ \hline
%         Variance & $\sigma^2 d$ & $\sigma^2 p$ & $\sigma^2$ & $\sigma^2 d$ \\ \hline
%         Learning rate & $\eta$ & $h$ & $\eta$ & $\delta$ \\ \hline
%         Choice of LR & & $h = 1/(m+M)$ & $\eta = 2 / (\mu T) $ &  \\\hline
%         \#Devices & & 1 & $N$ & \\ \hline
%         \#Datas per client & & & $n_k$ & \\ \hline
%     \end{tabular}
%     \caption{Notations to compare different papers. We put this table for easy of writing. There is no need to keep this in the final paper.}
%     \label{tab:my_label}
% \end{table}




% \Wei{To do:  independence on K. \textcolor{green}{Done}}

% \Wei{To do: remove the variance effect (learning rate) or propose a better rate. \textcolor{green}{Done}}

% \Wei{To do: require a continuous version for Bounded divergence \textcolor{green}{Done}}

% \Wei{To do: think about a name for the paper, FedAvg Langevin Dynamics?}

% \Wei{To do: Decay learning rate \textcolor{green}{done}}

% \Wei{To do: When we decay learning rate, it may be harder to prove the L2 bound. \textcolor{red}{required}}

% \Wei{To do: independent noise in each local client? \textcolor{green}{Done}}

% \Wei{To do: convex case or non-convex case?}

% \Wei{To do: sampling schemes for different clients \textcolor{green}{done}}

% \Wei{To do: Connection to optimization?}

% \Wei{To do: conditional expectation filtration?}

% \Wei{do we need to unify the words, client or device?}

\subsection{Main result}

% \subsubsection{Notations}

\subsubsection{Assumptions and notations}

\begin{assumption}[Smoothness]\label{def:smooth} For each $c\in [N]$, we say $f^c$ is $L$-smooth if for some $L>0$
\begin{align*}
\| \nabla f^c(y)-\nabla f^c(x) \|_2 \leq L \| y-x \|_2,\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

Note that the above assumption is equivalent to saying that
\begin{align*}
f^c(y)\leq f^c(x)+\langle \nabla f^c(x),y-x \rangle+\frac{L}{2}\| y-x \|^2_2\quad \forall x, y\in \R^d.
\end{align*}

\begin{assumption}[Strongly convex]\label{def:strong_convex}
For each $c\in [N]$, $f^c$ is $m$-strongly convex if for some $m>0$
\begin{align*}
f^c(x)\geq f^c(y)+\langle \nabla f^c(y),x-y \rangle + \frac{m}{2} \| y-x \|_2^2\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

\begin{assumption}[Bounded variance]\label{def:variance}
For each $c\in [N]$, the variance of noise in the stochastic gradient $\nabla \tilde f^c(x)$ in each client is upper bounded such that 
\begin{align*}
\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2] \leq \sigma^2 d,\quad \forall x\in \R^d.
\end{align*}
\end{assumption}


\paragraph{Quality of non-i.i.d data} Denote by $\theta_*$ the global minimum of $f$ and by $\theta^c_*$ the global minimum values of $f^c$ for each client $c\in [N]$. It follows that $\theta_*=\sum_{c=1}^N p_c \theta_*^c$. Next, we quantify the degree of the non-i.i.d data by $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$, which is a non-negative constant and yields a smaller scale if the data is more evenly distributed.

\begin{definition}\label{def:H_kappa_gamma}
We define the parameters $T_{c, \rho}$, $H_{\rho}$, $\kappa$, and $\gamma$ as
\begin{align*}
    T_{c,\rho}: = & ~ \tau(\rho^2+(1-\rho^2)/p_c),\\
    H_{\rho}: = & ~  \underbrace{L \mathcal{D}^2}_{\small{\text{initialization}}}+\underbrace{\kappa \max_{c\in[N]} T_{c,\rho}}_{\small{\text{injected noise}}} +\underbrace{\frac{\kappa}{md}\gamma}_{\small{\text{data heterogeneity}}}+\underbrace{\frac{\kappa}{m}\sigma^2}_{\small{\text{stochastic noise}}},\\
    \kappa := & ~ L / m , \\
    \gamma : = & ~ \max_{c \in [N]} \| \nabla f^c (\theta_*) \|_2^2 .
\end{align*}
\end{definition}


% Wei's first algorithm formulation based on standard SGLD
% \subsection{Formulation}
% Let $\theta_k\in\R^d$ be the $k$-th iterate of the following stochastic gradient Langevin algorithm.
% \begin{align}\label{eq:sgld}
%     \theta_{k+1}=\theta_k -\eta \nabla \widetilde f(\theta_k)+\sqrt{2\tau\eta}\xi_k,
% \end{align}
% where $\eta$ is the learning rate, $\tau$ is the temperature, $\xi_k$ is a standard $d$-dimensional Gaussian vector, and $\nabla \widetilde f(\theta)$ is an unbiased estimate of the exact gradient $\nabla f(\theta)$.

% \subsubsection{Tools from previous work}

% \textbf{Gronwall's inequality} is a standard tool for obtaining estimates of differential equations. Suppose that $a(\cdot)$, $b(\cdot)$, and $\psi(\cdot)$ are continuous real-valued functions that satisfy
% \begin{align*}
%     \frac{\d}{\d t}\psi(t)\leq a(t)\psi(t)+b(t).
% \end{align*}
% Then 
% \begin{align*}
%     \psi(t)\leq \psi(t_0)e^{\int_{t_0}^t a(s)\d s} + \int_{t_0}^t e^{\int_{s}^t a(u)\d u}b(s)\d s.
% \end{align*}




\subsection{Main results}
\paragraph{Wasserstein distance}

% We denote the Borel $\sigma$-algebra 
We define the 2-Wasserstein distance between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
\begin{align*}
    W_2(\mu, \nu):=\inf_{\Gamma\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \Gamma(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
\end{align*}
where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$, where $\mathcal{L}(\cdot)$ denotes a distribution of a random variable.



\begin{lemma}[Contraction property]
\label{contraction}
Assume assumptions \ref{def:smooth} and \ref{def:strong_convex} hold. For any learning rate $\eta \in (0, \frac{1}{L+m}]$, any $\theta, \beta\in\mathbb{R}^d$, % simulated from Eq.\eqref{fed_avg_langevin_dynamics} and Eq.\eqref{continuous_dynamics}, respectively, 
we have
\begin{align*}
\small
    &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2\leq (1-\eta m) \cdot \|\beta-\theta \|_2^2+4\eta L\sum_{c=1}^N p_c \cdot ( \| \beta^c-\beta \|_2^2 + \|\theta^c-\theta \|_2^2 ).
\end{align*}

\end{lemma}

\begin{lemma}[Time error]\label{lem:discretization}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any $s\geq 0$, any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, the iterates of $(\bar \theta_s)$ based on the continuous dynamics of Eq.\eqref{continuous_dynamics} satisfy the following estimate
\begin{align*}
    \E{ \big\| \bar\theta^c_{s} - \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 8\eta^2 d\kappa\bigg(\frac{\kappa\gamma}{d}+L\tau\bigg)+16\eta d\tau.
\end{align*}
\end{lemma}



\begin{lemma}[Bounded divergence]\label{divergence}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ upper bound of the divergence between local clients and the center as follows
\begin{align*}
    \sum_{c=1}^N p_c\E{\|\theta_k^c-\theta_k \|_2^2}&\leq 28(K-1)^2\eta^2 d\kappa m H_{\rho} +4(K-1)\eta d\tau,\notag
\end{align*}
where $H_{\rho}, \kappa$ and $\gamma$ are defined in Definition~\ref{def:H_kappa_gamma}. % $H_{\rho}=14 \kappa^2 \cdot  (m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2 )$, $\kappa=L/m$, and  $\gamma:=\max_{c\in[N]}\| \nabla f^c(\theta_*) \|_2^2$.
\end{lemma}






\begin{lemma}[Bounded variance] 
\label{lem:total_variance}
Given assumption \ref{def:variance}, we have 
\begin{equation*}
    \E{ \|\nabla f(\theta)-\nabla \tilde f(\theta) \|_2^2}\leq d \cdot \sigma^2 ,\qquad \forall \ \theta\in\R^d.
\end{equation*}
\end{lemma}

% \begin{lemma}[To be proved] 
% \label{lem:gradient_bound}
% Given a client index $c\in[N]$ and assumption XXX\Wei{will fix later}, we have 
% \begin{equation*}
%     \E{\lrn{\nabla f(\bar\theta_s^c)-\nabla f(\bar\theta^c_{k\eta})}_2^2}\leq C \eta^2
% \end{equation*}
% \end{lemma}



\begin{lemma}[One step update]\label{one_step_Dalalyan}

Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Consider Algorithm \ref{alg:alg_main_text_independent_noise} with any learning rate $\eta \in (0 , \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, where $\theta_*$ is the global minimum for the function $f$. Then
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2}) \cdot W^2_2(\mu_{k}, \pi)+ 108\eta^2 d \kappa m H_0((K-1)^2+\kappa),
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, and $H_0, \kappa$ and $\gamma$ are defined in Definition~\ref{def:H_kappa_gamma}. 
\end{lemma}
% \Wei{does this $\frac{\eta m}{2}$ make sense? Is there a tradition such that a rate of $\eta m$ is required?}


\begin{proof}[Proof of Lemma \ref{one_step_Dalalyan}]


The solution of the continuous-time process Eq.\eqref{continuous_dynamics} follows that
\begin{align}
\label{solution_continuous_dynamics}
    \bar\theta_t=\bar\theta_0 -\int_0^t \nabla f(\bar\theta_s)\d s + \sqrt{2\tau}\cdot\overline{W}_t, \qquad \forall t\geq 0.
\end{align}


Set $t\rightarrow(k+1)\eta$ and $\bar\theta_0\rightarrow\bar\theta_{k\eta}$ for Eq.\eqref{solution_continuous_dynamics} and consider a synchronous coupling such that $\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta}:=\sqrt{\eta}\xi_k$, which is valid because the Brownian increment over a time interval of length $\eta$ is a centered Gaussian vector whose covariance is $\eta$ times the identity. Then
\begin{align}
\label{continuous_one_step}
    \bar\theta_{(k+1)\eta}&=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\tau} (\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta})\notag\\
    &=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\eta\tau}\xi_k.
\end{align}

We first denote $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$. Subtracting Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_one_step} yields that
\begin{align}
\label{decompose_full}
    &\quad \bar\theta_{(k+1)\eta}-\theta_{k+1}\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}+\eta \nabla \tilde f(\theta_k) - \int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla\tilde f(\theta_k)\bigg) - \int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\underbrace{\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla f(\theta_k)}_{:=X_k}\bigg)- \underbrace{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_{:=Y_k} +\eta\zeta_k.\notag
\end{align}

Taking square and expectation on both sides, we have
\begin{align}
\label{reestimate}
    &\quad\ \E{\|\bar\theta_{(k+1)\eta}-\theta_{k+1} \|_2^2}\notag\\
    &=\E{\| \bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k \|_2^2}+\E{\| \eta\zeta_k \|_2^2}+2\eta\underbrace{\E{\langle\bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k,  \zeta_k\rangle}}_{\E{\zeta_k}=0}\notag\\
    &\leq (1+q) \cdot \E{\| \bar\theta_{k\eta}-\theta_{k}-\eta X_k \|_2^2}+ ( 1 + 1 / q ) \cdot \E{\|Y_k \|_2^2}+\E{ \| \eta\zeta_k \|_2^2}\notag\\
    &\leq (1+q) \cdot \big( (1-\eta m) \cdot \E{\| \bar\theta_{k\eta}-\theta_k \|_2^2}+4\eta L\sum_{c=1}^N p_c \cdot \left(\E { \| \bar\theta_{k\eta}^c-\bar\theta_{k\eta} \|_2^2}+\E{\| \theta_k^c-\theta_k \|_2^2}\right) \big)\notag\\
    &\quad\quad + ( 1 + 1/q ) \cdot \E{ \| Y_k \|_2^2 } + \eta^2\sigma^2 d\notag\\
    &\leq (1+q) \cdot \bigg(\underbrace{\left(1-\eta m\right)}_{\phi}\E{ \| \bar\theta_{k\eta}-\theta_k \|_2^2}+112 \eta^3 d(K-1)^2 L\kappa m H_0+16(K-1)\eta^2 dL\tau\bigg)\notag\\
    &\quad\quad + (1+ 1 / q ) \cdot \E{ \| Y_k \|_2^2}+\eta^2\sigma^2  d,
\end{align}
where the first inequality follows by the AM-GM inequality for any $q>0$, the second inequality follows by Lemma \ref{contraction} and Assumption \ref{def:variance}. The third inequality follows by Lemma \ref{divergence}; moreover, the continuous-time process conducts synchronization at any time step, hence $\bar\theta^c_{k\eta}=\bar\theta_{k\eta}$. Since the learning rate follows $\frac{1}{2L}\leq \frac{1}{m+L}\leq \frac{2}{m}$, the requirement of the learning rate is clearly satisfied.

Recall that $\phi=1-\eta m$, we get $\frac{1+\phi}{2}=1-\frac{1}{2}\eta m$. Choose $q=\frac{1+\phi}{2\phi}-1$ so that $(1+q)\phi=\frac{(1+\phi)}{2}=1-\frac{1}{2}\eta m$. In addition, we have $1+\frac{1}{q}= \frac{1+q}{q}=\frac{1+\phi}{1-\phi}\leq \frac{2}{\eta m}$.  It follows that
\begin{align}
    \label{nice_inequality_v0}
    (1+q) \cdot (1-\eta m)\leq 1-\frac{1}{2}\eta m,  \quad  1+q\leq \frac{1-\frac{1}{2}\eta m}{1-\eta m}\leq 1.5, \quad (1 + 1/q )\leq \frac{2}{m\eta},
\end{align}
where the second inequality holds because $\eta\leq \frac{1}{2L}\leq \frac{1}{2m}$.


% Applying Minkowski's inequality, we have
% \begin{align}
% \label{almost_final}
%     \E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}^{1/2}\leq \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2^2}^{1/2} + \E{\lrn{Y_k}_2^2}^{1/2}.
% \end{align}


% \Wei{should use this one to obtain a sharper result}
% \begin{align}
% \label{almost_final}
%     \E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}&= \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k}_2^2} + \eta^2\E{\lrn{\zeta_k}_2^2}\notag\\
%     &\leq \bigg(\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}^{1/2}+\E{\lrn{Y_k}_2^2}^{1/2}\bigg)^2 + \eta^2\E{\lrn{\zeta_k}_2},
% \end{align}

% For the first term in the previous result, taking square and expectation, we have
% \begin{align}
% \label{estimate_of_first_term}
%     &\quad\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2^2}\notag\\
%     &=\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\E{\lrn{\eta\zeta_k}_2^2}+2\eta\underbrace{\E{\langle\bar\theta_{k\eta}-\theta_{k}-\eta X_k,  \zeta_k\rangle}}_{=0}\notag\\
%     &\leq \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\eta^2\sigma^2 d\notag \\
%     &\leq \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+\eta^2\sigma^2 d\notag \\
%     &\quad\quad +2\eta(m+L)\sum_{c=1}^N p_c\left(\E{\lrn{\bar\theta_{k\eta}^c-\bar\theta_{k\eta}}_2^2}+\E{\lrn{\theta_k^c-\theta_k}_2^2}\right)\notag\\
%     &\leq  \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+12 \eta^3 d(K-1)^2 LH_{\rho}+\eta^2\sigma^2 d+80(K-1)\eta^2 dL\tau,
% \end{align}
% \Wei{cannot go through with order 2; seems to work if we apply $\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2^2}\leq (1-\eta m)\E{\lrn{\bar\theta_{k\eta}-\theta_{k}}_2^2}+\eta^3+\eta^2$, $\E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}\leq (1+\gamma)\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\underbrace{(1+\frac{1}{\gamma}) \E{\lrn{Y_k}_2^2}}_{\text{order becomes worse from 3 to 2}}+\eta^2\sigma^2 d$ based on 1+gamma and 1-gamma trick}
% \begin{align}
% \label{estimate_of_first_term}
%     \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}&= \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta (\nabla f(\bar\theta_{k\eta})-\nabla f(\theta_k))}_2^2}\notag\\
%     &\leq \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}\notag \\
%     &\quad\quad +2\eta(m+L)\sum_{c=1}^N p_c\left(\E{\lrn{\bar\theta_{k\eta}^c-\bar\theta_{k\eta}}_2^2}+\E{\lrn{\theta_k^c-\theta_k}_2^2}\right)\notag\\
%     &\leq  \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+12 \eta^3 d(K-1)^2 LH_{\rho}+80(K-1)\eta^2 dL\tau,
% \end{align}
% where the first inequality follows by Lemma \ref{contraction}, and the second inequality follows by Lemma \ref{divergence} and $m\leq L$. Since the learning rate $\frac{1}{m+L}\leq \frac{2}{m}$, the requirement of the learning rate is clearly satisfied.

For the term $\E{ \| Y_k \|_2^2 }$ in Eq.\eqref{reestimate}, we have the following estimate
\begin{align}
\label{y_estimate}
    \E{ \| Y_k \|_2^2}&=\E{\lrn{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_2^2}\notag\\
    &\leq\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})}_2^2}  \d s\notag\\
    &=\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\sum_{c=1}^N p_c \bigg(\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})\bigg)}_2^2}  \d s\notag\\
    &\leq \eta\int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c \cdot \E{\lrn{\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})}_2^2}  \d s\notag\\
    &\leq \eta L^2 \int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c \cdot \E{\lrn{\bar\theta_s^c-\bar\theta^c_{k\eta}}_2^2}  \d s\notag\\
    &\leq \eta L^2  \int_{k\eta}^{(k+1)\eta}  \left(8\eta^2 d\kappa\bigg(\frac{\kappa\gamma}{d}+L\tau\bigg)+16\eta d\tau\right) \d s\notag\\
    &\leq 8\eta^4 d L^2 \kappa m H_0+16\eta^3 L^2 d\tau,
\end{align}
where the first inequality follows by H\"{o}lder's inequality, the second inequality follows by Jensen's inequality, the third inequality follows by Assumption \ref{def:smooth}, the fourth inequality follows by Lemma \ref{lem:discretization}, and the last inequality follows by $\frac{\kappa\gamma}{d}+L\tau\leq m H_0$.

Plugging Eq.\eqref{nice_inequality_v0} and Eq.\eqref{y_estimate} into Eq.\eqref{reestimate}, we have
\begin{align*}
    \E{\|\bar\theta_{(k+1)\eta}-\theta_{k+1} \|^2_2}&\leq  (1-\frac{\eta m}{2} ) \cdot \E{\|\bar\theta_{k\eta}-\theta_k\|_2^2}+168 \eta^3 d(K-1)^2 L\kappa m H_0+24\eta^2 d(K-1)L\tau\notag\\
    &\quad\quad+16\eta^3 d L^2\kappa H_0+32\eta^2 d\frac{L^2}{m} \tau+\eta^2 \sigma^2 d.
\end{align*}

Choosing the specific Langevin diffusion $\bar\theta$ in the stationary regime and coupling $\bar\theta_{k\eta}$ with $\theta_k$ optimally, we have $W_2^2(\mu_k,\pi)=\E{\|\bar\theta_{k\eta}-\theta_k \|_2^2}$ and  $W_2^2(\mu_{k+1},\pi)\leq\E{\| \bar\theta_{(k+1)\eta}-\theta_{k+1} \|_2^2}$. Arranging the terms, we have
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2}) \cdot W^2_2(\mu_{k}, \pi)+ 108\eta^2 d \kappa m H_0((K-1)^2+\kappa),
\end{align*}
where $\eta\leq \frac{1}{2L}$, $\kappa\geq 1$, $L\tau\leq m H_0$, and $\sigma^2\leq H_0$ are applied to the result.

\end{proof}

\subsection{Full device participation}

\subsubsection{Convergence via independent noise}


\begin{theorem}\label{main_theorem} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Consider Algorithm \ref{alg:alg_main_text_independent_noise} with a constant learning rate $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)+15\sqrt{{\eta} d \kappa} \cdot \sqrt{((K-1)^2+\kappa)H_0} .\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $H_0, \kappa$ and $\gamma$ are defined as Definition~\ref{def:H_kappa_gamma}.
\end{theorem}


\begin{proof}
Iteratively applying Lemma \ref{one_step_Dalalyan} and arranging terms, we have that
\begin{align}\label{one_step_squared}
    W_2^2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+  \frac{2}{\eta m}\bigg(108\eta^2 d \kappa m H_0((K-1)^2+\kappa)\bigg)\notag\\
    &\leq \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+ 216 \eta d\kappa ((K-1)^2+\kappa) H_{0},
\end{align}
where $\kappa=\frac{L}{m}$. By Lemma \ref{lem:W2_init_bound} and the initialization condition $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have that
\begin{align*}
W_2(\mu_0, \pi)\leq \sqrt{2d}(\mathcal{D} +  \sqrt{\tau/m} ).
\end{align*}

Applying the inequality $(1-\frac{\eta m}{2})\leq (1-\frac{\eta m}{4})^2$  completes the proof. 
\end{proof}

\textbf{Discussions}


\textbf{Optimal choice of $K$.} To ensure that the algorithm achieves the $\epsilon$ precision based on the total number of steps $T_{\epsilon}$ and the learning rate $\eta_{\epsilon}$, we can set
\begin{align*}
    &15\sqrt{{\eta_{\epsilon}} d\kappa } \cdot \bigg(\sqrt{((K-1)^2+\kappa)H_0} \bigg)\leq \frac{\epsilon}{2}\notag\\
    &e^{-\frac{\eta_{\epsilon} m}{4} T_{\epsilon}} \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)\leq \frac{\epsilon}{2}.
\end{align*}
This readily leads to
\begin{align*}
    \eta_{\epsilon}\leq \min\bigg\{\frac{1}{2L}, \mathcal{O}\bigg(\frac{\epsilon^2}{d\kappa {((K-1)^2+\kappa)H_0}}\bigg)\bigg\},\quad T_{\epsilon}\geq \Omega\bigg(\frac{\log\big(\frac{d}{\epsilon^2}\big)}{m\eta_{\epsilon}}\bigg).
\end{align*}

Plugging into the definition of $\eta_{\epsilon}$, it implies that to reach the precision level $\epsilon$, it suffices to set
\begin{align}\label{def_T}
    T_{\epsilon}=\Omega\bigg(\frac{d\kappa {((K-1)^2+\kappa)H_0}}{m\epsilon^2}\cdot \log\big(\frac{d}{\epsilon^2}\big)\bigg).
\end{align}
We observe that the number of communication rounds is around the order
\begin{align*}
    \frac{T_{\epsilon}}{K}=\Omega\bigg( K+\frac{\kappa}{K}\bigg),
\end{align*}
where the value of $\frac{T_{\epsilon}}{K}$ first decreases and then increases with respect to $K$, indicating that setting $K$ either too large or too small may lead to high communication costs and hurt the performance. Ideally, $K$ should be selected in the scale of $\Omega(\sqrt{\kappa})$. Combining the definition of $T_{\epsilon}$ in Eq.\eqref{def_T}, this suggests an interesting result that the optimal $K$ should be in the order of $\mathcal{O}(\sqrt{T_{\epsilon}})$. Similar results have been achieved by \cite{Stich19, lhy+19}.


\textbf{Convergence guarantees via varying learning rates}

\begin{theorem}\label{main_theorem_decay} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Consider Algorithm \ref{alg:alg_main_text_independent_noise} with an initialization satisfying $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and varying learning rate following
\begin{align*}
    \eta_{k}=\frac{1}{2L+(1/12)m (k-K_1)_{+}},\qquad k=1,2,\cdots
\end{align*}
where $(k-K_1)_{+}=\max\{0, k-K_1\}$ and $K_1$ is an integer that satisfies
\begin{align*}
    K_1\geq \frac{\log\big(\frac{ W_2(\mu_{0}, \pi)}{\sqrt{d}}\big)+\log(L)}{\log(1+m/(8L))},
\end{align*}
where $C_{\kappa}=15\sqrt{\kappa((K-1)^2+\kappa)H_0}$. Then for any $k\geq K_1$, we have
\begin{align*}
    W_2(\mu_{k}, \pi)\leq 1.5C_{\kappa}\big(d\eta_k \big)^{1/2}.
\end{align*}

\end{theorem}
\begin{proof}
Applying Theorem \ref{main_theorem} based on the constant learning rate $\eta=\frac{1}{2L}\leq \frac{1}{m+L}$ and $k=1,2,\cdots, K_1$, we have 
\begin{align*}
    W_2(\mu_{K_1}, \pi) &\leq  \left(1-\frac{m}{8L}\right)^{K_1} W_2(\mu_{0}, \pi) +\sqrt{\frac{ d}{2L}}C_{\kappa},\notag\\
    &\leq  \left(1+\frac{m}{8L-m}\right)^{-K_1} W_2(\mu_{0}, \pi) +\sqrt{\frac{ d}{2L}}C_{\kappa},\notag\\
    &\leq 1.5 C_{\kappa} \sqrt{\frac{d}{2L}},
\end{align*}
where $K_1\geq \frac{\log\big(\frac{ W_2(\mu_{0}, \pi)}{\sqrt{d}}\big)+\log(L)}{\log(1+m/(8L))}\geq \frac{\log\big(\frac{ W_2(\mu_{0}, \pi)}{\sqrt{d}}\big)+\log(L)-\log(C_{\kappa})}{\log(1+m/(8L-m))}$.

Starting from iteration $K_1$, we use a decreasing learning rate
\begin{align*}
    \eta_{k}=\frac{1}{2L+(1/12)m (k-K_1)}.
\end{align*}

Next, we proceed to show that the following inequality by the induction method
\begin{align}\label{induction}
    W_2(\mu_{k}, \pi)\leq 1.5C_{\kappa}\bigg(\frac{d}{2L+(1/12){m(k-K_1)}}\bigg)^{1/2}=1.5C_{\kappa}\big(d\eta_k \big)^{1/2}, \qquad \forall k \geq K_1,
\end{align}

It is clear that Eq.\eqref{induction} holds for $k=K_1$. Suppose now that Eq.\eqref{induction} holds for some $k\geq K_1$; it then follows by Lemma \ref{one_step_Dalalyan} that
\begin{align*}
    W_2(\mu_{k+1}, \pi)&\leq  \big(1-\frac{\eta_k m}{4}\big) \cdot W_2(\mu_{k}, \pi)+ \frac{\eta_k^{3/2} m}{4}\sqrt{d}C_{\kappa}\notag\\
    &\leq  \big(1-\frac{\eta_k m}{4}\big) \cdot  1.5  C_{\kappa}\big(d \eta_k\big)^{1/2}+ \frac{\eta_k m}{6}1.5C_{\kappa}\sqrt{d \eta_k}\notag\\
    &\leq  \big(1-\frac{\eta_k m}{12}\big) \cdot 1.5   C_{\kappa}\big(d \eta_k\big)^{1/2}.\notag
\end{align*}

To prove $W_2(\mu_{k+1}, \pi)\leq 1.5C_{\kappa}\big(d\eta_{k+1} \big)^{1/2}$, it suffices to show $\big(1-\frac{\eta_k m}{12}\big) \eta_k^{1/2}\leq \eta_{k+1}^{1/2}$. Since $\eta_k=\frac{12}{24L+m(k-K_1)}$ for any $k\geq K_1$, we have
\begin{align*}
    \big(1-\frac{\eta_k m}{12}\big) \eta_k^{1/2}&=\frac{\sqrt{12}(24L-m+m(k-K_1))}{(24L+m(k-K_1))^{3/2}}\notag\\
    &\leq \frac{\sqrt{12}(24L-m+m(k-K_1))^{1/2}}{24L+m(k-K_1)}\notag\\
    &\leq \frac{\sqrt{12}}{(24L+m(k+1-K_1))^{1/2}}=\eta_{k+1}^{1/2},
\end{align*}
where the last inequality follows since 
\begin{align*}
    (24L-m+m(k-K_1))(24L+m(k+1-K_1))\leq (24L+m(k-K_1))^2.
\end{align*}
\end{proof}

The above result implies that we only require $k={\Omega}(\frac{d}{\epsilon^2} \frac{C_{\kappa}^2}{m})$ to achieve the precision $\epsilon$. By contrast, the fixed learning rate requires $T_{\epsilon}=\Omega\bigg(\frac{d}{\epsilon^2}\frac{C_{\kappa}^2}{m}\cdot \log\big( \frac{d}{\epsilon^2}\big)\bigg)$, which is slower than the algorithm with varying learning rate by $\mathcal{O}\big(\log \big(\frac{d}{\epsilon^2}\big)\big)$ times.



\subsubsection{Preserving privacy via correlated noise}

Note that Algorithm \ref{alg:alg_main_text_independent_noise} requires all the local clients to generate the independent noise $\xi^c_k$. Such a mechanism enjoys the convenience and yields a potential to protect the privacy of data and alleviates the security issue. However, the scale of noises is maximized and inevitably slows down the convergence. For extensions, it can be naturally generalized to correlated noise based on a hyperparameter, namely the correlation coefficient $\rho$ between different clients. To this end, we replace Eq.\eqref{local_client} with 
\begin{equation}\label{local_client_diff_seeds}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla f^c(\theta_k^c)+\sqrt{2\tau \rho^2}\dot{\xi}_k + \sqrt{2(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation}
where $\dot{\xi}_k$ is a $d$-dimensional standard Gaussian vector shared by all the clients at iteration $k$, $\xi_k^c$ is a unique $d$-dimensional Gaussian vector generated by client $c\in [N]$ only. Moreover, $\dot\xi_k$ is dependent with $\xi_k^c$ for any $c\in[N]$. Following the same synchronization step based Eq.\eqref{virtual_seq}, Eq.\eqref{sum_grad}, Eq.\eqref{fed_avg_langevin_dynamics_preliminary}, we have
\begin{equation}
\label{fed_avg_langevin_dynamics_pp}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k,
\end{equation}
where $\xi_k=\rho \dot\xi_k + \sqrt{1-\rho^2}\sum_{c=1}^N \sqrt{p_c}\xi_k^c$. Since the variance of independent variables is additive, it is clear that $\xi_k$ follows the standard $d$-dimensional Gaussian distribution. The inclusion of the correlated noise implicitly reduces the temperature and naturally yields a trade-off between federation and accuracy. We refer to the algorithm with correlated noise as the generalized Federated Averaging Langevin dynamics (gFA-LD) and present it in Algorithm \ref{alg:alg_main_text_different_seeds}.

Since the inclusion of correlated noise doesn't affect the formulation of Eq.\eqref{fed_avg_langevin_dynamics_pp}, the properties of the algorithm remain the same except that the scale of the temperature $\tau$ and the degree of federation are changed. Based on a target correlation coefficient $\rho\geq 0$, Eq.\eqref{local_client_diff_seeds} is equivalent to applying a temperature $T_{c,\rho}=\tau(\rho^2+(1-\rho^2)/p_c)$. In particular, setting $\rho=0$ leads to $T_{c, 0}=\tau/p_c$, which exactly recovers Algorithm \ref{alg:alg_main_text_independent_noise}; however, setting $\rho=1$ leads to $T_{c, 1}=\tau$, where the temperature applied to the local clients is reduced by a factor of $1/p_c$. Now we adjust the analysis as follows
\begin{theorem} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Given a constant learning rate $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)+9\sqrt{{\eta} d } \cdot \bigg(\sqrt{(K^2+\kappa)(H_{\rho}+L\tau)} \bigg),\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $H_{\rho}, \kappa$ and $\gamma$ are defined as Definition~\ref{def:H_kappa_gamma}.
\end{theorem}


\begin{algorithm*}[h]\caption{Generalized Federated Averaging Langevin dynamics Algorithm (gFA-LD). Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. A global synchronization is conducted every $K$ steps.}\label{alg:alg_main_text_different_seeds}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client_diff_seeds_v2}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot\xi_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation}
\State
\begin{equation}  
\label{synchronization_diff_seeds}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}

\subsection{Partial device participation}

Full device participation enjoys appealing convergence properties. However, it suffers from the straggler's effect in real-world applications, where the communication is limited by the slowest device. Partial device participation handles this issue by only allowing a small portion of devices in each communication and greatly increases the communication efficiency %fault-tolerant capability 
in a federated network. 

The first device-sampling scheme \text{I} \cite{LS20} selects a total of $S$ devices, where the $c$-th device is selected with a probability $p_c$. The first theoretical justification for convex optimization has been proposed by \cite{lhy+19}. However, to our best knowledge, the convergence analysis of sampling algorithm was missing in the federated-learning literature. 


\paragraph{(Scheme \text{I}: with replacement).}
Assume $\mathcal{S}_k=\{n_1, n_2, \cdots, n_S\}$, where $n_j\in [N]$ is a random number that takes a value of $c$ with a probability $p_c$ for any $j\in\{1,2,\cdots, S\}$. The synchronization step follows that $\theta_{k}=\frac{1}{S}\sum_{c\in \mathcal{S}_k}\theta_{k}^c$.

Another strategy is to uniformly select $S$ devices without replacement. We follow  \cite{lhy+19} and assume $S$ indices are selected uniformly without replacement and the synchronization step is the same as before. In addition, the convergence also requires an additional assumption on balanced data \cite{lhy+19}. 
\paragraph{(Scheme \text{II}: without replacement).}  Assume $\mathcal{S}_k=\{n_1, n_2, \cdots, n_S\}$, where $n_j\in [N]$ is a random number that takes a value of $c$ with a probability $\frac{1}{N}$ for any $j\in\{1,2,\cdots, S\}$ and the indices are drawn without replacement. Assume the data is balanced such that $p_1=\cdots=p_N=\frac{1}{N}$. The synchronization step follows that $\theta_{k}=\frac{N}{S}\sum_{c\in \mathcal{S}_k} p_c\theta_{k}^c=\frac{1}{S}\sum_{c\in \mathcal{S}_k} \theta_{k}^c$.


% \paragraph{Notation: }


\begin{algorithm*}[h]\caption{Federated Averaging Langevin dynamics Algorithm (FA-LD) with partial device participation. A global synchronization is conducted every $K$ steps. $\mathcal{S}_k$ is a subset that contains $S$ indices according to a device-sampling rule based on scheme \text{I} or \text{II}.}\label{alg:alg_main_text_partial}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client_partial}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla f^c(\theta_k^c)+\xi_k,
\end{equation}
\State
\begin{equation}  
\label{synchronization_partial}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad\quad\text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c\in \mathcal{S}_{k+1}} \frac{1}{S} \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}


\begin{lemma}[Unbiased sampling scheme]\label{unbiased_scheme}
For any $k \text{ mod } K=0$ based on scheme \text{I} or \text{II}, we have
\begin{align*}
    \E{\theta_k}=\E{\frac{1}{S}\sum_{c\in \mathcal{S}_k} \theta_k^c}=\beta_k:=\sum_{c=1}^N p_c \beta_k^c.
\end{align*}
\end{lemma}

\begin{proof}

According to the definition of scheme \text{I} or \text{II}, we have $\theta_{k}=\frac{1}{S}\sum_{c\in \mathcal{S}_k} \theta_{k}^c$. In what follows, $\E{\theta_k}=\frac{1}{S}\E{\sum_{c\in \mathcal{S}_k} \theta_{k}^c}=\frac{1}{S}\sum_{c_0\in\mathcal{S}_k}\sum_{c=1}^N p_c \beta_k^c=\sum_{c=1}^N p_c \beta_k^c$, where $p_1=p_2=\cdots=p_N$ for scheme \text{II} in particular.
\end{proof}


\begin{lemma}[Bounded divergence based on partial device]\label{divergence_partial}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the following results

For Scheme \text{I}, the divergence between $\theta_k$ and $\beta_k$ is upper bounded by
\begin{align*}
    \E{\|\beta_k-\theta_k \|_2^2}&\leq \frac{2}{S}K^2\eta^2 dH_{\rho} +\frac{4}{S}K\eta d\tau.\notag
\end{align*}

For Scheme \text{II}, assuming the data is balanced such that $p_1=\cdots=p_N=\frac{1}{N}$, the divergence between $\theta_k$ and $\beta_k$ is upper bounded by
\begin{align*}
    \E{\|\beta_k-\theta_k \|_2^2}&\leq \frac{N(1-\frac{S}{N})}{S(N-1)} \bigg(2K^2\eta^2 dH_{\rho} +4K\eta d\tau\bigg).\notag
\end{align*}
where $H_{\rho}, \kappa$ and $\gamma$ are defined as Definition~\ref{def:H_kappa_gamma}. % $H_{\rho}=14 \kappa^2 \cdot  (m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2 )$, $\kappa=L/m$, and  $\gamma:=\max_{c\in[N]}\| \nabla f^c(\theta_*) \|_2^2$.
\end{lemma}

\begin{proof} We prove the bounded divergence for the two schemes, respectively.

For \textbf{scheme \text{I}} with replacement, $\theta_{k}=\sum_{c\in \mathcal{S}_k} \frac{1}{S} \beta_{k}^c$ for a subset of indices $\mathcal{S}_k$. Taking expectation with respect to $\mathcal{S}_{k}$,
we have
\begin{align}\label{scheme_1}
    \E{\lrn{\theta_{k}-\beta_{k}}_2^2}=\frac{1}{S^2}\sum_{i=1}^S\E{\lrn{\beta_{k}^{n_i}-\beta_{k}}_2^2}=\frac{1}{S}\sum_{c=1}^N p_c \lrn{\beta_{k}^c-\beta_{k}}_2^2,
\end{align}
where the first equality follows by the independence and unbiasedness of $\beta_{k}^{n_i}$ for any $i\in [S]$. 

To further upper bound Eq.\eqref{scheme_1}, we follow the same technique as in Lemma \ref{divergence}. Since $k\text{ mod } K=0$, $k_0=k-K$ is also a communication time, which yields the same $\theta_{k_0}^{c}$ for any $c\in[N]$. In what follows,
\begin{align}\label{scheme_1_step2}
    \sum_{c=1}^N p_c\lrn{\beta_{k}^c-\beta_{k}}_2^2&=\sum_{c=1}^N p_c \lrn{\beta_k^c-\theta_{k_0}-(\beta_k-\theta_{k_0})}_2^2\notag\\
&\leq \sum_{c=1}^N p_c \lrn{\beta_k^c-\theta_{k_0}}_2^2,
\end{align}
where the last inequality follows by $\beta_{k}=\sum_{c=1}^N p_c \beta_{k}^c$ and $\E{\lrn{x-\E{x}}_2^2}\leq \E{\lrn{x}_2^2}$. Combining Eq.\eqref{scheme_1} and Eq.\eqref{scheme_1_step2}, we have
\begin{align*}
    \E{\lrn{\theta_{k}-\beta_{k}}_2^2}&\leq \frac{1}{S}\sum_{c=1}^N p_c \lrn{\beta_k^c-\theta_{k_0}}_2^2\notag\\
    &\leq \frac{1}{S}\sum_{c=1}^N p_c \lrn{\beta_k^c-\theta^c_{k_0}}_2^2\notag\\
    &\leq \frac{1}{S}\sum_{c=1}^N p_c \E{\sum_{k=k_0}^{k-1} 2K\eta^2\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2 + 4K\eta d\tau}\notag\\
&\leq \frac{1}{S}\sum_{c=1}^N p_c \left(\sum_{k=k_0}^{k-1} 2K\eta^2\E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}+4K\eta d\tau\right)\notag\\
&\leq \frac{2}{S}K^2\eta^2 dH_{\rho} +\frac{4}{S}K\eta d\tau,
\end{align*}
where the last inequality follows a similar argument as in Lemma \ref{divergence}.

For \textbf{scheme \text{II}}, given $p_1=p_2=\cdots=p_N=\frac{1}{N}$, we have $\theta_{k}=\frac{1}{S}\sum_{c\in \mathcal{S}_k} \beta_{k}^{c}$, which leads to
\begin{align*}
    &\quad\E{\lrn{\theta_{k}-\beta_{k}}_2^2}=\E{\lrn{\frac{1}{S}\sum_{c\in \mathcal{S}_k} \beta_{k}^{c}-\beta_{k}}_2^2}=\frac{1}{S^2}\E{\lrn{\sum_{c=1}^N \mathbb{I}_{c\in \mathcal{S}_k}(\beta_{k}^c-\beta_{k})}_2^2},
\end{align*}
where $\mathbb{I}_{A}$ is an indicator function that equals to 1 if the event $A$ happens.

Plugging the facts that $\mathbb{P}(c\in \mathcal{S}_{k})=\frac{S}{N}$ and $\mathbb{P}(c_1,c_2\in \mathcal{S}_{k})=\frac{S(S-1)}{N(N-1)}$ for any $c_1\neq c_2\in [N]$ into the above equation, we have
\begin{align*}
    &\quad\E{\lrn{\theta_{k}-\beta_{k}}_2^2}\notag\\
    &=\frac{1}{S^2}\bigg[\sum_{c\in [N]} \mathbb{P}(c\in \mathcal{S}_{k}) \lrn{\beta_{k}^c-\beta_{k}}_2^2+\sum_{c_1\neq c_2} \mathbb{P}(c_1,c_2\in \mathcal{S}_{k})\langle \beta_{k}^{c_1}-\beta_{k}, \beta_{k}^{c_2}-\beta_{k} \rangle \bigg]\notag\\
    &=\frac{1}{SN}\sum_{c=1}^N\lrn{\beta_{k}^c-\beta_{k}}_2^2+\sum_{c_1\neq c_2}\frac{S-1}{SN(N-1)} \langle \beta_{k}^{c_1}-\beta_{k}, \beta_{k}^{c_2}-\beta_{k} \rangle\notag\\
    &=\frac{1-\frac{S}{N}}{S(N-1)}\sum_{c=1}^N\lrn{\beta_{k}^c-\beta_{k}}_2^2,
\end{align*}
where the last equality holds since $\sum_{c\in[N]}\lrn{\beta_{k}^c-\beta_{k}}_2^2 +\sum_{c_1\neq c_2}\langle \beta_{k}^{c_1}-\beta_{k},\beta_{k}^{c_2}-\beta_{k}\rangle=\lrn{\sum_{c\in[N]}(\beta_{k}^c-\beta_{k})}_2^2=0$ given $\beta_k=\frac{1}{N}\sum_{c=1}^N \beta_k^c$.

Eventually, we have
\begin{align*}
    \E{\lrn{\theta_{k}-\beta_{k}}_2^2}&=\frac{N(1-\frac{S}{N})}{S(N-1)} \E{\frac{1}{N} \sum_{c=1}^N\lrn{\beta_{k}^c-\beta_{k}}_2^2}\notag\\
    &\leq\frac{N(1-\frac{S}{N})}{S(N-1)} \E{\frac{1}{N} \sum_{c=1}^N\lrn{\beta_{k}^c-\theta_{k_0}}_2^2}\notag\\
    &\leq \frac{N(1-\frac{S}{N})}{S(N-1)} \bigg(2K^2\eta^2 dH_{\rho} +4K\eta d\tau\bigg),
\end{align*}
where the first inequality follows a similar argument as in Eq.\eqref{scheme_1_step2} and the last inequality follows by Lemma \ref{divergence}.


\end{proof}

\begin{theorem} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Given a constant learning rate $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)\notag\\
    &\qquad+9\sqrt{{\eta} d } \cdot \bigg(\sqrt{(K^2+\kappa)(H_0+L\tau)}+\frac{C_KC_1}{m} \bigg)+\frac{C_K d}{Sm} C_2,
\end{align*}
where $C_K=\frac{\eta m K}{1-e^{-\frac{\eta m K}{2}}}$, $C_1=2K H_{\rho}$ and $C_2=4\tau$ for \textbf{Scheme I} and $C_1=2\frac{N(1-\frac{S}{N})}{(N-1)}KH_{\rho}$ and $C_2=4\frac{N(1-\frac{S}{N})}{(N-1)}\tau$ for \textbf{Scheme II}.
\end{theorem}

\begin{proof}

Note that 
\begin{align*}
&\quad\ \E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}\notag\\
&= \E{\lrn{\bar\theta_{(k+1)\eta}-\beta_{k+1}+\beta_{k+1}-\theta_{k+1}}_2^2}\notag\\
&= \E{\lrn{\bar\theta_{(k+1)\eta}-\beta_{k+1}}_2^2}+ \E{\lrn{\beta_{k+1}-\theta_{k+1}}_2^2}+\E{2\langle \bar\theta_{(k+1)\eta}-\beta_{k+1}, \beta_{k+1}-\theta_{k+1} \rangle}\notag\\
&= \E{\lrn{\bar\theta_{(k+1)\eta}-\beta_{k+1}}_2^2}+ \E{\lrn{\beta_{k+1}-\theta_{k+1}}_2^2},\notag
\end{align*}
where the last equality follows by the unbiasedness of the device-sampling scheme in Lemma \ref{unbiased_scheme}.


If $k+1 \text{ mod } K\neq 0$, we always have $\beta_{k+1}=\theta_{k+1}$ and $\E{\lrn{\beta_{k+1}-\theta_{k+1}}_2^2}=0$. Following the same argument as in Lemma \ref{one_step_Dalalyan}, both schemes lead to the one-step iterate as follows
\begin{align}\label{non_period}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2}) \cdot W^2_2(\mu_{k}, \pi)+  33 \eta^2 d (K^2+\kappa) \big(H_0+L\tau\big).
\end{align}


If $k+1 \text{ mod } K= 0$, combining Lemma \ref{divergence_partial} and Lemma \ref{one_step_Dalalyan}, we have
\begin{align}\label{period}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2}) \cdot W^2_2(\mu_{k}, \pi)+ 33 \eta^2 d (K^2+\kappa) \big(H_0+L\tau\big) + \frac{Kd\eta}{S} (C_1 \eta + C_2),
\end{align}
where $C_1=2K H_{\rho}$ and $C_2=4\tau$ for \textbf{Scheme I} and $C_1=2\frac{N(1-\frac{S}{N})}{(N-1)}KH_{\rho}$ and $C_2=4\frac{N(1-\frac{S}{N})}{(N-1)}\tau$ for \textbf{Scheme II}.


Repeatedly applying Eq.\eqref{non_period} and Eq.\eqref{period} and arranging terms, we have that
\begin{align*}
    W_2^2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+  \frac{2}{\eta m}\bigg(33 \eta^2 d (K^2+\kappa) \big(H_0+L\tau\big)\bigg)\notag\\
    &\qquad+ \frac{1-(1-\frac{\eta m}{2})^{K\lfloor k/K\rfloor}}{1-(1-\frac{\eta m}{2})^K}\left(  \frac{Kd\eta}{S} (C_1 \eta + C_2)   \right)\notag\\
    &\leq \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+ 66 \eta d (K^2+\kappa) \big(H_0/m+\kappa\tau\big)\notag\\
    &\quad\quad+\underbrace{\frac{\eta m K}{1-e^{-\frac{\eta m K}{2}}}}_{C_K} \frac{Kd\eta}{\eta mK S} (C_1 \eta + C_2),\notag\\
    &= \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+ 66 \eta d (K^2+\kappa) \big(H_0/m+\kappa\tau\big)\notag\\
    &\quad\quad+ \frac{C_KC_1}{Sm} \eta d + \frac{C_K d}{Sm} C_2,\notag
\end{align*}
where the second inequality follows by $(1-r)^K\leq e^{-rK}$ for any $r\geq 0$.





\end{proof}


% \subsection{Connection with optimization}

% \begin{theorem}[Connection with optimization]

% TBD. Follow \cite{Dalalyan17}

% \end{theorem}


% \begin{theorem}[Different seeds?]

% Seems to require a larger temperature in local, which is equivalent to setting a large temperature $\tau N$ and leads to a bad error, but still doable considering the contributions in privacy. 

% Are there any resuls that analyze the average of correlated Gaussian?

% % reference: http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/SumOfCorrelatedRVs.pdf

% \end{theorem}


% \begin{theorem}[Non-convex?]

% May not have enough time.

% \end{theorem}









\subsection{Important lemmas}

\begin{proof}[Proof of Lemma \ref{contraction}] 
% Let $X_k:=\nabla f(\theta_k+\delta_k)-\nabla f(\theta_k)$, $\delta_k:=\bar\theta_{k\eta}-\theta_{k}$ and $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$, where  $\theta_k$ and $\bar\theta_{t}$ are simulated from Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_dynamics}, respectively. Given xxx, we have

Given a client index $c\in[N]$, applying Theorem 2.1.12 \cite{Nesterov04} leads to
\begin{align}
\label{special_inner_product}
    \langle y-x, \nabla f^c(y)-\nabla f^c(x) \rangle\geq \frac{m L}{L+m}\lrn{y-x}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(y)-\nabla f^c(x)}_2^2,\quad \forall x,y\in\mathbb{R}^d.
\end{align}

In what follows, we have
\begin{align}
\label{iteration}
    &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2\notag\\
    &=\lrn{\beta-\theta}_2^2 -2\eta \underbrace{\langle \beta-\theta, \nabla f(\beta)-\nabla f(\theta)\rangle}_{\mathcal{I}}+\eta^2 \lrn{\nabla f(\beta)-\nabla f(\theta)}_2^2.
\end{align}

For the second item $\mathcal{I}$ in the right hand side, we have
\begin{align}
\label{target_contraction}
    \mathcal{I}&=\sum_{c=1}^N p_c\big\langle \beta-\theta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &=\sum_{c=1}^N p_c\big\langle \beta-\beta^c+\beta^c-\theta^c+\theta^c-\theta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &=-\sum_{c=1}^N p_c\left(\big\langle \beta^c-\beta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle+\big\langle \theta-\theta^c, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\right)\notag\\
    &\quad\quad+\sum_{c=1}^N p_c\big\langle \beta^c-\theta^c, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &\geq -\sum_{c=1}^N p_c \cdot \big((m+L)\lrn{\beta^c-\beta}_2^2+(m+L)\lrn{\theta^c-\theta}_2^2+\frac{1}{2(m+L)}\lrn{\nabla f^c(\beta^c)-\nabla f^c(\theta^c)}_2^2\big)\notag\\
    &\quad\quad+ \sum_{c=1}^N p_c \cdot \big(\frac{m L}{L+m}\lrn{\beta^c-\theta^c}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(\beta^c)-\nabla f^c(\theta^c)}_2^2 \big)\notag\\
    &\geq -(m+L)\sum_{c=1}^N p_c\left(\lrn{\beta^c-\beta}_2^2+\lrn{\theta^c-\theta}_2^2\right) + \frac{m L}{L+m}\lrn{\beta-\theta}_2^2 \notag\\
    &\quad\quad+ \frac{1}{2(L+m)} \lrn{\nabla f(\beta)-\nabla f(\theta)}_2^2,
\end{align}
where the first inequality follows by the AM-GM inequality and Eq.\eqref{special_inner_product}, respectively; the last inequality follows by Jensen's inequality such that $\sum_{c=1}^N p_c \| \beta^c-\theta^c \|_2^2\geq \|\sum_{c=1}^N p_c  (\beta^c-\theta^c ) \|_2^2$.

Plugging Eq.\eqref{target_contraction} into Eq.\eqref{iteration}, we have
\begin{align*}
    &\quad\lrn{\beta-\theta-\eta \cdot (\nabla f(\beta)-\nabla f(\theta))}_2^2\notag\\
    &\leq \big(1-\frac{2\eta mL}{m+L}\big) \cdot \| \beta-\theta \|_2^2+\eta\big(\underbrace{\eta-\frac{1}{m+L}}_{\leq 0 \text{ if } \eta\leq \frac{1}{m+L}}\big) \cdot \| \nabla f(\beta)-\nabla f(\theta) \|_2^2\notag\\
    &\quad\quad+2\eta(m+L)\sum_{c=1}^N p_c \cdot (\| \beta^c-\beta \|_2^2+\| \theta^c-\theta \|_2^2 )\notag\\
    &\leq \left(1-\eta m\right) \|\beta-\theta \|_2^2+4\eta L\sum_{c=1}^N p_c \cdot \big(\| \beta^c-\beta \|_2^2+\| \theta^c-\theta \|_2^2\big),\notag
\end{align*}
where the last inequality follows by $\frac{2L}{m+L}\geq 1$, $m\leq L$, $1-2a\leq (1-a)^2$ for any $a$, and $\eta\in(0, \frac{1}{m+L}]$.

% Applying $\sqrt{a+b}\leq \sqrt{a}+\sqrt{b}$, we have
% \begin{align*}
%     &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2\leq \left(1-\frac{\eta m}{2}\right)^2\lrn{\beta-\theta}_2^2+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\beta^c-\beta}_2+\sqr\lrn{\theta^c-\theta}_2^2\right),\notag
% \end{align*}

\end{proof}


\begin{proof}[Proof of Lemma \ref{lem:discretization}]
For any $s\in[0,\infty)$, there exists a certain $k \in \mathbb{N}^+$ such that $s\in [k\eta, (k+1)\eta)$. By the continuous dynamics of Eq.~\eqref{continuous_dynamics}, we have
\begin{align*}
    \bar\theta_{s}^c = \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor}+(s-k\eta) \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })+\sqrt{2\tau}\int_{k\eta}^s \d \overline{W}_t,
\end{align*}
which suggests that 
\begin{align*}
    \sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2 \leq (s-k\eta) \big\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2+\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \overline{W}_t}_2.
\end{align*}
We first square both sides and then apply Young's inequality and take the expectation
\begin{align*}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 2\E{\big\|(s-k\eta)\nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+2\E{\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \overline{W}_t}_2^2}.
\end{align*}
Then, by Burkholder-Davis-Gundy inequality (\ref{BDG-inequality}) and It\^{o} isometry, we have
\begin{align}
    \label{eq:1st_part}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}&\leq 2\E{ \big\| (s-k\eta)\nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+8\sum_{i=1}^d\E{\int_{k\eta}^s 2\tau \d t} \notag \\
    &\leq 2\eta^2\E{ \big\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+16 \eta d\tau.
\end{align}

By Young's inequality and the smoothness assumption \ref{def:smooth},  we have
\begin{align}\label{eq:2nd_part}
    \E{ \| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \|_2^2}
    = & ~ \E{\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*) +\nabla f^c(\theta_*) \|_2^2} \notag \\
    \leq & ~ 2\E{\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*) \|_2^2} +2{\lrn{\nabla f^c(\theta_*) }_2^2} \notag \\
    \leq & ~ 2L^2 \E{\|\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }-\theta_*\|_2^2}+2\gamma\notag\\
    \leq & ~ 2L^2 \left(\frac{1}{m}\bigg(\frac{\gamma}{m}+2d\tau\bigg)\right)+ 2\gamma\notag\\
    \leq & ~ 4 d\kappa\bigg(\frac{\kappa \gamma}{d}+L\tau\bigg),
\end{align}
where the third inequality follows by Lemma \ref{lem:L2_bound_local_continuous}, the fourth step holds since $\kappa \geq 1$. Combining Eq.~\eqref{eq:1st_part} and Eq.~\eqref{eq:2nd_part}, we have
\begin{align*}
\E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}
&\leq 8\eta^2 d\kappa\bigg(\frac{\kappa\gamma}{d}+L\tau\bigg)+16\eta d\tau.\notag
\end{align*}

\end{proof}



% \Wei{if no decay of learning rate is required, we may polish for a better rate here}
\begin{proof}[Proof of Lemma \ref{divergence}] For any $k \ge 0$, consider $k_0=K\lfloor \frac{k}{K}\rfloor $ such that $k_0\leq k$ and $\theta_{k_0}^c=\theta_{k_0}$ for any $c\in[N]$. It is clear that  $k-k_0 \leq K-1$ for all $k\geq 0$.

By the iterate Eq.\eqref{fed_avg_langevin_dynamics}, we have
\begin{align*}
\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}-(\theta_k-\theta_{k_0})}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c \E{\sum_{k=k_0}^{k-1} 2(K-1)\eta^2\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2 + 4(K-1)\eta d\tau}\notag\\
&\leq \sum_{c=1}^N p_c \left(\sum_{k=k_0}^{k-1} 2(K-1)\eta^2\E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}+4(K-1)\eta d\tau\right)\notag\\
&\leq 28(K-1)^2\eta^2 d\kappa m H_{\rho} +4(K-1)\eta d\tau,
\end{align*}
where the first inequality holds by $\E{\| \theta-\E{\theta} \|_2^2}\leq \E{\|\theta \|_2^2}$ for a stochastic variable $\theta$; the second inequality follows by $(\sum_{i=1}^{K-1} a_i)^2\leq (K-1)\sum_{i=1}^{K-1} a_i^2$; the last inequality follows by Lemma \ref{bounded_gradient_l2}. $H_{\rho}$ is defined in Definition \ref{def:H_kappa_gamma}.

% \textbf{Continuous-time diffusion: } For any time $t\geq 0$ at the $k$-th iteration and the closest synchronization time $t_0$ at the $k_0$-th iteration, we have
% \begin{align*}
% \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}-(\bar\theta_{t}-\bar\theta_{t_0})}_2^2}\notag\\
% &\leq \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}}_2^2}\notag\\
% &\leq 8(t-t_0)^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16(t-t_0) d\tau \notag\\
% &\leq 8(K-1)^2\eta^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16(K-1)\eta d\tau\notag\\
% &\leq (K-1)^2\eta^2 d H_{\rho}+16(K-1)\eta d\tau, \notag
% \end{align*}
% where the second inequality follows by applying Lemma \ref{lem:discretization} by treating the learning rate as $t-t_0$; the third inequality follows since the $k$-th iteration and the $k_0$-th iteration has a time difference at most $(K-1)\eta$; the last inequality holds since $H_{\rho}=14 \kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$.


% By definition, $\bar\theta_{t}^c-\bar\theta_{t_0}=-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}$. Apply Young's inequality, we have
% \begin{align*}
%     \E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}}_2^2}&=2\E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s}_2^2} +2\E{\lrn{\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\notag\\
%     &\leq 2(t-t_0)\E{\int_{t_0}^t \lrn{\nabla f^c(\bar\theta_s^c)}_2^2\d s}
% \end{align*}

% By H\"{o}lder's inequality, we have
% \begin{align*}
% \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&\leq \sum_{c=1}^N p_c { (t-t_0)\int_{t_0}^{t} \E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\d s}\notag\\
% &\leq \sum_{c=1}^N p_c { (t-t_0)\int_{t_0}^{t} \E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\d s}\notag\\
% &\leq 4(K-1)^2\eta^2 H_{\rho},
% \end{align*}

\end{proof}



\begin{proof}[Proof of Lemma \ref{lem:total_variance}] By assumption \ref{def:variance}, we have
\begin{align*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}&=\E{\lrn{\sum_{c=1}^N p_c\bigg(\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)\bigg)}_2^2}\\
    &=\sum_{c=1}^N p_c^2\E{\lrn{\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)}_2^2}\\
    &\leq d \sigma^2 \sum_{c=1}^N p_c^2\leq d\sigma^2 \left(\sum_{c=1}^N p_c\right)^2=d\sigma^2.
\end{align*}

\end{proof}




\subsection{Supporting Lemmas}




\begin{lemma}[Uniform $\ell_2$ upper bound for local clients]
\label{lem:L2_bound_local}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. We consider the generalized formulation in Algorithm \ref{alg:alg_main_text_different_seeds} with the temperature
$$T_{c,\rho}=\tau(\rho^2+(1-\rho^2)/p_c)$$ given a correlation coefficient $\rho$. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
\sup_k\E{\lrn{\theta_k^c-\theta_*}_2^2}\leq d\mathcal{D}^2 + {\frac{6d}{m}\bigg(\max_{c\in[N]} T_{c, \rho}+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\bigg)},\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}


\begin{proof} First, we consider the $k$-th iteration, where $k\in \{1,2,\cdots, K-2, (K-1)_{-}\}$ and $(K-1)_-$ denotes the $(K-1)$-step before synchronization. Following the iterate of Eq.\eqref{local_client} in a local client of $c\in [N]$, we have
	\begin{align}\label{eq:Langevin_L2_1_local}
&\quad\ \E{\lrn{\theta_{k+1}^c-\theta_*}_2^2}\notag\\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + \sqrt{8\eta T_{c,\rho}}\E{ \langle \theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c), \xi_k \rangle } + 2\eta T_{c,\rho}\E{\|\xi_k\|_2^2} \notag \\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + 2\eta d T_{c,\rho},
	\end{align}	
	where the last equality follows from $\E{\xi_k}=0$ and the conditional independence of $\theta_k^c-\theta_*- \eta\nabla\widetilde f^c(\theta_k^c)$ and $\xi_k$. Note that
\begin{align}\label{eq:ip_1st_local}
%\small
&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla\widetilde f^c(\theta_k^c)\|_2^2} \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2}  \notag\\
& \qquad\qquad + 2 \eta \E{ \langle \theta_k^c-\theta_*-\eta \nabla f^c(\theta_k^c),\nabla f^c(\theta_k^c)-\nabla\widetilde f^c(\theta_k^c) \rangle }  \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2} \notag \\
&\leq \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}  + \eta^2 d\sigma^2, 
\end{align}
where the first step follows from simple algebra, the second step follows from the unbiasedness of the stochastic gradient, and the last step follows from Assumption \ref{def:variance}. For any $q>0$, we can upper bound the first term of Eq.\eqref{eq:ip_1st_local} as follows
\begin{align}\label{eq:ip_2nd_test_theta_star}
	&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}\notag\\
	&=\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*))-\eta\nabla f^c(\theta_*) \|_2^2}\notag\\
	&\leq (1+q)\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)) \|_2^2}+\eta^2 \left(1+\frac{1}{q}\right) \|\nabla f^c(\theta_*)\|_2^2\notag\\
	&\leq (1+q)\underbrace{\left(1-\frac{\eta m}{2}\right)^2}_{\psi^2}\E{\lrn{\theta_k^c-\theta_*}_2^2}+\eta^2 \left(1+\frac{1}{q}\right)\gamma,
\end{align}
where the first inequality follows by the AM-GM inequality;  the second inequality is a special case of Lemma \ref{contraction} based on Assumption \ref{def:strong_convex}, where no local steps are involved before the synchronization step. Similar results have been achieved in Theorem 3 \cite{Dalalyan17}. In addition, $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$.

Choose $q=(\frac{1+\psi}{2\psi})^2-1$ so that $(1+q)\psi^2=\frac{(1+\psi)^2}{4}$. Moreover, since $\psi=1-\frac{\eta m}{2}$, we get $\frac{1+\psi}{2}=1-\frac{1}{4}\eta m$. In addition, we have $1+\frac{1}{q}= \frac{1+q}{q}= \frac{(1+\psi)^2}{(1-\psi)(1+3\psi)}\leq \frac{2}{\eta m}$.  It follows that
\begin{align}
    \label{nice_inequality}
    \eta^2\left(1+\frac{1}{q}\right)\leq \frac{2\eta}{m}.
\end{align}

Combining Eq.~\eqref{eq:Langevin_L2_1_local}, Eq.~\eqref{eq:ip_1st_local}, Eq.~\eqref{eq:ip_2nd_test_theta_star}, and Eq.~\eqref{nice_inequality}, we have the following iterate
\begin{align*}
	\E{\|\theta_{k+1}^c-\theta_*\|_2^2} 
	\leq & ~ \underbrace{\left(1-\frac{\eta m}{4}\right)^2}_{:=g(\eta)} \E{\|\theta_k^c-\theta_*\|_2^2} + 2\eta d T_{c,\rho} +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}. \notag
\end{align*}

Note that $\frac{1}{1-g(\eta)}=\frac{1}{\frac{\eta m}{2}(1-\frac{\eta m}{8})}\leq \frac{3}{\eta m}$ given $\eta\in (0, \frac{2}{m})$. Recursively applying the above equation $k$ times, where $k\in \{1,2,\cdots, K-1, K_{-}\}$ and $K_-$ denotes the $K$-step without synchronization, it follows that
\begin{align}\label{recursion_v2}
	\E{\|\theta_k^c-\theta_*\|_2^2} &\le g(\eta)^{k}\| \theta_0^c-\theta_*\|_2^2 + \frac{1- g(\eta)^{k}}{1 - g(\eta)} \cdot \left(2\eta d T_{c,\rho} +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right)  \\
	&\le \|\theta_0^c-\theta_*\|_2^2 + \frac{3}{\eta m} \cdot \left(2\eta d T_{c,\rho} +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right) \notag\\
	&\leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\bigg(\max_{c\in[N]}T_{c,\rho}+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\bigg)}_{:=U}\notag
\end{align}
where the second inequality holds by $g(\eta)\leq 1$, the third inequality holds because $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and $\eta< \frac{2}{m}$.
In particular, the $K$-th step before synchronization yields that
\begin{align}\label{recursion_v3}
	\E{\|\theta_{K_-}^c-\theta_*\|_2^2} &\le d\mathcal{D}^2 +U.
\end{align}
Having all the results ready, for the $K$-th local step after synchronization, applying Jensen's inequality yields
\begin{align}\label{recursion_v4}
	\E{\|\theta_K^c-\theta_*\|_2^2} 
	= & ~\E{\bigg\|\sum_{c=1}^N p_c\theta_{K_-}^c-\theta_*\bigg\|_2^2} \notag \\
	\leq & ~ \sum_{c=1}^N p_c\E{\lrn{\theta_{K_-}^c-\theta_*}_2^2} \notag \\
	\leq &~ d\mathcal{D}^2 + U.
 \end{align}
Now starting from iteration $K$, we adapt the recursion of Eq.\eqref{recursion_v2} for the $k$-th step, where $k\in\{K+1,\cdots, 2K-1, (2K)_{-}\}$ and $(2K)_-$ denotes the $2K$-step without synchronization, we have
\begin{align}\label{recursion_v5}
	&\E{\|\theta_k^c-\theta_*\|_2^2} \notag\\
	\leq & ~ g(\eta)^{k-K} \cdot  \E{\|\theta_K^c-\theta_*\|_2^2} + \frac{1- g(\eta)^{k-K}}{1 - g(\eta)}\cdot \left(2\eta d \tau\big(\rho^2+\max_{c\in[N]}\frac{1-\rho^2}{p_c}\big) +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right)\notag \\
	\leq &  g(\eta)^{k-K}(d\mathcal{D}^2+U)+\frac{1- g(\eta)^{k-K}}{m\eta/3} \frac{m\eta}{3} U\notag \\
	\leq & d\mathcal{D}^2+ g(\eta)^{k-K} U +  (1- g(\eta)^{k-K}) U \notag\\
	\leq & d\mathcal{D}^2+U,
\end{align}
where the second inequality follows by Eq.\eqref{recursion_v4}, the fact that $1-g(\eta)\geq \eta m/3$ and $\eta\leq \frac{2}{m}$, and the definition of $U$. The third one holds since $g(\eta)\leq 1$.

By repeating the arguments of Eq.~\eqref{recursion_v4} and Eq.~\eqref{recursion_v5} over the subsequent synchronization rounds, we obtain the desired uniform upper bound for all $k\geq 0$.
\end{proof}




\begin{lemma}[Uniform $\ell_2$ upper bound in continuous time]
\label{lem:L2_bound_local_continuous}
Suppose Assumption~\ref{def:strong_convex} holds. Then we have the following $\ell_2$ norm upper bound: %\Zhao{The following quantity doesn't have $k$, not sure $\sup_k$ make sense}\Wei{Nice catch, thanks!}
\begin{align*}
\sup_t\E{\lrn{\bar\theta_t^c-\theta_*}_2^2}\leq \frac{1}{m}\bigg(\frac{\gamma}{m}+2d\tau\bigg),\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}

\begin{proof} Since the synchronization is conducted at every time $t$, the effective temperature applied to each client is $\tau$.  Let $q(\bar\theta_t^c)=\lrn{\bar\theta_t^c-\theta_*}_2^2$. For any time $t\geq 0$, applying It\^{o}'s lemma leads to
\begin{align*}
    \d q(\bar\theta_t^c)&=-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\bar\theta_t^c)\rangle\d t + 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &=-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\bar\theta_t^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)\rangle\d t + 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -2 m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\theta_*)\rangle\d t+ 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -2m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t+m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t+\frac{\lrn{\nabla f^c(\theta_*)}_2^2}{m}\d t+ 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -m q(\bar\theta_t^c)\d t+\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle,\notag
\end{align*}
where the first inequality follows by Assumption \ref{def:strong_convex}; the second inequality follows by the AM-GM inequality; the third inequality follows by the definition that $\gamma=\max_{c \in [N]} \lrn{\nabla f^c(\theta_*)}_2^2$. 

In other words, we have
\begin{align*}
    \d (e^{mt} q(\bar\theta_t^c))&=me^{mt} q(\bar\theta_t^c)\d t + e^{mt} \d q(\bar\theta_t^c)\notag\\
    &\leq me^{mt} q(\bar\theta_t^c)\d t + e^{mt}\left(-m q(\bar\theta_t^c)\d t+\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\right)\notag\\
    &\leq e^{mt}\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}e^{mt}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle.\notag
\end{align*}

The solution is upper bounded by
\begin{align*}
    e^{mt} q(\bar\theta_t^c)\leq e^{m\cdot 0} q(\bar\theta_0^c)+\int_0^t \left(e^{ms}\left(\frac{\gamma}{m}+ 2d\tau\right) \d s+\sqrt{8\tau}e^{ms}\langle \bar\theta_s^c-\theta_*, \d \overline{W}_s\rangle\right)\notag.
\end{align*}

By the martingale property of It\^{o} integral, taking expectations yields
\begin{align}\label{l2_continuous}
    \E{q(\bar\theta_t^c)}
    \leq & ~ e^{-mt}\E{q(\bar\theta_0^c)}+ e^{-mt}\left(\frac{\gamma}{m}+ 2d\tau\right) \int_0^t e^{ms} \d s\notag\\
    = & ~ e^{-mt}\E{q(\bar\theta_0^c)}+ \frac{1-e^{-mt}}{m}\big({\frac{\gamma}{m}+ 2d\tau}\big)\notag\\
    \leq & ~ e^{-mt}\E{q(\bar\theta_0^c)}+ \frac{1-e^{-mt}}{m}\big(\underbrace{\frac{\gamma}{m}+ 2d\tau}_{:=V}\big),
\end{align}
where the last inequality follows since the synchronization is conducted at any time step $t$. Since $\bar\theta_0^c$ is simulated from the stationary distribution $\pi$, by Lemma 12 \cite{dm+16} or Theorem 17 \cite{ccbj18}, we have
\begin{align*}
\E{q(\bar\theta_0^c)}=\E{ \| \bar\theta_0^c-\theta_* \|_2^2}\leq \frac{d\tau}{m}\leq \frac{1}{m}(\frac{\gamma}{m}+2d\tau)=\frac{V}{m},
\end{align*}
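Plugging this bound into Eq.~\eqref{l2_continuous} yields $\E{q(\bar\theta_t^c)}\leq e^{-mt}\frac{V}{m}+(1-e^{-mt})\frac{V}{m}=\frac{V}{m}$ for any $t\geq 0$, which completes the proof.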


\end{proof}


\begin{lemma}[Bounded gradient in $\ell_2$]\label{bounded_gradient_l2}
Given assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold, for any client $c$ and any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
    \E{ \|\nabla\tilde f^c(\theta_k^c) \|_2^2 }\leq 14d\kappa m H_{\rho},
\end{align*}
where $H_{\rho}=  L \mathcal{D}^2+\kappa \max_{c\in[N]} T_{c,\rho} +\frac{\kappa}{md}\gamma+\frac{\kappa}{m}\sigma^2$.
\end{lemma}

\begin{proof}

Decompose the squared $\ell_2$ norm of the stochastic gradient as follows
\begin{align*}
    \E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}&= \E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)+\nabla f^c(\theta_k^c)}_2^2}\notag\\
    &= \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)}_2^2}\notag\\
    &\qquad+2\E{\lrw{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c), \nabla f^c(\theta_k^c)}} \notag \\
    &\leq \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\sigma^2d \notag \\
    &=  \E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)}_2^2}+\sigma^2d \notag \\
    &\leq 2\E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)}_2^2}+2\E{\big\|\nabla f^c(\theta_*)\big\|_2^2}+\sigma^2d\notag\\
    &\leq 2 L^2 \E{\lrn{\theta_k^c-\theta_*}_2^2}+2 \gamma +\sigma^2d\notag\\
    &\leq 2d L^2 \mathcal{D}^2 + \frac{12d L^2}{m} \cdot \bigg(\max_{c\in[N]}T_{c,\rho}+\frac{ \sigma^2}{m} + \frac{\gamma }{md} \bigg)+2\gamma+\sigma^2 d \notag \\
    % &\leq 2d \kappa mL d\mathcal{D}^2 +12d\kappa L \cdot \bigg(\max_{c\in[N]}T_{c,\rho}+\frac{ \sigma^2}{m} + \frac{\gamma }{md} \bigg) +\frac{2\gamma}{d}+\sigma^2 \notag \\
    &\leq 14 d\kappa m \cdot \bigg( L \mathcal{D}^2+\kappa \max_{c\in[N]} T_{c,\rho} +\frac{\kappa}{md}\gamma+\frac{\kappa}{m}\sigma^2 \bigg):= 14d\kappa m H_{\rho},
\end{align*}
where the first inequality follows by the unbiasedness of $\nabla\tilde f^c$ (so that the cross term vanishes) and Assumption \ref{def:variance}; the second inequality follows by Young's inequality; the third inequality follows by Assumption  \ref{def:smooth} and the definition that $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$; the fourth inequality follows by Lemma \ref{lem:L2_bound_local}; the last inequality follows by defining 
$\kappa:=\frac{L}{m}\geq 1$ and absorbing the remaining lower-order terms using $\kappa\geq 1$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Beginning of Bounded divergence %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\begin{lemma}[Initial condition] 
\label{lem:W2_init_bound}
Let $\mu_0$ denote the Dirac delta distribution at $\theta_0$. % and assume $\lrn{\theta_0-\theta_*}_2^2\leq d\mathcal{D}^2$.
Then, we have
\begin{align*}
W_2(\mu_0, \pi)\leq \sqrt{2}(\| \theta_0 - \theta_* \|_2 +  \sqrt{d\tau /m} ). %\sqrt{2d\left(\mathcal{D}^2+\frac{2}{m}\right)}.
\end{align*}
\end{lemma}

\begin{proof}
By \cite{ccbj18}, there exists an optimal coupling between $\mu_0$ and $\pi$ such that
\begin{align*}
    W_2^2(\mu_0, \pi) 
    \leq & ~ \mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta\|_2^2 ]\\
    \leq & ~ 2\mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta_*\|_2^2 ] + 2 \mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2] \\
    = & ~ 2\| \theta_0 - \theta_* \|_2^2 +2\mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2]\\
    \leq & ~ 2\| \theta_0 - \theta_* \|_2^2 + 2d\tau /m,
\end{align*}
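where the second step follows from the inequality $\|a+b\|_2^2\leq 2\|a\|_2^2+2\|b\|_2^2$, the last step follows from Lemma 12 \cite{dm+16} and the temperature $\tau$ is included to adapt to the time scaling. Taking the square root on both sides and using $\sqrt{a^2+b^2}\leq a+b$ for $a, b\geq 0$ completes the proof.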
\end{proof}

\textbf{Burkholder-Davis-Gundy inequality} Let $\phi:[0, \infty)\rightarrow \mathbb{R}^{r\times d}$ for some positive integers $r$ and $d$. In addition, we assume $\E{\int_0^{\infty} |\phi(s)|^2 \d s}<\infty$ and let $Z(t)=\int_0^t \phi(s)\d W_s$, where $W_s$ is a $d$-dimensional Brownian motion. Then for all $t\geq 0$, we have

\begin{align}\label{BDG-inequality}
    \E{\sup_{0\leq s\leq t} |Z(s)|^2}\leq 4\E{\int_0^t|\phi(s)|^2\d s}.
\end{align}

% \section{A simulation example}

% Do a 2D simulation, compare the empirical with the ground truth, plot W2 distance based on \cite{GS84}


% \begin{figure*}[!ht]
%   \centering
%   \vskip -0.1in
%   \subfigure[xx]{\includegraphics[scale=0.2
%   ]{figures/truth.pdf}}\label{fig: 3a}\quad\quad
%   \hspace{-0.5cm}
%   \subfigure[b]{\includegraphics[scale=0.2]{figures/Rplot_chains_5_K_10_1e6.pdf}}\label{fig: 3b}
%   \vspace{-0.5em}
%   \subfigure[b]{\includegraphics[scale=0.2]{figures/Rplot_chains_5_K_10_1e6_different_seeds.pdf}}\label{fig: 3c}
%   \vspace{-0.5em}
%   \caption{xxx}
%   \label{simulation}
%   \vspace{-0.15in}
% \end{figure*}




% \newpage
% \paragraph{Path A} If we use a bound based on $W_2^2$, we have
% \begin{align*}
%     W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\eta m\right) W_2^2(\mu_{k}, \pi)+\eta^3 + \eta^2\sigma^2.\notag
% \end{align*}

% Recursive applying it
% \begin{align*}
%     W_2^2(\mu_k, \pi)&\leq  \left(1-\eta m\right)^k W_2^2(\mu_{0}, \pi)+ \frac{1-(1-\eta m)^k}{\eta m}\left(\eta^3 + \eta^2\sigma^2 \right).\notag\\
%     &\leq \left(1-\eta m\right)^k W_2^2(\mu_{0}, \pi)+ \frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right).\notag\\
% \end{align*}
% In other words, we can easily get a good rate
% \begin{align*}
%     W_2(\mu_k, \pi)&\leq \left(1-\eta m\right)^\frac{k}{2} W_2(\mu_{0}, \pi)+ \sqrt{\frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right)}.\notag\\
%     &\leq \left(1-\frac{\eta m}{2}\right)^k W_2(\mu_{0}, \pi)+ \sqrt{\frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right)}.\notag\\
% \end{align*}

% \paragraph{Path B} Similarly, if we use
% \begin{align*}
%     W_2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)W_2 (\mu_{k}, \pi)+\sqrt{\eta^3 +\eta^2\sigma^2}.\notag
% \end{align*}
% Then only by applying a complex trick, i.e.
% \begin{lemma}
% Let $A, B$ and $C$ be three constants such that $A\in(0, 1)$, $B, C\geq 0$. If the sequence $\{x_k\}_{k\geq 0}$ satisfies the recursion as follows
% \begin{align*}
%     x_{k+1}^2\leq [(1-A)x_k + C]^2 + B^2,
% \end{align*}
% where for any $k\geq 0$.  Then, we have that
% \begin{align*}
%     x_{k}\leq (1-A)^k x_0 + \frac{C}{A} + \frac{B^2}{C+\sqrt{A} B}.
% \end{align*}
% \end{lemma}

% , we can have
% \begin{align*}
%     W_2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W_2 (\mu_{0}, \pi)+\sqrt{\eta}(\mathcal{O}(1)+\sigma^2).\notag
% \end{align*}

% \Wei{Question: Dalalyan used Path B, why?}




