\section{Our Algorithm}

Let $N$ denote the number of clients. Let $T$ denote the number of global steps. Let $K$ denote the number of local steps. For each $c \in [N]:=\{1,2,\cdots, N\}$, we use $f^c$ and $\nabla f^c$ to denote the loss function of client $c$ and its gradient, respectively. For the stochastic gradient oracle, we denote by $\nabla \tilde f^c(\cdot)$ the \emph{unbiased} estimate of the exact gradient $\nabla f^c$ of client $c$. In addition, we denote by $p_c$ the weight of the $c$-th client such that $p_c\geq 0$ and $\sum_{c=1}^N p_c=1$. $\xi_k$ is a standard $d$-dimensional Gaussian vector at iteration $k$, which is independent of the client index; this can be achieved by maintaining the same random seed for each client $c\in[N]$.

\begin{algorithm*}[h]\caption{Federated Averaging Langevin dynamics Algorithm (FedAvgLD). Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. A global synchronization is conducted every $K$ steps.}\label{alg:alg_main_text_same_seed}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\xi_k,
\end{equation}
\State
\begin{equation}  
\label{synchronization}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}

Inspired by \cite{lhy+19}, we define two virtual sequences 
\begin{equation}
\label{virtual_seq}
\beta_k=\sum_{c=1}^N p_c \beta_k^c, \qquad \theta_k=\sum_{c=1}^N p_c \theta_k^c,
\end{equation}
which are \emph{both inaccessible when $k \text{ mod } K\neq 0$}. For the gradients, we also define 
\begin{equation}
\label{sum_grad}
\nabla f(\theta_k)=\sum_{c=1}^N p_c \nabla f^c(\theta_k^c), \qquad \nabla\tilde f(\theta_k)=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c).
\end{equation}



In what follows, it is clear that $\E{\nabla \tilde f(\theta_k)}=\sum_{c=1}^N p_c \E{\nabla \tilde f^c(\theta_k^c)}=\sum_{c=1}^N p_c \nabla f^c(\theta_k^c)=\nabla f(\theta_k)$, where the expectation is taken over the randomness of the stochastic gradient oracle. Taking the $p_c$-weighted sum of Eq.\eqref{local_client} over the clients $c=1$ to $N$ and combining Eq.\eqref{virtual_seq} and Eq.\eqref{sum_grad}, we have
\begin{align*}
\label{fed_avg_langevin_dynamics_preliminary}
    \beta_{k+1}&=\theta_k-\eta \nabla \tilde f(\theta_k)+\xi_k.\notag
\end{align*}
Moreover, we always have $\beta_k=\theta_k$ whether $k \text{ mod } K=0$ or not. In what follows, we can write
\begin{equation}
\label{fed_avg_langevin_dynamics}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\xi_k,
\end{equation}
which resembles the SGLD algorithm \cite{Welling11} except that the construction of stochastic gradients is different and $\theta_k$ is \emph{not accessible when $k\text{ mod } K\neq 0$}. To facilitate the analysis, we also define an auxiliary continuous-time process $(\bar\theta_t)_{t\geq 0}$ 
\begin{align}
\label{continuous_dynamics}
\d \bar\theta_t = - \nabla f(\bar\theta_t) \d t + \sqrt{2\tau} \d \overline{W}_t,
\end{align}
where $\bar\theta_t=\sum_{c=1}^N p_c \bar\theta_t^c$, $\nabla f(\bar\theta_t)=\sum_{c=1}^N p_c \nabla f^c(\bar\theta_t^c)$, $\bar\theta_t^c$ is the continuous-time variable at client $c$, and $\overline{W}$ is a $d$-dimensional Brownian motion. The continuous-time algorithm is referred to as Federated Averaging Langevin diffusion and is detailed in Algorithm~\ref{alg:alg_main_continuous_text_same_seed}. Moreover, we assume that $\bar\theta_0^c$ is drawn from the stationary distribution $\pi$ for each client $c\in[N]$ so that $\bar\theta_t^c\sim\pi$ for any $t\in [0, \eta)$. It is clear that the synchronization step \eqref{synchronization_diffusion} does not affect the stationary property, hence $\bar\theta_t, \bar\theta_t^c\sim \pi$ for all $t\geq 0$ and $c\in[N]$.


\begin{algorithm*}[h]\caption{Federated Averaging Langevin diffusion. Denote by $\bar\theta_t^c$ the model parameter in the $c$-th client at time $t$. Denote the Langevin diffusion update from $\bar\theta_t^c$ by $\bar\beta_t^c$. A global synchronization is conducted every $K\eta$ times}\label{alg:alg_main_continuous_text_same_seed}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client_continuous}
    \d \bar\beta_{t}^c =-\nabla f^c(\bar\theta_t^c)\d t+\sqrt{2\tau}\d \overline{W}_t,
\end{equation}
\State
\begin{equation}  
\label{synchronization_diffusion}
\bar\theta_{t}^c=\left\{  
             \begin{array}{lr}  
             \bar\beta_{t}^c \qquad\qquad\qquad \text{if } t \neq \eta K\lfloor\frac{t}{\eta K} \rfloor \\  
              & \\
             \sum_{c=1}^N p_c \bar\beta_{t}^c \ \qquad \text{if } t =\eta K\lfloor\frac{t}{\eta K} \rfloor.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}




\paragraph{Quality of non-i.i.d.\ data} Denote by $\theta_*$ the global minimizer of $f$ and by $\theta^c_*$ the global minimizer of $f^c$ for each client $c\in [N]$. It follows that $\theta_*=\sum_{c=1}^N p_c \theta_*^c$. Next, we quantify the degree of the non-i.i.d.\ data by $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$, which is a non-negative constant and becomes larger when the data are less identically distributed.

\iffalse
\subsection{Our plan}

What are the assumptions do we need ..
\begin{itemize}
    \item shusen wang's paper
\end{itemize}

We need to prove a new version of Lemma 3 in page 12 in \cite{lhy+19}.
\begin{lemma}[Lemma 3 in page 12 in \cite{lhy+19}]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2 
\end{align*}
\end{lemma}

We need to generalize the above lemma to something as follows:
\begin{lemma}[Our version]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2  + \| \mathrm{noise} \|^2
\end{align*} 
\end{lemma}

Using Shusen's assumption, we can show Dala's paper \cite{dk19} page 7 contions are holding.

\fi


% \begin{table}[]
%     \centering
%     \begin{tabular}{|l|l|l|l|l|l|} \hline
%         {\bf Notations} & {\bf Ours} & \cite{dk19} & \cite{lhy+19} & \cite{ccbj18} \\ \hline
%         Function & $f$ & $f$ & $F$ & $f$ \\ \hline
%         Parameter &  & $\theta$ & $w$ & $x$ \\ \hline
%         Dimension & $d$ & $p$ & Never & $d$ \\ \hline
%         Smooth & $L$ & $M$ & $L$ & $L$ \\ \hline
%         Strongly convex & $m$ & $m$ & $\mu$ & $m$ \\ \hline
%         Global & & $K$ & $T$ & \\ \hline
%         Local steps & & $1$ & $K$ & \\ \hline
%         Variance & $\sigma^2 d$ & $\sigma^2 p$ & $\sigma^2$ & $\sigma^2 d$ \\ \hline
%         Learning rate & $\eta$ & $h$ & $\eta$ & $\delta$ \\ \hline
%         Choice of LR & & $h = 1/(m+M)$ & $\eta = 2 / (\mu T) $ &  \\\hline
%         \#Devices & & 1 & $N$ & \\ \hline
%         \#Datas per client & & & $n_k$ & \\ \hline
%     \end{tabular}
%     \caption{Notations to compare different papers. We put this table for easy of writing. There is no need to keep this in the final paper.}
%     \label{tab:my_label}
% \end{table}




\Wei{To do:  independence on K. \textcolor{green}{Done}}

\Wei{To do: remove the variance effect (learning rate) or propose a better rate. \textcolor{green}{Done}}

\Wei{To do: require a continuous version for dominated divergence \textcolor{green}{Done}}

\Wei{To do: think about a name for the paper, FedAvg Langevin Dynamics?}

\Wei{To do: Decay learning rate \textcolor{green}{on hold}}

\Wei{To do: When we decay learning rate, it may be harder to prove the L2 bound. \textcolor{green}{on hold}}

\Wei{To do: independent noise in each local client?}

\Wei{To do: convex case or non-convex case?}

\Wei{To do: sampling schemes for different clients}

\Wei{To do: Connection to optimization?}



\subsection{Main result}

% \subsubsection{Notations}

\subsubsection{Assumptions}

\begin{assumption}[Smoothness]\label{def:smooth} For each $c\in [N]$, we say $f^c$ is $L$-smooth if for some $L>0$
\begin{align*}
\| \nabla f^c(y)-\nabla f^c(x) \|_2 \leq L \| y-x \|_2,\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

Note that the above assumption is equivalent to saying that
\begin{align*}
f^c(y)\leq f^c(x)+\langle \nabla f^c(x),y-x \rangle+\frac{L}{2}\| y-x \|^2_2\quad \forall x, y\in \R^d.
\end{align*}

\begin{assumption}[Strongly convex]\label{def:strong_convex}
For each $c\in [N]$, $f^c$ is $m$-strongly convex if for some $m>0$
\begin{align*}
f^c(x)\geq f^c(y)+\langle \nabla f^c(y),x-y \rangle + \frac{m}{2} \| y-x \|_2^2\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

\begin{assumption}[Bounded variance]\label{def:variance}
For each $c\in [N]$, the variance of noise in the stochastic gradient $\nabla \tilde f^c(x)$ in each client is upper bounded such that %\Zhao{If the following $\sigma$ has $c$, then should we assume $L$ and $m$ also have $c$?} \Wei{I feel we should make $\sigma^2$ uniform}\Zhao{Yes, I plan to change now}
\begin{align*}
\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2] \leq \sigma^2 d,\quad \forall x\in \R^d.
\end{align*}
\end{assumption}

% Wei's first algorithm formulation based on standard SGLD
% \subsection{Formulation}
% Let $\theta_k\in\R^d$ be the $k$-th iterate of the following stochastic gradient Langevin algorithm.
% \begin{align}\label{eq:sgld}
%     \theta_{k+1}=\theta_k -\eta \nabla \widetilde f(\theta_k)+\sqrt{2\tau\eta}\xi_k,
% \end{align}
% where $\eta$ is the learning rate, $\tau$ is the temperature, $\xi_k$ is a standard $d$-dimensional Gaussian vector, and $\nabla \widetilde f(\theta)$ is an unbiased estimate of the exact gradient $\nabla f(\theta)$.

% \subsubsection{Tools from previous work}

% \textbf{Gronwall's inequality} is a standard tool for obtaining estimates of differential equations. Suppose that $a(\cdot)$, $b(\cdot)$, and $\psi(\cdot)$ are continuous real-valued functions that satisfy
% \begin{align*}
%     \frac{\d}{\d t}\psi(t)\leq a(t)\psi(t)+b(t).
% \end{align*}
% Then 
% \begin{align*}
%     \psi(t)\leq \psi(t_0)e^{\int_{t_0}^t a(s)\d s} + \int_{t_0}^t e^{\int_{s}^t a(u)\d u}b(s)\d s.
% \end{align*}

\textbf{Burkholder--Davis--Gundy inequality.} Let $\phi:[0, \infty)\rightarrow \mathbb{R}^{r\times d}$ for some positive integers $r$ and $d$. In addition, we assume $\E{\int_0^{\infty} |\phi(s)|^2 \d s}<\infty$ and let $Z(t)=\int_0^t \phi(s)\d W_s$, where $W_s$ is a $d$-dimensional Brownian motion. Then for all $t\geq 0$, we have

\begin{align*}
    \E{\sup_{0\leq s\leq t} |Z(s)|^2}\leq 4\E{\int_0^t|\phi(s)|^2\d s}.
\end{align*}



\subsubsection{Wasserstein Distance}

% We denote the Borel $\sigma$-algebra 
We define the 2-Wasserstein distance between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
\begin{align*}
    W_2(\mu, \nu):=\inf_{\Gamma\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \Gamma(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
\end{align*}
where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$, where $\mathcal{L}(\cdot)$ denotes a distribution of a random variable.


\Wei{I am trying Dalalyan's User-friendly proof framework to get a better rate.}




\begin{lemma}[Contraction property]
\label{contraction}
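Assume assumptions \ref{def:smooth} and \ref{def:strong_convex} hold. For any learning rate $\eta \in (0, \frac{1}{L+m}]$ and any $\{\theta^c\}_{c=1}^N, \{\bar\theta^c\}_{c=1}^N\subset\mathbb{R}^d$ with $\theta=\sum_{c=1}^N p_c\theta^c$, $\bar\theta=\sum_{c=1}^N p_c\bar\theta^c$, $\nabla f(\theta)=\sum_{c=1}^N p_c\nabla f^c(\theta^c)$, and $\nabla f(\bar\theta)=\sum_{c=1}^N p_c\nabla f^c(\bar\theta^c)$, % simulated from Eq.\eqref{fed_avg_langevin_dynamics} and Eq.\eqref{continuous_dynamics}, respectively, 
we have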
\begin{align*}
    &\quad\lrn{\bar\theta-\theta-\eta(\nabla f(\bar\theta)-\nabla f(\theta))}_2^2\leq \left(1-\frac{\eta m}{2}\right)^2\lrn{\bar\theta-\theta}_2^2+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2^2+\lrn{\theta^c-\theta}_2^2\right).
\end{align*}

\end{lemma}

\begin{lemma}[Discretization error]\label{lem:discretization}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any $s\geq 0$, any learning rate $\eta \in (0 , \frac{2}{m})$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, the iterates of $(\bar \theta_s)$ based on the continuous dynamics of Eq.\eqref{continuous_dynamics} satisfy the following estimate
\begin{align*}
    \E{ \big\| \bar\theta^c_{s} - \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 8\eta^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16\eta d\tau.
\end{align*}
\end{lemma}
Note that the above result only requires a mild upper bound on the learning rate, namely $\eta<2/m$.

\begin{lemma}[Dominated divergence]\label{divergence}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ upper bound of the divergence between local clients and the center as follows
\begin{align*}
    \text{Discrete dynamics: } \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}&\leq 2(K-1)^2\eta^2 dH^2 +4(K-1)\eta d\tau,\notag\\
    \text{Continuous diffusion: } \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&\leq 8(K-1)^2\eta^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16(K-1)\eta d\tau, \notag
\end{align*}
where $H^2=14 d\kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$, $\kappa=L/m$, and  $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$.
\end{lemma}






\begin{lemma}[Bounded variance] 
\label{lem:total_variance}
Given assumption \ref{def:variance}, we have 
\begin{equation*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}\leq d \sigma^2 ,\qquad \forall \ \theta\in\R^d.
\end{equation*}
\end{lemma}

% \begin{lemma}[To be proved] 
% \label{lem:gradient_bound}
% Given a client index $c\in[N]$ and assumption XXX\Wei{will fix later}, we have 
% \begin{equation*}
%     \E{\lrn{\nabla f(\bar\theta_s^c)-\nabla f(\bar\theta^c_{k\eta})}_2^2}\leq C \eta^2
% \end{equation*}
% \end{lemma}



\begin{theorem}[One step update]\label{one_step_Dalalyan}

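Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Suppose the learning rate satisfies $\eta \in (0 , \frac{1}{m+L})$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, where $\theta_*$ is the global minimizer of the function $f$. Let $\mu_k$ denote the probability measure of $\theta_k$ and let $\pi$ denote the stationary distribution of the diffusion in Eq.~\eqref{continuous_dynamics}. Then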
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^2W_2^2(\mu_{k}, \pi)+32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau) + \eta^2\sigma^2 d.\notag
\end{align*}
\end{theorem}
% \Wei{does this $\frac{\eta m}{2}$ make sense? Is there a tradition such that a rate of $\eta m$ is required?}
\begin{proof}


The solution of the continuous-time process in Eq.~\eqref{continuous_dynamics} satisfies
\begin{align}
\label{solution_continuous_dynamics}
    \bar\theta_t=\bar\theta_0 -\int_0^t \nabla f(\bar\theta_s)\d s + \sqrt{2\tau}\,\overline{W}_t, \qquad \forall t\geq 0.
\end{align}


Set $t\rightarrow(k+1)\eta$ and $\bar\theta_0\rightarrow\bar\theta_{k\eta}$ for Eq.\eqref{solution_continuous_dynamics} and consider a synchronous coupling such that $\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta}:=\xi_k$. Then
\begin{align}
\label{continuous_one_step}
    \bar\theta_{(k+1)\eta}&=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\tau} (\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta})\notag\\
    &=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\tau}\,\xi_k.
\end{align}

We first denote $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$. Subtracting Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_one_step} yields that
\begin{align*}
    &\quad \bar\theta_{(k+1)\eta}-\theta_{k+1}\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}+\eta \nabla \tilde f(\theta_k) - \int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla\tilde f(\theta_k)\bigg) - \int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\underbrace{\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla f(\theta_k)}_{:=X_k}\bigg)+\eta\zeta_k - \underbrace{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_{:=Y_k}.\notag\\
\end{align*}

Applying triangle inequality, we have
\begin{align}
\label{almost_final_v2}
    \lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2\leq \lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2 + \lrn{Y_k}_2.
\end{align}


\Wei{should use this one to obtain a sharper result}
\begin{align}
\label{almost_final}
    \lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2&\leq \lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k}_2 + \eta^2\lrn{\zeta_k}_2\notag\\
    &\leq \big(\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2-\lrn{Y_k}_2\big)^2 + \eta^2\lrn{\zeta_k}_2\notag\\
\end{align}

For the first term on the right-hand side of Eq.~\eqref{almost_final_v2}, taking square and expectation, we have
\begin{align}
\label{estimate_of_first_term}
    &\quad\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2^2}\notag\\
    &=\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\E{\lrn{\eta\zeta_k}_2^2}+2\eta\underbrace{\E{\langle\bar\theta_{k\eta}-\theta_{k}-\eta X_k,  \zeta_k\rangle}}_{=0}\notag\\
    &\leq \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\eta^2\sigma^2 d\notag \\
    &\leq \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+\eta^2\sigma^2 d\notag \\
    &\quad\quad +2\eta(m+L)\sum_{c=1}^N p_c\left(\E{\lrn{\bar\theta_{k\eta}^c-\bar\theta_{k\eta}}_2^2}+\E{\lrn{\theta_k^c-\theta_k}_2^2}\right)\notag\\
    &\leq  \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+\eta^2\sigma^2 d+32 \eta^3 d(K-1)^2 LH^2,
\end{align}
where the first inequality follows by Lemma \ref{lem:total_variance}, the second inequality follows by Lemma \ref{contraction}, and the third inequality follows by Lemma \ref{divergence} and $m\leq L$. Since the learning rate $\frac{1}{m+L}\leq \frac{2}{m}$, the requirement of the learning rate is clearly satisfied.

For the second term $Y_k$ in Eq.\eqref{almost_final_v2}, we have the following estimate
\begin{align}
\label{y_estimate}
    \E{\lrn{Y_k}_2^2}&=\E{\lrn{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_2^2}\notag\\
    &\leq\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})}_2^2}  \d s\notag\\
    &=\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\sum_{c=1}^N p_c \bigg(\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})\bigg)}_2^2}  \d s\notag\\
    &\leq \eta\int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c\E{\lrn{\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})}_2^2}  \d s\notag\\
    &\leq \eta L^2 \int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c\E{\lrn{\bar\theta_s^c-\bar\theta^c_{k\eta}}_2^2}  \d s\notag\\
    &\leq \eta L^2  \int_{k\eta}^{(k+1)\eta}  2\eta d(\eta H^2+8\tau) \d s\notag\\
    &=2\eta^3 d L^2(\eta H^2+8\tau),
\end{align}
where the first inequality follows by H\"{o}lder's inequality, the second inequality follows by Jensen's inequality, the third inequality follows by Assumption \ref{def:smooth}, and the last inequality follows by Lemma \ref{lem:discretization}.

Taking square and expectation and plugging the estimates of Eq.\eqref{estimate_of_first_term} and Eq.\eqref{y_estimate} into Eq.\eqref{almost_final_v2}, we have
\begin{align*}
    \E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}^2_2}&\leq  \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+\eta^2\sigma^2 d\notag\\
    &\quad\quad+32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau).
\end{align*}

Choosing the specific Langevin diffusion $\bar\theta$ in the stationary regime and coupling $\bar\theta_{k\eta}$ with $\theta_k$ optimally, we have $W_2^2(\mu_k,\pi)=\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}$ and  $W_2^2(\mu_{k+1},\pi)\leq\E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}$. It follows that
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^2W_2^2(\mu_{k}, \pi)+32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau) + \eta^2\sigma^2 d.\notag\\
\end{align*}

\end{proof}


\begin{theorem}[Fixed learning rate] Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Given a constant learning rate $\eta\in (0, \frac{1}{m+L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{2}\right)^k \bigg(2\sqrt{d}(\mathcal{D} +  \sqrt{1/m} )\bigg)+\sqrt{\eta d}\bigg(\frac{2}{m}\big((K-1)C_1 + C_2\big) + \sigma^2 C_3\bigg),\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $C_1=\sqrt{32 LH^2}$, $C_2=\sqrt{2L^2(\eta H^2+8\tau)}$, and $C_3$ is defined at the end of the proof.
\end{theorem}


\begin{proof}
Recall from Theorem \ref{one_step_Dalalyan} that
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^2W_2^2(\mu_{k}, \pi)+32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau) + \eta^2\sigma^2 d\notag\\
    &\leq  \bigg(\big(1-\underbrace{\frac{\eta m}{2}}_{:=A}\big)W_2(\mu_{k}, \pi)+\underbrace{\sqrt{32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau)}}_{:=C}\bigg)^2+\underbrace{\eta^2\sigma^2 d}_{:=B^2}.\notag
\end{align*}

To obtain a sharper bound on the Wasserstein distance than \cite{Dalalyan17}, a useful ingredient is Lemma 1 of \cite{dk19}, which is stated as follows.
\begin{lemma}
\label{useful_tool}
Let $A, B$ and $C$ be three constants such that $A\in(0, 1)$, $B, C\geq 0$. If the sequence $\{x_k\}_{k\geq 0}$ satisfies the recursion
\begin{align*}
    x_{k+1}^2\leq [(1-A)x_k + C]^2 + B^2
\end{align*}
for any $k\geq 0$, then we have that
\begin{align*}
    x_{k}\leq (1-A)^k x_0 + \frac{C}{A} + \frac{B^2}{C+\sqrt{A} B}.
\end{align*}
\end{lemma}

Applying Lemma \ref{useful_tool} to the above recursion with $x_k=W_2(\mu_k,\pi)$, and noting that $\frac{C}{A}=\frac{2}{m}\sqrt{\eta d}\sqrt{32(K-1)^2 L H^2+2 L^2(\eta H^2+8\tau)}\leq \frac{2}{m}\sqrt{\eta d}\big((K-1)C_1+C_2\big)$ by $\sqrt{a+b}\leq\sqrt{a}+\sqrt{b}$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{2}\right)^k W_2(\mu_{0}, \pi)+\sqrt{\eta d}\left(\frac{2}{m}\big((K-1)C_1 + C_2\big) + \sigma^2 C_3\right),\notag
\end{align*}
where $C_1=\sqrt{32 LH^2}$, $C_2=\sqrt{2L^2(\eta H^2+8\tau)}$, $C_3=\frac{1}{\sqrt{32(K-1)^2 L H^2+2 L^2(\eta H^2+8\tau)}+ \sigma \sqrt{\frac{m}{2}}}\leq \frac{1}{(K-1) C_1}$ for $K>1$.

By Lemma \ref{lem:W2_init_bound} and the initialization condition $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have that
\begin{align*}
W_2(\mu_0, \pi)\leq 2\sqrt{d}(\mathcal{D} +  \sqrt{1/m} ).
\end{align*}
\end{proof}











\subsection{Important lemmas}

\begin{proof}[Proof of Lemma \ref{contraction}] 
% Let $X_k:=\nabla f(\theta_k+\delta_k)-\nabla f(\theta_k)$, $\delta_k:=\bar\theta_{k\eta}-\theta_{k}$ and $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$, where  $\theta_k$ and $\bar\theta_{t}$ are simulated from Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_dynamics}, respectively. Given xxx, we have

Given a client index $c\in[N]$, applying Theorem 2.1.12 \cite{Nesterov04} leads to
\begin{align}
\label{special_inner_product}
    \langle y-x, \nabla f^c(y)-\nabla f^c(x) \rangle\geq \frac{m L}{L+m}\lrn{y-x}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(y)-\nabla f^c(x)}_2^2,\quad \forall x,y\in\mathbb{R}^d.
\end{align}

In what follows, we have
\begin{align}
\label{iteration}
    &\quad\lrn{\bar\theta-\theta-\eta(\nabla f(\bar\theta)-\nabla f(\theta))}_2^2\notag\\
    &=\lrn{\bar\theta-\theta}_2^2 -2\eta \underbrace{\langle \bar\theta-\theta, \nabla f(\bar\theta)-\nabla f(\theta)\rangle}_{\mathcal{I}}+\eta^2 \lrn{\nabla f(\bar\theta)-\nabla f(\theta)}_2^2.
\end{align}

For the second item $\mathcal{I}$ in the right hand side, we have
\begin{align}
\label{target_contraction}
    \mathcal{I}&=\sum_{c=1}^N p_c\bigg\langle \bar\theta-\theta, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle\notag\\
    &=\sum_{c=1}^N p_c\bigg\langle \bar\theta-\bar\theta^c+\bar\theta^c-\theta^c+\theta^c-\theta, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle\notag\\
    &=-\sum_{c=1}^N p_c\left(\bigg\langle \bar\theta^c-\bar\theta, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle+\bigg\langle \theta-\theta^c, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle\right)\notag\\
    &\quad\quad+\sum_{c=1}^N p_c\bigg\langle \bar\theta^c-\theta^c, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle\notag\\
    &\geq -\sum_{c=1}^N p_c\left((m+L)\lrn{\bar\theta^c-\bar\theta}_2^2+(m+L)\lrn{\theta^c-\theta}_2^2+\frac{1}{2(m+L)}\lrn{\nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)}_2^2\right)\notag\\
    &\quad\quad+ \sum_{c=1}^N p_c\left(\frac{m L}{L+m}\lrn{\bar\theta^c-\theta^c}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)}_2^2\right)\notag\\
    &\geq -(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2^2+\lrn{\theta^c-\theta}_2^2\right) + \frac{m L}{L+m}\lrn{\bar\theta-\theta}_2^2 \notag\\
    &\quad\quad+ \frac{1}{2(L+m)} \lrn{\nabla f(\bar\theta)-\nabla f(\theta)}_2^2,
\end{align}
where the first inequality follows by the AM-GM inequality and Eq.\eqref{special_inner_product}, respectively; the last inequality follows by Jensen's inequality such that $\sum_{c=1}^N p_c \lrn{\bar\theta^c-\theta^c}_2^2\geq \lrn{\sum_{c=1}^N p_c \left(\bar\theta^c-\theta^c\right)}_2^2$.

Plugging Eq.\eqref{target_contraction} into Eq.\eqref{iteration}, we have
\begin{align*}
    &\quad\lrn{\bar\theta-\theta-\eta(\nabla f(\bar\theta)-\nabla f(\theta))}_2^2\notag\\
    &\leq \left(1-\frac{2\eta mL}{m+L}\right)\lrn{\bar\theta-\theta}_2^2+\eta\left(\eta-\frac{1}{m+L}\right) \lrn{\nabla f(\bar\theta)-\nabla f(\theta)}_2^2\notag\\
    &\quad\quad+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2^2+\lrn{\theta^c-\theta}_2^2\right)\notag\\
    &\leq \left(1-\frac{\eta m}{2}\right)^2\lrn{\bar\theta-\theta}_2^2+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2^2+\lrn{\theta^c-\theta}_2^2\right),\notag
\end{align*}
where the last inequality follows by $\frac{2L}{m+L}\geq 1$, $1-2a\leq (1-a)^2$ for any $a$, and $\eta\in(0, \frac{1}{m+L}]$.

% Applying $\sqrt{a+b}\leq \sqrt{a}+\sqrt{b}$, we have
% \begin{align*}
%     &\quad\lrn{\bar\theta-\theta-\eta(\nabla f(\bar\theta)-\nabla f(\theta))}_2\leq \left(1-\frac{\eta m}{2}\right)^2\lrn{\bar\theta-\theta}_2^2+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2+\sqr\lrn{\theta^c-\theta}_2^2\right),\notag
% \end{align*}

\end{proof}


\begin{proof}[Proof of Lemma \ref{lem:discretization}]
For any $s\in[0,\infty)$, there exists a unique integer $k \geq 0$, namely $k=\lfloor s/\eta\rfloor$, such that $s\in [k\eta, (k+1)\eta)$. By the continuous dynamics of Eq.~\eqref{continuous_dynamics}, we have
\begin{align*}
    \bar\theta_{s}^c = \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor}-(s-k\eta) \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })+\sqrt{2\tau}\int_{k\eta}^s \d \overline{W}_t,
\end{align*}
which suggests that 
\begin{align*}
    \sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2 \leq (s-k\eta) \big\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2+\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \overline{W}_t}_2.
\end{align*}
We first square the terms on both sides, then apply Young's inequality and take expectations:
\begin{align*}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 2\E{\big\|(s-k\eta)\nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+2\E{\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \overline{W}_t}_2^2}.
\end{align*}
Then, by Burkholder-Davis-Gundy inequality and It\^{o} isometry, we have
\begin{align}
    \label{eq:1st_part}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}&\leq 2\E{ \big\| (s-k\eta)\nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+8\sum_{i=1}^d\E{\int_{k\eta}^s 2\tau \d t} \notag \\
    &\leq 2\eta^2\E{ \big\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+16 \eta d\tau.
\end{align}

By Young's inequality and the smoothness assumption \ref{def:smooth},  we have
\begin{align}\label{eq:2nd_part}
    \E{ \| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \|_2^2}
    = & ~ \E{\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*) +\nabla f^c(\theta_*) \|_2^2} \notag \\
    \leq & ~ 2\E{\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*) \|_2^2} +2{\lrn{\nabla f^c(\theta_*) }_2^2} \notag \\
    \leq & ~ 2L^2 \E{\|\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }-\theta_*\|_2^2}+2\gamma\notag\\
    \leq & ~ 2L^2 \left(\frac{1}{m}\bigg(\frac{\gamma}{m}+2d\tau\bigg)\right)+ 2\gamma\notag\\
    \leq & ~ 4 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg),
\end{align}
where the third inequality follows by Lemma \ref{lem:L2_bound_local_continuous}, and the last inequality follows from $\kappa=L/m\geq 1$. Combining Eq.~\eqref{eq:1st_part} and Eq.~\eqref{eq:2nd_part}, we have
\begin{align*}
\E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}
&\leq 8\eta^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16\eta d\tau.\notag
\end{align*}

\end{proof}



% \Wei{if no decay of learning rate is required, we may polish for a better rate here}
\begin{proof}[Proof of Lemma \ref{divergence}] For any $k \ge 0$, consider $k_0=K\lfloor \frac{k}{K}\rfloor $, the most recent synchronization step, such that $k_0\leq k$ and $\theta_{k_0}^c=\theta_{k_0}$ for any $c\in[N]$. It is clear that  $k-k_0 \leq K-1$ for all $k\geq 0$.

\textbf{Discrete dynamics: } By the iterate Eq.\eqref{fed_avg_langevin_dynamics}, we have
\begin{align*}
\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}-(\theta_k-\theta_{k_0})}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c \E{\sum_{j=k_0}^{k-1} 2(K-1)\eta^2\lrn{\nabla\tilde f^c(\theta_j^c)}_2^2 + 4(K-1)\eta d\tau}\notag\\
&\leq \sum_{c=1}^N p_c \left(\sum_{j=k_0}^{k-1} 2(K-1)\eta^2\E{\lrn{\nabla\tilde f^c(\theta_j^c)}_2^2}+4(K-1)\eta d\tau\right)\notag\\
&\leq 2(K-1)^2\eta^2 dH^2 +4(K-1)\eta d\tau,
\end{align*}
where the first inequality holds by $\E{\lrn{\theta-\E{\theta}}_2^2}\leq \E{\lrn{\theta}_2^2}$ for a stochastic variable $\theta$; the second inequality follows by $(\sum_{i=1}^{K-1} a_i)^2\leq (K-1)\sum_{i=1}^{K-1} a_i^2$; the last inequality follows by Lemma \ref{bounded_gradient_l2}. $H^2=14 d\kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$.

\textbf{Continuous-time diffusion: } For any time $t\geq 0$ at the $k$-th iteration and the closest synchronization time $t_0$ at the $k_0$-th iteration, we have
\begin{align*}
\sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}-(\bar\theta_{t}-\bar\theta_{t_0})}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}}_2^2}\notag\\
&\leq 8(t-t_0)^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16(t-t_0) d\tau \notag\\
&\leq 8(K-1)^2\eta^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16(K-1)\eta d\tau, \notag
\end{align*}
where the second inequality follows by applying Lemma \ref{lem:discretization} with treating the learning rate as $t-t_0$; the third inequality follows since the $k$-th iteration and the $k_0$-th iteration has a time difference at most $(K-1)\eta$.


% By definition, $\bar\theta_{t}^c-\bar\theta_{t_0}=-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}$. Apply Young's inequality, we have
% \begin{align*}
%     \E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}}_2^2}&=2\E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s}_2^2} +2\E{\lrn{\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\notag\\
%     &\leq 2(t-t_0)\E{\int_{t_0}^t \lrn{\nabla f^c(\bar\theta_s^c)}_2^2\d s}
% \end{align*}

% By H\"{o}lder's inequality, we have
% \begin{align*}
% \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&\leq \sum_{c=1}^N p_c { (t-t_0)\int_{t_0}^{t} \E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\d s}\notag\\
% &\leq \sum_{c=1}^N p_c { (t-t_0)\int_{t_0}^{t} \E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\d s}\notag\\
% &\leq 4(K-1)^2\eta^2 H^2,
% \end{align*}

\end{proof}



\begin{proof}[Proof of Lemma \ref{lem:total_variance}] By assumption \ref{def:variance}, we have
\begin{align*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}&=\E{\lrn{\sum_{c=1}^N p_c\bigg(\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)\bigg)}_2^2}\\
    &=\sum_{c=1}^N p_c^2\E{\lrn{\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)}_2^2}\\
    &\leq d \sigma^2 \sum_{c=1}^N p_c^2\leq d\sigma^2 \left(\sum_{c=1}^N p_c\right)^2=d\sigma^2.
\end{align*}

\end{proof}

\subsection{Supporting Lemmas}




\begin{lemma}[Uniform $\ell_2$ upper bound for local clients]
\label{lem:L2_bound_local}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
\sup_k\E{\lrn{\theta_k^c-\theta_*}_2^2}\leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)}_{:=U},\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}


\begin{proof} First, we consider the $k$-th iteration, where $k\in \{1,2,\cdots, K-2, (K-1)_{-}\}$ and $(K-1)_-$ denotes the $(K-1)$-step without synchronization. Following the iterate of Eq.\eqref{local_client} in a local client of $c\in [N]$, we have
	\begin{align}\label{eq:Langevin_L2_1_local}
&\quad\ \E{\lrn{\theta_{k+1}^c-\theta_*}_2^2}\notag\\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + \sqrt{8\eta\tau}\E{ \langle \theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c), \xi_k \rangle } + 2\eta\tau\E{\|\xi_k\|_2^2} \notag \\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + 2\eta d\tau,
	\end{align}	
	where the last equality comes from the independence of $\theta_k^c-\theta_*- \eta\nabla \widetilde f^c(\theta_k^c)$ and $\xi_k$, together with $\E{\xi_k}=0$ and $\E{\|\xi_k\|_2^2}=d$. Note that
\begin{align}\label{eq:ip_1st_local}
%\small
&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla \widetilde f^c(\theta_k^c)\|_2^2} \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2}  \notag\\
& \qquad\qquad + 2 \eta \E{ \langle \theta_k^c-\theta_*-\eta \nabla f^c(\theta_k^c),\nabla f^c(\theta_k^c)-\nabla\widetilde f^c(\theta_k^c) \rangle }  \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2} \notag \\
&\leq \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}  + \eta^2 d\sigma^2, 
\end{align}
where the first step follows from simple algebra, the second step follows from the unbiasedness of the stochastic gradient, and the last step follows from Assumption \ref{def:variance}. For any $q>0$, we can upper bound the first term of Eq.\eqref{eq:ip_1st_local} as follows
\begin{align}\label{eq:ip_2nd_test_theta_star}
	&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}\notag\\
	&=\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*))-\eta\nabla f^c(\theta_*) \|_2^2}\notag\\
	&\leq (1+q)\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)) \|_2^2}+\eta^2 \left(1+\frac{1}{q}\right) \|\nabla f^c(\theta_*)\|_2^2\notag\\
	&\leq (1+q)\underbrace{\left(1-\frac{\eta m}{2}\right)^2}_{\psi^2}\E{\lrn{\theta_k^c-\theta_*}_2^2}+\eta^2 \left(1+\frac{1}{q}\right)\gamma,
\end{align}
where the first inequality follows by the AM-GM inequality;  the second inequality is a special case of Lemma \ref{contraction} based on Assumption \ref{def:strong_convex}, where no local step is involved before the synchronization step. In addition, $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$.

Choose $q=(\frac{1+\psi}{2\psi})^2-1$ so that $(1+q)\psi^2=\frac{(1+\psi)^2}{4}$. Moreover, since $\psi=1-\frac{\eta m}{2}$, we get $\frac{1+\psi}{2}=1-\frac{1}{4}\eta m$. In addition, we have $1+\frac{1}{q}= \frac{1+q}{q}\leq \frac{(1+\psi)^2}{(1-\psi)(1+3\psi)}\leq \frac{2}{\eta m}$.  It follows that
\begin{align}
    \label{nice_inequality}
    \eta^2\left(1+\frac{1}{q}\right)\leq \frac{2\eta}{m}.
\end{align}

Combining Eq.~\eqref{eq:Langevin_L2_1_local}, Eq.~\eqref{eq:ip_1st_local}, Eq.~\eqref{eq:ip_2nd_test_theta_star}, and Eq.~\eqref{nice_inequality}, we have the following iterate
\begin{align*}
	\E{\|\theta_{k+1}^c-\theta_*\|_2^2} 
	\leq & ~ \underbrace{\left(1-\frac{\eta m}{4}\right)^2}_{:=g(\eta)} \E{\|\theta_k^c-\theta_*\|_2^2} + 2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}. \notag
\end{align*}

Note that $\frac{1}{1-g(\eta)}=\frac{1}{\frac{\eta m}{2}(1-\frac{\eta m}{8})}\leq \frac{3}{\eta m}$ given $\eta\in (0, \frac{2}{m})$. Recursively applying the above equation $k$ times, where $k\in \{1,2,\cdots, K-1, K_{-}\}$ and $K_-$ denotes the $K$-step without synchronization, it follows that
\begin{align}\label{recursion_v2}
	\E{\|\theta_k^c-\theta_*\|_2^2} &\le g(\eta)^{k}\| \theta_0^c-\theta_*\|_2^2 + \frac{1- g(\eta)^{k}}{1 - g(\eta)} \cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right)  \\
	&\le \|\theta_0^c-\theta_*\|_2^2 + \frac{3}{\eta m} \cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right) \notag\\
	&\leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{ \gamma}{md}\right)}_{:=U},\notag
\end{align}
where the second inequality holds by $g(\eta)\leq 1$, the last inequality holds because $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and $\eta< \frac{2}{m}$.
In particular, the $K$-th step before synchronization yields that
\begin{align}\label{recursion_v3}
	\E{\|\theta_{K_-}^c-\theta_*\|_2^2} &\le d\mathcal{D}^2 +U.
\end{align}
By contrast, for the $K$-th step after synchronization, applying Jensen's inequality yields
\begin{align}\label{recursion_v4}
	\E{\|\theta_K^c-\theta_*\|_2^2} 
	= & ~\E{\bigg\|\sum_{c=1}^N p_c\theta_{K_-}^c-\theta_*\bigg\|_2^2} \notag \\
	\leq & ~ \sum_{c=1}^N p_c\E{\lrn{\theta_{K_-}^c-\theta_*}_2^2} \notag \\
	\leq & ~ d\mathcal{D}^2 +U.
\end{align}
Now starting from iteration $K$, we adapt the recursion of Eq.\eqref{recursion_v2} for the $k$-th step, where $k\in\{K+1,\cdots, 2K-1, (2K)_{-}\}$ and $(2K)_-$ denotes the $2K$-step without synchronization, we have
\begin{align}\label{recursion_v5}
	\E{\|\theta_k^c-\theta_*\|_2^2} 
	\leq & ~ g(\eta)^{k-K} \cdot  \E{\|\theta_K^c-\theta_*\|_2^2} + \frac{1- g(\eta)^{k-K}}{1 - g(\eta)}\cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right)\notag \\
	\leq &  g(\eta)^{k-K}(d\mathcal{D}^2+U)+\frac{1- g(\eta)^{k-K}}{m\eta/3} \frac{m\eta}{3} U\notag \\
	\leq & d\mathcal{D}^2+ g(\eta)^{k-K} U +  (1- g(\eta)^{k-K}) U \notag\\
	\leq & d\mathcal{D}^2+U,
\end{align}
where the second inequality follows by Eq.\eqref{recursion_v4}, the fact that $1-g(\eta)\geq \eta m/3$ and $\eta\leq \frac{2}{m}$, and the definition of $U$. The third one holds since $g(\eta)\leq 1$.

By repeating Eq.\eqref{recursion_v4} and \eqref{recursion_v5}, we have that for all $k\geq 0$
\begin{align*}
	\E{\|\theta_k^c-\theta_*\|_2^2} \leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)}_{:=U}.\notag
\end{align*}
\end{proof}


\begin{lemma}[Uniform $\ell_2$ upper bound in continuous time]
\label{lem:L2_bound_local_continuous}
Assume assumption \ref{def:strong_convex} holds. We have the $\ell_2$ norm upper bound as follows
\begin{align*}
\sup_{t\geq 0}\E{\lrn{\bar\theta_t^c-\theta_*}_2^2}\leq \frac{1}{m}\left(\frac{\gamma}{m}+2d\tau\right),\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}

\begin{proof}
Let $q(\bar\theta_t^c)=\lrn{\bar\theta_t^c-\theta_*}_2^2$. For any time $t\in [0, K\eta)$ before synchronization, applying It\^{o}'s lemma leads to
\begin{align*}
    \d q(\bar\theta_t^c)&=-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\bar\theta_t^c)\rangle\d t + 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &=-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\bar\theta_t^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)\rangle\d t + 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -2 m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\theta_*)\rangle\d t+ 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -2m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t+m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t+\frac{\lrn{\nabla f^c(\theta_*)}_2^2}{m}\d t+ 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -m q(\bar\theta_t^c)\d t+\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle,\notag
\end{align*}
where the first inequality follows by Assumption \ref{def:strong_convex}; the second inequality follows by the AM-GM inequality; the third inequality follows by the definition that $\gamma=\max_{c} \lrn{\nabla f^c(\theta_*)}_2^2$. 

In other words, we have
\begin{align*}
    \d (e^{mt} q(\bar\theta_t^c))&=me^{mt} q(\bar\theta_t^c)\d t + e^{mt} \d q(\bar\theta_t^c)\notag\\
    &\leq me^{mt} q(\bar\theta_t^c)\d t + e^{mt}\left(-m q(\bar\theta_t^c)\d t+\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\right)\notag\\
    &\leq e^{mt}\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}e^{mt}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle.\notag
\end{align*}

The solution is upper bounded by
\begin{align*}
    e^{mt} q(\bar\theta_t^c)\leq e^{m\cdot 0} q(\bar\theta_0^c)+\int_0^t \left(e^{ms}\left(\frac{\gamma}{m}+ 2d\tau\right) \d s+\sqrt{8\tau}e^{ms}\langle \bar\theta_s^c-\theta_*, \d \overline{W}_s\rangle\right)\notag.
\end{align*}

By the martingale property of It\^{o} integral, taking expectations yields
\begin{align}\label{l2_continuous}
    \E{q(\bar\theta_t^c)}&\leq e^{-mt}\E{q(\bar\theta_0^c)}+ e^{-mt}\left(\frac{\gamma}{m}+ 2d\tau\right) \int_0^t e^{ms} \d s\notag\\
    &=e^{-mt}\E{q(\bar\theta_0^c)}+ \frac{1-e^{-mt}}{m}\bigg(\underbrace{\frac{\gamma}{m}+ 2d\tau}_{:=V}\bigg).
\end{align}

In what follows, it suffices to prove $\E{q(\bar\theta_0^c)}\leq \frac{V}{m}$ so that $\E{q(\bar\theta_t^c)}\leq \frac{V}{m}$ for any $t \in [0, K\eta)$ and any $c\in[N]$. In particular for the synchronization step at time $K\eta$, we have
\begin{align}\label{syn_conti}
    \E{q(\bar\theta_{K\eta}^c)}=\E{\lrn{\bar\theta_{K\eta}-\theta_*}_2^2}=\E{\lrn{\sum_{c=1}^N p_c\bar\theta^c_{K\eta}-\theta_*}_2^2}\leq \sum_{c=1}^N p_c\E{\lrn{\bar\theta^c_{K\eta}-\theta_*}_2^2}\leq \frac{V}{m} \sum_{c=1}^N p_c=\frac{V}{m}.
\end{align}

Applying Eq.\eqref{l2_continuous} and Eq.\eqref{syn_conti} for time step $t\in[K\eta, 2K\eta)$, we have
\begin{align}\label{l2_continuous_v2}
    \E{q(\bar\theta_t^c)}&\leq e^{-m(t-K\eta)}\E{q(\bar\theta_{K\eta}^c)}+ \frac{1-e^{-m(t-K\eta)}}{m}V\leq \frac{V}{m}.
\end{align}
Iteratively applying Eq.\eqref{l2_continuous}, Eq.\eqref{syn_conti}, and  Eq.\eqref{l2_continuous_v2}, we have $\E{q(\bar\theta_t^c)}\leq \frac{V}{m}$ for all $t\geq 0$ given $\E{q(\bar\theta_0^c)}=\E{\lrn{\bar\theta_0^c-\theta_*}_2^2}\leq \frac{V}{m}:=\frac{1}{m}(\frac{\gamma}{m}+ 2d\tau)$. Since $\bar\theta_0^c$ is simulated from the stationary distribution $\pi$, by Lemma 12 \cite{dm+16} and Theorem 17 \cite{ccbj18}, we have $\E{\lrn{\bar\theta_0^c-\theta_*}_2^2}\leq \frac{d\tau}{m}\leq \frac{1}{m}(\frac{\gamma}{m}+2d\tau)$, which completes the proof.


\end{proof}


\begin{lemma}[Bounded gradient in $\ell_2$]\label{bounded_gradient_l2}
Given assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold, for any client $c$ and any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
    \E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}\leq d H^2,
\end{align*}
where $H^2=14 \kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$.
\end{lemma}

\begin{proof}

Decompose the squared $\ell_2$ norm of the stochastic gradient as follows
\begin{align*}
    \E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}&= \E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)+\nabla f^c(\theta_k^c)}_2^2}\notag\\
    &= \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)}_2^2}+2\E{\lrw{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c), \nabla f^c(\theta_k^c)}} \notag\\
    &\leq \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\sigma^2d\notag\\
    &=  \E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)}_2^2}+\sigma^2d\notag\\
    &\leq 2\E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)}_2^2}+2\E{\big\|\nabla f^c(\theta_*)\big\|_2^2}+\sigma^2d\notag\\
    &\leq 2 L^2 \E{\lrn{\theta_k^c-\theta_*}_2^2}+2 \gamma +\sigma^2d\notag\\
    &\leq 2L^2 \bigg(d\mathcal{D}^2 + \frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)\bigg)+2 \gamma+\sigma^2d\notag\\
    &\leq 14 d\kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg):= dH^2,
\end{align*}
where the first inequality follows by assumption \ref{def:variance}; the second inequality follows by Young's inequality; the third inequality follows by assumption  \ref{def:smooth} and the definition that $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$; the fourth inequality follows by Lemma \ref{lem:L2_bound_local}; the last inequality follows by defining 
$\kappa:=\frac{L}{m}\geq 1$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Beginning of dominated divergence %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\begin{lemma}[Initial condition] 
\label{lem:W2_init_bound}
Let $\mu_0$ denote the Dirac delta distribution at $\theta_0$. % and assume $\lrn{\theta_0-\theta_*}_2^2\leq d\mathcal{D}^2$.
Then, we have
\begin{align*}
W_2(\mu_0, \pi)\leq 2(\| \theta_0 - \theta_* \|_2 +  \sqrt{d\tau /m} ). %\sqrt{2d\left(\mathcal{D}^2+\frac{2}{m}\right)}.
\end{align*}
\end{lemma}
\Wei{adapt the main context with addition $\tau$}

\begin{proof}
By \cite{ccbj18}, there exists an optimal coupling between $\mu_0$ and $\pi$ such that
\begin{align*}
    W_2^2(\mu_0, \pi) 
    \leq & ~ \mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta\|_2^2 ]\\
    \leq & ~ 2\mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta_*\|_2^2 ] + 2 \mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2] \\
    = & ~ 2\| \theta_0 - \theta_* \|_2^2 +2\mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2]\\
    \leq & ~ 2\| \theta_0 - \theta_* \|_2^2 + 2d\tau /m,
\end{align*}
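where the second step follows from the triangle inequality together with $(a+b)^2\leq 2a^2+2b^2$, the last step follows from Lemma 12 \cite{dm+16} and the temperature $\tau$ is included to adapt to the time scaling. Taking square roots and using $\sqrt{2a^2+2b^2}\leq \sqrt{2}(a+b)\leq 2(a+b)$ for $a,b\geq 0$ yields the claim.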
\end{proof}




% \newpage
% \paragraph{Path A} If we use a bound based on $W_2^2$, we have
% \begin{align*}
%     W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\eta m\right) W_2^2(\mu_{k}, \pi)+\eta^3 + \eta^2\sigma^2.\notag
% \end{align*}

% Recursive applying it
% \begin{align*}
%     W_2^2(\mu_k, \pi)&\leq  \left(1-\eta m\right)^k W_2^2(\mu_{0}, \pi)+ \frac{1-(1-\eta m)^k}{\eta m}\left(\eta^3 + \eta^2\sigma^2 \right).\notag\\
%     &\leq \left(1-\eta m\right)^k W_2^2(\mu_{0}, \pi)+ \frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right).\notag\\
% \end{align*}
% In other words, we can easily get a good rate
% \begin{align*}
%     W_2(\mu_k, \pi)&\leq \left(1-\eta m\right)^\frac{k}{2} W_2(\mu_{0}, \pi)+ \sqrt{\frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right)}.\notag\\
%     &\leq \left(1-\frac{\eta m}{2}\right)^k W_2(\mu_{0}, \pi)+ \sqrt{\frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right)}.\notag\\
% \end{align*}

% \paragraph{Path B} Similarly, if we use
% \begin{align*}
%     W_2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)W_2 (\mu_{k}, \pi)+\sqrt{\eta^3 +\eta^2\sigma^2}.\notag
% \end{align*}
% Then only by applying a complex trick, i.e.
% \begin{lemma}
% Let $A, B$ and $C$ be three constants such that $A\in(0, 1)$, $B, C\geq 0$. If the sequence $\{x_k\}_{k\geq 0}$ satisfies the recursion as follows
% \begin{align*}
%     x_{k+1}^2\leq [(1-A)x_k + C]^2 + B^2,
% \end{align*}
% where for any $k\geq 0$.  Then, we have that
% \begin{align*}
%     x_{k}\leq (1-A)^k x_0 + \frac{C}{A} + \frac{B^2}{C+\sqrt{A} B}.
% \end{align*}
% \end{lemma}

% , we can have
% \begin{align*}
%     W_2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W_2 (\mu_{0}, \pi)+\sqrt{\eta}(\mathcal{O}(1)+\sigma^2).\notag
% \end{align*}

% \Wei{Question: Dalalyan used Path B, why?}




