\section{Our Algorithm}

Let $N$ denote the number of clients. Let $T$ denote the number of global steps. Let $K$ denote the number of local steps. Let $\eta_k$ denote the learning rate at iteration $k$ and $\tau>0$ the temperature. For each $c \in [N]:=\{1,2,\cdots, N\}$, we use $f^c$ and $\nabla f^c$ to denote the loss function of client $c$ and its gradient, respectively. For the stochastic gradient oracle, we denote by $\nabla \tilde f^c(\cdot)$ the \emph{unbiased} estimate of the exact gradient $\nabla f^c$ of client $c$. In addition, we denote by $p_c$ the weight of the $c$-th client such that $p_c\geq 0$ and $\sum_{c=1}^N p_c=1$. Finally, $\xi_k$ is a standard $d$-dimensional Gaussian vector at iteration $k$, which is shared by all clients; this can be achieved by maintaining the same random seed for each client $c\in[N]$.
% Zhao's first algorithm formulation
% \begin{algorithm*}[h]\caption{Training via Federated Averaging Algorithm }\label{alg:alg_main_text}
% \begin{algorithmic}[1]
% \State $u_r(0) \sim \N(0,I_d)$ for $r\in [m]$. \Comment{$u \in \R^{d \times m}$}
% \For{$t = 1, \ldots, T$} \Comment{$T$ denotes the number of global steps}
%     \For{$c = 1, \ldots, N$} \Comment{$N$ denote the total number of clients}
%         \State $w_{0,c}(t) \leftarrow u(t)$ \Comment{$w_{0,c}(t),u(t) \in \R^{d \times m}$}
%         \For{$k=1,\ldots,K$} \Comment{$K$ denotes the number of local steps}
%             \State $w_{k,c}(t) \leftarrow w_{k-1,c}(t) - \eta_{\mathrm{local}} \cdot \frac{\partial L_c}{\partial w} |_{w = w_{k-1,c}(t) }$ +{\color{red}noise}
%         \EndFor
%         \State
%         $\Delta u_c \leftarrow w_{k,c}(t) - u(t)$
%     \EndFor
%     \State $\Delta u \leftarrow \frac{1}{N} \sum_{c \in [N]} \Delta u_c$ \Comment{$\Delta u \in \R^{d \times m}$}
%     \State $u(t+1) \leftarrow u(t) + \eta_{\mathrm{global}} \Delta u$ + {\color{red}noise} \Comment{$u(t+1)\in \R^{d \times m}$, add noise here, the novelty is less}
% \EndFor
% \end{algorithmic}
% \end{algorithm*}


\begin{algorithm*}[h]\caption{Federated Averaging Langevin dynamics Algorithm (FedAvgLD). Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. A global synchronization is conducted every $K$ steps.}\label{alg:alg_main_text_same_seed}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client}
    \beta_{k+1}^c=\theta_k^c-\eta_k\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta_k\tau}\,\xi_k,
\end{equation}
\State
\begin{equation*}  
\label{undampedsgld}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } (k+1) \bmod K\neq 0 \\
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{if } (k+1) \bmod K=0.
             \end{array}  
\right.  
\end{equation*} 
\end{algorithmic}
\end{algorithm*}

Inspired by \cite{lhy+19}, we define two virtual sequences 
\begin{equation}
\label{virtual_seq}
\beta_k=\sum_{c=1}^N p_c \beta_k^c, \qquad \theta_k=\sum_{c=1}^N p_c \theta_k^c,
\end{equation}
which are \emph{both inaccessible when $k \text{ mod } K\neq 0$}. We also define 
\begin{equation}
\label{sum_grad}
\nabla f(\theta_k)=\sum_{c=1}^N p_c \nabla f^c(\theta_k^c), \qquad \nabla\tilde f(\theta_k)=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c).
\end{equation}


\paragraph{Quality of non-i.i.d.\ data} Denote by $\theta_*$ the global minimizer of $f$ and by $\theta^c_*$ the global minimizer of $f^c$ for each client $c\in [N]$. It follows that $\theta_*=\sum_{c=1}^N p_c \theta_*^c$. Next, we quantify the degree of non-i.i.d.\ data by $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$, which is a non-negative constant that becomes larger as the data across clients become less identically distributed.

In what follows, it is clear that $\E{\nabla \tilde f(\theta)}=\nabla f(\theta)$ for any $\theta\in\R^d$. Moreover, we always have $\beta_k=\theta_k$ whether $k \bmod K=0$ or not. Taking the $p_c$-weighted sum of Eq.~\eqref{local_client} over the clients $c=1,\dots,N$ and using $\sum_{c=1}^N p_c=1$ together with the noise $\xi_k$ being shared by all clients, we have
\begin{equation}
\label{fed_avg_langevin_dynamics}
    \theta_{k+1}=\theta_k-\eta_k \nabla \tilde f(\theta_k)+\sqrt{2\eta_k\tau}\,\xi_k.
\end{equation}

The above formulation resembles the SGLD algorithm except that the construction and analysis of stochastic gradients are different. To facilitate the analysis, we also define an auxiliary continuous-time process $(\bar\theta_t)_{t\geq 0}$ that starts from the stationary distribution $\pi$, i.e., $\bar\theta_0\sim\pi$, and evolves according to
\begin{align}
\label{continuous_dynamics}
\d \bar\theta_t = - \nabla f(\bar\theta_t) \d t + \sqrt{2\tau} \d \overline{W}_t,
\end{align}
where $\overline{W}$ is a $d$-dimensional Brownian motion.
The solution of the above process satisfies
\begin{align}
\label{solution_continuous_dynamics}
    \bar\theta_t=\bar\theta_0 -\int_0^t \nabla f(\bar\theta_s)\d s + \sqrt{2\tau}\,\overline{W}_t, \qquad \forall t\geq 0.
\end{align}



\iffalse
\subsection{Our plan}

What are the assumptions do we need ..
\begin{itemize}
    \item shusen wang's paper
\end{itemize}

We need to prove a new version of Lemma 3 in page 12 in \cite{lhy+19}.
\begin{lemma}[Lemma 3 in page 12 in \cite{lhy+19}]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2 
\end{align*}
\end{lemma}

We need to generalize the above lemma to something as follows:
\begin{lemma}[Our version]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2  + \| \mathrm{noise} \|^2
\end{align*} 
\end{lemma}

Using Shusen's assumption, we can show Dala's paper \cite{dk19} page 7 contions are holding.

\fi


% \begin{table}[]
%     \centering
%     \begin{tabular}{|l|l|l|l|l|l|} \hline
%         {\bf Notations} & {\bf Ours} & \cite{dk19} & \cite{lhy+19} & \cite{ccbj18} \\ \hline
%         Function & $f$ & $f$ & $F$ & $f$ \\ \hline
%         Parameter &  & $\theta$ & $w$ & $x$ \\ \hline
%         Dimension & $d$ & $p$ & Never & $d$ \\ \hline
%         Smooth & $L$ & $M$ & $L$ & $L$ \\ \hline
%         Strongly convex & $m$ & $m$ & $\mu$ & $m$ \\ \hline
%         Global & & $K$ & $T$ & \\ \hline
%         Local steps & & $1$ & $K$ & \\ \hline
%         Variance & $\sigma^2 d$ & $\sigma^2 p$ & $\sigma^2$ & $\sigma^2 d$ \\ \hline
%         Learning rate & $\eta$ & $h$ & $\eta$ & $\delta$ \\ \hline
%         Choice of LR & & $h = 1/(m+M)$ & $\eta = 2 / (\mu T) $ &  \\\hline
%         \#Devices & & 1 & $N$ & \\ \hline
%         \#Datas per client & & & $n_k$ & \\ \hline
%     \end{tabular}
%     \caption{Notations to compare different papers. We put this table for easy of writing. There is no need to keep this in the final paper.}
%     \label{tab:my_label}
% \end{table}




\Wei{To do:  independence on K. \textcolor{green}{Done}}

\Wei{To do: remove the variance effect (learning rate) or propose a better rate. \textcolor{green}{Done}}

\Wei{To do: require a continuous version for dominated divergence \textcolor{green}{Done}}

\Wei{To do: think about a name for the paper, FedAvg Langevin Dynamics?}

\Wei{To do: Decay learning rate \textcolor{green}{on hold}}

\Wei{To do: When we decay learning rate, it may be harder to prove the L2 bound. \textcolor{green}{on hold}}

\Wei{To do: independent noise in each local client?}

\Wei{To do: convex case or non-convex case?}

\Wei{To do: sampling schemes for different clients}

\Wei{To do: Connection to optimization?}



\subsection{Main result}

% \subsubsection{Notations}

\subsubsection{Assumptions}

\begin{assumption}[Smoothness]\label{def:smooth} For each $c\in [N]$, we say $f^c$ is $L$-smooth if for some $L>0$
\begin{align*}
\| \nabla f^c(y)-\nabla f^c(x) \|_2 \leq L \| y-x \|_2,\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

Note that the above assumption is equivalent to saying that
\begin{align*}
f^c(y)\leq f^c(x)+\langle \nabla f^c(x),y-x \rangle+\frac{L}{2}\| y-x \|^2_2\quad \forall x, y\in \R^d.
\end{align*}

\begin{assumption}[Strongly convex]\label{def:strong_convex}
For each $c\in [N]$, $f^c$ is $m$-strongly convex if for some $m>0$
\begin{align*}
f^c(x)\geq f^c(y)+\langle \nabla f^c(y),x-y \rangle + \frac{m}{2} \| y-x \|_2^2\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

\begin{assumption}[Bounded variance]\label{def:variance}
For each $c\in [N]$, the variance of noise in the stochastic gradient $\nabla \tilde f^c(x)$ in each client is upper bounded such that %\Zhao{If the following $\sigma$ has $c$, then should we assume $L$ and $m$ also have $c$?} \Wei{I feel we should make $\sigma^2$ uniform}\Zhao{Yes, I plan to change now}
\begin{align*}
\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2] \leq \sigma^2 d,\quad \forall x\in \R^d.
\end{align*}
\end{assumption}

% Wei's first algorithm formulation based on standard SGLD
% \subsection{Formulation}
% Let $\theta_k\in\R^d$ be the $k$-th iterate of the following stochastic gradient Langevin algorithm.
% \begin{align}\label{eq:sgld}
%     \theta_{k+1}=\theta_k -\eta \nabla \widetilde f(\theta_k)+\sqrt{2\tau\eta}\xi_k,
% \end{align}
% where $\eta$ is the learning rate, $\tau$ is the temperature, $\xi_k$ is a standard $d$-dimensional Gaussian vector, and $\nabla \widetilde f(\theta)$ is an unbiased estimate of the exact gradient $\nabla f(\theta)$.

% \subsubsection{Tools from previous work}

% \textbf{Gronwall's inequality} is a standard tool for obtaining estimates of differential equations. Suppose that $a(\cdot)$, $b(\cdot)$, and $\psi(\cdot)$ are continuous real-valued functions that satisfy
% \begin{align*}
%     \frac{\d}{\d t}\psi(t)\leq a(t)\psi(t)+b(t).
% \end{align*}
% Then 
% \begin{align*}
%     \psi(t)\leq \psi(t_0)e^{\int_{t_0}^t a(s)\d s} + \int_{t_0}^t e^{\int_{s}^t a(u)\d u}b(s)\d s.
% \end{align*}

% \textbf{Burkholder-Davis-Gundy inequality} Let $\phi:[0, \infty)\rightarrow \mathbb{R}^{r\times d}$ for some positive integers $r$ and $d$. In addition, we assume $\E{\int_0^{\infty} |\psi(s)|^2 \d s}<\infty$ and let $Z(t)=\int_0^t \psi(s)\d W_s$, where $W_s$ is a $d$-dimensional Brownian motion. Then for all $t\geq 0$, we have

% \begin{align*}
%     \E{\sup_{0\leq s\leq t} |Z(s)|^2}\leq 4\E{\int_0^t|\phi(s)|^2\d s}.
% \end{align*}

\subsubsection{Wasserstein Distance}

% We denote the Borel $\sigma$-algebra 
We define the 2-Wasserstein distance between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
\begin{align*}
    W_2(\mu, \nu):=\inf_{\Gamma\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \Gamma(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
\end{align*}
where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$, where $\mathcal{L}(\cdot)$ denotes a distribution of a random variable.


\Wei{I am trying Dalalyan's User-friendly proof framework to get a better rate.}




\begin{lemma}[Contraction property]
\label{contraction}
Assume assumptions \ref{def:smooth} and \ref{def:strong_convex} hold. For any learning rate $\eta \in (0, \frac{1}{L+m}]$, any $\theta, \bar\theta\in\mathbb{R}^d$, % simulated from Eq.\eqref{fed_avg_langevin_dynamics} and Eq.\eqref{continuous_dynamics}, respectively, 
we have
\begin{align*}
    &\quad\lrn{\bar\theta-\theta-\eta(\nabla f(\bar\theta)-\nabla f(\theta))}_2^2\leq \left(1-\frac{\eta m}{2}\right)^2\lrn{\bar\theta-\theta}_2^2+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2^2+\lrn{\theta^c-\theta}_2^2\right).
\end{align*}

\end{lemma}

\begin{lemma}[Dominated divergence]\label{divergence}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ upper bound of the divergence between local clients and the center as follows
\begin{align*}
    \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}&\leq 4(K-1)^2\eta_k^2 H^2,\\
    \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&\leq 8d (K-1)^2 \eta_{k}^2 H^2 + 32 d (K-1)\eta_{k}\tau,
\end{align*}
where $H^2=14 d\kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$, $\kappa=L/m$, and  $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$.
\end{lemma}



\begin{lemma}[Discretization error]\label{lem:discretization}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any $s\geq 0$, any learning rate $\eta \in (0 , \frac{2}{m})$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, the iterates of $(\bar \theta_s)$ based on the continuous dynamics of Eq.\eqref{continuous_dynamics} satisfy the following estimate
\begin{align*}
    \E{ \big\| \bar\theta^c_{s} - \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 2 d\eta(\eta H^2+8\tau).
\end{align*}
\end{lemma}


\begin{lemma}[Bounded variance] 
\label{lem:total_variance}
Given assumption \ref{def:variance}, we have 
\begin{equation*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}\leq d \sigma^2 ,\qquad \forall \ \theta\in\R^d.
\end{equation*}
\end{lemma}

% \begin{lemma}[To be proved] 
% \label{lem:gradient_bound}
% Given a client index $c\in[N]$ and assumption XXX\Wei{will fix later}, we have 
% \begin{equation*}
%     \E{\lrn{\nabla f(\bar\theta_s^c)-\nabla f(\bar\theta^c_{k\eta})}_2^2}\leq C \eta^2
% \end{equation*}
% \end{lemma}



\begin{theorem}[One step update]\label{one_step_Dalalyan}

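Assume that Assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Suppose that the learning rate satisfies $\eta \in (0 , \frac{1}{m+L})$ and that the initialization satisfies $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, where $\theta_*$ is the global minimizer of the function $f$. Let $\mu_k$ denote the probability measure of $\theta_k$ and let $\pi$ denote the stationary distribution of Eq.~\eqref{continuous_dynamics}. Then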
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^2W_2^2(\mu_{k}, \pi)+32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau) + \eta^2\sigma^2 d.\notag
\end{align*}
\end{theorem}
\Wei{does this $\frac{\eta m}{2}$ make sense? Is there a tradition such that a rate of $\eta m$ is required?}
\begin{proof}




Apply Eq.~\eqref{solution_continuous_dynamics} on the interval $[k\eta, (k+1)\eta]$, i.e., set $t\rightarrow(k+1)\eta$ and $\bar\theta_0\rightarrow\bar\theta_{k\eta}$, and consider a synchronous coupling such that $\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta}:=\sqrt{\eta}\,\xi_k$. Then
\begin{align}
\label{continuous_one_step}
    \bar\theta_{(k+1)\eta}&=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\tau} (\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta})\notag\\
    &=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\eta\tau}\,\xi_k.
\end{align}

We first denote $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$. Subtracting Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_one_step} yields that
\begin{align*}
    &\quad \bar\theta_{(k+1)\eta}-\theta_{k+1}\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}+\eta \nabla \tilde f(\theta_k) - \int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla\tilde f(\theta_k)\bigg) - \int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\underbrace{\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla f(\theta_k)}_{:=X_k}\bigg)+\eta\zeta_k - \underbrace{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_{:=Y_k}.\notag\\
\end{align*}

Applying triangle inequality, we have
\begin{align}
\label{almost_final}
    \lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2\leq \lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2 + \lrn{Y_k}_2.
\end{align}

For the first term in the previous result, taking square and expectation, we have
\begin{align}
\label{estimate_of_first_term}
    &\quad\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2^2}\notag\\
    &=\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\E{\lrn{\eta\zeta_k}_2^2}+2\eta\underbrace{\E{\langle\bar\theta_{k\eta}-\theta_{k}-\eta X_k,  \zeta_k\rangle}}_{=0}\notag\\
    &\leq \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\eta^2\sigma^2 d\notag \\
    &\leq \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+\eta^2\sigma^2 d\notag \\
    &\quad\quad +2\eta(m+L)\sum_{c=1}^N p_c\left(\E{\lrn{\bar\theta_{k\eta}^c-\bar\theta_{k\eta}}_2^2}+\E{\lrn{\theta_k^c-\theta_k}_2^2}\right)\notag\\
    &\leq  \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+\eta^2\sigma^2 d+32 \eta^3 d(K-1)^2 LH^2,
\end{align}
where the first inequality follows by Lemma \ref{lem:total_variance}, the second inequality follows by Lemma \ref{contraction}, and the third inequality follows by Lemma \ref{divergence} and $m\leq L$. Since the learning rate $\frac{1}{m+L}\leq \frac{2}{m}$, the requirement of the learning rate is clearly satisfied.

For the second term $Y_k$ in Eq.\eqref{almost_final}, we have the following estimate
\begin{align}
\label{y_estimate}
    \E{\lrn{Y_k}_2^2}&=\E{\lrn{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_2^2}\notag\\
    &\leq\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})}_2^2}  \d s\notag\\
    &=\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\sum_{c=1}^N p_c \bigg(\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})\bigg)}_2^2}  \d s\notag\\
    &\leq \eta\int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c\E{\lrn{\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})}_2^2}  \d s\notag\\
    &\leq \eta L^2 \int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c\E{\lrn{\bar\theta_s^c-\bar\theta^c_{k\eta}}_2^2}  \d s\notag\\
    &\leq \eta L^2  \int_{k\eta}^{(k+1)\eta}  2\eta d(\eta H^2+8\tau) \d s\notag\\
    &=2\eta^3 d L^2(\eta H^2+8\tau),
\end{align}
where the first inequality follows by H\"{o}lder's inequality, the second inequality follows by Jensen's inequality, the third inequality follows by Assumption \ref{def:smooth}, and the last inequality follows by Lemma \ref{lem:discretization}.

Taking square and expectation and plugging the estimates of Eq.\eqref{estimate_of_first_term} and Eq.\eqref{y_estimate} into Eq.\eqref{almost_final}, we have
\begin{align*}
    \E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}^2_2}&\leq  \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+\eta^2\sigma^2 d\notag\\
    &\quad\quad+32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau).
\end{align*}

Choosing the Langevin diffusion $\bar\theta$ in the stationary regime and coupling $\bar\theta_{k\eta}$ with $\theta_k$ optimally, we have $W_2^2(\mu_k,\pi)=\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}$ and  $W_2^2(\mu_{k+1},\pi)\leq\E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}$. Consequently, we have
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^2W_2^2(\mu_{k}, \pi)+32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau) + \eta^2\sigma^2 d.\notag\\
\end{align*}

\end{proof}


\begin{theorem}[Fixed learning rate] Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Given a constant learning rate $\eta\in (0, \frac{1}{m+L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{2}\right)^k \bigg(2\sqrt{d}(\mathcal{D} +  \sqrt{1/m} )\bigg)+\sqrt{\eta d}\bigg((K-1)C_1 + C_2 + \sigma^2 C_3\bigg),\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $C_1=\sqrt{32 LH^2}$, $C_2=\sqrt{2L^2(\eta H^2+8\tau)}$, and $C_3$ is defined in the end of the proof.
\end{theorem}


\begin{proof}
Recall from Theorem \ref{one_step_Dalalyan} that
\Wei{Why can't we recursively apply bound w.r.t. $W_2^2$, but rather $W_2$}
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^2W_2^2(\mu_{k}, \pi)+32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau) + \eta^2\sigma^2 d\notag\\
    &\leq  \bigg(\big(1-\underbrace{\frac{\eta m}{2}}_{:=A}\big)W_2(\mu_{k}, \pi)+\underbrace{\sqrt{32\eta^3 d(K-1)^2 L H^2+2\eta^3 d L^2(\eta H^2+8\tau)}}_{:=C}\bigg)^2+\underbrace{\eta^2\sigma^2 d}_{:=B^2}.\notag\\
\end{align*}

To obtain a sharper bound on the Wasserstein distance, we adopt Lemma~1 of~\cite{dk19}, which is stated as follows.
\begin{lemma}
\label{useful_tool}
Let $A, B$ and $C$ be three constants such that $A\in(0, 1)$ and $B, C\geq 0$. If the sequence $\{x_k\}_{k\geq 0}$ satisfies the recursion
\begin{align*}
    x_{k+1}^2\leq [(1-A)x_k + C]^2 + B^2
\end{align*}
for any $k\geq 0$, then we have that
\begin{align*}
    x_{k}\leq (1-A)^k x_0 + \frac{C}{A} + \frac{B^2}{C+\sqrt{A} B}.
\end{align*}
\end{lemma}

Applying Lemma~\ref{useful_tool} to the above recursion with $x_k=W_2(\mu_k, \pi)$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{2}\right)^k W_2(\mu_{0}, \pi)+\sqrt{\eta d}\left((K-1)C_1 + C_2 + \sigma^2 C_3\right),\notag
\end{align*}
where $C_1=\sqrt{32 LH^2}$, $C_2=\sqrt{2L^2(\eta H^2+8\tau)}$, $C_3=\frac{1}{\sqrt{32(K-1)^2 L H^2+2 L^2(\eta H^2+8\tau)}+ \sigma \sqrt{\frac{m}{2}}}\leq \frac{1}{(K-1) C_1}$ for $K>1$.

By Lemma \ref{lem:W2_init_bound} and the initialization condition $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have that
\begin{align*}
W_2(\mu_0, \pi)\leq 2\sqrt{d}(\mathcal{D} +  \sqrt{1/m} ).
\end{align*}
\end{proof}


\begin{theorem}[Decay learning rate]

\Wei{The complexity with stochastic noise goes beyond my expectation. It seems that we can be satisfied with the constant learning rate or use decaying learning rate without noise}

\end{theorem}









\subsection{Important lemmas}

\begin{proof}[Proof of Lemma \ref{contraction}] 
% Let $X_k:=\nabla f(\theta_k+\delta_k)-\nabla f(\theta_k)$, $\delta_k:=\bar\theta_{k\eta}-\theta_{k}$ and $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$, where  $\theta_k$ and $\bar\theta_{t}$ are simulated from Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_dynamics}, respectively. Given xxx, we have

Given a client index $c\in[N]$, applying Theorem 2.1.12 \cite{Nesterov04} leads to
\begin{align}
\label{special_inner_product}
    \langle y-x, \nabla f^c(y)-\nabla f^c(x) \rangle\geq \frac{m L}{L+m}\lrn{y-x}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(y)-\nabla f^c(x)}_2^2,\quad \forall x,y\in\mathbb{R}^d.
\end{align}

In what follows, we have
\begin{align}
\label{iteration}
    &\quad\lrn{\bar\theta-\theta-\eta(\nabla f(\bar\theta)-\nabla f(\theta))}_2^2\notag\\
    &=\lrn{\bar\theta-\theta}_2^2 -2\eta \underbrace{\langle \bar\theta-\theta, \nabla f(\bar\theta)-\nabla f(\theta)\rangle}_{\mathcal{I}}+\eta^2 \lrn{\nabla f(\bar\theta)-\nabla f(\theta)}_2^2.
\end{align}

For the second item $\mathcal{I}$ in the right hand side, we have
\begin{align}
\label{target_contraction}
    \mathcal{I}&=\sum_{c=1}^N p_c\bigg\langle \bar\theta-\theta, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle\notag\\
    &=\sum_{c=1}^N p_c\bigg\langle \bar\theta-\bar\theta^c+\bar\theta^c-\theta^c+\theta^c-\theta, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle\notag\\
    &=-\sum_{c=1}^N p_c\left(\bigg\langle \bar\theta^c-\bar\theta, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle+\bigg\langle \theta-\theta^c, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle\right)\notag\\
    &\quad\quad+\sum_{c=1}^N p_c\bigg\langle \bar\theta^c-\theta^c, \nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)\bigg\rangle\notag\\
    &\geq -\sum_{c=1}^N p_c\left((m+L)\lrn{\bar\theta^c-\bar\theta}_2^2+(m+L)\lrn{\theta^c-\theta}_2^2+\frac{1}{2(m+L)}\lrn{\nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)}_2^2\right)\notag\\
    &\quad\quad+ \sum_{c=1}^N p_c\left(\frac{m L}{L+m}\lrn{\bar\theta^c-\theta^c}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(\bar\theta^c)-\nabla f^c(\theta^c)}_2^2\right)\notag\\
    &\geq -(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2^2+\lrn{\theta^c-\theta}_2^2\right) + \frac{m L}{L+m}\lrn{\bar\theta-\theta}_2^2 \notag\\
    &\quad\quad+ \frac{1}{2(L+m)} \lrn{\nabla f(\bar\theta)-\nabla f(\theta)}_2^2,
\end{align}
where the first inequality follows by the AM-GM inequality and Eq.\eqref{special_inner_product}, respectively; the last inequality follows by Jensen's inequality such that $\sum_{c=1}^N p_c \lrn{\bar\theta^c-\theta^c}_2^2\geq \lrn{\sum_{c=1}^N p_c \left(\bar\theta^c-\theta^c\right)}_2^2=\lrn{\bar\theta-\theta}_2^2$.

Plugging Eq.\eqref{target_contraction} into Eq.\eqref{iteration}, we have
\begin{align*}
    &\quad\lrn{\bar\theta-\theta-\eta(\nabla f(\bar\theta)-\nabla f(\theta))}_2^2\notag\\
    &\leq \left(1-\frac{2\eta mL}{m+L}\right)\lrn{\bar\theta-\theta}_2^2+\eta\left(\eta-\frac{1}{m+L}\right) \lrn{\nabla f(\bar\theta)-\nabla f(\theta)}_2^2\notag\\
    &\quad\quad+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2^2+\lrn{\theta^c-\theta}_2^2\right)\notag\\
    &\leq \left(1-\frac{\eta m}{2}\right)^2\lrn{\bar\theta-\theta}_2^2+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2^2+\lrn{\theta^c-\theta}_2^2\right),\notag
\end{align*}
where the last inequality follows by $\frac{2L}{m+L}\geq 1$, $1-2a\leq (1-a)^2$ for any $a$, and $\eta\in(0, \frac{1}{m+L}]$.

% Applying $\sqrt{a+b}\leq \sqrt{a}+\sqrt{b}$, we have
% \begin{align*}
%     &\quad\lrn{\bar\theta-\theta-\eta(\nabla f(\bar\theta)-\nabla f(\theta))}_2\leq \left(1-\frac{\eta m}{2}\right)^2\lrn{\bar\theta-\theta}_2^2+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\bar\theta^c-\bar\theta}_2+\sqr\lrn{\theta^c-\theta}_2^2\right),\notag
% \end{align*}

\end{proof}


% \Wei{if no decay of learning rate is required, we may polish for a better rate here}
\begin{proof}[Proof of Lemma \ref{divergence}] For any $k \ge 0$, consider the most recent synchronization step $k_0=K\lfloor \frac{k}{K}\rfloor $, which satisfies $k_0\leq k$, $k-k_0\leq K-1$, and $\theta_{k_0}^c=\theta_{k_0}$ for any $c\in[N]$. Also, we use the fact that $\eta_k$ is non-increasing and $\eta_{k_0} \leq 2 \eta_k$ for all $k-k_0 \leq K-1$.

\textbf{Discrete dynamics: } By the iterate Eq.\eqref{fed_avg_langevin_dynamics}, we have
\begin{align*}
\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}-(\theta_k-\theta_{k_0})}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c \E{\sum_{j=k_0}^{k-1} 2(K-1)\eta_j^2\lrn{\nabla\tilde f^c(\theta_j^c)}_2^2 + 4(K-1)\eta_{k_0}d\tau}\notag\\
&\leq \sum_{c=1}^N p_c \left(\sum_{j=k_0}^{k-1} 2(K-1)\eta_{k_0}^2\E{\lrn{\nabla\tilde f^c(\theta_j^c)}_2^2}+4(K-1)\eta_{k_0}d\tau\right)\notag\\
&\leq 8(K-1)^2\eta_k^2 H^2 +8(K-1)\eta_kd\tau,
\end{align*}
where the first inequality holds by $\E{\lrn{\theta-\E{\theta}}_2^2}\leq \E{\lrn{\theta}_2^2}$ for a stochastic variable $\theta$; the second inequality follows by $(\sum_{i=1}^{K-1} a_i)^2\leq (K-1)\sum_{i=1}^{K-1} a_i^2$; the last inequality follows by Lemma \ref{bounded_gradient_l2} and $\eta_{k_0}\leq 2\eta_k$. $H^2=14 d\kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$.

\textbf{Continuous-time diffusion: } For any time $t\geq 0$ at the $k$-th iteration and the most recent synchronization time $t_0\leq t$ at the $k_0$-th iteration, we have
\begin{align*}
\sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}-(\bar\theta_{t}-\bar\theta_{t_0})}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}}_2^2}\notag\\
&\leq 2 d (t-t_0)^2 H^2+16d(t-t_0)\tau\notag\\
&\leq 2d (K-1)^2 \eta_{k_0}^2 H^2 + 16 d (K-1)\eta_{k_0}\tau\notag\\
&\leq 8d (K-1)^2 \eta_{k}^2 H^2 + 32 d (K-1)\eta_{k}\tau\notag
\end{align*}
where the second inequality follows by applying Lemma \ref{lem:discretization} with treating the learning rate as $t-t_0$; the third inequality follows since the $k$-th iteration and the $k_0$-th iteration has a time difference at most $(K-1)\eta_{k_0}$ given non-increasing learning rate $\{\eta_k\}_{k\geq 0}$; the fourth inequality follows by $\eta_{k_0}\leq 2\eta_{k}$.

% By definition, $\bar\theta_{t}^c-\bar\theta_{t_0}=-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}$. Apply Young's inequality, we have
% \begin{align*}
%     \E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}}_2^2}&=2\E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s}_2^2} +2\E{\lrn{\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\notag\\
%     &\leq 2(t-t_0)\E{\int_{t_0}^t \lrn{\nabla f^c(\bar\theta_s^c)}_2^2\d s}
% \end{align*}

% By H\"{o}lder's inequality, we have
% \begin{align*}
% \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&\leq \sum_{c=1}^N p_c { (t-t_0)\int_{t_0}^{t} \E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\d s}\notag\\
% &\leq \sum_{c=1}^N p_c { (t-t_0)\int_{t_0}^{t} \E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\d s}\notag\\
% &\leq 4(K-1)^2\eta_k^2 H^2,
% \end{align*}

\end{proof}


\begin{proof}[Proof of Lemma \ref{lem:discretization}]
For any $s\in[0,\infty)$, there exists a unique integer $k\geq 0$, namely $k=\lfloor \frac{s}{\eta}\rfloor$, such that $s\in [k\eta, (k+1)\eta)$. By the continuous dynamics of Eq.~\eqref{continuous_dynamics}, we have
\begin{align*}
    \bar\theta_{s}^c = \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor}-(s-k\eta) \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })+\sqrt{2\tau}\int_{k\eta}^s \d \overline{W}_t,
\end{align*}
which suggests that 
\begin{align*}
    \sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2 \leq (s-k\eta) \big\| \nabla f^c(\bar\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2+\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \overline{W}_t}_2.
\end{align*}
We first square the terms on both sides and take Young’s inequality and expectation
\begin{align*}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 2\E{\big\|(s-k\eta)\nabla f^c(\bar\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+2\E{\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \overline{W}_t}_2^2}.
\end{align*}
Then, by Burkholder-Davis-Gundy inequality and It\^{o} isometry, we have
\begin{align}
    \label{eq:1st_part}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}&\leq 2\E{ \big\| (s-k\eta)\nabla f^c(\bar\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+8\sum_{i=1}^d\E{\int_{k\eta}^s 2\tau \d t} \notag \\
    &\leq 2\eta^2\E{ \big\| \nabla f^c(\bar\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+16 \eta d\tau.
\end{align}

By Young's inequality and the smoothness assumption \ref{def:smooth},  we have
\begin{align}\label{eq:2nd_part}
    \E{ \| \nabla f^c(\bar\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \|_2^2}
    = & ~ \E{\| \nabla f^c(\bar\theta_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*) +\nabla f^c(\theta_*) \|_2^2} \notag \\
    \leq & ~ 2\E{\| \nabla f^c(\bar\theta_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*) \|_2^2} +2{\lrn{\nabla f^c(\theta_*) }_2^2} \notag \\
    \leq & ~ 2L^2 \E{\|\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }-\theta_*\|_2^2}+2\gamma\notag\\
    \leq & 2L^2 \left(d\mathcal{D}^2 + \frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)\right)+ 2\gamma\notag\\
    \leq & 14 d\kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg):=d H^2,
\end{align}
where the third inequality follows by Lemma \ref{lem:L2_bound_local}, the fourth step follows from the definition of $H^2$ given $\eta\in (0, 2/m]$. Combining Eq.~\eqref{eq:1st_part} and Eq.~\eqref{eq:2nd_part}, we have
\begin{align*}
\E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}
&\leq 2\eta d(\eta H^2+8\tau).\notag
\end{align*}
\end{proof}


\begin{proof}[Proof of Lemma \ref{lem:total_variance}] By assumption \ref{def:variance}, we have
\begin{align*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}&=\E{\lrn{\sum_{c=1}^N p_c\bigg(\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)\bigg)}_2^2}\\
    &=\sum_{c=1}^N p_c^2\E{\lrn{\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)}_2^2}\\
    &\leq d \sigma^2 \sum_{c=1}^N p_c^2\leq d\sigma^2 \left(\sum_{c=1}^N p_c\right)^2=d\sigma^2.
\end{align*}

\end{proof}

\subsection{Supporting Lemmas}




\begin{lemma}[Uniform $\ell_2$ upper bound for local clients]
\label{lem:L2_bound_local}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
\sup_k\E{\lrn{\theta_k^c-\theta_*}_2^2}\leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)}_{:=U},\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}


\begin{proof} First, we consider the $k$-th iteration, where $k\in \{1,2,\cdots, K-2, (K-1)_{-}\}$ and $(K-1)_-$ denotes the $(K-1)$-step without synchronization. Following the iterate of Eq.\eqref{local_client} in a local client of $c\in [N]$, we have
	\begin{align}\label{eq:Langevin_L2_1_local}
&\quad\ \E{\lrn{\theta_{k+1}^c-\theta_*}_2^2}\notag\\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + \sqrt{8\eta\tau}\E{ \langle \theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c), \xi_k \rangle } + 2\eta\tau\E{\|\xi_k\|_2^2} \notag \\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + 2\eta d\tau,
	\end{align}	
	where the last equality comes from the independence of $\theta_k^c-\theta_*- \eta\nabla\widetilde f^c(\theta_k^c)$ and $\xi_k$, together with $\E{\xi_k}=0$ and $\E{\|\xi_k\|_2^2}=d$. Note that
\begin{align}\label{eq:ip_1st_local}
%\small
&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla\widetilde f^c(\theta_k^c)\|_2^2} \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2}  \notag\\
& \qquad\qquad + 2 \eta \E{ \langle \theta_k^c-\theta_*-\eta \nabla f^c(\theta_k^c),\nabla f^c(\theta_k^c)-\nabla\widetilde f^c(\theta_k^c) \rangle }  \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2} \notag \\
&\leq \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}  + \eta^2 d\sigma^2, 
\end{align}
where the first step follows from simple algebra, the second step follows from the unbiasedness of the stochastic gradient, and the last step follows from Assumption \ref{def:variance}. For any $q>0$, we can upper bound the first term of Eq.\eqref{eq:ip_1st_local} as follows
\begin{align}\label{eq:ip_2nd_test_theta_star}
	&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}\notag\\
	&=\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*))-\eta\nabla f^c(\theta_*) \|_2^2}\notag\\
	&\leq (1+q)\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)) \|_2^2}+\eta^2 \left(1+\frac{1}{q}\right) \|\nabla f^c(\theta_*)\|_2^2\notag\\
	&\leq (1+q)\underbrace{\left(1-\frac{\eta m}{2}\right)^2}_{\psi^2}\E{\lrn{\theta_k^c-\theta_*}_2^2}+\eta^2 \left(1+\frac{1}{q}\right)\gamma,
\end{align}
where the first inequality follows by the AM-GM inequality; the second inequality is a special case of Lemma \ref{contraction} based on Assumption \ref{def:strong_convex}, where no local step is involved before the synchronization step. In addition, $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$.

Choose $q=(\frac{1+\psi}{2\psi})^2-1$ so that $(1+q)\psi^2=\frac{(1+\psi)^2}{4}$. Moreover, since $\psi=1-\frac{\eta m}{2}$, we get $\frac{1+\psi}{2}=1-\frac{1}{4}\eta m$. In addition, we have $1+\frac{1}{q}= \frac{1+q}{q}\leq \frac{(1+\psi)^2}{(1-\psi)(1+3\psi)}\leq \frac{2}{\eta m}$.  It follows that
\begin{align}
    \label{nice_inequality}
    \eta^2\left(1+\frac{1}{q}\right)\leq \frac{2\eta}{m}.
\end{align}

Combining Eq.~\eqref{eq:Langevin_L2_1_local}, Eq.~\eqref{eq:ip_1st_local}, Eq.~\eqref{eq:ip_2nd_test_theta_star}, and Eq.~\eqref{nice_inequality}, we have the following iterate
\begin{align*}
	\E{\|\theta_{k+1}^c-\theta_*\|_2^2} 
	\leq & ~ \underbrace{\left(1-\frac{\eta m}{4}\right)^2}_{:=g(\eta)} \E{\|\theta_k^c-\theta_*\|_2^2} + 2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}. \notag
\end{align*}

Note that $\frac{1}{1-g(\eta)}=\frac{1}{\frac{\eta m}{2}(1-\frac{\eta m}{8})}\leq \frac{3}{\eta m}$ given $\eta\in (0, \frac{2}{m})$. Recursively applying the above equation $k$ times, where $k\in \{1,2,\cdots, K-1, K_{-}\}$ and $K_-$ denotes the $K$-step without synchronization, it follows that
\begin{align}\label{recursion_v2}
	\E{\|\theta_k^c-\theta_*\|_2^2} &\le g(\eta)^{k}\| \theta_0^c-\theta_*\|_2^2 + \frac{1- g(\eta)^{k}}{1 - g(\eta)} \cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right)  \\
	&\le \|\theta_0^c-\theta_*\|_2^2 + \frac{3}{\eta m} \cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right) \notag\\
	&\leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{ \gamma}{md}\right)}_{:=U},\notag
\end{align}
where the second inequality holds by $g(\eta)\leq 1$, the last inequality holds because $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and $\eta< \frac{2}{m}$.
In particular, the $K$-th step before synchronization yields that
\begin{align}\label{recursion_v3}
	\E{\|\theta_{K_-}^c-\theta_*\|_2^2} &\le d\mathcal{D}^2 +U.
\end{align}
By contrast, for the $K$-th local step after synchronization, applying Jensen's inequality yields
\begin{align}\label{recursion_v4}
	\E{\|\theta_K^c-\theta_*\|_2^2} 
	= & ~\E{\bigg\|\sum_{c=1}^N p_c\theta_{K_-}^c-\theta_*\bigg\|_2^2} \notag \\
	\leq & ~ \sum_{c=1}^N p_c\E{\lrn{\theta_{K_-}^c-\theta_*}_2^2} \notag \\
	\leq & ~ d\mathcal{D}^2 +U.
\end{align}
Now, starting from iteration $K$, we adapt the recursion of Eq.\eqref{recursion_v2} to the $k$-th step, where $k\in\{K+1,\cdots, 2K-1, (2K)_{-}\}$ and $(2K)_-$ denotes the $2K$-step without synchronization, and obtain
\begin{align}\label{recursion_v5}
	\E{\|\theta_k^c-\theta_*\|_2^2} 
	\leq & ~ g(\eta)^{k-K} \cdot  \E{\|\theta_K^c-\theta_*\|_2^2} + \frac{1- g(\eta)^{k-K}}{1 - g(\eta)}\cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right)\notag \\
	\leq &  g(\eta)^{k-K}(d\mathcal{D}^2+U)+\frac{1- g(\eta)^{k-K}}{m\eta/3} \frac{m\eta}{3} U\notag \\
	\leq & d\mathcal{D}^2+ g(\eta)^{k-K} U +  (1- g(\eta)^{k-K}) U \notag\\
	\leq & d\mathcal{D}^2+U,
\end{align}
where the second inequality follows by Eq.\eqref{recursion_v4}, the fact that $1-g(\eta)\geq \eta m/3$ and $\eta\leq \frac{2}{m}$, and the definition of $U$. The third one holds since $g(\eta)\leq 1$.

By repeating Eq.\eqref{recursion_v4} and \eqref{recursion_v5}, we have that for all $k\geq 0$
\begin{align*}
	\E{\|\theta_k^c-\theta_*\|_2^2} \leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)}_{:=U}.\notag
\end{align*}
\end{proof}


\begin{lemma}[Bounded gradient in $\ell_2$]\label{bounded_gradient_l2}
Given assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold, for any client $c$ and any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
    \E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}\leq d H^2,
\end{align*}
where $H^2=14 \kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$.
\end{lemma}

\begin{proof}

Decompose the $\ell_2$ norm of the gradient as follows
\begin{align*}
    \E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}&= \E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)+\nabla f^c(\theta_k^c)}_2^2}\notag\\
    &= \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)}_2^2}+2\E{\lrw{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c), \nabla f^c(\theta_k^c)}} \notag\\
    &\leq \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\sigma^2d\notag\\
    &=  \E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)}_2^2}+\sigma^2d\notag\\
    &\leq 2\E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)}_2^2}+2\E{\big\|\nabla f^c(\theta_*)\big\|_2^2}+\sigma^2d\notag\\
    &\leq 2 L^2 \E{\lrn{\theta_k^c-\theta_*}_2^2}+2 \gamma +\sigma^2d\notag\\
    &\leq 2L^2 \bigg(d\mathcal{D}^2 + \frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)\bigg)+2 \gamma+\sigma^2d\notag\\
    &\leq 14 d\kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg):= dH^2,
\end{align*}
where the first inequality follows by the unbiasedness of the stochastic gradient (so that the cross term vanishes) and assumption \ref{def:variance}; the second inequality follows by Young's inequality; the third inequality follows by assumption  \ref{def:smooth} and the definition that $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$; the fourth inequality follows by Lemma \ref{lem:L2_bound_local}; the last inequality follows by defining 
$\kappa:=\frac{L}{m}\geq 1$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Beginning of dominated divergence %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\begin{lemma}[Initial condition] 
\label{lem:W2_init_bound}
Let $\mu_0$ denote the Dirac delta distribution at $\theta_0$. % and assume $\lrn{\theta_0-\theta_*}_2^2\leq d\mathcal{D}^2$.
Then, we have
\begin{align*}
W_2(\mu_0, \pi)\leq 2(\| \theta_0 - \theta_* \|_2 +  \sqrt{d/m} ). %\sqrt{2d\left(\mathcal{D}^2+\frac{2}{m}\right)}.
\end{align*}
\end{lemma}

\begin{proof}
By \cite{ccbj18}, there exists an optimal coupling between $\mu_0$ and $\pi$ such that
\begin{align*}
    W_2^2(\mu_0, \pi) 
    \leq & ~ \mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta\|_2^2 ]\\
    \leq & ~ 2\mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta_*\|_2^2 ] + 2 \mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2] \\
    = & ~ 2\| \theta_0 - \theta_* \|_2^2 +2\mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2]\\
    \leq & ~ 2\| \theta_0 - \theta_* \|_2^2 + 4d/m,
\end{align*}
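where the second step follows from the triangle inequality together with $(a+b)^2\leq 2a^2+2b^2$, and the last step follows from Theorem 17 of \cite{ccbj18}. Taking square roots and using $\sqrt{a+b}\leq \sqrt{a}+\sqrt{b}$ for $a,b\geq 0$, we obtain $W_2(\mu_0, \pi)\leq \sqrt{2}\,\| \theta_0 - \theta_* \|_2 + 2\sqrt{d/m}\leq 2(\| \theta_0 - \theta_* \|_2 +  \sqrt{d/m})$.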
\end{proof}




\newpage
\paragraph{Path A} If we use a bound based on $W_2^2$, we have
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\eta m\right) W_2^2(\mu_{k}, \pi)+\eta^3 + \eta^2\sigma^2.\notag
\end{align*}

Recursively applying it, we obtain
\begin{align*}
    W_2^2(\mu_k, \pi)&\leq  \left(1-\eta m\right)^k W_2^2(\mu_{0}, \pi)+ \frac{1-(1-\eta m)^k}{\eta m}\left(\eta^3 + \eta^2\sigma^2 \right)\\
    &\leq \left(1-\eta m\right)^k W_2^2(\mu_{0}, \pi)+ \frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right).
\end{align*}
In other words, using $\sqrt{a+b}\leq\sqrt{a}+\sqrt{b}$ and $\sqrt{1-\eta m}\leq 1-\frac{\eta m}{2}$, we can easily get a good rate
\begin{align*}
    W_2(\mu_k, \pi)&\leq \left(1-\eta m\right)^\frac{k}{2} W_2(\mu_{0}, \pi)+ \sqrt{\frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right)}\\
    &\leq \left(1-\frac{\eta m}{2}\right)^k W_2(\mu_{0}, \pi)+ \sqrt{\frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right)}.
\end{align*}

\paragraph{Path B} Similarly, if we use
\begin{align*}
    W_2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)W_2 (\mu_{k}, \pi)+\sqrt{\eta^3 +\eta^2\sigma^2}.\notag
\end{align*}
Then we must apply a more complex trick, i.e., the following lemma,
\begin{lemma}
Let $A, B$ and $C$ be three constants such that $A\in(0, 1)$ and $B, C\geq 0$. If the sequence $\{x_k\}_{k\geq 0}$ satisfies the recursion
\begin{align*}
    x_{k+1}^2\leq [(1-A)x_k + C]^2 + B^2
\end{align*}
for any $k\geq 0$, then we have that
\begin{align*}
    x_{k}\leq (1-A)^k x_0 + \frac{C}{A} + \frac{B^2}{C+\sqrt{A} B}.
\end{align*}
\end{lemma}
to obtain
\begin{align*}
    W_2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W_2 (\mu_{0}, \pi)+\sqrt{\eta}(\mathcal{O}(1)+\sigma^2).\notag
\end{align*}

\Wei{Question: Dalalyan used Path B, why?}




