\section{Our Algorithm}

Let $N$ denote the number of clients. Let $T$ denote the number of global steps. Let $K$ denote the number of local steps. For each $c \in [N]:=\{1,2,\cdots, N\}$, we use $f^c$ and $\nabla f^c$ to denote the loss function and the gradient of the function $f^c$ in client $c$. For the stochastic gradient oracle, we denote by $\nabla \tilde f^c(\cdot)$ the \emph{unbiased} estimate of the exact gradient $\nabla f^c$ of client $c$. In addition, we denote $p_c$ as the weight of the $c$-th client such that $p_c\geq 0$ and $\sum_{c=1}^N p_c=1$. $\xi_k$ is a standard $d$-dimensional Gaussian vector at iteration $k$, which is independent of the client index and can be achieved by maintaining the same random seed for each client $c\in[N]$.

\begin{algorithm*}[h]\caption{Federated Averaging Langevin dynamics Algorithm (FedAvgLD). Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. A global synchronization is conducted every $K$ steps.}\label{alg:alg_main_text_same_seed}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\xi_k,
\end{equation}
\State
\begin{equation}  
\label{synchronization}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}

Inspired by \cite{lhy+19}, we define two virtual sequences 
\begin{equation}
\label{virtual_seq}
\beta_k=\sum_{c=1}^N p_c \beta_k^c, \qquad \theta_k=\sum_{c=1}^N p_c \theta_k^c,
\end{equation}
which are \emph{both inaccessible when $k \text{ mod } K\neq 0$}. For the gradients, we also define 
\begin{equation}
\label{sum_grad}
\nabla f(\theta_k)=\sum_{c=1}^N p_c \nabla f^c(\theta_k^c), \qquad \nabla\tilde f(\theta_k)=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c).
\end{equation}



In what follows, it is clear that $\E{\nabla \tilde f(\theta)}=\sum_{c=1}^N p_c \E{\nabla \tilde f^c(\theta^c)}=\nabla f(\theta)$ for any $\theta\in\R^d$. Taking the $p_c$-weighted sum of Eq.\eqref{local_client} over clients $c=1$ to $N$ and combining Eq.\eqref{virtual_seq} and Eq.\eqref{sum_grad}, we have
\begin{align}
\label{fed_avg_langevin_dynamics_preliminary}
    \beta_{k+1}&=\theta_k-\eta \nabla \tilde f(\theta_k)+\xi_k.
\end{align}
Moreover, we always have $\beta_k=\theta_k$ whether $k \text{ mod } K=0$ or not by Eq.\eqref{synchronization} and Eq.\eqref{virtual_seq}. In what follows, we can write
\begin{equation}
\label{fed_avg_langevin_dynamics}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\xi_k,
\end{equation}
which resembles the SGLD algorithm \cite{Welling11} except that the construction of stochastic gradients is different and $\theta_k$ is \emph{not accessible when $k\text{ mod } K\neq 0$}. To facilitate the analysis, we also define an auxiliary continuous-time process $(\bar\theta_t)_{t\geq 0}$ 
\begin{align}
\label{continuous_dynamics}
\d \bar\theta_t = - \nabla f(\bar\theta_t) \d t + \sqrt{2\tau} \d \overline{W}_t,
\end{align}
where $\bar\theta_t=\sum_{c=1}^N p_c \bar\theta_t^c$, $\nabla f(\bar\theta_t)=\sum_{c=1}^N p_c \nabla f^c(\bar\theta_t^c)$, $\bar\theta_t^c$ is the continuous-time variable at client $c$, and $\overline{W}$ is a $d$-dimensional Brownian motion. The continuous-time algorithm is referred to as Federated Averaging Langevin diffusion and is described as
\begin{align*}\label{local_client_continuous}
    \d \bar\beta_{t}^c &=-\nabla f^c(\bar\theta_t^c)\d t+\sqrt{2\tau}\d \overline{W}_t, \notag\\
    \quad\bar\theta_{t}^c&=\sum_{c=1}^N p_c \bar\beta_{t}^c.
\end{align*}
Since the synchronization step is conducted at every time step $t$, the Federated Averaging Langevin diffusion performs the same as the standard Langevin diffusion. Assume that $\bar\theta_0$ is drawn from the stationary distribution $\pi$; then it follows that $\bar\theta_t\sim\pi$ for any $t\geq 0$.


% \begin{algorithm*}[h]\caption{Federated Averaging Langevin diffusion. Denote by $\bar\theta_t^c$ the model parameter in the $c$-th client at time $t$. Denote the Langevin diffusion update from $\bar\theta_t^c$ by $\bar\beta_t^c$. The global synchronization is conducted at any time step $t$.}\label{alg:alg_main_continuous_text_same_seed}
% \begin{algorithmic}[1]
% \State \begin{equation}\label{local_client_continuous}
%     \d \bar\beta_{t}^c =\theta_t^c-\nabla f^c(\bar\theta_t^c)\d t+\sqrt{2\tau}\d \overline{W}_t,
% \end{equation}
% \State
% \begin{equation}  
% \label{synchronization_diffusion}
% \bar\theta_{t}^c=\sum_{c=1}^N p_c \bar\beta_{t}^c.
% \end{equation} 
% \end{algorithmic}
% \end{algorithm*}




\paragraph{Quality of non-i.i.d data} Denote by $\theta_*$ the global minimizer of $f$ and by $\theta^c_*$ the global minimizer of $f^c$ for each client $c\in [N]$. It follows that $\theta_*=\sum_{c=1}^N p_c \theta_*^c$. Next, we quantify the degree of the non-i.i.d data by $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$, which is a non-negative constant and yields a larger scale if the data is less identically distributed.

\iffalse
\subsection{Our plan}

What are the assumptions do we need ..
\begin{itemize}
    \item shusen wang's paper
\end{itemize}

We need to prove a new version of Lemma 3 in page 12 in \cite{lhy+19}.
\begin{lemma}[Lemma 3 in page 12 in \cite{lhy+19}]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2 
\end{align*}
\end{lemma}

We need to generalize the above lemma to something as follows:
\begin{lemma}[Our version]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2  + \| \mathrm{noise} \|^2
\end{align*} 
\end{lemma}

Using Shusen's assumption, we can show Dala's paper \cite{dk19} page 7 contions are holding.

\fi


% \begin{table}[]
%     \centering
%     \begin{tabular}{|l|l|l|l|l|l|} \hline
%         {\bf Notations} & {\bf Ours} & \cite{dk19} & \cite{lhy+19} & \cite{ccbj18} \\ \hline
%         Function & $f$ & $f$ & $F$ & $f$ \\ \hline
%         Parameter &  & $\theta$ & $w$ & $x$ \\ \hline
%         Dimension & $d$ & $p$ & Never & $d$ \\ \hline
%         Smooth & $L$ & $M$ & $L$ & $L$ \\ \hline
%         Strongly convex & $m$ & $m$ & $\mu$ & $m$ \\ \hline
%         Global & & $K$ & $T$ & \\ \hline
%         Local steps & & $1$ & $K$ & \\ \hline
%         Variance & $\sigma^2 d$ & $\sigma^2 p$ & $\sigma^2$ & $\sigma^2 d$ \\ \hline
%         Learning rate & $\eta$ & $h$ & $\eta$ & $\delta$ \\ \hline
%         Choice of LR & & $h = 1/(m+M)$ & $\eta = 2 / (\mu T) $ &  \\\hline
%         \#Devices & & 1 & $N$ & \\ \hline
%         \#Datas per client & & & $n_k$ & \\ \hline
%     \end{tabular}
%     \caption{Notations to compare different papers. We put this table for easy of writing. There is no need to keep this in the final paper.}
%     \label{tab:my_label}
% \end{table}




\Wei{To do:  independence on K. \textcolor{green}{Done}}

\Wei{To do: remove the variance effect (learning rate) or propose a better rate. \textcolor{green}{Done}}

\Wei{To do: require a continuous version for Bounded divergence \textcolor{green}{Done}}

\Wei{To do: think about a name for the paper, FedAvg Langevin Dynamics?}

\Wei{To do: Decay learning rate \textcolor{green}{on hold}}

\Wei{To do: When we decay learning rate, it may be harder to prove the L2 bound. \textcolor{green}{on hold}}

\Wei{To do: independent noise in each local client? \textcolor{green}{Done}}

\Wei{To do: convex case or non-convex case?}

\Wei{To do: sampling schemes for different clients}

\Wei{To do: Connection to optimization?}

\Wei{To do: conditional expectation filtration?}

\Wei{do we need to unify the words, client or device?}

\subsection{Main result}

% \subsubsection{Notations}

\subsubsection{Assumptions}

\begin{assumption}[Smoothness]\label{def:smooth} For each $c\in [N]$, we say $f^c$ is $L$-smooth if for some $L>0$
\begin{align*}
\| \nabla f^c(y)-\nabla f^c(x) \|_2 \leq L \| y-x \|_2,\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

Note that the above assumption implies that
\begin{align*}
f^c(y)\leq f^c(x)+\langle \nabla f^c(x),y-x \rangle+\frac{L}{2}\| y-x \|^2_2\quad \forall x, y\in \R^d.
\end{align*}

\begin{assumption}[Strongly convex]\label{def:strong_convex}
For each $c\in [N]$, $f^c$ is $m$-strongly convex if for some $m>0$
\begin{align*}
f^c(x)\geq f^c(y)+\langle \nabla f^c(y),x-y \rangle + \frac{m}{2} \| y-x \|_2^2\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

\begin{assumption}[Bounded variance]\label{def:variance}
For each $c\in [N]$, the variance of noise in the stochastic gradient $\nabla \tilde f^c(x)$ in each client is upper bounded such that 
\begin{align*}
\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2] \leq \sigma^2 d,\quad \forall x\in \R^d.
\end{align*}
\end{assumption}

% Wei's first algorithm formulation based on standard SGLD
% \subsection{Formulation}
% Let $\theta_k\in\R^d$ be the $k$-th iterate of the following stochastic gradient Langevin algorithm.
% \begin{align}\label{eq:sgld}
%     \theta_{k+1}=\theta_k -\eta \nabla \widetilde f(\theta_k)+\sqrt{2\tau\eta}\xi_k,
% \end{align}
% where $\eta$ is the learning rate, $\tau$ is the temperature, $\xi_k$ is a standard $d$-dimensional Gaussian vector, and $\nabla \widetilde f(\theta)$ is an unbiased estimate of the exact gradient $\nabla f(\theta)$.

% \subsubsection{Tools from previous work}

% \textbf{Gronwall's inequality} is a standard tool for obtaining estimates of differential equations. Suppose that $a(\cdot)$, $b(\cdot)$, and $\psi(\cdot)$ are continuous real-valued functions that satisfy
% \begin{align*}
%     \frac{\d}{\d t}\psi(t)\leq a(t)\psi(t)+b(t).
% \end{align*}
% Then 
% \begin{align*}
%     \psi(t)\leq \psi(t_0)e^{\int_{t_0}^t a(s)\d s} + \int_{t_0}^t e^{\int_{s}^t a(u)\d u}b(s)\d s.
% \end{align*}

% \textbf{Burkholder-Davis-Gundy inequality} Let $\phi:[0, \infty)\rightarrow \mathbb{R}^{r\times d}$ for some positive integers $r$ and $d$. In addition, we assume $\E{\int_0^{\infty} |\psi(s)|^2 \d s}<\infty$ and let $Z(t)=\int_0^t \psi(s)\d W_s$, where $W_s$ is a $d$-dimensional Brownian motion. Then for all $t\geq 0$, we have

% \begin{align*}
%     \E{\sup_{0\leq s\leq t} |Z(s)|^2}\leq 4\E{\int_0^t|\phi(s)|^2\d s}.
% \end{align*}



\subsubsection{Wasserstein Distance}

% We denote the Borel $\sigma$-algebra 
We define the 2-Wasserstein distance between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
\begin{align*}
    W_2(\mu, \nu):=\inf_{\Gamma\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \Gamma(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
\end{align*}
where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$, where $\mathcal{L}(\cdot)$ denotes a distribution of a random variable.



\begin{lemma}[Contraction property]
\label{contraction}
Assume assumptions \ref{def:smooth} and \ref{def:strong_convex} hold. For any learning rate $\eta \in (0, \frac{1}{L+m}]$, any $\theta, \beta\in\mathbb{R}^d$, % simulated from Eq.\eqref{fed_avg_langevin_dynamics} and Eq.\eqref{continuous_dynamics}, respectively, 
we have
\begin{align*}
    &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2\leq (1-\eta m) \cdot \|\beta-\theta \|_2^2+4\eta L\sum_{c=1}^N p_c \cdot ( \| \beta^c-\beta \|_2^2 + \|\theta^c-\theta \|_2^2 ).
\end{align*}

\end{lemma}

\begin{lemma}[Time error]\label{lem:discretization}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any $s\geq 0$, any learning rate $\eta \in (0 , \frac{2}{m})$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, the iterates of $(\bar \theta_s)$ based on the continuous dynamics of Eq.\eqref{continuous_dynamics} satisfy the following estimate
\begin{align*}
    \E{ \big\| \bar\theta^c_{s} - \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 8\eta^2 d\kappa^2\big(\frac{\gamma}{d}+m\tau\big)+16\eta d\tau.
\end{align*}
\end{lemma}

\begin{definition}\label{def:H_kappa_gamma}
We define the parameters $H_{\tau}$, $\kappa$ and $\gamma$ as follows:
\begin{align*}
    H_{\tau}^2: = & ~ 14 \kappa^2 \cdot  (m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2 ),\\
    \kappa := & ~ L / m , \\
    \gamma : = & ~ \max_{c \in [N]} \| \nabla f^c (\theta_*) \|_2^2 .
\end{align*}
\end{definition}

\begin{lemma}[Bounded divergence]\label{divergence}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ upper bound of the divergence between local clients and the center as follows
\begin{align*}
    \sum_{c=1}^N p_c\E{\|\theta_k^c-\theta_k \|_2^2}&\leq 2(K-1)^2\eta^2 dH_{\tau}^2 +4(K-1)\eta d\tau,\notag
\end{align*}
where $H_{\tau}$, $\kappa$ and $\gamma$ are defined in Definition~\ref{def:H_kappa_gamma}. % $H_{\tau}^2=14 \kappa^2 \cdot  (m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2 )$, $\kappa=L/m$, and  $\gamma:=\max_{c\in[N]}\| \nabla f^c(\theta_*) \|_2^2$.
\end{lemma}






\begin{lemma}[Bounded variance] 
\label{lem:total_variance}
Given assumption \ref{def:variance}, we have 
\begin{equation*}
    \E{ \|\nabla f(\theta)-\nabla \tilde f(\theta) \|_2^2}\leq d \cdot \sigma^2 ,\qquad \forall \ \theta\in\R^d.
\end{equation*}
\end{lemma}

% \begin{lemma}[To be proved] 
% \label{lem:gradient_bound}
% Given a client index $c\in[N]$ and assumption XXX\Wei{will fix later}, we have 
% \begin{equation*}
%     \E{\lrn{\nabla f(\bar\theta_s^c)-\nabla f(\bar\theta^c_{k\eta})}_2^2}\leq C \eta^2
% \end{equation*}
% \end{lemma}



\begin{lemma}[One step update]\label{one_step_Dalalyan}

Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Consider any learning rate $\eta \in (0 , \frac{1}{m+L})$ and any initialization satisfying $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, where $\theta_*$ is the global minimizer of the function $f$. Then
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2} ) \cdot W^2_2(\mu_{k}, \pi)+ 12 \eta^3 d(K-1)^2 LH_{\tau}^2+2\eta^3 d \frac{L^2}{m} H_{\tau}^2\notag\\
    &\quad\quad+24(K-1)\eta^2 dL\tau+32\eta^2 \frac{L^2}{m} d\tau+\eta^2 \sigma^2 d,
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, and $H_{\tau}$, $\kappa$ and $\gamma$ are defined in Definition~\ref{def:H_kappa_gamma}. %$H_{\tau}^2=14 \kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$, $\kappa=L/m$, and  $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$.
\end{lemma}
% \Wei{does this $\frac{\eta m}{2}$ make sense? Is there a tradition such that a rate of $\eta m$ is required?}

\subsection{Full device participation}

\subsubsection{Fast convergence via correlated noise}


\begin{theorem} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Given a constant learning rate $\eta\in (0, \frac{1}{m+L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)\notag\\
    &\quad\quad+8\sqrt{{\eta} d \kappa} \cdot \bigg(\sqrt{\eta ((K-1)^2+\kappa) H_{\tau}^2}+\sqrt{K\tau}+{\sqrt{\sigma^2/L}} \bigg),\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $H_{\tau}^2=14 \kappa^2\big(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\big)$, $\kappa=L/m$, and  $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$. The constants are not intended to be optimized.
\end{theorem}


\begin{proof}
Iteratively applying Lemma \ref{one_step_Dalalyan} and arranging terms, we have that
\begin{align*}
    W_2^2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+  \frac{2}{\eta m}\bigg(12 \eta^3 d(K-1)^2 LH_{\tau}^2+2\eta^3 d \frac{L^2}{m} H_{\tau}^2\notag\\
    &\quad\quad+24(K-1)\eta^2 dL\tau+32\eta^2 \frac{L^2}{m} d\tau+\eta^2 \sigma^2 d\bigg)\notag\\
    &\leq \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+ 24 \eta^2 d(K-1)^2 \kappa H_{\tau}^2+4\eta^2 d \kappa^2 H_{\tau}^2\notag\\
    &\quad\quad+48(K-1)\eta d\kappa \tau+64\eta \kappa d\tau+2\kappa \frac{1}{L}\eta \sigma^2 d,\notag
\end{align*}
where $\kappa=\frac{L}{m}$. By Lemma \ref{lem:W2_init_bound} and the initialization condition $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have that
\begin{align*}
W_2(\mu_0, \pi)\leq \sqrt{2d}(\mathcal{D} +  \sqrt{\tau/m} ).
\end{align*}

Applying the inequality $(1-\frac{\eta m}{2})\leq (1-\frac{\eta m}{4})^2$ and  $\sqrt{a+b+c}\leq \sqrt{a}+\sqrt{b}+\sqrt{c}$ completes the proof. \end{proof}



\subsubsection{Preserving privacy via decorrelated noise}

Note that Algorithm \ref{alg:alg_main_text_same_seed} requires all the local clients to generate the same noise with correlation 1. Such a mechanism enjoys the compactness but cannot fully utilize the noise for protecting the privacy. For extensions, it can be naturally adapted to less correlated noise. Given a target correlation coefficient $\rho\in [0, 1]$, we replace Eq.\eqref{local_client} with 
\begin{equation}\label{local_client_diff_seeds}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla f^c(\theta_k^c)+\sqrt{2\tau \rho^2}\xi_k + \sqrt{2(1-\rho^2)N\tau}\xi_k^c,
\end{equation}
where $\xi_k$ is a $d$-dimensional standard Gaussian random variable (sGRV) shared by all the clients at iteration $k$, $\xi_k^c$ is a unique $d$-dimensional sGRV generated by client $c\in [N]$ only. Moreover, $\xi_k$ is independent of $\xi_k^c$ for any $c\in[N]$. Following the same synchronization step based on Eq.\eqref{virtual_seq}, Eq.\eqref{sum_grad}, Eq.\eqref{fed_avg_langevin_dynamics_preliminary}, we have
\begin{equation}
\label{fed_avg_langevin_dynamics_pp}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\tau}\Xi_k,
\end{equation}
where $\Xi_k=\rho \xi_k + \sqrt{1-\rho^2}\sum_{c=1}^N p_c \sqrt{N}\xi_k^c$. Since the variance of independent Gaussian variables is additive, $\Xi_k$ follows the standard $d$-dimensional Gaussian distribution when the weights are balanced, i.e., $p_c=\frac{1}{N}$ for all $c\in[N]$ (in general, its covariance is $(\rho^2+(1-\rho^2)N\sum_{c=1}^N p_c^2) I_d$). The inclusion of the additional noise naturally protects the privacy and yield balalalala \Wei{Need Zhao to polish the writing}. We refer to the algorithm with less correlated noise as the Privacy-preserved Federated Averaging Langevin dynamics and present it in Algorithm \ref{alg:alg_main_text_different_seeds}.

Since the inclusion of additional noise doesn't affect the formulation of Eq.\eqref{fed_avg_langevin_dynamics_pp}, the algorithm property maintains the same except the scale of the temperature $\tau$ is changed and Gaussian noise proposed in different clients are correlated. Based on a target correlation coefficient $\rho\geq 0$, Eq.\eqref{local_client_diff_seeds} is equivalent to applying a temperature $T_{\rho}=\tau(\rho^2+(1-\rho^2)N)$. In particular, $T_1=\tau$, which exactly recovers Algorithm \ref{alg:alg_main_text_same_seed}; however, setting $\rho=0$ leads to $T_0=\tau N$, where the noise in local clients is magnified by $N$ times. Now we adjust the analysis as follows
\begin{theorem} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Given a constant learning rate $\eta\in (0, \frac{1}{m+L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)\notag\\
    &\quad\quad+8\sqrt{{\eta} d \kappa} \cdot \bigg(\sqrt{\eta ((K-1)^2+\kappa) H_{T_{\rho}}^2}+\sqrt{K T_{\rho}}+{\sqrt{\sigma^2/L}} \bigg),\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $H_{T_{\rho}}^2=14 \kappa^2\big(m^2 \mathcal{D}^2+m T_{\rho} +\frac{\gamma}{d}+\sigma^2\big)$, $T_{\rho}=\tau(\rho^2+(1-\rho^2)N)$. The constants are not intended to be optimized.
\end{theorem}

Admittedly, there is no free lunch, the inclusion of additional noise slows down the convergence. \Wei{Need Zhao here to polish the writing.}

\begin{algorithm*}[h]\caption{Privacy-preserved Federated Averaging Langevin dynamics Algorithm (pFedAvgLD). Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. A global synchronization is conducted every $K$ steps.}\label{alg:alg_main_text_different_seeds}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client_diff_seeds_v2}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla f^c(\theta_k^c)+\sqrt{2\tau \rho^2}\xi_k + \sqrt{2(1-\rho^2)N\tau}\xi_k^c,
\end{equation}
\State
\begin{equation}  
\label{synchronization_diff_seeds}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}

\subsection{Partial device participation}

Full device participation enjoys appealing convergence properties. However, it suffers from the straggler's effect in real-world applications, where the communication is limited by the slowest device. Partial device participation handles this issue by only allowing a small portion of devices in each communication and greatly increases the communication efficiency %fault-tolerant capability 
in a federated network. 

\paragraph{Updating scheme}

\begin{algorithm*}[h]\caption{Federated Averaging Langevin dynamics Algorithm (FedAvgLD) with partial device participation. A global synchronization is conducted every $K$ steps. $\mathcal{S}_k$ is a subset that contains $S$ indices according to a device-sampling rule based on scheme \text{I} or \text{II}.}\label{alg:alg_main_text_partial}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client_partial}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla f^c(\theta_k^c)+\xi_k,
\end{equation}
\State
\begin{equation}  
\label{synchronization_partial}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c\in \mathcal{S}_k} \frac{1}{S} \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
\end{algorithmic}
\end{algorithm*}

The first device-sampling scheme \text{I} \cite{LS20} selects a total of $S$ devices, where the $c$-th device is selected with a probability $p_c$. The first theoretical justification for convex optimization has been proposed by \cite{lhy+19}. However, to the best of our knowledge, the convergence analysis of sampling algorithms is missing in the federated-learning literature. 


\paragraph{(Scheme \text{I}: with replacement).}
Assume $\mathcal{S}_k=\{n_1, n_2, \cdots, n_S\}$, where $n_j\in [N]$ is a random number that takes a value of $c$ with a probability $p_c$ for any $j\in\{1,2,\cdots, S\}$. The synchronization step follows that $\theta_{k+1}=\frac{1}{S}\sum_{c\in \mathcal{S}_k}\theta_{k+1}^c$.

Another strategy is to uniformly select $S$ devices without replacement. We follow  \cite{lhy+19} and assume $S$ indices are selected uniformly without replacement and the synchronization step is the same as before. In addition, the convergence also requires an additional assumption on balanced data \cite{lhy+19}. 
\paragraph{(Scheme \text{II}: without replacement).}  Assume $\mathcal{S}_k=\{n_1, n_2, \cdots, n_S\}$, where the indices $n_j\in [N]$ are drawn uniformly without replacement, so that each $n_j$ takes a value of $c$ with a marginal probability $\frac{1}{N}$ for any $j\in\{1,2,\cdots, S\}$. Assume the data is balanced such that $p_1=\cdots=p_N=\frac{1}{N}$. The synchronization step follows that $\theta_{k+1}=\frac{N}{S}\sum_{c\in \mathcal{S}_k} p_c\theta_{k+1}^c=\frac{1}{S}\sum_{c\in \mathcal{S}_k} \theta_{k+1}^c$.


\paragraph{Notation: }


\begin{lemma}[Bounded divergence based on partial device]\label{divergence_partial}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the following results

For Scheme \text{I}, the divergence between $\bar\theta_k$ and $\theta_k$ is upper bounded by
\begin{align*}
    \E{\|\bar\beta_k-\bar\theta_k \|_2^2}&\leq ???.\notag
\end{align*}

For Scheme \text{II}, assuming the data is balanced such that $p_1=\cdots=p_N=\frac{1}{N}$, the divergence between $\bar\theta_k$ and $\theta_k$ is upper bounded by
\begin{align*}
    \E{\|\bar\beta_k-\bar\theta_k \|_2^2}&\leq ???.\notag
\end{align*}
where $\textcolor{red}{H}, \kappa$ and $\gamma$ are defined as Definition~\ref{def:H_kappa_gamma}. % $H_{\tau}^2=14 \kappa^2 \cdot  (m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2 )$, $\kappa=L/m$, and  $\gamma:=\max_{c\in[N]}\| \nabla f^c(\theta_*) \|_2^2$.
\end{lemma}

\begin{proof}

\end{proof}




\subsection{Connection with optimization}

\begin{theorem}[Connection with optimization]

TBD. Follow \cite{Dalalyan17}

\end{theorem}


\begin{theorem}[Different seeds?]

Seems to require a larger temperature in local, which is equivalent to setting a large temperature $\tau N$ and leads to a bad error, but still doable considering the contributions in privacy. 

Are there any results that analyze the average of correlated Gaussian?

% reference: http://web.ipac.caltech.edu/staff/fmasci/home/astro_refs/SumOfCorrelatedRVs.pdf

\end{theorem}


\begin{theorem}[Non-convex?]

May not have enough time.

\end{theorem}









\subsection{Important lemmas}

\begin{proof}[Proof of Lemma \ref{contraction}] 
% Let $X_k:=\nabla f(\theta_k+\delta_k)-\nabla f(\theta_k)$, $\delta_k:=\bar\theta_{k\eta}-\theta_{k}$ and $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$, where  $\theta_k$ and $\bar\theta_{t}$ are simulated from Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_dynamics}, respectively. Given xxx, we have

Given a client index $c\in[N]$, applying Theorem 2.1.12 \cite{Nesterov04} leads to
\begin{align}
\label{special_inner_product}
    \langle y-x, \nabla f^c(y)-\nabla f^c(x) \rangle\geq \frac{m L}{L+m}\lrn{y-x}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(y)-\nabla f^c(x)}_2^2,\quad \forall x,y\in\mathbb{R}^d.
\end{align}

In what follows, we have
\begin{align}
\label{iteration}
    &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2\notag\\
    &=\lrn{\beta-\theta}_2^2 -2\eta \underbrace{\langle \beta-\theta, \nabla f(\beta)-\nabla f(\theta)\rangle}_{\mathcal{I}}+\eta^2 \lrn{\nabla f(\beta)-\nabla f(\theta)}_2^2.
\end{align}

For the second item $\mathcal{I}$ in the right hand side, we have
\begin{align}
\label{target_contraction}
    \mathcal{I}&=\sum_{c=1}^N p_c\big\langle \beta-\theta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &=\sum_{c=1}^N p_c\big\langle \beta-\beta^c+\beta^c-\theta^c+\theta^c-\theta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &=-\sum_{c=1}^N p_c\left(\big\langle \beta^c-\beta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle+\big\langle \theta-\theta^c, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\right)\notag\\
    &\quad\quad+\sum_{c=1}^N p_c\big\langle \beta^c-\theta^c, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &\geq -\sum_{c=1}^N p_c \cdot \big((m+L)\lrn{\beta^c-\beta}_2^2+(m+L)\lrn{\theta^c-\theta}_2^2+\frac{1}{2(m+L)}\lrn{\nabla f^c(\beta^c)-\nabla f^c(\theta^c)}_2^2\big)\notag\\
    &\quad\quad+ \sum_{c=1}^N p_c \cdot \big(\frac{m L}{L+m}\lrn{\beta^c-\theta^c}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(\beta^c)-\nabla f^c(\theta^c)}_2^2 \big)\notag\\
    &\geq -(m+L)\sum_{c=1}^N p_c\left(\lrn{\beta^c-\beta}_2^2+\lrn{\theta^c-\theta}_2^2\right) + \frac{m L}{L+m}\lrn{\beta-\theta}_2^2 \notag\\
    &\quad\quad+ \frac{1}{2(L+m)} \lrn{\nabla f(\beta)-\nabla f(\theta)}_2^2,
\end{align}
where the first inequality follows by the AM-GM inequality and Eq.\eqref{special_inner_product}, respectively; the last inequality follows by Jensen's inequality, applied to both the iterates and the gradients, such that $\sum_{c=1}^N p_c \| \beta^c-\theta^c \|_2^2\geq \|\sum_{c=1}^N p_c  (\beta^c-\theta^c ) \|_2^2$.

Plugging Eq.\eqref{target_contraction} into Eq.\eqref{iteration}, we have
\begin{align*}
    &\quad\lrn{\beta-\theta-\eta \cdot (\nabla f(\beta)-\nabla f(\theta))}_2^2\notag\\
    &\leq \big(1-\frac{2\eta mL}{m+L}\big) \cdot \| \beta-\theta \|_2^2+\eta\big(\underbrace{\eta-\frac{1}{m+L}}_{\leq 0 \text{ if } \eta\leq \frac{1}{m+L}}\big) \cdot \| \nabla f(\beta)-\nabla f(\theta) \|_2^2\notag\\
    &\quad\quad+2\eta(m+L)\sum_{c=1}^N p_c \cdot (\| \beta^c-\beta \|_2^2+\| \theta^c-\theta \|_2^2 )\notag\\
    &\leq \left(1-\eta m\right) \|\beta-\theta \|_2^2+4\eta L\sum_{c=1}^N p_c \cdot \big(\| \beta^c-\beta \|_2^2+\| \theta^c-\theta \|_2^2\big),\notag
\end{align*}
where the last inequality follows by $\frac{2L}{m+L}\geq 1$, $m\leq L$, $1-2a\leq (1-a)^2$ for any $a$, and $\eta\in(0, \frac{1}{m+L}]$.

% Applying $\sqrt{a+b}\leq \sqrt{a}+\sqrt{b}$, we have
% \begin{align*}
%     &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2\leq \left(1-\frac{\eta m}{2}\right)^2\lrn{\beta-\theta}_2^2+2\eta(m+L)\sum_{c=1}^N p_c\left(\lrn{\beta^c-\beta}_2+\sqr\lrn{\theta^c-\theta}_2^2\right),\notag
% \end{align*}

\end{proof}


\begin{proof}[Proof of Lemma \ref{lem:discretization}]
For any $s\in[0,\infty)$, there exists a nonnegative integer $k$ such that $s\in [k\eta, (k+1)\eta)$. By the continuous dynamics of Eq.~\eqref{continuous_dynamics}, we have
\begin{align*}
    \bar\theta_{s}^c = \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor}-(s-k\eta) \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })+\sqrt{2\tau}\int_{k\eta}^s \d \overline{W}_t,
\end{align*}
which suggests that 
\begin{align*}
    \sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2 \leq (s-k\eta) \big\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2+\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \overline{W}_t}_2.
\end{align*}
We first square the terms on both sides, then apply Young's inequality and take the expectation:
\begin{align*}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 2\E{\big\|(s-k\eta)\nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+2\E{\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \overline{W}_t}_2^2}.
\end{align*}
Then, by Burkholder-Davis-Gundy inequality and It\^{o} isometry, we have
\begin{align}
    \label{eq:1st_part}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}&\leq 2\E{ \big\| (s-k\eta)\nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+8\sum_{i=1}^d\E{\int_{k\eta}^s 2\tau \d t} \notag \\
    &\leq 2\eta^2\E{ \big\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+16 \eta d\tau.
\end{align}

By Young's inequality and the smoothness assumption \ref{def:smooth},  we have
\begin{align}\label{eq:2nd_part}
    \E{ \| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \|_2^2}
    = & ~ \E{\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*) +\nabla f^c(\theta_*) \|_2^2} \notag \\
    \leq & ~ 2\E{\| \nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*) \|_2^2} +2{\lrn{\nabla f^c(\theta_*) }_2^2} \notag \\
    \leq & ~ 2L^2 \E{\|\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }-\theta_*\|_2^2}+2\gamma\notag\\
    \leq & ~ 2L^2 \left(\frac{1}{m}\bigg(\frac{\gamma}{m}+2d\tau\bigg)\right)+ 2\gamma\notag\\
    \leq & ~ 4 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg),
\end{align}
where the third inequality follows by Lemma \ref{lem:L2_bound_local_continuous}, the fourth step holds since $\kappa \geq 1$. Combining Eq.~\eqref{eq:1st_part} and Eq.~\eqref{eq:2nd_part}, we have
\begin{align*}
\E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}
&\leq 8\eta^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16\eta d\tau.\notag
\end{align*}

\end{proof}



% \Wei{if no decay of learning rate is required, we may polish for a better rate here}
\begin{proof}[Proof of Lemma \ref{divergence}] For any $k \ge 0$, let $k_0=K\lfloor \frac{k}{K}\rfloor $ be the most recent synchronization step, so that $k_0\leq k$ and $\theta_{k_0}^c=\theta_{k_0}$ for any $c\in[N]$. It is clear that  $k-k_0 \leq K-1$ for all $k\geq 0$.

By the iterate Eq.\eqref{fed_avg_langevin_dynamics}, we have
\begin{align*}
\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}-(\theta_k-\theta_{k_0})}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c \E{\sum_{j=k_0}^{k-1} 2(K-1)\eta^2\lrn{\nabla\tilde f^c(\theta_j^c)}_2^2 + 4(K-1)\eta d\tau}\notag\\
&\leq \sum_{c=1}^N p_c \left(\sum_{j=k_0}^{k-1} 2(K-1)\eta^2\E{\lrn{\nabla\tilde f^c(\theta_j^c)}_2^2}+4(K-1)\eta d\tau\right)\notag\\
&\leq 2(K-1)^2\eta^2 dH_{\tau}^2 +4(K-1)\eta d\tau,
\end{align*}
where the first inequality holds by $\E{\| \theta-\E{\theta} \|_2^2}\leq \E{\|\theta \|_2^2}$ for a random variable $\theta$; the second inequality follows by $(\sum_{i=1}^{K-1} a_i)^2\leq (K-1)\sum_{i=1}^{K-1} a_i^2$; the last inequality follows by Lemma \ref{bounded_gradient_l2}, where $H_{\tau}^2=14 \kappa^2 \cdot (m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2 )$.

% \textbf{Continuous-time diffusion: } For any time $t\geq 0$ at the $k$-th iteration and the closest synchronization time $t_0$ at the $k_0$-th iteration, we have
% \begin{align*}
% \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}-(\bar\theta_{t}-\bar\theta_{t_0})}_2^2}\notag\\
% &\leq \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}}_2^2}\notag\\
% &\leq 8(t-t_0)^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16(t-t_0) d\tau \notag\\
% &\leq 8(K-1)^2\eta^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16(K-1)\eta d\tau\notag\\
% &\leq (K-1)^2\eta^2 d H_{\tau}^2+16(K-1)\eta d\tau, \notag
% \end{align*}
% where the second inequality follows by applying Lemma \ref{lem:discretization} by treating the learning rate as $t-t_0$; the third inequality follows since the $k$-th iteration and the $k_0$-th iteration has a time difference at most $(K-1)\eta$; the last inequality holds since $H_{\tau}^2=14 \kappa^2\bigg(m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2\bigg)$.


% By definition, $\bar\theta_{t}^c-\bar\theta_{t_0}=-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}$. Apply Young's inequality, we have
% \begin{align*}
%     \E{\lrn{\bar\theta_{t}^c-\bar\theta_{t_0}}_2^2}&=2\E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s}_2^2} +2\E{\lrn{\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\notag\\
%     &\leq 2(t-t_0)\E{\int_{t_0}^t \lrn{\nabla f^c(\bar\theta_s^c)}_2^2\d s}
% \end{align*}

% By H\"{o}lder's inequality, we have
% \begin{align*}
% \sum_{c=1}^N p_c\E{\lrn{\bar\theta_{t}^c-\bar\theta_{t}}_2^2}&\leq \sum_{c=1}^N p_c { (t-t_0)\int_{t_0}^{t} \E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\d s}\notag\\
% &\leq \sum_{c=1}^N p_c { (t-t_0)\int_{t_0}^{t} \E{\lrn{-\int_{t_0}^t \nabla f^c(\bar\theta^c_s)\d s +\sqrt{2}\cdot \overline{W}_{t-t_0}}_2^2}\d s}\notag\\
% &\leq 4(K-1)^2\eta^2 H_{\tau}^2,
% \end{align*}

\end{proof}



\begin{proof}[Proof of Lemma \ref{lem:total_variance}] By assumption \ref{def:variance}, we have
\begin{align*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}&=\E{\lrn{\sum_{c=1}^N p_c\bigg(\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)\bigg)}_2^2}\\
    &=\sum_{c=1}^N p_c^2\E{\lrn{\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)}_2^2}\\
    &\leq d \sigma^2 \sum_{c=1}^N p_c^2\leq d\sigma^2 \left(\sum_{c=1}^N p_c\right)^2=d\sigma^2.
\end{align*}

\end{proof}



\begin{proof}[Proof of Lemma \ref{one_step_Dalalyan}]


The solution of the continuous-time process Eq.\eqref{continuous_dynamics} satisfies
\begin{align}
\label{solution_continuous_dynamics}
    \bar\theta_t=\bar\theta_0 -\int_0^t \nabla f(\bar\theta_s)\d s + \sqrt{2}\cdot\overline{W}_t, \qquad \forall t\geq 0.
\end{align}


Set $t\rightarrow(k+1)\eta$ and $\bar\theta_0\rightarrow\bar\theta_{k\eta}$ for Eq.\eqref{solution_continuous_dynamics} and consider a synchronous coupling such that $W_{(k+1)\eta}-W_{k\eta}:=\xi_k$
\begin{align}
\label{continuous_one_step}
    \bar\theta_{(k+1)\eta}&=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2} (W_{(k+1)\eta}-W_{k\eta})\notag\\
    &=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2}\xi_k.
\end{align}

We first denote $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$. Subtracting Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_one_step} yields that
\begin{align*}
    &\quad \bar\theta_{(k+1)\eta}-\theta_{k+1}\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}+\eta \nabla \tilde f(\theta_k) - \int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla\tilde f(\theta_k)\bigg) - \int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\underbrace{\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla f(\theta_k)}_{:=X_k}\bigg)- \underbrace{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_{:=Y_k} +\eta\zeta_k.\notag
\end{align*}

Taking square and expectation on both sides, we have
\begin{align}
\label{reestimate}
    &\quad\ \E{\|\bar\theta_{(k+1)\eta}-\theta_{k+1} \|_2^2}\notag\\
    &=\E{\| \bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k \|_2^2}+\E{\| \eta\zeta_k \|_2^2}+2\eta\underbrace{\E{\langle\bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k,  \zeta_k\rangle}}_{\E{\zeta_k}=0}\notag\\
    &\leq (1+q) \cdot \E{\| \bar\theta_{k\eta}-\theta_{k}-\eta X_k \|_2^2}+ ( 1 + 1 / q ) \cdot \E{\|Y_k \|_2^2}+\E{ \| \eta\zeta_k \|_2^2}\notag\\
    &\leq (1+q) \cdot \big( (1-\eta m) \cdot \E{\| \bar\theta_{k\eta}-\theta_k \|_2^2}+4\eta L\sum_{c=1}^N p_c \cdot \left(\E { \| \bar\theta_{k\eta}^c-\bar\theta_{k\eta} \|_2^2}+\E{\| \theta_k^c-\theta_k \|_2^2}\right) \big)\notag\\
    &\quad\quad + ( 1 + 1/q ) \cdot \E{ \| Y_k \|_2^2 } + \eta^2\sigma^2 d\notag\\
    &\leq (1+q) \cdot \bigg(\underbrace{\left(1-\eta m\right)}_{\phi}\E{ \| \bar\theta_{k\eta}-\theta_k \|_2^2}+8 \eta^3 d(K-1)^2 LH_{\tau}^2+16(K-1)\eta^2 dL\tau\bigg)\notag\\
    &\quad\quad + (1+ 1 / q ) \cdot \E{ \| Y_k \|_2^2}+\eta^2\sigma^2  d,
\end{align}
where the first inequality follows by the AM-GM inequality for any $q>0$, and the second inequality follows by Lemma \ref{contraction} and Assumption \ref{def:variance}. The third inequality follows by Lemma \ref{divergence} together with the fact that the continuous-time process is synchronized at every time step, so that $\bar\theta^c_{k\eta}=\bar\theta_{k\eta}$. Since the learning rate satisfies $\eta\leq\frac{1}{m+L}\leq \frac{2}{m}$, the learning-rate requirement is clearly satisfied.


Recalling that $\phi=1-\eta m$, we get $\frac{1+\phi}{2}=1-\frac{1}{2}\eta m$. Choose $q=\frac{1+\phi}{2\phi}-1$ so that $(1+q)\phi=\frac{(1+\phi)}{2}=1-\frac{1}{2}\eta m$. In addition, we have $1+\frac{1}{q}= \frac{1+q}{q}=\frac{1+\phi}{1-\phi}\leq \frac{2}{\eta m}$.  It follows that
\begin{align}
    \label{nice_inequality_v0}
    (1+q) \cdot (1-\eta m)\leq 1-\frac{1}{2}\eta m,  \quad  1+q\leq \frac{1-\frac{1}{2}\eta m}{1-\eta m}\leq 1.5, \quad (1 + 1/q )\leq \frac{2}{m\eta},
\end{align}
where the second inequality holds because $\eta\in (0, \frac{1}{m+L}]$ and $\frac{1}{m+L}\leq \frac{1}{2m}$ (since $m\leq L$).


% Applying Minkowski's inequality, we have
% \begin{align}
% \label{almost_final}
%     \E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}^{1/2}\leq \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2^2}^{1/2} + \E{\lrn{Y_k}_2^2}^{1/2}.
% \end{align}


% \Wei{should use this one to obtain a sharper result}
% \begin{align}
% \label{almost_final}
%     \E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}&= \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k}_2^2} + \eta^2\E{\lrn{\zeta_k}_2^2}\notag\\
%     &\leq \bigg(\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}^{1/2}+\E{\lrn{Y_k}_2^2}^{1/2}\bigg)^2 + \eta^2\E{\lrn{\zeta_k}_2},
% \end{align}

% For the first term in the previous result, taking square and expectation, we have
% \begin{align}
% \label{estimate_of_first_term}
%     &\quad\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2^2}\notag\\
%     &=\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\E{\lrn{\eta\zeta_k}_2^2}+2\eta\underbrace{\E{\langle\bar\theta_{k\eta}-\theta_{k}-\eta X_k,  \zeta_k\rangle}}_{=0}\notag\\
%     &\leq \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\eta^2\sigma^2 d\notag \\
%     &\leq \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+\eta^2\sigma^2 d\notag \\
%     &\quad\quad +2\eta(m+L)\sum_{c=1}^N p_c\left(\E{\lrn{\bar\theta_{k\eta}^c-\bar\theta_{k\eta}}_2^2}+\E{\lrn{\theta_k^c-\theta_k}_2^2}\right)\notag\\
%     &\leq  \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+12 \eta^3 d(K-1)^2 LH_{\tau}^2+\eta^2\sigma^2 d+80(K-1)\eta^2 dL\tau,
% \end{align}
% \Wei{cannot go through with order 2; seems to work if we apply $\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k+\eta \zeta_k}_2^2}\leq (1-\eta m)\E{\lrn{\bar\theta_{k\eta}-\theta_{k}}_2^2}+\eta^3+\eta^2$, $\E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}\leq (1+\gamma)\E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}+\underbrace{(1+\frac{1}{\gamma}) \E{\lrn{Y_k}_2^2}}_{\text{order becomes worse from 3 to 2}}+\eta^2\sigma^2 d$ based on 1+gamma and 1-gamma trick}
% \begin{align}
% \label{estimate_of_first_term}
%     \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta X_k}_2^2}&= \E{\lrn{\bar\theta_{k\eta}-\theta_{k}-\eta (\nabla f(\bar\theta_{k\eta})-\nabla f(\theta_k))}_2^2}\notag\\
%     &\leq \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}\notag \\
%     &\quad\quad +2\eta(m+L)\sum_{c=1}^N p_c\left(\E{\lrn{\bar\theta_{k\eta}^c-\bar\theta_{k\eta}}_2^2}+\E{\lrn{\theta_k^c-\theta_k}_2^2}\right)\notag\\
%     &\leq  \left(1-\frac{\eta m}{2}\right)^2\E{\lrn{\bar\theta_{k\eta}-\theta_k}_2^2}+12 \eta^3 d(K-1)^2 LH_{\tau}^2+80(K-1)\eta^2 dL\tau,
% \end{align}
% where the first inequality follows by Lemma \ref{contraction}, and the second inequality follows by Lemma \ref{divergence} and $m\leq L$. Since the learning rate $\frac{1}{m+L}\leq \frac{2}{m}$, the requirement of the learning rate is clearly satisfied.

For the term $\E{ \| Y_k \|_2^2 }$ in Eq.\eqref{reestimate}, we have the following estimate
\begin{align}
\label{y_estimate}
    \E{ \| Y_k \|_2^2}&=\E{\lrn{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_2^2}\notag\\
    &\leq\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})}_2^2}  \d s\notag\\
    &=\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\sum_{c=1}^N p_c \bigg(\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})\bigg)}_2^2}  \d s\notag\\
    &\leq \eta\int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c \cdot \E{\lrn{\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})}_2^2}  \d s\notag\\
    &\leq \eta L^2 \int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c \cdot \E{\lrn{\bar\theta_s^c-\bar\theta^c_{k\eta}}_2^2}  \d s\notag\\
    &\leq \eta L^2  \int_{k\eta}^{(k+1)\eta}  \left(8\eta^2 d\kappa^2\bigg(\frac{\gamma}{d}+m\tau\bigg)+16\eta d\tau\right) \d s\notag\\
    &\leq\eta^4 d L^2 H_{\tau}^2+16\eta^3 L^2 d\tau,
\end{align}
where the first inequality follows by H\"{o}lder's inequality, the second inequality follows by Jensen's inequality, the third inequality follows by Assumption \ref{def:smooth}, the fourth inequality follows by Lemma \ref{lem:discretization}, and the last inequality follows since $8\kappa^2\big(\frac{\gamma}{d}+m\tau\big)\leq H_{\tau}^2$.

Plugging Eq.\eqref{nice_inequality_v0} and Eq.\eqref{y_estimate} into Eq.\eqref{reestimate}, we have
\begin{align*}
    \E{\|\bar\theta_{(k+1)\eta}-\theta_{k+1} \|^2_2}&\leq  (1-\frac{\eta m}{2} ) \cdot \E{\|\bar\theta_{k\eta}-\theta_k\|_2^2}+12 \eta^3 d(K-1)^2 LH_{\tau}^2+24(K-1)\eta^2 dL\tau\notag\\
    &\quad\quad+2\eta^3 d \frac{L^2}{m} H_{\tau}^2+32\eta^2 \frac{L^2}{m} d\tau+\eta^2 \sigma^2 d.
\end{align*}

Choosing the specific Langevin diffusion $\bar\theta$ in the stationary regime, we have $W_2^2(\mu_k,\pi)=\E{\|\bar\theta_{k\eta}-\theta_k \|_2^2}$ and  $W_2^2(\mu_{k+1},\pi)\leq\E{\| \bar\theta_{(k+1)\eta}-\theta_{k+1} \|_2^2}$. Arranging the terms, we have
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2}) \cdot W^2_2(\mu_{k}, \pi)+ 12 \eta^3 d(K-1)^2 LH_{\tau}^2+2\eta^3 d \frac{L^2}{m} H_{\tau}^2\notag\\
    &\quad\quad+24(K-1)\eta^2 dL\tau+32\eta^2 \frac{L^2}{m} d\tau+\eta^2 \sigma^2 d.
\end{align*}

\end{proof}

\subsection{Supporting Lemmas}




\begin{lemma}[Uniform $\ell_2$ upper bound for local clients]
\label{lem:L2_bound_local}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
\sup_k\E{\lrn{\theta_k^c-\theta_*}_2^2}\leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)}_{:=U},\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}


\begin{proof} First, we consider the $k$-th iteration, where $k\in \{1,2,\cdots, K-2, (K-1)_{-}\}$ and $(K-1)_-$ denotes the $(K-1)$-step before synchronization. Following the iterate of Eq.\eqref{local_client} in a local client of $c\in [N]$, we have
	\begin{align}\label{eq:Langevin_L2_1_local}
&\quad\ \E{\lrn{\theta_{k+1}^c-\theta_*}_2^2}\notag\\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + \sqrt{8\eta\tau}\E{ \langle \theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c), \xi_k \rangle } + 2\eta\tau\E{\|\xi_k\|_2^2} \notag \\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + 2\eta d\tau,
	\end{align}	
	where the last equality follows from $\E{\xi_k}=0$ and the conditional independence of $\theta_k^c-\theta_*- \eta\nabla \widetilde f^c(\theta_k^c)$ and $\xi_k$. Note that
\begin{align}\label{eq:ip_1st_local}
%\small
&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla \widetilde f^c(\theta_k^c)\|_2^2} \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2}  \notag\\
& \qquad\qquad + 2 \eta \E{ \langle \theta_k^c-\theta_*-\eta \nabla f^c(\theta_k^c),\nabla f^c(\theta_k^c)-\nabla\widetilde f^c(\theta_k^c) \rangle }  \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2} \notag \\
&\leq \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}  + \eta^2 d\sigma^2, 
\end{align}
where the first step follows from simple algebra, the second step follows from the unbiasedness of the stochastic gradient, and the last step follows from Assumption \ref{def:variance}. For any $q>0$, we can upper bound the first term of Eq.\eqref{eq:ip_1st_local} as follows
\begin{align}\label{eq:ip_2nd_test_theta_star}
	&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}\notag\\
	&=\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*))-\eta\nabla f^c(\theta_*) \|_2^2}\notag\\
	&\leq (1+q)\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)) \|_2^2}+\eta^2 \left(1+\frac{1}{q}\right) \|\nabla f^c(\theta_*)\|_2^2\notag\\
	&\leq (1+q)\underbrace{\left(1-\frac{\eta m}{2}\right)^2}_{\psi^2}\E{\lrn{\theta_k^c-\theta_*}_2^2}+\eta^2 \left(1+\frac{1}{q}\right)\gamma,
\end{align}
where the first inequality follows by the AM-GM inequality;  the second inequality is a special case of Lemma \ref{contraction} based on Assumption \ref{def:strong_convex}, where no local step is involved before the synchronization step. Similar results have been obtained in Theorem 3 of \cite{Dalalyan17}. In addition, $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$.

Choose $q=(\frac{1+\psi}{2\psi})^2-1$ so that $(1+q)\psi^2=\frac{(1+\psi)^2}{4}$. Moreover, since $\psi=1-\frac{\eta m}{2}$, we get $\frac{1+\psi}{2}=1-\frac{1}{4}\eta m$. In addition, we have $1+\frac{1}{q}= \frac{1+q}{q}= \frac{(1+\psi)^2}{(1-\psi)(1+3\psi)}\leq \frac{2}{\eta m}$.  It follows that
\begin{align}
    \label{nice_inequality}
    \eta^2\left(1+\frac{1}{q}\right)\leq \frac{2\eta}{m}.
\end{align}

Combining Eq.~\eqref{eq:Langevin_L2_1_local}, Eq.~\eqref{eq:ip_1st_local}, Eq.~\eqref{eq:ip_2nd_test_theta_star}, and Eq.~\eqref{nice_inequality}, we have the following iterate
\begin{align*}
	\E{\|\theta_{k+1}^c-\theta_*\|_2^2} 
	\leq & ~ \underbrace{\left(1-\frac{\eta m}{4}\right)^2}_{:=g(\eta)} \E{\|\theta_k^c-\theta_*\|_2^2} + 2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}. \notag
\end{align*}

Note that $\frac{1}{1-g(\eta)}=\frac{1}{\frac{\eta m}{2}(1-\frac{\eta m}{8})}\leq \frac{3}{\eta m}$ given $\eta\in (0, \frac{2}{m})$. Recursively applying the above equation $k$ times, where $k\in \{1,2,\cdots, K-1, K_{-}\}$ and $K_-$ denotes the $K$-step without synchronization, it follows that
\begin{align}\label{recursion_v2}
	\E{\|\theta_k^c-\theta_*\|_2^2} &\le g(\eta)^{k}\| \theta_0^c-\theta_*\|_2^2 + \frac{1- g(\eta)^{k}}{1 - g(\eta)} \cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right)  \\
	&\le \|\theta_0^c-\theta_*\|_2^2 + \frac{3}{\eta m} \cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right) \notag\\
	&\leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{ \gamma}{md}\right)}_{:=U},\notag
\end{align}
where the second inequality holds by $g(\eta)\leq 1$, the last inequality holds because $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and $\eta< \frac{2}{m}$.
In particular, the $K$-th step before synchronization yields that
\begin{align}\label{recursion_v3}
	\E{\|\theta_{K_-}^c-\theta_*\|_2^2} &\le d\mathcal{D}^2 +U.
\end{align}
Having all the results ready, for the $K$-th local step after synchronization, applying Jensen's inequality yields
\begin{align}\label{recursion_v4}
	\E{\|\theta_K^c-\theta_*\|_2^2} 
	= & ~\E{\bigg\|\sum_{c=1}^N p_c\theta_{K_-}^c-\theta_*\bigg\|_2^2} \notag \\
	\leq & ~ \sum_{c=1}^N p_c\E{\lrn{\theta_{K_-}^c-\theta_*}_2^2} \notag \\
	\leq & ~ d\mathcal{D}^2 +U.
\end{align}
Now starting from iteration $K$, we adapt the recursion of Eq.\eqref{recursion_v2} for the $k$-th step, where $k\in\{K+1,\cdots, 2K-1, (2K)_{-}\}$ and $(2K)_-$ denotes the $2K$-step without synchronization, we have
\begin{align}\label{recursion_v5}
	\E{\|\theta_k^c-\theta_*\|_2^2} 
	\leq & ~ g(\eta)^{k-K} \cdot  \E{\|\theta_K^c-\theta_*\|_2^2} + \frac{1- g(\eta)^{k-K}}{1 - g(\eta)}\cdot \left(2\eta d\tau +\eta^2 d \sigma^2+\frac{2\eta \gamma}{m}\right)\notag \\
	\leq &  g(\eta)^{k-K}(d\mathcal{D}^2+U)+\frac{1- g(\eta)^{k-K}}{m\eta/3} \frac{m\eta}{3} U\notag \\
	\leq & d\mathcal{D}^2+ g(\eta)^{k-K} U +  (1- g(\eta)^{k-K}) U \notag\\
	\leq & d\mathcal{D}^2+U,
\end{align}
where the second inequality follows by Eq.\eqref{recursion_v4}, the fact that $1-g(\eta)\geq \eta m/3$ and $\eta\leq \frac{2}{m}$, and the definition of $U$. The third one holds since $g(\eta)\leq 1$.

By repeating Eq.\eqref{recursion_v4} and \eqref{recursion_v5}, we have that for all $k\geq 0$
\begin{align*}
	\E{\|\theta_k^c-\theta_*\|_2^2} \leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\left(\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md}\right)}_{:=U}.\notag
\end{align*}
\end{proof}


\begin{lemma}[Uniform $\ell_2$ upper bound in continuous time]
\label{lem:L2_bound_local_continuous}
Assume Assumption \ref{def:strong_convex} holds. We have the $\ell_2$ norm upper bound as follows
% \Zhao{The following quantity doesn't have $k$, not sure $\sup_k$ make sense}\Wei{Nice catch, thanks!}
\begin{align*}
\sup_t\E{\lrn{\bar\theta_t^c-\theta_*}_2^2}\leq \frac{1}{m}\bigg(\frac{\gamma}{m}+2d\tau\bigg),\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}

\begin{proof}
Let $q(\bar\theta_t^c)=\lrn{\bar\theta_t^c-\theta_*}_2^2$. For any time $t\geq 0$, applying It\^{o}'s lemma leads to
\begin{align*}
    \d q(\bar\theta_t^c)&=-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\bar\theta_t^c)\rangle\d t + 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &=-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\bar\theta_t^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)\rangle\d t + 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -2 m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\theta_*)\rangle\d t+ 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -2m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t+m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t+\frac{\lrn{\nabla f^c(\theta_*)}_2^2}{m}\d t+ 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -m q(\bar\theta_t^c)\d t+\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle,\notag
\end{align*}
where the first inequality follows by Assumption \ref{def:strong_convex}; the second inequality follows by the AM-GM inequality; the third inequality follows by the definition that $\gamma=\max_{c \in [N]} \lrn{\nabla f^c(\theta_*)}_2^2$. 

In other words, we have
\begin{align*}
    \d (e^{mt} q(\bar\theta_t^c))&=me^{mt} q(\bar\theta_t^c)\d t + e^{mt} \d q(\bar\theta_t^c)\notag\\
    &\leq me^{mt} q(\bar\theta_t^c)\d t + e^{mt}\left(-m q(\bar\theta_t^c)\d t+\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\right)\notag\\
    &\leq e^{mt}\left(\frac{\gamma}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}e^{mt}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle.\notag
\end{align*}

The solution is upper bounded by
\begin{align*}
    e^{mt} q(\bar\theta_t^c)\leq e^{m\cdot 0} q(\bar\theta_0^c)+\int_0^t \left(e^{ms}\left(\frac{\gamma}{m}+ 2d\tau\right) \d s+\sqrt{8\tau}e^{ms}\langle \bar\theta_s^c-\theta_*, \d \overline{W}_s\rangle\right)\notag.
\end{align*}

By the martingale property of It\^{o} integral, taking expectations yields
\begin{align}\label{l2_continuous}
    \E{q(\bar\theta_t^c)}
    \leq & ~ e^{-mt}\E{q(\bar\theta_0^c)}+ e^{-mt}\left(\frac{\gamma}{m}+ 2d\tau\right) \int_0^t e^{ms} \d s\notag\\
    = & ~ e^{-mt}\E{q(\bar\theta_0^c)}+ \frac{1-e^{-mt}}{m}\big(\underbrace{\frac{\gamma}{m}+ 2d\tau}_{:=V}\big).
\end{align}
Since $\bar\theta_0^c$ is simulated from the stationary distribution $\pi$, by Lemma 12 \cite{dm+16} or Theorem 17 \cite{ccbj18}, we have
\begin{align*}
\E{q(\bar\theta_0^c)}=\E{ \| \bar\theta_0^c-\theta_* \|_2^2}\leq \frac{d\tau}{m}\leq \frac{1}{m}(\frac{\gamma}{m}+2d\tau)=\frac{V}{m},
\end{align*}
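which, plugged into Eq.~\eqref{l2_continuous}, yields $\E{q(\bar\theta_t^c)}\leq e^{-mt}\frac{V}{m}+(1-e^{-mt})\frac{V}{m}=\frac{V}{m}$ for all $t\geq 0$. This completes the proof.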


\end{proof}


\begin{lemma}[Bounded gradient in $\ell_2$]\label{bounded_gradient_l2}
Assume Assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any client $c\in[N]$, any learning rate $\eta \in (0 , 2/m)$, and any initialization satisfying $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
    \E{ \|\nabla\tilde f^c(\theta_k^c) \|_2^2 }\leq d H_{\tau}^2,
\end{align*}
where $H_{\tau}^2=14 \kappa^2 \cdot (m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2 )$.
\end{lemma}

\begin{proof}

Decompose the squared $\ell_2$ norm of the gradient as follows
\begin{align*}
    \E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}&= \E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)+\nabla f^c(\theta_k^c)}_2^2}\notag\\
    &= \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)}_2^2}+2\E{\lrw{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c), \nabla f^c(\theta_k^c)}} \notag \\
    &\leq \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\sigma^2d \notag \\
    &=  \E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)}_2^2}+\sigma^2d \notag \\
    &\leq 2\E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)}_2^2}+2\E{\big\|\nabla f^c(\theta_*)\big\|_2^2}+\sigma^2d\notag\\
    &\leq 2 L^2 \E{\lrn{\theta_k^c-\theta_*}_2^2}+2 \gamma +\sigma^2d\notag\\
    &\leq 2L^2 \cdot \big(d\mathcal{D}^2 + \frac{6d}{m} \cdot (\tau+\frac{ \sigma^2}{m} + \frac{\gamma }{md} )\big)+2 \gamma+\sigma^2d \notag \\
    &\leq 14 d\kappa^2 \cdot ( m^2 \mathcal{D}^2+m\tau +\frac{\gamma}{d}+\sigma^2 ):= dH_{\tau}^2,
\end{align*}
where the first inequality follows by Assumption \ref{def:variance}; the second inequality follows by Young's inequality; the third inequality follows by Assumption  \ref{def:smooth} and the definition that $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2^2$; the fourth inequality follows by Lemma \ref{lem:L2_bound_local}; the last inequality follows by defining 
$\kappa:=\frac{L}{m}\geq 1$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Beginning of Bounded divergence %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%





\begin{lemma}[Initial condition] 
\label{lem:W2_init_bound}
Let $\mu_0$ denote the Dirac delta distribution at $\theta_0$. % and assume $\lrn{\theta_0-\theta_*}_2^2\leq d\mathcal{D}^2$.
Then, we have
\begin{align*}
W_2(\mu_0, \pi)\leq \sqrt{2}(\| \theta_0 - \theta_* \|_2 +  \sqrt{d\tau /m} ). %\sqrt{2d\left(\mathcal{D}^2+\frac{2}{m}\right)}.
\end{align*}
\end{lemma}

\begin{proof}
By \cite{ccbj18}, there exists an optimal coupling between $\mu_0$ and $\pi$ such that
\begin{align*}
    W_2^2(\mu_0, \pi) 
    \leq & ~ \mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta\|_2^2 ]\\
    \leq & ~ 2\mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta_*\|_2^2 ] + 2 \mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2] \\
    = & ~ 2\| \theta_0 - \theta_* \|_2^2 +2\mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2]\\
    \leq & ~ 2\| \theta_0 - \theta_* \|_2^2 + 2d\tau /m,
\end{align*}
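where the second step follows from the triangle inequality and $(a+b)^2\leq 2a^2+2b^2$, the last step follows from Lemma 12 of \cite{dm+16}, and the temperature $\tau$ is included to adapt to the time scaling. Taking square roots and using $\sqrt{a^2+b^2}\leq a+b$ for $a,b\geq 0$ completes the proof.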
\end{proof}

\section{A simulation example}

Do a 2D simulation, compare the empirical with the ground truth, plot W2 distance based on \cite{GS84}


\begin{figure*}[!ht]
  \centering
  \vskip -0.1in
  \subfigure[xx]{\includegraphics[scale=0.2
  ]{figures/truth.pdf}}\label{fig: 3a}\quad\quad
  \hspace{-0.5cm}
  \subfigure[b]{\includegraphics[scale=0.2]{figures/Rplot_chains_5_K_10_1e6.pdf}}\label{fig: 3b}
  \vspace{-0.5em}
  \subfigure[b]{\includegraphics[scale=0.2]{figures/Rplot_chains_5_K_10_1e6_different_seeds.pdf}}\label{fig: 3c}
  \vspace{-0.5em}
  \caption{xxx}
  \label{simulation}
  \vspace{-0.15in}
\end{figure*}




% \newpage
% \paragraph{Path A} If we use a bound based on $W_2^2$, we have
% \begin{align*}
%     W_2^2(\mu_{k+1}, \pi)&\leq  \left(1-\eta m\right) W_2^2(\mu_{k}, \pi)+\eta^3 + \eta^2\sigma^2.\notag
% \end{align*}

% Recursive applying it
% \begin{align*}
%     W_2^2(\mu_k, \pi)&\leq  \left(1-\eta m\right)^k W_2^2(\mu_{0}, \pi)+ \frac{1-(1-\eta m)^k}{\eta m}\left(\eta^3 + \eta^2\sigma^2 \right).\notag\\
%     &\leq \left(1-\eta m\right)^k W_2^2(\mu_{0}, \pi)+ \frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right).\notag\\
% \end{align*}
% In other words, we can easily get a good rate
% \begin{align*}
%     W_2(\mu_k, \pi)&\leq \left(1-\eta m\right)^\frac{k}{2} W_2(\mu_{0}, \pi)+ \sqrt{\frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right)}.\notag\\
%     &\leq \left(1-\frac{\eta m}{2}\right)^k W_2(\mu_{0}, \pi)+ \sqrt{\frac{1}{ m}\left(\eta^2 + \eta\sigma^2\right)}.\notag\\
% \end{align*}

% \paragraph{Path B} Similarly, if we use
% \begin{align*}
%     W_2(\mu_{k+1}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)W_2 (\mu_{k}, \pi)+\sqrt{\eta^3 +\eta^2\sigma^2}.\notag
% \end{align*}
% Then only by applying a complex trick, i.e.
% \begin{lemma}
% Let $A, B$ and $C$ be three constants such that $A\in(0, 1)$, $B, C\geq 0$. If the sequence $\{x_k\}_{k\geq 0}$ satisfies the recursion as follows
% \begin{align*}
%     x_{k+1}^2\leq [(1-A)x_k + C]^2 + B^2,
% \end{align*}
% where for any $k\geq 0$.  Then, we have that
% \begin{align*}
%     x_{k}\leq (1-A)^k x_0 + \frac{C}{A} + \frac{B^2}{C+\sqrt{A} B}.
% \end{align*}
% \end{lemma}

% , we can have
% \begin{align*}
%     W_2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W_2 (\mu_{0}, \pi)+\sqrt{\eta}(\mathcal{O}(1)+\sigma^2).\notag
% \end{align*}

% \Wei{Question: Dalalyan used Path B, why?}




