\section{Our Algorithm}

Let $N$ denote the number of clients. Let $T$ denote the number of global steps. Let $K$ denote the number of local steps. For each $c \in [N]:=\{1,2,\cdots, N\}$, we use $f^c$ and $\nabla f^c$ to denote the loss function and its gradient for client $c$ (based on the data held by client $c$). For the stochastic gradient oracle, we denote by $\nabla \tilde f^c(\cdot)$ an unbiased estimate of the exact gradient $\nabla f^c$ of client $c$. In addition, we denote by $p_c$ the weight of the $c$-th client, such that $p_c\geq 0$ and $\sum_{c=1}^N p_c=1$. Finally, $\xi_k$ is a standard $d$-dimensional Gaussian vector at iteration $k$; it does not depend on the client index, which can be achieved by maintaining the same random seed for each client $c\in[N]$.
% Zhao's first algorithm formulation
% \begin{algorithm*}[h]\caption{Training via Federated Averaging Algorithm }\label{alg:alg_main_text}
% \begin{algorithmic}[1]
% \State $u_r(0) \sim \N(0,I_d)$ for $r\in [m]$. \Comment{$u \in \R^{d \times m}$}
% \For{$t = 1, \ldots, T$} \Comment{$T$ denotes the number of global steps}
%     \For{$c = 1, \ldots, N$} \Comment{$N$ denote the total number of clients}
%         \State $w_{0,c}(t) \leftarrow u(t)$ \Comment{$w_{0,c}(t),u(t) \in \R^{d \times m}$}
%         \For{$k=1,\ldots,K$} \Comment{$K$ denotes the number of local steps}
%             \State $w_{k,c}(t) \leftarrow w_{k-1,c}(t) - \eta_{\mathrm{local}} \cdot \frac{\partial L_c}{\partial w} |_{w = w_{k-1,c}(t) }$ +{\color{red}noise}
%         \EndFor
%         \State
%         $\Delta u_c \leftarrow w_{k,c}(t) - u(t)$
%     \EndFor
%     \State $\Delta u \leftarrow \frac{1}{N} \sum_{c \in [N]} \Delta u_c$ \Comment{$\Delta u \in \R^{d \times m}$}
%     \State $u(t+1) \leftarrow u(t) + \eta_{\mathrm{global}} \Delta u$ + {\color{red}noise} \Comment{$u(t+1)\in \R^{d \times m}$, add noise here, the novelty is less}
% \EndFor
% \end{algorithmic}
% \end{algorithm*}


\begin{algorithm*}[h]\caption{Federated Averaging Langevin dynamics Algorithm (FedAvgLD). Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. A global synchronization is conducted every $K$ steps.}\label{alg:alg_main_text_same_seed}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_client}
    \beta_{k+1}^c=\theta_k^c-\eta_k\nabla \tilde f^c(\theta_k^c)+\xi_k,
\end{equation}
\State
\begin{equation*}  
\label{undampedsgld}
\theta_{k+1}^c=
\begin{cases}
\beta_{k+1}^c, & \text{if } (k+1) \bmod K\neq 0, \\[1ex]
\sum_{i=1}^N p_i \beta_{k+1}^i, & \text{if } (k+1) \bmod K=0.
\end{cases}
\end{equation*} 
\end{algorithmic}
\end{algorithm*}

Inspired by~\cite{lhy+19}, we define two virtual sequences $\beta_k=\sum_{c=1}^N p_c \beta_k^c$ and $\theta_k=\sum_{c=1}^N p_c \theta_k^c$, which are \emph{both inaccessible when $k \bmod K\neq 0$}. We also define 
\begin{equation}
\label{sum_grad}
\nabla f(\theta_k)=\sum_{c=1}^N p_c \nabla f^c(\theta_k^c), \qquad \nabla\tilde f(\theta_k)=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c).
\end{equation}

It is clear that $\E{\nabla \tilde f(\theta)}=\nabla f(\theta)$ for any $\theta\in\R^d$. Moreover, since $\sum_{c=1}^N p_c=1$, we always have $\beta_k=\theta_k$ whether $k \bmod K=0$ or not. Taking the $p_c$-weighted sum of Eq.~\eqref{local_client} over the clients $c=1,\ldots,N$, we have
\begin{equation}
\label{fed_avg_langevin_dynamics}
    \theta_{k+1}=\theta_k-\eta_k \nabla \tilde f(\theta_k)+\xi_k.
\end{equation}

The above formulation resembles the SGLD algorithm except that the construction and analysis of stochastic gradients are different.

\paragraph{Quality of non-i.i.d.\ data} Denote by $f_*$ the minimum value of the global objective $f:=\sum_{c=1}^N p_c f^c$ and by $f^c_*$ the minimum value of $f^c$ for each client $c\in [N]$. We quantify the degree of non-i.i.d.\ data by $\gamma:=f_*-\sum_{c=1}^N p_c f^c_*$, which is a non-negative constant and grows larger as the data become less identically distributed.


\iffalse
\subsection{Our plan}

What are the assumptions do we need ..
\begin{itemize}
    \item shusen wang's paper
\end{itemize}

We need to prove a new version of Lemma 3 in page 12 in \cite{lhy+19}.
\begin{lemma}[Lemma 3 in page 12 in \cite{lhy+19}]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2 
\end{align*}
\end{lemma}

We need to generalize the above lemma to something as follows:
\begin{lemma}[Our version]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2  + \| \mathrm{noise} \|^2
\end{align*} 
\end{lemma}

Using Shusen's assumption, we can show Dala's paper \cite{dk19} page 7 contions are holding.

\fi


% \begin{table}[]
%     \centering
%     \begin{tabular}{|l|l|l|l|l|l|} \hline
%         {\bf Notations} & {\bf Ours} & \cite{dk19} & \cite{lhy+19} & \cite{ccbj18} \\ \hline
%         Function & $f$ & $f$ & $F$ & $f$ \\ \hline
%         Parameter &  & $\theta$ & $w$ & $x$ \\ \hline
%         Dimension & $d$ & $p$ & Never & $d$ \\ \hline
%         Smooth & $L$ & $M$ & $L$ & $L$ \\ \hline
%         Strongly convex & $m$ & $m$ & $\mu$ & $m$ \\ \hline
%         Global & & $K$ & $T$ & \\ \hline
%         Local steps & & $1$ & $K$ & \\ \hline
%         Variance & $\sigma^2 d$ & $\sigma^2 p$ & $\sigma^2$ & $\sigma^2 d$ \\ \hline
%         Learning rate & $\eta$ & $h$ & $\eta$ & $\delta$ \\ \hline
%         Choice of LR & & $h = 1/(m+M)$ & $\eta = 2 / (\mu T) $ &  \\\hline
%         \#Devices & & 1 & $N$ & \\ \hline
%         \#Datas per client & & & $n_k$ & \\ \hline
%     \end{tabular}
%     \caption{Notations to compare different papers. We put this table for easy of writing. There is no need to keep this in the final paper.}
%     \label{tab:my_label}
% \end{table}


\subsection{Main result}

% \subsubsection{Notations}

\subsubsection{Assumptions}

\begin{assumption}[Smoothness]\label{def:smooth} For each $c\in [N]$, we say $f^c$ is $L$-smooth if for some $L>0$
\begin{align*}
\| \nabla f^c(y)-\nabla f^c(x) \|_2 \leq L \| y-x \|_2,\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

Note that the above assumption implies that
\begin{align*}
f^c(y)\leq f^c(x)+\langle \nabla f^c(x),y-x \rangle+\frac{L}{2}\| y-x \|^2_2\quad \forall x, y\in \R^d.
\end{align*}

\begin{assumption}[Strongly convex]\label{def:strong_convex}
For each $c\in [N]$, $f^c$ is $m$-strongly convex if for some $m>0$
\begin{align*}
f^c(x)\geq f^c(y)+\langle \nabla f^c(y),x-y \rangle + \frac{m}{2} \| y-x \|_2^2\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

\begin{assumption}[Bounded variance]\label{def:variance}
For each $c\in [N]$, the variance of stochastic gradient $\nabla \tilde f^c(x)$ in each client is upper bounded such that %\Zhao{If the following $\sigma$ has $c$, then should we assume $L$ and $m$ also have $c$?} \Wei{I feel we should make $\sigma^2$ uniform}\Zhao{Yes, I plan to change now}
\begin{align*}
\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2] \leq \sigma^2 d,\quad \forall x\in \R^d.
\end{align*}
\end{assumption}

% Wei's first algorithm formulation based on standard SGLD
% \subsection{Formulation}
% Let $\theta_k\in\R^d$ be the $k$-th iterate of the following stochastic gradient Langevin algorithm.
% \begin{align}\label{eq:sgld}
%     \theta_{k+1}=\theta_k -\eta \nabla \widetilde f(\theta_k)+\sqrt{2\tau\eta}\xi_k,
% \end{align}
% where $\eta$ is the learning rate, $\tau$ is the temperature, $\xi_k$ is a standard $d$-dimensional Gaussian vector, and $\nabla \widetilde f(\theta)$ is an unbiased estimate of the exact gradient $\nabla f(\theta)$.

\subsubsection{Tools from previous work}

\textbf{Gr\"{o}nwall's inequality} is a standard tool for obtaining estimates of differential equations. Suppose that $a(\cdot)$, $b(\cdot)$, and $\psi(\cdot)$ are continuous real-valued functions that satisfy
\begin{align*}
    \frac{\d}{\d t}\psi(t)\leq a(t)\psi(t)+b(t).
\end{align*}
Then 
\begin{align*}
    \psi(t)\leq \psi(t_0)e^{\int_{t_0}^t a(s)\d s} + \int_{t_0}^t e^{\int_{s}^t a(u)\d u}b(s)\d s.
\end{align*}

\textbf{Burkholder-Davis-Gundy inequality} Let $\phi:[0, \infty)\rightarrow \mathbb{R}^{r\times d}$ for some positive integers $r$ and $d$. In addition, we assume $\E{\int_0^{\infty} |\phi(s)|^2 \d s}<\infty$ and let $Z(t)=\int_0^t \phi(s)\d W_s$, where $W_s$ is a $d$-dimensional Brownian motion. Then for all $t\geq 0$, we have

\begin{align*}
    \E{\sup_{0\leq s\leq t} |Z(s)|^2}\leq 4\E{\int_0^t|\phi(s)|^2\d s}.
\end{align*}
%\Zhao{The last term is $\phi(s)$ but taking the integral over $t$. This seems wired.}

\subsubsection{Wasserstein Distance}

% We denote the Borel $\sigma$-algebra 
We define the 2-Wasserstein distance between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
\begin{align*}
    W_2(\mu, \nu):=\inf_{\Gamma\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \Gamma(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
\end{align*}
where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$, where $\mathcal{L}(\cdot)$ denotes a distribution of a random variable.



% The following theorem is from previous work \cite{dk19}.
% \begin{theorem}[Theorem 4 in \cite{dk19}]
% Let $\theta_{K,h}$ be the $K$-th iterate of the nLMC algorithm and $\nu_K$ be its distribution. If the function $f$ satisfies condition (1) and $h \leq 2/(m+M)$ then
% \begin{align*}
%     W_2 (\nu_K, \pi) \leq (1-mh)^K W_2(\nu_0, \pi) + 1.65 (M/m) (hp)^{1/2} + \frac{\delta \sqrt{p} }{ m} + \frac{\sigma^2 (hp)^{1/2} }{1.65 M + \sigma \sqrt{m}}
% \end{align*}
% \end{theorem}


% Remove Assumption 4 \cite{dk19}. Follow ideas similar to Lemma 4  \cite{decent21}



\begin{theorem}\label{thm:non_asymptotic}

Suppose Assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Then, for any learning rate $\eta \in (0, {m}/{L^2}]$ and any initialization satisfying $\| \theta_0-\theta_* \|_2 \leq \sqrt{d} \mathcal{D}$, where $\theta_*$ is a stationary point, we have
\begin{align*}
W_2(\mu_k, \pi) \leq e^{-{mk\eta}/{2}} \cdot 2 ( \sqrt{d} {\cal D} + \sqrt{d/m} ) + \sqrt{\frac{1}{m^2} \big(L^2 (2\eta(\eta G+8d\tau))+ d \sigma^2\big)}, %\frac{L}{m} \sqrt{G\eta d},
\end{align*}
where $G\leq 4L^2 d\mathcal{D}^2 +\frac{8L^2 d\tau}{m}+(d+4L^2) \max_{i\in[N]}\left(\sigma^2 +\lrn{\theta_*^i-\theta_*}_2^2\right)$ and $\mu_k$ denotes the probability measure of $\theta_k$.
\end{theorem}


\Wei{think about a name for the paper}

\Wei{Why the current algorithm works even if $K$ is infinitely large?}

\Wei{To do: remove the variance effect}

\Wei{To do: Decay learning rate}

% %Wei: Nice interpretation from another perspective. Thanks.
% \begin{proposition}
% Assume that $\lrn{\nabla f(\theta_k) - \nabla \tilde{f}(\theta_k)}_2^2 \leq C_1 \epsilon \lrn{\theta_k-\theta_*}_2^2 + C_2 \epsilon$. 
% Then 
% \begin{align*}
% W_2^2(\mu_{k+1}, \pi) \leq e^{-m\eta} W_2^2(\mu_k, \pi) + O(\eta \epsilon d).
% \end{align*}
% \end{proposition}
% Follow Theorem 2 of \cite{Bayes_Rob} for proof.

% Therefore, $W_2^2(\mu_k, \pi) \leq e^{-m\eta k} W_2^2(\mu_0, \pi) + O(\epsilon d / m)$.

\begin{proof}
% Denote $\htheta_t$ as the continuous-time interpolation of the FedAvg Langevin dynamics in Eq.\eqref{fed_avg_langevin_dynamics} as follows
% \begin{align}\label{eq:continuous_interpolation}
% \d {\htheta}_t = - \nabla \tilde f(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \d t + \sqrt{2\tau} \d \hat{W}_t,
% \end{align}
% where ${\htheta}_0=\theta_0$. For the gradient, $\nabla\tilde f(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor})=\sum_{c=1}^N p_c \nabla \tilde f^c\big(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor}^c\big)$ and $\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor}^{c}$ denotes the local process $\big(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor}^{c}\big)$ in client $c$. For any $k\in \mathbb{N}^{+}$ and a time $t$ that satisfies $t=k\eta$, it is apparent that $\widehat\mu_t=\mathcal{L}({\htheta}_t)$ is the same as $\mu_k=\mathcal{L}(\theta_k)$. In addition, 


Denote $\htheta_t$ as the continuous-time interpolation of the stochastic gradient Langevin dynamics as follows
\begin{align}\label{eq:continuous_interpolation}
\d {\htheta}_t = - \nabla \widetilde f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \d t + \sqrt{2\tau} \d \hat{W}_t,
\end{align}
where ${\htheta}_0=\theta_0$. For any $k\in \mathbb{N}^{+}$ and a time $t$ that satisfies $t=k\eta$, it is apparent that $\widehat\mu_t=\mathcal{L}({\htheta}_t)$ is the same as $\mu_k=\mathcal{L}(\theta_k)$. In addition, we define an auxiliary process $(\bar\theta_t)$ that starts from the stationary distribution $\pi$
\begin{align}
\d \bar\theta_t = - \nabla f(\bar\theta_t) \d t + \sqrt{2\tau} \d \overline{W}_t.
\end{align}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% new proof style
% We first define an auxiliary process $(\bar\theta_t)$ that starts from the stationary distribution $\pi$
% \begin{align}\label{couple_process}
% \d \bar\theta_t = - \nabla f(\bar\theta_t) \d t + \sqrt{2\tau} \d \overline{W}_t,
% \end{align}
% where $\bar\theta_0=\theta_0$. For the gradients, $\nabla f(\bar\theta_t)=\sum_{c=1}^N p_c \nabla f^c(\bar\theta_t^{c})$ and $\bar\theta_t^{c}$ denotes the auxiliary stationary local process $(\bar\theta_t^{c})$ in client $c$. Since $\bar\theta_0$ is drawn from $\pi$, it follows that $\bar\theta_t\sim \pi$ for every $t\geq 0$. Denote $\mathcal{T}_k=\sum_{i=1}^k\eta_k$, the solution of Eq.\eqref{couple_process} follows that 
% \begin{align*}
%     \bar\theta_{(k+1)\eta}&=\bar\theta_{k\eta}-\int_{{k\eta}}^{(k+1)\eta} \nabla f(\bar\theta_s)\d s+\sqrt{2\tau} \overline{W}_{\eta}\\
%     &=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta} \nabla f(\bar\theta_s)ds+\sqrt{2\tau\eta} \bar\xi_k.
% \end{align*}

% Recall that $\theta_{k+1}=\theta_k-\eta_k \nabla \tilde f(\theta_k)+\xi_k$. Considering a synchronous coupling such that $\bar\xi_k=\xi_k$, we have
% \begin{align*}
%     \lrn{\theta_{k+1}-\bar\theta_{(k+1)\eta}}_2^2&=\lrn{\theta_k-\bar\theta_{k\eta}-\eta_k \nabla  f(\theta_k)+\int_{k\eta}^{(k+1)\eta} \nabla f(\bar\theta_s)\d s+\eta_k \nabla f(\theta_k)-\eta_k \nabla \tilde f(\theta_k)}_2^2\\
%     &=\underbrace{\lrn{\theta_k-\bar\theta_{k\eta}-\eta_k \nabla  f(\theta_k)+\int_{k\eta}^{(k+1)\eta} \nabla f(\bar\theta_s)\d s}_2^2}_{\mathcal{I}} + \eta_k^2 \lrn{\nabla f(\theta_k)- \nabla \tilde f(\theta_k)}_2^2\\
%     &\qquad +2\eta_k \bigg\langle \theta_k-\bar\theta_{k\eta}-\eta_k \nabla  f(\theta_k)+\int_{k\eta}^{(k+1)\eta} \nabla f(\bar\theta_s)\d s, \nabla f(\theta_k)- \nabla \tilde f(\theta_k) \bigg\rangle\\
% \end{align*}
% Lemma \ref{lem:total_variance} for the second term and unbiasedness for the third term.


% Decompose $\mathcal{I}$ as follows
% \begin{align*}
%     \lrn{\theta_k-\bar\theta_{k\eta}-\eta_k \nabla  f(\theta_k)+\int_{k\eta}^{(k+1)\eta} \nabla f(\bar\theta_s)\d s}_2^2&=\lrn{\theta_k-\bar\theta_{k\eta}}_2^2+\\
%     &= 2\eta_k\langle \rangle +\lrn{-\eta_k \nabla  f(\theta_k)+\int_{k\eta}^{(k+1)\eta} \nabla f(\bar\theta_s)\d s}_2^2 
% \end{align*}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%








Consider It\^{o}'s formula for the sequence of $\frac{1}{2}  \| \htheta_t - \bar\theta_t \|_2^2$
\begin{align*}
& ~ \frac{1}{2} \d  \| \htheta_t - \bar\theta_t \|_2^2 \\
= & ~ \lrw{ \htheta_t - \bar\theta_t, \d \htheta_t - \d \bar\theta_t } + \mathrm{Tr}[ \d^2 \htheta_t - \d^2 \bar\theta_t ] \\
= & ~ \lrw{ \htheta_t - \bar\theta_t, \big(\nabla f(\bar\theta_t) -\nabla\tilde f(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big) \d t + \sqrt{2\tau}\big( \d \hat{W}_t - \d \overline{W}_t \big) } + 2\tau \mathrm{Tr}[ \d^2 \hat{W}_t - \d^2 \overline{W}_t ].
\end{align*}




By decomposing 
\begin{align*}
\nabla f(\bar\theta_t)-\nabla\tilde f(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor})=\nabla f(\bar\theta_t)-\nabla f(\widehat\theta_t)+\underbrace{\nabla f(\widehat\theta_t)-\nabla f(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_{\text{discretization}}+\underbrace{\nabla f(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor})-\nabla\tilde f(\widehat\theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_{\text{stochastic gradient}}
\end{align*}
and considering a synchronous coupling such that $\hat{W}_t = \overline{W}_t$, we have

% Taking $\hat{W}_t = \overline{W}_t$ defines a coupling between the two processes and leads to
\begin{align}\label{diff}
\frac{1}{2} \d \| \htheta_t - \bar\theta_t \|_2^2
&= \underbrace{\lrw{ \htheta_t - \bar\theta_t, \nabla f(\bar\theta_t)-\nabla f(\htheta_t)} }_{\mathcal{A}}\d t+ \underbrace{\lrw{ \htheta_t - \bar\theta_t,  \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor})  } }_{\mathcal{B}} \d t\\
&\qquad \qquad+ \underbrace{\lrw{ \htheta_t - \bar\theta_t, \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor}) }}_{\mathcal{C}} \d t \notag
% &\leq - m \| \htheta_t - \bar\theta_t \|_2^2 \d t + \frac{m}{4} \| \htheta_t - \bar\theta_t \|_2^2 \d t + \frac{1}{m} \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 \d t
% &\qquad\qquad  + \frac{m}{4} \| \htheta_t - \bar\theta_t \|_2^2\d t + \frac{1}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\d t\\
% &\leq  - \frac{m}{2} \| \htheta_t - \bar\theta_t \|_2^2 \d t + \frac{1}{m} \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 \d t+\frac{1}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\d t\\
% &\leq - \frac{m}{2} \| \htheta_t - \bar\theta_t \|_2^2 \d t + \frac{L^2}{m} \big\| \htheta_{t} - \htheta_{\eta\lfloor\frac{t}{\eta} \rfloor} \big\|_2^2 \d t+\frac{1}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\d t,
\end{align}


\paragraph{Estimate of $\mathcal{A}$} Since $\htheta_t =\sum_{c=1}^N p_c\htheta_t^c$ and $\bar\theta_t=\sum_{c=1}^N p_c \bar\theta_t^c$, we have \Zhao{Yian said that the second step has a bug, we might need to make extra assumptions.}
\begin{align}
\label{A_estimate}
\footnotesize
    &\quad\lrw{ \htheta_t - \bar\theta_t, \nabla f(\bar\theta_t)-\nabla f(\htheta_t)}\\
    &=\sum_{c=1}^N p_c\lrw{ \htheta_t - \bar\theta_t, \nabla f^c(\bar\theta_t)-\nabla f^c(\htheta_t)}\notag\\
    &=\sum_{c=1}^N p_c\lrw{\htheta_t -\htheta_t^c+\htheta_t^c-\bar\theta_t^c+\bar\theta_t^c - \bar\theta_t,\nabla f^c(\bar\theta_t)-\nabla f^c(\htheta_t)}\notag\\
    &=\sum_{c=1}^N p_c \lrw{\htheta_t^c -\bar \theta_t^c,\nabla f^c(\bar\theta_t)-\nabla f^c(\htheta_t)}+\sum_{c=1}^N p_c\lrw{\htheta_t -\htheta_t^c+\bar\theta^c_t-\bar\theta_t,\nabla f^c(\bar\theta_t)-\nabla f^c(\htheta_t)}\notag\\
    &\leq -\sum_{c=1}^N p_c m\lrn{\widehat\theta_t^c - \bar\theta_t^c}_2^2 +\sum_{c=1}^N p_c \left(\frac{2L^2}{m}\lrn{\htheta_t -\htheta_t^c}_2^2 +\frac{2L^2}{m}\lrn{\bar\theta^c_t-\bar\theta_t}_2^2 +\frac{m}{4L^2}\lrn{\nabla f^c(\bar\theta_t)-\nabla f^c(\htheta_t)}_2^2\right) \notag\\
    &\leq -m \lrn{\sum_{c=1}^N p_c\big(\widehat\theta_t^c - \bar\theta_t^c\big)}_2^2+\sum_{c=1}^N p_c \left(\frac{2L^2}{m}\lrn{\htheta_t -\htheta_t^c}_2^2 +\frac{2L^2}{m}\lrn{\bar\theta^c_t-\bar\theta_t}_2^2 +\frac{m}{4}\lrn{\bar\theta_t-\htheta_t}_2^2\right) \notag\\
    &\leq -\frac{3}{4}m \lrn{\htheta_t - \bar\theta_t}_2^2+\frac{2L^2}{m}\sum_{c=1}^N p_c \left(\lrn{\htheta_t -\htheta_t^c}_2^2 +\lrn{\bar\theta^c_t-\bar\theta_t}_2^2\right)
 \end{align}
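where the first inequality follows by the strong convexity in Assumption~\ref{def:strong_convex} together with the AM-GM inequality, the second inequality follows by Jensen's inequality and the smoothness in Assumption~\ref{def:smooth}, and the last step follows by the definitions of $\htheta_t$ and $\bar\theta_t$.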

\paragraph{Estimate of $\mathcal{B}$ and $\mathcal{C}$}  Applying Cauchy-Schwarz inequality and AM-GM inequality leads to
\begin{align}\label{B_estimate}
    \lrw{ \htheta_t - \bar\theta_t,  \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor})  } &\leq \frac{m}{8} \| \htheta_t - \bar\theta_t \|_2^2 + \frac{2}{m} \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2\notag\\
    &\leq \frac{m}{8} \| \htheta_t - \bar\theta_t \|_2^2 +\frac{2}{m} \bigg\| \sum_{c=1}^N p_c\bigg(\nabla f^c(\htheta^c_t) - \nabla f^c(\htheta^c_{\eta\lfloor\frac{t}{\eta} \rfloor})\bigg) \bigg\|_2^2\notag\\
    &\leq \frac{m}{8} \| \htheta_t - \bar\theta_t \|_2^2 +\frac{2L^2}{m} \sum_{c=1}^N p_c\big\| \htheta^c_t-\htheta^c_{\eta\lfloor\frac{t}{\eta} \rfloor} \big\|_2^2, 
\end{align}
where the third inequality follows by Jensen's inequality and assumption \ref{def:smooth}. For the other, we have
\begin{align}\label{C_estimate}
    \lrw{ \htheta_t - \bar\theta_t, \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor}) } &\leq \frac{m}{8} \| \htheta_t - \bar\theta_t \|_2^2 + \frac{2}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2,
    % &\leq \frac{m}{4} \| \htheta_t - \bar\theta_t \|_2^2+ d\sum_{c=1}^N p_c^2\sigma_c^2,
\end{align}
% where the second inequality follows by Lemma \ref{lem:total_variance}.

% \textcolor{red}{I am working on how to combining these local strong-convexity and smoothness properties by introducing $\gamma$ based on non-i.i.d data.}

% use Lemma \ref{lem:total_variance} and xx and xx.

% where the first inequality follows from the strong-convexity property and $ab\leq  (\frac{\sqrt{m}}{2}a)^2+({\frac{1}{\sqrt{m}}}b)^2$; in particular, we don't attempt to optimize the constants of $-\frac{m}{2}$ for the item $\| \htheta_t - \bar\theta_t \|_2^2$; the last inequality follows by the smoothness assumption \ref{def:smooth}.

\paragraph{Combining estimates of $\mathcal{A}$, $\mathcal{B}$, and $\mathcal{C}$} Plugging Eq.~\eqref{A_estimate}, Eq.~\eqref{B_estimate}, and Eq.~\eqref{C_estimate} into Eq.~\eqref{diff}, we have
\begin{align*}
\d \| \htheta_t - \bar\theta_t \|_2^2
\leq - m \| \htheta_t - \bar\theta_t \|_2^2 \d t + \frac{2}{m} \left(L^2 \sum_{c=1}^N p_c\big\| \htheta^c_t-\htheta^c_{\eta\lfloor\frac{t}{\eta} \rfloor} \big\|_2^2+ \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\right)\d t.
\end{align*}
Now apply Gr\"{o}nwall's inequality to the preceding inequality and take expectation with respect to a coupling $(\htheta_t, \bar\theta_t) \sim \Gamma(\widehat\mu_t,\pi)$:
\begin{align}\label{eq:1st_gronwall}
     &\E{ \|\htheta_t - \bar\theta_t \|_2^2}\leq  \E{\| \htheta_0 - \bar\theta_0 \|_2^2} e^{-mt}\notag\\
     &+\frac{2}{m}\int_0^t \bigg(L^2 \sum_{c=1}^N p_c\E{\big\| \htheta^c_s-\htheta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}+ \E{\lrn{\nabla f(\htheta_{\eta\lfloor\frac{s}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{s}{\eta} \rfloor})}_2^2}\bigg) e^{-(t-s)m} \d s\notag\\
     &\leq \E{\| \htheta_0 - \bar\theta_0 \|_2^2} e^{-mt}+\frac{2}{m}\int_0^t \bigg(L^2 (2\eta(\eta G+8d\tau))+ d \sigma^2 \bigg) e^{-(t-s)m} \d s,
\end{align}
where Lemma \ref{lem:estimate_of_I}, Lemma \ref{lem:total_variance} and $\sum_{c=1}^N p_c^2\leq 1$ have been applied to the second inequality.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% % Yian's revision:
% \begin{align*}
% \frac{1}{2} \d \| \htheta_t - \bar\theta_t \|_2^2
% &= \lrw{ \htheta_t - \bar\theta_t, \nabla f(\bar\theta_t)-\nabla f(\htheta_t)} \d t+ \lrw{ \htheta_t - \bar\theta_t,  \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor})  } \d t \\
% &\qquad \qquad+ \lrw{ \htheta_t - \bar\theta_t, \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor}) } \d t \\
% &\leq - \frac{m}{2} \| \htheta_t - \bar\theta_t \|_2^2 \d t + \frac{1}{m} \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 \d t 
% + \frac{1}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\d t.
% \end{align*}
% Therefore, 
% \begin{align*}
% \frac{1}{2} \d \lrp{ e^{m t} \| \htheta_t - \bar\theta_t \|_2^2 } \leq \frac{1}{m} e^{m t} \lrp{ \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 + \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2 } \d t.
% \end{align*}
% By the fundamental theorem of calculus, for $t\in[\eta k, \eta (k+1)]$,
% \begin{align*}
% &\| \htheta_{\eta (k+1)} - \bar\theta_{\eta (k+1)} \|_2^2 - e^{-m \eta} \| \htheta_{\eta k} - \bar\theta{\eta k} \|_2^2 \\
% &\leq \frac{1}{m} \int_{\eta k}^{\eta (k+1)} e^{m (t-\eta (k+1))} \lrp{ \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 + \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2 } \d t.
% \end{align*}
% Take expectation
% \begin{align*}
% & \E{\| \htheta_{\eta (k+1)} - \bar\theta_{\eta (k+1)} \|_2^2} - e^{-m \eta} \E{\| \htheta_{\eta k} - \bar\theta{\eta k} \|_2^2} \\
% &\leq \frac{1}{m} \int_{\eta k}^{\eta (k+1)} e^{m (t-\eta (k+1))} \lrp{ \E{\big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2} + \E{\lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2} } \d t \\
% &\leq \frac{\eta}{m} \sup_{t\in[\eta k, \eta (k+1)]}\lrp{ \E{\big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2} +\E{\lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2} }.
% \end{align*}

% Wei's comment: \textcolor{red}{$\frac{\frac{\eta}{m}}{1-e^{-m\eta}}\approx \frac{\frac{\eta}{m}}{m\eta}\approx \frac{1}{m^2}$}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \Zhao{In the second term of the above equation, is that $t/\eta$ actually $s/\eta$?} \textcolor{red}{nice catch}

%\paragraph{Estimate of $\mathcal{I}$} 

%\paragraph{Proof of Theorem \ref{thm:non_asymptotic} (continued)} 
Continuing the computations in Eq.\eqref{eq:1st_gronwall}, we have
\begin{align*}
    \E{ \| \htheta_t - \bar\theta_t \|_2^2}&\leq  \E{\| \htheta_0 - \bar\theta_0 \|_2^2} e^{-mt}+\frac{2}{m} (L^2 (2\eta(\eta G+8d\tau))+ d \sigma^2) \int_0^t  e^{-(t-s)m} \d s\\
     &\leq \E{\| \htheta_0 - \bar\theta_0 \|_2^2} e^{-mt}+\frac{1}{m^2} \big(L^2 (2\eta(\eta G+8d\tau))+ d  \sigma^2 \big),
\end{align*}
where $\bar\theta_0$ is an arbitrary point that follows the distribution $\pi$.

Recall that $\theta_k$ and $\widehat\theta_{k\eta}$ have the same distribution $\mu_k$. By the definition of $W_2$ distance, we have
\begin{align*}
W_2(\mu_k, \pi) 
%\leq \left(\E{ \| \htheta_{k\eta} - \bar\theta_{k\eta} \|_2^2}\right)^{1/2}
\leq & ~ e^{-{mk\eta}/{2}} \cdot W_2(\mu_0, \pi) + \sqrt{ \frac{1}{m^2} \big(L^2 (2\eta(\eta G+8d\tau))+ d \sigma^2\big)} \\
\leq & ~ e^{-{mk\eta}/{2}} \cdot 2 (\| \theta_0 - \theta_* \|_2 +  \sqrt{d/m} )+ \sqrt{ \frac{1}{m^2} \big(L^2 (2\eta(\eta G+8d\tau))+ d  \sigma^2\big)} \\
\leq & ~ e^{-{mk\eta}/{2}} \cdot 2 ( \sqrt{d} {\cal D} +  \sqrt{d/m} )+  \sqrt{ \frac{1}{m^2} \big(L^2 (2\eta(\eta G+8d\tau))+ d  \sigma^2\big)},
\end{align*}
where the first inequality follows by taking expectation with respect to the optimal coupling $(\htheta_0, \bar\theta_0) \sim \Gamma(\widehat\mu_0,\pi)$ and applying $(a+b)^{1/2}\leq |a|^{1/2}+|b|^{1/2}$, the second one follows by Lemma \ref{lem:W2_init_bound}, and the last step follows from the assumption on $\| \theta_0 - \theta_* \|_2$.



\end{proof}


\subsection{Technical Results}


%\Zhao{We only use eqref for equation.}

\begin{lemma}[Uniform $\ell_2$ upper bound for local clients]
\label{lem:L2_bound_local}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , {m}/{L^2})$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
\sup_k\E{\lrn{\theta_k^c-\theta_*}_2^2}\leq {\|\theta_0-\theta_*\|_2^2} + \underbrace{2 \frac{d}{m} \left(\tau + \eta\max_{i\in[N]}\left(\sigma^2 +\lrn{\theta_*^i-\theta_*}_2^2\right)\right)}_{\mathcal{U}},
\end{align*}
where $\theta_*$ is the global minimum.
\end{lemma}


\begin{proof} First, we consider the $k$-th iteration, where $k\in \{1,2,\cdots, K-2, (K-1)_{-}\}$ and $(K-1)_-$ denotes the $(K-1)$-th step without synchronization. Following the iterate of Eq.~\eqref{local_client} in a local client $c\in [N]$, we have
	\begin{align}\label{eq:Langevin_L2_1_local}
&\quad\ \E{\lrn{\theta_{k+1}^c-\theta_*}_2^2}\notag\\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + \sqrt{8\eta\tau}\E{ \langle \theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c), \xi_k \rangle } + 2\eta\tau\E{\|\xi_k\|_2^2} \notag \\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + 2\eta d\tau,
	\end{align}	
	where the last equality comes from the independence of $\theta_k^c-\theta_*- \eta\nabla\widetilde f^c(\theta_k^c)$ and $\xi_k$ and $\E{\xi_k}=0$. Note that
\begin{align}\label{eq:ip_1st_local}
%\small
&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla\widetilde f^c(\theta_k^c)\|_2^2} \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2}  \notag\\
& \qquad\qquad + 2 \eta \E{ \langle \theta_k^c-\theta_*-\eta \nabla f^c(\theta_k^c),\nabla f^c(\theta_k^c)-\nabla\widetilde f^c(\theta_k^c) \rangle }  \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2} \notag \\
&\leq \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}  + \eta^2 d\sigma^2, 
\end{align}
where the first step follows from simple algebra, the second step follows from the unbiasedness of the stochastic gradient, and the last step follows from Assumption \ref{def:variance}. In what follows, we can upper bound the first term of Eq.\eqref{eq:ip_1st_local} as follows
\begin{align}\label{eq:ip_2nd_test_theta_star}
	&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}\notag\\
	&=\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*))-\eta\nabla f^c(\theta_*) \|_2^2}\notag\\
	&=\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)) \|_2^2}+\eta^2 \|\nabla f^c(\theta_*)\|_2^2\notag\\
	&\qquad\qquad-2\eta\underbrace{\E{\langle \theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)),  \nabla f^c(\theta_*)\rangle}}_{\E{\nabla f^c(\theta_*)}=\E{\nabla f(\theta_*)}=0}\notag\\
	&= \E{\|\theta_k^c-\theta_*\|_2^2} - 2 \eta \E{ \langle \theta_k^c-\theta_*,\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)\rangle } \notag\\
	&\qquad\qquad+ \eta^2\E{\| \nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)\|_2^2} +\eta^2 \|\nabla f^c(\theta_*)\|_2^2\notag\\
	&\le \E{\|\theta_k^c-\theta_*\|_2^2} - 2\eta m\E{\|\theta_k^c-\theta_*\|_2^2} + \eta^2 L^2\E{\|\theta_k^c-\theta_*\|_2^2}+\eta^2 \lrn{\nabla f^c(\theta_*)}_2^2 \notag\\
	&= \left(1-2\eta m + \eta^2 L^2\right)\E{\|\theta_k^c-\theta_*\|_2^2}+\eta^2 \lrn{\nabla f^c(\theta_*)}_2^2.
\end{align}

Combining Eq.~\eqref{eq:Langevin_L2_1_local}, Eq.~\eqref{eq:ip_1st_local}, and Eq.~\eqref{eq:ip_2nd_test_theta_star}, we obtain the following recursion
\begin{align*}
	\E{\|\theta_{k+1}^c-\theta_*\|_2^2} 
	\leq & ~ (1-2\eta m+\eta^2L^2)  \E{\|\theta_k^c-\theta_*\|_2^2} + 2\eta d\tau +\eta^2 d \sigma^2 +\eta^2 \lrn{\nabla f^c(\theta_*)}_2^2\notag\\
	\leq & ~ \underbrace{(1-2\eta m+\eta^2L^2)}_{:=g(\eta)}  \E{\|\theta_k^c-\theta_*\|_2^2} + 2\eta d (\tau + \eta \sigma^2+\eta \lrn{\theta_*^c-\theta_*}_2^2), \notag
\end{align*}
where the last inequality follows since $\lrn{\nabla f^c(\theta_*)}_2^2=\lrn{\nabla f^c(\theta_*)-\nabla f^c(\theta_*^c)}_2^2\leq L^2 \lrn{\theta_*^c-\theta_*}_2^2$ by applying the smoothness assumption \ref{def:smooth} and the fact that $\nabla f^c(\theta_*^c)=0$.

Since $g(\eta)$ is a quadratic function of $\eta$ with $g(0)=g ({2m}/{L^2} )=1$, for any $\eta\in (0,  2m / L^2  )$ we have $g(\eta) \in (0, 1]$. Furthermore, if $ \eta \in (0, m/L^2 )$, then $1-g(\eta) = \eta(2m-\eta L^2) \geq \eta m$.

Recursively applying the above inequality $k$ times, where $k\in \{1,2,\cdots, K-1, K_{-}\}$ and $K_-$ denotes the $K$-th step without synchronization, it follows that
\begin{align}\label{recursion_v2}
	\E{\|\theta_k^c-\theta_*\|_2^2} &\le g(\eta)^{k}\| \theta_0^c-\theta_*\|_2^2 + \frac{1- g(\eta)^{k}}{1 - g(\eta)} \cdot 2\eta d (\tau + \eta \sigma^2+\eta \lrn{\theta_*^c-\theta_*}_2^2)  \\
	&\le \|\theta_0^c-\theta_*\|_2^2 + \frac{2d}{m} (\tau + \eta\sigma^2+\eta \lrn{\theta_*^c-\theta_*}_2^2 )\notag\\
	&\leq \|\theta_0-\theta_*\|_2^2 + \underbrace{\frac{2d}{m} \left(\tau + \eta\max_{i\in[N]}\left(\sigma^2 +\lrn{\theta_*^i-\theta_*}_2^2\right)\right)}_{\mathcal{U}},\notag
\end{align}
where the last one holds because $\theta_0=\theta_0^c$ for any $c\in[N]$.
In particular, the $K$-th step before synchronization yields that
\begin{align}\label{recursion_v3}
	\E{\|\theta_{K_-}^c-\theta_*\|_2^2} &\le \|\theta_0-\theta_*\|_2^2+\mathcal{U}.
\end{align}
By contrast, for the $K$-th local step after synchronization, applying Jensen's inequality yields
\begin{align}\label{recursion_v4}
	\E{\|\theta_K^c-\theta_*\|_2^2} &=\E{\bigg\|\sum_{c=1}^N p_c\theta_{K_-}^c-\theta_*\bigg\|_2^2}\leq  \sum_{c=1}^N p_c\E{\lrn{\theta_{K_-}^c-\theta_*}_2^2}\leq \|\theta_0-\theta_*\|_2^2+\mathcal{U}.
\end{align}
Now, starting from iteration $K$, we apply the recursion of Eq.~\eqref{recursion_v2} to the $k$-th step, where $k\in\{K+1,\cdots, 2K-1, (2K)_{-}\}$ and $(2K)_-$ denotes the $2K$-th step without synchronization, and obtain
\begin{align}\label{recursion_v5}
	\E{\|\theta_k^c-\theta_*\|_2^2} 
	\leq & ~ g(\eta)^{k-K} \cdot  \E{\|\theta_K^c-\theta_*\|_2^2} + \frac{1- g(\eta)^{k-K}}{1 - g(\eta)}\cdot 2\eta d \left(\tau + \eta\left(\sigma^2+ \lrn{\theta_*^c-\theta_*}_2^2\right)\right)\notag \\
	\leq &  g(\eta)^{k-K}(\|\theta_0-\theta_*\|_2^2+\mathcal{U})+\frac{1- g(\eta)^{k-K}}{m\eta} m\eta \mathcal{U}\notag \\
	\leq & \|\theta_0-\theta_*\|_2^2+ g(\eta)^{k-K} \mathcal{U} +  (1- g(\eta)^{k-K}) \mathcal{U} \notag\\
	\leq & \|\theta_0-\theta_*\|_2^2+\mathcal{U},
\end{align}
where the second inequality follows by Eq.\eqref{recursion_v4}, the fact that $1-g(\eta)\geq \eta m$, and the definition of $\mathcal{U}$. The third one holds since $g(\eta)\leq 1$.

By repeating Eq.\eqref{recursion_v4} and \eqref{recursion_v5}, we have that for all $k\geq 0$
\begin{align*}
	\E{\|\theta_k^c-\theta_*\|_2^2} \leq \|\theta_0-\theta_*\|_2^2+\mathcal{U}.
\end{align*}\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Beginning of dominated divergence %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{lemma}[Dominated divergence]\label{divergence}
Suppose Assumptions~\ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , {m}/{L^2})$, we have the following $\ell_2$ upper bound
\begin{align*}
    \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}\leq 2\|\theta_0-\theta_*\|_2^2+2\mathcal{U}+2\E{\|\theta_k-\theta_*\|_2^2}.
\end{align*}
\end{lemma}

\begin{proof}

Applying Young's inequality and then Lemma~\ref{lem:L2_bound_local}, we have
\begin{align*}
	\E{\|\theta_k^c-\theta_k\|_2^2}&=\E{\|\theta_k^c-\theta_*+\theta_*-\theta_k\|_2^2}\\
	&\leq 2\E{\|\theta_k^c-\theta_*\|_2^2}+2\E{\|\theta_k-\theta_*\|_2^2}\\
	&\leq 2(\|\theta_0-\theta_*\|_2^2+\mathcal{U})+2\E{\|\theta_k-\theta_*\|_2^2}.
\end{align*}
Taking the $p_c$-weighted sum over the client index $c$ from $1$ to $N$ and using $\sum_{c=1}^N p_c=1$, we have
\begin{align*}
    \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}\leq 2\|\theta_0-\theta_*\|_2^2+2\mathcal{U}+2\E{\|\theta_k-\theta_*\|_2^2}.
\end{align*}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% End of dominated divergence %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% For any $k \ge 0$, consider $k_0=K\lfloor \frac{k}{K}\rfloor $ such that $k\leq k_0$ and $\theta_{k_0}^c=\theta_{k_0}$ for any $k\geq 0$. Also, we use the fact that $\eta_t$ is non-increasing and $\eta_{t_0} \leq 2 \eta_t$ for all $t-t_0 \leq E-1$, then
% 	\begin{align*}
% 	\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}&=\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}-(\theta_k-\theta_{k_0})}_2^2}\notag\\
% 	&\leq \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}}_2^2}\notag\\
% 	&\leq \sum_{c=1}^N p_c \E{\lrn{\theta_k^c-\theta_*^c+\theta_*^c-\theta_*+\theta_*-\theta_{k_0}}_2^2}\notag\\
% 	\end{align*}
	
	
	
% 	\begin{align*}
% 	\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}&= \sum_{c=1}^N p_c \E{\lrn{\theta_k^c-\theta_*^c+\theta_*^c-\theta_*+\theta_*-\theta_k}_2^2}\notag\\
% 	&\leq \underbrace{3\sum_{c=1}^N p_c \E{\lrn{\theta_k^c-\theta_*^c}_2^2}}_{\textcolor{red}{how\ to\  prove\ this\ has\  a\  similar \ order\  as\  the\  third\  term?}}+3\gamma+3 \E{\lrn{\theta_*-\theta_k}_2^2}\notag\\
% 	&\leq 3\sum_{c=1}^N p_c \frac{1}{L^2} \E{\lrn{\nabla f^c (\theta_k^c)-\nabla f^c(\theta_*^c)}_2^2}+3\gamma+3 \E{\lrn{\theta_*-\theta_k}_2^2}\notag\\
% 	&\leq 3\sum_{c=1}^N p_c \frac{1}{L^2} \E{\lrn{\nabla f^c (\theta_k^c)}_2^2}+3\gamma+3 \E{\lrn{\theta_*-\theta_k}_2^2}\notag\\
% 	&\leq 3\sum_{c=1}^N p_c \frac{1}{L^2} \E{2L(f^c(\theta_k^c)-f^c(\theta_*^c))}+3\gamma+3 \E{\lrn{\theta_*-\theta_k}_2^2}\notag\\
% 	&= \frac{6}{L} \E{(f(\theta_k)-f(\theta_*))}+3\gamma+3 \E{\lrn{\theta_*-\theta_k}_2^2}\notag\\
% 	&= \frac{6}{L} \E{(f(\theta_k)-f(\theta_*))}+3\gamma+3 \E{\lrn{\theta_*-\theta_k}_2^2}\notag\\
% 	\end{align*}
\end{proof}




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% End of local proof %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


% \begin{lemma}[Uniform $\ell_2$ upper bound for the center]
% \label{lem:L2_bound}
% Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , \min\{1, {m}/{9L^2}\})$, we have the $\ell_2$ norm upper bound as follows
% \begin{align*}
% \sup_k\E{\| \theta_k-\theta_* \|_2^2}\leq 2{\|\theta_0-\theta_*\|_2^2} + \frac{4d\tau}{m}+ \frac{3\eta}{m}\left(d\max_{i\in[N]}\sigma^2_i+(L^2+d)\max_{i\in[N]} \lrn{\theta_*^i-\theta_*}_2^2\right),
% \end{align*}
% where $\theta_*$ is the global minimum.
% \end{lemma}


% \begin{proof}
% 	By the iterate in Eq.\eqref{fed_avg_langevin_dynamics}, we have
% 	\begin{align}\label{eq:Langevin_L2_1}
% \E{\|\theta_{k+1}-\theta_*\|_2^2}
% 		&= \E{\|\theta_k -\theta_*- \eta \nabla\tilde f(\theta_k)\|_2^2} + \sqrt{8\eta\tau}\E{ \langle \theta_k -\theta_*- \eta \nabla\tilde f(\theta_k), \xi_k \rangle } + 2\eta\tau\E{\|\xi_k\|_2^2} \notag \\
% 		&= \E{\|\theta_k -\theta_*- \eta \nabla\tilde f(\theta_k)\|_2^2} + 2\eta d\tau,
% 	\end{align}	
% 	where the last equality is from the independence of $\theta_k-\theta_*- \widetilde f(\theta_k)$ and $\xi_k$ and $\E{\xi_k}=0$. Note that
% \begin{align}\label{eq:ip_1st}
% %\small
% &\quad\ \E{\|\theta_k -\theta_*- \eta \widetilde f(\theta_k)\|_2^2} \notag\\
% &= \E{\left\|\theta_k -\theta_*- \eta \nabla f(\theta_k) \right\|_2^2} + \eta^2\E{\|\nabla f(\theta_k)-\nabla \widetilde f(\theta_k)\|_2^2}  \notag\\
% & \qquad\qquad + 2 \eta \E{ \langle \theta_k-\theta_*-\eta \nabla f(\theta_k),\nabla f(\theta_k)-\nabla\widetilde f(\theta_k) \rangle }  \notag\\
% &= \E{\left\|\theta_k -\theta_*- \eta \nabla f(\theta_k) \right\|_2^2} + \eta^2\E{\|\nabla f(\theta_k)-\nabla \widetilde f(\theta_k)\|_2^2} \notag \\
% &= \E{\|\theta_k -\theta_*- \eta \nabla f(\theta_k) \|_2^2}  + \eta^2 d\sum_{c=1}^N p_c^2\sigma_c^2, 
% \end{align}
% where the first step follows from simple algebra, the second step follows from the unbiasedness of the stochastic gradient, and the last step follows from Lemma \ref{lem:total_variance}.


% Recall that $\theta_*$ is the stationary point that yields $\nabla f(\theta_*)=0$, we can upper bound the first term of Eq.\eqref{eq:ip_1st} as follows
% \begin{align}\label{eq:ip_2nd}
% 	&\quad\ \E{\|\theta_k -\theta_*- \eta \nabla f(\theta_k) \|_2^2}\notag\\
% 	&=\E{\|\theta_k -\theta_*- \eta (\nabla f(\theta_k)-\nabla f(\theta_*)) \|_2^2}\notag\\
% 	&= \E{\|\theta_k-\theta_*\|_2^2} - 2 \eta \underbrace{\E{ \langle \theta_k-\theta_*,\nabla f(\theta_k)-\nabla f(\theta_*) \rangle }}_{\mathcal{I}_1} + \eta^2\underbrace{\E{\| \nabla f(\theta_k)-\nabla f(\theta_*)\|_2^2}}_{\mathcal{I}_2}
% \end{align}
% \paragraph{Estimate of $\mathcal{I}_1$} Since $\nabla f(\theta)=\sum_{c=1}^N p_c \nabla f^c(\theta^c)$ and   $\theta=\sum_{c=1}^N p_c \theta^c$ for any $\theta$, we have
% \begin{align}\label{estimate_i_1}
%     \E{ \langle \theta_k-\theta_*,\nabla f(\theta_k)-\nabla f(\theta_*) \rangle }&=\E{\sum_{c=1}^N p_c \langle \theta_k-\theta_*,\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*^c) \rangle }\notag\\
%     &=\E{\sum_{c=1}^N p_c \langle \underbrace{\theta_k-\theta_k^c}_{=0}+\theta_k^c-\theta_*^c+\underbrace{\theta_*^c-\theta_*}_{=0},\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*^c) \rangle }\notag\\
%     &\geq m\E{\sum_{c=1}^N p_c \lrn{\theta_k^c-\theta_*^c}_2^2}\notag\\
%     &\geq m\E{\lrn{\sum_{c=1}^N p_c \bigg(\theta_k^c-\theta_*^c\bigg)}_2^2}:= m\E{\lrn{\theta_k-\theta_*}_2^2},
% \end{align}
% where the first inequality follows by the strong convexity assumption \ref{def:strong_convex} and the second inequality follows by the Jensen's inequality.

% \paragraph{Estimate of $\mathcal{I}_2$}  By the definition of $\nabla f(\theta)=\sum_{c=1}^N p_c \nabla f^c(\theta^c)$, Jensen's inequality, and the smoothness assumption \ref{def:smooth}, respectively, we have
% \begin{align}\label{estimate_of_i_2_v1}
%     \E{\| \nabla f(\theta_k)-\nabla f(\theta_*)\|_2^2}&=\E{\lrn{ \sum_{c=1}^N p_c\bigg(\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*^c)\bigg)}_2^2}\notag\\
%     &\leq \sum_{c=1}^N p_c\E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*^c)}_2^2}\notag\\
%     &\leq L^2 \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_*^c}_2^2}
% \end{align}

% Consider a decomposition $\theta_k^c-\theta_*^c=\theta_k^c-\theta_k+\theta_k-\theta_*+\theta_*-\theta_*^c$ for each client $c$ and Young's inequality, we can futher upper bound Eq.\eqref{estimate_of_i_2_v1} as follows
% \begin{align}\label{estimate_i2_v2}
%     \E{\| \nabla f(\theta_k)-\nabla f(\theta_*)\|_2^2}&\leq L^2 \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k+\theta_k-\theta_*+\theta_*-\theta_*^c}_2^2}\notag\\
%     &\leq 3 L^2 \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}+ 3L^2\E{\lrn{\theta_k-\theta_*}_2^2}+3L^2{\sum_{c=1}^N p_c\lrn{\theta_*-\theta_*^c}_2^2}\notag\\
%     &\leq 6L^2(\|\theta_0-\theta_*\|_2^2+\mathcal{U})+9L^2\E{\|\theta_k-\theta_*\|_2^2}+3L^2 \gamma,
%     % &\leq 3L^2\sum_{c=1}^N p_c \left(8\E{\|\theta_{K\lfloor\frac{k}{K}\rfloor}-\theta_*\|_2^2}+8\E{\|\theta_*^c-\theta_*\|_2^2} + 4 \frac{d}{m} (\tau + \sigma^2)\right)\notag\\
%     % &\qquad\qquad+ 3L^2\E{\lrn{\theta_k-\theta_*}_2^2}+3L^2 \gamma\notag\\
%     % &=27L^2\E{\lrn{\theta_k-\theta_*}_2^2}+27L^2\gamma +12\frac{d}{m}(\tau+\sigma_c^2)
% \end{align}
% where the second inequality follows by Lemma \ref{divergence} and $\gamma=\sum_{c=1}^N p_c\lrn{\theta_*-\theta_*^c}_2^2$.


% \paragraph{Combine $\mathcal{I}_1$ and $\mathcal{I}_2$}  Plugging Eq.~\eqref{estimate_i_1} and Eq.~\eqref{estimate_i2_v2} into Eq.~\eqref{eq:ip_2nd}, we have the following iterate
% \begin{align*}
% 	\E{\|\theta_{k+1}-\theta_*\|_2^2} 
% 	\leq & ~ \underbrace{(1-2\eta m+9\eta^2L^2)}_{q(\eta)} \cdot  \E{\|\theta_k-\theta_*\|_2^2} \notag \\
% 	&\qquad+6 \eta^2 L^2\left(\|\theta_0-\theta_*\|_2^2+\mathcal{U}\right)+3\eta^2 L^2\gamma+ 2\eta d\tau +\eta^2 d\sum_{c=1}^N p_c^2\sigma_c^2,\notag\\
% \end{align*}
% Since $q(\eta)$ is a quadratic equation and $q(0)=q ({2m}/(9L^2) )=1$, if $\eta\in (0,  2m / (9L^2))$, then we have $q(\eta) \in (0, 1]$. Further if $ \eta \in (0, m/(9L^2))$, then we have $1-q(\eta) \geq \eta m$.

% Recursively applying the above equation $k$ times gives us
% \begin{align}
% 	\E{\|\theta_k-\theta_*\|_2^2} &\le q(\eta)^k \E{\|\theta_0-\theta_*\|_2^2} \notag\\
% 	&\qquad\qquad+ \frac{1- q(\eta)^k}{1 - q(\eta)} \left(6 \eta^2 L^2\left(\|\theta_0-\theta_*\|_2^2+\mathcal{U}\right)+3\eta^2 L^2\gamma+ 2\eta d\tau +\eta^2 d\sum_{c=1}^N p_c^2\sigma_c^2\right)  \notag\\
% 	&\le \E{\|\theta_0-\theta_*\|_2^2} + \frac{1}{m}\left(6 \eta L^2\left(\|\theta_0-\theta_*\|_2^2+\mathcal{U}\right)+ 3\eta L^2\gamma+ 2d\tau +\eta d\sum_{c=1}^N p_c^2\sigma_c^2\right)\notag\\
% 	&\leq \bigg(1+\frac{6\eta L^2}{m}\bigg){\|\theta_0-\theta_*\|_2^2} + \frac{1}{m}\left(6 \eta L^2\mathcal{U}+ 3\eta L^2\gamma+ 2d\tau +\eta d\sum_{c=1}^N p_c^2\sigma_c^2\right)\notag\\
% 	&\leq 2{\|\theta_0-\theta_*\|_2^2} + \frac{4d\tau}{m}+ \frac{3\eta}{m}\left(d\max_{i\in[N]}\sigma^2_i+(L^2+d)\max_{i\in[N]} \lrn{\theta_*^i-\theta_*}_2^2\right), \label{almost_final}
% \end{align}
% where the last inequality follows by $\eta\leq m/(9L^2)$, $\gamma=\sum_{c=1}^N p_c\lrn{\theta_*-\theta_*^c}_2^2\leq \max_{i\in[N]} \lrn{\theta_*-\theta_*^i}_2^2$, $\sum_{c=1}^N p_c^2\sigma_c^2\leq \max_{i\in[N]}\sigma_i^2 \sum_{c=1}^N p_c^2\leq \max_{i\in[N]}\sigma_i^2$ and the definition of  $\mathcal{U}$.
% \end{proof}


\begin{lemma}[Initial condition] 
\label{lem:W2_init_bound}
Let $\mu_0$ denote the Dirac delta distribution at $\theta_0$. % and assume $\lrn{\theta_0-\theta_*}_2^2\leq d\mathcal{D}^2$.
Then, we have
\begin{align*}
W_2(\mu_0, \pi)\leq 2 (\| \theta_0 - \theta_* \|_2 +  \sqrt{d/m} ). %\sqrt{2d\left(\mathcal{D}^2+\frac{2}{m}\right)}.
\end{align*}
\end{lemma}

\begin{proof}
By \cite{ccbj18}, there exists an optimal coupling between $\mu_0$ and $\pi$ such that
\begin{align*}
    W_2^2(\mu_0, \pi) 
    \leq & ~ \mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta\|_2^2 ]\\
    \leq & ~ 2\mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta_*\|_2^2 ] + 2 \mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2] \\
    = & ~ 2\| \theta_0 - \theta_* \|_2^2 +2\mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2]\\
    \leq & ~ 2\| \theta_0 - \theta_* \|_2^2 + 4d/m,
\end{align*}
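where the second step follows from the inequality $\|a+b\|_2^2\leq 2\|a\|_2^2+2\|b\|_2^2$, the third step holds because $\theta_0$ and $\theta_*$ are deterministic, and the last step follows from Theorem 17 of~\cite{ccbj18}.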
\end{proof}



\begin{lemma}[Discretization error]\label{lem:estimate_of_I}
For any fixed learning rate $\eta\in(0, \frac{m}{9L^2}]$ and any client $c\in[N]$, the process $(\htheta^c_s)$ based on the dynamics of Eq.~\eqref{eq:continuous_interpolation} satisfies the following estimate
\begin{align*}
    \E{ \big\| \htheta^c_{s} - \htheta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 2\eta(\eta G+8d\tau),
\end{align*}
where $G=4L^2 \|\theta_0-\theta_*\|_2^2 +\frac{8L^2 d\tau}{m}+(d+4L^2) \max_{i\in[N]}\left(\sigma^2 +\lrn{\theta_*^i-\theta_*}_2^2\right)$.
\end{lemma}
\begin{proof}
For any $s\in[0,\infty)$, there exists a unique integer $k \geq 0$ such that $s\in [k\eta, (k+1)\eta)$. By the dynamics of Eq.~\eqref{eq:continuous_interpolation}, we have
\begin{align*}
    \htheta_{s}^c = \htheta^c_{\eta\lfloor\frac{s}{\eta} \rfloor}-(s-k\eta) \nabla f^c(\widehat\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor })+\sqrt{2\tau}\int_{k\eta}^s \d \widehat W_t,
\end{align*}
which suggests that 
\begin{align*}
    \sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \htheta^c_{s}-\htheta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2 \leq (s-k\eta) \big\| \nabla f^c(\widehat\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2+\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \widehat W_t}_2.
\end{align*}
Squaring both sides, applying Young's inequality, and taking expectations, we obtain
\begin{align*}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \htheta^c_{s}-\htheta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 2\E{\big\|(s-k\eta)\nabla f^c(\widehat\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+2\E{\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \widehat W_t}_2^2}.
\end{align*}
Then, by Burkholder-Davis-Gundy inequality and It\^{o} isometry, we have
\begin{align}
    \label{eq:1st_part}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \htheta^c_{s}-\htheta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}&\leq 2\E{ \big\| (s-k\eta)\nabla f^c(\widehat\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+8\sum_{i=1}^d\E{\int_{k\eta}^s 2\tau \d t} \notag \\
    &\leq 2\eta^2\E{ \big\| \nabla f^c(\widehat\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+16 \eta d\tau.
\end{align}

By Young's inequality and the fact that $\theta_*^c$ is the global minimizer of $f^c$ (so that $\nabla f^c(\theta_*^c)=0$), we have
\begin{align}\label{eq:2nd_part}
    \E{ \| \nabla f^c(\widehat\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \|_2^2}
    = & ~ \E{\| \nabla f^c(\widehat\theta_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*^c)+\nabla f^c(\theta_*^c) \|_2^2} \notag \\
    \leq & ~ 2\E{\| \nabla f^c(\widehat\theta_{\eta\lfloor \frac{s}{\eta} \rfloor })-\nabla f^c(\theta_*^c) \|_2^2}.
\end{align}

In what follows, by the smoothness assumption \ref{def:smooth}, the fact that $\mathcal{L}(\widehat \theta_{\eta\lfloor \frac{s}{\eta} \rfloor})=\mathcal{L}(\theta_{\eta\lfloor \frac{s}{\eta} \rfloor})$, we have 
\begin{align}\label{eq:2nd_part_continued}
    \E{ \| \nabla f^c(\widehat\theta_{\eta\lfloor \frac{s}{\eta} \rfloor }) \|_2^2}
    \leq & ~ 2L^2 \E{\|\widehat\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }-\theta_*^c \|_2^2} \notag \\
     \leq & ~ 2L^2 \E{\|\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }-\theta_*+\theta_*-\theta_*^c \|_2^2} \notag \\
    \leq & ~ 4L^2 \E{\|\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }-\theta_*\|_2^2}+4L^2 {\|\theta_*^c-\theta_*\|_2^2}\notag\\
    \leq & 4L^2 \|\theta_0-\theta_*\|_2^2 +\frac{8L^2 d}{m} \left(\tau + \eta\max_{i\in[N]}\left(\sigma^2 +\lrn{\theta_*^i-\theta_*}_2^2\right)\right)+4L^2 {\|\theta_*^c-\theta_*\|_2^2}\notag\\
    \leq & 4L^2 \|\theta_0-\theta_*\|_2^2 +\frac{8L^2 d\tau}{m}+(d+4L^2) \max_{i\in[N]}\left(\sigma^2 +\lrn{\theta_*^i-\theta_*}_2^2\right):=G,
\end{align}
where the third inequality follows by Young's inequality, the fourth step follows from Lemma \ref{lem:L2_bound_local}, and the last step holds by applying $\eta\in (0, \frac{m}{9L^2}]$.


Combining Eq.~\eqref{eq:1st_part} and Eq.~\eqref{eq:2nd_part_continued}, we have
\begin{align}\label{eq:combined_bound}
\E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \htheta^c_{s}-\htheta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}
&\leq 2\eta(\eta G+8d\tau),
\end{align}
where $G=4L^2 \|\theta_0-\theta_*\|_2^2 +\frac{8L^2 d\tau}{m}+(d+4L^2) \max_{i\in[N]}\left(\sigma^2 +\lrn{\theta_*^i-\theta_*}_2^2\right)$.
 
\end{proof}


\begin{lemma}[Stochastic variance] 
\label{lem:total_variance}
Given assumption \ref{def:variance}, we have 
\begin{equation*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}\leq d \sigma^2 \sum_{c=1}^N p_c^2 ,\qquad \forall \ \theta\in\R^d.
\end{equation*}
\end{lemma}

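\begin{proof} By Assumption~\ref{def:variance} and the independence of the zero-mean stochastic gradient noise across clients (so that the cross terms vanish), we have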

\begin{align*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}&=\E{\lrn{\sum_{c=1}^N p_c\bigg(\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)\bigg)}_2^2}\\
    &=\sum_{c=1}^N p_c^2\E{\lrn{\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)}_2^2}\\
    &\leq d \sigma^2 \sum_{c=1}^N p_c^2.
\end{align*}

\end{proof}



\section{Non-strongly convex cases}
We can prove convergence in the non-strongly convex case by proving the following lemmas.
\begin{lemma}
Assume $\rE_z \phi(w,z)$ is  convex and Lipschitz:
$\|\nabla \phi(w,z)\|_2 \leq G$.
We have
\[
2 \eta_t \rE_z [ \phi(\tilde{w}_t,z)- \phi(w,z)]
\leq \|\tilde{w}_t -w \|_2^2 - \rE_{w_t|\tilde{w}_t} \|w_{t}-w\|_2^2 + \eta_t^2 G^2 .
\]
\label{lem:convex_result}
\end{lemma}

It implies the following bound.
\begin{lemma}
Define
\[
\phi(q)= \rE_{w \sim q} \rE_z \ell(w,z) 
- \rE_{w \sim q} \ln p_0(w) ,
\]
then we have
\[
2 \eta_t [\phi(\tilde{p}_t)- \phi(p)]
\leq  W_2(\tilde{p}_t,p)^2 - W_2(p_t,p)^2
+ \eta_t^2 G^2 . 
\]
\label{lem:sgd}
\end{lemma}


\textcolor{red}{
To Do List
\begin{itemize}
    \item variance analysis of stochastic gradients based on $K$ local steps
    \item general decaying learning rates ($n^{-\alpha}$, where $\alpha\in (0, 1]$)
    \item different sampling scheme of clients
    \item non-convex extensions?
    \item simulations?
\end{itemize}
}
% Inspired by Yian
% \begin{proof}
% Define two processes, $\htheta_t$ and $\bar\theta_t$, where
% $\htheta_0 = \theta_0$, $\bar\theta_0\sim p^*$, and
% \begin{align}
% d {\htheta}_t = - \nabla f(\htheta_{kh}) d t + \sqrt{2} d \hat{W}_t,
% \end{align}
% and 
% \begin{align}
% d \bar\theta_t = - \nabla f(\bar\theta_t) d t + \sqrt{2} d \overline{W}_t.
% \end{align}
% Consider It\^{o}'s formula for the sequence of $\frac{1}{2}  \lrn{\htheta_t - \bar\theta_t}_2^2$
% \begin{align*}
% &\frac{1}{2} d  \lrn{\htheta_t - \bar\theta_t}_2^2 \\
% &= \lrw{ \htheta_t - \bar\theta_t, d \htheta_t - d \bar\theta_t } + \mathrm{Tr}\lrp{ d^2 \htheta_t - d^2 \bar\theta_t } \\
% &= \lrw{ \htheta_t - \bar\theta_t, - \lrp{\nabla f(\htheta_{kh}) - \nabla f(\bar\theta_t)} d t + \sqrt{2}\lrp{d \hat{W}_t - d \overline{W}_t} } + 2 \mathrm{Tr}\lrp{ d^2 \hat{W}_t - d^2 \overline{W}_t }.
% \end{align*}
% Taking $\hat{W}_t = \overline{W}_t$ defines a coupling between the two processes and leads to
% \begin{align*}
% \frac{1}{2} d \lrn{\htheta_t - \bar\theta_t}_2^2
% &= - \lrw{ \htheta_t - \bar\theta_t, \lrp{\nabla f(\htheta_t) - \nabla f(\bar\theta_t)} } d t + \lrw{ \htheta_t - \bar\theta_t, \lrp{\nabla f(\htheta_{t}) - \nabla f(\htheta_{kh})} } d t \\
% &\leq - m \lrn{\htheta_t - \bar\theta_t}_2^2 d t + \frac{m}{2} \lrn{\htheta_t - \bar\theta_t}_2^2 d t + \frac{1}{2m} \lrn{\nabla f(\htheta_{t}) - \nabla f(\htheta_{kh})}_2^2 d t \\
% &= - \frac{m}{2} \lrn{\htheta_t - \bar\theta_t}_2^2 d t + \frac{1}{2m} \lrn{\nabla f(\htheta_{t}) - \nabla f(\htheta_{kh})}_2^2 d t.
% \end{align*}

% After using Gronwall's inequality, take $\Ep{(\htheta_t, \bar\theta_t) \sim \gamma(\mu_t,\mu^*)}{\cdot}$.
% \end{proof}