\section{Our Algorithm}

Let $N$ denote the number of clients. Let $T$ denote the number of global steps. Let $K$ denote the number of local steps. For each $c \in [N]:=\{1,2,\cdots, N\}$, we use $f^c$ and $\nabla f^c$ to denote the loss function and its gradient for client $c$ (based on the data held by client $c$). For the stochastic gradient oracle, we denote by $\nabla \tilde f^c(\cdot)$ an unbiased estimate of the exact gradient $\nabla f^c$. In addition, we denote by $p^c$ the weight of the $c$-th device such that $p^c\geq 0$ and $\sum_{c=1}^N p^c=1$. Finally, $\xi_k$ denotes a standard $d$-dimensional Gaussian vector at iteration $k$, which does not depend on the device index; this can be achieved by maintaining the same random seed for each device $c\in[N]$.
% Zhao's first algorithm formulation
% \begin{algorithm*}[h]\caption{Training via Federated Averaging Algorithm }\label{alg:alg_main_text}
% \begin{algorithmic}[1]
% \State $u_r(0) \sim \N(0,I_d)$ for $r\in [m]$. \Comment{$u \in \R^{d \times m}$}
% \For{$t = 1, \ldots, T$} \Comment{$T$ denotes the number of global steps}
%     \For{$c = 1, \ldots, N$} \Comment{$N$ denote the total number of clients}
%         \State $w_{0,c}(t) \leftarrow u(t)$ \Comment{$w_{0,c}(t),u(t) \in \R^{d \times m}$}
%         \For{$k=1,\ldots,K$} \Comment{$K$ denotes the number of local steps}
%             \State $w_{k,c}(t) \leftarrow w_{k-1,c}(t) - \eta_{\mathrm{local}} \cdot \frac{\partial L_c}{\partial w} |_{w = w_{k-1,c}(t) }$ +{\color{red}noise}
%         \EndFor
%         \State
%         $\Delta u_c \leftarrow w_{k,c}(t) - u(t)$
%     \EndFor
%     \State $\Delta u \leftarrow \frac{1}{N} \sum_{c \in [N]} \Delta u_c$ \Comment{$\Delta u \in \R^{d \times m}$}
%     \State $u(t+1) \leftarrow u(t) + \eta_{\mathrm{global}} \Delta u$ + {\color{red}noise} \Comment{$u(t+1)\in \R^{d \times m}$, add noise here, the novelty is less}
% \EndFor
% \end{algorithmic}
% \end{algorithm*}


\begin{algorithm*}[h]\caption{Training via Federated Averaging Algorithm. Denote by $\theta_k^c$ the model parameter in the $c$-th device at the $k$-th step. Denote the intermediate result of a one-step SGLD update from $\theta_k^c$ by $\beta_k^c$. If $k+1 \text{ mod }E=0$, a global synchronization is conducted.}\label{alg:alg_main_text_same_seed}
\begin{algorithmic}[1]
\State \begin{equation}\label{local_device}
    \beta_{k+1}^c=\theta_k^c-\eta_k\nabla \tilde f^c(\theta_k^c)+\xi_k,
\end{equation}
\State
\begin{equation*}  
\label{undampedsgld}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } E\neq 0 \\  
              & \\
             \sum_{c=1}^N p^c \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } E=0
             \end{array}  
\right.  
\end{equation*} 
\end{algorithmic}
\end{algorithm*}

Inspired by \cite{lhy+19}, we define two virtual sequences $\beta_k=\sum_{c=1}^N p^c \beta_k^c$ and $\theta_k=\sum_{c=1}^N p^c \theta_k^c$, which are both inaccessible when $k \text{ mod } E\neq 0$. We also define $g_k=\sum_{c=1}^N p^c \nabla f^c(\theta_k^c)$ and $\tilde g_k=\sum_{c=1}^N p^c \nabla \tilde f^c(\theta_k^c)$. In what follows, it is clear that $\E{\tilde g_k}=g_k$. Moreover, we always have $\beta_k=\theta_k$ whether $k \text{ mod } E=0$ or not. Taking the $p^c$-weighted sum of Eq.~\eqref{local_device} over the devices $c=1$ to $N$ and using $\sum_{c=1}^N p^c=1$, we have
\begin{equation*}
    \theta_{k+1}=\theta_k-\eta_k \tilde g_k+\xi_k.
\end{equation*}

The above formulation resembles the SGLD algorithm except that the construction and analysis of stochastic gradients are different.

\paragraph{Quantifying non-i.i.d.\ data} Denote by $f_*$ the minimum value of the global loss $f:=\sum_{c=1}^N p^c f^c$ and by $f^c_*$ the minimum value of $f^c$ for each device $c\in [N]$. We quantify the degree of non-i.i.d.\ data by $\gamma:=f_*-\sum_{c=1}^N p^c f^c_*$, which is a non-negative constant and becomes larger as the data become less identically distributed.


\iffalse
\subsection{Our plan}

What are the assumptions do we need ..
\begin{itemize}
    \item shusen wang's paper
\end{itemize}

We need to prove a new version of Lemma 3 in page 12 in \cite{lhy+19}.
\begin{lemma}[Lemma 3 in page 12 in \cite{lhy+19}]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p^c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2 
\end{align*}
\end{lemma}

We need to generalize the above lemma to something as follows:
\begin{lemma}[Our version]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p^c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2  + \| \mathrm{noise} \|^2
\end{align*} 
\end{lemma}

Using Shusen's assumption, we can show Dala's paper \cite{dk19} page 7 contions are holding.

\fi


% \begin{table}[]
%     \centering
%     \begin{tabular}{|l|l|l|l|l|l|} \hline
%         {\bf Notations} & {\bf Ours} & \cite{dk19} & \cite{lhy+19} & \cite{ccbj18} \\ \hline
%         Function & $f$ & $f$ & $F$ & $f$ \\ \hline
%         Parameter &  & $\theta$ & $w$ & $x$ \\ \hline
%         Dimension & $d$ & $p$ & Never & $d$ \\ \hline
%         Smooth & $L$ & $M$ & $L$ & $L$ \\ \hline
%         Strongly convex & $m$ & $m$ & $\mu$ & $m$ \\ \hline
%         Global & & $K$ & $T$ & \\ \hline
%         Local steps & & $1$ & $K$ & \\ \hline
%         Variance & $\sigma^2 d$ & $\sigma^2 p$ & $\sigma^2$ & $\sigma^2 d$ \\ \hline
%         Learning rate & $\eta$ & $h$ & $\eta$ & $\delta$ \\ \hline
%         Choice of LR & & $h = 1/(m+M)$ & $\eta = 2 / (\mu T) $ &  \\\hline
%         \#Devices & & 1 & $N$ & \\ \hline
%         \#Datas per device & & & $n_k$ & \\ \hline
%     \end{tabular}
%     \caption{Notations to compare different papers. We put this table for easy of writing. There is no need to keep this in the final paper.}
%     \label{tab:my_label}
% \end{table}


\subsection{Main result}

% \subsubsection{Notations}

\subsubsection{Assumptions}

\begin{assumption}[Smoothness]\label{def:smooth} For each $c\in [N]$, we say $f^c$ is $L$-smooth if for some $L>0$
\begin{align*}
\| \nabla f^c(y)-\nabla f^c(x) \|_2 \leq L \| y-x \|_2,\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

Note that the above assumption implies that
\begin{align*}
f^c(y)\leq f^c(x)+\langle \nabla f^c(x),y-x \rangle+\frac{L}{2}\| y-x \|^2_2\quad \forall x, y\in \R^d.
\end{align*}

\begin{assumption}[Strongly convex]\label{def:strong_convex}
For each $c\in [N]$, $f^c$ is $m$-strongly convex if for some $m>0$
\begin{align*}
f^c(x)\geq f^c(y)+\langle \nabla f^c(y),x-y \rangle + \frac{m}{2} \| y-x \|_2^2\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

\begin{assumption}[Bounded variance]\label{def:variance}
For each $c\in [N]$, the variance of stochastic gradient $\nabla \tilde f^c(x)$ in each device is upper bounded such that
\begin{align*}
\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2] \leq \sigma_c^2 d,\quad \forall x\in \R^d.
\end{align*}
\end{assumption}

% Wei's first algorithm formulation based on standard SGLD
% \subsection{Formulation}
% Let $\theta_k\in\R^d$ be the $k$-th iterate of the following stochastic gradient Langevin algorithm.
% \begin{align}\label{eq:sgld}
%     \theta_{k+1}=\theta_k -\eta \nabla \widetilde f(\theta_k)+\sqrt{2\tau\eta}\xi_k,
% \end{align}
% where $\eta$ is the learning rate, $\tau$ is the temperature, $\xi_k$ is a standard $d$-dimensional Gaussian vector, and $\nabla \widetilde f(\theta)$ is an unbiased estimate of the exact gradient $\nabla f(\theta)$.

\subsubsection{Tools from previous work}

\textbf{Gr\"{o}nwall's inequality} is a standard tool for obtaining estimates of differential equations. Suppose that $a(\cdot)$ and $b(\cdot)$ are continuous real-valued functions and that $\psi(\cdot)$ is a differentiable real-valued function satisfying, for all $t\geq t_0$,
\begin{align*}
    \frac{\d}{\d t}\psi(t)\leq a(t)\psi(t)+b(t).
\end{align*}
Then 
\begin{align*}
    \psi(t)\leq \psi(t_0)e^{\int_{t_0}^t a(s)\d s} + \int_{t_0}^t e^{\int_{s}^t a(u)\d u}b(s)\d s.
\end{align*}

\textbf{Burkholder-Davis-Gundy inequality.} Let $\phi:[0, \infty)\rightarrow \mathbb{R}^{r\times d}$ for some positive integers $r$ and $d$. In addition, we assume $\E{\int_0^{\infty} |\phi(s)|^2 \d s}<\infty$ and let $Z(t)=\int_0^t \phi(s)\d W_s$, where $W_s$ is a $d$-dimensional Brownian motion. Then for all $t\geq 0$, we have

\begin{align*}
    \E{\sup_{0\leq s\leq t} |Z(s)|^2}\leq 4\E{\int_0^t|\phi(s)|^2\d s}.
\end{align*}
%\Zhao{The last term is $\phi(s)$ but taking the integral over $t$. This seems wired.}

\subsubsection{Wasserstein Distance}

% We denote the Borel $\sigma$-algebra 
We define the 2-Wasserstein distance between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
\begin{align*}
    W_2(\mu, \nu):=\inf_{\Gamma\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \Gamma(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
\end{align*}
where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$, where $\mathcal{L}(\cdot)$ denotes a distribution of a random variable.



% The following theorem is from previous work \cite{dk19}.
% \begin{theorem}[Theorem 4 in \cite{dk19}]
% Let $\theta_{K,h}$ be the $K$-th iterate of the nLMC algorithm and $\nu_K$ be its distribution. If the function $f$ satisfies condition (1) and $h \leq 2/(m+M)$ then
% \begin{align*}
%     W_2 (\nu_K, \pi) \leq (1-mh)^K W_2(\nu_0, \pi) + 1.65 (M/m) (hp)^{1/2} + \frac{\delta \sqrt{p} }{ m} + \frac{\sigma^2 (hp)^{1/2} }{1.65 M + \sigma \sqrt{m}}
% \end{align*}
% \end{theorem}


% Remove Assumption 4 \cite{dk19}. Follow ideas similar to Lemma 4  \cite{decent21}



\begin{theorem}\label{thm:non_asymptotic}

Assume assumptions \ref{def:strong_convex}, \ref{def:smooth}, and \ref{def:variance} hold. Then, for any learning rate $\eta \in (0 , \min\{1, {m}/{L^2}\})$ and any initialization satisfying $\| \theta_0-\theta_* \|_2 \leq \sqrt{d} \mathcal{D}$, where $\theta_*$ is a stationary point, we have
\begin{align*}
W_2(\mu_k, \pi) \leq e^{-{mk\eta}/{2}} \cdot 2 ( \sqrt{d} {\cal D} + \sqrt{d/m} ) + \sqrt{ 2d (\sigma^2+L^2 G\eta) / m^2}, %\frac{L}{m} \sqrt{G\eta d},
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $\pi$ denotes the stationary distribution, $\tau$ denotes the temperature, and $G:=25(\tau+m\mathcal{D}^2+\sigma^2)$.
\end{theorem}

\begin{proposition}
Assume that $\lrn{\nabla f(\theta_k) - \nabla \tilde{f}(\theta_k)}_2^2 \leq C_1 \epsilon \lrn{\theta_k-\theta_*}_2^2 + C_2 \epsilon$. 
Then 
\begin{align*}
W_2^2(\mu_{k+1}, \pi) \leq e^{-m\eta} W_2^2(\mu_k, \pi) + O(\eta \epsilon d).
\end{align*}
\end{proposition}
The proof follows that of Theorem 2 of~\cite{Bayes_Rob}.

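Therefore, unrolling this recursion and using $\sum_{j\geq 0} e^{-m\eta j}=1/(1-e^{-m\eta})=O(1/(m\eta))$, we obtain $W_2^2(\mu_k, \pi) \leq e^{-m\eta k} W_2^2(\mu_0, \pi) + O(\epsilon d / m)$.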

\begin{proof}[Proof of Theorem~\ref{thm:non_asymptotic}]
Denote by $\htheta_t$ the continuous-time interpolation of the stochastic gradient Langevin dynamics, defined as follows
\begin{align}\label{eq:continuous_interpolation}
\d {\htheta}_t = - \nabla \widetilde f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \d t + \sqrt{2\tau} \d \hat{W}_t,
\end{align}
where ${\htheta}_0=\theta_0$. For any $k\in \mathbb{N}^{+}$ and a time $t$ that satisfies $t=k\eta$, it is apparent that $\widehat\mu_t=\mathcal{L}({\htheta}_t)$ is the same as $\mu_k=\mathcal{L}(\theta_k)$. In addition, we define an auxiliary process $(\theta^*_t)$ that starts from the stationary distribution $\pi$
\begin{align}
\d \theta^*_t = - \nabla f(\theta^*_t) \d t + \sqrt{2\tau} \d W^*_t.
\end{align}
Consider It\^{o}'s formula for the sequence of $\frac{1}{2}  \| \htheta_t - \theta^*_t \|_2^2$
\begin{align*}
& ~ \frac{1}{2} \d  \| \htheta_t - \theta^*_t \|_2^2 \\
= & ~ \lrw{ \htheta_t - \theta^*_t, \d \htheta_t - \d \theta^*_t } + \mathrm{Tr}[ \d^2 \htheta_t - \d^2 \theta^*_t ] \\
= & ~ \lrw{ \htheta_t - \theta^*_t, \big(\nabla f(\theta^*_t) -\nabla\widetilde  f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big) \d t + \sqrt{2\tau}\big( \d \hat{W}_t - \d W^*_t \big) } + 2\tau \mathrm{Tr}[ \d^2 \hat{W}_t - \d^2 W^*_t ].
\end{align*}
Taking $\hat{W}_t = W^*_t$ defines a coupling between the two processes and leads to
\begin{align*}
\frac{1}{2} \d \| \htheta_t - \theta^*_t \|_2^2
&= \lrw{ \htheta_t - \theta^*_t, \nabla f(\theta^*_t)-\nabla f(\htheta_t)} \d t+ \lrw{ \htheta_t - \theta^*_t,  \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor})  } \d t \\
&\qquad \qquad+ \lrw{ \htheta_t - \theta^*_t, \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor}) } \d t \\
&\leq - m \| \htheta_t - \theta_t^* \|_2^2 \d t + \frac{m}{4} \| \htheta_t - \theta^*_t \|_2^2 \d t + \frac{1}{m} \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 \d t \\
&\qquad\qquad  + \frac{m}{4} \| \htheta_t - \theta^*_t \|_2^2\d t + \frac{1}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\d t\\
&\leq  - \frac{m}{2} \| \htheta_t - \theta_t^* \|_2^2 \d t + \frac{1}{m} \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 \d t+\frac{1}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\d t\\
&\leq - \frac{m}{2} \| \htheta_t - \theta_t^* \|_2^2 \d t + \frac{L^2}{m} \big\| \htheta_{t} - \htheta_{\eta\lfloor\frac{t}{\eta} \rfloor} \big\|_2^2 \d t+\frac{1}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\d t,
\end{align*}
where the first inequality follows from the strong-convexity property, the Cauchy--Schwarz inequality, and Young's inequality $ab\leq  (\frac{\sqrt{m}}{2}a)^2+({\frac{1}{\sqrt{m}}}b)^2$ (in particular, we do not attempt to optimize the constant $-\frac{m}{2}$ in front of the term $\| \htheta_t - \theta_t^* \|_2^2$), and the last inequality follows from the smoothness assumption \ref{def:smooth}.

Now apply Gr\"{o}nwall's inequality to the preceding inequality and take expectation with respect to a coupling $(\htheta_t, \theta^*_t) \sim \Gamma(\widehat\mu_t,\pi)$ to obtain
\begin{align}\label{eq:1st_gronwall}
     \E{ \|\htheta_t - \theta^*_t \|_2^2}\leq  \E{\| \htheta_0 - \theta^*_0 \|_2^2} e^{-mt}+\frac{2}{m}\int_0^t \bigg(d\sigma^2+ L^2\underbrace{\E{ \big\| \htheta_{s} - \htheta_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}}_{\mathcal{I}} \bigg) e^{-(t-s)m} \d s,
\end{align}
where assumption \ref{def:variance} is applied to $\E{\lrn{\nabla f(\htheta_{\eta\lfloor\frac{s}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{s}{\eta} \rfloor})}_2^2}$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Yian's revision:
\begin{align*}
\frac{1}{2} \d \| \htheta_t - \theta^*_t \|_2^2
&= \lrw{ \htheta_t - \theta^*_t, \nabla f(\theta^*_t)-\nabla f(\htheta_t)} \d t+ \lrw{ \htheta_t - \theta^*_t,  \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor})  } \d t \\
&\qquad \qquad+ \lrw{ \htheta_t - \theta^*_t, \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor}) } \d t \\
&\leq - \frac{m}{2} \| \htheta_t - \theta_t^* \|_2^2 \d t + \frac{1}{m} \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 \d t 
+ \frac{1}{m} \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2\d t.
\end{align*}
Therefore, 
\begin{align*}
\frac{1}{2} \d \lrp{ e^{m t} \| \htheta_t - \theta^*_t \|_2^2 } \leq \frac{1}{m} e^{m t} \lrp{ \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 + \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2 } \d t.
\end{align*}
By the fundamental theorem of calculus applied on the interval $[\eta k, \eta (k+1)]$ (and multiplying both sides by $2e^{-m\eta(k+1)}$),
\begin{align*}
&\| \htheta_{\eta (k+1)} - \theta^*_{\eta (k+1)} \|_2^2 - e^{-m \eta} \| \htheta_{\eta k} - \theta^*_{\eta k} \|_2^2 \\
&\leq \frac{2}{m} \int_{\eta k}^{\eta (k+1)} e^{m (t-\eta (k+1))} \lrp{ \big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2 + \lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2 } \d t.
\end{align*}
Taking expectation and using $e^{m (t-\eta (k+1))}\leq 1$ for $t\leq \eta(k+1)$, we obtain
\begin{align*}
& \E{\| \htheta_{\eta (k+1)} - \theta^*_{\eta (k+1)} \|_2^2} - e^{-m \eta} \E{\| \htheta_{\eta k} - \theta^*_{\eta k} \|_2^2} \\
&\leq \frac{2}{m} \int_{\eta k}^{\eta (k+1)} e^{m (t-\eta (k+1))} \lrp{ \E{\big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2} + \E{\lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2} } \d t \\
&\leq \frac{2\eta}{m} \sup_{t\in[\eta k, \eta (k+1)]}\lrp{ \E{\big\| \nabla f(\htheta_{t}) - \nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) \big\|_2^2} +\E{\lrn{\nabla f(\htheta_{\eta\lfloor\frac{t}{\eta} \rfloor}) - \nabla \widetilde f(\widehat \theta_{\eta\lfloor\frac{t}{\eta} \rfloor})}_2^2} }.
\end{align*}

\textcolor{red}{$\frac{\frac{2\eta}{m}}{1-e^{-m\eta}}\approx \frac{\frac{2\eta}{m}}{m\eta}= \frac{2}{m^2}$}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% \Zhao{In the second term of the above equation, is that $t/\eta$ actually $s/\eta$?} \textcolor{red}{nice catch}

%\paragraph{Estimate of $\mathcal{I}$} 

%\paragraph{Proof of Theorem \ref{thm:non_asymptotic} (continued)} 
Plugging the estimate of $\mathcal{I}$ in Lemma~\ref{lem:estimate_of_I} %Eq.~\eqref{eq:combined_bound} 
into Eq.~\eqref{eq:1st_gronwall}, we have
\begin{align*}
    \E{ \| \htheta_t - \theta^*_t \|_2^2}&\leq  \E{\| \htheta_0 - \theta^*_0 \|_2^2} e^{-mt}+\frac{2d}{m} (\sigma^2+L^2 G\eta) \int_0^t  e^{-(t-s)m} \d s\\
     &\leq \E{\| \htheta_0 - \theta^*_0 \|_2^2} e^{-mt}+\frac{2d}{m^2} (\sigma^2+L^2 G\eta).
\end{align*}

Recall that $\theta_k$ and $\widehat\theta_{k\eta}$ have the same distribution $\mu_k$. 


By the definition of $W_2$ distance, we have
\begin{align*}
W_2(\mu_k, \pi) 
%\leq \left(\E{ \| \htheta_{k\eta} - \theta^*_{k\eta} \|_2^2}\right)^{1/2}
\leq & ~ e^{-{mk\eta}/{2}} \cdot W_2(\mu_0, \pi) + \sqrt{ 2d (\sigma^2+L^2 G\eta) / m^2} \\
\leq & ~ e^{-{mk\eta}/{2}} \cdot 2 (\| \theta_0 - \theta_* \|_2 +  \sqrt{d/m} )+ \sqrt{ 2d(\sigma^2+L^2 G\eta) / m^2} \\
\leq & ~ e^{-{mk\eta}/{2}} \cdot 2 ( \sqrt{d} {\cal D} +  \sqrt{d/m} )+  \sqrt{ 2 d(\sigma^2+L^2 G\eta) / m^2},
\end{align*}
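where the first inequality follows by taking $t=k\eta$, choosing $(\htheta_0, \theta^*_0)$ to be an optimal coupling of $\mu_0$ and $\pi$, and applying $(a+b)^{1/2}\leq |a|^{1/2}+|b|^{1/2}$; the second one follows by Lemma~\ref{lem:W2_init_bound}; and the last step follows from the assumption on $\| \theta_0 - \theta_* \|_2$.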



\end{proof}


\subsection{Technical Results}

%\Zhao{We only use eqref for equation.}
\begin{lemma}[Uniform $\ell_2$ upper bound]
\label{lem:L2_bound}
Assume assumptions \ref{def:strong_convex}, \ref{def:smooth}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , \min\{1, {m}/{L^2}\})$ %and $\lrn{\theta_0-\theta_*}_2^2\leq d\mathcal{D}^2$, 
we have the $\ell_2$ norm upper bound as follows
\begin{align*}
\sup_k\E{\| \theta_k-\theta_* \|_2^2}\leq \| \theta_0-\theta_* \|_2^2 + 2   (\tau+\sigma^2 )d/m.
%\sup_k\E{\| \theta_k-\theta_* \|_2^2}\leq d\mathcal{D}^2 + 2 \frac{d}{m} (\tau+\sigma^2 ).
\end{align*}
\end{lemma}


\begin{proof}
	By the iterate $\theta_{k+1}=\theta_k -\eta \nabla \widetilde f(\theta_k)+\sqrt{2\tau\eta}\,\xi_k$, we have
	\begin{align}\label{eq:Langevin_L2_1}
\E{\|\theta_{k+1}-\theta_*\|_2^2}
		&= \E{\|\theta_k -\theta_*- \eta \nabla \widetilde f(\theta_k)\|_2^2} + \sqrt{8\eta\tau}\E{ \langle \theta_k -\theta_*- \eta \nabla \widetilde f(\theta_k), \xi_k \rangle } + 2\eta\tau\E{\|\xi_k\|_2^2} \notag \\
		&= \E{\|\theta_k -\theta_*- \eta \nabla \widetilde f(\theta_k)\|_2^2} + 2\eta d\tau,
	\end{align}	
	where the last equality follows from the independence of $\theta_k-\theta_*- \eta \nabla \widetilde f(\theta_k)$ and $\xi_k$, together with $\E{\xi_k}=0$ and $\E{\|\xi_k\|_2^2}=d$. Note that
\begin{align}\label{eq:ip_1st}
%\small
&\quad\ \E{\|\theta_k -\theta_*- \eta \nabla \widetilde f(\theta_k)\|_2^2} \notag\\
&= \E{\left\|\theta_k -\theta_*- \eta \nabla f(\theta_k) \right\|_2^2} + \eta^2\E{\|\nabla f(\theta_k)-\nabla \widetilde f(\theta_k)\|_2^2}  \notag\\
& \qquad\qquad + 2 \eta \E{ \langle \theta_k-\theta_*-\eta \nabla f(\theta_k),\nabla f(\theta_k)-\nabla\widetilde f(\theta_k) \rangle }  \notag\\
&= \E{\left\|\theta_k -\theta_*- \eta \nabla f(\theta_k) \right\|_2^2} + \eta^2\E{\|\nabla f(\theta_k)-\nabla \widetilde f(\theta_k)\|_2^2} \notag \\
&\leq \E{\|\theta_k -\theta_*- \eta \nabla f(\theta_k) \|_2^2}  + \eta^2 d\sigma^2, 
\end{align}
where the first step follows from simple algebra, the second step follows from the unbiasedness of the stochastic gradient, and the last step follows from Assumption \ref{def:variance}.


Recalling that $\theta_*$ is the stationary point satisfying $\nabla f(\theta_*)=0$, we can upper bound the first term on the right-hand side of Eq.~\eqref{eq:ip_1st} as follows
\begin{align}\label{eq:ip_2nd}
	&\quad\ \E{\|\theta_k -\theta_*- \eta \nabla f(\theta_k) \|_2^2}\notag\\
	&=\E{\|\theta_k -\theta_*- \eta (\nabla f(\theta_k)-\nabla f(\theta_*)) \|_2^2}\notag\\
	&= \E{\|\theta_k-\theta_*\|_2^2} - 2 \eta \E{ \langle \theta_k-\theta_*,\nabla f(\theta_k)-\nabla f(\theta_*) \rangle } + \eta^2\E{\| \nabla f(\theta_k)-\nabla f(\theta_*)\|_2^2} \notag\\
	&\le \E{\|\theta_k-\theta_*\|_2^2} - 2\eta m\E{\|\theta_k-\theta_*\|_2^2} + \eta^2 L^2\E{\|\theta_k-\theta_*\|_2^2} \notag\\
	&= \left(1-2\eta m + \eta^2 L^2\right)\E{\|\theta_k-\theta_*\|_2^2}, 
\end{align}
where the inequality follows from the strongly convex assumption (see Assumption~\ref{def:strong_convex}) and the smoothness assumption (see Assumption~\ref{def:smooth}).

Combining Eq.~\eqref{eq:Langevin_L2_1}, Eq.~\eqref{eq:ip_1st}, and Eq.~\eqref{eq:ip_2nd}, we have the following iterate
\begin{align*}
	\E{\|\theta_{k+1}-\theta_*\|_2^2} 
	\leq & ~ (1-2\eta m+\eta^2L^2) \cdot  \E{\|\theta_k-\theta_*\|_2^2} + 2\eta d\tau +\eta^2 d \sigma^2 \\
	\leq & ~ \underbrace{(1-2\eta m+\eta^2L^2)}_{:=g(\eta)} \cdot  \E{\|\theta_k-\theta_*\|_2^2} + 2\eta d (\tau + \sigma^2) 
\end{align*}
where the last step follows from $\eta \in (0, 1)$. Since $g(\eta)$ is a quadratic function of $\eta$ with $g(0)=g ({2m}/{L^2} )=1$ and minimum value $g(m/L^2)=1-m^2/L^2\geq 0$ (recall that $m\leq L$), we have $g(\eta) \in [0, 1)$ whenever $\eta\in (0,  2m / L^2  )$. Furthermore, if $ \eta \in (0, m/L^2 )$, then $1-g(\eta) = \eta(2m-\eta L^2) \geq \eta m$.

Recursively applying the above equation $k$ times gives us
\begin{align*}
	\E{\|\theta_k-\theta_*\|_2^2} &\le g(\eta)^k \cdot \E{\|\theta_0-\theta_*\|_2^2} + \frac{1- g(\eta)^k}{1 - g(\eta)} \cdot 2\eta d (\tau + \sigma^2)  \notag\\
	&\le \E{\|\theta_0-\theta_*\|_2^2} + 2 \frac{d}{m} (\tau + \sigma^2 ).
\end{align*}
%which completes the proof by the initialization of  $\|\theta_0-\theta_*\|^2\leq d \mathcal{D}^2$.


% Combining Eq.~\eqref{eq:Langevin_L2_1}, Eq.~\eqref{eq:ip_1st}, and Eq.~\eqref{eq:ip_2nd}, we have the following iterate
% \begin{align*}
% 	\E{\|\theta_{k+1}-\theta_*\|^2} 
% 	\leq & ~ (1-2\eta m+\eta^2L^2) \cdot  \E{\|\theta_k-\theta_*\|^2} + 2\eta d\tau +\eta^2 d \sigma^2 \\
% 	\leq & ~ (1-\eta m) \cdot  \E{\|\theta_k-\theta_*\|^2} + 2\eta d (\tau + \eta\sigma^2) 
% \end{align*}
% where the last step follows from $\eta \in (0, \frac{m}{L^2})$. 
% Recursively applying the above equation $k$ times gives us
% \begin{align*}
% 	\E{\|\theta_k-\theta_*\|^2} &\le (1-\eta m)^k \cdot \E{\|\theta_0-\theta_*\|^2} + \frac{1- (1-\eta m)^k}{\eta m} \cdot 2\eta d (\tau + \eta\sigma^2)  \notag\\
% 	&\le \E{\|\theta_0-\theta_*\|^2} + \frac{2d}{m} (\tau + \frac{\sigma^2}{m}),
% \end{align*}
% where the last inequality follows by $\eta\in(0, \frac{1}{m})$.
\end{proof}


\begin{lemma}[Initial condition] 
\label{lem:W2_init_bound}
Let $\mu_0$ denote the Dirac delta distribution at $\theta_0$.% and assume $\lrn{\theta_0-\theta_*}_2^2\leq d\mathcal{D}^2$.
Then, we have
\begin{align*}
W_2(\mu_0, \pi)\leq 2 (\| \theta_0 - \theta_* \|_2 +  \sqrt{d/m} ). %\sqrt{2d\left(\mathcal{D}^2+\frac{2}{m}\right)}.
\end{align*}
\end{lemma}

\begin{proof}
By \cite{ccbj18}, there exists one coupling between $\mu_0$ and $\pi$ such that
\begin{align*}
    W_2^2(\mu_0, \pi) 
    \leq & ~ \mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta\|_2^2 ]\\
    \leq & ~ 2\mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta_*\|_2^2 ] + 2 \mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2] \\
    = & ~ 2\| \theta_0 - \theta_* \|_2^2 +2\mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2]\\
    \leq & ~ 2\| \theta_0 - \theta_* \|_2^2 + 4d/m,
\end{align*}
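where the second step follows from the inequality $\|a+b\|_2^2\leq 2\|a\|_2^2+2\|b\|_2^2$ and the last step follows from Theorem 17 of~\cite{ccbj18}. Taking square roots and using $\sqrt{a+b}\leq \sqrt{a}+\sqrt{b}$ yields $W_2(\mu_0, \pi)\leq \sqrt{2}\,\| \theta_0 - \theta_* \|_2+2\sqrt{d/m}\leq 2 (\| \theta_0 - \theta_* \|_2 +  \sqrt{d/m} )$.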
\end{proof}



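\begin{lemma}[Estimate of $\mathcal{I}$]\label{lem:estimate_of_I}
Assume assumptions \ref{def:strong_convex}, \ref{def:smooth}, and \ref{def:variance} hold and $\| \theta_0-\theta_* \|_2 \leq \sqrt{d} \mathcal{D}$. For a fixed learning rate $\eta \in (0 , \min\{1, {m}/{L^2}\})$ and any $s\in [0, \infty)$, the iterates of $(\widehat \theta_s)$ based on the dynamics of Eq.~\eqref{eq:continuous_interpolation} satisfy the following estimate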
\begin{align*}
    \E{ \big\| \htheta_{s} - \htheta_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq \eta d \cdot G,
\end{align*}
where  $G:=25(\tau+m\mathcal{D}^2+\sigma^2)$.
\end{lemma}
\begin{proof}
For any $s\in[0,\infty)$, there exists a nonnegative integer $k$ such that $s\in [k\eta, (k+1)\eta)$, i.e., $k=\lfloor \frac{s}{\eta} \rfloor$ and $\eta\lfloor \frac{s}{\eta} \rfloor=k\eta$. By the dynamics of Eq.~\eqref{eq:continuous_interpolation}, we have
% NOTE(review): the interpolation in Eq. (eq:continuous_interpolation) is driven by the stochastic gradient \nabla \widetilde f, whereas this proof bounds the exact gradient \nabla f. Confirm whether an additional variance term (at most d\sigma^2 by the bounded-variance assumption) should be carried through the estimates below.
\begin{align*}
    \htheta_{s} = \htheta_{k\eta}-(s-k\eta) \nabla f(\htheta_{k\eta})+\sqrt{2\tau}\int_{k\eta}^s \d \widehat W_t,
\end{align*}
which, together with $s-k\eta\leq \eta$, suggests that 
\begin{align*}
    \sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \htheta_{s}-\htheta_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2 \leq \eta \big\| \nabla f(\htheta_{k\eta}) \big\|_2+\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \widehat W_t}_2.
\end{align*}
We first square both sides, apply the inequality $(a+b)^2\leq 2a^2+2b^2$, and take expectation
\begin{align*}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \htheta_{s}-\htheta_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 2\eta^2\E{\big\|\nabla f(\htheta_{k\eta}) \big\|_2^2}+2\E{\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau} \d \widehat W_t}_2^2}.
\end{align*}
Then, by Burkholder-Davis-Gundy inequality and It\^{o} isometry, we have
\begin{align}
    \label{eq:1st_part}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \htheta_{s}-\htheta_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}&\leq 2\eta^2\E{ \big\| \nabla f(\htheta_{k\eta}) \big\|_2^2}+8\sum_{i=1}^d\E{\int_{k\eta}^{(k+1)\eta} 2\tau \d t} \notag \\
    &= 2\eta^2\E{ \big\| \nabla f(\htheta_{k\eta}) \big\|_2^2}+16 \eta d\tau.
\end{align}
Recall that $\theta_*$ is a minimizer such that $\lrn{\nabla f(\theta_*)}_2=0$ and that $\htheta_{k\eta}$ has the same distribution as $\theta_k$. By the smoothness assumption \ref{def:smooth}, we have
\begin{align}\label{eq:2nd_part}
    \E{ \| \nabla f(\htheta_{k\eta}) \|_2^2}
    = & ~ \E{\| \nabla f(\htheta_{k\eta})-\nabla f(\theta_*)+\nabla f(\theta_*) \|_2^2} \notag \\
    \leq & ~ 2\E{\| \nabla f(\htheta_{k\eta})-\nabla f(\theta_*) \|_2^2}+2\lrn{\nabla f(\theta_*)}_2^2 \notag \\
    \leq & ~ 2L^2\E{\|\theta_k-\theta_* \|_2^2} \notag \\
    \leq & ~ 2L^2 \cdot ( \| \theta_0 - \theta_* \|_2^2 + 2(\tau+\sigma^2) d/m ) \notag \\
    \leq & ~ 4L^2 d\left(\mathcal{D}^2 +  (\tau+\sigma^2 )/m \right),
\end{align}
where the fourth step follows by Lemma~\ref{lem:L2_bound} and the last step follows from $\| \theta_0-\theta_* \|_2^2 \leq d \mathcal{D}^2$. 


Combining Eq.~\eqref{eq:1st_part} and Eq.~\eqref{eq:2nd_part}, we have
\begin{align}\label{eq:combined_bound}
\E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \htheta_{s}-\htheta_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}
\leq & ~ 2\eta^2 \cdot 4L^2 d(\mathcal{D}^2 +  (\tau+\sigma^2 )/ m) + 16 \eta d \tau  \notag \\
\leq & ~ 8 \eta d (m {\cal D}^2 + \tau + \sigma^2) + 16 \eta d \tau  \notag \\
\leq & ~\eta d \cdot G,
\end{align}
where the second step follows from $\eta\in (0, \min\{1, \frac{m}{L^2}\})$, and the last step follows from definition of $G:=25(\tau+m\mathcal{D}^2+\sigma^2)$.
 
\end{proof}


\textcolor{red}{
To Do List
\begin{itemize}
    \item variance analysis of stochastic gradients based on $K$ local steps
    \item general decaying learning rates ($n^{-\alpha}$, where $\alpha\in (0, 1]$)
    \item different sampling scheme of devices
    \item non-convex extensions?
    \item simulations?
\end{itemize}
}
% Inspired by Yian
% \begin{proof}
% Define two processes, $\htheta_t$ and $\theta^*_t$, where
% $\htheta_0 = \theta_0$, $\theta^*_0\sim p^*$, and
% \begin{align}
% d {\htheta}_t = - \nabla f(\htheta_{kh}) d t + \sqrt{2} d \hat{W}_t,
% \end{align}
% and 
% \begin{align}
% d \theta^*_t = - \nabla f(\theta^*_t) d t + \sqrt{2} d W^*_t.
% \end{align}
% Consider It\^{o}'s formula for the sequence of $\frac{1}{2}  \lrn{\htheta_t - \theta^*_t}_2^2$
% \begin{align*}
% &\frac{1}{2} d  \lrn{\htheta_t - \theta^*_t}_2^2 \\
% &= \lrw{ \htheta_t - \theta^*_t, d \htheta_t - d \theta^*_t } + \mathrm{Tr}\lrp{ d^2 \htheta_t - d^2 \theta^*_t } \\
% &= \lrw{ \htheta_t - \theta^*_t, - \lrp{\nabla f(\htheta_{kh}) - \nabla f(\theta^*_t)} d t + \sqrt{2}\lrp{d \hat{W}_t - d W^*_t} } + 2 \mathrm{Tr}\lrp{ d^2 \hat{W}_t - d^2 W^*_t }.
% \end{align*}
% Taking $\hat{W}_t = W^*_t$ defines a coupling between the two processes and leads to
% \begin{align*}
% \frac{1}{2} d \lrn{\htheta_t - \theta^*_t}_2^2
% &= - \lrw{ \htheta_t - \theta^*_t, \lrp{\nabla f(\htheta_t) - \nabla f(\theta^*_t)} } d t + \lrw{ \htheta_t - \theta^*_t, \lrp{\nabla f(\htheta_{t}) - \nabla f(\htheta_{kh})} } d t \\
% &\leq - m \lrn{\htheta_t - \theta_t^*}_2^2 d t + \frac{m}{2} \lrn{\htheta_t - \theta^*_t}_2^2 d t + \frac{1}{2m} \lrn{\nabla f(\htheta_{t}) - \nabla f(\htheta_{kh})}_2^2 d t \\
% &= - \frac{m}{2} \lrn{\htheta_t - \theta_t^*}_2^2 d t + \frac{1}{2m} \lrn{\nabla f(\htheta_{t}) - \nabla f(\htheta_{kh})}_2^2 d t.
% \end{align*}

% After using Gronwall's inequality, take $\Ep{(\htheta_t, \theta^*_t) \sim \gamma(\mu_t,\mu^*)}{\cdot}$.
% \end{proof}