
\paragraph{Roadmap.}
In Section~\ref{sec:preli}, we lay out the formulation of the algorithm, basic notations, and definitions. In Section~\ref{sec:full_device_participation}, we present the main convergence analysis for full device participation. We discuss the optimal number of local updates based on a fixed learning rate, the acceleration achieved by varying learning rates, and the privacy-accuracy trade-off through correlated noises. In Section~\ref{sec:partial_device_participation}, we analyze the convergence of partial device participation through two device-sampling schemes. In Section~\ref{sec:bouding_contraction_discretization_divergence}, we provide lemmas to upper bound the contraction, discretization and divergence for proving the main convergence results. In Section~\ref{sec:uniform_upper_bound}, we include supporting lemmas to prove results in the previous section. In Section~\ref{sec:initial_condition}, we establish the initial condition. In Section~\ref{dp_guarantee}, we prove differential privacy guarantees. 


% \setcounter{algorithm}{0}


\section{Preliminaries}\label{sec:preli}

\subsection{Basic notations and backgrounds} 

Let $N$ denote the number of clients. Let $T_{\epsilon}$ denote the number of global steps to achieve the precision $\epsilon$. Let $K$ denote the number of local steps. For each $c \in [N]:=\{1,2,\cdots, N\}$, we use $f^c$ and $\nabla f^c$ to denote the loss function and the gradient of the function $f^c$ in client $c$. Notably, $\nabla f$ is not a standard gradient operator acting on $ f$ when multiple local steps are adopted ($K>1$). For the stochastic gradient oracle, we denote by $\nabla \tilde f^c(\cdot)$ the \emph{unbiased} estimate of the exact gradient $\nabla f^c$ of client $c$. In addition, we denote $p_c$ as the weight of the $c$-th client such that $p_c\geq 0$ and $\sum_{c=1}^N p_c=1$. $\xi_k^c$ is an independent standard $d$-dimensional Gaussian vector at iteration $k$ for each client $c\in[N]$ and $\dot\xi_k$ is a unique Gaussian vector shared by all the clients.

\begin{algorithm*}[h]\caption{Federated averaging Langevin dynamics algorithm (FA-LD). Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. $\xi_k^c$ is an independent standard $d$-dimensional Gaussian vector at iteration $k$ for each client $c\in[N]$. A global synchronization is conducted every $K$ steps. This is a complete version of Algorithm~\ref{alg:alg_main_paper_text_independent_noise}.}\label{alg:alg_main_text_independent_noise}
% \begin{algorithmic}[1]
\begin{equation}\label{local_client}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau/p_c}\xi_k^c,
\end{equation}
\begin{equation}  
\label{synchronization}
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation} 
% \end{algorithmic}
\end{algorithm*}

Inspired by \cite{lhy+20}, we define two virtual sequences 
\begin{equation}
\label{virtual_seq}
\beta_k=\sum_{c=1}^N p_c \beta_k^c, \qquad \theta_k=\sum_{c=1}^N p_c \theta_k^c,
\end{equation}
which are \emph{both inaccessible when $k \text{ mod } K\neq 0$}. For the gradients and injected noise, we also define 
\begin{equation}
\label{sum_grad}
\nabla f(\theta_k)=\sum_{c=1}^N p_c \nabla f^c(\theta_k^c), \quad \nabla\tilde f(\theta_k)=\sum_{c=1}^N p_c \nabla \tilde f^c(\theta_k^c), \quad \xi_k=\sum_{c=1}^N \sqrt{p_c} \xi_k^c.
\end{equation}



In what follows, it is clear that $\E{\nabla \tilde f(\theta)}=\sum_{c=1}^N p_c \E{\nabla \tilde f^c(\theta^c)}=\nabla f(\theta)$ for any $\theta^c\in\R^d$ and any $c\in[N]$. Summing Eq.\eqref{local_client} from clients $c=1$ to $N$ and combining Eq.\eqref{virtual_seq} and Eq.\eqref{sum_grad}, we have
\begin{align}
\label{fed_avg_langevin_dynamics_preliminary}
    \beta_{k+1}&=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k.
\end{align}
Moreover, we always have $\beta_k=\theta_k$ whether $k \text{ mod } K=0$ or not by Eq.\eqref{synchronization} and Eq.\eqref{virtual_seq}. In what follows, we can write
\begin{equation}
\label{fed_avg_langevin_dynamics}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k,
\end{equation}
which resembles the SGLD algorithm \cite{Welling11} except that the construction of stochastic gradients is different and $\theta_k$ is \emph{not accessible when $k\text{ mod } K\neq 0$}. To facilitate the analysis, we also define an auxiliary continuous-time process $(\bar\theta_t)_{t\geq 0}$ 
\begin{align}
\label{continuous_dynamics}
\d \bar\theta_t = - \nabla f(\bar\theta_t) \cdot \d t + \sqrt{2\tau} \cdot \d \overline{W}_t,
\end{align}
where $\bar\theta_t=\sum_{c=1}^N p_c \bar\theta_t^c$, $\nabla f(\bar\theta_t)=\sum_{c=1}^N p_c \nabla f^c(\bar\theta_t^c)$, $\bar\theta_t^c$ is the continuous-time variable at client $c$, and $\overline{W}$ is a $d$-dimensional Brownian motion. The continuous-time algorithm is referred to as Federated Averaging Langevin diffusion and is described as
\begin{align*}\label{local_client_continuous}
    \d \bar\beta_{t}^c &=-\nabla f^c(\bar\theta_t^c) \cdot \d t+\sqrt{2\tau/p_c} \cdot \d \overline{W}^c_t \notag\\
    \quad\bar\theta_{t}^c&=\sum_{c=1}^N p_c \bar\beta_{t}^c.
\end{align*}
Since the synchronization step is conducted at every time step $t$, the Federated Averaging Langevin diffusion performs the same as the standard Langevin diffusion with the temperature $\tau$ and converges to the stationary distribution $\pi(\theta)\propto \exp(- f(\theta) / \tau )$, where $f(\theta)=\sum_{c=1}^N p_c  f^c(\theta)$. Assume that $\bar\theta_0$ is drawn from the stationary distribution $\pi$, then it follows that $\bar\theta_t\sim\pi$ for any $t\geq 0$.






\iffalse
\subsection{Our plan}

What are the assumptions do we need ..
\begin{itemize}
    \item shusen wang's paper
\end{itemize}

We need to prove a new version of Lemma 3 in page 12 in \cite{lhy+20}.
\begin{lemma}[Lemma 3 in page 12 in \cite{lhy+20}]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2 
\end{align*}
\end{lemma}

We need to generalize the above lemma to something as follows:
\begin{lemma}[Our version]
We have
\begin{align*}
    \mathbb{E} \Big[ \sum_{c=1}^N p_c \| u(t+1) - w_{K,c}(t)  \|^2 \Big] \leq 4 \eta_t^2 (E-1)^2 G^2  + \| \mathrm{noise} \|^2
\end{align*} 
\end{lemma}

Using Shusen's assumption, we can show Dala's paper \cite{dk19} page 7 contions are holding.

\fi






\subsection{Assumptions and definitions}

\begin{assumption}[Smoothness, restatement of Assumption \ref{def:smooth_main}]\label{def:smooth} For each $c\in [N]$, we say $f^c$ is $L$-smooth if for some $L>0$
\begin{align*}
f^c(y)\leq f^c(x)+\langle \nabla f^c(x),y-x \rangle+\frac{L}{2}\| y-x \|^2_2\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

Note that the above assumption is equivalent to saying that
\begin{align*}
\| \nabla f^c(y)-\nabla f^c(x) \|_2 \leq L \| y-x \|_2,\quad \forall x, y\in \R^d.
\end{align*}

\begin{assumption}[Strong convexity, restatement of Assumption \ref{def:strong_convex_main}]\label{def:strong_convex}
For each $c\in [N]$, $f^c$ is $m$-strongly convex if for some $m>0$
\begin{align*}
f^c(x)\geq f^c(y)+\langle \nabla f^c(y),x-y \rangle + \frac{m}{2} \| y-x \|_2^2\quad \forall x, y\in \R^d.
\end{align*}
\end{assumption}

An alternative formulation for strong convexity is that
\begin{align*}
\langle \nabla f^c(x)-\nabla  f^c(y), x-y\rangle \geq m \lrn{x-y}_2^2\quad \forall x, y\in \R^d.
\end{align*}

%restatement of assumption \ref{def:variance_main} in the main paper, which 
% The following assumption 
% ensures a bounded distance between $\nabla \tilde f^c$ and $\nabla f^c$ in $\ell_2$ norm for each client $c\in[N]$.
\begin{assumption}[Bounded variance, restatement of Assumption \ref{def:variance_main}]\label{def:variance}
For each $c\in [N]$, the variance of noise in the stochastic gradient $\nabla \tilde f^c(x)$ in each client is upper bounded such that 
\begin{align*}
\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2] \leq \sigma^2 d,\quad \forall x\in \R^d.
\end{align*}
\end{assumption}

The bounded variance in the stochastic gradient is a rather standard assumption and has been widely used in \cite{ccbj18, dk19, lhy+20}. Extension of bounded variance to unbounded cases such as $\mathbb{E}[ \| \nabla \tilde f^c(x) - \nabla f^c(x) \|_2^2]\leq \delta (L^2 \|x\|_2^2 + B^2)$ for some $B\geq 0$ and $\delta\in[0,1)$ is quite straightforward and has been adopted in assumption A.4 stated in \cite{Maxim17}. The proof framework remains the same.


\paragraph{Quality of non-i.i.d data} Denote by $\theta_*$ the global minimum of $f$. Next, we quantify the degree of the non-i.i.d data by $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2$, which is a non-negative constant and yields a smaller scale if the data is more evenly distributed.

\begin{definition}\label{def:H_kappa_gamma}
We define the parameters $T_{c, \rho}$, $H_{\rho}$, $\kappa$, and $\gamma^2$ as
\begin{align*}
    T_{c,\rho}: = & ~ \tau(\rho^2+(1-\rho^2)/p_c),\\
    H_{\rho}: = & ~  \underbrace{ \mathcal{D}^2}_{\small{\mathrm{initialization}}}+\underbrace{ \frac{1}{m}\max_{c\in[N]} T_{c,\rho}}_{\small{\mathrm{injected~noise}}} +\underbrace{\frac{\gamma^2}{m^2d}}_{\small{\mathrm{data~heterogeneity}}}+\underbrace{\frac{\sigma^2}{m^2}}_{\small{\mathrm{stochastic~noise}}},\\
    \kappa := & ~ L / m , \\
    \gamma^2 : = & ~ \max_{c \in [N]} \| \nabla f^c (\theta_*) \|_2^2 .
\end{align*}
\end{definition}







\section{Full device participation}\label{sec:full_device_participation}

\subsection{One-step update}
\paragraph{Wasserstein distance}

% We denote the Borel $\sigma$-algebra 
We define the 2-Wasserstein distance between a pair of Borel probability measures $\mu$ and $\nu$ on $\R^d$ as follows  
\begin{align*}
    W_2(\mu, \nu):=\inf_{\Gamma\in \text{Couplings}(\mu, \nu)}\left(\int\|\bbeta_{\mu}-\bbeta_{\nu}\|_2^2 d \Gamma(\bbeta_{\mu}, \bbeta_{\nu})\right)^{\frac{1}{2}},
\end{align*}
where $\|\cdot\|_2$ denotes the $\ell_2$ norm on $\mathbb{R}^d$ and the pair of random variables $(\bbeta_{\mu}, \bbeta_{\nu})\in \R^d\times\R^d$ is a coupling with the marginals following $\mathcal{L}(\bbeta_{\mu})=\mu$ and $\mathcal{L}(\bbeta_{\nu})=\nu$. $\mathcal{L}(\cdot)$ denotes a distribution of a random variable.

The following result %is a restatement of Lemma \ref{contraction_main} in the main paper. It 
provides a crucial contraction property based on distributed clients with infrequent synchronizations. 
\begin{lemma}[Dominated contraction property, restatement of Lemma \ref{contraction_main}]
\label{contraction}
Assume assumptions \ref{def:smooth} and \ref{def:strong_convex} hold. For any learning rate $\eta \in (0, \frac{1}{L+m}]$, any $\{\theta^c\}_{c=1}^N, \{\beta^c\}_{c=1}^N\in\mathbb{R}^d$, 
we have
\begin{align*}
\small
    &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2\leq (1-\eta m) \cdot \|\beta-\theta \|_2^2+4\eta L\sum_{c=1}^N p_c \cdot ( \| \beta^c-\beta \|_2^2 + \|\theta^c-\theta \|_2^2 ).
\end{align*}
\end{lemma}
where $\beta=\sum_{c=1}^N p_c \beta^c$, $\theta=\sum_{c=1}^N p_c \theta^c$, $\nabla f(\theta)=\sum_{c=1}^N p_c \nabla f^c(\theta^c)$, and $\nabla f(\beta)=\sum_{c=1}^N p_c \nabla f^c(\beta^c)$. We postpone the proof to Section \ref{DCP}. The above result implies that as long as the local parameters $\theta^c,\beta^c$ and the global parameters $\theta,\beta$ do not differ from each other too much, we can guarantee the desired convergence. 

The following result ensures a bounded gap between $\bar\theta^c_{s}$ and $\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor}$ in $\ell_2$ norm for any $s\geq 0$ and $c\in[N]$. We postpone the proof of Lemma~\ref{lem:discretization} into Section~\ref{dis_eroor}.


\begin{lemma}[Discretization error]\label{lem:discretization}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any $s\geq 0$, any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, the iterates of $(\bar \theta_s)$ based on the continuous dynamics of Eq.\eqref{continuous_dynamics} satisfy the following estimate
\begin{align*}
    \E{ \big\| \bar\theta^c_{s} - \bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} \leq 8\eta^2 d\kappa\bigg(\frac{\kappa\gamma^2}{d}+L\tau\bigg)+16\eta d\tau/p_c.
\end{align*}
\end{lemma}

The following result  %is restatement of Lemma \ref{divergence_main}. It
shows that given a finite number of local steps $K$, the divergence between $\theta^c$ in local client and $\theta$ in the center is bounded in $\ell_2$ norm. Notably, since the non-differentiable Brownian motion leads to a lower order term $O(\eta)$ instead of $O(\eta^2)$ in $\ell_2$ norm, a na\"{i}ve proof may lead to a crude upper bound.  We delay the proof of Lemma~\ref{divergence} to Section~\ref{bounded_divergence}.
\begin{lemma}[Bounded divergence, restatement of Lemma \ref{divergence_main}]\label{divergence}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ upper bound of the divergence between local clients and the center as follows
\begin{align*}
    \sum_{c=1}^N p_c\E{\|\theta_k^c-\theta_k \|_2^2}&\leq 112(K-1)^2\eta^2 d L^2 H_{\rho} +8(K-1)\eta d \tau(\rho^2 + N(1-\rho^2)),\notag
\end{align*}
where $H_{\rho}, \kappa$ and $\gamma^2$ are defined as Definition~\ref{def:H_kappa_gamma}. % $H_{\rho}=14 \kappa^2 \cdot  (m^2 \mathcal{D}^2+m\tau +\frac{\gamma^2}{d}+\sigma^2 )$, $\kappa=L/m$, and  $\gamma^2:=\max_{c\in[N]}\| \nabla f^c(\theta_*) \|_2^2$.
\end{lemma}




The following presents a standard result for bounding the gap between $\nabla f(\theta)$ and $\nabla \tilde f(\theta)$. We delay the proof of Lemma~\ref{lem:total_variance} to Section~\ref{sec:bouding_contraction_discretization_divergence}.

\begin{lemma}[Bounded variance] 
\label{lem:total_variance}
Given assumption \ref{def:variance}, we have 
\begin{equation*}
    \E{ \|\nabla f(\theta)-\nabla \tilde f(\theta) \|_2^2}\leq d \cdot \sigma^2 ,\qquad \forall \ \theta\in\R^d.
\end{equation*}
\end{lemma}

% \begin{lemma}[To be proved] 
% \label{lem:gradient_bound}
% Given a client index $c\in[N]$ and assumption XXX\Wei{will fix later}, we have 
% \begin{equation*}
%     \E{\lrn{\nabla f(\bar\theta_s^c)-\nabla f(\bar\theta^c_{k\eta})}_2^2}\leq C \eta^2
% \end{equation*}
% \end{lemma}

Having all the preliminary results ready, now we present a crucial lemma for proving the convergence of all the algorithms.

\begin{lemma}[One step update, restatement of Lemma~\ref{one_step_Dalalyan_main}]\label{one_step_Dalalyan}

Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Consider Algorithm \ref{alg:alg_main_text_independent_noise} with independently injected noise $\rho=0$, any learning rate $\eta \in (0 , \frac{1}{2L})$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, where $\theta_*$ is the global minimum for the function $f$. Then
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  \bigg(1-\frac{\eta m}{2}\bigg) \cdot W^2_2(\mu_{k}, \pi)+ 400\eta^2 d L^2 H_0((K-1)^2+\kappa),
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $H_0, \kappa$ and $\gamma^2$ are defined as Definition~\ref{def:H_kappa_gamma}. 
\end{lemma}
% \Wei{does this $\frac{\eta m}{2}$ make sense? Is there a tradition such that a rate of $\eta m$ is required?}


\begin{proof}
%[Proof of Lemma \ref{one_step_Dalalyan} (equiv. to Lemma \textcolor{red}{5.6} in the main paper)]


The solution of the continuous-time process Eq.\eqref{continuous_dynamics} follows that
\begin{align}
\label{solution_continuous_dynamics}
    \bar\theta_t=\bar\theta_0 -\int_0^t \nabla f(\bar\theta_s)\d s + \sqrt{2\tau}\cdot\overline{W}_t, \qquad \forall t\geq 0.
\end{align}


Set $t\rightarrow(k+1)\eta$ and $\bar\theta_0\rightarrow\bar\theta_{k\eta}$ for Eq.\eqref{solution_continuous_dynamics} and consider a synchronous coupling such that $\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta}:=\sqrt{\eta}\xi_k$
\begin{align}
\label{continuous_one_step}
    \bar\theta_{(k+1)\eta}&=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\tau} (\overline{W}_{(k+1)\eta}-\overline{W}_{k\eta})\notag\\
    &=\bar\theta_{k\eta}-\int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s + \sqrt{2\tau\eta}\xi_k.
\end{align}

We first denote $\zeta_k:=\nabla \tilde f(\theta_k)-\nabla f(\theta_k)$. Subtracting Eq.\eqref{fed_avg_langevin_dynamics} from Eq.\eqref{continuous_one_step} yields that
\begin{align}
\label{decompose_full}
    &\quad \bar\theta_{(k+1)\eta}-\theta_{k+1}\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}+\eta \nabla \tilde f(\theta_k) - \int_{k\eta}^{(k+1)\eta}\nabla f(\bar\theta_s)\d s\notag\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla\tilde f(\theta_k)\bigg) - \int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s\\
    &=\bar\theta_{k\eta}-\theta_{k}-\eta \bigg(\underbrace{\nabla f(\theta_k+\bar\theta_{k\eta}-\theta_{k})-\nabla f(\theta_k)}_{:=X_k}\bigg)- \underbrace{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_{:=Y_k} +\eta\zeta_k.\notag
\end{align}

Taking square and expectation on both sides, we have
\begin{align}
\label{reestimate}
    &\quad\ \E{\|\bar\theta_{(k+1)\eta}-\theta_{k+1} \|_2^2}\notag\\
    &=\E{\| \bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k \|_2^2}+\E{\| \eta\zeta_k \|_2^2}+2\eta\underbrace{\E{\langle\bar\theta_{k\eta}-\theta_{k}-\eta X_k-Y_k,  \zeta_k\rangle}}_{\E{\zeta_k}=0 \text{ and mutual independence}}\notag\\
    &\leq (1+q) \cdot \E{\| \bar\theta_{k\eta}-\theta_{k}-\eta X_k \|_2^2}+ ( 1 + 1 / q ) \cdot \E{\|Y_k \|_2^2}+\E{ \| \eta\zeta_k \|_2^2}\notag\\
    &\leq (1+q) \cdot \big( (1-\eta m) \cdot \E{\| \bar\theta_{k\eta}-\theta_k \|_2^2}+4\eta L\sum_{c=1}^N p_c \cdot \left(\E { \| \bar\theta_{k\eta}^c-\bar\theta_{k\eta} \|_2^2}+\E{\| \theta_k^c-\theta_k \|_2^2}\right) \big)\notag\\
    &\quad\quad + ( 1 + 1/q ) \cdot \E{ \| Y_k \|_2^2 } + \eta^2\sigma^2 d\notag\\
    &\leq (1+q) \cdot \bigg(\underbrace{\left(1-\eta m\right)}_{\phi}\E{ \| \bar\theta_{k\eta}-\theta_k \|_2^2}+448\eta^3 d(K-1)^2 L^3 H_0+32(K-1)\eta^2 dL \tau N\bigg)\notag\\
    &\qquad\qquad+ (1+ 1 / q ) \cdot \E{ \| Y_k \|_2^2}+\eta^2\sigma^2  d,
\end{align}
where the first inequality follows by the AM-GM inequality for any $q>0$, the second inequality follows by Lemma \ref{contraction} and Assumption \ref{def:variance}. The third inequality follows by Lemma \ref{divergence} with $\rho=0$; moreover, the continuous-time process conducts synchronization at any time step, hence $\bar\theta^c_{k\eta}=\bar\theta_{k\eta}$. Since the learning rate follows $\frac{1}{2L}\leq \frac{1}{m+L}\leq \frac{2}{m}$, the requirement of the learning rate for Lemma \ref{contraction} and Lemma \ref{divergence} is clearly satisfied.

Recall that $\phi=1-\eta m$, we get $\frac{1+\phi}{2}=1-\frac{1}{2}\eta m$. Choose $q=\frac{1+\phi}{2\phi}-1$ so that $(1+q)\phi=\frac{(1+\phi)}{2}=1-\frac{1}{2}\eta m$. In addition, we have $1+\frac{1}{q}= \frac{1+q}{q}=\frac{1+\phi}{1-\phi}\leq \frac{2}{\eta m}$.  It follows that
\begin{align}
    \label{nice_inequality_v0}
    (1+q) \cdot (1-\eta m)\leq 1-\frac{1}{2}\eta m,  \quad  1+q\leq \frac{1-\frac{1}{2}\eta m}{1-\eta m}\leq 1.5, \quad (1 + 1/q )\leq \frac{2}{m\eta},
\end{align}
where the second inequality holds because $\eta\leq \frac{1}{2L}\leq \frac{1}{2m}$.



For the term $\E{ \| Y_k \|_2^2 }$ in Eq.\eqref{reestimate}, we have the following estimate
\begin{align}
\label{y_estimate}
    \E{ \| Y_k \|_2^2}&=\E{\lrn{\int_{k\eta}^{(k+1)\eta}\bigg(\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})\bigg)\d s}_2^2}\notag\\
    &\leq\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\nabla f(\bar\theta_s)-\nabla f(\bar\theta_{k\eta})}_2^2}  \d s\notag\\
    &=\eta\int_{k\eta}^{(k+1)\eta}\E{\lrn{\sum_{c=1}^N p_c \bigg(\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})\bigg)}_2^2}  \d s\notag\\
    &\leq \eta\int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c \cdot \E{\lrn{\nabla f^c(\bar\theta_s^c)-\nabla f^c(\bar\theta^c_{k\eta})}_2^2}  \d s\notag\\
    &\leq \eta L^2 \int_{k\eta}^{(k+1)\eta}\sum_{c=1}^N p_c \cdot \E{\lrn{\bar\theta_s^c-\bar\theta^c_{k\eta}}_2^2}  \d s\notag\\
    &\leq \eta L^2  \int_{k\eta}^{(k+1)\eta}  \left(8\eta^2 d\kappa\bigg(\frac{\kappa\gamma^2}{d}+L\tau\bigg)+16\eta d\tau N\right) \d s\notag\\
    &\leq 8\eta^4 d L^4 H_0+16\eta^3 L^2 d\tau N,
\end{align}
where the first inequality follows by H\"{o}lder's inequality, the second inequality follows by Jensen's inequality, the third inequality follows by Assumption \ref{def:smooth}, and the fourth inequality follows by Lemma \ref{lem:discretization} together with $\sum_{c=1}^N p_c\cdot (1/p_c)=N$. The last inequality holds since $\frac{\kappa }{d}\gamma^2 + L\tau\leq L m H_0$ and $\kappa=L/m$.

Plugging Eq.\eqref{nice_inequality_v0} and Eq.\eqref{y_estimate} into Eq.\eqref{reestimate}, we have
\begin{align*}
    \E{\|\bar\theta_{(k+1)\eta}-\theta_{k+1} \|^2_2}&\leq  (1-\frac{\eta m}{2} ) \cdot \E{\|\bar\theta_{k\eta}-\theta_k\|_2^2}\notag\\
    &\quad\quad+ 672\eta^3 d(K-1)^2 L^3 H_0+ 48\eta^2 d(K-1)L \tau N\notag\\
    &\quad\quad+16\eta^3 d L^3\kappa H_0+32\eta^2 d\frac{L^2}{m} \tau N+\eta^2 \sigma^2 d.
\end{align*}

Choose the specific Langevin diffusion $\bar\theta$ in stationary regime, we have $W_2^2(\mu_k,\pi)=\E{\|\bar\theta_{k\eta}-\theta_k \|_2^2}$ and  $W_2^2(\mu_{k+1},\pi)\leq\E{\| \bar\theta_{(k+1)\eta}-\theta_{k+1} \|_2^2}$. Arranging the terms, we have
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2}) \cdot W^2_2(\mu_{k}, \pi)+ 400\eta^2 d L^2 H_0((K-1)^2+\kappa),
\end{align*}
where $\eta\leq \frac{1}{2L}$, $\kappa\geq 1$, $m\tau\leq L\tau\leq L\tau N\leq L \max_{c\in[N]}T_{c,0}\leq Lm H_0$, and $\sigma^2\leq L^2 H_0$ are applied to the result.

\end{proof}


\subsection{Convergence via independent noises}


\begin{theorem}[Restatement of Theorem~\ref{main_paper_theorem}]\label{main_theorem} 
Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Consider Algorithm \ref{alg:alg_main_text_independent_noise} with a fixed learning rate $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)+30\kappa\sqrt{{\eta} m d} \cdot \sqrt{((K-1)^2+\kappa)H_0} .\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $H_0, \kappa$ and $\gamma^2$ are defined as Definition~\ref{def:H_kappa_gamma}.
\end{theorem}


\begin{proof}
%[Proof of Theorem \ref{main_theorem} (equiv. to Theorem \textcolor{red}{5.7} in the main paper)]
Iteratively applying Lemma \ref{one_step_Dalalyan} and arranging terms, we have that
\begin{align}\label{one_step_squared}
    W_2^2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+  \frac{1-(1-\frac{\eta m}{2})^k}{1-(1-\frac{\eta m}{2})}\bigg(400\eta^2 d L^2 H_0((K-1)^2+\kappa)\bigg)\notag\\
    &\leq \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+  \frac{2}{\eta m}\bigg(400\eta^2 d L^2 H_0((K-1)^2+\kappa)\bigg)\notag\\
    &\leq \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+ 800\kappa^2 \eta m d ((K-1)^2+\kappa) H_{0},
\end{align}
where $\kappa=\frac{L}{m}$. By Lemma \ref{lem:W2_init_bound} and the initialization condition $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have that
\begin{align*}
W_2(\mu_0, \pi)\leq \sqrt{2d}(\mathcal{D} +  \sqrt{\tau/m} ).
\end{align*}

Applying the inequality $(1-\frac{\eta m}{2})\leq (1-\frac{\eta m}{4})^2$  completes the proof. 
\end{proof}

\textbf{Discussions}


\textbf{Optimal choice of $K$.} To ensure the algorithm to achieve the $\epsilon$ precision based on the total number of steps $T_{\epsilon}$ and the learning rate $\eta$, we can set
\begin{align*}
    &30\kappa\sqrt{{\eta}m d} \cdot \bigg(\sqrt{((K-1)^2+\kappa)H_0} \bigg)\leq \frac{\epsilon}{2}\notag\\
    &e^{-\frac{\eta m}{4} T_{\epsilon}} \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)\leq \frac{\epsilon}{2}.
\end{align*}
This directly leads to
\begin{align*}
    \eta m\leq \min\bigg\{\frac{m}{2L}, O\bigg(\frac{\epsilon^2}{d\kappa^2 {((K-1)^2+\kappa)H_0}}\bigg)\bigg\},\quad T_{\epsilon}\geq \Omega\bigg(\frac{\log\big(\frac{d}{\epsilon}\big)}{m\eta}\bigg).
\end{align*}

Plugging into the upper bound of $\eta$, it implies that to reach the precision level $\epsilon$, it suffices to set
\begin{align}\label{def_T}
    T_{\epsilon}=\Omega\bigg(\frac{d\kappa^2 {((K-1)^2+\kappa)H_0}}{\epsilon^2}\cdot \log\bigg(\frac{d}{\epsilon}\bigg)\bigg).
\end{align}
Since $H_0 = \Omega(\mathcal{D}^2+\frac{\tau}{m})$, we observe that the number of communication rounds is around the order
\begin{align*}
    \frac{T_{\epsilon}}{K}=\Omega\bigg( K+\frac{\kappa}{K}\bigg),
\end{align*}
where the value of $\frac{T_{\epsilon}}{K}$ first decreases and then increases with respect to $K$, indicating that setting $K$ either too large or too small may lead to high communication costs and hurt the performance. Ideally, $K$ should be selected in the scale of $\Omega(\sqrt{\kappa})$. Combining the definition of $T_{\epsilon}$ in Eq.\eqref{def_T}, this suggests an interesting result that the optimal $K$ should be in the order of $O(\sqrt{T_{\epsilon}})$. Similar results have been achieved by \cite{Stich19, lhy+20}.


\subsection{Convergence via varying learning rates}

\begin{theorem}[Restatement of Theorem~\ref{main_paper_theorem_decay}]\label{main_theorem_decay} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Consider Algorithm \ref{alg:alg_main_text_independent_noise} with an initialization satisfying $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and varying learning rate following
\begin{align*}
    \eta_{k}=\frac{1}{2L+(1/12)m k},\qquad k=0,1,2,\cdots.
\end{align*}
Then for any $k\geq 0$, we have
\begin{align*}
    W_2(\mu_{k}, \pi)\leq 45\kappa\sqrt{ ((K-1)^2+\kappa)H_0}\cdot\big(\eta_k m d\big)^{1/2}, \qquad \forall k \geq 0.
\end{align*}
\end{theorem}

\begin{proof}
%[Proof of Theorem \ref{main_theorem_decay} (equiv. to Theorem \textcolor{red}{5.8} in the main paper)]

We first denote 
\begin{align*}
    C_{\kappa}=30\kappa\sqrt{ ((K-1)^2+\kappa)H_0}.
\end{align*}
Next, we proceed to show the following inequality by the induction method
\begin{align}\label{induction}
    W_2(\mu_{k}, \pi)\leq 1.5C_{\kappa}\bigg(\frac{md}{2L+(1/12){m k}}\bigg)^{1/2}=1.5C_{\kappa}\big(\eta_k m d\big)^{1/2}, \qquad \forall k \geq 0,
\end{align}
where the decreasing learning rate follows that
\begin{align*}
    \eta_{k}=\frac{1}{2L+(1/12)m k}.
\end{align*}
(i) For the case of $k=0$, since 
\begin{align}\label{up_bd}
    C_{\kappa}&\geq 4\sqrt{\kappa} \sqrt{H_0}\geq 4\sqrt{\kappa}\sqrt{\mathcal{D}^2 + \frac{1}{m} \max_{c\in[N]} T_{c,0}}\geq 4\sqrt{\kappa/d} \bigg(\sqrt{d\mathcal{D}^2}+\sqrt{\frac{d}{m} \max_{c\in[N]} T_{c,0}}\bigg)\notag\\
    &\geq 4\sqrt{\kappa/d} W_2(\mu_0, \pi),
\end{align}
where the last inequality follows by Lemma \ref{lem:W2_init_bound} and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$. 

It is clear that $W_2(\mu_0, \pi)\leq \frac{1}{4}C_{\kappa} \sqrt{\frac{md}{L}}\leq 1.5C_{\kappa} \sqrt{\eta_0 m d}$ by Eq.\eqref{up_bd}.



(ii) Suppose now that Eq.\eqref{induction} holds for some $k\geq 0$; it follows by Lemma \ref{one_step_Dalalyan} that
\begin{align*}
    W_2^2(\mu_{k+1}, \pi)&\leq \big(1-\frac{\eta_k m}{2}\big) \cdot W_2^2(\mu_{k}, \pi)+400\eta_k^2 d L^2 H_0((K-1)^2+\kappa)\notag\\
    &\leq \big(1-\frac{\eta_k m}{2}\big) \cdot W_2^2(\mu_{k}, \pi)+ \frac{\eta_k^2m^2}{2}C_{\kappa}^2 d\notag\\
    &\leq  \big(1-\frac{\eta_k m}{2}\big) \cdot  2.25  C^2_{\kappa} \eta_k m d+ \frac{\eta_k m}{3}2.25C^2_{\kappa}\eta_k m d\notag\\
    &\leq  \big(1-\frac{\eta_k m}{6}\big) \cdot 2.25   C^2_{\kappa}\eta_k m d.\notag
\end{align*}


Since $\big(1-\frac{\eta_k m}{6}\big)\leq \big(1-\frac{\eta_k m}{12}\big)^2$, we have
\begin{align*}
    W_2(\mu_{k+1}, \pi)&\leq  \big(1-\frac{\eta_k m}{12}\big) \cdot 1.5   C_{\kappa}\big(\eta_k m d\big)^{1/2}.\notag
\end{align*}

To prove $W_2(\mu_{k+1}, \pi)\leq 1.5C_{\kappa}\big(\eta_{k+1} md\big)^{1/2}$, it suffices to show $\big(1-\frac{\eta_k m}{12}\big) \eta_k^{1/2}\leq \eta_{k+1}^{1/2}$, which is detailed as follows
\begin{align*}
    \big(1-\frac{\eta_k m}{12}\big) \eta_k^{1/2}&=\frac{\sqrt{12}(24L+mk-m)}{(24L+mk)^{3/2}}\notag\\
    &\leq \frac{\sqrt{12}(24L+mk-m)^{1/2}}{24L+mk}\notag\\
    &\leq \frac{\sqrt{12}}{(24L+m(k+1))^{1/2}}=\eta_{k+1}^{1/2},
\end{align*}
where the last inequality follows since 
\begin{align*}
    (24L+mk-m)(24L+m k+m)\leq (24L+mk)^2.
\end{align*}
\end{proof}


The above result implies that to achieve the precision $\epsilon$, we require
\begin{align*}
     W_2(\mu_{k}, \pi)\leq 1.5C_{\kappa}\bigg(\frac{md}{2L+(1/12){mk}}\bigg)^{1/2}\leq \epsilon.
\end{align*}


This means that we only require $k={\Omega}(\frac{d}{\epsilon^2})$ to achieve the precision $\epsilon$. By contrast, the fixed learning rate requires $T_{\epsilon}=\Omega\bigg(\frac{d}{\epsilon^2}\cdot \log\big( {d}/{\epsilon}\big)\bigg)$, which is much slower than the algorithm with varying learning rate by $O\big(\log \big({d}/{\epsilon}\big)\big)$ times.










\subsection{Privacy-accuracy trade-off via correlated noises}

Note that Algorithm \ref{alg:alg_main_text_independent_noise} requires all the local clients to generate the independent noise $\xi^c_k$. Such a mechanism enjoys the convenience of the implementation and yields a potential to protect the privacy of data and alleviates the security issue. However, the scale of noises is maximized and inevitably slows down the convergence. For extensions, it can be naturally generalized to correlated noise based on a hyperparameter, namely the correlation coefficient $\rho$ between different clients. Replacing Eq.\eqref{local_client} with 
\begin{equation}\label{local_client_diff_seeds}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot{\xi}_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation}
where $\dot{\xi}_k$ is a $d$-dimensional standard Gaussian vector shared by all the clients at iteration $k$, $\xi_k^c$ is a unique $d$-dimensional Gaussian vector generated by client $c\in [N]$ only. Moreover, $\dot\xi_k$ is independent of $\xi_k^c$ for any $c\in[N]$. Following the same synchronization step based on Eq.\eqref{synchronization}, we have
\begin{equation}
\label{fed_avg_langevin_dynamics_pp}
\theta_{k+1}=\theta_k-\eta \nabla \tilde f(\theta_k)+\sqrt{2\eta\tau}\xi_k,
\end{equation}
where $\xi_k=\rho \dot\xi_k + \sqrt{1-\rho^2}\sum_{c=1}^N \sqrt{p_c}\xi_k^c$. Since the variance of i.i.d variables is additive, it is clear that $\xi_k$ follows the standard $d$-dimensional Gaussian distribution. The inclusion of the correlated noise implicitly reduces the temperature and naturally yields a trade-off between federation and accuracy. We refer to the algorithm with correlated noise as the generalized Federated Averaging Langevin dynamics (gFA-LD) and present it in Algorithm \ref{alg:alg_main_text_different_seeds}.

Since the inclusion of correlated noise doesn't affect the formulation of Eq.\eqref{fed_avg_langevin_dynamics_pp}, the algorithm property remains the same except that the scale of the temperature $\tau$ and federation are changed. Based on a target correlation coefficient $\rho\geq 0$, Eq.\eqref{local_client_diff_seeds} is equivalent to applying a temperature $T_{c,\rho}=\tau(\rho^2+(1-\rho^2)/p_c)$. In particular, setting $\rho=0$ leads to $T_{c, 0}=\tau/p_c$, which exactly recovers Algorithm \ref{alg:alg_main_text_independent_noise}; however, setting $\rho=1$ leads to $T_{c, 1}=\tau$, where the injected noise in local clients is reduced by $1/p_c$ times. Now we adjust the analysis as follows
\begin{theorem}[Restatement of Theorem \ref{correlated_noise_main}]\label{correlated_noise_supp} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold.  Consider Algorithm \ref{alg:alg_main_text_different_seeds} with a correlation coefficient $\rho\in[0, 1]$, a fixed learning rate $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)+30\kappa\sqrt{{\eta} m d} \cdot \sqrt{((K-1)^2+\kappa)H_{\rho}},\notag
\end{align*}
where $\mu_k$ denotes the probability measure of $\theta_k$, $H_{\rho}, \kappa$ and $\gamma^2$ are defined as Definition~\ref{def:H_kappa_gamma}.
\end{theorem}

\begin{proof}
The proof follows the same techniques as in Theorem \ref{main_theorem} except that $H_0$ is generalized to $H_{\rho}$ to accommodate to the changes of the \emph{injected noise}. The details are omitted.
\end{proof}


\begin{algorithm*}[h]\caption{Hybrid federated averaging Langevin dynamics algorithm. Denote by $\theta_k^c$ the model parameter in the $c$-th client at the $k$-th step. Denote the immediate result of one step SGLD update from $\theta_k^c$ by $\beta_k^c$. $\xi_k^c$ is an independent standard $d$-dimensional Gaussian vector at iteration $k$ for each client $c\in[N]$ and $\dot{\xi}_k$ is a $d$-dimensional standard Gaussian vector shared by all the clients. $\rho$ denotes the correlation coefficient of the injected noises. A global synchronization is conducted every $K$ steps. This is a complete version of Algorithm~\ref{alg:alg_main_paper_text_different_seeds}.}\label{alg:alg_main_text_different_seeds}
% \begin{algorithmic}[1]
\begin{equation*}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot\xi_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation*}
\begin{equation*}  
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad \text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c=1}^N p_c \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation*} 
% \end{algorithmic}
\end{algorithm*}



\section{Partial device participation}\label{sec:partial_device_participation}

Full device participation enjoys appealing convergence properties. However, it suffers from the straggler's effect in real-world applications, where the communication is limited by the slowest device. Partial device participation handles this issue by only allowing a small portion of devices in each communication and greatly increases the communication efficiency %fault-tolerant capability 
in a federated network. 


\subsection{Unbiased sampling schemes}
\label{unbiased_sampling_schems_appendix}
The first device-sampling scheme \text{I} \cite{LS20} selects a total of $S$ devices, where the $c$-th device is selected with a probability $p_c$. The first theoretical justification for convex optimization has been proposed by \cite{lhy+20}. %However, to our best knowledge, the convergence analysis of sampling algorithm was missing in the federated-learning literature. 


\paragraph{(Scheme \text{I}: with replacement).}
Assume $\mathcal{S}_k=\{n_1, n_2, \cdots, n_S\}$, where $n_j\in [N]$ is a random number that takes a value of $c$ with a probability $p_c$ for any $j\in\{1,2,\cdots, S\}$. The synchronization step follows that $\theta_{k}=\frac{1}{S}\sum_{c\in \mathcal{S}_k}\theta_{k}^c$.

Another strategy is to uniformly select $S$ devices without replacement. We follow  \cite{lhy+20} and assume $S$ indices are selected uniformly without replacement and the synchronization step is the same as before. In addition, the convergence also requires an additional assumption on balanced data \cite{lhy+20}. 
\paragraph{(Scheme \text{II}: without replacement).}  Assume $\mathcal{S}_k=\{n_1, n_2, \cdots, n_S\}$, where the indices $n_j\in [N]$ are drawn uniformly without replacement, so that each $n_j$ takes a value of $c$ with a probability $\frac{1}{N}$ for any $j\in\{1,2,\cdots, S\}$. Assume the data is balanced such that $p_1=\cdots=p_N=\frac{1}{N}$. The synchronization step follows that $\theta_{k}=\frac{N}{S}\sum_{c\in \mathcal{S}_k} p_c\theta_{k}^c=\frac{1}{S}\sum_{c\in \mathcal{S}_k} \theta_{k}^c$.


% \paragraph{Notation: }


\begin{algorithm*}[h]\caption{Hybrid federated averaging Langevin dynamics algorithm with partial device participation. $\xi_k^c$ is the independent Gaussian vector proposed by each client $c\in[N]$ and $\dot{\xi}_k$ is a unique Gaussian vector shared by all the clients. $\rho$ denotes the correlation coefficient. A global synchronization is conducted every $K$ steps. $\mathcal{S}_k$ is a subset that contains $S$ indices according to a device-sampling rule based on scheme \text{I} or \text{II}. This is a complete version of Algorithm~\ref{alg:alg_main_text_partial_main}.}\label{alg:alg_main_text_partial}
% \begin{algorithmic}[1]
\begin{equation*}
    \beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot\xi_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation*}
\begin{equation*}  
\theta_{k+1}^c=\left\{  
             \begin{array}{lr}  
             \beta_{k+1}^c \qquad\qquad\qquad\quad\text{if } k+1 \text{ mod } K\neq 0 \\  
              & \\
             \sum_{c\in \mathcal{S}_{k+1}} \frac{1}{S} \beta_{k+1}^c \ \qquad \text{if } k+1 \text{ mod } K=0.
             \end{array}  
\right.  
\end{equation*} 
% \end{algorithmic}
\end{algorithm*}


%The goal of this section is to prove Lemma~\ref{unbiased_scheme}.
\begin{lemma}[Unbiased sampling scheme]\label{unbiased_scheme}
For any $k \text{ mod } K=0$ based on scheme \text{I} or \text{II}, we have
\begin{align*}
    \E{\theta_k}=\E{\frac{1}{S}\sum_{c\in \mathcal{S}_k} \theta_k^c}=\beta_k:=\sum_{c=1}^N p_c \beta_k^c.
\end{align*}
\end{lemma}

\begin{proof}

According to the definition of scheme \text{I} or \text{II}, we have $\theta_{k}=\frac{1}{S}\sum_{c\in \mathcal{S}_k} \theta_{k}^c$. In what follows, $\E{\theta_k}=\frac{1}{S}\E{\sum_{c\in \mathcal{S}_k} \theta_{k}^c}=\frac{1}{S}\sum_{c_0\in\mathcal{S}_k}\sum_{c=1}^N p_c \beta_k^c=\sum_{c=1}^N p_c \beta_k^c$, where $p_1=p_2=\cdots=p_N$ for scheme \text{II} in particular.
\end{proof}


\subsection{Bounded divergence based on partial device}

\begin{lemma}[Bounded divergence based on partial device]\label{divergence_partial}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold.  Consider Algorithm \ref{alg:alg_main_text_partial} with a correlation coefficient $\rho\in[0, 1]$, any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the following results

For Scheme \text{I}, the divergence between $\theta_k$ and $\beta_k$ is upper bounded by
\begin{align*}
    \E{\|\beta_k-\theta_k \|_2^2}&\leq \frac{112}{S}K^2\eta^2 dL^2H_{\rho} +\frac{8}{S}K\eta d \tau(\rho^2+N(1-\rho^2)).\notag
\end{align*}

For Scheme \text{II}, assuming the data is balanced such that $p_1=\cdots=p_N=\frac{1}{N}$, the divergence between $\theta_k$ and $\beta_k$ is upper bounded by
\begin{align*}
    \E{\|\beta_k-\theta_k \|_2^2}&\leq \frac{N-S}{S(N-1)} \bigg(112K^2\eta^2 dL^2H_{\rho} +8K\eta d \tau(\rho^2+N(1-\rho^2))\bigg).\notag
\end{align*}
where $H_{\rho}, \kappa$ and $\gamma^2$ are defined as Definition~\ref{def:H_kappa_gamma}. % $H_{\rho}=14 \kappa^2 \cdot  (m^2 \mathcal{D}^2+m\tau +\frac{\gamma^2}{d}+\sigma^2 )$, $\kappa=L/m$, and  $\gamma^2:=\max_{c\in[N]}\| \nabla f^c(\theta_*) \|_2^2$.
\end{lemma}

\begin{proof} We prove the bounded divergence for the two schemes, respectively.

For \textbf{scheme \text{I}} with replacement, $\theta_{k}=\sum_{c\in \mathcal{S}_k} \frac{1}{S} \beta_{k}^c$ for a subset of indices $\mathcal{S}_k$. Taking expectation with respect to $\mathcal{S}_{k}$,
we have
\begin{align}\label{scheme_1}
    \E{\lrn{\theta_{k}-\beta_{k}}_2^2}=\frac{1}{S^2}\sum_{i=1}^S\E{\lrn{\beta_{k}^{n_i}-\beta_{k}}_2^2}=\frac{1}{S}\sum_{c=1}^N p_c \lrn{\beta_{k}^c-\beta_{k}}_2^2,
\end{align}
where the first equality follows by the independence and unbiasedness of $\theta_{k}^{n_i}$ for any $i\in [S]$. 

To further upper bound Eq.\eqref{scheme_1}, we follow the same technique as in Lemma \ref{divergence}. Since $k\text{ mod } K=0$, $k_0=k-K$ is also a communication time, which yields the same $\theta_{k_0}^{c}$ for any $c\in[N]$. In what follows,
\begin{align}\label{scheme_1_step2}
    \sum_{c=1}^N p_c\lrn{\beta_{k}^c-\beta_{k}}_2^2&=\sum_{c=1}^N p_c \lrn{\beta_k^c-\theta_{k_0}-(\beta_k-\theta_{k_0})}_2^2\notag\\
&\leq \sum_{c=1}^N p_c \lrn{\beta_k^c-\theta_{k_0}}_2^2,
\end{align}
where the last inequality follows by $\beta_{k}=\sum_{c=1}^N p_c \beta_{k}^c$ and $\E{\lrn{x-\E{x}}_2^2}\leq \E{\lrn{x}_2^2}$. Combining Eq.\eqref{scheme_1} and Eq.\eqref{scheme_1_step2}, we have
\begin{align*}
    \E{\lrn{\theta_{k}-\beta_{k}}_2^2}&\leq \frac{1}{S}\sum_{c=1}^N p_c \lrn{\beta_k^c-\theta_{k_0}}_2^2\notag\\
    &\leq \frac{1}{S}\sum_{c=1}^N p_c \lrn{\beta_k^c-\theta^c_{k_0}}_2^2\notag\\
    &\leq \frac{1}{S}\sum_{c=1}^N p_c \E{\sum_{k=k_0}^{k-1} 2K\eta^2\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2 + 4K\eta d \tau\big(\rho^2+(1-\rho^2)/p_c\big)}\notag\\
&\leq \frac{1}{S}\sum_{c=1}^N p_c \left(\sum_{k=k_0}^{k-1} 2K\eta^2\E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}+4K\eta d \tau\big(\rho^2+(1-\rho^2)/p_c\big)\right)\notag\\
&\leq \frac{28}{S}K^2\eta^2 dL^2 H_{\rho} +\frac{4}{S}K\eta d \tau(\rho^2+N(1-\rho^2)),
% &\leq \frac{112}{S}K^2\eta_{k}^2 dL^2 H_{\rho} +\frac{8}{S}K\eta_{k} d \tau(\rho^2+N(1-\rho^2))\notag\\
\end{align*}
where the last inequality follows a similar argument as in Lemma \ref{divergence}.

For \textbf{scheme \text{II}}, given $p_1=p_2=\cdots=p_N=\frac{1}{N}$, we have $\theta_{k}=\frac{1}{S}\sum_{c\in \mathcal{S}_k} \beta_{k}^{c}$, which leads to
\begin{align*}
    &\quad\E{\lrn{\theta_{k}-\beta_{k}}_2^2}=\E{\lrn{\frac{1}{S}\sum_{c\in \mathcal{S}_k} \beta_{k}^{c}-\beta_{k}}_2^2}=\frac{1}{S^2}\E{\lrn{\sum_{c=1}^N \mathbb{I}_{c\in \mathcal{S}_k}(\beta_{k}^c-\beta_{k})}_2^2},
\end{align*}
where $\mathbb{I}_{A}$ is an indicator function that equals 1 if the event $A$ happens and 0 otherwise.

Plugging the facts that $\mathbb{P}(c\in \mathcal{S}_{k})=\frac{S}{N}$ and $\mathbb{P}(c_1,c_2\in \mathcal{S}_{k})=\frac{S(S-1)}{N(N-1)}$ for any $c_1\neq c_2\in [N]$ into the above equation, we have
\begin{align*}
    &\quad\E{\lrn{\theta_{k}-\beta_{k}}_2^2}\notag\\
    &=\frac{1}{S^2}\bigg[\sum_{c\in [N]} \mathbb{P}(c\in \mathcal{S}_{k}) \lrn{\beta_{k}^c-\beta_{k}}_2^2+\sum_{c_1\neq c_2} \mathbb{P}(c_1,c_2\in \mathcal{S}_{k})\langle \beta_{k}^{c_1}-\beta_{k}, \beta_{k}^{c_2}-\beta_{k} \rangle \bigg]\notag\\
    &=\frac{1}{SN}\sum_{c=1}^N\lrn{\beta_{k}^c-\beta_{k}}_2^2+\sum_{c_1\neq c_2}\frac{S-1}{SN(N-1)} \langle \beta_{k}^{c_1}-\beta_{k}, \beta_{k}^{c_2}-\beta_{k} \rangle\notag\\
    &=\frac{1-\frac{S}{N}}{S(N-1)}\sum_{c=1}^N\lrn{\beta_{k}^c-\beta_{k}}_2^2,
\end{align*}
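where the last equality holds since $\sum_{c\in[N]}\lrn{\beta_{k}^c-\beta_{k}}_2^2 +\sum_{c_1\neq c_2}\langle \beta_{k}^{c_1}-\beta_{k},\beta_{k}^{c_2}-\beta_{k}\rangle=\lrn{\sum_{c\in[N]}(\beta_{k}^c-\beta_{k})}_2^2=N^2\lrn{\beta_{k}-\beta_{k}}_2^2=0$, which uses $\beta_{k}=\frac{1}{N}\sum_{c\in[N]}\beta_{k}^c$ under balanced data.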

Eventually, we have
\begin{align*}
    \E{\lrn{\theta_{k}-\beta_{k}}_2^2}&=\frac{N-S}{S(N-1)} \E{\frac{1}{N} \sum_{c=1}^N\lrn{\beta_{k}^c-\beta_{k}}_2^2}\notag\\
    &\leq\frac{N-S}{S(N-1)} \E{\frac{1}{N} \sum_{c=1}^N\lrn{\beta_{k}^c-\theta_{k_0}}_2^2}\notag\\
    &\leq \frac{N-S}{S(N-1)} \bigg(28 K^2\eta^2 dL^2 H_{\rho} +4K\eta d \tau\big(\rho^2+N(1-\rho^2)\big)\bigg),
\end{align*}
where the first inequality follows a similar argument as in Eq.\eqref{scheme_1_step2} and the last inequality follows by Lemma \ref{divergence}.


\end{proof}


\subsection{Convergence via partial device participation}


\begin{theorem}[Restatement of Theorem~\ref{thm:partial_II}]\label{theorem_partial} Assume assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. Consider Algorithm \ref{alg:alg_main_text_partial} with a correlation coefficient $\rho\in[0, 1]$, a fixed learning rate $\eta\in (0, \frac{1}{2L}]$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have
\begin{align*}
    W_2(\mu_{k}, \pi) &\leq  \left(1-\frac{\eta m}{4}\right)^k \cdot \bigg(\sqrt{2d}\big(\mathcal{D} +  \sqrt{\tau/m} \big)\bigg)\notag\\
    &\qquad+30\kappa\sqrt{\eta m d } \cdot \sqrt{ H_{\rho}((K-1)^2+\kappa)}+2\sqrt{\frac{C_K d\tau}{Sm}(\rho^2+N(1-\rho^2)) C_S},
\end{align*}
where $C_K=\frac{\eta m K}{1-e^{-\frac{\eta m K}{2}}}$, $C_S=1$ for \emph{Scheme I} and $C_S=\frac{N-S}{N-1}$ for \emph{Scheme II}.
\end{theorem}

\begin{proof}%[Proof of Theorem \ref{theorem_partial} (equiv. to Theorem \textcolor{red}{5.10} in the main paper)]

Note that 
\begin{align*}
&\quad\ \E{\lrn{\bar\theta_{(k+1)\eta}-\theta_{k+1}}_2^2}\notag\\
&= \E{\lrn{\bar\theta_{(k+1)\eta}-\beta_{k+1}+\beta_{k+1}-\theta_{k+1}}_2^2}\notag\\
&= \E{\lrn{\bar\theta_{(k+1)\eta}-\beta_{k+1}}_2^2}+ \E{\lrn{\beta_{k+1}-\theta_{k+1}}_2^2}+\E{2\langle \bar\theta_{(k+1)\eta}-\beta_{k+1}, \beta_{k+1}-\theta_{k+1} \rangle}\notag\\
&= \E{\lrn{\bar\theta_{(k+1)\eta}-\beta_{k+1}}_2^2}+ \E{\lrn{\beta_{k+1}-\theta_{k+1}}_2^2},\notag
\end{align*}
where the last equality follows by the unbiasedness of the device-sampling scheme in Lemma \ref{unbiased_scheme}.


If $k+1 \text{ mod } K\neq 0$, we always have $\beta_{k+1}=\theta_{k+1}$ and $\E{\lrn{\beta_{k+1}-\theta_{k+1}}_2^2}=0$. Following the same argument as in Lemma \ref{one_step_Dalalyan}, both schemes lead to the one-step iterate as follows
\begin{align}\label{non_period}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2}) \cdot W^2_2(\mu_{k}, \pi)+  400\eta^2 d L^2 H_{\rho}((K-1)^2+\kappa).
\end{align}


If $k+1 \text{ mod } K= 0$, combining Lemma \ref{divergence_partial} and Lemma \ref{one_step_Dalalyan}, we have
\begin{align}\label{period}
    W_2^2(\mu_{k+1}, \pi)&\leq  (1-\frac{\eta m}{2}) \cdot W^2_2(\mu_{k}, \pi)+ 450\eta^2 d L^2 H_{\rho}(K^2+\kappa) + \frac{4Kd\eta\tau}{S}(\rho^2+N(1-\rho^2)) C_S,
\end{align}
where $C_S=1$ for \emph{Scheme I} and $C_S=\frac{N-S}{N-1}$ for \emph{Scheme II}.


Repeatedly applying Eq.\eqref{non_period} and Eq.\eqref{period} and arranging terms, we have that
\begin{align*}
    W_2^2(\mu_{k}, \pi)&\leq  \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+  \frac{2}{\eta m}\bigg(450\eta^2 d L^2 H_{\rho}(K^2+\kappa)\bigg)\notag\\
    &\qquad+ \frac{1-(1-\frac{\eta m}{2})^{K\lfloor k/K\rfloor}}{1-(1-\frac{\eta m}{2})^K}\left(  \frac{4Kd\eta\tau}{S} (\rho^2+N(1-\rho^2)) C_S  \right)\notag\\
    &\leq \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+ 900\eta m d \kappa^2  H_{\rho}((K-1)^2+\kappa)\notag\\
    &\quad\quad+\underbrace{\frac{\eta m K}{1-e^{-\frac{\eta m K}{2}}}}_{C_K} \frac{4Kd\eta\tau}{\eta mK S} (\rho^2+N(1-\rho^2)) C_S \notag\\
    &= \left(1-\frac{\eta m}{2}\right)^k W^2_2(\mu_{0}, \pi)+ 900\eta m d \kappa^2  H_{\rho}((K-1)^2+\kappa)\notag\\
    &\quad\quad+  \frac{4C_K d\tau}{Sm}(\rho^2+N(1-\rho^2)) C_S,\notag
\end{align*}
where the second inequality follows by $(1-r)^K\leq e^{-rK}$ for any $r\geq 0$.


\end{proof}



\section{Bounding contraction, discretization, and divergence}\label{sec:bouding_contraction_discretization_divergence}
%This section is organized as follows:
%\begin{itemize}
%    \item 
%\end{itemize}

\subsection{Dominated contraction property}
\label{DCP}
\begin{proof}[Proof of Lemma \ref{contraction} ] 



Given a client index $c\in[N]$, applying Theorem 2.1.12 \cite{Nesterov04} leads to
\begin{align}
\label{special_inner_product}
    \langle y-x, \nabla f^c(y)-\nabla f^c(x) \rangle\geq \frac{m L}{L+m}\lrn{y-x}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(y)-\nabla f^c(x)}_2^2,\quad \forall x,y\in\mathbb{R}^d.
\end{align}

In what follows, we have
\begin{align}
\label{iteration}
    &\quad\lrn{\beta-\theta-\eta(\nabla f(\beta)-\nabla f(\theta))}_2^2\notag\\
    &=\lrn{\beta-\theta}_2^2 -2\eta \underbrace{\langle \beta-\theta, \nabla f(\beta)-\nabla f(\theta)\rangle}_{\mathcal{I}}+\eta^2 \lrn{\nabla f(\beta)-\nabla f(\theta)}_2^2.
\end{align}

For the second item $\mathcal{I}$ in the right hand side, we have
\begin{align}
\label{target_contraction}
    \mathcal{I}&=\sum_{c=1}^N p_c\big\langle \beta-\theta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &=\sum_{c=1}^N p_c\big\langle \beta-\beta^c+\beta^c-\theta^c+\theta^c-\theta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &=-\sum_{c=1}^N p_c\left(\big\langle \beta^c-\beta, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle+\big\langle \theta-\theta^c, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\right)\notag\\
    &\quad\quad+\sum_{c=1}^N p_c\big\langle \beta^c-\theta^c, \nabla f^c(\beta^c)-\nabla f^c(\theta^c)\big\rangle\notag\\
    &\geq -\sum_{c=1}^N p_c \cdot \big((m+L)\lrn{\beta^c-\beta}_2^2+(m+L)\lrn{\theta^c-\theta}_2^2+\frac{1}{2(m+L)}\lrn{\nabla f^c(\beta^c)-\nabla f^c(\theta^c)}_2^2\big)\notag\\
    &\quad\quad+ \sum_{c=1}^N p_c \cdot \big(\frac{m L}{L+m}\lrn{\beta^c-\theta^c}_2^2 + \frac{1}{L+m} \lrn{\nabla f^c(\beta^c)-\nabla f^c(\theta^c)}_2^2 \big)\notag\\
    &\geq -(m+L)\sum_{c=1}^N p_c\left(\lrn{\beta^c-\beta}_2^2+\lrn{\theta^c-\theta}_2^2\right) + \frac{m L}{L+m}\lrn{\beta-\theta}_2^2 \notag\\
    &\quad\quad+ \frac{1}{2(L+m)} \lrn{\nabla f(\beta)-\nabla f(\theta)}_2^2,
\end{align}
where the first inequality follows by the AM-GM inequality and Eq.\eqref{special_inner_product}, respectively; the last inequality follows by Jensen's inequality such that
\begin{align*}
    \sum_{c=1}^N p_c \| \beta^c-\theta^c \|_2^2&\geq \lrn{\sum_{c=1}^N p_c  (\beta^c-\theta^c )}_2^2=\| \beta-\theta \|_2^2\notag\\
    \sum_{c=1}^N p_c \lrn{\nabla f^c(\beta^c)-\nabla f^c(\theta^c)}_2^2&\geq  \lrn{\sum_{c=1}^N p_c\bigg(\nabla f^c(\beta^c)-\nabla f^c(\theta^c)\bigg)}_2^2= \lrn{\nabla f(\beta)-\nabla f(\theta)}_2^2.
\end{align*}

Plugging Eq.\eqref{target_contraction} into Eq.\eqref{iteration}, we have
\begin{align*}
    &\quad\lrn{\beta-\theta-\eta \cdot (\nabla f(\beta)-\nabla f(\theta))}_2^2\notag\\
    &\leq \big(1-\frac{2\eta mL}{m+L}\big) \cdot \| \beta-\theta \|_2^2+\eta\big(\underbrace{\eta-\frac{1}{m+L}}_{\leq 0 \text{ if } \eta\leq \frac{1}{m+L}}\big) \cdot \| \nabla f(\beta)-\nabla f(\theta) \|_2^2\notag\\
    &\quad\quad+2\eta(m+L)\sum_{c=1}^N p_c \cdot (\| \beta^c-\beta \|_2^2+\| \theta^c-\theta \|_2^2 )\notag\\
    &\leq \left(1-\eta m\right) \|\beta-\theta \|_2^2+4\eta L\sum_{c=1}^N p_c \cdot \big(\| \beta^c-\beta \|_2^2+\| \theta^c-\theta \|_2^2\big),\notag
\end{align*}
where the last inequality follows by $\frac{2L}{m+L}\geq 1$, $m\leq L$, $1-2a\leq (1-a)^2$ for any $a$, and $\eta\in(0, \frac{1}{m+L}]$.

\end{proof}


%%%%%%%%%%%%%%% version after Nov.2 %%%%%%%%%%%%%%% 

\subsection{Discretization error}\label{dis_eroor}% actually it is not formally the numerical error.
\begin{proof}[Proof of Lemma \ref{lem:discretization}]
For any $s\in[0,\infty)$, there exists a certain $k \in \mathbb{N}^+$ such that $s\in [k\eta, (k+1)\eta)$. By the continuous dynamics of Eq.~\eqref{continuous_dynamics}, we have
\begin{align*}
    \bar\theta_{s}^c = \bar\theta^c_{k\eta}-\int^s_{k\eta} \nabla f^c(\bar\theta^c_{t})d t+\sqrt{2\tau/p_c}\int_{k\eta}^s \d \overline{W}_t,
\end{align*}
which suggests that 
\begin{align*}
    \sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2 \leq \bigg\| \int^s_{k\eta}\nabla f^c(\bar\theta^c_t) d t \bigg\|_2+\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau/p_c} \d \overline{W}_t}_2.
\end{align*}
We first square both sides, apply Young's inequality, and then take the expectation
\begin{align*}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2} &\leq 2\E{\bigg\|\int^s_{k\eta}\nabla f^c(\bar\theta^c_t) dt \bigg\|_2^2}+2\E{\sup_{ s \in [ k\eta,  (k+1)\eta ) }\lrn{\int_{k\eta}^s \sqrt{2\tau/p_c} \d \overline{W}_t}_2^2}.
\end{align*}
Then, by Cauchy Schwarz inequality and the fact that $|s-k\eta|\leq \eta$, we have
\begin{align}
    \label{eq:1st_part}
    \E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}&\leq 2\eta\E{\int^s_{k\eta} \big\| \nabla f^c(\bar\theta_t^c) \big\|_2^2 dt}+8\sum_{i=1}^d\E{\int_{k\eta}^s 2\tau/p_c \d t} \notag \\
    % &\leq 2\E{ \big\| (s-k\eta)\nabla f^c(\bar\theta^c_{\eta\lfloor \frac{s}{\eta} \rfloor }) \big\|_2^2}+8\sum_{i=1}^d\E{\int_{k\eta}^s 2\tau \d t} \notag \\
    &\leq 2\eta^2 \sup_{s}\E{ \big\| \nabla f^c(\bar\theta^c_{s}) \big\|_2^2}+16 \eta d\tau/p_c,
\end{align}
where the last inequality follows by Burkholder-Davis-Gundy inequality (\ref{BDG-inequality}) and It\^{o} isometry.

By Young's inequality and the smoothness assumption \ref{def:smooth},  we have
\begin{align}\label{eq:2nd_part}
    \sup_s \E{ \| \nabla f^c(\bar\theta^c_{s}) \|_2^2}
    = & ~ \sup_s \E{\| \nabla f^c(\bar\theta^c_{s})-\nabla f^c(\theta_*) +\nabla f^c(\theta_*) \|_2^2} \notag \\
    \leq & ~ 2\sup_s \E{\| \nabla f^c(\bar\theta^c_{s})-\nabla f^c(\theta_*) \|_2^2} +2{\lrn{\nabla f^c(\theta_*) }_2^2} \notag \\
    \leq & ~ 2L^2 \sup_s \E{\|\bar\theta^c_{s }-\theta_*\|_2^2}+2\gamma^2\notag\\
    \leq & ~ 2L^2 \left(\frac{1}{m}\bigg(\frac{\gamma^2}{m}+2d\tau\bigg)\right)+ 2\gamma^2\notag\\
    \leq & ~ 4 d\kappa\bigg(\frac{\kappa \gamma^2}{d}+L\tau\bigg),
\end{align}
where the third inequality follows by Lemma \ref{lem:L2_bound_local_continuous}, the fourth step holds since $\kappa \geq 1$. Combining Eq.~\eqref{eq:1st_part} and Eq.~\eqref{eq:2nd_part}, we have
\begin{align*}
\E{\sup_{ s \in [ k\eta, (k+1)\eta ) } \big\| \bar\theta^c_{s}-\bar\theta^c_{\eta\lfloor\frac{s}{\eta} \rfloor} \big\|_2^2}
&\leq 8\eta^2 d\kappa\bigg(\frac{\kappa\gamma^2}{d}+L\tau\bigg)+16\eta d\tau/p_c.\notag
\end{align*}

\end{proof}


\subsection{Bounded divergence and variance}\label{bounded_divergence}
% \Wei{if no decay of learning rate is required, we may polish for a better rate here}
\begin{proof}[Proof of Lemma \ref{divergence}] For any $k \ge 0$, consider $k_0=K\lfloor \frac{k}{K}\rfloor $ such that $k_0\leq k$ and $\theta_{k_0}^c=\theta_{k_0}$ for any $k\geq 0$. It is clear that  $k-k_0 \leq K-1$ for all $k\geq 0$. Consider the non-increasing learning rate such that $\eta_{k_0}\leq 2\eta_k$ for all $k-k_0\leq K-1$.

By the iterate Eq.\eqref{fed_avg_langevin_dynamics}, we have
\begin{align*}
&\quad\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_k}_2^2}\notag\\
&=\sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}-(\theta_k-\theta_{k_0})}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c\E{\lrn{\theta_k^c-\theta_{k_0}}_2^2}\notag\\
&\leq \sum_{c=1}^N p_c \E{\sum_{k=k_0}^{k-1} 2 (K-1)\eta_k^2\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2 + 4(K-1)\eta_k d \tau(\rho^2+(1-\rho^2)/p_c)}\notag\\
&\leq \sum_{c=1}^N p_c \bigg(\sum_{k=k_0}^{k-1} 2(K-1)\eta_{k_0}^2\E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}+4(K-1)\eta_{k_0} d \tau(\rho^2+(1-\rho^2)/p_c)\bigg)\notag\\
&\leq 112(K-1)^2\eta_k^2 d L^2 H_{\rho} +8(K-1)\eta_k d\tau(\rho^2 + N(1-\rho^2)),
\end{align*}
where the first inequality holds by $\E{\| \theta-\E{\theta} \|_2^2}\leq \E{\|\theta \|_2^2}$ for a stochastic variable $\theta$; the second inequality follows by $(\sum_{i=1}^{K-1} a_i)^2\leq (K-1)\sum_{i=1}^{K-1} a_i^2$; the last inequality follows by Lemma \ref{bounded_gradient_l2} and  $\eta_{k_0}^2\leq 4\eta_k^2$. $H_{\rho}$ is defined in Definition \ref{def:H_kappa_gamma}.




\end{proof}


% \subsection{Bounded variance}
\begin{proof}[Proof of Lemma \ref{lem:total_variance}] By assumption \ref{def:variance}, we have
\begin{align*}
    \E{\lrn{\nabla f(\theta)-\nabla \tilde f(\theta)}_2^2}&=\E{\lrn{\sum_{c=1}^N p_c\bigg(\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)\bigg)}_2^2}\\
    &=\sum_{c=1}^N p_c^2\E{\lrn{\nabla f^c(\theta^c)-\nabla \tilde f^c(\theta^c)}_2^2}\\
    &\leq d \sigma^2 \sum_{c=1}^N p_c^2\leq d\sigma^2 \left(\sum_{c=1}^N p_c\right)^2=d\sigma^2.
\end{align*}

\end{proof}









\section{Uniform upper bound}\label{sec:uniform_upper_bound}


\subsection{Discrete dynamics}

\begin{lemma}[Discrete dynamics]
\label{lem:L2_bound_local}
Assume assumptions  \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. We consider the generalized formulation in Algorithm \ref{alg:alg_main_text_different_seeds} with the temperature
$$T_{c,\rho}=\tau(\rho^2+(1-\rho^2)/p_c)$$ given a correlation coefficient $\rho$. For any learning rate $\eta \in (0 , 2/m)$ and $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$, we have the $\ell_2$ norm upper bound as follows
\begin{align*}
\sup_k\E{\lrn{\theta_k^c-\theta_*}_2^2}\leq d\mathcal{D}^2 + {\frac{6d}{m}\bigg(\max_{c\in[N]} T_{c, \rho}+\frac{ \sigma^2}{m} + \frac{\gamma^2 }{md}\bigg)},\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}


\begin{proof} First, we consider the $k$-th iteration, where $k\in \{1,2,\cdots, K-2, (K-1)_{-}\}$ and $(K-1)_-$ denotes the $(K-1)$-step before synchronization. Following the iterate of Eq.\eqref{local_client} in a local client of $c\in [N]$, we have
	\begin{align}\label{eq:Langevin_L2_1_local}
&\quad\ \E{\lrn{\theta_{k+1}^c-\theta_*}_2^2}\notag\\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + \sqrt{8\eta T_{c,\rho}}\E{ \langle \theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c), \xi_k \rangle } + 2\eta T_{c,\rho}\E{\|\xi_k\|_2^2} \notag \\
		&= \E{\|\theta_k^c -\theta_*- \eta\nabla \tilde f^c(\theta_k^c)\|_2^2} + 2\eta d T_{c,\rho},
	\end{align}	
	where the last equality follows from $\E{\xi_k}=0$ and the conditional independence of $\theta_k^c-\theta_*- \eta\nabla\widetilde f^c(\theta_k^c)$ and $\xi_k$. Note that
\begin{align}\label{eq:ip_1st_local}
%\small
&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla\widetilde f^c(\theta_k^c)\|_2^2} \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2}  \notag\\
& \qquad\qquad + 2 \eta \E{ \langle \theta_k^c-\theta_*-\eta \nabla f^c(\theta_k^c),\nabla f^c(\theta_k^c)-\nabla\widetilde f^c(\theta_k^c) \rangle }  \notag\\
&= \E{\left\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \right\|_2^2} + \eta^2\E{\|\nabla f^c(\theta_k^c)-\nabla \widetilde f^c(\theta_k^c)\|_2^2} \notag \\
&\leq \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}  + \eta^2 d\sigma^2, 
\end{align}
where the first step follows from simple algebra, the second step follows from the unbiasedness of the stochastic gradient, and the last step follows from Assumption \ref{def:variance}. For any $q>0$, we can upper bound the first term of Eq.\eqref{eq:ip_1st_local} as follows
\begin{align}\label{eq:ip_2nd_test_theta_star}
	&\quad\ \E{\|\theta_k^c -\theta_*- \eta \nabla f^c(\theta_k^c) \|_2^2}\notag\\
	&=\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*))-\eta\nabla f^c(\theta_*) \|_2^2}\notag\\
	&\leq (1+q)\E{\|\theta_k^c -\theta_*- \eta (\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)) \|_2^2}+\eta^2 \left(1+\frac{1}{q}\right) \|\nabla f^c(\theta_*)\|_2^2\notag\\
	&\leq (1+q)\underbrace{\left(1-\frac{\eta m}{2}\right)^2}_{\psi^2}\E{\lrn{\theta_k^c-\theta_*}_2^2}+\eta^2 \left(1+\frac{1}{q}\right)\gamma^2,
\end{align}
where the first inequality follows by the AM-GM inequality;  the second inequality is a special case of Lemma \ref{contraction} based on Assumption \ref{def:strong_convex}, where no local steps is involved before the synchronization step. Similar results have been achieved in Theorem 3 \cite{Dalalyan17}. In addition, $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2$.

Choose $q=(\frac{1+\psi}{2\psi})^2-1$ so that $(1+q)\psi^2=\frac{(1+\psi)^2}{4}$. Moreover, since $\psi=1-\frac{\eta m}{2}$, we get $\frac{1+\psi}{2}=1-\frac{1}{4}\eta m$. In addition, we have $1+\frac{1}{q}= \frac{1+q}{q}= \frac{(1+\psi)^2}{(1-\psi)(1+3\psi)}\leq \frac{2}{\eta m}$.  It follows that
\begin{align}
    \label{nice_inequality}
    \eta^2\left(1+\frac{1}{q}\right)\leq \frac{2\eta}{m}.
\end{align}

Combining Eq.~\eqref{eq:Langevin_L2_1_local}, Eq.~\eqref{eq:ip_1st_local}, Eq.~\eqref{eq:ip_2nd_test_theta_star}, and Eq.~\eqref{nice_inequality}, we have the following iterate
\begin{align*}
	\E{\|\theta_{k+1}^c-\theta_*\|_2^2} 
	\leq & ~ \underbrace{\left(1-\frac{\eta m}{4}\right)^2}_{:=g(\eta)} \E{\|\theta_k^c-\theta_*\|_2^2} + 2\eta d T_{c,\rho} +\eta^2 d \sigma^2+\frac{2\eta \gamma^2}{m}. \notag
\end{align*}

Note that $\frac{1}{1-g(\eta)}=\frac{1}{\frac{\eta m}{2}(1-\frac{\eta m}{8})}\leq \frac{3}{\eta m}$ given $\eta\in (0, \frac{2}{m})$. Recursively applying the above equation $k$ times, where $k\in \{1,2,\cdots, K-1, K_{-}\}$ and $K_-$ denotes the $K$-step without synchronization, it follows that
\begin{align}\label{recursion_v2}
	\E{\|\theta_k^c-\theta_*\|_2^2} &\le g(\eta)^{k}\| \theta_0^c-\theta_*\|_2^2 + \frac{1- g(\eta)^{k}}{1 - g(\eta)} \cdot \left(2\eta d T_{c,\rho} +\eta^2 d \sigma^2+\frac{2\eta \gamma^2}{m}\right)  \\
	&\le \|\theta_0^c-\theta_*\|_2^2 + \frac{3}{\eta m} \cdot \left(2\eta d T_{c,\rho} +\eta^2 d \sigma^2+\frac{2\eta \gamma^2}{m}\right) \notag\\
	&\leq d\mathcal{D}^2 + \underbrace{\frac{6d}{m}\bigg(\max_{c\in[N]}T_{c,\rho}+\frac{ \sigma^2}{m} + \frac{\gamma^2 }{md}\bigg)}_{:=U},\notag
\end{align}
where the second inequality holds by $g(\eta)\leq 1$, the third inequality holds because $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for any $c\in[N]$ and $\eta< \frac{2}{m}$.
In particular, the $K$-th step before synchronization yields that
\begin{align}\label{recursion_v3}
	\E{\|\theta_{K_-}^c-\theta_*\|_2^2} &\le d\mathcal{D}^2 +U.
\end{align}
Having all the results ready, for the $K$-local step after synchronization, applying Jensen's inequality
\begin{align}\label{recursion_v4}
	\E{\|\theta_K^c-\theta_*\|_2^2} 
	= & ~\E{\bigg\|\sum_{c=1}^N p_c\theta_{K_-}^c-\theta_*\bigg\|_2^2} \notag \\
	\leq & ~ \sum_{c=1}^N p_c\E{\lrn{\theta_{K_-}^c-\theta_*}_2^2} \notag \\
	\leq &~ d\mathcal{D}^2 + U.
 \end{align}
Now starting from iteration $K$, we adapt the recursion of Eq.\eqref{recursion_v2} for the $k$-th step, where $k\in\{K+1,\cdots, 2K-1, (2K)_{-}\}$ and $(2K)_-$ denotes the $2K$-step without synchronization, we have
\begin{align}\label{recursion_v5}
	&\E{\|\theta_k^c-\theta_*\|_2^2} \notag\\
	\leq & ~ g(\eta)^{k-K} \cdot  \E{\|\theta_K^c-\theta_*\|_2^2} + \frac{1- g(\eta)^{k-K}}{1 - g(\eta)}\cdot \left(2\eta d \max_{c\in[N]} T_{c,\rho} +\eta^2 d \sigma^2+\frac{2\eta \gamma^2}{m}\right)\notag \\
	\leq &  g(\eta)^{k-K}(d\mathcal{D}^2+U)+\frac{1- g(\eta)^{k-K}}{m\eta/3} \frac{m\eta}{3} U\notag \\
	\leq & d\mathcal{D}^2+ g(\eta)^{k-K} U +  (1- g(\eta)^{k-K}) U \notag\\
	\leq & d\mathcal{D}^2+U,
\end{align}
where the second inequality follows by Eq.\eqref{recursion_v4}, the fact that $1-g(\eta)\geq \eta m/3$ and $\eta\leq \frac{2}{m}$, and the definition of $U$. The third one holds since $g(\eta)\leq 1$.

By repeating the arguments of Eq.\eqref{recursion_v4} and Eq.\eqref{recursion_v5} for each subsequent block of $K$ local steps, we obtain the desired uniform upper bound for all $k\geq 0$.
\end{proof}

\emph{Discussions:} Since the above result is independent of the learning rate $\eta$, it can be naturally applied to the setting with decreasing learning rates. The details are omitted.
$\newline$

\subsection{Continuous diffusion}

\begin{lemma}[Continuous time]
\label{lem:L2_bound_local_continuous}
Assume assumption \ref{def:strong_convex} holds. We have the $\ell_2$ norm upper bound as follows %\Zhao{The following quantity doesn't have $k$, not sure $\sup_k$ make sense}\Wei{Nice catch, thanks!}
\begin{align*}
\sup_t\E{\lrn{\bar\theta_t^c-\theta_*}_2^2}\leq \frac{1}{m}\bigg(\frac{\gamma^2}{m}+2d\tau\bigg),\notag
\end{align*}
where $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2$ and $\theta_*$ denotes the global minimum for the function $f$.
\end{lemma}

\begin{proof} Since the synchronization is conducted at every time $t$, the essential temperature applied to each client is $\tau$.  Let $q(\bar\theta_t^c)=\lrn{\bar\theta_t^c-\theta_*}_2^2$. For any time $t\geq 0$, applying It\^{o}'s lemma leads to
\begin{align*}
    \d q(\bar\theta_t^c)&=-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\bar\theta_t^c)\rangle\d t + 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &=-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\bar\theta_t^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)\rangle\d t + 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -2 m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t-2\langle \bar\theta_t^c-\theta_*, \nabla f^c(\theta_*)\rangle\d t+ 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -2m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t+m \lrn{\bar\theta_t^c-\theta_*}_2^2\d t+\frac{\lrn{\nabla f^c(\theta_*)}_2^2}{m}\d t+ 2d\tau \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\\
    &\leq -m q(\bar\theta_t^c)\d t+\left(\frac{\gamma^2}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle,\notag
\end{align*}
where the first inequality follows by Assumption \ref{def:strong_convex}; the second inequality follows by the AM-GM inequality; the third inequality follows by the definition that $\gamma^2=\max_{c \in [N]} \lrn{\nabla f^c(\theta_*)}_2^2$. 

In other words, we have
\begin{align*}
    \d (e^{mt} q(\bar\theta_t^c))&=me^{mt} q(\bar\theta_t^c)\d t + e^{mt} \d q(\bar\theta_t^c)\notag\\
    &\leq me^{mt} q(\bar\theta_t^c)\d t + e^{mt}\left(-m q(\bar\theta_t^c)\d t+\left(\frac{\gamma^2}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle\notag\right)\notag\\
    &\leq e^{mt}\left(\frac{\gamma^2}{m}+ 2d\tau\right) \d t+\sqrt{8\tau}e^{mt}\langle \bar\theta_t^c-\theta_*, \d \overline{W}_t\rangle.\notag
\end{align*}

The solution is upper bounded by
\begin{align*}
    e^{mt} q(\bar\theta_t^c)\leq e^{m\cdot 0} q(\bar\theta_0^c)+\int_0^t \left(e^{ms}\left(\frac{\gamma^2}{m}+ 2d\tau\right) \d s+\sqrt{8\tau}e^{ms}\langle \bar\theta_s^c-\theta_*, \d \overline{W}_s\rangle\right)\notag.
\end{align*}

By the martingale property of the It\^{o} integral, taking expectations yields
\begin{align}\label{l2_continuous}
    \E{q(\bar\theta_t^c)}
    \leq & ~ e^{-mt}\E{q(\bar\theta_0^c)}+ e^{-mt}\left(\frac{\gamma^2}{m}+ 2d\tau\right) \int_0^t e^{ms} \d s\notag\\
    = & ~ e^{-mt}\E{q(\bar\theta_0^c)}+ \frac{1-e^{-mt}}{m}\big({\frac{\gamma^2}{m}+ 2d\tau}\big)\notag\\
    \leq & ~ e^{-mt}\E{q(\bar\theta_0^c)}+ \frac{1-e^{-mt}}{m}\big(\underbrace{\frac{\gamma^2}{m}+ 2d\tau}_{:=V}\big),
\end{align}
where the last inequality follows since the synchronization is conducted at any time step $t$. Since $\bar\theta_0^c$ is simulated from the stationary distribution $\pi$, by Lemma 12 \cite{dm+16} or Theorem 17 \cite{ccbj18}, we have
\begin{align*}
\E{q(\bar\theta_0^c)}=\E{ \| \bar\theta_0^c-\theta_* \|_2^2}\leq \frac{d\tau}{m}\leq \frac{1}{m}(\frac{\gamma^2}{m}+2d\tau)=\frac{V}{m},
\end{align*}
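Plugging this bound into Eq.\eqref{l2_continuous} gives $\E{q(\bar\theta_t^c)}\leq e^{-mt}\frac{V}{m}+(1-e^{-mt})\frac{V}{m}=\frac{V}{m}$ for all $t\geq 0$, which completes the proof.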


\end{proof}

% \subsection{Bounded gradient}
\begin{lemma}[Bounded gradient in $\ell_2$ norm]\label{bounded_gradient_l2}
Suppose Assumptions \ref{def:smooth}, \ref{def:strong_convex}, and \ref{def:variance} hold. For any client $c\in[N]$, any learning rate $\eta \in (0 , 2/m)$, and any initialization satisfying $\lrn{\theta_0^c-\theta_*}_2^2\leq d\mathcal{D}^2$ for all $c\in[N]$, we have the following $\ell_2$ norm upper bound
\begin{align*}
    \E{ \|\nabla\tilde f^c(\theta_k^c) \|_2^2 }\leq 14dL^2 H_{\rho},
\end{align*}
where $H_{\rho}=  \mathcal{D}^2+ \frac{1}{m}\max_{c\in[N]}T_{c,\rho} +\frac{\gamma^2}{m^2 d}+\frac{\sigma^2}{m^2}$.
\end{lemma}

\begin{proof}

Decompose the squared $\ell_2$ norm of the stochastic gradient as follows
\begin{align*}
    \E{\lrn{\nabla\tilde f^c(\theta_k^c)}_2^2}&= \E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)+\nabla f^c(\theta_k^c)}_2^2}\notag\\
    &= \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\E{\lrn{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c)}_2^2}\notag\\
    &\qquad+2\E{\lrw{\nabla\tilde f^c(\theta_k^c)-\nabla f^c(\theta_k^c), \nabla f^c(\theta_k^c)}} \notag \\
    &\leq \E{\lrn{\nabla f^c(\theta_k^c)}_2^2}+\sigma^2d \notag \\
    &=  \E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)+\nabla f^c(\theta_*)}_2^2}+\sigma^2d \notag \\
    &\leq 2\E{\lrn{\nabla f^c(\theta_k^c)-\nabla f^c(\theta_*)}_2^2}+2\E{\big\|\nabla f^c(\theta_*)\big\|_2^2}+\sigma^2d\notag\\
    &\leq 2 L^2 \E{\lrn{\theta_k^c-\theta_*}_2^2}+2 \gamma^2 +\sigma^2d\notag\\
    &\leq 2d L^2 \mathcal{D}^2 + \frac{12d L^2}{m} \cdot \bigg(\max_{c\in[N]}T_{c,\rho}+\frac{ \sigma^2}{m} + \frac{\gamma^2 }{md} \bigg)+{2\gamma^2}+\sigma^2 d \notag \\
    % &\leq 2d \kappa mL d\mathcal{D}^2 +12d\kappa L \cdot \bigg(\max_{c\in[N]}T_{c,\rho}+\frac{ \sigma^2}{m} + \frac{\gamma^2 }{md} \bigg) +\frac{2\gamma^2}{d}+\sigma^2 \notag \\
    &\leq 14 d L^2 \cdot \bigg( \mathcal{D}^2+\frac{1}{m}\max_{c\in[N]} T_{c,\rho} +\frac{\gamma^2}{m^2 d}+\frac{\sigma^2}{m^2} \bigg):= 14d L^2 H_{\rho},
\end{align*}
where the first inequality follows since the cross term vanishes by the unbiasedness of $\nabla\tilde f^c$ and the variance term is bounded by Assumption \ref{def:variance}; the second inequality follows by Young's inequality; the third inequality follows by Assumption  \ref{def:smooth} and the definition that $\gamma:=\max_{c\in[N]}\lrn{\nabla f^c(\theta_*)}_2$; the fourth inequality follows by Lemma \ref{lem:L2_bound_local}; the last inequality follows by $\kappa:=\frac{L}{m}\geq 1$.
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Beginning of Bounded divergence %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\section{Initial condition}\label{sec:initial_condition}

\begin{lemma}[Initial condition] 
\label{lem:W2_init_bound}
Let $\mu_0$ denote the Dirac delta distribution at $\theta_0$. % and assume $\lrn{\theta_0-\theta_*}_2^2\leq d\mathcal{D}^2$.
Then, we have
\begin{align*}
W_2(\mu_0, \pi)\leq \sqrt{2}(\| \theta_0 - \theta_* \|_2 +  \sqrt{d\tau /m} ). %\sqrt{2d\left(\mathcal{D}^2+\frac{2}{m}\right)}.
\end{align*}
\end{lemma}

\begin{proof}
By \cite{ccbj18}, there exists an optimal coupling between $\mu_0$ and $\pi$ such that
\begin{align*}
    W_2^2(\mu_0, \pi) 
    \leq & ~ \mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta\|_2^2 ]\\
    \leq & ~ 2\mathbb{E}_{\theta\sim \pi} [\|\theta_0-\theta_*\|_2^2 ] + 2 \mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2] \\
    = & ~ 2\| \theta_0 - \theta_* \|_2^2 +2\mathbb{E}_{\theta\sim \pi}[\|\theta-\theta_*\|_2^2]\\
    \leq & ~ 2\| \theta_0 - \theta_* \|_2^2 + 2d\tau /m,
\end{align*}
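where the second step follows from the triangle inequality together with $(a+b)^2\leq 2a^2+2b^2$, and the last step follows from Lemma 12 \cite{dm+16}, with the temperature $\tau$ included to adapt to the time scaling. Taking square roots on both sides and using $\sqrt{a+b}\leq \sqrt{a}+\sqrt{b}$ for $a,b\geq 0$ completes the proof.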
\end{proof}

\textbf{Burkholder--Davis--Gundy inequality.} Let $\phi:[0, \infty)\rightarrow \mathbb{R}^{r\times d}$ for some positive integers $r$ and $d$. In addition, we assume $\E{\int_0^{\infty} |\phi(s)|^2 \d s}<\infty$ and let $Z(t)=\int_0^t \phi(s)\d W_s$, where $W_s$ is a $d$-dimensional Brownian motion. Then for all $t\geq 0$, we have

\begin{align}\label{BDG-inequality}
    \E{\sup_{0\leq s\leq t} |Z(s)|^2}\leq 4\E{\int_0^t|\phi(s)|^2\d s}.
\end{align}



\input{dp}

