\section{Differential Privacy Guarantees}
Proof sketch:

% \Wei{$\beta$ notition has been used before.what is T and $\gamma$}

Let
\[
\beta_{k+1}^c (\mathcal{D}) = \nabla \tilde f^c(\theta_k^c) + \sqrt{2(1-\rho^2)\tau/(\eta p_c)} \cdot \xi_k^c,
\]
be a random variable conditioned on $\theta_k^c$ and the sampled data.
Denote $\mu_{\beta_{k+1}^c(\mathcal{D})}(x)$ as the measure of the random variable $\beta_{k+1}^c(\mathcal{D})$.
For two neighboring data sets $\mathcal{D}$ and $\mathcal{D'}$, define the privacy loss random variable 
\[
W_k = \left| \log\frac{ \mu_{\beta_{k+1}^c(\mathcal{D})}(\beta_{k+1}^c(\mathcal{D})) }{ \mu_{\beta_{k+1}^c(\mathcal{D'})}(\beta_{k+1}^c(\mathcal{D})) } \right|%\Wei{\text{is the denominator all }\mathcal{D}'?}.
\]
Using Gaussian differential privacy protocol results, we know that w.p. $1-\frac{\delta}{2}$,
$W_k\leq 4 L \sqrt{ \log (T/\delta) } \frac{\eta p_c}{\tau (1-\rho^2)}$, for all $k\in[T]$.

Using this result, we can prove that our protocol is $(\epsilon, \delta)$-DP, with 
\[
\epsilon \leq 4 L \cdot n \log (T/\delta) \frac{\eta p_c}{\tau (1-\rho^2)}.
\]

When $\rho = 1$, $\epsilon$ blows up.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Assume that the gradient of the loss function $l:\R^d\times \mathcal{X}\rightarrow \R,\ (\theta,x)\mapsto l(\theta;x)$ with respect to $\theta$ has a uniformly bounded $\ell_2$-sensitivity over all $\theta\in\R^{d}$:
\begin{equation}
    \Delta_l:=\sup_{\theta\in\R^d}\sup_{x,x'\in\mathcal{X}}\|\nabla l(\theta;x)-\nabla l(\theta;x')\|_2<\infty
\end{equation}

Let $\D_c:=\{x_{c,i}\}_{i\in[n_c]}$ denote the dataset of the $c$-th client for $c\in[N]$. Let $\D:=\cup_{c\in[N]}\D_c$ denote the whole dataset. 
We consider the $(\epsilon,\delta)$-differential privacy with respect to the substitute-one relation $\simeq_{s}$ \cite{NEURIPS2018_3b5020bb}. Two datasets $\cS\simeq_{s}\cS'$ if they have the same size and differ by exactly one data point. For $\epsilon\ge 0$ and $\delta\in[0,1]$, a mechanism $\M$ is $(\epsilon,\delta)$-differentially private w.r.t. $\simeq_{s}$ if for any pair of input datasets $\cS\simeq_{s}\cS'$, 
and every measurable subset $E\subset \textup{Range}(\M)$, we have
\begin{equation} \label{eq:DP-def}
\prob[\M(\cS)\in E]\le e^{\epsilon}\prob[\M(\cS')\in E]+\delta.
\end{equation}

For any two datasets $\D\simeq_{s}\D'$, there exists $c\in[N]$ such that $\D_c\simeq_{s}\D'_{c}$ and $\D_{c'}=\D'_{c'}$ for any $c'\in[N],c'\neq c$.

Consider the function 
$m_{c}(\cS;\theta)=\theta-\frac{\eta}{\gamma} \nabla f^c(\theta;\cS)=\theta-\frac{\eta}{\gamma p_c}\sum_{x\in \cS} \nabla l^c(\theta;x)$ with $|\cS|=\gamma n_c$ ($|A|$ denotes the cardinality of set $A$). 
For any $\theta\in \R^d$, the sensitivity of $m_c(\D_c;\theta)$ is
\begin{equation}
\Delta m_c:=\sup_{\cS_c\simeq_s \cS'_c}\|m_c(\cS_c;\theta)-m_c(\cS_c';\theta)\| = \frac{\eta}{\gamma p_c}\Delta_{l}
\end{equation}

Consider the mechanism $\M_{c}(\cS;\theta):=m_{c}(\cS;\theta)+\sqrt{2\eta\tau\rho^2}\dot{\xi}+\sqrt{2\eta(1-\rho^2)\tau/p_c}\xi$, where $\dot{\xi}$ and $\xi$ are two independent standard $d$-dimensional Gaussian vectors. Since $\dot{\xi}$ is broadcast to all the clients, it can be treated as a known constant that does not contribute to the differential privacy. Thus, the standard deviation of the added Gaussian noise is $\sqrt{2\eta\tau(1-\rho^2)/p_c}$ in each dimension. Then, according to the Gaussian mechanism~\cite{dwork2014algorithmic}, $\M_c(\D_c;\theta)$ is $(\epsilon_0,\delta_0)$-differentially private for any $\theta\in\R^d$ with
\begin{align}
\epsilon_0=c(\delta_0)\frac{\Delta_l}{\gamma}\sqrt{\frac{\eta}{2p_c\tau(1-\rho^2)}},\quad 
c(\delta_0)=\sqrt{2\log(1.25/\delta_0)},\quad
\delta_0\in (0,1).
\end{align}

For $\cS^c$ sampled uniformly at random from all the subsets of size $\gamma n_c$ of $\D_c$, define $\tilde{M}_c(\D_c;\theta):=M_c(\cS^c;\theta)$. Then, according to Theorem 9 in \cite{NEURIPS2018_3b5020bb}, $\tilde{M}_c$ is $(\log\left(1+\gamma (e^{\epsilon_0}-1)\right), \gamma\delta_0)$-differentially private. 
Notice that for any $\epsilon_0\in[0,1]$, we have $0\le e^{\epsilon_0}-1\le 2\epsilon_0$ and
\begin{equation}
\log\left(1+\gamma (e^{\epsilon_0}-1)\right)\le 
\log\left(1+2\gamma\epsilon_0
\right)\le 2\gamma \epsilon_0=
2c(\delta_0)\Delta_l \sqrt{\frac{\eta}{2p_c\tau(1-\rho^2)}}=: \epsilon_1.
\end{equation}
Therefore, for $\epsilon_0\in[0,1]$, $\tilde{M}_c(\D_c;\theta)$ is $(\epsilon_1,\gamma\delta_0)$-differentially private for any $\theta\in\R^d$. 

Define $\M^K_c(\D_c;\theta)$ to be the $K$-fold composition of $\tilde{\M}_c(\D_c;\theta)$. According to the advanced composition rule (Theorem 3.20 in \cite{dwork2014algorithmic}), $\M^K_c(\D_c;\theta)$ is $(\epsilon^{(K)}_0,\delta^{(K)}_0)$-differentially private with
\begin{align}
\epsilon^{(K)}_0=\sqrt{2K\log(1/\delta_1)}\epsilon_1 + K\epsilon_1(e^{\epsilon_1}-1),
\quad \delta^{(K)}_0=K\gamma\delta_0+\delta_1.
\end{align}
If $\epsilon_1\in [0,1]$, we have
\begin{equation}
\epsilon^{(K)}_0\le \sqrt{2K\log(1/\delta_1)}\epsilon_1 + 2K\epsilon_1^2.
\end{equation}

In the synchronization process, $S$ clients selected via device-sampling scheme I or II send their local models to the center. Thus, for scheme II (without replacement), according to Theorem 9 in \cite{NEURIPS2018_3b5020bb}, each synchronization process is $(\epsilon^{(K)}_2, \delta^{(K)}_2)$-differentially private with
\begin{align}
\epsilon^{(K)}_2 = 
\log\left(1+\frac{S}{N}\left(e^{\epsilon^{(K)}_0}-1\right)\right),\quad
\delta^{(K)}_2=\frac{S}{N}\delta^{(K)}_0=\frac{S}{N}\left(K\gamma \delta_0+\delta_1\right).
\end{align}
Notice that similarly, if $\epsilon^{(K)}_0\in [0,1]$, we have
\begin{align}
\epsilon^{(K)}_2 \le \frac{2S}{N} \epsilon^{(K)}_0.
\end{align}

The aggregation and broadcasting process is post-processing and preserves the same amount of differential privacy (Proposition 2.1 in \cite{dwork2014algorithmic}).
When executed for $T$ iterations, Algorithm~\ref{alg:alg_main_text_partial_main} is the $T/K$-fold composition of local updates, synchronization, and broadcasting. According to the advanced composition rule (Theorem 3.20 in \cite{dwork2014algorithmic}), Algorithm~\ref{alg:alg_main_text_partial_main} is $(\epsilon_{K,T},\delta_{K,T})$-differentially private after $T$ iterations with
\begin{align}
&\epsilon_{K,T}=\sqrt{2\frac{T}{K}\log(1/\delta_2)}\epsilon^{(K)}_2 + \frac{T}{K}\epsilon^{(K)}_2(e^{\epsilon^{(K)}_2}-1),\\
&\delta_{K,T}=\frac{T}{K}\delta_2^{(K)}+\delta_2.
\end{align}



$$
\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot\xi_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c
$$

-----------------------------------------------------

For any two neighboring datasets $\D'$ and $\D$, let $c'\in[N]$ be such that $\D_{c'}\sim \D'_{c'}$ and $\D_{c}= \D'_{c}$ for any other $c\in[N]$, $c\neq c'$. Then we have
\begin{equation} \label{eq:sensitivity_client}
\begin{aligned}
\Delta^{c'}:=&\sup_{\theta\in\R^d}\sup_{\D_{c'}\sim\D'_{c'}}\|\nabla f^{c'}(\theta;\D_{c'})-\nabla f^{c'}(\theta;\D'_{c'})\|_2\\=&
\frac{1}{p_{c'}}\sup_{\theta\in\R^d}\sup_{\D_{c'}\sim\D'_{c'}}\|\nabla \ell^{c'}(\theta;\D_{c'})-\nabla \ell^{c'}(\theta;\D'_{c'})\|_2\\=&
\frac{\Delta_l}{p_{c'}}%\\ \le&
%\frac{\Delta_l}{\min_{c\in[N]}p_c}=:\Delta
\end{aligned}
\end{equation}
for any $c'\in[N]$. 

Consider the following algorithm:
\begin{equation}
\mathcal{M}(\theta_0,\cS,\eta,\tau,p,n)=\theta_0-\frac{\eta n}{|\cS|}\nabla f(\theta_0;\cS,p)+\sqrt{2\eta\tau/p}\xi,\ 
\text{with } \xi\sim \N(0,I_d),
\end{equation}
where $\theta_0\in\R^d$, $p\in\{p_c:c\in[N]\}$, $\nabla f(\theta;\cS,p)=\frac{1}{p}\sum_{x\in\cS}\nabla l(\theta;x)$, and $|\cS|\le n$. Define $\gamma=\frac{|\cS|}{n}$. Then according to \eqref{eq:sensitivity_client},
\begin{equation} \label{eq:sensitivity_M}
\sup_{\theta\in\R^d}\sup_{\cS\sim\cS'}\left\|\frac{\eta n}{|\cS|}\nabla f(\theta;\cS,p)-\frac{\eta n}{|\cS|}\nabla f(\theta;\cS',p)\right\|_2\le \frac{\eta\Delta_l}{\gamma p}
\end{equation}
for any $p\in\{p_c:c\in[N]\}$.

For fixed $\theta_0,\eta,\tau,p,n$, let $\pi_{\M(\cS)}(\cdot)$ be the probability density function of $\mathcal{M}(\theta_0,\cS,\eta,\tau,p,n)$. 
According to \eqref{eq:sensitivity_M} and the Gaussian mechanism \cite{dwork2014algorithmic}, for any $\cS'\sim\cS$, with probability at least $1-\delta_1$ ($0<\delta_1<1$), we have
\begin{equation}
\left|\log\frac{\pi_{\M(\cS)}(\theta)}{\pi_{\M(\cS')}(\theta)} \right|\le \epsilon
\end{equation}
for any $\theta\in\R^d$, $\epsilon\ge \frac{c\eta\Delta_l}{\gamma p\sqrt{2\eta\tau/p}}=\frac{c\eta\Delta_l}{\gamma\sqrt{2\eta\tau p}}$ with $c\ge\sqrt{2\log(1.25/\delta_1)}$. Since for any $p\in\{p_c:c\in[N]\}$,
\[
\frac{\eta\Delta_l\sqrt{2\log(1.25/\delta_1)}}{\gamma\sqrt{2\eta\tau p}}=\frac{\Delta_l}{\gamma}\sqrt{\frac{\eta\log(1.25/\delta_1)}{\tau p}}\le 
\frac{\Delta_l}{\gamma}\sqrt{\frac{\eta\log(1.25/\delta_1)}{\tau \min_{c\in[N]}p_c}},
\]
we define $\epsilon_M:=\frac{\Delta_l}{\gamma}\sqrt{\frac{\eta\log(1.25/\delta_1)}{\tau \min_{c\in[N]}p_c}}$ and therefore with probability at least $1-\delta_1$,
\begin{equation} \label{eq:epsilon_M}
\left|\log\frac{\pi_{\M(\cS)}(\theta)}{\pi_{\M(\cS')}(\theta)} \right|\le \epsilon_M
\end{equation}

Now consider a new algorithm $\mathcal{A}$ that samples a subset $\cS$ uniformly at random from the dataset $\D$ and then runs $\mathcal{M}$ with the sampled $\cS$, i.e., 
\begin{equation}
\mathcal{A}(\theta_0,\D,\eta,\tau,p,n)=\mathcal{M}(\theta_0,\cS,\eta,\tau,p,n),\ 
\text{with } \cS \text{ drawn uniformly at random from } \D.
\end{equation}
Since for any $\cS\sim\cS'$, with probability at least $1-\delta_1$, \eqref{eq:epsilon_M} holds, with similar analysis as Lemma 4.4 in \cite{beimel2014bounds}, we can prove that for any $\D\sim\D'$, with probability at least $1-\delta_1$,
\begin{equation}
\frac{\pi_{\mathcal{A}(\D)}(\theta)}{\pi_{\mathcal{A}(\D')}(\theta)}\le \frac{1+\gamma\left(e^{\epsilon_M}-1\right)}{1-\gamma\left(1-e^{-\epsilon_M}\right)}
\end{equation}
for any $\theta\in\R^d$.
Then, we know that $\mathcal{A}$ preserves $(\epsilon_A,\delta)$-differential privacy with
\begin{equation}
\epsilon_A:=\log \frac{1+\gamma\left(e^{\epsilon_M}-1\right)}{1-\gamma\left(1-e^{-\epsilon_M}\right)}
\end{equation}

For Algorithm \ref{alg:alg_main_paper_text_independent_noise} (\ref{alg:alg_main_paper_text_different_seeds}),
notice that every $K$ local steps of each client $c\in[N]$ form a composition of $\mathcal{A}(\theta_{k},\D_{c},\eta,\tau,p_{c},n_{c})$ for $k=0,\dots,K-1$. Thus, according to the advanced composition theorem in~\cite{dwork2016concentrated}, for each client $c$, the $K$ local steps preserve $(\epsilon_K:=\sqrt{2K\log(1/\delta_2)}\epsilon_A+K\epsilon_A(e^{\epsilon_A}-1)/2,\delta_K:=K\delta+\delta_2)$-differential privacy with $\delta_2\in(0,1)$. Let $\pi_{c,K}$ denote the probability density of $\theta_K$. 



Assume the gradient of the loss function $\ell^c(\theta;\D_c)$ has $L_2$-sensitivity 
$$
\Delta_2^c:=\Delta_2(\nabla \ell^c)=\sup_{\theta\in\R^{d}}\sup_{\D_c\sim\D_c'}\|\nabla \ell^c(\theta;\D_c)-\nabla \ell^c(\theta;\D_c')\|_2.
$$
for $c=1,\dots,N$. 

Assume we calculate $\tilde{\ell}^c$ using a uniform sub-sample $\mathcal{S}_c\subseteq \D_c$ \Wei{will with/out replacement matter here?} such that $|\mathcal{S}_c|=\gamma |\D_c|$ for $c=1,\dots,N$. Then we have
$$
\tilde{\Delta}^c_2:=\Delta_2\left(\nabla \tilde{\ell}^c\right)=\sup_{\theta\in\R^d}\sup_{\D_c\sim\D_c'}\left\|\frac{1}{\gamma}\sum_{i\in \mathcal{S}_c}\left(\nabla \ell(\theta;x_{c,i})-\nabla \ell(\theta;x_{c,i}')\right)\right\|_2\le \frac{\Delta^c_2}{\gamma},\quad a.s.
$$
Define $\Delta_2=\max_{1\le c\le N}\Delta_2^c$. For $\tilde{f}=\sum_{c=1}^N\tilde{\ell}^c$, we have
$$
\Delta_2\left(\nabla \tilde{f}\right)=\Delta_2\left(\sum_{c=1}^N\nabla \tilde{\ell}^c\right)\le \max_{1\le c\le N}\tilde{\Delta}_2^c=\max_{1\le c\le N}\frac{\Delta_2^c}{\gamma}=\frac{\Delta_2}{\gamma},\quad a.s.
$$

For Algorithm \ref{alg:alg_main_paper_text_independent_noise} and \ref{alg:alg_main_paper_text_different_seeds}, by the synchronization step, we have
\begin{equation} \label{eq:DP_tf}
\theta_{k+1}=\theta_k-\eta\nabla \tilde{f}(\theta_k)+\sqrt{2\eta\tau}\xi_k
\end{equation}
with $\xi_k=\rho \dot\xi_k + \sqrt{1-\rho^2}\sum_{c=1}^N \sqrt{p_c}\xi_k^c$ for Algorithm~\ref{alg:alg_main_paper_text_different_seeds}. Thus we have $\xi_k\sim \mathcal{N}(0,I_d)$. According to the Gaussian mechanism~\cite{dwork2014algorithmic}, we know that the single step \eqref{eq:DP_tf} satisfies $(\epsilon,\delta)$-differential privacy with 
\begin{equation} \label{eq:epsilon_delta}
\frac{\eta\Delta_2}{\gamma\sqrt{2\eta\tau}}\sqrt{2\log(1.25/\delta)}=\frac{\Delta_2}{\gamma}\sqrt{\frac{\eta\log(1.25/\delta)}{\tau}}\le \epsilon<1,\quad \delta>0
\end{equation}
Then, according to Theorem 3.20 in \cite{dwork2014algorithmic}, Algorithms~\ref{alg:alg_main_paper_text_independent_noise} and~\ref{alg:alg_main_paper_text_different_seeds} satisfy $(\epsilon',k\delta+\delta')$-differential privacy with 
\begin{equation}
\epsilon'=\sqrt{2k\log(1/\delta')}\epsilon+k\epsilon\left(e^{\epsilon}-1\right),\quad \delta'>0
\end{equation}
after $k$ iterations.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Consider Algorithm \ref{alg:alg_main_paper_text_different_seeds}, for each $c=1,\dots,N$, we have
\begin{equation} \label{eq:DP_beta}
\beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta\tau \rho^2}\dot\xi_k + \sqrt{2\eta(1-\rho^2)\tau/p_c}\xi_k^c,
\end{equation}
where $\nabla \tilde f^c(\theta_k^c)=\frac{1}{p_c\gamma}\sum_{i\in\mathcal{S}_c}\nabla \ell(\theta_k^c;x_{c,i})$.
Define $\zeta_k^c=\sqrt{\tau\rho^2}\dot\xi_k+\sqrt{(1-\rho^2)\tau/p_c}\xi_k^c$. Then we have $\zeta_k^c\sim \N(0,\sigma_c^2I_d)$ with $\sigma_c:=\sqrt{\tau\left(\rho^2+(1-\rho^2)/p_c\right)}$. We can rewrite \eqref{eq:DP_beta} as
\begin{equation} \label{eq:DP_beta1}
\beta_{k+1}^c=\theta_k^c-\eta\nabla \tilde f^c(\theta_k^c)+\sqrt{2\eta}\zeta_k^c,
\end{equation}

Suppose $\D'$ differs from $\D$ by $i\in\D'_{c'}$ for some $c'\in\{1,\dots, N\}$. Then for $c'$, we have
\begin{equation}
\Delta_2\left(\nabla \tilde{f}^{c'}\right)=\sup_{\theta\in\R^d}\sup_{\D_{c'}\sim\D'_{c'}}\left\|\frac{1}{p_{c'}\gamma}\sum_{i\in\mathcal{S}_{c'}}\left(\nabla\ell(\theta;x_{c',i})-\nabla\ell(\theta;x'_{c',i})\right)\right\|_2\le \frac{\Delta_2^{c'}}{p_{c'}\gamma}
\end{equation}
Thus, by the Gaussian mechanism \cite{dwork2014algorithmic}, \eqref{eq:DP_beta1} is $(\epsilon,\delta)$-differentially private with
\begin{equation}
\frac{\eta\Delta_2^{c'}}{p_{c'}\gamma\sqrt{2\eta\tau(\rho^2+(1-\rho^2)/p_{c'})}}\sqrt{2\log(1.25/\delta)}=\frac{\Delta_2^{c'}}{p_{c'}\gamma}\sqrt{\frac{\eta\log(1.25/\delta)}{\tau(\rho^2+(1-\rho^2)/p_{c'})}}\le \epsilon<1, \ \ \delta>0
\end{equation}
After $K$ iterations, for $c'$, $(\epsilon_{K},\delta_{K})$-differential privacy is satisfied with
\begin{equation}
\epsilon_K=\sqrt{2K\log(1/\delta')}\epsilon+K\epsilon\left(e^{\epsilon}-1\right),\quad \delta_K=K\delta+\delta',\quad \delta'>0
\end{equation}

For the synchronization step, we have
$$
\theta_{K+1}=\sum_{c=1}^Np_c\beta_{K}^c
$$
Define $Y_{K}=\sum_{1\le c\le N,c\neq c'}p_c\beta_{K}^c$, and write $Y(\D)$ for $Y_K$ computed on the dataset $\D$. Then we have $\theta_{K+1}=p_{c'}\beta_{K}^{c'}+Y_K$ and $Y(\D)=Y(\D')$ a.s.
Since $\beta_K^{c'}$ has $(\epsilon_K,\delta_K)$-differential privacy, for $\theta_{K+1}$, we have
\begin{equation}
\begin{aligned}
\prob[\theta_{K+1}(\D)\in E]&=\prob\left[p_{c'}\beta_{K}^{c'}(\D_{c'})+Y(\D)\in E\right]\\&=
\rE\left[\prob\left[\beta_{K}^{c'}(\D_{c'})\in \frac{1}{p_{c'}}\left(E-Y(\D)\right)\big|Y(\D)\right]\right]\\&\le
\rE\left[\exp(\epsilon_K)\prob\left[\beta_{K}^{c'}(\D'_{c'})\in \frac{1}{p_{c'}}\left(E-Y(\D)\right)\big|Y(\D)\right]+\delta_K\right]\\&=
\exp(\epsilon_K)\rE\left[\prob\left[p_{c'}\beta_K^{c'}(\D_{c'}')+Y(\D')\in E\big|Y(\D')\right]\right]+\delta_{K}\\&=
\exp(\epsilon_K)\prob\left[\theta_{K+1}(\D')\in E\right]+\delta_K
\end{aligned}
\end{equation}
Thus, Algorithm \ref{alg:alg_main_paper_text_different_seeds} has $(\epsilon_K,\delta_K)$-differential privacy after the first $K$ iterations. According to Theorem 3.20 in \cite{dwork2014algorithmic}, after $k$ iterations ($k=qK,\ q\in \mathbb{N}^+$), Algorithm \ref{alg:alg_main_paper_text_different_seeds} has $(\epsilon_k,\delta_k)$-differential privacy with
\begin{equation}
\epsilon_k=\sqrt{2q\log(1/\delta')}\epsilon_K+q\epsilon_K\left(e^{\epsilon_K}-1\right),\quad \delta_k=q\delta_K+\delta',\quad \delta'>0
\end{equation}