\section{Differential Privacy Guarantees}
Let $\D_c:=\{x_{c,i}\}_{i\in[n_c]}$ denote the dataset of the $c$-th client for $c\in[N]$. Let $\D:=\cup_{c\in[N]}\D_c$ denote the whole dataset. 
We consider $(\epsilon,\delta)$-differential privacy with respect to the substitute-one relation $\simeq_{s}$ \cite{NEURIPS2018_3b5020bb}. Two datasets satisfy $\cS\simeq_{s}\cS'$ if they have the same size and differ in exactly one data point. For $\epsilon\ge 0$ and $\delta\in[0,1]$, a mechanism $\M$ is $(\epsilon,\delta)$-differentially private w.r.t.\ $\simeq_{s}$ if, for any pair of input datasets $\cS\simeq_{s}\cS'$
and every measurable subset $E\subset \textup{Range}(\M)$, we have
\begin{equation} \label{eq:DP-def}
\prob[\M(\cS)\in E]\le e^{\epsilon}\prob[\M(\cS')\in E]+\delta.
\end{equation} 

Since Algorithm \ref{alg:alg_main_paper_text_independent_noise} is a special case of Algorithm \ref{alg:alg_main_paper_text_different_seeds} when $\rho=0$, we analyze the differential privacy guarantees of Algorithm \ref{alg:alg_main_paper_text_different_seeds} and Algorithm \ref{alg:alg_main_text_partial_main}. As FedAvg algorithms can be divided into the processes of local updates, synchronization, and broadcasting with risks of information leakage in synchronization (local model uploading and aggregation) and broadcasting, we consider the differential privacy guarantees in synchronization and broadcasting similar to \cite{wei2020federated}. Since there is no involvement of data in model aggregation and broadcasting, they are post-processing processes. Thus, it suffices to analyze the differential privacy guarantees in local model uploading.

Assume that the gradient of the loss function $l:\R^d\times \mathcal{X}\rightarrow \R,\ (\theta,x)\mapsto l(\theta;x)$ with respect to $\theta$ has an $\ell_2$-sensitivity that is bounded uniformly over $\theta\in\R^{d}$:
\begin{equation}
    \Delta_l:=\sup_{\theta\in\R^d}\sup_{x,x'\in\mathcal{X}}\|\nabla l(\theta;x)-\nabla l(\theta;x')\|_2<\infty.
\end{equation}

For any two datasets $\D\simeq_{s}\D'$, there exists $c\in[N]$ such that $\D_c\simeq_{s}\D'_{c}$ and $\D_{c'}=\D'_{c'}$ for any $c'\in[N],c'\neq c$.

Consider the function 
$m_{c}(\cS;\theta)=\theta-\frac{\eta}{\gamma} \nabla f^c(\theta;\cS)=\theta-\frac{\eta}{\gamma p_c}\sum_{x\in \cS} \nabla l^c(\theta;x)$ with $|\cS|=\gamma n_c$ ($|A|$ denotes the cardinality of set $A$). 
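For any $\theta\in \R^d$, since two neighboring subsets differ in exactly one summand and $\Delta_l$ is a supremum over all $\theta$, the $\ell_2$-sensitivity of $m_c(\cdot;\theta)$ satisfies
\begin{equation}
\Delta m_c:=\sup_{\cS_c\simeq_s \cS'_c}\|m_c(\cS_c;\theta)-m_c(\cS_c';\theta)\|_2 \le \frac{\eta}{\gamma p_c}\Delta_{l}.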
\end{equation}

For the mechanism $\M_{c}(\cS;\theta):=m_{c}(\cS;\theta)+\sqrt{2\eta\tau\rho^2}\dot{\xi}+\sqrt{2\eta(1-\rho^2)\tau/p_c}\xi$ with $\dot{\xi}$ and $\xi$ being two independent standard $d$-dimensional Gaussian vectors, since $\dot{\xi}$ is broadcast to all the clients, it can be treated as a known constant which does not contribute to the differential privacy. Thus, the standard deviation of the added Gaussian noise is $\sqrt{2\eta\tau(1-\rho^2)/p_c}$ in each dimension. Then, according to the Gaussian mechanism \cite{dwork2014algorithmic}, $\M_c(\D_c;\theta)$ is $(\epsilon_{0,c},\delta_0)$-differentially private for any $\theta\in\R^d$ with
\begin{align}
\epsilon_{0,c}=c(\delta_0)\frac{\Delta_l}{\gamma}\sqrt{\frac{\eta}{2p_c\tau(1-\rho^2)}},\quad 
c(\delta_0)=\sqrt{2\log(1.25/\delta_0)},\quad
\delta_0\in (0,1).
\end{align}

For $\cS^c$ sampled uniformly at random from all the subsets of size $\gamma n_c$ of $\D_c$, define $\tilde{\M}_c(\D_c;\theta):=\M_c(\cS^c;\theta)$. Then, according to Theorem 9 in \cite{NEURIPS2018_3b5020bb}, $\tilde{\M}_c$ is $(\log\left(1+\gamma (e^{\epsilon_{0,c}}-1)\right), \gamma\delta_0)$-differentially private.
Notice that for any $\epsilon_{0,c}\in[0,1]$ (i.e., $0\le \eta\le \frac{2p_c\tau(1-\rho^2)\gamma^2}{\Delta_l^2c(\delta_0)^2}$), we have $0\le e^{\epsilon_{0,c}}-1\le 2\epsilon_{0,c}$ and
\begin{equation}
\log\left(1+\gamma (e^{\epsilon_{0,c}}-1)\right)\le 
\log\left(1+2\gamma\epsilon_{0,c}
\right)\le 2\gamma \epsilon_{0,c}=
c(\delta_0)\Delta_l \sqrt{\frac{2\eta}{p_c\tau(1-\rho^2)}}=: \epsilon_{1,c}.
\end{equation}

Define 
\begin{equation} \label{eq:epsilon1_def}
\epsilon_1:=c(\delta_0)\Delta_l \sqrt{\frac{2\eta}{\tau(1-\rho^2)\min_{c'\in [N]}p_{c'}}}.
\end{equation}
Then, we have $\max_{c\in[N]} \epsilon_{1,c}=\epsilon_{1}$
and $\max_{c\in[N]}\epsilon_{0,c}\in[0,1]$ if
\begin{equation} \label{eq:eta_condition0}
0\le \eta\le \frac{2\tau(1-\rho^2)\gamma^2\min_{c'\in[N]}p_{c'}}{\Delta_l^2c(\delta_0)^2}.
\end{equation}
Thus, for $0\le \eta\le \frac{2\tau(1-\rho^2)\gamma^2\min_{c'\in[N]}p_{c'}}{\Delta_l^2c(\delta_0)^2}$, 
$\tilde{\M}_c(\D_c;\theta)$ is $(\epsilon_1,\gamma\delta_0)$-differentially private for any $\theta\in\R^d$. From now on, we assume that \eqref{eq:eta_condition0} holds.

Define $\M^K_c(\D_c;\theta)$ to be the $K$-fold composition of $\tilde{\M}_c(\D_c;\theta)$. According to the composition rules of $(\epsilon,\delta)$-differential privacy (Theorems 3.1 and 3.3 in \cite{dwork2010boosting}), $\M^K_c(\D_c;\theta)$ is $(\epsilon_K,\delta_K)$-differentially private with
\begin{align}
\label{eq:epsilon_K}
&\epsilon_K=\min\left\{\sqrt{2K\log(1/\delta_1)}\epsilon_1 + K\epsilon_1(e^{\epsilon_1}-1),\ 
K\epsilon_1\right\},\\
\label{eq:delta_K}
&\delta_K=K\gamma\delta_0+\delta_1,
\end{align}
for any $\delta_1\in[0,1)$.

If $\epsilon_1\in [0,1]$, we have
\begin{equation}
\epsilon_K\le \min\left\{\sqrt{2K\log(1/\delta_1)}\epsilon_1 + 2K\epsilon_1^2,\ K\epsilon_1\right\}.
\end{equation}
By \eqref{eq:epsilon1_def}, if
\begin{equation} \label{eq:eta_condition1}
0\le \eta\le \frac{\tau(1-\rho^2)\min_{c\in[N]}p_c}{2\Delta_l^2c(\delta_0)^2}\log^2\left(1+\sqrt{\frac{2\log(1/\delta_1)}{K}}\right),
\end{equation}
we have $\epsilon_1\in\left[0, \log\left(1+\sqrt{\frac{2\log(1/\delta_1)}{K}}\right)\right]$ which implies that
$K\epsilon_1(e^{\epsilon_1}-1)\le \sqrt{2K\log(1/\delta_1)}\epsilon_1$
and
\begin{equation} \label{eq:epsilon_K_bound}
\epsilon_K\le 2\sqrt{2K\log(1/\delta_1)}\epsilon_1. 
\end{equation}



\subsection{Differential privacy guarantee of Algorithm \ref{alg:alg_main_paper_text_different_seeds}}
For Algorithm \ref{alg:alg_main_paper_text_different_seeds}, since all the local models are uploaded in the synchronization process and broadcasting is post-processing, one round of local updates, synchronization, and broadcasting is $(\epsilon_K,\delta_K)$-differentially private. Thus, according to the composition rules of $(\epsilon,\delta)$-differential privacy (Theorems 3.1 and 3.3 in \cite{dwork2010boosting}), when executed for $T$ iterations (i.e., $T/K$ rounds), Algorithm \ref{alg:alg_main_paper_text_different_seeds} is $(\epsilon^{(2)}_{K,T},\delta^{(2)}_{K,T})$-differentially private with
\begin{align}
\label{eq:epsilon_alg2}
&\epsilon^{(2)}_{K,T}=\min\left\{\sqrt{2\frac{T}{K}\log(1/\delta_2)}\epsilon_{K} + \frac{T}{K}\epsilon_{K}(e^{\epsilon_K}-1),\ 
\frac{T}{K}\epsilon_K\right\},\\
\label{eq:delta_alg2}
&\delta^{(2)}_{K,T}=\frac{T}{K}\delta_K+\delta_2=T\gamma \delta_0+\frac{T}{K}\delta_1+\delta_2,
\end{align}
for any $\delta_1,\delta_2\in[0,1)$. 

According to \eqref{eq:epsilon_K}, \eqref{eq:delta_K}, \eqref{eq:epsilon_alg2}, and \eqref{eq:delta_alg2}, by letting $\delta_1=\delta_2=0$, we know that Algorithm \ref{alg:alg_main_paper_text_different_seeds} is at least $(T\epsilon_1,T\gamma \delta_0)$-differentially private.

For $\eta$ satisfying
\begin{equation*}
0\le \eta\le \frac{\tau(1-\rho^2)\min_{c\in[N]}p_c}{16\Delta_l^2c(\delta_0)^2K\log(1/\delta_1)}
\log^2\left(1+\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right),
\end{equation*}
we have 
\begin{equation} \label{eq:epsilon_K_bound_bound}
2\sqrt{2K\log(1/\delta_1)}\epsilon_1\le \log\left(1+\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right).
\end{equation}
Thus, if 
\begin{equation} \label{eq:eta_condition2}
0\le \eta\le \frac{\tau(1-\rho^2)\min_{c\in[N]}p_c}{2\Delta_l^2c(\delta_0)^2}
\min\left\{
\log^2\left(1+\sqrt{\frac{2\log(1/\delta_1)}{K}}\right),\ 
\frac{\log^2\left(1+\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right)}{8K\log(1/\delta_1)}
\right\},
\end{equation}
according to \eqref{eq:epsilon_K_bound} and \eqref{eq:epsilon_K_bound_bound}, we have
\begin{equation} \label{eq:epsilon_K_bound1}
\epsilon_K\le 2\sqrt{2K\log(1/\delta_1)}\epsilon_1\le \log\left(1+\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right)
\end{equation}
which implies that
\begin{equation*}
\sqrt{2\frac{T}{K}\log(1/\delta_2)}\epsilon_{K}\ge  \frac{T}{K}\epsilon_{K}(e^{\epsilon_K}-1)
\end{equation*}
and
\begin{equation} \label{eq:epsilon_alg2_bound}
\epsilon_{K,T}^{(2)}\le 2\sqrt{2\frac{T}{K}\log(1/\delta_2)}\epsilon_{K}\le
8\sqrt{T\log(1/\delta_1)\log(1/\delta_2)}\epsilon_1=
8c(\delta_0)\Delta_l\sqrt{\frac{2\eta T\log(1/\delta_1)\log(1/\delta_2)}{\tau(1-\rho^2)\min_{c\in[N]}p_c}}
,
\end{equation}
with $\delta_1,\delta_2\in(0,1)$.

Notice that for any $K\ge 1$, if $T\gg 1$, using $\log(1+x)\approx x$ for $|x|\ll 1$, we can write \eqref{eq:eta_condition2} as
\begin{equation}
0\le \eta=O\left(\frac{\tau(1-\rho^2)\min_{c\in[N]}p_c\log(1/\delta_2)}{\Delta_l^2 T\log(1/\delta_0) \log(1/\delta_1)}\right),
\end{equation}
and \eqref{eq:epsilon_alg2_bound} as
\begin{equation}
\epsilon_{K,T}^{(2)}=O\left(\Delta_l\sqrt{\frac{\eta T \log(1/\delta_0)\log(1/\delta_1)\log(1/\delta_2)}{\tau(1-\rho^2)\min_{c\in[N]}p_c}}\right).
\end{equation}


\subsection{Differential privacy guarantee of Algorithm \ref{alg:alg_main_text_partial_main}}

In the synchronization process, $S$ clients selected via device-sampling scheme I or II send their local models to the center. Thus, for scheme I (with replacement) and scheme II (without replacement), according to Theorem 10 and Theorem 9 in \cite{NEURIPS2018_3b5020bb} respectively, each synchronization process is $(\tilde\epsilon_{K}, \tilde\delta_K)$-differentially private with
\begin{align}
\label{eq:DP_K_partial}
\tilde\epsilon_K = 
\log\left(1+\frac{S}{N}\left(e^{\epsilon_K}-1\right)\right),\quad
\tilde\delta_K=\frac{S}{N}\delta_K=\frac{S}{N}\left(K\gamma \delta_0+\delta_1\right).
\end{align}
Notice that similarly, if $\epsilon_K\in [0,1]$, we have
\begin{align}
\tilde\epsilon_K \le \frac{2S}{N} \epsilon_K.
\end{align}

The aggregation and broadcasting process is post-processing and preserves the guarantees of differential privacy (Proposition 2.1 in \cite{dwork2014algorithmic}).
When executed for $T$ iterations, Algorithm \ref{alg:alg_main_text_partial_main} is the $T/K$-fold composition of local updates, synchronization, and broadcasting. According to the composition rules of $(\epsilon,\delta)$-differential privacy (Theorems 3.1 and 3.3 in \cite{dwork2010boosting}), Algorithm \ref{alg:alg_main_text_partial_main} is $(\epsilon^{(3)}_{K,T},\delta^{(3)}_{K,T})$-differentially private after $T$ iterations with
\begin{align}
\label{eq:epsilon_alg3}
&\epsilon^{(3)}_{K,T}=\min\left\{\sqrt{2\frac{T}{K}\log(1/\delta_2)}\tilde\epsilon_K + \frac{T}{K}\tilde\epsilon_{K}(e^{\tilde\epsilon_{K}}-1),\ \frac{T}{K}\tilde\epsilon_K\right\},\\
\label{eq:delta_alg3}
&\delta^{(3)}_{K,T}=\frac{T}{K}\tilde\delta_K+\delta_2=\frac{S}{N}\gamma T\delta_0+ \frac{TS}{KN}\delta_1+\delta_2,
\end{align}
for $\delta_1,\delta_2\in[0,1)$ and $\delta_0\in(0,1)$. 

\subsection{Discussion on the differential privacy guarantees of Algorithm \ref{alg:alg_main_text_partial_main} under scheme II}
By \eqref{eq:epsilon_K}, \eqref{eq:delta_K}, \eqref{eq:DP_K_partial}, \eqref{eq:epsilon_alg3}, and \eqref{eq:delta_alg3} with $\delta_1=\delta_2=0$, Algorithm \ref{alg:alg_main_text_partial_main} is at least $(\frac{T}{K}\log\left(1+\frac{S}{N}(e^{K\epsilon_1}-1)\right),\frac{S}{N}\gamma T\delta_0)$-differentially private.

If
\begin{equation}
\tilde\epsilon_K\le \log\left(1+\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right),
\end{equation}
we have $\frac{T}{K}\tilde\epsilon_K(e^{\tilde\epsilon_K}-1)\le \sqrt{2\frac{T}{K}\log(1/\delta_2)}\tilde\epsilon_K$ and therefore
\begin{equation} \label{eq:epsilon_alg3_bound0}
\epsilon^{(3)}_{K,T}\le 2\sqrt{2\frac{T}{K}\log(1/\delta_2)}\tilde\epsilon_K=2\sqrt{2\frac{T}{K}\log(1/\delta_2)}\log\left(1+\frac{S}{N}(e^{\epsilon_K}-1)\right).
\end{equation}
Now assume $\eta$ satisfies \eqref{eq:eta_condition1}, then by \eqref{eq:epsilon_K_bound} and \eqref{eq:epsilon_alg3_bound0},
\begin{equation} \label{eq:epsilon_alg3_bound1}
\epsilon^{(3)}_{K,T}\le 2\sqrt{2\frac{T}{K}\log(1/\delta_2)}\log\left(1+\frac{S}{N}(e^{2\sqrt{2K\log(1/\delta_1)}\epsilon_1}-1)\right).
\end{equation}
Notice that if 
\begin{equation} \label{eq:eta_condition_alg3}
0\le \eta\le \frac{\tau(1-\rho^2)\min_{c\in[N]}p_c}{2\Delta_l^2c(\delta_0)^2}
\min\left\{
\log^2\left(1+\sqrt{\frac{2\log(1/\delta_1)}{K}}\right),\ 
\frac{\log^2\left(1+\frac{N}{S}\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right)}{8K\log(1/\delta_1)}
\right\},
\end{equation}
by \eqref{eq:epsilon1_def}, \eqref{eq:epsilon_K_bound}, and \eqref{eq:DP_K_partial}, we have 
\begin{align}
&\epsilon_1\le \frac{\log\left(1+\frac{N}{S}\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right)}{2\sqrt{2K\log(1/\delta_1)}}, \\
&\epsilon_K\le 2\sqrt{2K\log(1/\delta_1)}\epsilon_1\le \log\left(1+\frac{N}{S}\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right),\\
&\tilde\epsilon_K\le \log\left(1+\sqrt{\frac{2K\log(1/\delta_2)}{T}}\right).
\end{align}
Thus, \eqref{eq:epsilon_alg3_bound1} holds.
Plugging \eqref{eq:epsilon1_def} into \eqref{eq:epsilon_alg3_bound1}, we have
\begin{equation} \label{eq:epsilon_alg3_bound2}
\epsilon^{(3)}_{K,T}\le 2\sqrt{2\frac{T}{K}\log(1/\delta_2)}\log\left(1+\frac{S}{N}\left(\exp\left\{4c(\delta_0)\Delta_l\sqrt{\frac{\eta K\log(1/\delta_1)}{\tau(1-\rho^2)\min_{c\in[N]}p_c}}\right\}-1\right)\right).
\end{equation}

For any $K\ge 1$, if $\eta K\ll 1$ and $T\gg 1$, we can write \eqref{eq:eta_condition_alg3} as 
\begin{equation}
0\le \eta=O\left(\frac{\tau(1-\rho^2)N^2\min_{c\in[N]}p_c\log(1/\delta_2)}{\Delta_l^2 S^2 T\log(1/\delta_0) \log(1/\delta_1)}\right),
\end{equation}
and \eqref{eq:epsilon_alg3_bound2} as
\begin{equation}
\epsilon_{K,T}^{(3)}=O\left(\frac{S\Delta_l}{N}\sqrt{\frac{\eta T \log(1/\delta_0)\log(1/\delta_1)\log(1/\delta_2)}{\tau(1-\rho^2)\min_{c\in[N]}p_c}}\right).
\end{equation}