\subsection{Proof of Theorem \ref{thm:1}}
Without loss of generality, we select a single cluster, cluster 1, for analysis; the same analysis applies to the other $S - 1$ clusters. For readability, we drop the subscript indicating the cluster number 1. We consider each client running a single step of SGD. We use $n$ to denote the number of clients selected to update this cluster, and $n_1$ and $n_0$ to denote the number of clients using correct and incorrect data, respectively%\carlee{does ``correct data'' mean all data points at that client are correctly assigned to this cluster? Or only a majority?}
 \new{ (i.e., data drawn from the selected cluster is considered correct data)}, so that $n_1 + n_0 = n$. $S$ denotes the set of clients selected to update this cluster, $S^*$ denotes the set of clients using correct data, and $\overline{S^*}$ denotes the set of clients using incorrect data.
\begin{lemma}\label{lm:ds}
    (Doubly-stochastic weight matrix preserves the average) At the communication step, if the model of each client in the network is updated according to a doubly-stochastic weight matrix $\mathbf{W}^t$ then the average after the communication step remains the same. Formally, we have:
    \begin{equation}
        \mathbf{C}^{t+1}\frac{\mathbf{1}\mathbf{1}^T}{N} = \mathbf{C}^{t}\mathbf{W}^{t}\frac{\mathbf{1}\mathbf{1}^T}{N} = \mathbf{C}^{t}\frac{\mathbf{1}\mathbf{1}^T}{N}
    \end{equation}
\end{lemma}

From Lemma \ref{lm:ds}, we can write the left-hand side of Theorem \ref{thm:1} as:

\begin{equation}
\begin{aligned}
\left\|\overline{\mathbf{c}}^{(t+1)}-\mathbf{c}^{\star}\right\|^2= & \left\|\overline{\mathbf{c}}^{(t)}-\frac{\eta_t}{n} \sum_{i=1}^n \nabla F_i\left(\mathbf{c}_i^{(t)}, D_i^{(t)}\right)-\mathbf{c}^{\star}\right\|^2 \\
= & \left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}-\frac{\eta_t}{n} \sum_{i \in S \cap S^*} \nabla F_i\left(\mathbf{c}_i^{(t)}\right) -\frac{\eta_t}{n} \sum_{i \in S \cap \overline{S^*}} \nabla F_i\left(\mathbf{c}_i^{(t)}\right)\right\|^2 \\
= & \left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}-\frac{\eta_t}{n} \sum_{i \in S \cap S^*} \nabla F_i\left(\mathbf{c}_i^{(t)}\right)\right\|^2+\left\|\frac{\eta_t}{n} \sum_{i \in S \cap \overline{S^*}} \nabla F_i\left(\mathbf{c}_i^{(t)}\right)\right\|^2\\
& -\frac{2 \eta_t}{n}\left\langle\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}-\frac{\eta_t}{n} \sum_{i \in S \cap S^*} \nabla F_i\left(\mathbf{c}_i^{(t)}\right), \sum_{i \in S \cap \overline{S^*}} \nabla F_i\left(\mathbf{c}_i^{(t)}\right)\right\rangle
\end{aligned}
\end{equation}

We denote the first and second terms on the right-hand side by $\| T_1 \|^2$ and $\| T_2 \|^2$, respectively. The above equation can then be written as:

\begin{equation}
\left\|\overline{\mathbf{c}}^{(t+1)}-\mathbf{c}^{\star}\right\|^2= \| T_1 \|^2 + \| T_2 \|^2 -2 \left\langle T_1, T_2 \right\rangle \leq (1+\alpha)\| T_1 \|^2 + (1+\alpha^{-1})\| T_2 \|^2
\end{equation}

for all $\alpha>0$.

The $T_1$ term is the standard decentralized SGD term. Following \citep{koloskova2020unified}, we bound $\| T_1 \|^2$ as:

\begin{equation}
\begin{aligned}
    \left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}-\frac{\eta_t}{n} \sum_{i \in S \cap S^*} \nabla F_i\left(\mathbf{c}_i^{(t)}\right)\right\|^2 & \leq \left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}\right\|^2+\eta_t^2\frac{n_1^2}{n^2} \underbrace{\left\|\frac{1}{n_1} \sum_{i=1}^{n_1} \nabla f_i\left(\mathbf{c}_i^{(t)}\right)\right\|^2}_{T_{11}} \\
    & + 2 \eta_t\frac{n_1}{n}\underbrace{\left\langle\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}, \frac{-1}{n_1} \sum_{i=1}^{n_1} \nabla f_i\left(\mathbf{c}_i^{(t)}\right)\right\rangle}_{T_{12}} + \eta_t^2 v^2
\end{aligned}
\end{equation}

We can bound $T_{11}$ and $T_{12}$ separately as:

\begin{equation}
\begin{aligned}
T_{11} & =\left\|\frac{1}{n_1} \sum_{i=1}^{n_1}\left(\nabla f_i\left(\mathbf{c}_i^{(t)}\right)-\nabla f_i\left(\overline{\mathbf{c}}^{(t)}\right)+\nabla f_i\left(\overline{\mathbf{c}}^{(t)}\right)-\nabla f_i\left(\mathbf{c}^{\star}\right)\right)\right\|^2 \\
& \leq \frac{2}{n_1} \sum_{i=1}^{n_1}\left\|\nabla f_i\left(\mathbf{c}_i^{(t)}\right)-\nabla f_i\left(\overline{\mathbf{c}}^{(t)}\right)\right\|^2+2\left\|\frac{1}{n_1} \sum_{i=1}^{n_1} \nabla f_i\left(\overline{\mathbf{c}}^{(t)}\right)-\frac{1}{n_1} \sum_{i=1}^{n_1} \nabla f_i\left(\mathbf{c}^{\star}\right)\right\|^2 \\
& \leq \frac{2 L^2}{n_1} \sum_{i=1}^{n_1}\left\|\mathbf{c}_i^{(t)}-\overline{\mathbf{c}}^{(t)}\right\|^2+4 L\left(f\left(\overline{\mathbf{c}}^{(t)}\right)-f(\mathbf{c}^{\star})\right)
\end{aligned}
\end{equation}


\begin{equation}
\begin{aligned}
T_{12} & =-\frac{1}{n_1} \sum_{i=1}^{n_1}\left[\left\langle\overline{\mathbf{c}}^{(t)}-\mathbf{c}_i^{(t)}, \nabla f_i\left(\mathbf{c}_i^{(t)}\right)\right\rangle+\left\langle\mathbf{c}_i^{(t)}-\mathbf{c}^{\star}, \nabla f_i\left(\mathbf{c}_i^{(t)}\right)\right\rangle\right] \\
& \leq-\frac{1}{n_1} \sum_{i=1}^{n_1}\left[f_i\left(\overline{\mathbf{c}}^{(t)}\right)-f_i\left(\mathbf{c}_i^{(t)}\right)-\frac{L}{2}\left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}_i^{(t)}\right\|^2+f_i\left(\mathbf{c}_i^{(t)}\right)-f_i\left(\mathbf{c}^{\star}\right)+\frac{\mu}{2}\left\|\mathbf{c}_i^{(t)}-\mathbf{c}^{\star}\right\|^2\right] \\
& \leq -\left(f\left(\overline{\mathbf{c}}^{(t)}\right)-f\left(\mathbf{c}^{\star}\right)\right)+\frac{L+\mu}{2n_1} \sum_{i=1}^{n_1}\left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}_i^{(t)}\right\|^2-\frac{\mu}{4}\left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}\right\|^2
\end{aligned}
\end{equation}

We now bound $T_2$. From \citep{ruan2022fedsoft} and \citep{ghosh2020efficient}, we have the following lemma:

\begin{lemma}\label{lm:ep}
    (Misclassification probability) For a data point belonging to cluster $j$, the probability $\mathbb{P}(\epsilon^{j, j'})$ of misclassifying it into cluster $j' \neq j$ can be bounded as:
    \begin{equation}
        \mathbb{P}(\epsilon^{j, j'}) \leq \frac{c_1}{\alpha_0^2\delta^4}
    \end{equation}
    By the union bound, the overall misclassification probability is bounded as:
    \begin{equation}
        \mathbb{P}(\overline{\epsilon}) \leq \frac{c_1 S}{\alpha_0^2\delta^4}
    \end{equation}
    The expected number of clients using data from the wrong cluster is bounded as:
    \begin{equation}
        \mathbb{E}\left[\left|S \cap \overline{S^*}\right|\right] \leq \frac{c_1 N}{\alpha_0^2\delta^4} = \epsilon_N
    \end{equation}
    for some constant $c_1$, where we denote this bound by $\epsilon_N$.
    
\end{lemma}

Inspired by \citep{ghosh2020efficient}, we define $T_{2k}$ as the sum of the gradients of the clients that use misclassified data points which actually belong to cluster $k$, where $k \neq 1$ (cluster 1 being the correct cluster). That is:

\begin{equation}
\begin{aligned}
T_{2k} = \sum_{i\in S \cap \overline{S^*} \cap S_k^*}\nabla F_i(\mathbf{c}_i)
\end{aligned}
\end{equation}

For each $T_{2k}$, we use $n_k$ to denote the number of clients using misclassified data that actually belongs to cluster $k$. We have:

\begin{equation}
\begin{aligned}
T_{2k} = \sum_{i=1}^{n_k}\nabla F_i^k(\mathbf{c}_i) + \sum_{i=1}^{n_k}\left(\nabla F_i (\mathbf{c}_i) - \nabla F_i^k(\mathbf{c}_i)\right)
\end{aligned}
\end{equation}

Taking the expectation and applying Markov's inequality, we obtain:

\begin{equation}
\begin{aligned}
\|T_{2k}\| & = \left\| \sum_{i=1}^{n_k}\nabla F_i^k(\mathbf{c}_i) + \sum_{i=1}^{n_k}\left(\nabla F_i (\mathbf{c}_i) - \nabla F_i^k(\mathbf{c}_i)\right) \right\| \\
& \leq 3 n_k L + \frac{\sqrt{n_k} v}{\theta_1} 
\end{aligned}
\end{equation}

This bound holds for any $\theta_1 \in (0, 1)$ with probability at least $1-\theta_1$, where we used Lemma~\ref{lm:ep}, Assumption~\ref{as:5}, Assumption~\ref{as:6}, and Markov's inequality.

Using the union bound, we see that $T_2 = \sum_k T_{2k}$ satisfies the following bound with probability at least $1-(S-1)\theta_1-\theta_2$:

\begin{equation}
\begin{aligned}
\|T_{2}\|^2 & = \| \sum_{k=2}^S T_{2k}\|^2 \leq (S-1) \sum_{k=2}^S  \|T_{2k}\|^2 \\
& \leq \frac{18L^2\epsilon_N^2}{\theta_2^2} + \frac{2\epsilon_N (S-1) v^2}{\theta_1^2 \theta_2}
\end{aligned}
\end{equation}

Here we used that, by Lemma~\ref{lm:ep} and Markov's inequality, $\sum_{k=2}^S n_k \leq \frac{\epsilon_N}{\theta_2}$ holds with probability at least $1-\theta_2$. %: \carlee{this sentence does not make sense}

Combining the bounds on $T_{11}$, $T_{12}$, and $T_2$ with Lemma~\ref{lm:ep}, we have:

\begin{equation}
\begin{aligned}
\mathbb{E}\left\|\overline{\mathbf{c}}^{(t+1)}-\mathbf{c}^{\star}\right\|^2 & \leq (1- \eta_t \mu + \eta_t \mu\frac{\epsilon_N}{n})\left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}\right\|^2 + \frac{18L^2\epsilon_N^2 \eta_t^2}{n^2} + \frac{2\epsilon_N (S-1) v^2 \eta_t^2}{n^2} + \eta_t^2v^2 \\
& + \frac{\eta_t (L+\mu)}{n}\sum_{i=1}^{n_1}\left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}_i^{(t)}\right\|^2 + \left(\frac{4\eta_t^2(n-\epsilon_N)^2 L}{n^2} - 2\eta_t + \frac{2\eta_t \epsilon_N}{n}\right) \left(f\left(\overline{\mathbf{c}}^{(t)}\right)-f\left(\mathbf{c}^{\star}\right)\right)
\end{aligned}
\end{equation}