\section{Convergence Analysis}\label{sec:math}
%\osman{should we change the title to Convergence Analysis? Then, we need a new title for Section 5.2, which could be main results}
In Theorem~\ref{thm:4}, we prove that \textbf{\algname}~converges. We first outline our technical assumptions and then present our main results. All proof details can be found in Appendix~\ref{sec:proof}.

\textbf{Assumptions.} Our analysis relies on the following  assumptions on the risk function and gradient estimates, which are common in the literature~\citep{marfoq2021federated, ghosh2020efficient, koloskova2020unified} and weaker than those of~\citet{ruan2022fedsoft}. % \carlee{define $F$, $F_i$, $f$ (we have only defined $F_s$ for cluster $s$)}:

\begin{assumption}\label{as:1}
(Strong convexity and smoothness) The risk function $F_s$ for each cluster $s$ is $L$-smooth and $\mu$-strongly convex. That is, for some $L>0$ and $\mu > 0$, and for all $\mathbf{x}$ and $\mathbf{y}$:
\begin{equation}
%\small
\begin{aligned}
    & \| \nabla F_s(\mathbf{x}) - \nabla F_s(\mathbf{y}) \| \leq L \|\mathbf{x} - \mathbf{y} \| ; \\
    & \nabla F_s (\mathbf{x})^T(\mathbf{y}-\mathbf{x}) + \frac{\mu}{2} \|\mathbf{y} - \mathbf{x}\|^2 \leq F_s(\mathbf{y}) - F_s(\mathbf{x})
\end{aligned}
\end{equation} % \carlee{where is $\mu$?}
\end{assumption}

\begin{assumption}\label{as:2}
(Bounded risk function) The risk function $F_s$ for each cluster $s$ is lower-bounded by some $F_{inf} > 0$, i.e., 
%\begin{equation}
    $F_s(\mathbf{x}) \geq F_{inf}$.
%\end{equation}
%for some $F_{inf} > 0$.
\end{assumption}

\begin{assumption}\label{as:3}
(Unbiased gradient estimation) The gradient is unbiased, i.e.,
%\begin{equation}
    $\mathbb{E}[\nabla f_{is}(\mathbf{x})]=\nabla F_s(\mathbf{x})$.
%\end{equation}
\end{assumption}

\begin{assumption}\label{as:4}
(Bounded gradient) We have
%\begin{equation}
    $\mathbb{E} \| \nabla f_{is}(\mathbf{x})\|^2 \leq \sigma^2$
%\end{equation}
for some $\sigma^2 > 0$.
\end{assumption}

\begin{assumption}\label{as:5}
(Bounded variance of gradient estimation) The variance of the estimated gradient is bounded:
\begin{equation}
% \small
    \mathbb{E} \| \nabla f_{is}(\mathbf{x}) - \nabla F_s(\mathbf{x}) \|^2 \leq v^2 \textnormal{, for some } v^2 > 0.
\end{equation}
%for some $v^2 > 0$
\end{assumption}

%We also follow \citep{ruan2022fedsoft, ghosh2020efficient} in characterizing the cluster error:

\begin{assumption}\label{as:6}
(Bounded cluster error) Following \citet{ruan2022fedsoft, ghosh2020efficient}, \revise{during all training steps $t$}, all estimated cluster centers have bounded distance to the optimal centers. That is, for some $\delta > 0$: % \textcolor{blue}{C: for some $\delta > 0$}:
\begin{equation}
%\small
    \| \mathbf{c}_{is}^t - \mathbf{c}_{s}^{\star} \| \leq (0.5 - \alpha_0) \sqrt{\frac{\mu}{L}} \delta, \quad \forall s \in \{1, 2, \dots, S\},
\end{equation}
where $0 < \alpha _0 \leq 0.5$. % \carlee{for sufficiently large $t$?}. Without loss of generality, we also assume for all $s$, $\| \mathbf{c}_s^{\star}\| \leq 1$.
\end{assumption}
Note that this assumption will always hold for some value of $\delta$; however, a larger $\delta$, and thus larger cluster error, will also lead to slower convergence.
%\carlee{Note that this assumption will always hold for some value of $\delta$; however, a larger $\delta$, and thus larger cluster error, will also lead to slower convergence.}

We finally follow \citet{koloskova2020unified} in assuming that clients communicate sufficiently for consensus: % \carlee{in assuming that clients communicate sufficiently to induce consensus}:

\begin{assumption}\label{as:7}
(Expected consensus rate) Define $\mathbf{C}_s$ as the concatenated model parameter matrix of cluster $s$. Then, for some constant $p \in (0, 1]$ and integer $\beta \geq 1$, for all non-negative integers $l \leq \frac{T}{\beta}$ we have: % \carlee{what is $\bar{\mathbf{C}_s}$? we also have not defined $\mathbf{W}$}
\begin{equation}
%\small
    \mathbb{E} \left\| \mathbf{C}_s \prod_{t =l \beta}^{(l+1)\beta -1}\mathbf{W}_s^t - \overline{\mathbf{C}_s}\right\|_F^2 \leq (1-p) \| \mathbf{C}_s - \overline{\mathbf{C}_s} \|_F^2
\end{equation}
where $\overline{\mathbf{C}_s} := \underbrace{[\overline{\mathbf{c}}_s, \dots, \overline{\mathbf{c}}_s]}_{\text{total } N \text{ terms}}$ is the matrix with every column equal to the average of the model parameters.
\end{assumption}

%Assumption 1 - 5 are common assumptions which can be found in various works. 
% Assumption \ref{as:6} is also made in \citep{ruan2022fedsoft, ghosh2020efficient} and Assumption \ref{as:7} can also be found in \citep{koloskova2020unified}.

For simplicity, we further assume that all clients have the same amount of data (i.e., $\mathcal{D}_i$ has the same number of data points for all clients $i$) and that the number of local updates $\tau = 1$ in the remainder of this section. These can be easily relaxed if needed.

% For other derivation assumptions, we have:
% \begin{itemize}
%     \item The derivations follow the algorithms with one local update and one communication round.
%     \item For convenience, the number of data is set to be the same for all clients.    
% \end{itemize}
% These extra assumption can be relaxed easily.

%\subsection{Convergence Analysis}
%Without loss of generality, we present our results for certain cluster $i$, where $i = 1, ..., S$ is one of the $S$ clusters. Since the proof of the convergence for all $S$ clusters are the same, we drop the cluster index in the following proof for readability. We also define $n$ as the number of clients chosen to update the selected cluster. If the total amount of data in each cluster at all clients is roughly the same, $n\approx \frac{N}{S}$. We first bound the distance to the optimality of the average cluster center:
\textbf{Results.} Without loss of generality, we present our results for a specific cluster \( s \in \{1, \dots, S\} \). Since the convergence proof is identical for each of the \( S \) clusters, we omit the cluster index for clarity. Let \( n \) be the number of clients chosen to update the selected cluster. If the total amount of data across clients is roughly the same for each cluster, then \( n \approx \frac{N}{S} \). We begin by bounding the distance of the average cluster center to its optimal center:
\begin{theorem} \label{thm:1}
    (Descent lemma) For a proper choice of the learning rate $\eta_t$, the distance $\mathbb{E}\left\|\overline{\mathbf{c}}^{(t+1)}-\mathbf{c}^{\star}\right\|^2$ between the average cluster center and its optimum $\mathbf{c}^\star$ satisfies the bound~\eqref{thm1eq}:
    \begin{equation}
    % \scriptsize
%    \small
    \begin{aligned}
      &\leq  \frac{\eta_t (L+\mu)}{n}\sum_{i=1}^{n_1}\left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}_i^{(t)}\right\|^2 + \frac{18L^2\epsilon_N^2 \eta_t^2}{n^2} + v^2\eta_t^2 \\
    & + \left(1- \eta_t \mu + \frac{\eta_t \mu\epsilon_N}{n}\right)\left\|\overline{\mathbf{c}}^{(t)}-\mathbf{c}^{\star}\right\|^2 + \frac{2\epsilon_N (S-1) v^2 \eta_t^2}{n^2} \\
    &  + \left(\frac{4\eta_t^2(n-\epsilon_N)^2 L}{n^2} + 2\eta_t\frac{1 - \epsilon_N}{n}\right) \left(f\left(\overline{\mathbf{c}}^{(t)}\right)-f\left(\mathbf{c}^{\star}\right)\right)
    \end{aligned}
    \label{thm1eq}
    \end{equation}
    Here $\epsilon_N$ is the bound on the expected number of clients using the wrong data, given in Lemma~\ref{lm:ep}. %\carlee{Isn't $n_1$ then stochastic since clients probabilistically choose which cluster to update based on the amount of data they have?}. 
     %is around the order of $\frac{N}{S}$
\end{theorem}
We then derive an expression for the cluster centers estimated by individual clients.
\begin{theorem} \label{thm:2}
    (Update rule) % The model parameters
    % \carlee{Clients' estimated centers} 
    Clients' estimated centers of the cluster after time $t$ can be written as: %\carlee{what is $l\tau$?}
    \begin{equation}
%    \small
    \begin{aligned}
    \mathbf{C}^{t} & = \mathbf{C}^{l\beta}\prod_{m=l\beta}^{t-1}\mathbf{W}^{m}-\sum_{m=l\beta}^{t-1}\left(\eta_m\mathbf{G}^{m} \prod_{r=t-1}^{m}\mathbf{W}^{r}\right)
    \end{aligned}
    \end{equation}
\end{theorem}
Here $l \in \mathbb{N}$ and $\beta$ is the constant in Assumption \ref{as:7}. Given this expression, we can relate the clients' cluster center estimates to their average, showing that they eventually reach a near-consensus:
\begin{theorem} \label{thm:3}
    (Consensus distance) Define $\mathbf{E}_t = \frac{1}{N}  \sum_{i=1}^N \mathbb{E} \| \mathbf{c}_i^{(t)}- \overline{\mathbf{c}}^{(t)}\|^2$, the expected squared %\carlee{(squared)} 
    distance of the clients' model parameters to the average model parameter, averaged over all $N$ clients. It is upper-bounded by
\begin{equation*}
%\small
\begin{aligned}
%\mathbf{E}_t & \leq 
%\left(1-\frac{p}{2}\right) \mathbf{E}_{m\beta} + \frac{p}{16\beta} \sum_{j=m\beta}^{t-1} \mathbf{E}_j + \frac{36Ln\beta}{pN} \sum_{j=m\beta}^{t-1} \eta_t^2 (f(\overline{\mathbf{c}^{{j}}})-f(\mathbf{c}^{\star})) + \frac{18\beta n \sigma^2 + nv^2p}{Np} \sum_{j=m\beta}^{t-1} \eta_t^2
% CJW: regrouping to save space
\left(1-\frac{p}{2}\right) \mathbf{E}_{m\beta} + & \sum_{j=m\beta}^{t-1}  \bigg(\frac{p\mathbf{E}_j}{16\beta} + \frac{18\beta n \sigma^2 + nv^2p}{Np} \eta_j^2 \\
& {} \quad + \left(\frac{36Ln\beta\eta_j^2}{pN}\right) (f(\overline{\mathbf{c}}^{(j)})-f(\mathbf{c}^{\star})) \bigg)
\end{aligned}    
\end{equation*}
Here $p$ and $\beta$ are the constants defined in Assumption~\ref{as:7}. %\textcolor{blue}{C: is $f = f_{is}$?}
\end{theorem}

\begin{theorem} \label{thm:4}
(Cluster convergence rate)
For given target accuracy $\epsilon$, there exists a constant learning rate for which $\epsilon$ accuracy can be reached after $T$ iterations: % \carlee{I reworded this slightly, please check it's still correct.} That is:
%\begin{equation}
%\small
%\begin{aligned}
%    [\frac{n-\epsilon_N}{n}+(\frac{n-\epsilon_N}{n})^2 \eta_t L] \sum_{t=0}^T\frac{w_t}{W_T}(\mathbb{E}f(\overline{\mathbf{c}}^{(t)})-f({\mathbf{c}^{\star}})) + \mu (1-\frac{\epsilon_N}{n})\mathbb{E}\| \overline{\mathbf{c}}^{(T+1)} - \mathbf{c}^{\star}\|^2 \leq \epsilon
%\end{aligned}
%\end{equation}
\begin{equation}
%\small
\begin{aligned}
    \left[1+\left(\frac{n-\epsilon_N}{n}\right) \eta L \right] & \sum_{t=0}^T\frac{r_t}{R_T}(\mathbb{E}f(\overline{\mathbf{c}}^{(t)})-f({\mathbf{c}^{\star}})) \\
    & + \mu \mathbb{E}\| \overline{\mathbf{c}}^{(T+1)} - \mathbf{c}^{\star}\|^2 \leq \epsilon
\end{aligned}
\end{equation}
Here $r_t$ is a sequence of positive weights defined in Lemma~\ref{lm:cr} in Appendix~\ref{sec:thm4-proof} %\textcolor{blue}{C: $w_t$ doesn't appear in (9), so I don't think we need it here?} 
and $R_T = \sum_{t=0}^T r_t$. Rearranging, we find that the number of iterations $T$ required to reach an error $\epsilon$ is of the order:
%\begin{equation}
%\small
%\begin{aligned}
%    & \tilde{\mathcal{O}} \left( \frac{\sqrt{L+\mu}}{\sqrt{(n-\epsilon_N)\epsilon} \mu}(\sqrt{n}\sigma + \frac{n}{\sqrt{N}}v) + v^2 \frac{n^2 + L^2 \epsilon_N^2 + \epsilon_N (S-1)}{\mu n (n-\epsilon_N) \epsilon}  + \frac{L\beta n^{\frac{3}{2}}}{\mu p \sqrt{N}(n-\epsilon_N)}ln\frac{1}{\epsilon} \right) \\
%\end{aligned}
%\end{equation}
\begin{equation*}
%\small
\begin{aligned}
    \tilde{\mathcal{O}} \Bigg( \frac{\sqrt{L+\mu}}{\sqrt{\epsilon} \mu}\bigg(\sigma & + \frac{\sqrt{n}}{\sqrt{N}}v\bigg) + \frac{L\beta n^{\frac{3}{2}}}{\mu p \sqrt{N}(n-\epsilon_N)}\ln\left(\frac{1}{\epsilon}\right) \\
    & + v^2 \frac{n^2 + L^2 \epsilon_N^2 + \epsilon_N (S-1)}{\mu n^2 \epsilon} \Bigg).
\end{aligned}
\end{equation*}
\end{theorem}
\new{The convergence rate, asymptotically requiring $\mathcal{O}(1/\epsilon)$ training rounds to reach an error $\epsilon$ (the $v^2$ term dominates as $\epsilon \to 0$), aligns with previous works on DFL without personalization~\citep{koloskova2020unified}%\carlee{cite these works}
, leading us to conjecture that \textbf{\algname}~will converge well. We note that the network connectivity appears in this bound through the constant $p \in (0, 1]$ (Assumption \ref{as:7}), where higher connectivity indicates a larger $p$. However, the second term in the convergence rate that involves $p$ is not the dominant term. Thus, as long as the network is connected, we expect that the effect of network connectivity on convergence will be relatively minor. Our simulation results in Section \ref{sec:sim_nc} support this observation.}
%\carlee{This convergence rate, which is asymptotically $O()$, matches that shown in prior works on DFL without personalization; thus, we conjecture that \textbf{\algname}~will converge in a reasonable number of rounds.}
%\carlee{is there a way to see the effect of connectivity in these bounds? That could be a nice tie-in to the experiments and FedSPD's good performance under low connectivity}