\section{Proposed \algname~Algorithm}\label{sec:algorithms} % \carlee{``algorithms'' or ``algorithm'' in the title?}
% We first give an overview of \algname's main steps and then discuss each step in more detail.
% \subsection{Overview}
% \carlee{This paragraph seems more like part of the algorithm section. It should also explain why each client maintains these three types of parameters.}
% Like most federated learning algorithms, we consider an iterative training process.
% At each round $t = 1,2,\ldots,T$, each client $i$ maintains two different types of parameters: (i) its estimate of the cluster center $\mathbf{c}_{is}^t$ of each cluster $s$; (ii) the cluster to which each data point $d\in \mathcal{D}_i$ is associated (i.e., estimated to belong) and thus the portion of data of each cluster $s$, denoted by $u_{is}^t$. In each round $t$, clients update these parameters based on their local data and information received from their neighbors.
% Each round of training consists of \textbf{four steps}: (1) local training, (2) parameter exchange, (3) parameter (i.e., cluster center) update, and (4) data clustering; we follow the last training round with a \textbf{final personalization step} consisting of a local training update to each client's personalized model. 
% Like most federated learning algorithms, we consider an iterative training process. 
At each round \( t = 1, 2, \ldots, T \), each client \( i \) maintains two types of parameters: (i) its estimate of the cluster center \(\mathbf{c}_{is}^t\) for each cluster \( s \), and (ii) the cluster to which each data point \( d \in \mathcal{D}_i \) is associated, and the corresponding fraction of its data belonging to each cluster \( s \), denoted by \( u_{is}^t \). In each round \( t \), clients update these parameters based on their local data and information received from their neighbors.

Each round of training consists of \textbf{four steps}: (1) local training, (2) parameter exchange, (3) parameter (i.e., cluster center) update, and (4) data clustering. Following the last training round, we conduct a \textbf{final personalization step}, which involves a local training update to each client's personalized model. Algorithm \ref{alg:CPDFL} formalizes this method.



%We give an overview of the algorithm in this section and then elaborate its details. 
%
% \carlee{The below part of this subsection can be removed; left it for now to take up space.}
% % The algorithms consists of 5 parts. In the first training phase, it consists 4 parts, which are, the local training part, the parameter exchange part, the parameter update part and the data clustering part. 
%
% \textbf{Training steps.} In each round $t$, every client first selects one of the $S$ clusters. Each client then uses multiple iterations of stochastic gradient descent (SGD) to update its parameter estimates $C_{is}^t$ for the selected cluster $s$, using its local data that was estimated to belong to the selected cluster in the last training round. Clients then broadcast the updated model parameters for their selected clusters to their neighbors. Upon receipt of these parameters from their neighbors, clients update their estimates for the parameters of each cluster $s$ by taking a weighted sum of their neighbors' model parameters for cluster $s$. Each client then updates its estimate of which of its local data points $d\in D_i$ belongs to each cluster $s$, by assigning each local data point to the cluster whose (newly updated) center yields the smallest loss. We then move to the next round of training, $t + 1$.
% % chose to update parameters of a selected cluster and use the data associated with that cluster to do multiple steps of local stochastic gradient decent update. Then, each client broadcasts their updated model parameters to their neighbors. For each client, after they received all model parameters from their neighbors, they updated their estimation of the model parameters of the certain cluster, which we called cluster centers based on the weighted sum of their neighbors' model parameters for each cluster. After calculating the new model parameters, each local client calculate the loss of their own dataset according to the model parameters of different clusters. Associate each data with the cluster that yields the minimum loss and update the portion of the The data of each cluster. The above four steps form a full round.
%
% \textbf{Personalized models.} After the models reach consensus on the cluster centers, each client forms its personalized model as a weighted sum of its estimates for each cluster center, where the weights are proportional to $u^t_{is}$, the fraction of data points at client $i$ corresponding to cluster $s$.
% %aggregating the local model parameters of each client using weighted sum with the weight proportional to the portion of data associated with each cluster. Then for every client, they form a single personalized model. In the final phase, after aggregating the local model for different clusters, we 
% To further finetune the personalized model, each client then runs a few epochs of local training on its entire local dataset $D_i$.
%
%The details of the algorithms will be explained in the rest of this section. 

% \carlee{cite line numbers in the algorithm throughout this section as you discuss different parts of it}

% \subsection{Steps 1 and 2: Local Training and Parameter Exchange}
% \textbf{Local training.} In round $t$, each client $i$ has $u^t_{is}$ portion of data for cluster $s$, where $\sum_{s=1}^S u^t_{is} = 1$ (note that these $u^t_{is}$ are computed at the end of the previous training round, in step 4). Client $i$ then selects cluster $s$ to update in this round with probability $u_{is}$. Intuitively, this process ensures that clients with contribute more to training clusters where they have more data residing in that cluster. By having each client select only a single cluster in each round, \textit{\algname~ensures that the training overhead does not scale with the number of clusters $S$}, as each client only trains a single model, no matter the size of $S$. 
% Once cluster $s$ is selected, the client runs $\tau$ SGD updates of $\mathbf{c}^t_{is}$, its current estimate for the center of cluster $s$, with learning rate $\eta$. The gradients are computed on the risk of the data associated with the selected cluster, $\mathcal{D}_{i,s}^t$, as $\nabla_{\mathbf{c}}\ell(\mathbf{c}; d)$ where $d$ is a sample drawn uniformly at random from $\mathcal{D}_{i, s}^t$. Note that $\mathcal{D}_{i, s}^t$ is also computed in the data clustering step of the previous training round, which associates each data point $d\in \mathcal{D}_i$ with a cluster. % \carlee{SGD or full-batch GD?}
%\textbf{Local training.} (line 12 in Algorithms \ref{alg:CPDFL}) In round \( t \), each client \( i \) has \( u^t_{is} \) portion of data that is estimated to come from cluster \( s \), where \(\sum_{s=1}^S u^t_{is} = 1\). These \( u^t_{is} \) values are computed at the end of the previous training round, in step 4. Client \( i \) then selects cluster \( s \) to update in this round with probability \( u_{is} \). This process ensures that clients contribute more to training clusters where they have more data. By having each client select only a single cluster in each round, \textit{\algname~ensures that the training overhead does not scale with the number of clusters \( S \)}, as each client only trains a single model, regardless of the size of \( S \).

% Once cluster \( s \) is selected, the client runs \( \tau \) SGD updates of \(\mathbf{c}^t_{is}\), its current estimate for the center of cluster \( s \), with learning rate \(\eta\). The gradients are computed on the risk of the data associated with the selected cluster, \(\mathcal{D}_{i,s}^t\), as \(\nabla_{\mathbf{c}}\ell(\mathbf{c}; d)\) where \( d \) is a sample drawn uniformly at random from \(\mathcal{D}_{i, s}^t\). Note that \(\mathcal{D}_{i, s}^t\) is also computed in the data clustering step of the previous training round, which associates each data point \( d \in \mathcal{D}_i \) with a cluster. The stochastic form $f_{is}(\mathbf{c}) = \mathbb{E}_{d\sim D_{i, s}}[F_{is}(\mathbf{c}, d)]$ will recover the case of deterministic optimization when equal. % \carlee{define $f_{is}$ here?}

\textbf{Step 1: Local training} (line 12 in Algorithm \ref{alg:CPDFL}). In round \( t \), each client \( i \) has an estimated portion \( u^t_{is} \) of its data coming from cluster \( s \), where \(\sum_{s=1}^S u^t_{is} = 1\). These values are computed at the end of the previous round (step 4). Client \( i \) then selects cluster \( s \) to update with probability \( u_{is}^t \), ensuring that clients contribute more to clusters where they have more data. By selecting only one cluster per round, \textit{\textbf{\algname}~keeps the training overhead independent of the number of clusters \( S \)}, as each client always trains a single cluster's model. % regardless of \( S \)'s size.

Once a cluster \( s \) is selected, the client performs \( \tau \) SGD updates on its current cluster center estimate \(\mathbf{c}^t_{is}\) using learning rate \(\eta\). Gradients are computed on the risk of the data associated with the selected cluster, \(\mathcal{D}_{i,s}^t\), as \(\nabla_{\mathbf{c}}\ell(\mathbf{c}; d)\), where \( d \) is sampled uniformly at random from \(\mathcal{D}_{i, s}^t\). The dataset \(\mathcal{D}_{i, s}^t\) is formed in the previous round's clustering step, which assigns each data point \( d \in \mathcal{D}_i \) to a cluster. % The stochastic form \( f_{is}(\mathbf{c}) = \mathbb{E}_{d \sim D_{i, s}}[F_{is}(\mathbf{c}, d)] \) will reduce to deterministic optimization when applicable. \carlee{what does this last sentence mean?}


%\begin{equation}
%    F_{i}(t) = \ell(\mathbf{c}_{i, s}^t; \mathcal{D}_{i, s}^t)
%\end{equation}

%Here $c$ is the selected cluster, $\mathcal{D}_{i, c}^t$ is the collection of data associated with cluster $c$ for client $i$ at time $t$, and $\mathbf{x}_{i, c}^t$ is equal to $C_{is}^t$, which are client $i$'s current estimates of $c$'s cluster center. Each client do the local update $\tau$ times. % Where the gradient can be written as: \carlee{Do we need the gradient equation? This also appears to be full-batch training since the entire $\mathcal{D}$ is used?}
% \begin{equation}
%    \nabla_{\mathbf{x}_{i, c}^t} h_{i}(t) = \nabla f_{i}(\mathbf{x}_{i, c}^t; \mathcal{D}_{i, c}^t)
% \end{equation}


% \textbf{Parameter exchange.} (line 19 in Algorithms \ref{alg:CPDFL}) Letting $s_i^t$ denote the selected cluster of client $i$ in round $t$, each client $i$ now has an updated value for $\mathbf{c}_{i{s_i^t}}^t$. Each client then broadcasts both $s_i^t$ and $\mathbf{c}_{i{s_i^t}}^t$ to its neighbors. Thus, each client $i$ receives communications $\left\{s_j^t, \mathbf{c}_{j{s_j^t}}^t\right\}_{j\in\mathcal{N}_i}$ from all of its neighbors.

\textbf{Step 2: Parameter exchange} (line 18 in Algorithm \ref{alg:CPDFL}). Let \( s_i^t \) be the cluster selected by client \( i \) in round \( t \), meaning that \( \mathbf{c}_{i{s_i^t}}^t \) has been updated. Client \( i \) broadcasts \( s_i^t \) and \( \mathbf{c}_{i{s_i^t}}^t \) to its neighbors $j \in \mathcal{N}_i$. Consequently, each client \( i \) receives the communications \( \{s_j^t, \mathbf{c}_{j{s_j^t}}^t\}_{j \in \mathcal{N}_i} \) from all its neighbors.

%Each client keeps these as the new model parameters for the selected cluster and then broadcasts these new parameters (and the selected cluster index $i$) to their neighbors.

\textbf{Step 3: Cluster center updates}
%After receiving the updated cluster center parameters and corresponding cluster indices from its neighbors, each client $i$ updates its estimate of each cluster center $s$ with the average of the updates received for cluster $s$ in round $t$:
% \begin{equation}
    %\mathbf{c}_{is}^{t+1} = \frac{1}{|k \in \mathcal{N}_i \cap k \in C_s|}\sum_{k \in \mathcal{N} \cap k \in C_s}\mathbf{C}_{s}^t(k)
%    \mathbf{c}_{is}^{t+1} = \frac{1}{|j \in \mathcal{N}_i \cap s_j^t = s|}\sum_{j \in \mathcal{N}_i \cap s_j^t = s}\mathbf{c}_{js}^t
%    \label{eq:cluster_update}
%\end{equation}
%Here $|j \in \mathcal{N}_i \cap s_j^t = s|$ indicates the number of clients $j$ that are both neighbors of $i$ and selected cluster $s$ to update in this round. We slightly abuse notation and let $\mathcal{N}_i$ include client $i$ itself, i.e., client $i$'s local training is used to update its cluster center estimates. Since each client may receive center updates from multiple clusters, the client runs Eq.~\eqref{eq:cluster_update}'s update for \textit{all} clusters $s$ for which it has received at least one update.
%If the client does not receive any updates for a cluster center $s$ in round $t$, i.e., there is no neighboring clients selected cluster $s$ for their local training, its estimated cluster center remains unchanged, i.e., $\mathbf{c}_{is}^{t + 1} = \mathbf{c}_{is}^{t}$. The update rule can be written as $\mathbf{C}_{s}^{t+1} = \mathbf{W}_{s}^t \mathbf{C}_{s}^{t+1}$, where $\mathbf{W}_{s}^t$ is the weight matrix for cluster $s$ at time $t$.
% (line 25 in Algorithms \ref{alg:CPDFL}) After receiving the updated cluster center parameters and corresponding cluster indices from its neighbors, each client \( i \) updates its estimate of each cluster center \( s \) with the average of the updates received for cluster \( s \) in round \( t \):
% \begin{equation}
% %\tiny
%     \mathbf{c}_{is}^{t+1} = \frac{1}{|j \in \mathcal{N}[i] \cap s_j^t = s|}\sum_{j \in \mathcal{N}[i] \cap s_j^t = s}\mathbf{c}_{js}^t
%     \label{eq:cluster_update}
% \end{equation}
% Here,  \(\mathcal{N}[i]\) means the closed neighbourhood, which is the set include client \( i \) itself and its neighboring clients. \( |j \in \mathcal{N}[i] \cap s_j^t = s| \) indicates the number of clients \( j \) that are both selected to update cluster \( s \) in this round and in the set \(\mathcal{N}[i]\).  % Since each client may receive center updates from multiple clusters, 
% The client applies Eq.~\eqref{eq:cluster_update}'s update for \textit{all} clusters \( s \) for which it has received at least one update. If the client does not receive any updates for a cluster center \( s \) in round \( t \), i.e., no neighboring clients selected cluster \( s \) for their local training, its estimated cluster center remains unchanged: \(\mathbf{c}_{is}^{t+1} = \mathbf{c}_{is}^{t}\). The update rule can be written in matrix form as \(\mathbf{C}_{s}^{t+1} = \mathbf{W}_{s}^t \mathbf{C}_{s}^{t+1}\), where \(\mathbf{W}_{s}^t\) is the weight matrix for cluster \( s \) at time \( t \) and $\mathbf{C}_{s}^{t}=[\mathbf{c}_{1s}^{t}, ..., \mathbf{c}_{Ns}^{t}]$ consists of the concatenated cluster centers. % \carlee{also define $C$}
%
(line 23 in Algorithm \ref{alg:CPDFL}). After receiving the updated cluster centers and indices from its neighbors, each client \( i \) updates the cluster center of the cluster \( s \) it selected to update during this round. The client uses the average of its received cluster centers to update its estimate of \( \mathbf{c}_{is} \):
\begin{equation}
% \small
    \mathbf{c}_{is}^{t+1} = \frac{1}{|\{ j \in \mathcal{N}[i] : s_j^t = s \}|} \sum_{j \in \mathcal{N}[i] :\, s_j^t = s} \mathbf{c}_{js}^t
    \label{eq:cluster_update}
\end{equation}
Here, \(\mathcal{N}[i]\) is the closed neighborhood, including client \( i \) and its neighboring clients, and \( |\{ j \in \mathcal{N}[i] : s_j^t = s \}| \) represents the number of clients \( j \) that both updated cluster \( s \) and belong to \(\mathcal{N}[i]\). %The client applies Eq.~\eqref{eq:cluster_update} for \textit{all} clusters \( s \) for which it received at least one update. 
If no updates for cluster \( s \) are received in round \( t \), i.e., none of the neighbors selected it, the estimated cluster center remains unchanged: \(\mathbf{c}_{is}^{t+1} = \mathbf{c}_{is}^{t}\). This update rule can be expressed in matrix form as \(\mathbf{C}_{s}^{t+1} = \mathbf{W}_{s}^t \mathbf{C}_{s}^{t}\), where \(\mathbf{W}_{s}^t\) is the weight matrix for cluster \( s \) at time \( t \), and \(\mathbf{C}_{s}^{t} = [\mathbf{c}_{1s}^{t}, \dots, \mathbf{c}_{Ns}^{t}]\) contains the concatenated cluster centers.


% just simply remains the same. This can be written as:

% \carlee{write the matrix form of the cluster updates here, define $\mathbf{W}_s^t$}

% \begin{equation}
%     C_{is}^{t+1} = C_{is}^{t}
% \end{equation}

\textbf{Step 4: Data clustering}
% After each client updates its estimates for the cluster centers, it uses these updated estimates to associate each of its data points $d\in\mathcal{D}_i$ with a cluster. To do so, each client $i$ first calculates the loss $\ell(\mathbf{c}_{is}^{t + 1}, d)$ of each data point $d$ for cluster $s$. We then associate each data point with the cluster that yields the lowest loss value. Given these new associations of each data point with a cluster, we can then compute $u_{is}^{t + 1}$, the fraction of data points associated with cluster $s$. This step thus allows \algname~to learn the mixture coefficients of each client as the cluster center estimates evolve.
%Once the data clustering is complete, the training round $t$ advances to round $t + 1$, and we begin again with step 1's local training.
%
%(line 32 in Algorithms \ref{alg:CPDFL}) After each client updates its estimates for the cluster centers, it uses these updated estimates to associate each of its data points \( d \in \mathcal{D}_i \) with a cluster. To do so, each client \( i \) first calculates the loss \(\ell(\mathbf{c}_{is}^{t+1}, d)\) for each data point \( d \) with respect to each cluster \( s \). Each data point is then associated with the cluster that yields the lowest loss value. Given these new associations, we can compute \( u_{is}^{t+1} \), the fraction of data points associated with cluster \( s \). This step allows \textit{\algname} to learn the mixture coefficients of each client as the cluster center estimates evolve. Once the data clustering is complete, the training round \( t \) advances to round \( t+1 \), and we begin again with step 1's local training.
(line 29 in Algorithm \ref{alg:CPDFL}). After updating the cluster centers, each client \( i \) associates its data points \( d \in \mathcal{D}_i \) with a cluster. It calculates the loss \(\ell(\mathbf{c}_{is}^{t+1}, d)\) for each cluster \( s \) and assigns data point $d$ to the cluster with the lowest loss. Using these new associations, \( u_{is}^{t+1} \), the fraction of data points linked to cluster \( s \), is computed. This step enables \textbf{\algname} to adapt the mixture coefficients as cluster center estimates evolve. The process then moves to the next round, \( t+1 \), starting again with local training.


%we calculate the loss of each data with model parameters from different clusters. 
%We use $l_{ic}^t(d_k)$ to indicate the loss of data $d_k$ calculate for client $i$ with model parameters of cluster $c$ at time $t$. \carlee{I don't think you need to define another loss function here--try to stick to the same loss function as much as possible} Then $L_{i, d_k}^{t+1}$, the cluster associated with this data for client $i$ at time $t+1$ will be:
    
% \begin{equation}
%     L_{i, d_k}^{t+1} = c \text{ if } l_{ic}^t(d_k) = min(l_{i1}^t(d_k), l_{i2}^t(d_k), ..., l_{iS}^t(d_k))
% \end{equation}

%After reassign the cluster for each data, calculate the new portion of data of each cluster for every client.

% \subsection{Final Step: Personalization}\label{sec:personalization}

% (line 41 in Algorithms \ref{alg:CPDFL}) After $T$ rounds, each client can compute a personalized model by taking a weighted sum of its estimates for each cluster center.
% Formally, the client will compute its personalized model as:
% \begin{equation}
%     \mathbf{x}_i = \sum_{s=1}^S u_{i, s}^T \mathbf{c}_{i, s}^T
%     \label{eq:xi}
% \end{equation}

% \citep{marfoq2021federated} show that this weighted sum yields the optimal personalized model for client \( i \) when the loss function \( \ell \) is convex. However, most loss functions used in practice, such as cross-entropy for neural networks, are not convex. Therefore, we cannot expect this aggregated model to perform optimally in practice. To address this, we have each client run a few additional local training iterations, starting from \(\mathbf{x}_i\) as computed in Eq.~\eqref{eq:xi}, on its entire local dataset \(\mathcal{D}_i\).

% \textbf{Comparison to Prior Soft-Clustering Algorithms.} Both \citep{marfoq2021federated} and \citep{ruan2022fedsoft} use soft clustering models to learn cluster centers and personalized client models without finetuning. They directly learn personalized models in each training iteration, with a central server estimating the cluster centers from these models. In DFL, reaching a consensus on cluster models is challenging due to the numerous parameter exchanges required for personalized models to propagate to all clients. \citep{marfoq2021federated} proposes a decentralized algorithm that sets the personalized model as the weighted sum of the cluster centers at the end of each training round. However, this approach may be sub-optimal for non-convex loss functions. Finetuning within \citep{marfoq2021federated}'s framework can lead to overfitting, as the cluster center gradients are already based on the personalized models. In Section~\ref{sec:simulation}, we show that \textit{\algname} outperforms \citep{marfoq2021federated}'s FedEM algorithm, which requires each client to train all models in each training round, resulting in significantly more computing and communication per round than \textit{\algname}.

\textbf{Final Step: Personalization} %\label{sec:personalization}
%
(line 37 in Algorithm \ref{alg:CPDFL}). After \( T \) rounds, each client $i$ computes a personalized model as a weighted sum of its cluster centers:
\begin{equation}
%\small
    \mathbf{x}_i = \sum_{s=1}^S u_{i, s}^T \mathbf{c}_{i, s}^T
    \label{eq:xi}
\end{equation}
\citet{marfoq2021federated} show that Eq.~\eqref{eq:xi} provides the optimal personalized model for client \( i \) when the loss function \( \ell \) is convex. However, since most practical loss functions, such as cross-entropy for neural networks, are not convex, this aggregated model may perform poorly in practice. Thus, each client runs a few additional local training iterations, starting from \(\mathbf{x}_i\) (Eq.~\eqref{eq:xi}), using its entire local dataset \(\mathcal{D}_i\).

\textbf{Comparison to prior soft clustering algorithms.} \citet{marfoq2021federated} and \citet{ruan2022fedsoft} use soft clustering to learn cluster centers and personalized models without this final personalization step, directly learning personalized models in each iteration, with a central server estimating the cluster centers. In DFL, achieving consensus on cluster models is difficult due to the extensive parameter exchanges needed for model propagation, particularly when clients have few neighbors. \citet{marfoq2021federated} propose a decentralized algorithm that sets the personalized model as a weighted sum of the cluster centers \textit{at each round's end}, which can be sub-optimal for non-convex loss functions. Such a framework can lead to overfitting in DFL, as clients have low connectivity and thus cannot rely on receiving many other clients' updates in each training round. Adding another final personalization step, as we use in \textbf{\algname}, may exacerbate this overfitting, as cluster center gradients already incorporate personalized models. In Section~\ref{sec:simulation}, we demonstrate that \textit{\textbf{\algname}~outperforms \citet{marfoq2021federated}'s \textbf{FedEM} algorithm}, which also requires each client to train all models per round, incurring significantly more computation and communication than \textbf{\algname}.
\begin{algorithm}[h!]
\caption{Our Proposed \textbf{\algname} Algorithm}
\label{alg:CPDFL}
% \begin{multicols}{2}
\begin{algorithmic}[1]
\Procedure {FedSPD}{$\eta$, $\tau$, $S$, $T$, $\mathbf{W}_s^t$}
\For {$t = 1, 2, ..., T\tau$}
\State \textproc{LocalUpdate}($\mathcal{C}(t)$)
\If {$t \bmod \tau = 0$}
\State \textproc{ParameterExchange}($\mathcal{C}(t)$, $\mathbf{A}$)
\State \textproc{ParameterUpdate}($\mathcal{C}(t)$, $\mathbf{A}$)
\State \textproc{DataClustering}($\mathcal{C}(t)$, $\mathbf{A}$)
\EndIf
\EndFor
\State \textproc{FinalPhase}($\mathcal{C}(t)$, $\mathbf{u}(t)$)
\EndProcedure
% \Procedure {ClusterSelect}{$\mathbf{u}(t)$} % \carlee{I don't think this is ever called?}
% \State Select one cluster $s$ for each client $i$ to update. The probability is proportional to $\mathbf{u}(i, s)$
% \State Construct a boolean matrix $\mathbf{b}_s$ for cluster $s$ with size $N \times X$
% \State $b_s(i) = 1$ if client $i$ is selected for cluster $s$, else $b_s(i) = 0$
% \EndProcedure
%\newline
\Procedure {LocalUpdate}{$\mathcal{C}(t)$}
\For {$i = 1, 2, ..., N$}
\State Client $i$ selects cluster $s_i^t$ to update
\State $\mathbf{c}_{i s_i^t}^{t+1} = \mathbf{c}_{i s_i^t}^{t} - \eta \nabla f_{i s_i^t} (\mathbf{c}_{i s_i^t}^{t})$ % \carlee{need to define $\mathbf{G}_t$ (it might be clearer to just use $\nabla$, since that doesn't require another definition. Also why does this iterate over the clusters?} :{I tried to use ine line single math expression. However it turns out to be harder to read}
\EndFor
\EndProcedure
%\\
\Procedure {ParameterExchange}{$\mathcal{C}(t)$, $\mathbf{A}$}
\For {$i = 1, 2, ..., N$}
\State Client $i$ sends its updated parameter $\mathbf{c}_{i s_i^t}$ and its selected cluster index $s_i^t$ to each neighbor $j \in \mathcal{N}_i$
\EndFor
\EndProcedure
%\\
\Procedure {ParameterUpdate}{$\mathcal{C}(t)$, $\mathbf{A}$}
\State Construct $\mathbf{W}_{s}^t$ for each cluster $s$. If client $i$ did not select cluster $s$ for its update, row $i$ and column $i$ of $\mathbf{W}_{s}^t$ are zero except for the diagonal entry, which equals 1, so that client $i$'s model parameters for a cluster it did not select remain the same as in the previous round.
\For {$s = 1, 2, ..., S$}
\State $\mathbf{C}_{s}^{t+1} = \mathbf{W}_{s}^t \mathbf{C}_{s}^{t+1}$
\EndFor
\EndProcedure
%\\
\Procedure {DataClustering}{$\mathcal{C}(t)$, $\mathbf{A}$}
\For {$i = 1, 2, ..., N$}
\For {$d_k \in \mathcal{D}_i$}
\State Assign data point $d_k$ to the cluster whose model parameters yield the smallest loss on $d_k$ among all clusters.
\EndFor
\State For $s=1,...,S$ update $u_{i, s}^t$ for client $i$
\EndFor
\EndProcedure
%\\
\Procedure {FinalPhase}{$\mathcal{C}(t)$, $u^t$}
\For {$i = 1, 2, ..., N$}
\State $\mathbf{X}_{i} = \sum_{s=1}^S u^t_{i, s} \mathbf{C}_s^t(i, :)$
\EndFor
\For {$t = 1, 2, \ldots, \tau_{\mathrm{final}}$}
\State \textproc{LocalUpdate}($\mathbf{X}$) \algorithmiccomment{Run gradient descent using all data of the client for the aggregated training.}% \textcolor{blue}{C: this should be more formal, e.g., ``run $k$ iterations of gradient descent''}}
\EndFor
\EndProcedure
\end{algorithmic}
% \end{multicols}
\end{algorithm}