\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}

% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 

% self_package
\usepackage{makecell}
\usepackage{multirow}
\usepackage{booktabs}
\usepackage{soul}
\usepackage{xcolor}
\usepackage{color}
% package required for \cellcolor
\usepackage{colortbl}
% automatic line breaking inside equations
\usepackage{autobreak}
\usepackage{float}
\usepackage{amssymb}
\newcommand{\comment}[1]{}
\usepackage{times}
\usepackage{subfigure}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage[ruled,linesnumbered]{algorithm2e}
\newtheorem{example}{Example}
\newtheorem{theorem}{Theorem}
\newtheorem{assumption}{Assumption}
\newtheorem{corollary}{Corollary}[theorem]
% define a custom counter named "counter"
\newcounter{counter}
% the first lemma should be numbered 4; a new counter starts at 0, so set it to 3
\setcounter{counter}{3}
% number the Lemma environment with "counter" instead of a built-in counter such as theorem's
\newtheorem{Lemma}[counter]{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{remark}{Remark}
%\theoremstyle{definition}
\newtheorem{definition}{Definition}[section]


\externaldocument{uai2023-template}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\newcommand{\gdh}[1]{{\textcolor{black}{#1}}}

\title{Fast Heterogeneous Federated Learning with Hybrid Client Selection\\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Duanxiao~Song$^*$}
\author[1,2]{Guangyuan~Shen$^*$}
\author[1,2]{Dehong~Gao\thanks{The first three authors contribute equally to this work. }}
\author[1]{Libin~Yang\thanks{Contact author. }}
\author[1]{Xukai~Zhou}
\author[3]{Shirui~Pan}
\author[4]{Wei~Lou}
\author[1]{Fang~Zhou}
% Add affiliations after the authors
\affil[1]{% of Cyber Science and Technology
    \gdh{Department of Cybersecurity}, Northwestern Polytechnical University, China
}
\affil[2]{%
    Alibaba Group, China
}
\affil[3]{%
    School of Information and Communication Technology, Griffith University, Australia
  }
\affil[4]{%
    Department of Computing, The Hong Kong Polytechnic University, Hong Kong, China
  }
\begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle

% This Supplementary Material should be submitted as a separate file. Please do not append the Supplementary Material to the main paper. 

% Fig. \ref{fig:pitt} and Eq \ref{eq:example} in the main paper can be cross referenced using \texttt{xr}. 

% \appendix
% \section{Additional simulation results}
% Table~\ref{tab:supp-data} lists additional simulation results; see also \citet{einstein} for a comparison. 

% \begin{table}[!h]
%     \centering
%     \caption{An Interesting Table.} \label{tab:supp-data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \section{Math font exposition}
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\section{Extra Illustration} \label{app:A}
\subsection{Dirichlet Distribution Illustration} \label{app:A.1}
Given different values of the concentration parameter $\alpha$ for the Dirichlet distribution, datasets with different degrees of heterogeneity can be generated. 
%
In particular, higher values of $\alpha$ lead to a more uniform distribution, indicating that each client has an almost equally weighted combination of labels. 
%
Lower values of $\alpha$ imply weights concentrated more heavily on only one of the labels, or more extreme label membership.
%
Table~\ref{tab2} gives an example of the Dirichlet-distributed partition used in the experiments. 
%
As shown in Table~\ref{tab2}, the data distribution on each client is different, and in the extreme non-IID case, i.e., $\alpha \rightarrow 0$, most of a client's data are concentrated under a single label, while the number of samples under the other labels is almost zero.

\begin{table*}[htbp]
%   \vspace{-1em}
  \centering
  \caption{The actual Dirichlet Distribution (non-IID) generated from CIFAR-10 with $\alpha=0.001$}
  \setlength{\tabcolsep}{1.8mm} %控制表格列距
  \renewcommand\arraystretch{1.1} %控制表格行距
  \scalebox{0.9}{
  \begin{tabular}{c|c|c|c|c|c|c|c|c|c|c|c} 
  \hline
  \multirow{2}*{\textbf{Client ID}} &\multicolumn{10}{c|}{\textbf{Numbers of Samples in the Classes}} &\multirow{2}*{\textbf{Distribution}}\\ 
  \cline{2-11}
  &$c_0$&$c_1$&$c_2$&$c_3$&$c_4$&$c_5$&$c_6$&$c_7$&$c_8$&$c_9$\\ 
  \cline{1-12}
  {k=0}  & 2 & 1 & 33 & 117 & 100 & 6 & 1 & \textbf{0} & 1 & 239 & $\raisebox{-.25\height}{\includegraphics[width=0.2\columnwidth]{Figure/client0_dir_partition.pdf}}$
  \\
  \cline{1-12}
  {k=1}  & \textbf{0} & \textbf{0} & \textbf{0} & 1 & 1 & 29 & \textbf{467} & \textbf{0} & \textbf{0} & 2
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client1_dir_partition.pdf}}$
  \\ 
  \cline{1-12}
  {k=2}  & 2 & \textbf{397} & 5 & 1 & 2 & 86 & \textbf{0} & 1 & \textbf{0} & 6 
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client2_dir_partition.pdf}}$
  \\ 
  \cline{1-12}
  {k=3}  & 1 & \textbf{0} & \textbf{0} & \textbf{0} & \textbf{0} & 3 & \textbf{0} & \textbf{0} & 125 & \textbf{371} 
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client3_dir_partition.pdf}}$
  \\ 
  \cline{1-12}
  {k=4}  & 1 & 67 & 5 & \textbf{0} & 15 & \textbf{304} & \textbf{0} & \textbf{0} & 34 & 74 
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client4_dir_partition.pdf}}$
  \\ 
  \cline{1-12}
  {$\cdots$}  & $\cdots$ & $\cdots$ &$\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ & $\cdots$ &$\cdots$
  \\
  \cline{1-12}
  {k=95}  & 32 & 213 & \textbf{0} & 94 & 17 & 3 & 138 & \textbf{0} & \textbf{0} & 3
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client5_dir_partition.pdf}}$
  \\ 
  \cline{1-12}
  {k=96}  & 51 & 36 & 166 & 32 & \textbf{0} & \textbf{0} & 8 & 203 & \textbf{0} & 4 
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client6_dir_partition.pdf}}$
  \\ 
  \cline{1-12}
  {k=97}  & 25 & \textbf{0} & \textbf{347} & 17 & 7 & \textbf{0} & \textbf{0} & 44 & \textbf{0} & 60 
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client7_dir_partition.pdf}}$
  \\ 
  \cline{1-12}
  {k=98}  & \textbf{0} & 1 & \textbf{0} & 2 & 1 & 18 & 3 & 60 & \textbf{413} & 2 
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client8_dir_partition.pdf}}$
  \\ 
  \cline{1-12}
  {k=99}  & \textbf{465} & \textbf{0} & 4 & 2 & 2 & 3 & 4 & 14 & 4 & 2 
  &$\raisebox{-.3\height}{\includegraphics[width=0.2\columnwidth]{Figure/client9_dir_partition.pdf}}$
  \\
  \hline
  \end{tabular}
  }
  \label{tab2}
\end{table*}

\subsection{Gradient Compress Algorithm} \label{app:A.2}
% \begin{wrapfigure}[20]{R}{0.55\textwidth}
% 	\flushright  % 右对齐环境
% 	 \vspace{-2em}  % 负表示竖直方向向上移动
% \begin{minipage}{1\linewidth}  % 控制算法框大小
\SetInd{0.3em}{0.4em}  % 第一位调整竖线启示位置，第二位控制两条线之间的宽度
\IncMargin{1em} % 使行号向内缩进
\begin{algorithm}
\caption{GC (Gradient Compress)}
\label{alg:algorithm4}
\KwIn{Raw updates in the $t^{th}$ round of the $k^{th}$ client $G_{t}^{k}=\{g_{1}, g_{2}, \ldots, g_{d}\}$}
% \KwIn{Target dimension of the compressed update $d'$ $\left( d' \leq d \right)$}
% \For(\tcp*[f]{target dimension $d^{\prime}$}){$d^{\prime}=\{3,\ldots,\log_{2}d\}$}
% {
\textbf{Initialize} Randomly select $d'$ of the $g_{i}$ as the group centers $\{x_{1}, x_{2},\ldots, x_{d'}\}$\;
\textbf{Initialize} $C_{j}=\varnothing\ (1 \leq j \leq d^{\prime})$\;
\Repeat{$\forall\ j = \{1,2,\ldots, d^{\prime}\}, x_{j}^{\prime}=x_{j}$}
{\For{each $g_{i},\ i=1,2,\ldots,d$}
{$\lambda_{i} = \arg \min _{j \in \{1,2,\ldots,d'\}} \left\|g_i-x_{j}\right\|_{2}$\; ${C}_{\lambda_{i}}={C}_{\lambda_{i}}\cup\left\{ g_i \right\}$\;
}
\For{each cluster $j=1,2,\ldots, d^{\prime}$}
{Calculate new center $x_{j}^{\prime}=\frac{1}{\left|{C}_{j}\right|} \sum_{g_{i} \in {C}_{j}}{g_{i}}$\;
% \eIf{$\boldsymbol{\mu}_{i}^{\prime} \neq \boldsymbol{\mu}_{i}$}
{$x_{j} \leftarrow x_{j}^{\prime}$\;}
% {$\boldsymbol{\mu}_{i}^{\prime} = \boldsymbol{\mu}_{i}$\;}
}
}
% $S_{d^{\prime}} \leftarrow$ compute the within-cluster sum of squares $\sum_{j=1}^{d^{\prime}} \sum_{g_{i} \in c_{j}}\left|g_{i}-c_{j}\right|^{2}$\;
% }
% $d^{\prime} \leftarrow {\arg \min}_{d^{\prime} \in \{3,\ldots,\log_{2}d\}}\{S_{d^{\prime}}\}$\;
\KwOut{$\boldsymbol{X}_{t}^{k}=\left\{x_{1}, x_{2}, \cdots, x_{d'} \right\}$;}
\end{algorithm}
\subsection{Key Lemmas} \label{app:A.3}
Following \cite{li2019convergence}, we present necessary assumptions and extra notations that we used to prove the convergence of FedAvg with random client selection. 
\paragraph{Assumptions.}
The convergence of FedAvg with random sampling scheme has been derived in \cite{li2019convergence}. The proof relies on the assumptions as follows. Assumptions \ref{assumption3} and \ref{assumption4} have been given by  \cite{zhang2013communication,stich2018local,stich2018sparsified}\footnote{Note that the strong assumptions are only used for convergence analysis, and variance comparison does not require these assumptions.}.
%
\begin{assumption}[L-Smooth] \label{assumption1}
For all $\mathbf{v}$ and $\mathbf{w}$ and all $k=1,\cdots,N$, $F_{k}(\mathbf{v}) \leq F_{k}(\mathbf{w})+(\mathbf{v}-\mathbf{w})^{T} \nabla F_{k}(\mathbf{w})+\frac{L}{2}\|\mathbf{v}-\mathbf{w}\|_{2}^{2}$, where $\mathbf{v}$ and $\mathbf{w}$ are different model parameters.
\end{assumption}
\begin{assumption}[Strongly Convex] \label{assumption2} 
For all $\mathbf{v}$ and $\mathbf{w}$ and all $k=1,\cdots,N$, $F_{k}(\mathbf{v}) \geq F_{k}(\mathbf{w})+(\mathbf{v}-\mathbf{w})^{T} \nabla F_{k}(\mathbf{w})+\frac{\mu}{2}\|\mathbf{v}-\mathbf{w}\|_{2}^{2}$, where $\mathbf{v}$ and $\mathbf{w}$ are different model parameters.
\end{assumption}
%
\begin{assumption}[Bounded Variance] \label{assumption3}
Let $\xi_{t}^{k}$ be sampled from the $k^{th}$ device's local data uniformly at random. The variance of stochastic gradients in each device is bounded:
$$
\mathbb{E} \left\Vert\nabla F_{k}\left(\mathbf{w}_{t}^{k},\xi_{t}^{k}\right)-\nabla F_{k}\left(\mathbf{w}_{t}^{k}\right)\right\Vert^{2} \leq\sigma_{k}^{2},~\forall\ k=1, \cdots, N
$$
\end{assumption}
%
\begin{assumption}[Bounded Expectation] \label{assumption4}
The expectation of stochastic gradients in squared norm is bounded by $G^2$, i.e., 
$$
\mathbb{E}\left\|\nabla F_{k}\left(\mathbf{w}_{t}^{k}, \xi_{t}^{k}\right)\right\|^{2} \leq G^{2}, \quad \forall\ k=1,\cdots,N,\ t=1,\cdots,T-1
$$
\end{assumption}
\paragraph{Additional Notation.} We assume that FedAvg always activates all devices at the beginning of each round and then uses the parameters maintained in only a few sampled devices to produce the next-round parameter. This updating scheme is equivalent to the original. 
%
Let $\mathcal{I}_{E}$ be the set of global synchronization steps, i.e., $\mathcal{I}_{E}=\{n E \mid n=1,2, \cdots\}$. If $t+1 \in \mathcal{I}_{E}$, then $t+1$ is a time step at which the devices communicate.
%
Then the update of FedAvg with partial devices active can be described as: for all $k \in[N]$,
% \begin{equation}
%     \mathbf{v}_{t+1}^{k} =\mathbf{w}_{t}^{k}-\eta_{t} \nabla F_{k}\left(\mathbf{w}_{t}^{k}, \xi_{t}^{k}\right)
% \end{equation}
% \begin{align}
%     \mathbf{w}_{t+1}^{k} &= \begin{cases}\mathbf{v}_{t+1}^{k} & \text { ,if } t+1 \notin \mathcal{I}_{E} \\ \text { average }\left\{\mathbf{v}_{t+1}^{k}\right\}_{k \in \mathcal{S}_{t+1}} & \text { ,if } t+1 \in \mathcal{I}_{E},\end{cases}
% \end{align}
\begin{gather}
\mathbf{v}_{t+1}^{k}=\mathbf{w}_{t}^{k}-\eta_{t} \nabla F_{k}\left(\mathbf{w}_{t}^{k}, \xi_{t}^{k}\right) \\
\mathbf{w}_{t+1}^{k}= \begin{cases}\mathbf{v}_{t+1}^{k} & \text {,if } t+1 \notin \mathcal{I}_{E} \\
\text { average }\left\{\mathbf{v}_{t+1}^{k}\right\}_{k \in \mathcal{S}_{t+1}} & \text {,if } t+1 \in \mathcal{I}_{E},\end{cases}
\end{gather}
%
where $\mathcal{S}_{t+1}$ denotes the subset of clients sampled in the $(t+1)^{th}$ round. Here, an additional variable $\mathbf{v}_{t+1}^{k}$ is introduced to represent the immediate result of a one-step SGD update from $\mathbf{w}_{t}^{k}$. We interpret $\mathbf{w}_{t+1}^{k}$ as the parameter obtained after the communication step.
%
Let $F^{*}$ and $F_{k}^{*}$ be the minimum values of $F$ and $F_{k}$, respectively. We use the term $\Gamma=F^{*}-\sum_{k=1}^{N} p_{k} F_{k}^{*}$ for quantifying the degree of non-IID, where the $p_{k}$ denotes the aggregation weight. If the data are IID, then $\Gamma$ goes to zero as the number of samples grows. If the data are non-IID, then $\Gamma$ is nonzero, and its magnitude reflects the heterogeneity of the data distribution.
\setcounter{counter}{0}
\begin{Lemma}[Results of one step SGD] \label{lemma1}
    Assume Assumptions~\ref{assumption1} and~\ref{assumption2} hold. If $\eta_{t} \leq \frac{1}{4 L}$, we have
    \begin{align*}
    \mathbb{E}&\left\|\overline{\mathbf{v}}_{t+1}-\mathbf{w}^{\star}\right\|^{2} \leq\left(1-\eta_{t} \mu\right) \mathbb{E}\left\|\overline{\mathbf{w}}_{t}-\mathbf{w}^{\star}\right\|^{2}\\
    &+\eta_{t}^{2} \mathbb{E}\left\|G_{t}-\overline{G}_{t}\right\|^{2}+6 L \eta_{t}^{2} \Gamma+2 \mathbb{E} \sum_{k=1}^{N} p_{k}\left\|\overline{\mathbf{w}}_{t}-\mathbf{w}_{t}^{k}\right\|^{2}
    \end{align*}
where $\Gamma=F^{*}-\sum_{k=1}^{N} p_{k} F_{k}^{*} \geq 0$.
\end{Lemma}
\begin{Lemma}[Bounding the variance] \label{lemma2}
    Assume Assumption \ref{assumption3} holds, and $\sigma_{k}$ defined there. It follows that
    $$
    \mathbb{E}\left\|G_{t}-\overline{G}_{t}\right\|^{2} \leq \sum_{k=1}^{N} p_{k}^{2} \sigma_{k}^{2},
    $$
    where the $G_{t}$ is the gradient vector of $t^{th}$ round.
\end{Lemma}
\begin{Lemma}[Bounding the divergence of $\left\{\mathbf{w}_{t}^{k}\right\}$] \label{lemma3}
    Assume Assumption \ref{assumption4}, that $\eta_{t}$ is non-increasing and $\eta_{t} \leq 2 \eta_{t+E}$ for all $t \geq 0$. It follows that
    $$
    \mathbb{E}\left[\sum_{k=1}^{N} p_{k}\left\|\overline{\mathbf{w}}_{t}-\mathbf{w}_{t}^{k}\right\|^{2}\right] \leq 4 \eta_{t}^{2}(E-1)^{2} G^{2} .
    $$
\end{Lemma}
\begin{Lemma}[Unbiased sampling scheme] \label{lemma4}
    If ${\left(t+1\right)}^{th}$ round is the communication round, for our selection with $\mathcal{S}_{t}=\left\{i_{1}, \cdots, i_{m}\right\} \subset[N]$ we have
    $$
    \mathbb{E}\left[\mathbf{w}(\mathcal{S}_t)\right] = \mathbf{w}(\mathcal{K}),
    $$
    where $\mathcal{K}$ denotes the population of clients.
\end{Lemma}
\begin{proof}
    \begin{align}
        \mathbb{E}\left[\mathbf{w}(\mathcal{S}_t)\right]=\mathbb{E}_{\mathcal{S}_{t}} \left[\frac{1}{m}\sum_{k=1}^{m} \mathbf{w}_{i_{k}}\right]=\mathbb{E}_{\mathcal{S}_{t}} \left[\mathbf{w}_{i_{1}} \right]=\sum_{k=1}^{N} p_{k} \mathbf{w}_{k}=\mathbf{w}(\mathcal{K}).
    \end{align}
\end{proof}
\begin{Lemma}[Bounding the variance of $\mathbf{w}(\mathcal{S}_t)$] \label{lemma5}
    For $t+1 \in \mathcal{I}_{E}$, assume that $\eta_{t}$ is non-increasing and $\eta_{t} \leq 2 \eta_{t+E}$ for all $t \geq 0$.
    Then, assuming $p_{1}=p_{2}=\cdots=p_{m_{h}}=\frac{1}{N_{h}}$, the expected difference between $\overline{\mathbf{v}}_{t+1}$ and $\overline{\mathbf{w}}_{t+1}$ is bounded by
    $$
    \mathbb{E}_{\mathcal{S}_{t}}\left\|\overline{\mathbf{v}}_{t+1}-\overline{\mathbf{w}}_{t+1}\right\|^{2} \leq \frac{4}{K} \eta_{t}^{2} E^{2} G^{2}
    $$
\end{Lemma}
% \newpage
\section{Proof of Theorem} \label{app:B}
\subsection{Proof of Theorem 1 Variance Reduction}
\paragraph{Additional Notation.}
Divide the population $\mathcal{K}$ consisting of $N$ clients into $\mathrm{H}$ clusters via clustering. 
\begin{itemize}
    \item $N_{h}$ denotes the number of clients in the $h^{th}$ cluster, s.t.\ $\sum_{h=1}^{\mathrm{H}} {N}_{h}={N}$
    \item $m_h$ denotes the number of sampled clients from the $h^{th}$ cluster
    \item $m$ denotes the sample size, s.t.\ $\sum_{h=1}^{\mathrm{H}} m_h=m$ 
    \item $\mathbf{w}_{h_i}$ denotes the model update $\mathbf{w}$ of the $i^{th}$ client in the $h^{th}$ cluster
    \item $\mathbf{w}_{h} = \sum_{i=1}^{m_h} \frac{\mathbf{w}_{h_i}}{m_h}$ is the sampled averaged model update of the $h^{th}$ cluster
    \item $\overline{\mathbf{w}} = \sum_{h=1}^{\mathrm{H}} \frac{m_h{\mathbf{w}}_h}{m}$ is the overall sampled averaged model update
    \item $\mathbf{W}_{h} = \sum_{i=1}^{N_h} \frac{\mathbf{w}_{h_i}}{N_h}$ is the averaged model update of the $h^{th}$ cluster
    \item $\mathbf{W}(\mathcal{K}) = \sum_{h=1}^{\mathrm{H}} \sum_{i=1}^{N_h} \frac{\mathbf{w}_{h_i}}{N}$ is the averaged model update of entire set $\mathcal{K}$
    \item $\mathbf{w}_{cluster} = \frac{1}{N} \sum_{h=1}^{\mathrm{H}} N_{h} \mathbf{w}_{h}$ is an unbiased estimator of $\mathbf{W}(\mathcal{K})$
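    \item $S^{2} = \frac{1}{N-1}\sum_{i=1}^{N}\left\|\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right\|_{2}^2$ is the population variance, and $\sigma^{2} = \frac{1}{N}\sum_{i=1}^{N}\left\|\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right\|_{2}^2 = \frac{N-1}{N}S^{2}$ is the variance of a single observation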
    \item $S_{h}{}^{2} = \sum_{i=1}^{N_h} \frac{\left\|\mathbf{w}_{h_i}-\mathbf{W}_{h}\right\|_{2}^2}{N_h-1}$
    \item $s_{h}{}^{2} = \sum_{i=1}^{m_h} \frac{\left\|\mathbf{w}_{h_i} - \mathbf{w}_h\right\|_{2}^2}{m_h-1}$
    \item ${Q}_h = \frac{N_h}{N}$ is the proportion of clients in the $h^{th}$ cluster
    \item ${q}_h = \frac{m_h}{m}$ is the proportion of sampled clients in the $h^{th}$ cluster
\end{itemize}
%
\begin{proof}[Proof of Theorem 1]
    \textbf{Derive the Variance of Random Selection.}\ \ Assuming that each observation has variance $\sigma^2$, then we get
    \begin{align}
    \mathbb{V}(\mathbf{w}_{rand}) &=\mathbb{E}\left\|\overline{\mathbf{w}}-\mathbf{W}(\mathcal{K})\right\|_{2}^2\\ 
    &=\frac{1}{m^{2}}\mathbb{E}\left\| \sum_{i=1}^{m}[\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})]\right\|_{2}^2 \\
    &=\underbrace{\frac{1}{m^{2}}\mathbb{E} \left[\sum_{i=1}^{m}\left\|\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right\|_{2}^{2}\right]}_{\text{Quadratic Term}}\\
    &+\underbrace{\frac{1}{m^{2}}\mathbb{E} \left[\sum_{i}^{m} \sum_{\neq j}^{m}\left[\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{j}-\mathbf{W}(\mathcal{K})\right]\right]}_{\text{Cross-Product Term}} \\
    &=\frac{1}{m^{2}} \sum_{i=1}^{m} \mathbb{E}\left\|\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right\|_{2}^2\\
    &+\frac{1}{m^{2}} \underbrace{\sum_{i}^{m} \sum_{\neq j}^{m} \mathbb{E}\left[\left[\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{j}-\mathbf{W}(\mathcal{K})\right]\right]}_{\text{Setting to } K}\\
    &=\frac{1}{m^{2}} \sum_{i=1}^{m} \sigma^{2}+\frac{K}{m^{2}} \\
    &=\frac{N-1}{Nm} S^{2}+\frac{K}{m^{2}},
\end{align}
    
where we set $K=\sum_{i}^{m} \sum_{\neq j}^{m} \mathbb{E}\left[\left[\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right]^{T}\left[\mathbf{w}_{j}-\mathbf{W}(\mathcal{K})\right]\right]$ for convenience.

\paragraph{Find the Expression of $K$.}
    In order to find $K$, we consider,
    \begin{equation}
        \begin{aligned}
        &\mathbb{E} \left[\left[\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{j}-\mathbf{W}(\mathcal{K})\right]\right] \\
        &=\frac{1}{N(N-1)} \sum_{k}^{N} \sum_{\neq \ell}^{N} \left[\left[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{\ell}-\mathbf{W}(\mathcal{K})\right]\right].
        \end{aligned}
    \end{equation}
    Meanwhile, we have,
    \begin{align}
        \sum_{k=1}^{N}\left[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})\right] &=  \sum_{k=1}^{N}\mathbf{w}_{k}-N \mathbf{W}(\mathcal{K}) \\
        &= N \mathbf{W}(\mathcal{K})-N \mathbf{W}(\mathcal{K}) = 0,
    \end{align}
    i.e.,
    \begin{align}
         \left\|\sum_{k=1}^{N}[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})]\right\|_{2}^{2} = 0.
    \end{align}
    The left-hand side can be expanded as 
    \begin{equation}
    \begin{aligned}
        &\left\|\sum_{k=1}^{N}[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})]\right\|_{2}^{2} =\sum_{k=1}^{N}\left\|\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})\right\|_{2}^{2} \\
        &+\sum_{k}^{N} \sum_{\neq \ell}^{N}\left[\left[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{\ell}-\mathbf{W}(\mathcal{K})\right]\right].
    \end{aligned}
    \end{equation}
    Simplifying, we get
    \begin{align}
        0=(N-1) S^{2}
        +\sum_{k}^{N} \sum_{\neq \ell}^{N}\left[\left[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{\ell}-\mathbf{W}(\mathcal{K})\right]\right],
    \end{align}
    which is equivalent to 
     \begin{align}
        \sum_{k}^{N} \sum_{\neq \ell}^{N}\left[\left[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{\ell}-\mathbf{W}(\mathcal{K})\right]\right] = -(N-1) S^{2}.
    \end{align}
    Therefore,
    \begin{align}
        \frac{1}{N(N-1)} \sum_{k}^{N} &\sum_{\neq \ell}^{N}\left[\left[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{\ell}-\mathbf{W}(\mathcal{K})\right]\right] \\
        &=\frac{1}{N(N-1)}\left[-(N-1) S^{2}\right] \\
        &=-\frac{S^{2}}{N},
    \end{align}
    thus 
    \begin{align}
        K&=\sum_{i}^{m} \sum_{\neq j}^{m} \mathbb{E}\left[\left[\mathbf{w}_{i}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{j}-\mathbf{W}(\mathcal{K})\right]\right]\\
        &= m(m-1) \frac{1}{N(N-1)} \sum_{k}^{N} \sum_{\neq \ell}^{N}\left[\left[\mathbf{w}_{k}-\mathbf{W}(\mathcal{K})\right]^T\left[\mathbf{w}_{\ell}-\mathbf{W}(\mathcal{K})\right]\right]\\
        &= -m(m-1)\frac{S^2}{N},
    \end{align}
    and, substituting the value of $K$, the variance of $\mathbf{w}_{rand}$ is 
    \begin{align}
    \mathbb{V}\left(\mathbf{w}_{rand}\right) &=\frac{N-1}{N m} S^{2}-\frac{1}{m^{2}} m(m-1) \frac{S^{2}}{N} \\
    &=\frac{N-m}{N m} S^{2}.
    \end{align}
   If $N$ is infinite (large enough), we can get
    \begin{align}
        \mathbb{V}\left(\mathbf{w}_{rand}\right) &=\frac{N-m}{N m} S^{2} \\
        &= (\frac{1}{m}-\frac{1}{N})S^2 \cong \frac{S^2}{m}.
    \end{align}
\paragraph{Derive the Variance of Plain Clustering Selection.}As prior work constructed, clustering selection is always applied under plain proportional allocation, where the number of sampled clients $m_{h}$ from the $h^{th}$ cluster is proportional to its cluster size $N_{h}$, i.e., $m_{h}=m\frac{N_{h}}{N}$. And we have 
\begin{equation}
\begin{aligned}
    \mathbb{V}(\mathbf{w}_{cluster})
    &=\sum_{h=1}^{\mathrm{H}} Q_{h}{}^{2} \mathbb{V}\left(\mathbf{w}_{h}\right) \\
    &+\sum_{h(\neq j)=1}^{\mathrm{H}} \sum_{j=1}^{\mathrm{H}} Q_{h} Q_{j} \operatorname{Cov}\left(\mathbf{w}_{h}, \mathbf{w}_{j}\right).
\end{aligned}
\end{equation}
For the former we have
\begin{align}
     \mathbb{V}\left(\mathbf{w}_{h}\right)&=\frac{N_{h}-m_{h}}{N_{h} m_{h}} S_{h}{}^{2},
\end{align}
and for the latter (covariance) we have
\begin{align}
    \operatorname{Cov}\left(\mathbf{w}_{h}, \mathbf{w}_{j}\right)&=0, h \neq j,
\end{align}
where
\begin{align}
    S_{h}{}^{2}&=\frac{1}{N_{h}-1} \sum_{j=1}^{N_{h}}\left\|\mathbf{w}_{h_{ j}}-\mathbf{W}_{h}\right\|_{2}^{2},
\end{align}
thus
\begin{align}
    \mathbb{V}(\mathbf{w}_{cluster})&=\sum_{h=1}^{\mathrm{H}}\left(\frac{N_{h}-m_{h}}{N_{h} m_{h}}\right) {Q}_{h}{}^{2} S_{h}{}^{2}.
\end{align}
Therefore we can get
\begin{align}
    \mathbb{V}(\mathbf{w}_{cluster})
    &=\sum_{h=1}^{\mathrm{H}}\left(\frac{N_{h}-\frac{m}{N} N_{h}}{N_{h}\frac{m}{N} N_{h}}\right)\left(\frac{N_{h}}{N}\right)^{2} S_{h}{}^{2} \\
    &= \frac{N-m}{N m} \sum_{h=1}^{\mathrm{H}} \frac{N_{h} S_{h}{}^{2}}{N} \\
    &= \frac{N-m}{N m} \sum_{h=1}^{\mathrm{H}} {Q}_{h} S_{h}{}^{2} \\
    &\cong \frac{\sum_{h=1}^{\mathrm{H}}N_{h}S_{h}{}^{2}}{mN}.
\end{align}

\paragraph{Derive the Variance of Clustering Selection with Sample Size Re-allocation.} 
We apply clustering selection under sample size re-allocation, where the number of sampled clients $m_{h}$ from the $h^{th}$ cluster is proportional to both cluster's size $N_{h}$ and the variability of cluster measured by $S_{h}$, i.e.,
\begin{align}
m_{h}=\frac{N_{h}S_{h}}{\sum_{h=1}^{\mathrm{H}} N_{h} S_{h}} \cdot m.
\end{align}
We can get
\begin{align}
\mathbb{V}\left(\mathbf{w}_{cludiv}\right) &=\sum_{h=1}^{\mathrm{H}}\left(\frac{1}{m_{h}}-\frac{1}{N_{h}}\right) Q_{h}{}^{2} S_{h}{}^{2} \\
&=\sum_{h=1}^{\mathrm{H}} \frac{Q_{h}{}^{2} S_{h}{}^{2}}{m_{h}}-\sum_{h=1}^{\mathrm{H}} \frac{Q_{h}{}^{2} S_{h}{}^{2}}{N_{h}} \\
&=\sum_{h=1}^{\mathrm{H}}\left[Q_{h}{}^{2} S_{h}{}^{2}\left(\frac{\sum_{h=1}^{\mathrm{H}} N_{h} S_{h}}{m N_{h} S_{h}}\right)\right]-\sum_{h=1}^{\mathrm{H}} \frac{Q_{h}{}^{2} S_{h}{}^{2}}{N_{h}} \\
&=\sum_{h=1}^{\mathrm{H}}\left[\frac{1}{m} \cdot \frac{N_{h} S_{h}}{N^{2}}\left(\sum_{h=1}^{\mathrm{H}} N_{h} S_{h}\right)\right]-\sum_{h=1}^{\mathrm{H}} \frac{Q_{h}{}^{2} S_{h}{}^{2}}{N_{h}} \\
&=\frac{1}{m}\left(\sum_{h=1}^{\mathrm{H}} \frac{N_{h} S_{h}}{N}\right)^{2}-\sum_{h=1}^{\mathrm{H}} \frac{Q_{h}{}^{2} S_{h}{}^{2}}{N_{h}}\\
&=\frac{1}{m}\left(\sum_{h=1}^{\mathrm{H}} Q_{h} S_{h}\right)^{2}-\frac{1}{N} \sum_{h=1}^{\mathrm{H}} Q_{h} S_{h}{}^{2}\\
&=\frac{1}{N^2}\frac{\left(\sum_{h=1}^{\mathrm{H}}N_{h}S_{h}\right)^2}{m} - \frac{1}{N^2}\sum_{h=1}^{\mathrm{H}}{N_{h}S_{h}{}^2}\\
&\cong \frac{1}{mN^2}\left(\sum_{h=1}^{\mathrm{H}}N_{h}S_{h}\right)^2.
\end{align}
Based on all the above, we have the equations below when approximations are used,
\begin{align}
\mathbb{V}\left(\mathbf{w}_{rand}\right) &= \frac{N-m}{Nm}S^2 \\
&\cong \frac{S^2}{m},\\
\mathbb{V}\left(\mathbf{w}_{cluster}\right) &= \frac{N-m}{N}\cdot\frac{\sum_{h=1}^{\mathrm{H}}N_{h}S_{h}{}^{2}}{mN} \\
&\cong \frac{\sum_{h=1}^{\mathrm{H}}N_{h}S_{h}{}^{2}}{mN},\\
\mathbb{V}\left(\mathbf{w}_{cludiv}\right) &= \frac{1}{N^2}\cdot\frac{\left(\sum_{h=1}^{\mathrm{H}}N_{h}S_{h}\right)^2}{m} - \frac{1}{N^2}\sum_{h=1}^{\mathrm{H}}{N_{h}S_{h}{}^2}\\
&\cong \frac{1}{mN^2}\left(\sum_{h=1}^{\mathrm{H}}N_{h}S_{h}\right)^2.
\end{align}
\paragraph{Relationship.}
In order to compare $\mathbb{V}(\mathbf{w}_{rand})$ and $\mathbb{V}(\mathbf{w}_{cluster})$, we first attempt to express $S^{2}$ as a function of $S_{h}{}^{2}$.
    \begin{align}
        \left(N-1\right)S^2 &= \sum_{h=1}^{\mathrm{H}}\sum_{i=1}^{N_h}\left\|\mathbf{w}_{h_i}-\mathbf{W}(\mathcal{K})\right\|_{2}^2     \\
        &= \sum_{h=1}^{\mathrm{H}}\sum_{i=1}^{N_h}\left\|\mathbf{w}_{h_i}-\mathbf{W}_h\right\|_{2}^2\\
        &+ \sum_{h=1}^{\mathrm{H}}N_{h}\left\|\mathbf{W}_h-\mathbf{W}(\mathcal{K})\right\|_{2}^2\\
        &= \sum_{h=1}^{\mathrm{H}}\left(N_{h}-1\right)S_{h}{}^{2} + \sum_{h=1}^{\mathrm{H}}N_{h}\left\|\mathbf{W}_h -\mathbf{W}(\mathcal{K})\right\|_{2}^2
    \end{align}
    \begin{align}
        \frac{N-1}{N} S^{2}=\sum_{h=1}^{\mathrm{H}} \frac{N_{h}-1}{N} S_{h}{}^{2}+\sum_{h=1}^{\mathrm{H}} \frac{N_{h}}{N}\left\|\mathbf{W}_{h}-\mathbf{W}(\mathcal{K})\right\|_{2}^{2}
    \end{align}
We assume that $N_{h}$ is large enough to permit the approximation for simplification
    \begin{align}
        \frac{N_{h}-1}{N_{h}} \approx 1 \text { and } \frac{N-1}{N} \approx 1
    \end{align}
Thus
    \begin{align}
    {S}^{2}=\sum_{h=1}^{\mathrm{H}} \frac{{N}_{h}}{N} {S}_{h}{}^{2}+\sum_{h=1} ^{\mathrm{H}}\frac{{N}_{h}}{N}\left\|\mathbf{W}_{h}-\mathbf{W}(\mathcal{K})\right\|_{2}^{2}
    \end{align}
Therefore
    \begin{align}
    \mathbb{V}\left(\mathbf{w}_{rand}\right)=\frac{S^{2}}{m}&=\frac{\sum_{h=1}^{\mathrm{H}} N_{h} S_{h}{ }^{2}}{mN}\\
    &+\frac{\sum_{h=1}^{\mathrm{H}} N_{h}\left\|\mathbf{W}_{h}-\mathbf{W}(\mathcal{K})\right\|_{2}^{2}}{mN}\\
    &=\mathbb{V}\left(\mathbf{w}_{cluster}\right)+\frac{\sum_{h=1}^{\mathrm{H}} N_{h}\left\|\mathbf{W}_{h}-\mathbf{W}(\mathcal{K})\right\|_{2}^{2}}{mN}
    \end{align}
which shows that
    \begin{equation}
    \mathbb{V}\left(\mathbf{w}_{cluster}\right) \leq \mathbb{V}\left(\mathbf{w}_{rand}\right)
    \end{equation}
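with equality only if $\mathbf{W}_h=\mathbf{W}(\mathcal{K})$ for every $h$; otherwise $\mathbb{V}\left(\mathbf{w}_{cluster}\right) < \mathbb{V}\left(\mathbf{w}_{rand}\right)$.

Similarly, the difference between plain clustering selection and clustering selection with sample size re-allocation is 
    \begin{align}
\mathbb{V}\left(\mathbf{w}_{cluster}\right)&=\mathbb{V}\left(\mathbf{w}_{cludiv}\right)+\frac{1}{mN}\sum_{h=1}^{\mathrm{H}}N_{h}\left(S_{h}-\overline{S}\right)^2,
    \end{align}
where $\overline{S}=\frac{1}{N}\sum_{h=1}^{\mathrm{H}}N_{h}S_{h}$ is the size-weighted mean of the $S_h$. This shows that 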
    \begin{equation}
    \mathbb{V}\left(\mathbf{w}_{cludiv}\right) \leq \mathbb{V}\left(\mathbf{w}_{cluster}\right),
    \end{equation}
unless $S_h=\overline{S}$ for every h, i.e., the clusters have equal variability. Therefore, we get 
    \begin{equation}
    \mathbb{V}\left(\mathbf{w}_{cludiv}\right) \leq \mathbb{V}\left(\mathbf{w}_{cluster}\right)\leq
\mathbb{V}\left(\mathbf{w}_{rand}\right)
    \end{equation}
In this paper, we proposed to apply the importance selection based on the norm of the gradient to each cluster instead of random selection. Here we present the variance-reduction relationship between random selection and importance selection. Please note that this part is directly adapted from prior importance sampling work~\cite{katharopoulos2018not}, which is not our contribution.
\begin{align}
    \operatorname{Tr}\left(\mathbb{V}_{rand}\left[G_{i}\right]\right)&-\operatorname{Tr}\left(\mathbb{V}_{import}\left[c_{i} G_{i}\right]\right) \\
&=\mathbb{E}_{rand}\left[\left\|G_{i}\right\|_{2}^{2}\right]-\mathbb{E}_{import}\left[c_{i}^{2}\left\|G_{i}\right\|_{2}^{2}\right]
\end{align}
Using the fact that $c_{i}=\frac{1}{N_{h} I_{i}}$, $I_{i} = \frac{\left\|G_{i}\right\|_{2}}{\sum^{N_h}_{j=1}\left\|G_j\right\|_{2}}$, $u=\frac{1}{N_{h}}$, we have
\begin{equation}
    \mathbb{E}_{import}\left[c_{i}^{2}\left\|G_{i}\right\|_{2}^{2}\right]=\left(\frac{1}{N_{h}} \sum_{i=1}^{N_{h}}\left\|G_{i}\right\|_{2}\right)^{2}
\end{equation}
Simplifying, we get 
\begin{align}
    &\operatorname{Tr}\left(\mathbb{V}_{rand}\left[G_{i}\right]\right)-\operatorname{Tr}\left(\mathbb{V}_{import}\left[c_{i} G_{i}\right]\right) \\
&=\frac{1}{N_{h}} \sum_{i=1}^{N_{h}}\left\|G_{i}\right\|_{2}^{2}-\left(\frac{1}{N_{h}} \sum_{i=1}^{N_{h}}\left\|G_{i}\right\|_{2}\right)^{2} \\
&=\frac{\left(\sum_{i=1}^{N_{h}}\left\|G_{i}\right\|_{2}\right)^{2}}{N_{h}^{3}} \sum_{i=1}^{N_{h}}\left(N_{h}^{2} \frac{\left\|G_{i}\right\|_{2}^{2}}{\left(\sum_{i=1}^{N_{h}}\left\|G_{i}\right\|_{2}\right)^{2}}-1\right) \\
&=\frac{\left(\sum_{i=1}^{N_{h}}\left\|G_{i}\right\|_{2}\right)^{2}}{N_{h}} \sum_{i=1}^{N_{h}}\left(I_{i}^{2}-u^{2}\right)
\end{align}
Using the fact that $\sum_{i=1}^{N_{h}} I_{i}=1$ and $\sum_{i=1}^{N_{h}} u=1$, we can complete the derivation.
\begin{align}
    &\operatorname{Tr}\left(\mathbb{V}_{rand}\left[G_{i}\right]\right)-\operatorname{Tr}\left(\mathbb{V}_{import}\left[c_{i} G_{i}\right]\right) \\
&=\frac{\left(\sum_{i=1}^{N_{h}}\left\|G_{i}\right\|_{2}\right)^{2}}{N_{h}} \sum_{i=1}^{N_{h}}\left(I_{i}-u\right)^{2} \\
&=\left(\frac{1}{N_{h}} \sum_{i=1}^{N_{h}}\left\|G_{i}\right\|_{2}\right)^{2} N_{h} \|I-u\|_{2}^{2} 
\end{align}
\end{proof}
\newpage
\section{Additional Experiments} \label{app:C}
\subsection{Influence of the Sampling Ratio}
% As the sampling ratio $q$ decreases, the performance of all schemes becomes unacceptable except ours.
\begin{table}[htbp]
  % \vspace{-1.0em}  % 调整表格与上文的距离
  \centering
  \setlength{\tabcolsep}{4mm} % 列间距
  \caption{Final test accuracy of multiple FL algorithms  with different sampling schemes under convex model on MNIST, FMNIST, setting parameters $q \in \{0.1, 0.2, 0.3, 0.5\}$, $N=100$, $nSGD=50$, $\eta=0.05$, $B=50$.}
\scalebox{0.95}
{
\begin{tabular}{llcccc} 
  \toprule
  \multirow{2}*{} &\multirow{2}*{Methods} &\multicolumn{2}{c}{MNIST} &\multicolumn{2}{c}{FMNIST} \\
  \cmidrule(r){3-4} \cmidrule(r){5-6}
% \rowcolor{Gray} 为每一行添加背景色
  ~ &~
  &IID&\cellcolor{white}non-IID
  &IID&\cellcolor{white}non-IID  
  \\ \midrule
  \multirow{5}*{q=0.1} &{Random} & 86.8 \color{gray}±0.0 & 79.3 \color{gray}±0.6 & 75.9 \color{gray}±0.0 & 62.6 \color{gray}±1.0 \\ %\cmidrule{2-5}
  &\texttt{SCAFFOLD} & 84.2 \color{gray}±0.2 & 78.8 \color{gray}±1.3 & 73.5 \color{gray}±0.0 & 70.7 \color{gray}±0.0 \\
  &{Importance}  & 90.9 \color{gray}±0.0 & 87.2 \color{gray}±1.2 & 82.5 \color{gray}±0.1 & 73.2 \color{gray}±1.6 \\
  &{Cluster}  & 90.91 \color{gray}±0.0 & 88.0 \color{gray}±0.4 & 82.6 \color{gray}±0.1 & 74.3 \color{gray}±2.0 \\
  &{\textbf{\texttt{HCSFed}}}  & 90.9 \color{gray}±0.0 & \textbf{89.0 \color{gray}±0.0} & 82.5 \color{gray}±0.1 & \textbf{78.8 \color{gray}±0.0} \\ \midrule
  %
  \multirow{5}*{q=0.2} &{Random}& 88.4 \color{gray}±0.0 & 83.2 \color{gray}±0.3 & 78.6 \color{gray}±0.1 & 71.6 \color{gray}±1.1 \\ %\cmidrule{2-5}
  &\texttt{SCAFFOLD} & 85.2 \color{gray}±0.0 & 86.6 \color{gray}±0.2 & 74.2 \color{gray}±0.0 & 71.0 \color{gray}±0.0 \\
  &{Importance}  & 90.8 \color{gray}±0.0 & 87.5 \color{gray}±1.0 & 82.6 \color{gray}±0.1 & 75.1 \color{gray}±2.0 \\
  &{Cluster}  & 90.89 \color{gray}±0.0 & 88.6 \color{gray}±0.3 & 82.5 \color{gray}±0.1 & 77.3 \color{gray}±1.1 \\
  &{\textbf{\texttt{HCSFed}}}  & 91.0 \color{gray}±0.0 & \textbf{89.1 \color{gray}±0.1} & 82.5 \color{gray}±0.1 & \textbf{78.9 \color{gray}±0.1} \\ \midrule
  %
  \multirow{5}*{q=0.3} &{Random}& 89.3 \color{gray}±0.0 & 86.1 \color{gray}±0.2 & 80.1 \color{gray}±0.1 & 71.3 \color{gray}±0.2 \\ %\cmidrule{2-5}
  &\texttt{SCAFFOLD} & 84.9 \color{gray}±0.1 & 87.0 \color{gray}±0.1 & 74.4 \color{gray}±0.0 & 71.1 \color{gray}±0.0 \\
  &{Importance}  & 90.8 \color{gray}±0.0 & 88.4 \color{gray}±0.4 & 82.6 \color{gray}±0.0 & 75.4 \color{gray}±1.9 \\
  &{Cluster}  & 90.85 \color{gray}±0.0 & 89.1 \color{gray}±0.1 & 82.5 \color{gray}±0.0 & 77.4 \color{gray}±0.5 \\
  &{\textbf{\texttt{HCSFed}}}  & 90.9 \color{gray}±0.0 & \textbf{89.0 \color{gray}±0.0} & 82.6 \color{gray}±0.1 & \textbf{78.9 \color{gray}±0.0} \\ \midrule
  %
  \multirow{5}*{q=0.5} &{Random}& 90.0 \color{gray}±0.0 & 87.2 \color{gray}±0.1 & 81.2 \color{gray}±0.0 & 75.5 \color{gray}±0.3 \\ %\cmidrule{2-5}
  &\texttt{SCAFFOLD} & 85.0 \color{gray}±0.1 & 87.2 \color{gray}±0.1 & 74.5 \color{gray}±0.0 & 70.9 \color{gray}±0.0 \\
  &{Importance}  & 90.9 \color{gray}±0.0 & 89.0 \color{gray}±0.2 & 82.5 \color{gray}±0.0 & 77.2 \color{gray}±1.0 \\
  &{Cluster}  & 90.87 \color{gray}±0.0 & 89.0 \color{gray}±0.1 & 82.5 \color{gray}±0.0 & 78.7 \color{gray}±0.4 \\
  &{\textbf{\texttt{HCSFed}}}  & 90.9 \color{gray}±0.0 & \textbf{89.0 \color{gray}±0.0} & 82.5 \color{gray}±0.0 & \textbf{78.9 \color{gray}±0.0} \\ 
  \bottomrule
  \end{tabular}
}
%   \vspace{-8em}
\label{tab3}
\end{table}
\begin{table}[H]
  % \vspace{-1.0em}  % 调整表格与上文的距离
  \centering
  \setlength{\tabcolsep}{1mm} % 列间距
  \caption{Final test accuracy of multiple FL algorithms with different sampling schemes under non-convex model on MNIST, FMNIST and CIFAR-10, setting parameters $q \in \{0.1, 0.2, 0.3, 0.5\}$, $N=100$, $nSGD=50$ for MNIST and FMNIST, $nSGD=80$ for CIFAR-10, $\eta=0.05$, $B=50$.}
\scalebox{0.95}{
\begin{tabular}
  {llccccccc} \toprule
  \multirow{2}*{} &\multirow{2}*{Methods} &\multicolumn{2}{c}{MNIST} &\multicolumn{2}{c}{FMNIST} &\multicolumn{3}{c}{CIFAR-10}\\
  \cmidrule(r){3-4} \cmidrule(r){5-6} \cmidrule(r){7-9}
  ~ &~
  &IID&\cellcolor{white}non-IID
  &IID&\cellcolor{white}non-IID  
  & \cellcolor{white}IID &$\alpha=0.01$&$\alpha=0.001$ \\ \midrule
  \multirow{5}*{q=0.1} &{Random} & 87.0 \color{gray}±0.0 & 59.7 \color{gray}±1.4 & 87.6 \color{gray}±0.1 & 76.9 \color{gray}±0.2 & 40.3 \color{gray}±0.2 & 25.8 \color{gray}±0.4 & 20.5 \color{gray}±0.5\\ %\cmidrule{2-5}
%   &\texttt{SCAFFOLD} & 79.7 \color{gray}±0.2 & 72.9 \color{gray}±0.5 & 67.7 \color{gray}±0.1 & 64.6 \color{gray}±0.7 & 24.4 \color{gray}±1.1 & 40.5 \color{gray}±1.0 & 32.8 \color{gray}±0.8\\
  &{Importance}  & 92.9 \color{gray}±0.1 & 71.2 \color{gray}±9.5 & 90.4 \color{gray}±0.1 & 85.1 \color{gray}±2.2 & 66.0 \color{gray}±0.2 & 39.5 \color{gray}±3.3 & 24.9 \color{gray}±2.6\\
  &{Cluster}  & 92.9 \color{gray}±0.0 & 73.6 \color{gray}±3.7 & 90.6 \color{gray}±0.2 & 88.9 \color{gray}±4.6 & 65.5 \color{gray}±0.3 & 37.2 \color{gray}±3.9 & 30.0 \color{gray}±4.7\\
  &{\textbf{\texttt{HCSFed}}}  & 92.9 \color{gray}±0.0 & \textbf{83.3 \color{gray}±0.0} & 90.5 \color{gray}±0.1 & \textbf{92.0 \color{gray}±0.2} & 65.7 \color{gray}±0.3 & \textbf{41.2 \color{gray}±1.8} & \textbf{38.8 \color{gray}±0.6} \\ \midrule
  %
  \multirow{5}*{q=0.2} &{Random} & 89.3 \color{gray}±0.0 & 70.8 \color{gray}±1.6 & 88.8 \color{gray}±0.0 & 80.5 \color{gray}±0.5 & 49.8 \color{gray}±0.4 & 29.7 \color{gray}±0.3 & 25.1 \color{gray}±0.2\\ %\cmidrule{2-5}
%   &\texttt{SCAFFOLD} & 81.3 \color{gray}±0.1 & 75.2 \color{gray}±0.3 & 67.5 \color{gray}±0.1 & 73.4 \color{gray}±0.1 & 12.7 \color{gray}±3.1 & 36.7 \color{gray}±1.1 & 30.1 \color{gray}±2.1\\
  &{Importance}  & 92.9 \color{gray}±0.0 & 72.9 \color{gray}±9.0 & 90.3 \color{gray}±0.0 & 90.8 \color{gray}±0.6 & 65.7 \color{gray}±0.2 & 40.7 \color{gray}±2.9 & 30.5 \color{gray}±2.0\\
  &{Cluster}  & 92.9 \color{gray}±0.0 & 80.3 \color{gray}±1.7 & 90.4 \color{gray}±0.1 & 90.7 \color{gray}±1.0 & 65.6 \color{gray}±0.3 & 42.0 \color{gray}±1.4 & 33.0 \color{gray}±3.6\\
  &{\textbf{\texttt{HCSFed}}}  & 92.8 \color{gray}±0.0 & \textbf{83.5 \color{gray}±0.1} & 90.4 \color{gray}±0.1 & \textbf{92.1 \color{gray}±0.2} & 65.4 \color{gray}±0.3 & \textbf{41.6 \color{gray}±0.9} & \textbf{39.2 \color{gray}±2.0} \\ \midrule
  %
  \multirow{5}*{q=0.3} &{Random} & 90.1 \color{gray}±0.0 & 73.4 \color{gray}±2.1 & 89.4 \color{gray}±0.0 & 83.7 \color{gray}±0.3 & 54.8 \color{gray}±0.4 & 34.9 \color{gray}±0.6 & 26.6 \color{gray}±0.5\\ %\cmidrule{2-5}
%   &\texttt{SCAFFOLD} & 81.3 \color{gray}±0.0 & 76.2 \color{gray}±0.0 & 67.8 \color{gray}±0.2 & 73.6 \color{gray}±0.3 & 24.6 \color{gray}±1.5 & 21.1 \color{gray}±2.6 & 31.7 \color{gray}±1.3\\
  &{Importance}  & 92.9 \color{gray}±0.0 & 76.3 \color{gray}±3.5 & 90.6 \color{gray}±0.1 & 90.5 \color{gray}±1.3 & 65.7 \color{gray}±0.2 & 42.0 \color{gray}±2.3 & 32.6 \color{gray}±3.4\\
  &{Cluster}  & 92.9 \color{gray}±0.0 & 81.8 \color{gray}±1.5 & 90.5 \color{gray}±0.1 & 91.6 \color{gray}±0.5 & 66.1 \color{gray}±0.3 & 43.0 \color{gray}±1.0 & 34.6 \color{gray}±2.1\\
  &{\textbf{\texttt{HCSFed}}}  & 92.8 \color{gray}±0.0 & \textbf{83.3 \color{gray}±0.1} & 90.2 \color{gray}±0.1 & \textbf{92.2 \color{gray}±0.1} & 65.4 \color{gray}±0.3 & \textbf{42.3 \color{gray}±0.6} & \textbf{39.7 \color{gray}±0.7} \\ \midrule
  %
  \multirow{5}*{q=0.5} &{Random} & 91.4 \color{gray}±0.0 & 80.9 \color{gray}±1.3 & 90.1 \color{gray}±0.1 & 87.9 \color{gray}±0.3 & 61.1 \color{gray}±0.2 & 39.4 \color{gray}±0.4 & 30.5 \color{gray}±0.6\\ %\cmidrule{2-5}
%   &\texttt{SCAFFOLD} & 81.6 \color{gray}±0.0 & 78.7 \color{gray}±0.0 & 68.0 \color{gray}±0.3 & 71.3 \color{gray}±0.8 & 16.2 \color{gray}±1.3 & 28.6 \color{gray}±1.1 & 28.8 \color{gray}±0.7\\
  &{Importance}  & 92.9 \color{gray}±0.0 & 80.8 \color{gray}±3.8 & 90.6 \color{gray}±0.1 & 90.5 \color{gray}±1.5 & 65.8 \color{gray}±0.3 & 43.2 \color{gray}±1.2 & 35.4 \color{gray}±1.7\\
  &{Cluster}  & 92.9 \color{gray}±0.0 & 83.5 \color{gray}±0.2 & 90.4 \color{gray}±0.1 & 92.0 \color{gray}±0.2 & 65.9 \color{gray}±0.4 & 44.0 \color{gray}±0.6 & 36.3 \color{gray}±1.0\\
  &{\textbf{\texttt{HCSFed}}}  & 92.9 \color{gray}±0.0 & \textbf{83.3 \color{gray}±0.1} & 90.4 \color{gray}±0.1 & \textbf{92.0 \color{gray}±0.2} & 65.7 \color{gray}±0.5 & \textbf{42.4 \color{gray}±0.7} & \textbf{39.8 \color{gray}±0.6} \\ 
  \bottomrule
  \end{tabular}}
\label{tab4}
\end{table}
% \vspace{4em}
\newpage
% We carry out extra experiments on FedNova, a modified FL algorithm, to further verify the compatibility of our sampling scheme. We use all datasets mentioned above and take different distributions into consideration. As illustrated in Figure \ref{fig11}, our sampling scheme achieves superb performance on FedNova, especially under heterogeneity. 
\begin{figure}[htbp]
    \centering
    \includegraphics[width=13.5cm]{Figure/pic_selected_1.pdf}
    \caption{Impact of sampling ratio $q$ on the performance with convex model. We compare \texttt{HCSFed} with simple random sampling, importance sampling, cluster sampling, SCAFFOLD on MNIST under non-IID, setting parameters $q\in \{0.1, 0.2, 0.3, 0.5\}$, $N=100$, $nSGD=50$, $\eta=0.01$, $B=50$.}
    \label{app:fig1}
\end{figure}
\begin{figure}[htbp]
    \centering
    \includegraphics[width=13.5cm]{Figure/pic_selected_2.pdf}
    \caption{Impact of sampling ratio $q$ on the performance with convex model. We compare \texttt{HCSFed} with simple random sampling, importance sampling, cluster sampling, SCAFFOLD on FMNIST under non-IID, setting parameters $q\in \{0.1, 0.2, 0.3, 0.5\}$, $N=100$, $nSGD=50$, $\eta=0.01$, $B=50$.}
    \label{app:fig2}
    %\vspace{-2em}
\end{figure}
% \vspace{-2em}
\begin{figure}[H]
    \centering
    \includegraphics[width=13.5cm]{Figure/pic_selected_3.pdf}
    \caption{Impact of sampling ratio $q$ on the performance with non-convex model. We compare \texttt{HCSFed} with simple random sampling, importance sampling, cluster sampling, SCAFFOLD on MNIST under non-IID, setting parameters $q\in \{0.1, 0.2, 0.3, 0.5\}$, $N=100$, $nSGD=50$, $\eta=0.01$, $B=50$.}
    \label{app:fig3}
\end{figure}
\begin{figure}[htbp]
    \centering
    \includegraphics[width=13.5cm]{Figure/pic_selected_4.pdf}
    \caption{Impact of sampling ratio $q$ on the performance with non-convex model. We compare \texttt{HCSFed} with simple random sampling, importance sampling, cluster sampling, SCAFFOLD on CIFAR-10, using a Dirichlet Distribution with $\alpha=0.01$, setting parameters $q\in \{0.1, 0.2\}$, $N=100$, $nSGD=80$, $\eta=0.05$, $B=50$.}
    \label{app:fig4}
\end{figure}
\begin{figure}[htbp]
    \centering
    \includegraphics[width=13.5cm]{Figure/pic_selected_5.pdf}
    \caption{Impact of the heterogeneity on the performance with non-convex model. We compare \texttt{HCSFed} with simple random sampling, importance sampling, cluster sampling and SCAFFOLD on CIFAR-10, using a Dirichlet Distribution with $\alpha=0.001$, setting parameters $q \in \{0.1, 0.2, 0.3, 0.5\}$, $N=100$, $nSGD=80$, $\eta=0.05$, $B=50$.}
    \label{app:fig5}
\end{figure}
\subsection{Extra Experiments on FedNova}
We carry out extra experiments on FedNova, a modified FL algorithm, to further verify the compatibility of our sampling scheme. We use all datasets mentioned above and take different distributions into consideration. As illustrated in Figure~\ref{app:fig6}, our sampling scheme achieves superb performance on FedNova, especially under heterogeneity. 
\begin{figure}[htbp]
    \centering
    % \setcaptionwidth{.91\textwidth} % 设置图片标题宽度
    \centering
    \includegraphics[width=13.5cm]{Figure/pic7.pdf}
    \caption{Results on the MNIST, FMNIST and CIFAR-10 under FedNova. We compare \texttt{HCSFed} with simple random sampling, using a Dirichlet Distribution with $\alpha =0.001$ and Shard, setting parameters $q=0.1$, $N=100$, $nSGD=50$ for MNIST and FMNIST, $nSGD=80$ for CIFAR-10, $\eta=0.05$, $B=50$.}
    \label{app:fig6}
\end{figure}
\newpage
\section{Widely used datasets in previous influential FL optimization works}
Regarding the datasets, we would like to state that constructing heterogeneous datasets to verify our ideas under a non-IID setting deserves more attention in FL optimization. 
%
Meanwhile, we would like to make a fair comparison with the previous influential FL optimization works shown in Table~\ref{tab5}, so we choose widely used datasets, including MNIST, FMNIST, and CIFAR-10.
\begin{table}[htbp]
  % \vspace{-1.0em}  % 调整表格与上文的距离
  \centering
  \setlength{\tabcolsep}{4mm} % 列间距
  \caption{Comparison of widely used datasets in previous influential FL optimization works.}
\scalebox{1}
{
\begin{tabular}{cc} 
  \toprule
  Articles &Datasets they use\\
  \midrule
% \rowcolor{Gray} 为每一行添加背景色
  Cluster sampling~\cite{pmlr-v139-fraboni21a} &MNIST, CIFAR-10\\ \midrule
  Importance sampling~\cite{chen2022optimal} &FEMNIST, Shakespeare\\ \midrule
  %
  SCAFFOLD~\cite{karimireddy2020scaffold} &EMNIST\\ \midrule
  %
  FedNova~\cite{wang2020tackling} &Synthetic Federated dataset, CIFAR-10\\ \midrule
  %
  FedProx~\cite{li2020federated} &MNIST, FEMNIST, Shakespeare, Sent140\\ 
  \bottomrule
  \end{tabular}
}
\label{tab5}
\end{table}

\newpage
\bibliography{song_116}

\end{document}
