\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}

% \usepackage{sectsty}
% \usepackage{titlecaps}
% \allsectionsfont{\titlecap}
\usepackage{hyperref}
% Attempt to make hyperref and algorithmic work together better:
\newcommand{\theHalgorithm}{\arabic{algorithm}}

\usepackage{subfigure}
\usepackage{multirow}

\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage[capitalize,noabbrev]{cleveref}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{threeparttable}
\usepackage{makecell}

\usepackage{algorithm}
\usepackage{algorithmic}

\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\theoremstyle{definition}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
\theoremstyle{remark}
\newtheorem{remark}{Remark}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{On the Convergence of Hierarchical Federated Learning with Partial Worker Participation}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1,2]{Xiaohan Jiang}
\author[2]{Hongbin Zhu}
% Add affiliations after the authors
\affil[1]{%
    School of Computer Science\\
    Fudan University\\
    Shanghai, China
}
\affil[2]{
    Institute of FinTech\\
    Fudan University\\
    Shanghai, China
}
  
\begin{document}
\allowdisplaybreaks[4]
\maketitle

\begin{abstract}
Hierarchical federated learning (HFL) has emerged as the architecture of choice for multi-level communication networks, mainly because of its data privacy protection and low communication cost.
However, existing studies on the convergence analysis for HFL are limited to the assumptions of full worker participation and/or i.i.d. datasets across workers, both of which rarely hold in practice.
Motivated by this, we in this work propose a unified convergence analysis framework for HFL covering both full and partial worker participation with non-i.i.d. data, non-convex objective function and stochastic gradient.
We correspondingly develop a three-sided learning rates algorithm to mitigate the data divergence issue, thereby realizing better convergence performance. 
Our theoretical results provide key insights of why partial participation of HFL is beneficial in significantly reducing the data divergences compared to standard FL.
Besides, the convergence analysis allows certain individualization for each cluster in HFL indicating that adjusting the worker sampling ratio and round period can improve the convergence behavior.
\end{abstract}

\section{Introduction}
\label{sec:introduction}
Federated Learning (FL) \citep{mcmahan2017communication, yang2019federated} is a privacy-preserving machine learning paradigm for substantial decentralized data. FL allows a large number of workers to collaboratively learn a model with their local data under the coordination of a centralized server. Formally, the goal of FL is to solve an optimization problem, which is 
\begin{equation}
\label{eqn:main}
	\min_{\textbf{x}\in \mathbb R^d} f(\textbf{x}) := \frac{1}{m}\sum_{i=1}^m F_i(\textbf{x}),
\end{equation}
where $F_i(\textbf{x})$ is the local (non-convex) loss function parameterized by $\textbf{x}$ and $m$ is the number of workers in total.
% This part can be trimmed flexibly depending on the remaining space.
We can tackle the above problem iteratively in a distributed way, where the most representative algorithm is FedAvg \citep{mcmahan2017communication}. Specifically, 
% the server sends the global model to the workers in each communication round. 
the workers each train on their local data, take several gradient steps, and then forward their locally updated model to the server to be averaged. This eliminates the need for explicitly sharing sensitive data with others, thereby providing privacy guarantee.
% However, this distributed SGD shows an obvious limitation that frequent communications may heavily prolong the overall training time when message transmission latency is high. A more practical way to reduce the communication overhead is to allow the workers to take multiple local iterations before uploading models to the server. 
% This is referred to as FedAvg \cite{mcmahan2017communication} in terms of a benchmark FL algorithm, or Local SGD \cite{stich2019local, lin2020don} in terms of an optimization method. 
A large body of work demonstrates the benefits of FedAvg via both theoretical convergence analysis and empirical experiments \citep{li2019convergence, wang2021cooperative}.

On the other hand, the standard FL setting with a single cloud server is inapplicable to a substantial number of real-world scenarios with low-latency requirements.
Multi-level network architectures offer great potential for low-latency FL, such as in edge computing.
Specifically, edge computing allows workers to transmit their updated models to a local server in the vicinity, thereby significantly reducing the transmission latency \citep{zhang2022scalable}.
Local servers collect the updated models and send them to the cloud server.
Besides, multi-level network boosts the number of connected end devices by providing more access points in the edge.

%In practice, networks are often constructed in a hierarchical architecture, especially for edge computing systems \cite{li2018learning}. End devices may not all be able to connect to a central server in a single hop due to network or communication range limitations. Besides, direct communication and interaction with the central cloud server may still suffer from high round-trip latency. Hence, a multi-level communication network model is naturally more suitable, where devices directly communicate with their local edge servers, and these edge servers instead communicate with an global cloud server.


\begin{table*}[!h]
\centering
\caption{A summary of convergence rates of optimization methods for HFL.}
\label{tab:rate-hfl}
{\small
\begin{threeparttable}
\begin{tabular}{ccccc}
\hline
\hline
Algorithm & SGD & Non-i.i.d. & Partial Worker & Convergence Rate\tnote{1} \\\hline
\cite{liu2020client}\tnote{2} & $\times$ & \checkmark & $\times$ & $\mathcal{O}(\frac{B^G}{\sqrt{mT}})$ \\
\cite{castiglia2020multi}\tnote{2} & \checkmark & $\times$ & $\times$ & $\mathcal{O}(\frac{1}{\sqrt{mT}} + \frac{mG^2}{IT})$ \\
\cite{wang2022demystifying}\tnote{2} & \checkmark & \checkmark & $\times$ & $\mathcal{O}(\frac{1}{\sqrt{mT}} + \frac{(M-1)G^2 + (m-M)I^2}{T})$ \\
\cite{liu2022hierarchical}\tnote{3} & \checkmark & $\times$ & $\times$ & $\mathcal{O}(\frac{1}{\sqrt{GT}} + \frac{M}{mT})$ \\
Ours\tnote{3} & \checkmark & \checkmark & \checkmark & $\mathcal{O}(\frac{1}{\sqrt{mGT}} + \frac{1}{T})$\\ \hline
\end{tabular}
\begin{tablenotes}
\footnotesize
    \item [1] $G$: global aggregation period (master period); $I$: local aggregation period (cluster period); $m$: number of workers in HFL system; $M$: number of groups; $B$ is a constant and $B > 2$.
    \item [2] $T$ in these works refers to the total number of local iterations.
    \item [3] $T$ in these works refers to the total number of master rounds.
\end{tablenotes}
\end{threeparttable}
}
\end{table*}

To accommodate the multi-level network architecture, a few works proposed hierarchical FL (HFL). In HFL, workers are partitioned into multiple groups, with each group governed by a cluster (local server), and all clusters coordinated through a master (global server). Specifically, after several local iterations, workers first send their updated models to the cluster they belong to for local aggregation. 
After several local aggregations, all clusters communicate with the master for global aggregation among groups. This two-level aggregation in HFL strikes a subtle balance between the communication overhead and learning performance. Parallel local aggregations ensure ultra-low latency, while the time-consuming global aggregation embraces extensive model knowledge.
% The local aggregations take place for every $I$ (referred to as local period) local iterations, while the global aggregations take place for every $G$ (referred to as global period) local iterations ($G > I$).

Very recently, there have been a few works analyzing the convergence behavior under the HFL scenario. In particular, \citet{castiglia2020multi} considers a two-level FL where the clusters are organized as a peer-to-peer network. However, their theoretical results only cover the case of independent and identically distributed (i.i.d.) data. 
% \cite{liu2022hierarchical} proposes HFL with quantization technique to improve communication efficiency, whereas non-i.i.d. data case is also not concerned. 
\citet{liu2020client} considers non-i.i.d. data, but they assume full (non-stochastic) gradients and the convergence bound is in an exponential function form. \citet{wang2022demystifying} provides a more systematic analysis for HFL, which considers non-convex objective function, non-i.i.d. data, and stochastic gradient descent (SGD). They split the overall data heterogeneity (global divergence) into two components, worker-cluster divergence (i.e., within a group) and cluster-master divergence (i.e., among groups). Compared to standard FL, their results show that local aggregation can help to overcome worker-cluster divergence.

However, all the aforementioned works only consider full worker participation (FWP), which is often not possible in practice. Workers may randomly join or leave the FL system, making the active worker set stochastic and time-varying across communication rounds \citep{yang2020achieving}. Waiting for all workers’ responses can significantly slow down the training performance, especially when there are inactive workers or stragglers. This necessitates us to consider partial worker participation (PWP), where only a subset of the workers are chosen in each communication round. This is especially critical in HFL scenarios, such as edge computing, where workers may experience availability issues due to battery level, network status, incoming calls, etc. Therefore, there is a need for a comprehensive analysis and understanding of HFL with PWP.

In this paper, we present a novel theoretical analysis for HFL with both FWP and PWP. We follow the setting of \citet{wang2022demystifying} for HFL, and newly derive the convergence bound for HFL w.r.t. communication rounds. This bound is more explicit than their original one \citep{wang2022demystifying} which was w.r.t. local iterations. Besides, we develop a generalized FedAvg with three-sided learning rates for HFL correspondingly, showing how local aggregation can help to overcome worker-cluster divergence and how to realize linear speedup. 

Our theoretical results reveal that PWP can facilitate HFL more effectively than standard FL. 
% This is in the sense that PWP successfully extends the weakening effect of local aggregations on worker-cluster divergences to even cluster-master ones.
Specifically, HFL alleviates the additional uncertainty caused by PWP in terms of both worker-cluster and cluster-master divergences. In contrast, this weakening effect is observed only on worker-cluster divergences in HFL with FWP. Therefore, enabling PWP is especially beneficial when certain data heterogeneity exists among groups, which is the most general case of HFL in practice. 
This suggests a mutually beneficial relationship between HFL and PWP.
Many real-world scenarios of HFL, such as edge computing, employ PWP to deal with the instability and unavailability of workers. Conversely, the hierarchical architecture itself provides a suitable showcase for PWP. Besides, our theoretical results show that HFL can be customized to a certain degree. Specifically, each cluster can adjust its worker sampling ratio and round period accordingly to deal with its inner data heterogeneity. A summary of our results alongside existing results is shown in \cref{tab:rate-hfl}.

We emphasize that our main contribution is to provide a new unified convergence bound for HFL settings. Our theoretical results recover the previous results of FL from \citet{yang2020achieving} by setting $M=1,I=G$, and $\epsilon=0$, and generalize existing HFL results to PWP and heterogeneous clusters $I_i$ and $n_i$. 
% We emphasize that our work is not the incremental modification compared to existing works (such as [Wang et al. 2022] or [Yang et al. 2021]). 
Compared to \citet{wang2022demystifying}, our work adopts the same typical HFL settings, but the convergence analysis framework is completely different. Specifically, we rederive a round-level convergence bound for both FWP and PWP, which is potentially tighter (in the sense of second term, our $\mathcal{O}(\frac{1}{T})$ versus their $\mathcal{O}(\frac{G}{T})$) and more general (aware of some server-sided optimization) than their iteration-level results. Compared to \citet{yang2020achieving}, our work shares a similar algorithmic approach but differs in the analysis method. 
% The similarities between our theoretical results and theirs are aimed at providing better unification and insight. 
We do face several theoretical challenges specific to the HFL scenario, which have not been considered in existing FL works. We defer some details on this to \cref{sec:convergence}.

We highlight our contributions as follows.
\begin{enumerate}
    \item We derive a general convergence bound of HFL with both FWP and PWP, non-i.i.d. data, non-convexity, and SGD. We correspondingly develop a three-sided learning rates algorithm.
    \item Compared to standard FL, we reveal that PWP can significantly reduce both worker-cluster divergences and cluster-master divergence in HFL.
    \item We provide certain individualization for HFL by suggesting each cluster flexibly set its worker sampling ratio and round period to match its inner divergence for potentially better convergence behavior.
    \item We introduce comprehensive and reproducible empirical baselines for comparison. We conduct extensive numerical experiments on multiple datasets to verify our theoretical results.
\end{enumerate}

\section{Related Works}
A large body of work has studied the convergence behavior of the FedAvg algorithm for standard FL. Some works focus on convex objective functions \citep{stich2019local, wang2019adaptive, li2019convergence}, while others consider non-convex objective functions \citep{haddadpour2019local, yu2019parallel, wang2021cooperative}. There are also works that extend the theoretical results to the non-i.i.d. data case 
\citep{li2019convergence, Khaled2019FirstAO}. Besides, several variants of FedAvg are proposed and analyzed, such as those that address system heterogeneity \citep{li2020federated}, combine with momentum \citep{yu2019linear}, or use adaptive optimizers \citep{reddi2020adaptive}. 

For HFL, there have been a few works providing convergence analysis. These include \citet{castiglia2020multi, zhou2019distributed} for i.i.d. data, and \citet{liu2020client} for non-i.i.d. data with full gradient descent. \citet{wang2022demystifying} provides a comprehensive analysis framework for HFL, covering non-convex objective function, non-i.i.d. data, and SGD. Several variants are also proposed and theoretically studied, with quantization technique \citep{liu2022hierarchical}, momentum acceleration \citep{yang2023hierarchical}, over-the-air setup \citep{aygun2022hierarchical}, wireless resource allocation \citep{liu2022joint}, user mobility \citep{feng2022mobility}, data offloading \citep{ganguly2023multi}, and submodel partitioning \citep{fang2023submodel}. However, all these works assume FWP, and some even require stronger assumptions such as bounded gradient. Besides, there are also works on system design for HFL without convergence guarantees \citep{luo2020hfel, abad2020hierarchical, briggs2020federated}.

For PWP, a popular branch of works focuses on the uniform worker sampling pattern with or without replacement. Some typical representatives include, for strongly convex objective function \citep{li2019convergence}, with proximal term to handle heterogeneity \citep{li2020federated}, with extra communications to reduce variance introduced by PWP \citep{karimireddy2020scaffold}. Then \citet{yang2020achieving} improves the theoretical results and achieves a linear speedup with two-sided learning rates. There are further some works addressing variance-reducing in PWP via memorized gradients \citep{jhunjhunwala2022fedvarp} and momentum-based update \citep{das2022faster}. The work by \citet{qu2022convergence} considers a multi-server FL with overlapping area setting, also taking into account the uniform PWP. However, their multi-server scenario differs from HFL in that global aggregation never takes place in the training process. Some other works allow for arbitrary worker sampling probabilities and provide convergence analysis \citep{Gu2021FastFL, perazzone2022communication, fraboni2023general}. However, these works require much stronger assumptions, such as Lipschitz Hessian or/and bounded gradient assumption. Besides, another branch of works considers arbitrarily asynchronous participation patterns \citep{Avdiukhin2021FederatedLU,yang2022anarchic,nguyen2022federated,wang2022unified}. 
% This could be a promising direction to be potentially applied with HFL, and we would like to leave this to our future works. 
In this paper, we base our work on uniform sampling with standard assumptions, as we will elaborate in \cref{sec:HFL} and \cref{sec:convergence}. 
% This last short part could perhaps also be moved into the final conclusion.

\section{HFL Setup}
\label{sec:HFL}
Suppose there are $m$ workers in total, making up a set $\mathcal{V}$. In the two-level HFL setting, all workers are grouped into $M$ clusters $\mathcal{V}_1, \mathcal{V}_2, \ldots, \mathcal{V}_M$. 
Let $m_i := |\mathcal V_i|\; (i = 1, 2, \ldots, M)$ denote the number of workers in cluster $i$. 
Therefore, we have $m = \sum_{i=1}^M m_i$.
With the cluster policy, the objective function of Eq.~\eqref{eqn:main} is equivalent to 
\begin{equation}
\label{eqn:main-hsgd}
	\min_{\textbf{x}\in \mathbb R^d} f(\textbf{x}) := \sum_{i=1}^M \frac{m_i}{m} f_i(\textbf{x}), \notag
\end{equation}
 where $f_i(\cdot)$ is the averaged loss function of workers in cluster $i$ which is 
\begin{equation}
\label{eqn:cluster_i}
f_i(\textbf{x}) := \frac{1}{m_i}\sum_{j\in \mathcal V_i}F_j(\textbf{x}). \notag
\end{equation}
In HFL, workers conduct multiple SGD iterations to minimize the local objective function $F_j(\textbf{x}) \triangleq \mathbb E_{\xi_j\sim \mathcal{D}_j}[F_j(\textbf{x}, \xi_j)]$ w.r.t. model parameters $\textbf{x}$ on their own dataset $\mathcal{D}_j$. Each cluster $i \; (i = 1,2,\ldots, M)$ first averages the updated parameters from its workers $j\in\mathcal{V}_i$ every $I_i$ local iterations (referred to as a cluster round with period $I_i$). After several rounds of intra-cluster aggregations, a master globally averages the models from all $M$ clusters. This global aggregation takes place for every $G$ local iterations (referred to as a master round with period $G$). Note that $G$ is a common multiple of $\{ I_1, I_2, \ldots, I_M\}$. Different $I_i$ values account for potential system heterogeneity (computation and communication capacity) of devices in different clusters.

\begin{algorithm}[h!]
\caption{HFL with Three-sided Learning Rates} 
\label{alg:HFL}  
\begin{algorithmic}[1]
\STATE {\bfseries Input:} $\eta$, $\eta_c$, $\eta_g$, $\textbf{x}^0$, $\{\mathcal V_i : i\in [M]\}$, $G$, $\{I_i : i\in [M]\}$, $\{\omega_i : i\in [M]\}$, $\{n_i : i\in [M]\}$.
\STATE {\bfseries Output:} Global aggregated model $\overline{\textbf{x}}^T$.
\FOR{$t = 0$ \textit{to} $T-1$}
\FOR{\textit{each cluster} $i\in [M]$ \textit{in parallel}}
\FOR{$\tau = 0$ \textit{to} $\omega_i-1$}
\STATE{Cluster $i$ samples a subset $\mathcal S^{t,\tau}_i$ of workers with $|\mathcal S^{t,\tau}_i| = n_i$.}
\FOR{\textit{each worker} $j\in \mathcal S^{t,\tau}_i$ \textit{in parallel}}
\FOR{$h = 0$ \textit{to} $I_i-1$}
\STATE{Compute a gradient estimate $\textbf{g}_j^{t,\tau,h}$.}
\STATE{Worker update: $\textbf{x}^{t,\tau,h+1}_j = \textbf{x}^{t,\tau,h}_j-\eta \textbf{g}_j^{t,\tau,h}$.}
\ENDFOR
\STATE{Let $\tilde{\Delta}_j^{t,\tau} = \textbf{x}^{t,\tau,0}_j - \textbf{x}^{t,\tau,I_i}_j = \eta\sum_{h=0}^{I_i-1}\textbf{g}_j^{t,\tau,h}$}
\STATE{Send $\tilde{\Delta}_j^{t,\tau}$ to cluster $i$.}
\ENDFOR
\STATE{Cluster $i$ receives $\tilde{\Delta}_j^{t,\tau}, j\in\mathcal{S}^{t,\tau}_i$.}
\STATE{Let ${\Delta}^{t,\tau}_i = \frac{1}{n_i}\sum_{j\in \mathcal S_i^{t,\tau}}\tilde{\Delta}_j^{t,\tau}$.}
\STATE{Cluster update: $\overline{\textbf{x}}_i^{t,\tau+1} = \overline{\textbf{x}}_i^{t,\tau} - \eta_c {\Delta}^{t,\tau}_i$.}
\STATE{Broadcast $\overline{\textbf{x}}_i^{t,\tau+1}$ to workers in cluster $i$.}
\ENDFOR
\STATE{Let $\Delta_i^{t} = \overline{\textbf{x}}_i^{t,0} - \overline{\textbf{x}}_i^{t,\omega_i} = \eta_c\sum_{\tau=0}^{\omega_i-1}{\Delta}^{t,\tau}_i$.}
\STATE{Send $\Delta_i^{t}$ to master.}
\ENDFOR
\STATE{Master receives $\Delta_i^{t}, i\in[M]$.}
\STATE{Let ${\Delta}^t = \sum_{i=1}^{M} \frac{m_i}{m} \Delta_i^{t}$.}
\STATE{Master update: $\overline{\textbf{x}}^{t+1} = \overline{\textbf{x}}^{t} - \eta_g {\Delta}^{t}$.}
\STATE{Broadcast $\overline{\textbf{x}}^{t+1}$ to all workers.}
\ENDFOR
\end{algorithmic}
\end{algorithm} 

We investigate a three-sided learning rates hierarchical FedAvg, which is essentially a generalization of previous works \citep{karimireddy2020scaffold, reddi2020adaptive, yang2020achieving}. The algorithm is shown in \cref{alg:HFL}.
% Some notations and interpretations are as follows. 
For a natural number $m$, we use $[m]$ to represent the set $\{1,2, \ldots, m\}$. The learning rates for the worker, cluster, and master are $\eta$, $\eta_c$, and $\eta_g$, respectively. Note that $h$, $\tau$, and $t$ always index the local iteration, cluster round, and master round, respectively. We let $\omega_i = \frac{G}{I_i}$ denote the number of cluster rounds for cluster $i, \forall i\in[M]$ in a master round. We denote the stochastic gradient estimator as $\textbf{g}_j^{t,\tau,h} = \nabla F_j(\textbf{x}_j^{t,\tau,h}, \xi_j^{t,\tau,h})$, where $\xi_j^{t,\tau,h}$ is the random data sample drawn from the local dataset $\mathcal{D}_j$ at worker $j$ for iteration $h$ (in cluster round $\tau$ and master round $t$). For a cluster round $\tau$, $\tilde{\Delta}_j^{t,\tau}$ is the accumulated gradients of worker $j$, while $\Delta_i^{t,\tau}$ is the averaged gradients of all participating workers in cluster $i$.
 
% During each local iteration $k$, each worker $j$ updates its own model using SGD
%  \begin{eqnarray}
%  	\label{eqn:SGD}
%  	\textbf{x}_j^{k+1} = \textbf{x}_j^{k}-\eta \textbf{g}(\textbf{x}_j^k, \xi_j^k),
%  \end{eqnarray}
%  where $\eta$ is workers' local learning rate, $\textbf{g}(\textbf{x}_j^k, \xi_j^k) = \nabla F_j(\textbf{x}_j^k, \xi_j^k)$ is the stochastic gradient of $F_j(\textbf{x})$, and $\xi_j^k$ is the random data samples from the local dataset $\mathcal{D}_j$ at worker $j$ at iteration $k$.

In HFL with PWP, each cluster round only includes a certain subset of workers. We denote $\mathcal{S}_i^{t,\tau}$ as the participating worker index set, which is determined once a new cluster round $\tau$ starts. We have $|\mathcal{S}_i^{t,\tau}| = n_i$, for some $n_i\in(0, m_i]$. For the sampling strategy of the participating set, we employ two strategies proposed by \citet{li2020federated} and \citet{li2019convergence}, respectively. Specifically, we select $\mathcal{S}_i^{t,\tau}$ randomly and independently, either with replacement (Strategy 1) or without replacement (Strategy 2). For each member in $\mathcal{S}_i^{t,\tau}$, we pick a worker from $\mathcal{V}_i$ uniformly at random with probability $p_j=\frac{1}{m_i},\forall j\in\mathcal{V}_i$. Ultimately, the participation likelihood for any worker $j\in\mathcal{V}_i$ equals $\frac{n_i}{m_i}$. We denote the total worker sampling size in the HFL system as $n=\sum_{i=1}^M {n_i}$.

\section{Convergence Analysis}
\label{sec:convergence}
To establish the convergence theorem, we first make the following assumptions.
\begin{assumption}[L-Lipschitz Continuous Gradient]
\label{ass:lipschitz}
There exists a constant $L > 0$, such that $||\nabla F_i(\textbf{x})-\nabla F_i(\textbf{y})|| \leq L ||\textbf{x} - \textbf{y} ||, \forall i, \textbf{x}, \textbf{y}$.
\end{assumption} 
Note that Lipschitz continuous gradient assumption internally applies to the cluster objective $f_i(\textbf{x})$ and global objective $f(\textbf{x})$. For instance, $|| \nabla f_i(\textbf{x}) - \nabla f_i(\textbf{y}) || = || \frac{1}{m_i} \sum_{j\in \mathcal{V}_i} \nabla F_j(\textbf{x}) - \frac{1}{m_i} \sum_{j\in \mathcal{V}_i} \nabla F_j(\textbf{y}) || \leq \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} || \nabla F_j(\textbf{x}) - \nabla F_j(\textbf{y}) || \leq L||\textbf{x} - \textbf{y}||$.
\begin{assumption}[Unbiased Local Gradient Estimator]
\label{ass:unbiased-g}
Let $\xi_i^h$ be a random local data sample in the $h$-th step at the $i$-th worker. The local gradient estimator is unbiased, i.e., $\mathbb E[\nabla F_i(\textbf{x}, \xi^h_i)] = \nabla F_i(\textbf{x}), \forall i,\textbf{x}$, where the expectation is over all local datasets samples.
\end{assumption} 
\begin{assumption}[Bounded Variance]
\label{ass:bound-var}
There exists a constant $\sigma > 0$, such that the variance of each local gradient estimator is bounded by $\mathbb E[||\nabla F_i(\textbf{x},\xi_i^h)-\nabla F_i(\textbf{x})||^2]\leq \sigma^2, \forall i,\textbf{x}$. 
%  \remove{and the global variability of the local gradient of the cost function is bounded by $\mathbb E[||\nabla F_i(\textbf{x}^t)-\nabla f(\textbf{x}^t)||^2]\leq \sigma^2, \forall i\in [m]$.}
\end{assumption} 
% \begin{assumption}[Bounded Global Divergence]
% \label{ass:global-div}
% The bounded global divergence is expressed as $|| \nabla F_j(\textbf{x}) - \nabla f(\textbf{x}) ||^2 \leq \tilde{\epsilon}^2, \forall j\in[m], \textbf{x}$.
% \end{assumption}
\begin{assumption}[Bounded Cluster-Master and Worker-Cluster Divergence]
\label{ass:cluster-div}
The bounded cluster-master divergence is expressed as $ ||\nabla f_i(\textbf{x})-\nabla f(\textbf{x})||^2 \leq \epsilon^2, \forall i\in[M], \textbf{x}$, while the bounded worker-cluster divergence is $ ||\nabla F_j(\textbf{x})-\nabla f_i(\textbf{x})||^2 \leq \epsilon_i^2, \forall i\in[M], j\in\mathcal{V}_i, \textbf{x}$.
\end{assumption} 

The first three assumptions are standard in non-convex optimization \citep{ghadimi2013stochastic,bottou2018optimization}. Assumption \ref{ass:cluster-div} quantifies the heterogeneity of the non-i.i.d. datasets among different workers and groups. It was first introduced by \citet{wang2022demystifying} for HFL. The worker-cluster part measures the data heterogeneity among workers inside a group, while the cluster-master part measures the data heterogeneity among groups. In particular, 
% $\tilde{\epsilon}^2=0$ stands for globally i.i.d.,
$\epsilon^2=0$ stands for inter-group i.i.d., and $\epsilon_i^2=0$ for intra-group-$i$ i.i.d., respectively. In the standard FL case, Assumption \ref{ass:cluster-div} has a simpler form as $|| \nabla F_j(\textbf{x}) - \nabla f(\textbf{x}) ||^2 \leq \tilde{\epsilon}^2, \forall j\in[m], \textbf{x}$, which is also referred to as bounded global divergence \citep{yang2020achieving,reddi2020adaptive,wang2019adaptive,yu2019parallel}. Note that the worker-cluster and cluster-master divergences are essentially two components of the global divergence, since the total variance is $\frac{1}{m}\sum_{j=1}^{m} \big\Vert \nabla F_j(\textbf{x}) - \nabla f(\textbf{x}) \big\Vert^2 = \sum_{i=1}^M \frac{m_i}{m} \big\Vert \nabla f_i(\textbf{x}) - \nabla f(\textbf{x}) \big\Vert^2 + \sum_{i=1}^M \frac{m_i}{m} \frac{1}{m_i} \sum_{j\in \mathcal{V}_i} \big\Vert \nabla F_j(\textbf{x}) - \nabla f_i(\textbf{x}) \big\Vert^2$, implying the asymptotic relation $\mathcal{O}(\tilde{\epsilon}^2) = \mathcal{O}(\epsilon^2 + \sum_{i=1}^M \frac{m_i}{m} \epsilon_i^2)$.
% Assumption 5 is not indispensable for acquiring the convergence bound in Theorem 1, however, it could provide more understandings for deeper analysis on our results.


\subsection{HFL with FWP}
% For better comprehension, we rearrange and restate the convergence results for hierarchical FL with FWP \cite{wang2022demystifying} here:

% \emph{Under Assumptions 1-5 and with FWP, let the learning rate be chosen such that $\eta < \frac{1}{2\sqrt{6}GL}$, then the generated sequence of outputs} $\{ \overline{\textbf{x}}^t \}$ \emph{satisfies}
% \begin{align}
% \label{eq:theorem-full}
%     &\min_{t\in [T]} \mathbb E[||\nabla f(\overline{\textbf{x}}^t)||^2] \leq \frac{2}{T\eta}\left[f^0 - f^* \right] + \frac{\eta L \sigma^2}{m} \notag\\
%     & + 16L^2\eta^2G \frac{M-1}{m} \sigma^2 + 24L^2\eta^2G^2\epsilon^2 + \frac{32}{3} L^2 \eta^2\sigma^2 \sum_{i=1}^M \frac{m_i-1}{m} I_i + 32 L^2 \eta^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2
% \end{align}
% \emph{where} $f^0\triangleq f(\overline{\textbf{x}}^0),\; f^*\triangleq f(\overline{\textbf{x}}^*)$ \emph{, and the expectation is taken over the local dataset samples among all workers.}

% Then we extend the above result to PWP case as follow:

Considering the problem described in Section~\ref{sec:HFL}, we have the following results for HFL with FWP:

\begin{theorem}
\label{them:hfl-full}
Under Assumptions~\ref{ass:lipschitz}--\ref{ass:cluster-div}, with FWP, let the learning rates be chosen such that $\eta \leq \frac{1}{10I_{\max}L}$, $\eta_c\eta \leq \frac{1}{10GL}$, $\eta_g\eta_c\eta \leq \frac{1}{GL}$, and $40 G^2\eta_c^2\eta^2 L^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 < \frac{1}{2}$, where $I_{\max} = \max_i I_i$, then the sequence of outputs {\rm $\{ \overline{\textbf{x}}^t \}$} generated by \cref{alg:HFL} satisfies
{\rm
\begin{align}
	& \min_{t\in[T]} \mathbb{E} \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 \leq \frac{f_0 - f_*}{c\eta_g\eta_c\eta G T} + \frac{1}{c} \big( \Phi_1 + \Phi_2 \big). \notag
\end{align}
}
where $c$ is a constant, {\rm $f_0\triangleq f(\overline{\textbf{x}}^0),\; f_*\triangleq f(\overline{\textbf{x}}^*)$}, and 
{\rm
\begin{align}
    & \Phi_1 = 9 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 8 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 \notag\\
    & + 75 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2\;, \;\;\; \Phi_2 = \frac{L\eta_g\eta_c\eta}{2m} \sigma^2. \notag
\end{align}
}
\end{theorem} 

\begin{proof}
Please refer to Appendix \ref{app:proof-them1}. Here the core technique we use is the mutual bounding of worker-cluster parameter MSEs (WC-MSE) $\Vert\mathbf{x}_j^{t,\tau,h}-\overline{\mathbf{x}}_i^{t,\tau}\Vert^2$ and cluster-master parameter MSEs (CM-MSE) $\Vert\overline{\mathbf{x}}_i^{t,\tau}-\overline{\mathbf{x}}^t\Vert^2$, as shown in our derived \cref{lem:cm-MSE,lem:wc-MSE}. This may seem counterintuitive at first, since we may expect that only CM-MSE is bounded by WC-MSE, but not the converse. However, this mutual bounding is exactly the maximal knowledge on MSEs when no stronger assumption is available (e.g., bounded gradient or convexity assumption). We creatively leverage this to further derive \cref{lem:wc-MSE-indep}, which analytically provides a universal bound of WC-MSE. Then, with \cref{lem:wc-MSE-indep}, we achieve the final convergence bound for HFL and recover the desired weakening effect.
\end{proof}

\begin{remark}
The convergence bound in \cref{them:hfl-full} contains two parts: a vanishing term $\frac{f_0 - f_*}{c\eta_g\eta_c\eta G T}$ that decreases as $T$ increases, and other constants that depend on the problem instance configuration rather than $T$. The decaying rate of the vanishing term matches that of typical SGD methods. The constant part can be further categorized into two components $\Phi_1$ and $\Phi_2$ (this manual partition is for better comparison with subsequent results from PWP case). $\Phi_1$ reveals how the master and cluster periods $G, I_i$ interact with SGD noise $\sigma^2$ and divergences $\epsilon^2, \epsilon_i^2$. $\Phi_2$ covers all the impact of master learning rate $\eta_g$ for the constant part, which only acts on $\sigma^2$.
\end{remark}

\begin{remark}
\label{rem:div-full}
\cref{them:hfl-full} shows how local aggregation of HFL helps to overcome divergences. The overall divergences for HFL are $\mathcal{O}(\eta^2\eta_c^2 G^2\epsilon^2 + \eta^2 \sum_{i=1}^{M}\frac{m_i}{m}I_i^2(\epsilon^2 + \epsilon_i^2))$, originating from the constant component $\Phi_1$. In contrast, the corresponding divergence part in standard FL is $\mathcal{O}(\eta^2 G^2\tilde{\epsilon}^2) = \mathcal{O}(\eta^2 G^2\epsilon^2 + \eta^2 G^2 \sum_{i=1}^M \frac{m_i}{m} \epsilon_i^2)$ \citep{yang2020achieving}. We suppose $\eta_c=\mathcal{O}(1)$ for fair comparison,
% (which naturally satisfy the learning rate condition in \cref{them:hfl-full})
then we simplify the divergences of HFL in asymptotic sense as $\mathcal{O}(\eta^2 G^2\epsilon^2 + \eta^2 \sum_{i=1}^{M}\frac{m_i}{m}I_i^2\epsilon_i^2)$. This indicates that local aggregation of HFL can weaken the impacts of the worker-cluster part of the global divergence since $G\geq I_i, \forall i\in [M]$. The weakening effect here matches the iteration-level convergence results of HFL from \citet{wang2022demystifying}. Besides, our three-sided learning rates algorithm provides more flexibility here, since only the worker learning rate $\eta$ interacts with the worker-cluster divergence $\epsilon_i$. This may allow a decoupling of learning, and we can thus adjust $\eta_c$ and $\eta$ according to divergences for probably better convergence. For example, larger $\epsilon_i$ and smaller $\epsilon$ may prefer a smaller $\eta$ for stability as well as a larger $\eta_c$ for acceleration.
\end{remark}

\begin{corollary}
\label{cor:linear-speedup-full}
Let $\eta=\frac{1}{\sqrt{T}GL}$, $\eta_c=\mathcal{O}(1)$, and $\eta_g = \sqrt{Gm}$. The convergence rate of the HFL with FWP in \cref{alg:HFL} is
{\rm 
% $\min_{t\in [T]} \mathbb E[||\nabla f(\overline{\textbf{x}}^t)||^2] = 
$\mathcal{O}(\frac{1}{\sqrt{mGT}} + \frac{1}{T})$.}
\end{corollary}

\begin{remark}
The HFL with FWP achieves a linear speedup $\mathcal{O}(\frac{1}{\sqrt{mGT}})$ with proper learning rate settings as shown in \cref{cor:linear-speedup-full} as long as $T \geq mG$. This resembles the results of standard FL from \citet{yang2020achieving}. To provide some flexibility, we set $\eta_c = \mathcal{O}(1)$ as we discussed in \cref{rem:div-full}, without impairing the linear speedup property.
\end{remark}

\subsection{HFL with PWP}
\label{subsec:hfl-partial}
Next, we analyze the convergence behavior for HFL with PWP, for which we have the following results:

\begin{theorem}
\label{them:hfl-partial}
Under Assumptions \ref{ass:lipschitz}--\ref{ass:cluster-div}, with PWP, let the learning rates be chosen such that $\eta \leq \frac{1}{10I_{max}L}$, $\eta_c\eta \leq \frac{1}{10GL}$, and $\eta_g\eta_c\eta \leq \frac{1}{GL}$, then the sequence of outputs {\rm $\{ \overline{\textbf{x}}^t \}$} generated by \cref{alg:HFL} satisfies
{\rm
\begin{align}
	& \min_{t\in[T]} \mathbb{E} \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 \leq \frac{f_0 - f_*}{c\eta_g\eta_c\eta G T} + \frac{1}{c} \big(\Phi_1 + \Phi_2 + \Phi_3 \big), \notag
\end{align}
}
where $c$ is a constant, {\rm $f_0\triangleq f(\overline{\textbf{x}}^0),\; f_*\triangleq f(\overline{\textbf{x}}^*)$}, and for both sampling strategies
{\rm
\begin{align}
    & \Phi_1 = 9 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 8 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 \notag\\
    & + 75 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2\;, \;\;\; \Phi_2 = \frac{1}{2} L\eta_g\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} \sigma^2 . \notag
\end{align}
}
For strategy 1 (with replacement), let learning rates additionally satisfy $40 G^2\eta_c^2\eta^2 L^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i < \frac{1}{2}$, it then holds that
\begin{align}
    & \Phi_3 = \frac{3}{4} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \frac{15}{2} \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 \notag\\
    & + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2. \notag
\end{align}
For strategy 2 (without replacement), let learning rates additionally satisfy $40 G^2\eta_c^2\eta^2 L^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i < \frac{1}{2}$, where $\alpha_i = \frac{m_i-n_i}{m_i-1}$, it then holds that
\begin{align}
    & \Phi_3 = \frac{3}{4} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 + \frac{15}{2} \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 \notag\\
    & + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2. \notag
\end{align}
\end{theorem} 

\begin{proof}
Please refer to \cref{app:proof-them2}. Regarding PWP, the core technique we use here is uncertainty redirection. In HFL, as the worker sampling occurs at each cluster round, the resulting uncertainty within a single master round accumulates across both multiple clusters and multiple rounds of a certain cluster. We creatively derive \cref{lem:A2-cr-decompose} to transform the corresponding accumulated uncertainty term $A_2$ in Eq. \ref{eqn:A2-partial}. We first decompose the impact of PWP among different clusters (inter-cluster) and then further decompose among different cluster rounds (intra-cluster). All these are carefully conducted via equality substitution. 
% **Lemma 6** redirects the randomness from overall master round to each cluster round, restricting PWP to only interact with $I_i$, thereby guaranteeing the desired weakening effect of PWP. 
% This uncertainty redirection is peculiar and indispensable in HFL setting. While as a contrast in FL with PWP, we can directly analyze similar $A_2$ term w.r.t. the specific sampling strategy due to the absence of accumulation effect.
Besides, WC-MSE and CM-MSE shall both have their partial versions (not just reused from the full case). We thus thoroughly renew our deduction for the partial case, consolidate it with the bounds for the full case, and present the results in \cref{lem:cm-MSE,lem:wc-MSE-indep}. This further induces an additional variance term $\Psi_i^{t,\tau}$. We decompose it w.r.t. sampling strategy (Eqs.~\ref{eq:Psi-st1} and~\ref{eqn:Psi-st2}) and merge it into some resulting terms of \cref{lem:A2-cr-decompose}, finally deriving the compact convergence bound.\footnote{In FL with PWP, the analysis is much more explicit due to the absence of intermediate cluster aggregation (thus there is no accumulation effect of the $A_2$ term or additional randomness left in the MSE term).}
\end{proof}

\begin{remark}
The convergence bound in \cref{them:hfl-partial} shows certain consistency with that of \cref{them:hfl-full}, as they share the same constant component $\Phi_1$. The uncertainty introduced by PWP contributes to amplifying the constant $\Phi_2$ and incurring an additional $\Phi_3$. In particular, for sampling strategy 2 with sampling size $n_i=m_i,\forall i\in [M]$, \cref{them:hfl-partial} would, as expected, recover exactly the same convergence bound as the FWP case.
\end{remark} 

% The other component $\mathcal{C}_2$ covers the amplified effect of $\sigma^2$ and $\epsilon_i^2$, accounting for the additional uncertainty introduced by PWP. Note that for $\mathcal{C}_2$, the two sampling strategies share the same first term $\frac{\eta L}{m^2} \sum_{i=1}^{M}  \frac{m_i^2}{n_i} \sigma^2$, while only differ in the second term $\Phi$.

\begin{corollary}[Minimum Convergence Rate]
\label{cor:min-conv-rate}
Let $\eta = \frac{1}{\sqrt{T}GL}$, $\eta_c=\mathcal{O}(1)$, and $\eta_g = \sqrt{Gm\kappa_{min}}$, where $\kappa_{min}={\rm min}_i\frac{n_i}{m_i}$. The minimum convergence rate of the HFL with PWP in \cref{alg:HFL} for both sampling strategies is
{\rm
    % \min_{t\in [T]} \mathbb E[||\nabla f(\overline{\textbf{x}}^t)||^2] = 
    $\mathcal{O}(\frac{I_{max}}{\sqrt{m\kappa_{min}GT}} + \frac{1}{T})$.
}
\end{corollary} 

\begin{remark}
HFL with PWP can at least achieve a linear speedup $\mathcal{O}(\frac{I_{max}}{\sqrt{m\kappa_{min}GT}})$ with proper learning rate settings as shown in \cref{cor:min-conv-rate}. The minimum convergence rate of the HFL system is bottlenecked by the minimal sampling rate $\kappa_{min}$ and the maximal cluster period $I_{max}$.
\end{remark}

Note that the minimum convergence rate mentioned above is just the loosest estimation. In the following, we will show a more typical case.

\begin{corollary}
\label{cor:com-conv-rate}
Suppose $\frac{n_i}{m_i} = \frac{n}{m}, I_i=I, \forall i\in[M]$, and let $\eta = \frac{1}{\sqrt{T}GL}$, $\eta_c=\mathcal{O}(1)$, and $\eta_g = \sqrt{Gn}$. The convergence rate of the hierarchical FL with PWP in \cref{alg:HFL} for both sampling strategies is
{\rm
    % \min_{t\in [T]} \mathbb E[||\nabla f(\overline{\textbf{x}}^t)||^2] = 
$\mathcal{O}(\frac{I}{\sqrt{nGT}} + \frac{1}{T})$.
}
\end{corollary} 

\begin{remark}
When all clusters have the same (or close) sampling rates and round periods, a linear speedup $\mathcal{O}(\frac{I}{\sqrt{nGT}})$ can be guaranteed by setting the learning rates as shown in \cref{cor:com-conv-rate}. Note that the convergence rate here has a smaller first term than that of standard FL with PWP, which is $\mathcal{O}(\frac{\sqrt{G}}{\sqrt{nT}} + \frac{1}{T})$ (Corollary 2 from \cite{yang2020achieving}). This indicates the additional benefit of PWP on HFL. However, this homogeneous setting\footnote{A similar setting is also considered in \cite{qu2022convergence} for multi-server federated learning (but non-hierarchical), which they refer to as unbiased PWP. However, here we do not exclusively emphasize this, since it could be inherently covered by our theoretical results.} of HFL may not necessarily lead to the optimal convergence rate, which depends on the specific worker-cluster divergence $\epsilon_i^2$. We will subsequently discuss this further.
\end{remark}

\begin{remark}
The convergence rate bound for HFL with PWP has the same structure (in order sense) as the full case, but with a larger variance. This is consistent with the results for standard FL \citep{yang2020achieving}. Uniform sampling (with/without replacement) yields a good approximation of the entire intra-group worker distribution in expectation, thereby reducing the risk of distribution deviation incurred by PWP.
\end{remark} 

\begin{table*}[!h]
\centering
\caption{Communication time (s) and iterations ($\times 10^4$) to achieve target test accuracy with $20\%$ workers participating.}
\label{tab:3dataset}
\small{\begin{tabular}{c|c|ccc|ccc}
\hline
\multicolumn{1}{c}{} & & \multicolumn{3}{c|}{Standard FL ($P$)} & \multicolumn{3}{c}{HFL ($G,I$)} \\\hline\hline
 \multirow{3}{*}{MNIST} & Setting & 10 & 50 & 100 & \;50, 10 & \;100, 10 & \;100, 50 \\\cline{2-8}
 & Communication Time & 15.95 & 11.48 & 9.91 & 1.5 & 1.54 & 1.09 \\
 & Iterations ($\times 10^4$) & 1.46 & 5.27 & 9.09 & 1.38 & 1.4 & 5.15 \\\hline\hline
\multirow{3}{*}{FEMNIST} & Setting & 20 & 100 & 200 & \;100, 20 & \;200, 20 & \;200, 100 \\\cline{2-8}
 & Communication Time & 8.92 & 3.03 & 2.78 & 0.93 & 1.01 & 0.34 \\
 & Iterations ($\times 10^4$) & 0.73 & 1.24 & 2.24 & 0.77 & 0.82 & 1.3 \\\hline\hline
\multirow{3}{*}{CIFAR-10} & Setting & 10 & 50 & 250 & \;50, 10 & \;250, 10 & \;250, 50 \\\cline{2-8}
 & Communication Time & 399.77 & 113.03 & 54.27 & 49.29 & 50.57 & 18.79 \\
 & Iterations ($\times 10^4$) & 1.24 & 1.76 & 4.23 & 1.54 & 1.58 & 2.93 \\\hline
\end{tabular}}
\end{table*}

\subsection{Overcome Divergence}
\label{subsec:overcome-divergence}
We now exclusively elaborate how the weakening effect for overcoming divergences in HFL, as shown in FWP case (\cref{rem:div-full}), can be enhanced with PWP. Specifically, we focus on the additional divergences resulting from PWP (covered by component $\Phi_3$) in HFL, denoted as $\Theta_H$. For strategy 1, we have
\begin{equation}
    \Theta_H = \mathcal{O}(\eta_g\eta_c\eta\sum_{i=1}^M \frac{m_i^2}{m^2n_i}I_i (\epsilon_i^2 + \epsilon^2)), \notag
\end{equation}
and for strategy 2, we have
\begin{equation}
    \Theta_H = \mathcal{O}(\eta_g\eta_c\eta\sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i}I_i (\epsilon_i^2 + \epsilon^2)). \notag
\end{equation} 
For standard FL, Theorem 2 of \cite{yang2020achieving} indicates the additional divergence part resulting from partial worker participation. For sampling strategy 1, we have
\begin{equation}
    \Theta_S = \mathcal{O}(\frac{1}{n}\eta_g\eta G\tilde{\epsilon}^2),
\end{equation}
and for sampling strategy 2, we have
\begin{equation}
    \Theta_S = \mathcal{O}(\frac{\alpha}{n}\eta_g\eta G\tilde{\epsilon}^2),
\end{equation}
where $\alpha = \frac{m-n}{m-1}$. Here we merge their quadratic term for better analysis.

% For FWP case, the overall divergences of HFL are $\mathcal{O}(\eta^2 G^2\epsilon^2 + \eta^2 \sum_{i=1}^{M}\frac{m_i}{m}I_i^2\epsilon_i^2)$ (Remark 2 of \cite{wang2022demystifying}), originating from the constant component $\mathcal{C}_1$. The corresponding divergence part in standard FL is $\mathcal{O}(\eta^2 G^2\tilde{\epsilon}^2) = \mathcal{O}(\eta^2 G^2\epsilon^2 + \eta^2 G^2 \sum_{i=1}^M \frac{m_i}{m} \epsilon_i^2)$. This indicates that local aggregation of HFL can weaken the impacts of the worker-cluster part of the global divergence since $G\geq I_i, \forall i\in [M]$. We would subsequently show a significant merit that the above weakening effect could even be enhanced in PWP case.

% % 这里补充一段 samling 没有产生 更多的 upward divergence
% With PWP, the overall divergences could be written as $\mathcal{O}(\eta^2 G^2\epsilon^2 + \eta^2 \sum_{i=1}^{M}\frac{m_i}{m}I_i^2\epsilon_i^2 + \Psi_H)$.
% For sampling strategy 1, it holds that
% \begin{equation}
%     \Psi_H \triangleq \eta\sum_{i=1}^{M} \frac{m_i(m_i-1)}{m^2n_i}\epsilon_i^2 + \eta^3\sum_{i=1}^{M} \frac{m_i(m_i-1)}{m^2n_i}I_i^2\epsilon_i^2. \notag
% \end{equation}
% For sampling strategy 2, it holds that
% \begin{equation}
%     \Psi_H \triangleq \eta\sum_{i=1}^{M} \frac{m_i(m_i-n_i)}{m^2n_i}\epsilon_i^2 + \eta^3\sum_{i=1}^{M} \frac{m_i(m_i-n_i)}{m^2n_i}I_i^2\epsilon_i^2. \notag
% \end{equation}
% For standard FL with PWP, Theorem 2 of \cite{yang2020achieving} indicates that the corresponding divergence part is $\mathcal{O}(\eta^2 G^2\tilde{\epsilon}^2 + \Psi_L)$\footnote{Their original result is w.r.t. communication round $t$ (rather than local iteration $t$), so here we rescale their additional term by dividing $G$ to get $\Psi_L$ for a more fair comparison.}, where for sampling strategy 1, 
% \begin{equation}
%     \Psi_L \triangleq \frac{1}{n}\eta\tilde{\epsilon}^2 + \frac{1}{n}\eta^3G^2\tilde{\epsilon}^2, \label{eq:fl-add-div-1}
% \end{equation}
% and for sampling strategy 2, 
% \begin{equation}
%     \Psi_L \triangleq \frac{m-n}{n(m-1)}\eta\tilde{\epsilon}^2 + \frac{m-n}{n(m-1)}\eta^3G^2\tilde{\epsilon}^2. \label{eq:fl-add-div-2}
% \end{equation}
% Let the invariant parts be freely set aside since we have already discussed them before this remark. We focus on inspecting the additional divergences $\Psi_H$ and $\Psi_L$, resulting from PWP.

% Note that $\Theta_H$ is irrelevant to $\epsilon^2$ and $G$, while $\chi_L$ is. This implies that sampling within a cluster in HFL does not incur any amplified impact in cluster-master divergence. As a contrast, in standard FL, this is not the case since sampling would easily take place across (virtual) groups, thereby intensifying divergence in sense of inter-group level.


With PWP, the weakening effect is observed for both the worker-cluster and cluster-master parts. Taking strategy 1 as an example, when $\frac{n_i}{m_i} = \frac{n}{m}, I_i=I, \forall i\in[M]$ and $\eta_c = \mathcal{O}(1)$, it always holds in the asymptotic sense that
\begin{align}
    & \Theta_H = \mathcal{O}(\eta_g\eta \sum_{i=1}^{M} \frac{m_i}{mn} I(\epsilon_i^2 + \epsilon^2)) \notag\\
    & = \mathcal{O}(\frac{1}{n}\eta_g\eta I \tilde{\epsilon}^2) < \mathcal{O}(\frac{1}{n}\eta_g\eta G \tilde{\epsilon}^2) = \Theta_S. \notag
\end{align}
The same holds for strategy 2. Compared to standard FL, this suggests that setting the same (or close) sampling rates for all clusters can consistently reduce additional divergences. As we discussed in \cref{rem:div-full}, in HFL with FWP this weakening effect is observed only on worker-cluster divergences. In fact, the bound $\mathcal{O}(\frac{1}{n}\eta_g\eta I \tilde{\epsilon}^2)$ indicates that this weakening effect can always restrict the global divergences $\tilde{\epsilon}^2$ to only being intensified by $I$ rather than $G$, regardless of the specific grouping setting (i.e., grouping-agnostic).
% Due to local aggregation, worker sampling within a cluster in HFL can restrict additional cluster-master divergence to only being intensified by $I_i$ rather than $G$.
Therefore, PWP can probably guarantee the performance of HFL with $G, I$ to be close to that of standard FL with aggregation period $I$. We empirically verify this superiority of HFL with PWP from our experiments in \cref{sec:exp}.
% enabling PWP could always be a beneficial choice for HFL. In particular, when in practice certain data heterogeneity
% exists among groups, the superiority of PWP would be more explicit.

Though $\frac{n_i}{m_i} = \frac{n}{m}, \forall i\in[M]$ may not be the optimal solution, it can serve a~priori as a sufficient condition for reducing additional divergences. Note that we can flexibly set $\frac{n_i}{m_i}$ to match $I_i$ and $\epsilon_i$ accordingly for a better weakening effect. For instance, in practice, some clusters may inherently have larger inner divergence. In this case, enabling a larger sampling size and a smaller round period (if possible) could probably ensure a more effective convergence.
% 可根据实际的通信资源、用户状态等情况，灵活调整 Ii, ni 来更好地适配 不同 \epsilon_i，形成 individualized group SGD，进一步 收缩bound

% Besides, from \cref{them:h-sgd}, when degenerating to standard FL (i.e., $M=1$, $G=I_i$, and $\epsilon_i^2=\tilde{\epsilon}^2$), the additional divergence becomes $\mathcal{O}(\Psi_H) = \mathcal{O}(\frac{m-1}{mn} (\eta + \eta^3 G^2)\tilde{\epsilon}^2)$ for strategy 1 and $\mathcal{O}(\Psi_H) = \mathcal{O}(\frac{m-n}{mn} (\eta + \eta^3 G^2)\tilde{\epsilon}^2)$ for strategy 2, respectively. We note that there is an additional coefficient $1-\frac{1}{m}$ for both strategies, comparing with the corresponding terms of \cite{yang2020achieving}, i.e., Eq. \ref{eq:fl-add-div-1} and \ref{eq:fl-add-div-2}. This coefficient could potentially make our bound even tighter.

% To summarize, PWP benefits HFL better than standard FL. This is in terms of that it would not only help weaken the impact of worker-cluster divergences, but also never cause any additional cluster-master ones. We validate the above analysis through our empirical experiments.

\section{Numerical Experiments}
\label{sec:exp}
We conduct extensive experiments to validate our theoretical results. We defer some results to \cref{app:add-res}. All reported results are averaged over five random realizations.

\begin{table}[!h]
\centering
\caption{Per-round communication time between worker and cluster.}
\label{tab:RTT}
\small{\begin{tabular}{c|ccc}
\hline
Model & \makecell[c]{CNN\\(MNIST)} & \makecell[c]{CNN\\(FEMNIST)} & PreAct ResNet-18 \\\hline
RTT (ms) & 1.09$\pm$0.17 & 2.45$\pm$0.56 &  32.11$\pm$6.32 \\\hline
\end{tabular}}
\end{table}

\subsection{Dataset}
In our experiments, we choose three real datasets: MNIST, FEMNIST, and CIFAR-10. We partition the three datasets in a non-i.i.d. manner, with details as follows:
\begin{itemize}
    \item \textbf{MNIST}. The MNIST dataset \cite{lecun1998gradient} consists of images of handwritten digits 0--9, with 60,000 training samples and 10,000 test samples. We distribute the training data to $m=100$ workers uniformly. We restrict each worker to have training samples of no more than 2 classes of digits, to provide certain data heterogeneity.
    \item \textbf{FEMNIST}. FEMNIST is a federated version of the EMNIST dataset proposed by LEAF \cite{caldas2018leaf}. We follow the non-i.i.d. preprocessing protocol of \cite{wang2022demystifying}, where the training set consists of 34,659 samples distributed to $m=156$ workers and the test set consists of 4,973 samples. 
    \item \textbf{CIFAR-10}. The CIFAR-10 dataset \cite{krizhevsky2009learning} consists of $32\times32$ color images in 10 classes, with 50,000 training samples and 10,000 test samples. As with MNIST, we distribute the training data to $m=100$ workers uniformly and restrict each worker to have training samples of no more than 4 classes, to provide certain data heterogeneity.
\end{itemize}

For the group non-i.i.d. setting, we restrict each group to have worker training samples of no more than 4 classes on MNIST, while no more than 6 classes on CIFAR-10.

\subsection{Implementation Details}
We use Python 3.7 with PyTorch 1.8.1 to implement all our models and the HFL algorithm\footnote{Our code is available at \url{https://github.com/cardistryj/HFL}.}. We set the learning rates $\eta = 0.01$, $\eta_c = 1$ and $\eta_g=1$. The local SGD mini-batch size of each worker is set to 20.

We use CNNs for both MNIST and FEMNIST datasets. Specifically, for MNIST, we use a CNN model composed of two convolutional layers and two fully connected layers. For FEMNIST, we use the same architecture as \cite{wang2022demystifying}. For CIFAR-10, we use PreAct ResNet-18 \cite{he2016identity}.

For the communication time, we follow the emulation of \cite{wang2022demystifying}. Specifically, we measure the round-trip time (RTT) of transmitting the model between an end device (worker) and a nearby server (cluster). Due to resource limitations, we simply assume the worker-master RTT is ten times the worker-cluster one (which basically matches \cite{wang2022demystifying,liu2020client}). The estimated worker-cluster RTT is presented in \cref{tab:RTT}.

\begin{figure*}[!h]
\centering
\subfigure[]{
\centering
\includegraphics[width=1.55in]{figs/case1}
\label{fg:giid-full}
}
\subfigure[]{
\centering
\includegraphics[width=1.55in]{figs/case1_p}
\label{fg:giid-partial}
}
\subfigure[]{
\centering
\includegraphics[width=1.55in]{figs/case3}
\label{fg:gniid-full}
}
\subfigure[]{
\centering
\includegraphics[width=1.55in]{figs/case3_p}
\label{fg:gniid-partial}
}
\caption{Test Accuracy w.r.t. iterations on MNIST. (a) Group i.i.d. with full participation; (b) Group i.i.d. with partial participation; (c) Group non-i.i.d. with full participation; (d) Group non-i.i.d. with partial participation.}
\label{fg:mnist}
\end{figure*}

\subsection{Communication Overhead}
Table \ref{tab:3dataset} presents a comparison between standard FL and HFL on three datasets, all with PWP. $P$ stands for the aggregation period of standard FL. The target test accuracies are $95\%$ for MNIST, $80\%$ for FEMNIST, and $85\%$ for CIFAR-10. By default, we uniformly partition workers into 4 groups.
% The communication time is emulated by measuring the round-trip time of transmitting the model between a device (in a home) and near (i.e., cluster) / far (i.e., master) Amazon EC2 instances \cite{wang2022demystifying}.

We observe that HFL can benefit the training process in terms of reducing communication overhead. HFL basically shows a similar convergence performance to its standard FL counterpart with $P=I$, even when $G$ is large. In particular, on FEMNIST, the total number of iterations ($\times 10^4$) of standard FL for $P=20$ is $0.73$, only slightly less than $0.77$ and $0.82$, from HFL with $G=100, I=20$ and $G=200, I=20$, respectively. Note that on MNIST, HFL is even better. On the other hand, HFL requires much less communication time (about only one-tenth) to achieve certain target test accuracy, due to the ultra-low communication latency granted by parallel local aggregations. 
% As a contrast, the global aggregation of standard FL is much more time-consuming, consequently incurring large communication overhead.

\subsection{Weakening Effect}
Fig.~\ref{fg:mnist} shows the convergence curves in terms of test accuracy on MNIST. Group i.i.d. stands for a grouping with small cluster-master divergence, i.e., $\epsilon^2 \approx 0$, while group non-i.i.d. stands for a setting with large $\epsilon^2$.
% Curves with $P$ (refers to the aggregation period) are for standard FL, and curves with $G$ and $I$ are for HFL. 
For PWP, we always keep $20\%$ sampling rate. 
% For better comparison among heterogeneous $P$ and $G$, we depict upon iterations rather than rounds.

% 先说 weakening effect of worker-cluster div ? 再谈 cluster-master 的
We first focus on FWP. From Fig. \ref{fg:gniid-full}, we observe that the convergence performance of HFL with $G$ and $I$ is between that of standard FL with $P = I$ and with $P = G$ (also referred to as ``sandwich'' behavior from \cite{wang2022demystifying}). This matches the weakening effect of local aggregation on worker-cluster divergences as we discussed in \cref{rem:div-full}. Note that the performance gap between HFL and its standard FL counterpart with $P=I$ originates from the amplified impact of $\epsilon^2$ by $G$ as shown in our \cref{them:hfl-full} ($\sigma^2$ is also negligible compared to non-i.i.d. divergences). To verify this, we refer to the corresponding curves in Fig. \ref{fg:giid-full}, observing almost the same convergence trends (e.g., $G=50, I=10$ and $G=100, I=10$ versus $P=10$). Essentially, it is the group i.i.d. setting with $\epsilon^2 \approx 0$ that makes the impact of $G$ trivial and eliminates the aforementioned performance gap.

With the insights above, we next check on PWP. For the group i.i.d. setting in Fig. \ref{fg:giid-partial}, the curve patterns show high consistency with those in Fig. \ref{fg:giid-full}, where there is also no performance gap between HFL and its standard FL counterpart with $P=I$. Still, PWP introduces additional randomness, resulting in zigzagging curves and slower convergence. For the group non-i.i.d. setting, the results are more significant. The curve patterns in Fig. \ref{fg:gniid-partial} instead resemble those in Fig. \ref{fg:giid-partial} (while unlike those in Fig. \ref{fg:gniid-full}), i.e., without noticeable performance gap. This exactly matches the enhanced weakening effect with PWP on global divergences (especially the cluster-master part) as we discussed in \cref{subsec:overcome-divergence}. Intuitively, PWP pushes the convergence behavior of HFL with $G, I$ to the optimal upper boundary of the ``sandwich'' (i.e., standard FL with $P=I$). Therefore, it could always be a beneficial choice for HFL to enable PWP.
% Compared to FWP case, the additional constant $\Phi_3$ introduced by PWP (\cref{them:hfl-partial}) dominates the training process, rendering the convergence curves strikingly fluctuating. Therefore, with HFL restricting additional cluster-master divergence to only being intensified by $I_i$, the aforementioned performance gap can be potentially reduced.

\subsection{Comparison with Other Methods}
We conduct comparison experiments on MNIST dataset to justify the effectiveness of our proposed three-sided learning rates HFL algorithm. We use the group non-i.i.d. setting with 4 groups and 20$\%$ workers participating. The round periods are set to $G=100, I=10$.

\begin{table*}[!h]
\centering
\caption{Comparison among different methods to achieve target accuracy on MNIST.}
\label{tab:comparison}
\small{\begin{tabular}{c|cc}
\hline
 & Iterations ($\times 10^4$) & Communication Time  \\\hline
Hier-Local-QSGD & 3.57 & 1.32 \\
HierMo & \underline{0.24} & 0.47 \\
MLL-SGD (fully-connected) & 2.23 & 2.43 \\
MLL-SGD (ring) & 3.1 & 3.38 \\\hline
HierFedAvg & 2.22 & 2.42 \\
Ours ($\eta_c = 1, \eta_g = 2$) & 1.95 & 2.13 \\
Ours ($\eta_c = 1, \eta_g = 3$) & 1.68 & 1.83 \\
Ours ($\eta_c = 3, \eta_g = 1$) & 0.29 & \underline{0.32} \\
Ours ($\eta_c = 5, \eta_g = 1$) & \textbf{0.22} & \textbf{0.24} \\
Ours ($\eta_c = 3, \eta_g = 2$) & 0.57 & 0.62 \\
\hline
\end{tabular}}
\end{table*}

\begin{table*}[!h]
\centering
\caption{Impact of cluster period and sampling number to achieve target test accuracy on MNIST.}
\label{tab:ie}
\small{\begin{tabular}{c|ccccc}
\hline
 Cluster periods ($I_1$, $I_2$) & 10, 200 & 20, 100 & 50, 50 & 100, 20 & 200, 10 \\\hline
Communication Time & 1.18 & 0.68 & 0.42 & 0.64 & 0.93 \\
Master Rounds & 46 & 47 & 47 & 44 & 37 \\\hline\hline
Sampling Number ($n_1$, $n_2$) & 1, 19 & 6, 14 & 10, 10 & 14, 6 & 19, 1 \\\hline
Communication Time & 0.46 & 0.42 & 0.46 & 0.56 & 1.40 \\
Master Rounds & 52 & 47 & 51 & 62 & 157 \\\hline
\end{tabular}}
\end{table*}

We mainly consider the following three works:
\begin{itemize}
    \item \textbf{Hier-Local-QSGD} \citep{liu2022hierarchical}: An HFL optimization algorithm with model quantization to reduce communication overhead. We conduct quantization by converting the weights from fp32 into int8. RTT for the quantized model is 0.37$\pm$0.1ms.
    \item \textbf{HierMo} \citep{yang2023hierarchical}: An HFL optimization algorithm with momentum update to accelerate the convergence of HFL. We use the provided setting in the original paper. Due to the extra transmission of momentum, RTT for HierMo is 1.96$\pm$0.3ms.
    \item \textbf{MLL-SGD} \citep{castiglia2020multi}: A partially decentralized FL algorithm, where it is still a two-level architecture while clusters are organized as a peer-to-peer network. We consider both fully-connected topology and ring topology for the cluster network.
\end{itemize}
We also adjust cluster learning rate $\eta_c$ and master learning rate $\eta_g$ of our algorithm. The configuration with $\eta_c=1$ and $\eta_g=1$ can also be considered as a natural generalization of FedAvg in HFL (referred to as HierFedAvg). We maintain the same worker learning rate $\eta=0.01$ for all the methods mentioned above.

\cref{tab:comparison} presents the performance among different methods. We highlight the best results in bold style, while the second best with underline.

We can observe the effectiveness of our three-sided learning rates. Tuning $\eta_c$ and $\eta_g$ accelerates convergence to varying degrees. The speedup effect of $\eta_c$ is particularly pronounced. Simply setting $\eta_c$ to 3 or 5 reduces the iterations required to achieve the target accuracy from the original $2.22\times 10^4$ of HierFedAvg to fewer than 3000, indicating a nearly tenfold acceleration.
The best hyperparameter combinations require a more refined tuning and searching process. However, we can still observe the significant potential of adjusting $\eta_c$ and $\eta_g$ to facilitate convergence at no additional information or communication cost.

HierMo also performs well, achieving target accuracy with the second fewest iterations. However, this acceleration comes at the cost of more communication overhead due to the momentum update. In contrast, Hier-Local-QSGD can directly mitigate the communication overhead by quantizing and compressing model weights. Nevertheless, the introduction of quantization leads to information loss, consequently impeding the convergence performance. For the MLL-SGD algorithm, we observe that its fully-connected variant is essentially equivalent to the HFL architecture. The only difference is that the aggregation among clusters is achieved through peer-to-peer communication rather than through the coordination of a master node. This can be verified by the very close performance between MLL-SGD (fully-connected) and HierFedAvg. MLL-SGD (ring) exhibits a relatively slower convergence rate. This is because the model aggregation among clusters cannot always be fully synchronized.


\subsection{Heterogeneous Groups}
\label{subsec:exp-hetergroup}
\Cref{tab:ie} shows the impact of cluster period and sampling ratio on MNIST. Here we use two groups with 30 and 70 workers, respectively. Group 1 is i.i.d. partitioned among workers (i.e., small $\epsilon_1^2$) while group 2 is non-i.i.d. partitioned (i.e., large $\epsilon_2^2$). The default setting is $G=200$, $(I_1, I_2) = (50, 50)$, and $(n_1, n_2) = (6, 14)$, namely, a homogeneous cluster setting as in \cref{cor:com-conv-rate}. We always keep an invariant $n=n_1+n_2=20$.

Though a small $I_2 = 10$ can save about 10 master rounds, this comes at the cost of more communication time. Note that the impact of the cluster period is not as pronounced as that of the sampling ratio. In the extreme case $n_2 = 1$, the number of master rounds increases to about three times that of the default setting. Hence, choosing proper configurations is important to achieve a fair convergence. Still, the default homogeneous setting could sufficiently serve as a decent solution.

\section{Discussion}
\textbf{Flexible Hierarchical Structure:} For edge-based FL, clients within the communication range of the server collaborate to train a machine learning model. Generally, the location and communication range of edge servers (such as base stations) are fixed. Treating the clients within the communication range of an edge server as one cluster is a natural choice. Therefore, in this paper, we employ given clusters (i.e., assuming an arbitrary grouping), which is practical in real-world scenarios. Sophisticated hierarchical structure design (such as edge server deployment and grouping strategies) is important and worth further research. %, and we would like to leave these as our future works. 

On the other hand, our theoretical framework allows certain flexibility and individualization for HFL. This includes the master round $G$ for the whole learning system, different worker number $m_i$, sampling number $n_i$, and cluster period $I_i$ for each cluster. Still, different clusters may possess inherent characteristics, such as data heterogeneity. We discuss this in subsection~\ref{subsec:overcome-divergence}, where each cluster can adjust its worker sampling ratio and round period accordingly to deal with its inner data heterogeneity. We also verify this with experiments in subsection~\ref{subsec:exp-hetergroup}.

\textbf{More Levels:} The cloud-edge-end architecture is prevalent in edge computing, constituting a two-level architecture. Therefore, our work delves into the convergence performance of two-level HFL. For HFL with more than $2$ levels, we assume that there are $\mathcal{L}$ levels in total. The global server is at the uppermost level $l = 1$. Each upper-level server at level $l = 1,\ldots,\mathcal{L} - 1$ connects to $M_{l}$ lower-level servers at level $l+1$. At the lowest level $l = \mathcal{L}$, each edge server serves $M_{\mathcal{L}}$ clients. Intuitively, we can straightforwardly extend our theoretical results. The weakening effects will still apply to each connected level in multi-level HFL with PWP, aiding in reducing data divergences. The global divergences $\tilde{\epsilon}^2$ are expected to only be intensified by the lowest level aggregation period $I_{\mathcal{L}}$ rather than global period $G$. 
% A rigorous theoretical proof would be part of our future work.

\textbf{Communication Time:} If there is minimal disparity in worker-master RTT and worker-cluster RTT, the communication costs of HFL and standard FL \citep{yang2020achieving} will be nearly identical. On the other hand, a critical factor driving the widespread adoption of HFL is its low latency. By distributing computational tasks across cloud servers, edge servers, and clients, data processing can occur closer to its source, reducing the latency associated with transmitting data back and forth to distant cloud servers.

\section{Conclusion}
In this work, we study the convergence behavior of HFL. We derive a new general convergence bound for HFL that covers both full worker participation and PWP with non-i.i.d. data, non-convex objective functions, and SGD.
Based on the convergence analysis, we develop a three-sided learning rates algorithm to mitigate the data divergence issue and realize better convergence performance.
Our theoretical results provide key insights into why PWP of HFL is beneficial in significantly reducing the data divergences compared to standard FL.
Besides, we provide a degree of individualization for each cluster in HFL, indicating that adjusting the worker sampling ratio and round period to match inner divergence can potentially improve the convergence behavior.
We conduct extensive experiments on real-world datasets to verify our theoretical results.

% \begin{contributions} % will be removed in pdf for initial submission 
% 					  % (without ‘accepted’ option in \documentclass)
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions. 
%     This is a nice way of making clear who did what and to give proper credit.
%     This section is optional.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} % will be removed in pdf for initial submission,
						 % (without ‘accepted’ option in \documentclass)
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This work was supported in part by the National Natural Science Foundation of China (NSFC) under Grant 62306077, the National Key Research and Development Program of China under Grant 2023YFC3305304, and Shanghai Sailing Program under Grant 23YF1402600.
\end{acknowledgements}

% References
\bibliography{ref}

\newpage

\appendix
\onecolumn
\input{supp}

\end{document}
