\section{Notation Table}
\begin{table*}[!h]
\centering
\caption{Key notations for three-sided learning rates HFL algorithm.}
\label{tab:notation}
\begin{tabular}{c|c}
\hline\hline
$\eta$ & worker learning rate \\
$\eta_c$ & cluster learning rate  \\
$\eta_g$ & master learning rate \\\hline
$T$ & number of total master rounds \\
$G$ & master aggregation period \\
$I_i$ & cluster $i$ aggregation period \\
$\omega_i$ & number of cluster rounds for $i$ in a master round \\\hline
$t$ & the index of master round, $0 \leq t < T$ \\
$\tau$ & the index of cluster round, $0 \leq \tau < \omega_i$ \\
$h$ & the index of worker local iteration, $0 \leq h < I_i$\\\hline
$\mathcal{V}_i$ & set of workers in cluster $i$ with size $m_i$ \\
$\mathcal{S}_i^{t,\tau}$ & set of workers in cluster $i$ sampled in master round $t$ and cluster round $\tau$ with size $n_i$ \\\hline
$\overline{\mathbf{x}}^{t}$ & master aggregated parameters before master round $t$ \\
$\overline{\mathbf{x}}_i^{t,\tau}$ & aggregated parameters on cluster $i$ before master round $t$ and cluster round $\tau$ \\
$\mathbf{x}_j^{t,\tau,h}$ & local model parameters on worker $j$ at update step ($t, \tau, h$), where the total number of iterations is $Gt + I_i \tau + h$ \\
\hline
\end{tabular}
\end{table*}

\section{Preliminary of Proof}
% \textbf{Notation.} We denote the accumulated stochastic gradients of a certain worker $j$ in a cluster round $\tau$ as $\tilde{\Delta}_j^{t,\tau} = \sum_{h=0}^{I_i-1}\eta\textbf{g}_j^{t,\tau,h}$. For cluster $i$ we denote the averaged full-participating gradients as $\bar{\Delta}^{t,\tau}_i = \frac{1}{m_i}\sum_{j\in \mathcal{V}_i}\tilde{\Delta}_j^{t,\tau}$, and the partial-participating estimator ${\Delta}^{t,\tau}_i = \frac{1}{n_i}\sum_{j\in \mathcal S_i^{t,\tau}}\tilde{\Delta}_j^{t,\tau}$. Likewise, for a master round $t$, we denote the full-participating estimator as $\bar{\Delta}^{t} = \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c \bar{\Delta}^{t,\tau}_i$, and the partial-participating one $\Delta^{t} = \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c {\Delta}^{t,\tau}_i$.

\begin{lemma}[Unbiased Sampling]
\label{lem:unbias}
For both sampling strategies 1 and 2, the estimator is unbiased in both cluster and master round, i.e.,
\begin{eqnarray*}
	\mathbb E [\Delta^{t,\tau}_i] = \bar{\Delta}^{t,\tau}_i, \forall i\in[M], \;{\rm and}\;\;\mathbb E [\Delta^{t}] = \bar{\Delta}^{t},
\end{eqnarray*} 
where the expectation is taken over the randomness introduced by sampling workers.
\end{lemma}

\begin{proof}
Let $\mathcal S^{t,\tau}_i = \{l_{i,1}, l_{i,2},\ldots,l_{i,n_i}\}$ with size $n_i$. 
For both sampling strategies $1$ and $2$, the sampled workers $l_{i,1},\ldots,l_{i,n_i}$ are identically distributed, each being uniform over $\mathcal{V}_i$.
Therefore, for each cluster $i \in [M]$, we have
\begin{eqnarray}
	\mathbb E [\Delta^{t,\tau}_i] = \frac{1}{n_i}\mathbb E \Big[ \sum_{l_{i,j}\in \mathcal S^{t,\tau}_i}\tilde{\Delta}_{l_{i,j}}^{t, \tau} \Big] = \mathbb E [ \tilde{\Delta}^{t,\tau}_{l_{i,1}} ] = \frac{1}{m_i}\sum_{j\in \mathcal{V}_i}\tilde{\Delta}^{t,\tau}_j = \bar{\Delta}^{t,\tau}_i.
\end{eqnarray}
And we have for the master
\begin{eqnarray}
	\mathbb E [\Delta^{t}] = \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c \mathbb E [\Delta^{t,\tau}_i] = \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c \bar{\Delta}^{t,\tau}_i = \bar{\Delta}^{t}.
\end{eqnarray}
This completes the proof.
\end{proof}
Note that this unbiased sampling property also inherently applies to the full gradient $\nabla F_j(\textbf{x}_j^{t,\tau,h})$. This is guaranteed by taking an additional expectation over the stochastic gradient, which is independent of the worker sampling.

% We denote master round as $t=0,1,\ldots$, cluster round for cluster $i$ as $\tau=0,1,\ldots, \omega_i-1$, where $\omega_i = \frac{G}{I_i}$. We denote the local iteration $h=0,1,\dots, I_i-1$ w.r.t. a cluster round. Unbiased sampling for each master round can be expressed as
% \begin{equation}
%     \Delta^{t} = \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c\sum_{h=0}^{I_i-1} \frac{1}{n_i} \sum_{j\in\mathcal{S}_i^{t,\tau}} \eta \textbf{g}_j^{t,\tau,h}, \;{\rm and}\;\; \bar{\Delta}^{t} = \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \eta \textbf{g}_j^{t,\tau,h},
% \end{equation}
% where this estimator follows unbiased sampling as $\mathbb{E}_{t} \Delta^{t} = \bar{\Delta}^{t}$.

% Note that this unbiased gradient estimator for iteration $t$ is guaranteed by the uniform probability and randomness of $\mathcal{S}_i^t$, whereas irrelevant to when it is exactly realized.
% Lemma 1 describes the iteration level, which could be considered as a fine-grained extension of the round-level version (Lemma 1 of \cite{yang2020achieving}). 

% \subsection{Auxiliary (In)equalities}
% Throughout the proof, we use the following (in)equalities frequently.
% The first is
% \begin{equation}
%     \bigg\Vert \sum_{i=1}^{M} p_i\textbf{x}_i \bigg\Vert^2 \leq \sum_{i=1}^{M} p_i \big\Vert \textbf{x}_i \big\Vert^2, \label{eqn:jensen}
% \end{equation}
% where $0\leq p_i\leq 1$ and $\sum_{i=1}^M p_i=1$, holds due to Jensen's Inequality.
% And the second follows
% \begin{equation}
%     \sum_{i=1}^{M} p_i \big\Vert \textbf{x}_i - \overline{\textbf{x}} \big\Vert^2 = \sum_{i=1}^{M} p_i \big\Vert \textbf{x}_i \big\Vert^2 - \big\Vert \overline{\textbf{x}} \big\Vert^2 \leq \sum_{i=1}^{M} p_i \big\Vert \textbf{x}_i \big\Vert^2, \label{eqn:variance}
% \end{equation}
% where $\overline{\textbf{x}}:=\sum_{i=1}^M p_i\textbf{x}_i$.

\begin{lemma}[Lemma 7 from \cite{reddi2020adaptive}]
\label{lem:ind-0mean}
For independent, zero-mean random variables $z_1, \ldots, z_r$ we have
\begin{equation}
    \mathbb{E} \left[\Vert z_1+\cdots+z_r \Vert^2 \right] = \mathbb{E} \left[\Vert z_1 \Vert^2 + \cdots + \Vert z_r \Vert^2 \right]. \label{eqn:iid-expectation}
\end{equation}
\end{lemma}

% First consider the case where all clusters use the same I.
\section{Proof of Theorem 1}
\label{app:proof-them1}
\begin{proof}
We start by bounding the full participation case, where $\Delta^{t}$ exactly equals $\bar{\Delta}^{t}$. Taking expectation over the randomness of the master round $t$, we have
\begin{align}
	& \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})] = \mathbb E_t f\left(\overline{\textbf{x}}^t- \eta_g \Delta^{t} \right)\notag \\ 
	& \overset{(a)}{\leq} \mathbb{E}_t f(\overline{\textbf{x}}^t)-\mathbb E_t\bigl< \nabla f(\overline{\textbf{x}}^t), \eta_g\Delta^{t} \bigr> + \frac{ L}{2}\mathbb E_t \big\Vert \eta_g \Delta^{t} \big\Vert^2 \notag\\
    & = f(\overline{\textbf{x}}^t) - \eta_g \mathbb E_t \bigl< \nabla f(\overline{\textbf{x}}^t), \Delta^{t} - \eta_c\eta G \nabla f(\overline{\textbf{x}}^t) + \eta_c\eta G \nabla f(\overline{\textbf{x}}^t) \bigr> + \eta_g^2 \frac{ L}{2}\mathbb E_t \big\Vert \Delta^{t} \big\Vert^2 \notag\\
    & = f(\overline{\textbf{x}}^t) - \eta_g \eta_c \eta G \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 - \eta_g \underbrace{\mathbb E_t\bigl< \nabla f(\overline{\textbf{x}}^t), \Delta^{t} - \eta_c\eta G \nabla f(\overline{\textbf{x}}^t)\bigr>}_{A_1} + \eta_g^2 \frac{ L}{2}\underbrace{\mathbb E_t \big\Vert \Delta^{t} \big\Vert^2}_{A_2}
    \label{eqn:A1A2-full},
\end{align}
where $(a)$ follows from the Lipschitz smoothness of $f$.

The inner product term $A_1$ follows
\begin{align}
	& A_1 = \mathbb E_t\biggl<\nabla f(\overline{\textbf{x}}^t), \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \eta \textbf{g}_j^{t,\tau,h} - \eta_c\eta G \nabla f(\overline{\textbf{x}}^t) \biggr>\notag\\ 
	% & \overset{(a)}{=}  \mathbb E\left[\mathbb E\left[\biggl<\nabla f(\overline{\textbf{x}}^t), \sum_{\tau=0}^{\omega}\sum_{h=0}^{I}\sum_{i=1}^{M}\frac{m_i}{m} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \eta \textbf{g}_j^{t,\tau,h} - \eta G \nabla f(\overline{\textbf{x}}^t) \Bigg\vert\{\textbf x^t_j: \forall j\in \mathcal V_i\}\right]\right]\notag\\
	& = \mathbb E_t\biggl<\nabla f(\overline{\textbf{x}}^t), \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \eta \nabla F_j(\textbf{x}_j^{t,\tau,h}) - \eta_c\eta G \nabla f(\overline{\textbf{x}}^t) \biggr>\notag \\
    & = \mathbb E_t\biggl<\sqrt{\eta_c\eta G} \nabla f(\overline{\textbf{x}}^t), \frac{\sqrt{\eta_c\eta}}{\sqrt{G}}  \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \Big(\nabla F_j(\textbf{x}_j^{t,\tau,h}) - \nabla F_j(\overline{\textbf{x}}^t)\Big) \biggr>\notag \\
	& \overset{(a)}{=} - \frac{\eta_c\eta G}{2} \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 - \frac{\eta_c\eta}{2G} \mathbb{E}_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \Big(\nabla F_j(\textbf{x}_j^{t,\tau,h}) - \nabla F_j(\overline{\textbf{x}}^t)\Big) \bigg\Vert^2 \notag\\
    & + \frac{\eta_c\eta}{2G} \mathbb{E}_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i}\nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 , \label{eqn:A1-1}
\end{align}
where $(a)$ is due to the fact that $\langle x, y \rangle = \frac{1}{2}(\Vert x+y \Vert^2 - \Vert x \Vert^2 - \Vert y \Vert^2)$.


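% Here we first expand everything into parameter MSE terms and then expand by group, because in the end we only want to keep the norm term of \nabla f(\overline{\textbf{x}}^t) (we do not want to introduce other locally aggregated terms).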
For the second term of Eq. \ref{eqn:A1-1}, we have
\begin{align}
    & \mathbb{E}_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \Big(\nabla F_j(\textbf{x}_j^{t,\tau,h}) - \nabla F_j(\overline{\textbf{x}}^t)\Big) \bigg\Vert^2 \notag\\
    & \overset{(a)}{\leq} \sum_{i=1}^{M}\frac{m_i}{m} G \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \mathbb{E}_t \bigg\Vert \nabla F_j(\textbf{x}_j^{t,\tau,h}) - \nabla F_j(\overline{\textbf{x}}^t) \bigg\Vert^2 \notag\\
    & =  G \frac{1}{m} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{V}_i} \mathbb{E}_t \bigg\Vert \nabla F_j(\textbf{x}_j^{t,\tau,h}) - \nabla F_j(\overline{\textbf{x}}_i^{t,\tau}) + \nabla F_j(\overline{\textbf{x}}_i^{t,\tau}) - \nabla F_j(\overline{\textbf{x}}^t) \bigg\Vert^2 \notag\\
    & \overset{(b)}{\leq} 2L^2G \frac{1}{m} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{V}_i} \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 + 2L^2G  \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1} I_i \frac{m_i}{m} \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2, \label{eqn:global-MSE-full}
\end{align}
where $(a)$ follows from Jensen's inequality, and $(b)$ is obtained by first applying the inequality $\Vert \sum_{i=1}^k \mathbf{x}_i \Vert^2 \leq k\sum_{i=1}^k \Vert \mathbf{x}_i \Vert^2$ (with $k=2$) and then Assumption \ref{ass:lipschitz}.

In Eq. \ref{eqn:global-MSE-full}, the first term represents the overall worker-cluster parameter MSE while the second one represents the overall cluster-master parameter MSE in each round.

\subsection{Bounding cluster-master parameter MSE}
We prove a lemma that bounds this cluster-master parameter MSE term.

\begin{lemma}[Cluster-master Parameter MSE]
\label{lem:cm-MSE}
For any local learning rate $\eta$ and cluster learning rate $\eta_c$ satisfying $\eta_c\eta \leq \frac{1}{8LG}$, the overall cluster-master parameter MSE of a cluster $i$ in a master round $t$ can be bounded in terms of the worker-cluster parameter MSE as follows. With full worker participation,
{\rm\begin{align}
    & \sum_{\tau=0}^{\omega_i-1}  \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 \leq 5 G\eta_c^2\eta^2 \frac{\omega_i}{m_i} \sigma^2 + 40 \omega_i G^2\eta_c^2\eta^2 \epsilon^2 \notag\\
    & + 40 \omega_i G^2\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 + 24 G\eta_c^2\eta^2L^2 \omega_i \sum_{\tau=0}^{\omega_i-1} \Omega_i^{t,\tau} , \label{eqn:cm-MSE-full}
\end{align}}
where {\rm$\Omega_i^{t,\tau} = \frac{1}{m_i}\sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb{E} \Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \Vert^2$}.

With partial worker participation for both sampling strategies, we have
{\rm\begin{align}
    & \sum_{\tau=0}^{\omega_i-1}  \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2  \leq 5 G\eta_c^2\eta^2 \frac{\omega_i}{n_i} \sigma^2 + 40 \omega_i G^2\eta_c^2\eta^2 \epsilon^2 \notag\\
    & + 40 \omega_i G^2\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 + 24 G\eta_c^2\eta^2L^2 \omega_i \sum_{\tau=0}^{\omega_i-1} \Omega_i^{t,\tau} + 3 \eta_c^2\eta^2 \omega_i \sum_{\tau=0}^{\omega_i-1} \Psi_i^{t,\tau}, \label{eqn:cm-MSE-partial}
\end{align}}
where {\rm $\Psi_i^{t,\tau} = \mathbb{E} \Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau}}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) - \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \Vert^2$}.
\end{lemma}

\begin{proof}
We elaborate the proof for partial worker participation; the result naturally generalizes to the full participation case.
For any round $\tau$ of certain cluster $i$, we have
\begin{align}
    & \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 = \mathbb{E}_t \bigg\Vert \overline{\textbf{x}}_i^{t,\tau-1} - \overline{\textbf{x}}^{t} - \eta_c \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau-1}}\sum_{h=0}^{I_i-1} \eta \textbf{g}_j^{t,\tau-1,h} \bigg\Vert^2 \notag\\
    & = \mathbb{E}_t \bigg\Vert \overline{\textbf{x}}_i^{t,\tau-1} - \overline{\textbf{x}}^{t} - \eta_c \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau-1}}\sum_{h=0}^{I_i-1} \eta \Big( \textbf{g}_j^{t,\tau-1,h} - \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) + \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) \Big) \notag\\
    & + \eta_c \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \eta \Big( \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) - \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) - \nabla f_i(\overline{\textbf{x}}_i^{t,\tau-1}) + \nabla f_i(\overline{\textbf{x}}_i^{t,\tau-1}) \notag\\
    & - \nabla f_i(\overline{\textbf{x}}^{t}) + \nabla f_i(\overline{\textbf{x}}^{t}) - \nabla f(\overline{\textbf{x}}^{t}) + \nabla f(\overline{\textbf{x}}^{t}) \Big) \bigg\Vert^2 \notag\\
    & \overset{(a)}{=} \eta_c^2\eta^2 \mathbb{E}_t \bigg\Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau-1}}\sum_{h=0}^{I_i-1} \Big( \textbf{g}_j^{t,\tau-1,h} - \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) \Big) \bigg\Vert^2 \notag\\
    & + \eta_c^2\eta^2 \mathbb{E}_t \bigg\Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau-1}}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) - \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) \bigg\Vert^2 \notag\\
    & + \mathbb{E}_t \bigg\Vert \overline{\textbf{x}}_i^{t,\tau-1} - \overline{\textbf{x}}^{t} - \eta_c\frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \eta \Big( \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) - \nabla F_j(\overline{\textbf{x}}_i^{t,\tau-1}) \notag\\
    & + \nabla f_i(\overline{\textbf{x}}_i^{t,\tau-1}) - \nabla f_i(\overline{\textbf{x}}^{t}) + \nabla f_i(\overline{\textbf{x}}^{t}) - \nabla f(\overline{\textbf{x}}^{t}) + \nabla f(\overline{\textbf{x}}^{t}) \Big) \bigg\Vert^2 \notag\\
    & \overset{(b)}{\leq} \frac{I_i\eta_c^2\eta^2\sigma^2}{n_i} + \eta_c^2\eta^2 \mathbb{E}_t \bigg\Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau-1}}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) - \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) \bigg\Vert^2 \notag\\
    & + (1 + \frac{1}{2\omega_i-1}) \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau-1} - \overline{\textbf{x}}^{t} \big\Vert^2 + 8\omega_i\eta_c^2\eta^2 \bigg( \mathbb{E}_t \Big\Vert \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \Big( \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) - \nabla F_j(\overline{\textbf{x}}_i^{t,\tau-1}) \Big) \Big\Vert^2 \notag\\
    & + I_i^2 \mathbb{E}_t \Big\Vert \nabla f_i(\overline{\textbf{x}}_i^{t,\tau-1}) - \nabla f_i(\overline{\textbf{x}}^{t}) \Big\Vert^2 + I_i^2 \mathbb{E}_t \Big\Vert \nabla f_i(\overline{\textbf{x}}^{t}) - \nabla f(\overline{\textbf{x}}^{t}) \Big\Vert^2 + I_i^2 \mathbb{E}_t \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \bigg) \notag\\
    & \leq \frac{I_i\eta_c^2\eta^2\sigma^2}{n_i} + \eta_c^2\eta^2 \mathbb{E}_t \bigg\Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau-1}}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) - \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau-1,h}) \bigg\Vert^2 \notag\\
    & + 8G \eta_c^2\eta^2 L^2 \frac{1}{m_i}\sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau-1,h} - \overline{\textbf{x}}_i^{t,\tau-1} \big\Vert^2 + 8 GI_i\eta_c^2\eta^2 \epsilon^2 \notag\\
    & + (1 + \frac{1}{2\omega_i-1} + 8GI_i\eta_c^2\eta^2 L^2) \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau-1} - \overline{\textbf{x}}^{t} \big\Vert^2 + 8GI_i\eta_c^2\eta^2 \big\Vert \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2,
\end{align}
where $(a)$ holds due to the zero mean and independence of the first two terms, and the definition $f_i(\overline{\textbf{x}}_i^{t,\tau}) = \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} F_j(\overline{\textbf{x}}_i^{t,\tau})$ is also used for substitution in the last term. The first term of $(b)$ is obtained via \cref{lem:ind-0mean} and bounded variance (Assumption \ref{ass:bound-var}). The other terms of $(b)$ hold due to the fact that $\Vert x+y \Vert^2 \leq (1 + \frac{1}{k-1}) \Vert x \Vert^2 + k \Vert y \Vert^2, \forall k > 1$ (we set $k = 2\omega_i$ here).

For better presentation, we denote $\Psi_i^{t,\tau} = \mathbb{E} \Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau}}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) - \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \Vert^2$ and $\Omega_i^{t,\tau} = \frac{1}{m_i}\sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb{E} \Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \Vert^2$. Note that $\Psi_i^{t,\tau}$ is essentially a measurement of the additional variance introduced by worker sampling in cluster round $\tau$ of cluster $i$. Given that $\eta_c\eta \leq \frac{1}{8LG}$, we have
\begin{align}
    & \mathbb{E} \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 \leq (1 + \frac{1}{2\omega_i-1} + 8GI_i\eta_c^2\eta^2 L^2) \mathbb{E} \big\Vert \overline{\textbf{x}}_i^{t,\tau-1} - \overline{\textbf{x}}^{t} \big\Vert^2 + \frac{I_i\eta_c^2\eta^2\sigma^2}{n_i} + \eta_c^2\eta^2 \Psi_i^{t,\tau-1} \notag\\
    & + 8 GI_i\eta_c^2\eta^2 \epsilon^2 + 8G\eta_c^2\eta^2 L^2 \Omega_i^{t,\tau-1} + 8GI_i\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \notag\\
    & \leq (1 + \frac{1}{\omega_i-1}) \mathbb{E} \big\Vert \overline{\textbf{x}}_i^{t,\tau-1} - \overline{\textbf{x}}^{t} \big\Vert^2 + \frac{I_i\eta_c^2\eta^2\sigma^2}{n_i} + 8 GI_i\eta_c^2\eta^2 \epsilon^2 + 8GI_i\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \notag\\
    & + \eta_c^2\eta^2 \Psi_i^{t,\tau-1} + 8G\eta_c^2\eta^2 L^2 \Omega_i^{t,\tau-1} .
\end{align}
Unrolling the recursion, we obtain
\begin{align}
    & \mathbb{E} \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 \notag\\
    & \leq \sum_{p=0}^{\tau-1}(1 + \frac{1}{\omega_i-1})^p \bigg[\frac{I_i\eta_c^2\eta^2\sigma^2}{n_i} + 8 GI_i\eta_c^2\eta^2 \epsilon^2 + 8GI_i\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \notag\\
    & + \eta_c^2\eta^2 \Psi_i^{t,\tau-1-p} +  8G\eta_c^2\eta^2 L^2 \Omega_i^{t,\tau-1-p} \bigg] \notag\\
    & \leq (\omega_i - 1)\big[(1+\frac{1}{\omega_i-1})^{\omega_i} - 1\big] \bigg[ \frac{I_i\eta_c^2\eta^2\sigma^2}{n_i} + 8 GI_i\eta_c^2\eta^2 \epsilon^2 + 8GI_i\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \bigg] \notag\\
    & + (1 + \frac{1}{\omega_i-1})^{\omega_i-1} \bigg[ \eta_c^2\eta^2 \sum_{p=0}^{\tau-1} \Psi_i^{t,p} + 8G\eta_c^2\eta^2L^2 \sum_{p=0}^{\tau-1} \Omega_i^{t,p} \bigg] \notag\\
    & \overset{(a)}{\leq} 5\omega_i \bigg[ \frac{I_i\eta_c^2\eta^2\sigma^2}{n_i} + 8 GI_i\eta_c^2\eta^2 \epsilon^2 + 8GI_i\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \bigg] + 3 \eta_c^2\eta^2 \sum_{p=0}^{\tau-1} \Psi_i^{t,p} + 24G\eta_c^2\eta^2L^2 \sum_{p=0}^{\tau-1} \Omega_i^{t,p} \notag\\
    & = \frac{5G\eta_c^2\eta^2\sigma^2}{n_i} + 40 G^2\eta_c^2\eta^2 \epsilon^2 + 40G^2\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 + 3 \eta_c^2\eta^2 \sum_{p=0}^{\tau-1} \Psi_i^{t,p} + 24G\eta_c^2\eta^2L^2 \sum_{p=0}^{\tau-1} \Omega_i^{t,p} ,
\end{align}
where $(a)$ is due to the fact that $(1 + \frac{1}{\omega_i-1})^{\omega_i-1} \leq 3$ and $(1 + \frac{1}{\omega_i-1})^{\omega_i} \leq 5$ for $\omega_i > 1$.

Summing from $\tau=0,\ldots,\omega_i-1$, the overall cluster-master parameter MSE of cluster $i$ in a master round $t$ can be expressed as
\begin{align}
    & \sum_{\tau=0}^{\omega_i-1} \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2
    % & \leq 5 G\eta_c^2\eta^2 \frac{\omega_i}{n_i} \sigma^2 + 40 \omega_i G^2\eta_c^2\eta^2 \epsilon^2 + 40 \omega_i G^2\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 + 24G\eta_c^2\eta^2L^2 \sum_{\tau=0}^{\omega_i-1} \sum_{p=0}^{\tau-1} \Omega_i^{t,p} + 3 \eta_c^2\eta^2 \sum_{\tau=0}^{\omega_i-1} \sum_{p=0}^{\tau-1} \Psi_i^{t,p} \notag\\
    \overset{(a)}{\leq} 5 G\eta_c^2\eta^2 \frac{\omega_i}{n_i} \sigma^2 + 40 \omega_i G^2\eta_c^2\eta^2 \epsilon^2 \notag\\
    & + 40 \omega_i G^2\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 + 24 G\eta_c^2\eta^2L^2 \omega_i \sum_{\tau=0}^{\omega_i-1} \Omega_i^{t,\tau} + 3 \eta_c^2\eta^2 \omega_i \sum_{\tau=0}^{\omega_i-1} \Psi_i^{t,\tau} . \label{eqn:cm-MSE-raw}
\end{align}
where $(a)$ follows from
$\sum_{\tau=0}^{\omega_i-1} \sum_{p=0}^{\tau-1} \Omega_i^{t,p} \leq \sum_{\tau=0}^{\omega_i-1} \sum_{p=0}^{\tau} \Omega_i^{t,p} \leq \omega_i \sum_{\tau=0}^{\omega_i-1} \Omega_i^{t,\tau}$, and the same holds for $\Psi_i^{t,\tau}$. 

Note that in the full participation case, we can set $n_i = m_i$ and $\Psi_i^{t,\tau} = 0, \forall \tau$, and thereby recover the corresponding result in \cref{lem:cm-MSE}.
This concludes the proof.
\end{proof}

\subsection{Bounding worker-cluster parameter MSE}
Then we bound the worker-cluster parameter MSE. Here we consider the intra-cluster aggregation process as a standard FL process, and introduce two lemmas here to bound it.

\begin{lemma}[Generalization of Lemma 3 from \cite{reddi2020adaptive}]
\label{lem:wc-MSE}
For any local learning rate $\eta \leq \frac{1}{8I_iL}$, we have the following result for cluster $i$ in its cluster round $\tau$
{\rm\begin{align}
    & \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 \leq 5I_i\eta^2(\sigma^2 + 10I_i\epsilon_i^2) + 50I_i^2\eta^2 \epsilon^2 \notag\\
    & + 50I_i^2\eta^2 L^2 \mathbb{E}_t \Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \Vert^2 + 50I_i^2\eta^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2. \label{eqn:wc-MSE-full}
\end{align}}
\end{lemma}

\begin{proof}
Our proof is a variant of that of \cite{reddi2020adaptive}, adapted to the HFL setting. For any worker $j$ in cluster $i$ and any local step $h$, we have
\begin{align}
    & \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 = \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h-1} - \overline{\textbf{x}}_i^{t,\tau} - \eta \textbf{g}_j^{t,\tau,h-1} \big\Vert^2 \notag\\
    & = \mathbb{E}_t \bigg\Vert \textbf{x}_j^{t,\tau,h-1} - \overline{\textbf{x}}_i^{t,\tau} - \eta \Big( \textbf{g}_j^{t,\tau,h-1} - \nabla F_j(\textbf{x}_j^{t,\tau,h-1}) + \nabla F_j(\textbf{x}_j^{t,\tau,h-1}) - \nabla F_j(\overline{\textbf{x}}_i^{t,\tau}) \notag\\
    & + \nabla F_j(\overline{\textbf{x}}_i^{t,\tau}) - \nabla f_i(\overline{\textbf{x}}_i^{t,\tau}) + \nabla f_i(\overline{\textbf{x}}_i^{t,\tau}) - \nabla f_i(\overline{\textbf{x}}^{t}) + \nabla f_i(\overline{\textbf{x}}^{t}) - \nabla f(\overline{\textbf{x}}^{t}) + \nabla f(\overline{\textbf{x}}^{t}) \Big) \bigg\Vert^2 \notag\\
    & \overset{(a)}{\leq} (1+\frac{1}{2I_i-1}) \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h-1} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 + \eta^2 \mathbb{E}_t \Big\Vert \textbf{g}_j^{t,\tau,h-1} - \nabla F_j(\textbf{x}_j^{t,\tau,h-1}) \Big\Vert^2 \notag\\
    & + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla F_j(\textbf{x}_j^{t,\tau,h-1}) - \nabla F_j(\overline{\textbf{x}}_i^{t,\tau}) \big\Vert^2 + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla F_j(\overline{\textbf{x}}_i^{t,\tau}) - \nabla f_i(\overline{\textbf{x}}_i^{t,\tau}) \big\Vert^2 \notag\\
    & + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla f_i(\overline{\textbf{x}}_i^{t,\tau}) - \nabla f_i(\overline{\textbf{x}}^{t}) \big\Vert^2 + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla f_i(\overline{\textbf{x}}^{t}) - \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \notag\\
    & \leq (1+\frac{1}{2I_i-1}+10I_i\eta^2L^2) \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h-1} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 + \eta^2 \sigma^2 + 10I_i\eta^2 \epsilon_i^2 \notag\\
    & + 10I_i\eta^2L^2 \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 + 10I_i\eta^2 \epsilon^2 + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \notag\\
    & \leq (1+\frac{1}{I_i-1}) \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h-1} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 + \eta^2 \sigma^2 + 10I_i\eta^2 \epsilon_i^2 + 10I_i\eta^2 \epsilon^2 \notag\\
    & + 10I_i\eta^2L^2 \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 ,
\end{align}
where $(a)$ follows from the same expansion used in the proof of \cref{lem:cm-MSE}, i.e., $\Vert x+y \Vert^2 \leq (1 + \frac{1}{k-1}) \Vert x \Vert^2 + k \Vert y \Vert^2, \forall k > 1$ with $k = 2I_i$.

Unrolling the recursion, we get
\begin{align}
    & \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 \notag\\
    % & \leq \sum_{p=0}^{h-1}(1 + \frac{1}{I_i-1})^p \bigg[ \eta^2 \sigma^2 + 10I_i\eta^2 \epsilon_i^2 + 10I_i\eta^2L^2 \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 + 10I_i\eta^2 \epsilon^2 + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \bigg] \notag\\
    & \leq (I_i - 1)\big[(1+\frac{1}{I_i-1})^{I_i} - 1\big] \bigg[ \eta^2 \sigma^2 + 10I_i\eta^2 \epsilon_i^2 + 10I_i\eta^2 \epsilon^2 \notag\\
    & + 10I_i\eta^2L^2 \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 + 10I_i\eta^2 \mathbb{E}_t \big\Vert \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \bigg] \notag\\
    & \overset{(a)}{\leq} 5I_i\eta^2(\sigma^2 + 10I_i\epsilon_i^2) + 50I_i^2\eta^2 L^2 \mathbb{E}_t \Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \Vert^2 + 50I_i^2\eta^2 \epsilon^2 + 50I_i^2\eta^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2,
\end{align}
where $(a)$ is due to the fact that $(1 + \frac{1}{I_i-1})^{I_i} \leq 5$ for $I_i > 1$. This completes the proof.
\end{proof}

Note that \cref{lem:cm-MSE} and \cref{lem:wc-MSE} indicate that the cluster-master parameter MSE and the worker-cluster parameter MSE can be bounded via each other. We thus combine these two lemmas under proper learning rate conditions to derive a more general lemma for the worker-cluster parameter MSE, which does not depend on the cluster-master one.

\begin{lemma}
\label{lem:wc-MSE-indep}
For any local learning rate $\eta$ and cluster learning rate $\eta_c$ satisfying $\eta_c\eta \leq \frac{1}{10LG}$ and $\eta \leq \frac{1}{10I_{max}L}$, where $I_{max} = \max_i I_i$, we can bound the overall worker-cluster parameter MSE in a master round $t$ with arbitrary positive coefficient $p_i > 0, \forall i\in[M]$ as, with full worker participation,
{\rm
\begin{align}
    & \sum_{i=1}^{M} p_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb{E}_t \Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \Vert^2 \leq 6 G\eta^2 \sum_{i=1}^{M} p_im_i I_i\sigma^2 + 60 G\eta^2 \sum_{i=1}^{M} p_im_i I_i^2\epsilon_i^2 \notag\\
    & + 80 G \eta^2 \sum_{i=1}^{M} p_im_i I_i^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2 + 80 G \eta^2 \sum_{i=1}^{M} p_im_i I_i^2 \epsilon^2 + 3 G^2 \eta_c^2 \eta^2 \sum_{i=1}^M p_i \sigma^2 .
\end{align}
}

With partial worker participation for both sampling strategies, we have
{\rm
\begin{align}
    & \sum_{i=1}^{M} p_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb{E}_t \Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \Vert^2 \leq 6 G\eta^2 \sum_{i=1}^{M} p_im_i I_i\sigma^2 + 60 G\eta^2 \sum_{i=1}^{M} p_im_i I_i^2\epsilon_i^2 \notag\\
    & + 80 G \eta^2 \sum_{i=1}^{M} p_im_i I_i^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2 + 80 G \eta^2 \sum_{i=1}^{M} p_im_i I_i^2 \epsilon^2 \notag\\
    & + 3 G^2 \eta_c^2 \eta^2 \sum_{i=1}^M \frac{p_im_i}{n_i} \sigma^2 + 2 G\eta_c^2\eta^2 \sum_{i=1}^{M} p_im_i \sum_{\tau=0}^{\omega_i-1} \Psi_i^{t,\tau}.
\end{align}
}
where {\rm $\Psi_i^{t,\tau} = \mathbb{E}_t \Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau}}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) - \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \Vert^2$}.
\end{lemma}

\begin{proof}
With \cref{lem:wc-MSE}, we have for arbitrary $p_i > 0, \forall i\in[M]$,
\begin{align}
    & \sum_{i=1}^{M} p_i \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{V}_i} \mathbb{E}_t \big\Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 \notag\\
    % & \leq \sum_{i=1}^{M} p_im_i \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \Big( 5I_i\eta^2(\sigma^2 + 10I_i\epsilon_i^2) + 50I_i^2\eta^2 L^2 \mathbb{E}_t \Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \Vert^2 + 50I_i^2\eta^2 \epsilon^2 + 50I_i^2\eta^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2 \Big) \notag\\
    & \leq 5G\eta^2 \sum_{i=1}^{M} p_im_i I_i\sigma^2 + 50G\eta^2 \sum_{i=1}^{M} p_im_i I_i^2\epsilon_i^2 + 50 G \eta^2 \sum_{i=1}^{M} p_im_i I_i^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2 \notag\\
    & + 50 G \eta^2 \sum_{i=1}^{M} p_im_i I_i^2 \epsilon^2 + 50 \eta^2 L^2 \sum_{i=1}^{M} p_im_i I_i^3 \sum_{\tau=0}^{\omega_i-1} \mathbb{E}_t \Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \Vert^2 . \label{eqn:wc-MSE-cm}
\end{align}

With \cref{lem:cm-MSE}, we have for the last term of Eq. \ref{eqn:wc-MSE-cm} as
\begin{align}
    & 50 \eta^2 L^2 \sum_{i=1}^{M} p_im_i I_i^3 \sum_{\tau=0}^{\omega_i-1} \mathbb{E}_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 \notag\\
    % & \leq 250 G^2\eta_c^2\eta^4 L^2 \sum_{i=1}^M \frac{p_im_i}{n_i} I_i^2 \sigma^2 + 2000 G^3\eta_c^2\eta^4 L^2 \sum_{i=1}^M p_im_i I_i^2 \epsilon^2 + 2000G^3\eta_c^2\eta^4 L^2 \sum_{i=1}^M p_im_i I_i^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \notag\\
    % & + 1200G^2\eta_c^2\eta^4L^4 \sum_{i=1}^{M} p_iI_i^2 \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb{E}_t \Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \Vert^2 + 150G\eta_c^2\eta^4L^2 \sum_{i=1}^{M} p_im_iI_i^2 \sum_{\tau=0}^{\omega_i-1} \Psi_i^{t,\tau} \notag\\
    & \overset{(a)}{\leq} \frac{5}{2} G^2 \eta_c^2 \eta^2 \sum_{i=1}^M \frac{p_im_i}{n_i} \sigma^2 + 20 G\eta^2 \sum_{i=1}^M p_im_i I_i^2 \epsilon^2 + 20 G\eta^2 \sum_{i=1}^M p_im_i I_i^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 \notag\\
    & + \frac{1}{8} \sum_{i=1}^{M} p_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb{E}_t \Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \Vert^2 + \frac{3}{2}G\eta_c^2\eta^2 \sum_{i=1}^{M} p_im_i \sum_{\tau=0}^{\omega_i-1} \Psi_i^{t,\tau}, \label{eqn:cm-wc-rebound}
\end{align}
where in $(a)$ we slightly tighten the condition to $\eta_c\eta \leq \frac{1}{10LG}$ for the middle three terms. We treat the first term differently, using $\eta \leq \frac{1}{10I_{max}L}$, since it can later be merged into a term in $A_2$. For the last term, both conditions are applicable, as we will show when bounding $A_2$, so here we choose the simpler one, i.e., $\eta \leq \frac{1}{10I_{max}L}$. Note that neither of these simplifying conditions incurs any fundamental change in the convergence behavior, since the affected terms can be merged into lower-order terms; the simplification also improves readability.

Substituting Eq. \ref{eqn:cm-wc-rebound} back into Eq. \ref{eqn:wc-MSE-cm}, merging the left hand term, we have
\begin{align}
    & \sum_{i=1}^{M} p_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb{E}_t \Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \Vert^2\notag\\
    & \leq 6 G\eta^2 \sum_{i=1}^{M} p_im_i I_i\sigma^2 + 60 G\eta^2 \sum_{i=1}^{M} p_im_i I_i^2\epsilon_i^2 + 80 G \eta^2 \sum_{i=1}^{M} p_im_i I_i^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2 \notag\\
    & + 80 G \eta^2 \sum_{i=1}^{M} p_im_i I_i^2 \epsilon^2 + 3 G^2 \eta_c^2 \eta^2 \sum_{i=1}^M \frac{p_im_i}{n_i} \sigma^2 + 2 G\eta_c^2\eta^2 \sum_{i=1}^{M} p_im_i \sum_{\tau=0}^{\omega_i-1} \Psi_i^{t,\tau}.\label{eqn:wc-MSE-final}
\end{align}
Similarly, for the full participation case, we set $n_i = m_i, \Psi_i^{t,\tau} = 0, \forall i\in[M], \tau$ and thus recover the corresponding result in \cref{lem:wc-MSE-indep}.
\end{proof}

With Eq.~\ref{eqn:global-MSE-full}, \cref{lem:cm-MSE}, and \cref{lem:wc-MSE-indep} for full participation, and setting $p_i = \frac{1}{m}$ in \cref{lem:wc-MSE-indep}, we rearrange Eq.~\ref{eqn:A1-1} to bound $-A_1$ as
\begin{align}
	& -A_1 \leq \frac{\eta_c\eta G}{2} \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 - \frac{\eta_c\eta}{2G} \mathbb{E}_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i}\nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 \notag\\
    & + L^2\eta_c\eta \bigg( 5 G^2\eta_c^2\eta^2 \frac{M}{m} \sigma^2 + 40 G^3\eta_c^2\eta^2 \epsilon^2 + 40G^3\eta_c^2\eta^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 + \frac{124}{100} \Big( 6 G\eta^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 60 G\eta^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 80 G \eta^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2 + 80 G \eta^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + 3 G^2\eta_c^2\eta^2 \frac{M}{m} \sigma^2 \Big) \bigg) \notag\\
    & \leq \frac{\eta_c\eta G}{2} \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 - \frac{\eta_c\eta}{2G} \mathbb{E}_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i}\nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 \notag\\
    & + 9 G^2\eta_c^3\eta^3 L^2 \frac{M}{m} \sigma^2 + 40 G^3\eta_c^3\eta^3 L^2 \epsilon^2 + 40G^3\eta_c^3\eta^3 L^2 \big\Vert  \nabla f(\overline{\textbf{x}}^{t}) \big\Vert^2 + 8 G\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 75 G\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 G \eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \Vert \nabla f(\overline{\textbf{x}}^t) \Vert^2 + 100 G \eta_c \eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 , \label{eqn:A1-full}
\end{align}
For the term $A_2$, we have
\begin{align}
    & A_2 = \mathbb{E}_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c\sum_{h=0}^{I_i-1} \frac{1}{m_i} \sum_{j\in\mathcal{V}_i} \eta \textbf{g}_j^{t,\tau,h} \bigg\Vert^2 \notag\\
	& \overset{(a)}{=} \eta_c^2\eta^2\mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i}\sum_{j\in\mathcal{V}_i} \left(\textbf{g}_j^{t,\tau,h}-\nabla F_j(\textbf{x}_j^{t,\tau,h})\right) \bigg\Vert^2 \notag\\
    & + \eta_c^2\eta^2\mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i}\sum_{j\in\mathcal{V}_i} \nabla F_j(\textbf{x}_j^{t,\tau,h})\bigg\Vert^2 \notag\\
	&\leq  \frac{G\eta_c^2\eta^2}{m} \sigma^2 + \eta_c^2\eta^2\mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{m_i}\sum_{j\in\mathcal{V}_i} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2, \label{eqn:A2-full}
\end{align}
where $(a)$ follows the fact that $\mathbb E\Vert\textbf{x}\Vert^2 = \mathbb E\Vert\textbf{x}-\mathbb E \textbf{x} \Vert^2 + \Vert\mathbb E \textbf{x}\Vert^2$.

Substituting Eq. \ref{eqn:A1-full} and Eq. \ref{eqn:A2-full} back into Eq. \ref{eqn:A1A2-full} and rearranging the order, we have
\begin{align}
	& \eta_g\eta_c\eta G \Big(\frac{1}{2} - 40 G^2\eta_c^2\eta^2 L^2 - 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \Big) \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2  \notag\\
    & \leq f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})] + \Big(\frac{L\eta_g^2\eta_c^2\eta^2}{2} - \frac{\eta_g\eta_c\eta}{2G}\Big) \mathbb E_t \bigg\Vert \frac{1}{m} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{V}_i} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 \notag\\
    & + \frac{GL\eta_g^2\eta_c^2\eta^2}{2m} \sigma^2 + 9 G^2\eta_g\eta_c^3\eta^3 L^2 \frac{M}{m} \sigma^2 + 40 G^3\eta_g\eta_c^3\eta^3 L^2 \epsilon^2 + 8 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 75 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 G \eta_g \eta_c \eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 \notag\\
    & \overset{(a)}{\leq} f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})] + \frac{GL\eta_g^2\eta_c^2\eta^2}{2m} \sigma^2 + 9 G^2\eta_g\eta_c^3\eta^3 L^2 \frac{M}{m} \sigma^2 + 8 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 75 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 G \eta_g \eta_c \eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + 40 G^3\eta_g\eta_c^3\eta^3 L^2 \epsilon^2,
\end{align}
where $(a)$ follows from $\frac{L\eta_g^2\eta_c^2\eta^2}{2} - \frac{\eta_g\eta_c\eta}{2G} \leq 0$ if $\eta_g\eta_c\eta \leq \frac{1}{GL}$.

Suppose $ 40 G^2\eta_c^2\eta^2 L^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 < \frac{1}{2}$, and there exists a constant $c$ satisfying $(\frac{1}{2} - 40 G^2\eta_c^2\eta^2 L^2 - 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 ) > c > 0$, then we have
\begin{align}
	& \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2  \leq \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t [f(\overline{\textbf{x}}^{t+1})]}{\eta_g\eta_c\eta G c} + \frac{1}{c} \bigg[ \frac{L\eta_g\eta_c\eta}{2m} \sigma^2 + 9 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 8 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 75 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 \bigg], \label{eqn:full-oneStep}
\end{align}
% where to achieve $(a)$, we introduce a constant $c$ such that $\frac{1}{c} \geq \frac{1}{c_1c_2}$. Plus, Eq. \ref{eq:phi} is substituted with the condition $\eta < \frac{1}{8I_iL},\forall i\in[M]$ into $(a)$ for simplifying some higher-order terms (greater than 2) and merging as lower-order ones. Note that though this simplification slightly slackens the bound, it will not incur any fundamental changes in the convergence behavior.
% From Eq. \ref{eqn:full-glr-useless} we clearly see that the global learning rate $\eta_g$ is much less significant than $\eta$ and $\eta_c$, since the vanishing rate of the first term will be bottlenecked by the additional coefficient $\frac{\eta_g}{c_1}$. Thus, we suggest to simply set $\eta_g = 1$.
% \begin{align}
%     & \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 \leq \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})]}{\eta_c\eta G c} + \frac{1}{c} \bigg[ 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + 60\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 10\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
%     & + \frac{L\eta_c\eta}{2m} \sigma^2 + 5 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 30 \eta_c\eta^3 L^3 \sum_{i=1}^{M} \frac{1}{m} I_i^2 \sigma^2 + 150 \eta^4 L^4 \sum_{i=1}^{M} \frac{m_i}{m} I_i^3 \sigma^2 + 900 \eta^4 L^4 \sum_{i=1}^{M} \frac{m_i}{m} I_i^4 \epsilon_i^2 \bigg] \notag\\
% 	& \overset{(a)}{\leq} \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})]}{\eta_c\eta G c} + \frac{1}{c} \bigg[ 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + 75\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 13\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
%     & + \frac{L\eta_c\eta}{m} \sigma^2 + 5 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 \bigg], \label{eqn:full-oneStep}
% \end{align}

Taking the total expectation over the data samples among all workers and averaging over $t=0,1,\ldots,T-1$, we obtain the final result as
\begin{align}
	& \min_{t\in[T]} \mathbb{E} \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 \leq \frac{f_0 - f_*}{c\eta_g\eta_c\eta G T} + \frac{1}{c}(\Phi_1 + \Phi_2), \label{eqn:theorem1}
\end{align}
where 
\begin{align}
    & \Phi_1 = 9 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 8 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 + 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 \notag\\
    & + 75 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2, \;\; \Phi_2 = \frac{L\eta_g\eta_c\eta}{2m} \sigma^2.
\end{align}

% When setting $\eta = \frac{1}{\sqrt{T}GL}$, $\eta_c=1$, and $\eta_g = \sqrt{Gm}$, the convergence rate is $\mathcal{O}(\frac{1}{\sqrt{mGT}} + \frac{1}{T})$.

This completes the proof of \cref{them:hfl-full}.
\end{proof}

\section{Proof of Theorem 2}
\label{app:proof-them2}
\begin{proof}
For the partial participation case, we similarly start with taking expectation over the randomness of the master round $t$ and expanding with Assumption \ref{ass:lipschitz} as
\begin{align}
	& \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})] \leq f(\overline{\textbf{x}}^t) - \eta_g \eta_c \eta G \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 - \eta_g \underbrace{\mathbb E_t\bigl< \nabla f(\overline{\textbf{x}}^t), \Delta^{t} - \eta_c\eta G \nabla f(\overline{\textbf{x}}^t)\bigr>}_{A_1} + \eta_g^2 \frac{ L}{2}\underbrace{\mathbb E_t \big\Vert \Delta^{t} \big\Vert^2}_{A_2}
    \label{eqn:A1A2-partial}.
\end{align}
Due to Lemma 1, for both sampling strategies $A_1$ is exactly equal to that of the full participation case, i.e.,
\begin{align}
	& A_1 = \mathbb E_t\bigl< \nabla f(\overline{\textbf{x}}^t), \Delta^{t} - \eta_c\eta G \nabla f(\overline{\textbf{x}}^t)\bigr> = \mathbb E_t\bigl< \nabla f(\overline{\textbf{x}}^t), \bar{\Delta}^{t} - \eta_c\eta G \nabla f(\overline{\textbf{x}}^t)\bigr>.
\end{align}
Hence, for $A_1$ we have exactly the same bound as in the full participation case, i.e., Eq.~\ref{eqn:A1-full}.
Then we focus on bounding $A_2$.

Let $\theta_j^{t,\tau} = \sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}), j\in\mathcal{V}_i$. For both sampling strategies, we have
\begin{align}
    & A_2 = \mathbb{E}_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\eta_c \sum_{h=0}^{I_i-1} \frac{1}{n_i} \sum_{j\in\mathcal{S}_i^{t,\tau}} \eta \textbf{g}_j^{t,\tau,h} \bigg\Vert^2 \notag\\
	% & \overset{(a)}{=} \eta_c^2\eta^2\mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{n_i} \sum_{j\in\mathcal{S}_i^{t,\tau}} \left(\textbf{g}_j^{t,\tau,h}-\nabla F_j(\textbf{x}_j^{t,\tau,h})\right) \bigg\Vert^2 + \eta_c^2\eta^2\mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{m} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau}} \nabla F_j(\textbf{x}_j^{t,\tau,h})\bigg\Vert^2 \notag\\
	& \overset{(a)}{=} \eta_c^2\eta^2 \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \frac{m_i^2}{m^2}\frac{1}{n_i^2} \mathbb E_t \bigg\Vert\textbf{g}_j^{t,\tau,h}-\nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 + \eta_c^2\eta^2\mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{mn_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 \notag\\
	&\overset{(b)}{\leq} G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \eta_c^2\eta^2\mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{mn_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2, \label{eqn:A2-partial}
\end{align}
where 
% $(a)$ follows the fact that $\mathbb E\Vert\textbf{x}\Vert^2 = \mathbb E\Vert\textbf{x}-\mathbb E \textbf{x} \Vert^2 + \Vert\mathbb E \textbf{x}\Vert^2$ and 
$(a)$ is due to $\mathbb E\Vert\textbf{x}\Vert^2 = \mathbb E\Vert\textbf{x}-\mathbb E \textbf{x} \Vert^2 + \Vert\mathbb E \textbf{x}\Vert^2$ and \cref{lem:ind-0mean}. $(b)$ is due to bounded local variance (Assumption \ref{ass:bound-var}). 

We then prove a lemma that refines the second term of Eq. \ref{eqn:A2-partial} for easier analysis.

\begin{lemma}
\label{lem:A2-cr-decompose}
For both sampling strategies, the norm of the averaged accumulated gradients of all participated workers in master round $t$ follows
\begin{align}
	& \mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{mn_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 \notag\\
    & = \mathbb E_t \Bigg[ \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i^2} \sum_{\tau=0}^{\omega_i-1} \Big\Vert \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \Big\Vert^2 -  \frac{1}{m^2} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1} \Big\Vert \sum_{j\in\mathcal{V}_i} \theta_j^{t,\tau} \Big\Vert^2 + \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2 \Bigg] . \label{eq:lem-A2-2}
\end{align}
\end{lemma}

\begin{proof}
We have
\begin{align}
	& \mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{mn_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 \notag\\
	& = \mathbb E_t \left[ \sum_{i=1}^{M} \bigg\Vert \frac{m_i}{mn_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 +  \sum_{i \neq h; i, h\in [M]}\left\langle \frac{m_i}{mn_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} , \frac{m_h}{mn_h} \sum_{\tau=0}^{\omega_h-1} \sum_{k\in\mathcal{S}_h^{t,\tau}} \theta_k^{t,\tau} \right\rangle \right] , \label{eq:A2-cluster-expand}
\end{align}
which is an inter-cluster decomposition across different clusters. For the second term of Eq.~\ref{eq:A2-cluster-expand}, we have
\begin{align}
    & \mathbb E_t \left[\sum_{i \neq h; i, h\in [M]}\left\langle \frac{m_i}{mn_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} , \frac{m_h}{mn_h} \sum_{\tau=0}^{\omega_h-1} \sum_{k\in\mathcal{S}_h^{t,\tau}} \theta_k^{t,\tau} \right\rangle \right] \notag\\
	& = \mathbb E_t \left[  \sum_{i \neq h; i, h\in [M]} \frac{m_im_h}{m^2} \left\langle \frac{1}{n_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} , \frac{1}{n_h} \sum_{\tau=0}^{\omega_h-1} \sum_{k\in\mathcal{S}_h^{t,\tau}} \theta_k^{t,\tau} \right\rangle \right] \notag\\
	& \overset{(a)}{=} \frac{1}{m^2} \mathbb E_t \left[  \sum_{i \neq h; i, h\in [M]} \left\langle  \sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} ,  \sum_{k\in \mathcal V_h} \sum_{\tau=0}^{\omega_h-1}  \theta_k^{t,\tau} \right\rangle \right] , \label{eq:A2-clusterp}
% 	& = \frac{n^2(M-1)}{m^2} \sum_{i=1}^{M} \bigg\Vert \sum_{j\in \mathcal V_i}\nabla F_{j}(\textbf{x}_{j}^t) \bigg\Vert^2 - \frac{n^2}{2m^2} \sum_{i \neq h; i, h\in [M]} \bigg\Vert \sum_{j\in \mathcal V_i}\nabla F_{j}(\textbf{x}_{j}^t) - \sum_{k\in \mathcal V_h}\nabla F_{k}(\textbf{x}_{k}^t) \bigg\Vert^2 
\end{align}
where $(a)$ is due to \cref{lem:unbias} and the independence among the sampling sets of different clusters.

We note that it is a fact that
\begin{align}
     & \mathbb{E}_t \bigg\Vert \frac{1}{m} \sum_{i=1}^{M} \sum_{j\in\mathcal{V}_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \bigg\Vert^2 \notag\\
     & = \frac{1}{m^2} \mathbb E_t \left[ \sum_{i=1}^M \big\Vert \sum_{j\in\mathcal{V}_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \big\Vert^2 + \sum_{i\neq h,\;i,h\in[M]}\; \left\langle \sum_{j\in \mathcal{V}_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau}, \sum_{k\in \mathcal{V}_h} \sum_{\tau=0}^{\omega_h-1}  \theta_k^{t,\tau} \right\rangle \right] . \label{eq:A2-clusterp-bound}
\end{align}
Then, revisiting the first term of Eq.~\ref{eq:A2-cluster-expand}, we have
\begin{align}
	\mathbb E_t \bigg\Vert \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 = \mathbb E_t \left[ \sum_{\tau=0}^{\omega_i-1} \bigg\Vert \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 +  \sum_{\tau \neq \nu; \tau, \nu\in [\omega_i-1]}\left\langle \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} , \sum_{k\in\mathcal{S}_i^{t,\nu}} \theta_k^{t,\nu} \right\rangle \right], \label{eq:A2-round-expand}
\end{align}
which is an intra-cluster decomposition among different cluster rounds. Note that for the second term of Eq. \ref{eq:A2-round-expand}, we have
\begin{align}
    & \mathbb E_t \left[\sum_{\tau \neq \nu; \tau, \nu\in [\omega_i-1]}\left\langle \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} , \sum_{k\in\mathcal{S}_i^{t,\nu}} \theta_k^{t,\nu} \right\rangle \right] \notag\\
	& = \mathbb E_t \left[\sum_{\tau \neq \nu; \tau, \nu\in [\omega_i-1]} n_i^2 \left\langle \frac{1}{n_i} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} , \frac{1}{n_i} \sum_{k\in\mathcal{S}_i^{t,\nu}} \theta_k^{t,\nu} \right\rangle \right] \notag\\
	& \overset{(a)}{=} \frac{n_i^2}{m_i^2} \mathbb E_t \left[\sum_{\tau \neq \nu; \tau, \nu\in [\omega_i-1]} \left\langle \sum_{j\in \mathcal{V}_i} \theta_j^{t,\tau} , \sum_{k\in \mathcal{V}_i} \theta_k^{t,\nu} \right\rangle \right], \label{eq:A2-roundp}
% 	& = \frac{n^2(M-1)}{m^2} \sum_{i=1}^{M} \bigg\Vert \sum_{j\in \mathcal V_i}\nabla F_{j}(\textbf{x}_{j}^t) \bigg\Vert^2 - \frac{n^2}{2m^2} \sum_{i \neq h; i, h\in [M]} \bigg\Vert \sum_{j\in \mathcal V_i}\nabla F_{j}(\textbf{x}_{j}^t) - \sum_{k\in \mathcal V_h}\nabla F_{k}(\textbf{x}_{k}^t) \bigg\Vert^2 
\end{align}
where $(a)$ is due to \cref{lem:unbias} and the independence among the sampling sets of different cluster rounds.

We similarly note that, for the first term of Eq.~\ref{eq:A2-clusterp-bound}, it is a fact that
\begin{align}
    & \mathbb{E}_t \big\Vert \sum_{j\in\mathcal{V}_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \big\Vert^2 = \mathbb E_t \left[ \sum_{\tau=0}^{\omega_i-1} \big\Vert \sum_{j\in\mathcal{V}_i} \theta_j^{t,\tau} \big\Vert^2 + \sum_{\tau \neq \nu; \tau, \nu\in [\omega_i-1]} \left\langle \sum_{j\in \mathcal{V}_i} \theta_j^{t,\tau} , \sum_{k\in \mathcal{V}_i} \theta_k^{t,\nu} \right\rangle \right] . \label{en:A2-roundp-bound}
\end{align}
Hence, substituting all Eq. \ref{eq:A2-clusterp}, \ref{eq:A2-clusterp-bound}, \ref{eq:A2-round-expand}, \ref{eq:A2-roundp}, and \ref{en:A2-roundp-bound} back into Eq. \ref{eq:A2-cluster-expand}, we have
\begin{align}
	& \mathbb E_t \bigg\Vert \sum_{i=1}^{M}\frac{m_i}{mn_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 \notag\\
	& = \mathbb E_t \Bigg[ \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i^2} \sum_{\tau=0}^{\omega_i-1} \Big\Vert \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \Big\Vert^2 +  \frac{1}{m^2} \sum_{i=1}^{M} \bigg( \Big\Vert \sum_{j\in\mathcal{V}_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2 - \sum_{\tau=0}^{\omega_i-1} \Big\Vert \sum_{j\in\mathcal{V}_i} \theta_j^{t,\tau} \Big\Vert^2 \bigg) \notag\\
    & + \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2 - \frac{1}{m^2} \sum_{i=1}^M \Big\Vert \sum_{j\in\mathcal{V}_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2 \Bigg] \notag\\
    & = \mathbb E_t \Bigg[ \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i^2} \sum_{\tau=0}^{\omega_i-1} \Big\Vert \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \Big\Vert^2 -  \frac{1}{m^2} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1} \Big\Vert \sum_{j\in\mathcal{V}_i} \theta_j^{t,\tau} \Big\Vert^2 + \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2 \Bigg] . \label{eq:A2-expand-bound}
\end{align}
This completes the proof of \cref{lem:A2-cr-decompose}.
\end{proof}

Note that \cref{lem:A2-cr-decompose} essentially redirects the uncertainty of partial participation from the overall master round level to each cluster round level (the first term of Eq.~\ref{eq:A2-expand-bound}). It is this redirection that restricts partial participation to interact directly only with $I_i$ rather than $G$, thereby guaranteeing the weakening effect.

With \cref{lem:A2-cr-decompose}, we can bound $A_2$ for both sampling strategies as
\begin{align}
    & A_2 \leq G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \eta_c^2\eta^2 \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i^2} \sum_{\tau=0}^{\omega_i-1} \mathbb E_t \Big\Vert \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \Big\Vert^2 \label{eqn:A2-target} \\
    & - \eta_c^2\eta^2 \frac{1}{m^2} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1} \mathbb E_t \Big\Vert \sum_{j\in\mathcal{V}_i} \theta_j^{t,\tau} \Big\Vert^2 + \eta_c^2\eta^2 \mathbb E_t \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2. \notag
\end{align}
Note that only the second term of Eq.~\ref{eqn:A2-target} depends on the specific sampling strategy. We bound it next.

\subsection{Bounding strategy 1}
With sampling strategy 1, suppose $\mathcal S_i^{t, \tau} = \{l_{i,1}^{t,\tau}, l_{i,2}^{t,\tau},\ldots, l_{i,n_i}^{t,\tau}\}$, we have for the second term of Eq. \ref{eqn:A2-target}
\begin{align}
    & \mathbb E_t \bigg\Vert \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 =  \mathbb E_t \bigg\Vert \sum_{z = 1}^{n_i} \theta_{l_{i,z}^{t,\tau}}^{t,\tau} \bigg\Vert^2 \notag\\
	& = \mathbb E_t \left[\sum_{z = 1}^{n_i}\big\Vert \theta_{l_{i,z}^{t,\tau}}^{t,\tau} \big\Vert^2 +  \sum_{j \neq k; j, k\in [n_i]}\left\langle \theta_{l_{i,j}^{t,\tau}}^{t,\tau} , \theta_{l_{i,k}^{t,\tau}}^{t,\tau} \right\rangle \right] \notag\\
	& = \mathbb E_t \left[n_i\big\Vert \theta_{l_{i,1}^{t,\tau}}^{t,\tau} \big\Vert^2 +  n_i(n_i-1)\left\langle \theta_{l_{i,1}^{t,\tau}}^{t,\tau} , \theta_{l_{i,2}^{t,\tau}}^{t,\tau} \right\rangle \right] \notag\\
	& = \mathbb E_t \left[\sum_{j\in \mathcal V_i}\frac{n_i}{m_i}\big\Vert \theta_j^{t,\tau} \big\Vert^2 +  \sum_{j, k\in \mathcal V_i}\frac{n_i(n_i-1)}{m_i^2}\left\langle \theta_j^{t,\tau} , \theta_k^{t,\tau} \right\rangle \right] \notag\\
	& = \sum_{j\in \mathcal V_i}\frac{n_i}{m_i} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2 + \frac{n_i(n_i-1)}{m_i^2} \mathbb E_t \bigg\Vert \sum_{j\in \mathcal V_i} \theta_j^{t,\tau} \bigg\Vert^2 . \label{eqn:st1-sum-set-Norm}
\end{align}
Substituting Eq. \ref{eqn:st1-sum-set-Norm} back into Eq. \ref{eqn:A2-target}, we have
\begin{align}
	& A_2 \leq G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \eta_c^2\eta^2 \sum_{i=1}^{M}\sum_{\tau=0}^{\omega_i-1}\sum_{j\in \mathcal V_i}\frac{m_i}{m^2n_i} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2 \notag\\
    & - \eta_c^2\eta^2 \frac{1}{m^2} \sum_{i=1}^{M} \frac{1}{n_i} \sum_{\tau=0}^{\omega_i-1} \mathbb E_t \Big\Vert \sum_{j\in\mathcal{V}_i} \theta_j^{t,\tau} \Big\Vert^2 + \eta_c^2\eta^2 \mathbb E_t \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2 \notag\\
    & \leq G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \eta_c^2\eta^2 \sum_{i=1}^{M}\sum_{\tau=0}^{\omega_i-1}\sum_{j\in \mathcal V_i}\frac{m_i}{m^2n_i} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2 + \eta_c^2\eta^2 \mathbb E_t \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2
    \label{eqn:A2-st1-raw}
\end{align}

For the second term of Eq. \ref{eqn:A2-st1-raw}, we can have the following important inequality. Here we similarly consider arbitrary positive coefficients $p_i > 0,\forall i\in[M]$ for compatibility with our subsequent proof for strategy 2,
\begin{align}
    & \sum_{i=1}^{M} p_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \mathbb E_t \big\Vert \sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \big\Vert^2 = \sum_{i=1}^{M} p_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \mathbb E_t \bigg\Vert \sum_{h=0}^{I_i-1} \Big( \nabla F_j(\textbf{x}_j^{t,\tau,h}) - \nabla F_j(\overline{\textbf{x}}_i^{t,\tau}) \notag\\
    & + \nabla F_j(\overline{\textbf{x}}_i^{t,\tau}) - \nabla f_i(\overline{\textbf{x}}_i^{t,\tau}) + \nabla f_i(\overline{\textbf{x}}_i^{t,\tau}) - \nabla f_i(\overline{\textbf{x}}^{t}) + \nabla f_i(\overline{\textbf{x}}^{t}) - \nabla f(\overline{\textbf{x}}^{t}) + \nabla f(\overline{\textbf{x}}^{t}) \Big) \bigg\Vert^2 \notag\\
    & \leq 5 L^2 \sum_{i=1}^M p_iI_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb E_t \big\Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 + 5L^2 \sum_{i=1}^M p_im_iI_i^2 \sum_{\tau=0}^{\omega_i-1} \mathbb E_t \big\Vert \overline{\textbf{x}}_i^{t,\tau} - \overline{\textbf{x}}^{t} \big\Vert^2 \notag\\
    & + 5G \sum_{i=1}^M p_im_i I_i \epsilon_i^2 + 5G \sum_{i=1}^M p_im_i I_i \epsilon^2 + 5G \sum_{i=1}^M p_i m_i I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2 \notag\\
    & \overset{(a)}{\leq} \frac{31}{5} L^2 \sum_{i=1}^M p_iI_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \mathbb E_t \big\Vert \textbf{x}_j^{t,\tau,h} - \overline{\textbf{x}}_i^{t,\tau} \big\Vert^2 + 5G \sum_{i=1}^M p_im_i I_i \epsilon_i^2 + 7G \sum_{i=1}^M p_im_i I_i \epsilon^2 \notag\\
    & + 7G \sum_{i=1}^M p_i m_i I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2 + \frac{1}{4} \sum_{i=1}^M \frac{p_im_i}{n_i} I_i \sigma^2 + 15 G\eta_c^2\eta^2 L^2 \sum_{i=1}^{M} p_im_iI_i \sum_{\tau=0}^{\omega_i-1} \Psi_i^{t,\tau} \notag\\
    & \overset{(b)}{\leq} \frac{2}{5} G \sum_{i=1}^{M} p_im_i \sigma^2 + \frac{1}{2} \sum_{i=1}^M \frac{p_im_i}{n_i} I_i \sigma^2 + 9G \sum_{i=1}^M p_im_i I_i \epsilon_i^2 + 12G \sum_{i=1}^M p_im_i I_i \epsilon^2 \notag\\
    & + 12G \sum_{i=1}^M p_i m_i I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2 + 28 G\eta_c^2\eta^2 L^2 \sum_{i=1}^{M} p_im_iI_i \sum_{\tau=0}^{\omega_i-1} \Psi_i^{t,\tau}
    , \label{eqn:A2-Fnorm-self}
\end{align}
where $(a)$ is due to \cref{lem:cm-MSE} for partial participation and simplified with $\eta_c\eta \leq \frac{1}{10LG}$. $(b)$ is due to \cref{lem:wc-MSE-indep} for partial participation case and simplified via the condition $\eta \leq \frac{1}{10I_{max}L}$.

We can bound $\Psi_i^{t,\tau}$ as
\begin{align}
    & \Psi_i^{t,\tau} = \mathbb{E}_t \bigg\Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau}}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) - \frac{1}{m_i}\sum_{j\in\mathcal{V}_i}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 \notag\\
    & \overset{(a)}{=} \mathbb{E}_t \bigg\Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau}}\sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 - \mathbb{E}_t \bigg\Vert \frac{1}{m_i}\sum_{j\in\mathcal{V}_i} \sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 \notag\\
    & = \frac{1}{n_i^2} \mathbb{E}_t \bigg\Vert \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 - \frac{1}{m_i^2} \mathbb{E}_t \bigg\Vert \sum_{j\in\mathcal{V}_i} \theta_j^{t,\tau} \bigg\Vert^2, \label{eq:Psi-ori} 
\end{align}
where $(a)$ is due to the fact that $\mathbb E\Vert\textbf{x}\Vert^2 = \mathbb E\Vert\textbf{x}-\mathbb E \textbf{x} \Vert^2 + \Vert\mathbb E \textbf{x}\Vert^2$, holding for both sampling strategies.

% For strategy 1, we have for the first term of Eq. \ref{eq:Psi-ori},
% \begin{align}
%     & \mathbb{E}_t \bigg\Vert \frac{1}{n_i}\sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \bigg\Vert^2 = \mathbb{E}_t \bigg\Vert \frac{1}{n_i}\sum_{z=1}^{n_i} \theta_{l_{i,z}}^{t,\tau} \bigg\Vert^2 \notag\\
%     & = \frac{1}{n_i^2} \mathbb E_t \left[\sum_{z = 1}^{n_i}\big\Vert \theta_{l_{i,z}}^{t,\tau} \big\Vert^2 +  \sum_{j \neq k; j, k\in [n_i]}\left\langle \theta_{l_{i,j}}^{t,\tau} , \theta_{l_{i,k}}^{t,\tau} \right\rangle \right] \notag\\
%     & = \frac{1}{n_i^2}\mathbb E_t \left[n_i\big\Vert \theta_{l_{i,1}}^{t,\tau} \big\Vert^2 +  n_i(n_i-1)\left\langle \theta_{l_{i,1}}^{t,\tau} , \theta_{l_{i,2}}^{t,\tau} \right\rangle \right] \notag\\
% 	& = \frac{1}{n_i^2} \mathbb{E}_t \left[\sum_{j\in \mathcal V_i}\frac{n_i}{m_i}\big\Vert \theta_{j}^{t,\tau} \big\Vert^2 +  \sum_{j, k\in \mathcal V_i}\frac{n_i(n_i-1)}{m_i^2}\left\langle \theta_{j}^{t,\tau} , \theta_{k}^{t,\tau} \right\rangle \right] \notag\\
%     & = \frac{1}{n_im_i} \sum_{j\in \mathcal V_i} \mathbb{E}_t \big\Vert \theta_{j}^{t,\tau} \big\Vert^2 + \frac{n_i-1}{n_im_i^2} \mathbb{E}_t \big\Vert \sum_{j\in \mathcal V_i} \theta_{j}^{t,\tau} \big\Vert^2. \label{eq:Psi-1}
% \end{align}
For sampling strategy 1, substituting Eq. \ref{eqn:st1-sum-set-Norm} into Eq. \ref{eq:Psi-ori}, we have
\begin{align}
    & \Psi_i^{t,\tau} = \frac{1}{n_im_i} \sum_{j\in \mathcal V_i} \mathbb{E}_t \big\Vert \theta_{j}^{t,\tau} \big\Vert^2 - \frac{1}{n_im_i^2} \mathbb{E}_t \big\Vert \sum_{j\in \mathcal V_i} \theta_{j}^{t,\tau} \big\Vert^2. \label{eq:Psi-st1}
\end{align}
Substituting Eq. \ref{eq:Psi-st1} back into Eq. \ref{eqn:A2-Fnorm-self} with $p_i = \frac{m_i}{m^2n_i}, \forall i\in[M]$, we have
\begin{align}
    & \sum_{i=1}^{M} \frac{m_i}{m^2n_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \mathbb E_t \big\Vert \theta_{j}^{t,\tau} \big\Vert^2 \notag\\
    & \leq \frac{2}{5} G \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} \sigma^2 + \frac{1}{2} \sum_{i=1}^M \frac{m_i^2}{m^2n_i^2} I_i \sigma^2 + 9G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 \notag\\
    & + 12G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2 + 12G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2 \notag\\
    & + 28 G\eta_c^2\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m^2n_i^2} I_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in \mathcal V_i} \mathbb{E}_t \big\Vert \theta_{j}^{t,\tau} \big\Vert^2 - 28 G\eta_c^2\eta^2 L^2 \sum_{i=1}^{M} \frac{1}{m^2n_i^2} I_i \sum_{\tau=0}^{\omega_i-1} \mathbb{E}_t \big\Vert \sum_{j\in \mathcal V_i} \theta_{j}^{t,\tau} \big\Vert^2 \notag\\
    & \overset{(a)}{\leq} \frac{9}{10} G \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} \sigma^2 + 9G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 12G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2 \notag\\
    & + 12G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2 + \frac{2}{5} \sum_{i=1}^{M} \frac{m_i}{m^2n_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in \mathcal V_i} \mathbb{E}_t \big\Vert \theta_{j}^{t,\tau} \big\Vert^2,
\end{align}
where $(a)$ is due to the fact that $\frac{I_i}{n_i} \leq G, \forall i\in[M]$ (since $I_i \leq G$ and $n_i \geq 1$) and the condition $\eta_c\eta \leq \frac{1}{10LG}$.

Rearranging the order for the left hand term, we have
\begin{align}
    & \sum_{i=1}^{M} \frac{m_i}{m^2n_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \mathbb E_t \big\Vert \theta_{j}^{t,\tau} \big\Vert^2 \notag\\
    & \leq \frac{3}{2} G \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} \sigma^2 + 15G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 20G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2 + 20G \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2. \label{eqn:A2-Fnorm-st1}
\end{align}
We substitute Eq. \ref{eqn:A2-Fnorm-st1} back to Eq. \ref{eqn:A2-st1-raw}, and bound $A_2$ for strategy 1 as
\begin{align}
	& A_2 \leq \frac{5}{2} G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \eta_c^2\eta^2 \mathbb E_t \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1} \sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \Big\Vert^2 \notag\\
    & + 15G \eta_c^2\eta^2 \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 20G \eta_c^2\eta^2 \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2 + 20G \eta_c^2\eta^2 \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2. \label{eqn:A2-st1}
\end{align}
Substituting Eq. \ref{eqn:A1-full} and Eq. \ref{eqn:A2-st1} back into Eq. \ref{eqn:A1A2-partial} and rearranging the order, we have
\begin{align}
	& \eta_g\eta_c\eta G \Big(\frac{1}{2} - 40 G^2\eta_c^2\eta^2 L^2 - 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 - 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \Big) \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2  \notag\\
    % & \leq f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})] + \Big(\frac{L\eta_g^2\eta_c^2\eta^2}{2} - \frac{\eta_g\eta_c\eta}{2G}\Big) \mathbb E_t \bigg\Vert \frac{1}{m} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{V}_i} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 \notag\\
    % & + 9 G^2\eta_g\eta_c^3\eta^3 L^2 \frac{M}{m} \sigma^2 + 40 G^3\eta_g\eta_c^3\eta^3 L^2 \epsilon^2 + 8 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 + 75 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 \notag\\
    % & + 100 G \eta_g \eta_c \eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + \frac{5}{4} G\eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 \notag\\
    % & + \frac{15}{2} G \eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 10G \eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2 \notag\\
    & \leq f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})] + 9 G^2\eta_g\eta_c^3\eta^3 L^2 \frac{M}{m} \sigma^2 + 8 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 75 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 G \eta_g \eta_c \eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + 40 G^3\eta_g\eta_c^3\eta^3 L^2 \epsilon^2 \notag\\
    & + \frac{5}{4} G\eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \frac{15}{2} G \eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 10G \eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2.
\end{align}
Here, as in the full-participation case, we drop the term $\mathbb E_t \Vert \frac{1}{m} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{V}_i} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \Vert^2$ under the condition $\eta_g\eta_c\eta \leq \frac{1}{GL}$, which ensures that its coefficient satisfies $\frac{L\eta_g^2\eta_c^2\eta^2}{2} - \frac{\eta_g\eta_c\eta}{2G} \leq 0$.
% where $(a)$ follows from $60 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \leq 1$ if $\eta < \frac{1}{8I_iL}, \forall i\in[M]$ (condition from \cref{lem:FL-gNorm-partial}) and $\frac{L\eta_g^2\eta_c^2\eta^2}{2} - \frac{\eta_g\eta_c\eta}{2G} \leq 0$ and $\eta_g\eta_c\eta L \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i} I_i \leq 1$ if $\eta_g\eta_c\eta \leq \frac{1}{GL}$.

Suppose $40 G^2\eta_c^2\eta^2 L^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i < \frac{1}{2}$,  and there exists a constant $c>0$ satisfying $(\frac{1}{2} - 40 G^2\eta_c^2\eta^2 L^2 - 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 - 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i) > c > 0$, then we have
\begin{align}
	& \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2  \leq \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t [f(\overline{\textbf{x}}^{t+1})]}{\eta_g\eta_c\eta G c} + \frac{1}{c} \bigg[ 9 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 8 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 75 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 \notag\\
    & + \frac{5}{4} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \frac{15}{2} \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2 \bigg] , \label{eqn:st1-oneStep}
\end{align}

% For better presentation, we introduce constant $c$ such that $\frac{1}{c} = {\rm max} \{ (1+\frac{1}{c_1} + \frac{5\eta_g}{2c_3})\frac{1}{c_2}, \frac{1}{c_1c_2}, \frac{1}{c_2c_3} \}$. Then we substitute Eq. \ref{eq:phi} and , and then rearrange Eq. \ref{eqn:st1-glr-useless} as
% \begin{align}
% 	& \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 \leq \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})]}{\eta_c\eta G c} + \frac{1}{c} \bigg[ 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + 75\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 13\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
%     & + 5 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + \frac{L\eta_c\eta}{2m} \sigma^2 +  \frac{1}{2}L\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} \sigma^2 + \frac{3}{2}L\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 \notag\\
%     & + \frac{15}{2}L^3\eta_c\eta^3 \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} I_i^2(\sigma^2 + 6I_i\epsilon_i^2) + \frac{5}{4}L^2\eta_c^2\eta^2 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i \sigma^2 + \frac{15}{4}L^2\eta_c^2\eta^2 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i^2 \epsilon_i^2 \notag\\
%     & + \frac{25}{4}L^3\eta_c\eta^3 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i} I_i^2 \sigma^2 +  \frac{75}{4}L^4\eta_c^2\eta^4 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i^3 \sigma^2 + \frac{75}{2}L^3\eta_c\eta^3 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i} I_i^3 \epsilon_i^2 + \frac{225}{2}L^4\eta_c^2\eta^4 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i^4 \epsilon_i^2 \bigg] \notag\\
%     & \overset{(a)}{\leq} \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})]}{\eta_c\eta G c} + \frac{1}{c} \bigg[ 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + 75\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 13\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
%     & + \frac{L\eta_c\eta}{2m} \sigma^2 + 5 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 +  L\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} \sigma^2 \notag\\
%     & + 3L\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 2L^2\eta_c^2\eta^2 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i \sigma^2 + 6L^2\eta_c^2\eta^2 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i^2 \epsilon_i^2 \bigg] , \label{eqn:st1-oneStep}
% \end{align}
% where $(a)$ is the substitution of condition $\eta < \frac{1}{8I_iL}, \forall i\in[M]$ for simplification of some higher-order terms (greater than 2), like we did in the full participation case.

Taking a double expectation over the data samples among all workers and averaging over $t=0,1,\ldots,T-1$, we have the final result as
\begin{align}
	& \min_{t\in[T]} \mathbb{E} \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 \leq \frac{f_0 - f_*}{c\eta_g\eta_c\eta G T} + \frac{1}{c} (\Phi_1 + \Phi_2 + \Phi_3), \label{eqn:st1-theorem2}
\end{align}
where, 
\begin{align}
    & \Phi_1 = 9 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 8 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 + 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 \notag\\
    & + 75 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2, \;\; \Phi_2 = \frac{1}{2} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2, \notag\\
    & \Phi_3 = \frac{3}{4} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \frac{15}{2} \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2}{m^2n_i} I_i \epsilon^2. \label{eqn:phi-st1}
\end{align}

% When setting $\eta = \frac{1}{\sqrt{T}GL}$ and $\eta_c = \sqrt{Gm\kappa_{min}}$, the convergence rate is $\mathcal{O}(\frac{I_{max}}{\sqrt{m\kappa_{min}G}}(\frac{1}{\sqrt{T}}) + \frac{1}{T})$.

\subsection{Bounding strategy 2}
With sampling strategy 2, we have for the second term of Eq. \ref{eqn:A2-target}
\begin{align}
    & \mathbb E_t \Big\Vert \sum_{j\in\mathcal{S}_i^{t,\tau}} \theta_j^{t,\tau} \Big\Vert^2 = \mathbb E_t \bigg\Vert \sum_{j\in\mathcal{V}_i}\mathbb{I}\{j\in \mathcal{S}_i^{t,\tau}\} \theta_j^{t,\tau} \bigg\Vert^2 \notag\\
	& = \mathbb E_t \left[\sum_{j\in\mathcal{V}_i}\mathbb{P}\{j\in \mathcal{S}_i^{t,\tau}\} \big\Vert \theta_j^{t,\tau} \big\Vert^2 + \sum_{j \neq k; j, k\in \mathcal{V}_i} \mathbb{P}\{j, k\in \mathcal{S}_i^{t,\tau}\}\left\langle \theta_j^{t,\tau} , \theta_k^{t,\tau} \right\rangle \right] \notag\\
	& = \mathbb E_t \left[\frac{n_i}{m_i}\sum_{j\in \mathcal V_i} \big\Vert \theta_j^{t,\tau} \big\Vert^2 + \frac{n_i(n_i-1)}{m_i(m_i-1)} \sum_{j \neq k; j, k\in \mathcal{V}_i} \left\langle \theta_j^{t,\tau} , \theta_k^{t,\tau} \right\rangle \right] \notag\\
    & = \mathbb E_t \left[\frac{n_i}{m_i}\sum_{j\in \mathcal V_i} \big\Vert \theta_j^{t,\tau} \big\Vert^2 + \frac{n_i(n_i-1)}{m_i(m_i-1)} \bigg( \Big\Vert \sum_{j\in \mathcal V_i} \theta_j^{t,\tau} \Big\Vert^2 - \sum_{j\in \mathcal V_i}\big\Vert \theta_j^{t,\tau} \big\Vert^2 \bigg) \right] \notag\\
	& = \sum_{j\in \mathcal V_i} \frac{n_i(m_i-n_i)}{m_i(m_i-1)} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2 + \frac{n_i(n_i-1)}{m_i(m_i-1)} \mathbb E_t \bigg\Vert \sum_{j\in \mathcal V_i} \theta_j^{t,\tau} \bigg\Vert^2 . \label{eqn:st2-sum-set-Norm}
\end{align}
Substituting Eq. \ref{eqn:st2-sum-set-Norm} back into Eq. \ref{eqn:A2-target}, we have
\begin{align}
	& A_2 \leq G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \eta_c^2\eta^2 \sum_{i=1}^{M}\sum_{\tau=0}^{\omega_i-1}\sum_{j\in \mathcal V_i} \frac{m_i(m_i-n_i)}{m^2n_i(m_i-1)} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2 \notag\\
    & - \eta_c^2\eta^2 \frac{1}{m^2} \sum_{i=1}^{M} \frac{m_i-n_i}{n_i(m_i-1)} \sum_{\tau=0}^{\omega_i-1} \mathbb E_t \Big\Vert \sum_{j\in\mathcal{V}_i} \theta_j^{t,\tau} \Big\Vert^2 + \eta_c^2\eta^2 \mathbb E_t \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2 \notag\\
    & \leq G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \eta_c^2\eta^2 \sum_{i=1}^{M}\sum_{\tau=0}^{\omega_i-1}\sum_{j\in \mathcal V_i} \frac{m_i(m_i-n_i)}{m^2n_i(m_i-1)} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2 + \eta_c^2\eta^2 \mathbb E_t \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1}  \theta_j^{t,\tau} \Big\Vert^2
    \label{eqn:A2-st2-raw}
\end{align}
Let $\alpha_i = \frac{m_i-n_i}{m_i-1}$. As in strategy 1, we can also bound $\Psi_i^{t,\tau}$ for strategy 2 by substituting Eq. \ref{eqn:st2-sum-set-Norm} into the first term of Eq. \ref{eq:Psi-ori}:
\begin{align}
    & \Psi_i^{t,\tau} = \sum_{j\in \mathcal V_i} \frac{\alpha_i}{m_in_i} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2 - \frac{\alpha_i}{m_i^2n_i} \mathbb E_t \bigg\Vert \sum_{j\in \mathcal V_i} \theta_j^{t,\tau} \bigg\Vert^2 \label{eqn:Psi-st2}
\end{align}
Similarly, substituting Eq. \ref{eqn:Psi-st2} back into Eq. \ref{eqn:A2-Fnorm-self} with $p_i = \frac{m_i\alpha_i}{m^2n_i}, \forall i\in[M]$, we have
\begin{align}
    & \sum_{i=1}^{M} \frac{m_i\alpha_i}{m^2n_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \mathbb E_t \big\Vert \sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \big\Vert^2 \notag\\
    % & \leq \frac{2}{5} G \sum_{i=1}^{M} \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 + \frac{1}{2} \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i^2} I_i \sigma^2 + 9G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 + 12G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2 + 12G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2 \notag\\
    % & + 28 G\eta_c^2\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i\alpha_i^2}{m^2n_i^2} I_i \sum_{\tau=0}^{\omega_i-1} \sum_{j\in \mathcal V_i} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2 - 28 G\eta_c^2\eta^2 L^2 \sum_{i=1}^{M} \frac{\alpha_i^2}{m^2n_i^2} I_i \sum_{\tau=0}^{\omega_i-1} \mathbb E_t \bigg\Vert \sum_{j\in \mathcal V_i} \theta_j^{t,\tau} \bigg\Vert^2 \notag\\
    & \leq \frac{9}{10} G \sum_{i=1}^{M} \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 + 9G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 + 12G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2 + 12G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2 \notag\\
    & + \frac{2}{5} \sum_{i=1}^{M} \frac{m_i\alpha_i}{m^2n_i}  \sum_{\tau=0}^{\omega_i-1} \sum_{j\in \mathcal V_i} \mathbb E_t \big\Vert \theta_j^{t,\tau} \big\Vert^2,
\end{align}
where the inequality is due to the fact that $\frac{I_i}{n_i} < G$ and $\alpha_i \leq 1, \forall i\in[M]$, together with the condition $\eta_c\eta \leq \frac{1}{10LG}$.

Moving the last term on the right-hand side to the left-hand side and dividing both sides by $\frac{3}{5}$, we have
\begin{align}
    & \sum_{i=1}^{M} \frac{m_i\alpha_i}{m^2n_i} \sum_{\tau=0}^{\omega_i-1} \sum_{j\in\mathcal{V}_i} \mathbb E_t \big\Vert \theta_{j}^{t,\tau} \big\Vert^2 \notag\\
    & \leq \frac{3}{2} G \sum_{i=1}^{M} \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 + 15G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 + 20G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2 + 20G \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2. \label{eqn:A2-Fnorm-st2}
\end{align}
We substitute Eq. \ref{eqn:A2-Fnorm-st2} back to Eq. \ref{eqn:A2-st2-raw}, and bound $A_2$ for strategy 2 as
\begin{align}
	& A_2 \leq G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \eta_c^2\eta^2 \mathbb E_t \Big\Vert \frac{1}{m}\sum_{i=1}^{M}\sum_{j\in \mathcal V_i} \sum_{\tau=0}^{\omega_i-1} \sum_{h=0}^{I_i-1} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \Big\Vert^2 + \frac{3}{2} G\eta_c^2\eta^2  \sum_{i=1}^{M}  \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 \notag\\
    & + 15G \eta_c^2\eta^2 \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 + 20G \eta_c^2\eta^2 \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2 + 20G \eta_c^2\eta^2 \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \Vert \nabla f(\overline{\textbf{x}}^{t}) \Vert^2. \label{eqn:A2-st2}
\end{align}
Substituting Eq. \ref{eqn:A1-full} and Eq. \ref{eqn:A2-st2} back into Eq. \ref{eqn:A1A2-partial} and rearranging the order, we have
\begin{align}
	& \eta_g\eta_c\eta G \Big(\frac{1}{2} - 40 G^2\eta_c^2\eta^2 L^2 - 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 - 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \Big) \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2  \notag\\
    % & \leq f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})] + \Big(\frac{L\eta_g^2\eta_c^2\eta^2}{2} - \frac{\eta_g\eta_c\eta}{2G}\Big) \mathbb E_t \bigg\Vert \frac{1}{m} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{V}_i} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \bigg\Vert^2 \notag\\
    % & + 9 G^2\eta_g\eta_c^3\eta^3 L^2 \frac{M}{m} \sigma^2 + 40 G^3\eta_g\eta_c^3\eta^3 L^2 \epsilon^2 + 8 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 + 75 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 \notag\\
    % & + 100 G \eta_g \eta_c \eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + \frac{1}{2} G\eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 + \frac{3}{4} G\eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^{M}  \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 \notag\\
    % & + \frac{15}{2} G \eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 + 10G \eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2 \notag\\
    & \leq f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})] + 9 G^2\eta_g\eta_c^3\eta^3 L^2 \frac{M}{m} \sigma^2 + 8 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 + 40 G^3\eta_g\eta_c^3\eta^3 L^2 \epsilon^2 \notag\\
    & + 75 G\eta_g\eta_c\eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 G \eta_g \eta_c \eta^3 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + \frac{1}{2} G\eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 \notag\\
    & + \frac{3}{4} G\eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^{M} \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 + \frac{15}{2} G \eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 + 10G \eta_g^2\eta_c^2\eta^2 L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2.
\end{align}
Likewise, we drop the term $\mathbb E_t \Vert \frac{1}{m} \sum_{i=1}^{M} \sum_{\tau=0}^{\omega_i-1}\sum_{h=0}^{I_i-1} \sum_{j\in\mathcal{V}_i} \nabla F_j(\textbf{x}_j^{t,\tau,h}) \Vert^2$ with condition $\eta_g\eta_c\eta \leq \frac{1}{GL}$.
% where $(a)$ follows from $60 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \leq 1$ if $\eta < \frac{1}{8I_iL}, \forall i\in[M]$ (condition from \cref{lem:FL-gNorm-partial}) and $\frac{L\eta_g^2\eta_c^2\eta^2}{2} - \frac{\eta_g\eta_c\eta}{2G} \leq 0$ and $\eta_g\eta_c\eta L \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i} I_i \leq 1$ if $\eta_g\eta_c\eta \leq \frac{1}{GL}$.

Suppose $40 G^2\eta_c^2\eta^2 L^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i < \frac{1}{2}$,  and there exists a constant $c>0$ satisfying $(\frac{1}{2} - 40 G^2\eta_c^2\eta^2 L^2 - 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 - 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i) > c > 0$, then we have
\begin{align}
	& \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2  \leq \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t [f(\overline{\textbf{x}}^{t+1})]}{\eta_g\eta_c\eta G c} + \frac{1}{c} \bigg[ 9 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 8 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
    & + 75 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2 + 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + \frac{1}{2} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2 \notag\\
    & + \frac{3}{4} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 + \frac{15}{2} \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2 \bigg] , \label{eqn:st2-oneStep}
\end{align}

% For better presentation, we introduce constant $c$ such that $\frac{1}{c} = {\rm max} \{ (1+\frac{1}{c_1} + \frac{5\eta_g}{2c_3})\frac{1}{c_2}, \frac{1}{c_1c_2}, \frac{1}{c_2c_3} \}$. Then we substitute Eq. \ref{eq:phi} and , and then rearrange Eq. \ref{eqn:st1-glr-useless} as
% \begin{align}
% 	& \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 \leq \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})]}{\eta_c\eta G c} + \frac{1}{c} \bigg[ 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + 75\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 13\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
%     & + 5 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + \frac{L\eta_c\eta}{2m} \sigma^2 +  \frac{1}{2}L\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} \sigma^2 + \frac{3}{2}L\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 \notag\\
%     & + \frac{15}{2}L^3\eta_c\eta^3 \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} I_i^2(\sigma^2 + 6I_i\epsilon_i^2) + \frac{5}{4}L^2\eta_c^2\eta^2 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i \sigma^2 + \frac{15}{4}L^2\eta_c^2\eta^2 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i^2 \epsilon_i^2 \notag\\
%     & + \frac{25}{4}L^3\eta_c\eta^3 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i} I_i^2 \sigma^2 +  \frac{75}{4}L^4\eta_c^2\eta^4 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i^3 \sigma^2 + \frac{75}{2}L^3\eta_c\eta^3 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i} I_i^3 \epsilon_i^2 + \frac{225}{2}L^4\eta_c^2\eta^4 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i^4 \epsilon_i^2 \bigg] \notag\\
%     & \overset{(a)}{\leq} \frac{f(\overline{\textbf{x}}^t) - \mathbb{E}_t[f(\overline{\textbf{x}}^{t+1})]}{\eta_c\eta G c} + \frac{1}{c} \bigg[ 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 + 75\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 13\eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 \notag\\
%     & + \frac{L\eta_c\eta}{2m} \sigma^2 + 5 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 +  L\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} \sigma^2 \notag\\
%     & + 3L\eta_c\eta \sum_{i=1}^{M} \frac{m_i^2}{m^2n_i} I_i \epsilon_i^2 + 2L^2\eta_c^2\eta^2 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i \sigma^2 + 6L^2\eta_c^2\eta^2 \sum_{i=1}^{M}\frac{m_i^2}{m^2n_i^2} I_i^2 \epsilon_i^2 \bigg] , \label{eqn:st1-oneStep}
% \end{align}
% where $(a)$ is the substitution of condition $\eta < \frac{1}{8I_iL}, \forall i\in[M]$ for simplification of some higher-order terms (greater than 2), like we did in the full participation case.

Taking a double expectation over the data samples among all workers and averaging over $t=0,1,\ldots,T-1$, we have the final result as
\begin{align}
	& \min_{t\in[T]} \mathbb{E} \big\Vert \nabla f(\overline{\textbf{x}}^t) \big\Vert^2 \leq \frac{f_0 - f_*}{c\eta_g\eta_c\eta G T} + \frac{1}{c} (\Phi_1 + \Phi_2 + \Phi_3), \label{eqn:st2-theorem2}
\end{align}
where, 
\begin{align}
    & \Phi_1 = 9 G\eta_c^2\eta^2 L^2 \frac{M}{m} \sigma^2 + 8 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i\sigma^2 + 40 G^2\eta_c^2\eta^2 L^2 \epsilon^2 \notag\\
    & + 75 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2\epsilon_i^2 + 100 \eta^2 L^2 \sum_{i=1}^{M} \frac{m_i}{m} I_i^2 \epsilon^2, \;\; \Phi_2 =  \frac{1}{2} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2}{m^2n_i} \sigma^2, \notag\\
    & \Phi_3 = \frac{3}{4} \eta_g\eta_c\eta L \sum_{i=1}^{M}  \frac{m_i^2\alpha_i}{m^2n_i} \sigma^2 + \frac{15}{2} \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon_i^2 + 10 \eta_g\eta_c\eta L \sum_{i=1}^M \frac{m_i^2\alpha_i}{m^2n_i} I_i \epsilon^2. \label{eqn:phi-st2}
\end{align}

This completes the proof of \cref{them:hfl-partial}.
\end{proof}


\section{Additional Results}
\label{app:add-res}
\begin{figure*}[!ht]
\centering
\subfigure[]{
\centering
\includegraphics[width=2.2in]{figs/cifar_case1_p}
\label{fg:cifar-giid-partial}
}
\subfigure[]{
\centering
\includegraphics[width=2.2in]{figs/cifar_case3_p}
\label{fg:cifar-gniid-partial}
}
\caption{Test Accuracy w.r.t. iterations on CIFAR-10. (a) Group i.i.d. with partial participation; (b) Group non-i.i.d. with partial participation.}
\label{fg:cifar}
\end{figure*}

\subsection{Weakening Effect on CIFAR-10} 
We conduct similar experiments on CIFAR-10 to show the weakening effect of partial participation for HFL. Here we only test the partial-participation case with $20\%$ of the workers participating.

Similarly, we observe that the curve patterns in Fig.~\ref{fg:cifar-gniid-partial} resemble those in Fig.~\ref{fg:cifar-giid-partial}, namely there is no noticeable performance gap (e.g., $G=50, I=10$ and $G=250, I=10$ versus $P=10$). Note that HFL with $G=250, I=50$ even performs slightly better than standard FL with $P=50$.

\subsection{Sampling Strategy}
We explore the performance of different sampling strategies on MNIST. Here we use two uniform groups both with 50 workers. The round periods are set to $G=100, I=50$.

\begin{table*}[!h]
\centering
\caption{Master rounds needed for sampling strategies to achieve target test accuracy on MNIST.}
\label{tab:strategy}
\begin{tabular}{c|cccccccccc}
\hline
Sampling Ratio & $10\%$ & $20\%$ & $30\%$ & $40\%$ & $50\%$ & $60\%$ & $70\%$ & $80\%$ & $90\%$ & $100\%$ \\\hline
Strategy 1 & 643 & 491 & 492 & 469 & 444 & 444 & 431 & 438 & 409 & 441 \\
Strategy 2 & 564 & 485 & 477 & 414 & 414 & 411 & 411 & 399 & 379 & 377 \\\hline
\end{tabular}
\end{table*}

\cref{tab:strategy} presents the results for different sampling strategies. We observe that strategy 2 always outperforms strategy 1 at the same sampling ratio. This matches \cref{them:hfl-partial}, as the convergence bound for strategy 2 is tighter due to the additional coefficient $\alpha_i \leq 1, \forall i\in[M]$. Overall, the results indicate a trend that the higher the sampling ratio, the better the convergence. Still, note that strategy 1 with a $100\%$ sampling ratio shows a slight performance degradation. This may be because a high sampling ratio with replacement may include multiple stochastic gradient updates from a single worker, which potentially leads to higher variance.