
%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)

%% Choose your variant of English; be consistent

\usepackage[american]{babel}
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools, cuted}
\usepackage{booktabs}
\usepackage{tikz}
\usepackage{hyperref}
\usepackage{cleveref}
\usepackage{amsthm, amssymb, bm, amsmath}
\usepackage{amsfonts}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{float}
\usepackage{booktabs}
% \usepackage{array}

% \input{macros}
\allowdisplaybreaks

%% Self-defined macros
% \TODO{<text>}: emits an inline, blue-tinted \todo note reading "TODO: <text>"
% and passes a matching "To Do" entry to \index.
% NOTE(review): \todo is provided by the todonotes package, which is not among
% the \usepackage lines visible in this preamble -- confirm it is loaded
% elsewhere (e.g. by the class) before using \TODO.
\newcommand{\TODO}[1]{\todo[color=blue!25, inline]{ TODO: #1} \index{To Do: !#1}}

% Theorem-like environments (amsthm), all in the "plain" style.
% No shared-counter argument is given, so every environment below is
% numbered with its own independent counter.
\theoremstyle{plain}
\newtheorem{lem}{Lemma}
\newtheorem{thm}{Theorem}
% Unnumbered variant, used to restate a theorem without a number.
\newtheorem*{thm*}{Theorem}
\newtheorem{defn}{Definition}
\newtheorem{coro}{Corollary}
\newtheorem{clm}{Claim}
\newtheorem{conj}{Conjecture}
\newtheorem{exple}{Example}
\newtheorem{prop}{Proposition}
\newtheorem{propty}{Property}
\newtheorem{rem}{Remark}
\newtheorem{assum}{Assumption}

% --- Probability helpers ---------------------------------------------------
% \Ex         bare expectation symbol
% \Eg{s}{X}   expectation with subscript s and auto-sized brackets around X
% \E{X}       expectation with auto-sized brackets around X
% \V{X}       variance with auto-sized brackets around X
\newcommand{\Ex}{\mathbb{E}}
\newcommand{\Eg}[2]{\mathbb{E}_{#1}\left[{#2}\right]}
\newcommand{\E}[1]{\mathbb{E}\left[{#1}\right]}
\newcommand{\V}[1]{\mathrm{Var}\left[{#1}\right]}
% \posfunc{x} typesets |x|^{+}.
\newcommand{\posfunc}[1]{\lvert {#1}\rvert ^{+}}
% Distribution names, set in italic text.
\newcommand{\HyperExp}{\textit{HyperExp}}
\newcommand{\SExp}{\textit{ShiftedExp}}
\newcommand{\Exp}{\textit{Exp}}
\newcommand{\Pareto}{\textit{Pareto}}
\newcommand{\xor}{\oplus}
% Check mark / cross mark dingbats.
% NOTE(review): \ding is provided by the pifont package, which is not among
% the \usepackage lines visible in this preamble -- confirm it is loaded
% elsewhere before using \cmark or \xmark.
\newcommand{\cmark}{\ding{51}}
\newcommand{\xmark}{\ding{55}}


% --- Norms, products, asymptotics -------------------------------------------
% CAUTION: \norm{x} typesets the SQUARED norm ||x||^2 -- the exponent is part
% of the macro, so do not append another ^2 at the call site.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert^{2}}
% \matsq{A} typesets A^T A.
\newcommand{\matsq}[1]{#1^{T}#1}
% \bigO{x} typesets O(x) with auto-sized parentheses; \brac{x} is the bare
% auto-sized parenthesis pair.
\newcommand{\bigO}[1]{\mathcal{O}\left({#1}\right)}
\newcommand{\brac}[1]{\left({#1}\right)}


% Search the Figures/ subdirectory when resolving \includegraphics file names.
\graphicspath{{Figures/}}

% cleveref reference names.
%   \crefname{<type>}{<singular>}{<plural>}  -- form printed by \cref
%   \Crefname{<type>}{<singular>}{<plural>}  -- form printed by \Cref
% The third argument is used when several labels are referenced at once, so it
% must be the plural form.

% Equations are referenced by their number alone: both names are left empty.
\crefname{equation}{}{}
\Crefname{equation}{}{}

% Theorem-like environments.
\crefname{thm}{theorem}{theorems}
\Crefname{thm}{Theorem}{Theorems}
\crefname{clm}{claim}{claims}
\Crefname{clm}{Claim}{Claims}
\Crefname{coro}{Corollary}{Corollaries}
\Crefname{lem}{Lemma}{Lemmas}
\crefname{prop}{proposition}{propositions}
\Crefname{prop}{Proposition}{Propositions}
\Crefname{propty}{Property}{Properties}
\crefname{defn}{definition}{definitions}
\Crefname{defn}{Definition}{Definitions}
\crefname{fact}{fact}{facts}
\Crefname{fact}{Fact}{Facts}
\crefname{conj}{conjecture}{conjectures}
\Crefname{conj}{Conjecture}{Conjectures}
\crefname{obs}{observation}{observations}
\Crefname{obs}{Observation}{Observations}

% Sectioning.
\Crefname{sec}{Section}{Sections}
\crefname{app}{appendix}{appendices}
\Crefname{app}{Appendix}{Appendices}
\crefname{appendix}{appendix}{appendices}
\Crefname{appendix}{Appendix}{Appendices}

% Floats. The algorithm names were previously declared twice with identical
% arguments; one declaration is sufficient.
\crefname{figure}{fig.}{figures}
\Crefname{figure}{Fig.}{Figures}
\crefname{algo}{algorithm}{algorithms}
\Crefname{algo}{Algorithm}{Algorithms}
\crefname{algorithm}{algorithm}{algorithms}
\Crefname{algorithm}{Algorithm}{Algorithms}
% Plural forms fixed: they used to repeat the singular ("table"/"Table"), so a
% multi-reference such as \cref{tab:a,tab:b} printed "table 1 and 2".
\crefname{tbl}{table}{tables}
\Crefname{tbl}{Table}{Tables}
\crefname{table}{table}{tables}
\Crefname{table}{Table}{Tables}

% Defining new commands and environments
% NOTE: `remark` and `theorem` are declared in addition to the `rem` and `thm`
% environments defined earlier in this preamble; each has its own counter.
\newtheorem{remark}{Remark}
\newtheorem{theorem}{\textbf{Theorem}}
% Symbol shorthands. Each macro is a zero-argument alias for a single symbol;
% bold forms denote the vector/matrix counterpart of the scalar next to them.
\newcommand{\gradvect}{\mathbf{x}}
\newcommand{\gradsc}{x}
\newcommand{\wtsc}{w}
\newcommand{\wtvect}{\mathbf{w}}
\newcommand{\gradcomp}{\mathbf{h}}
\newcommand{\compsc}{h}
% \ind expands to Z, the same letter as \queuetime defined further below.
\newcommand{\ind}{Z}
% Barred / hatted versions of \gradvect.
\newcommand{\avgtru}{\bar{\gradvect}}
\newcommand{\avgest}{\hat{\gradvect}}
\newcommand{\idmat}{\mathbf{I}}
\newcommand{\codemat}{\mathbf{E}}
\newcommand{\codepinv}{\mathbf{W}}
\newcommand{\codeprob}{\rho}
\newcommand{\gradnum}{n}
\newcommand{\graddim}{d}
\newcommand{\var}{\gamma}
% \bm (bm package) is used here because \mathbf does not embolden Greek letters.
\newcommand{\gradnoise}{\bm{\eta}}
\newcommand{\compdim}{k}
\newcommand{\scale}{\alpha}
\newcommand{\prob}{p}
\newcommand{\covmat}{\mathbf{C}}
\newcommand{\eig}{\lambda}
\newcommand{\eigmat}{\bm{\Lambda}}
\newcommand{\eigvects}{\mathbf{U}}
%\newcommmand{\randscale}{\omega}

% Matrix symbols (bold upright). In \encmat the subscript e sits inside
% \mathbf, so it is typeset bold as well.
\newcommand{\encmat}{\mathbf{G_e}}
\newcommand{\recmat}{\mathbf{G}}
\newcommand{\checkmat}{\mathbf{H}}
\newcommand{\onedata}{\mathbf{x}}
% Count symbols.
\newcommand{\numfunc}{m}
\newcommand{\numencfunc}{m_e}
\newcommand{\numrecfunc}{M'}
\newcommand{\decrows}{m_d}
% \fcdeg expands to d, the same letter as \graddim above.
\newcommand{\fcdeg}{d}
% Upper-/lower-case Greek pairs.
\newcommand{\lndd}{\Omega}
\newcommand{\ledd}{\omega}
\newcommand{\rndd}{\Lambda}
% \redd expands to \lambda, the same symbol as \eig above.
\newcommand{\redd}{\lambda}
\newcommand{\params}{\bm{\theta}}
\newcommand{\loss}{\mathcal{L}}
% \cols expands to n, the same letter as \gradnum above.
\newcommand{\cols}{n}

\newcommand{\mdsmat}{\mathbf{F}}
% \mdsnum expands to k, the same letter as \compdim above.
\newcommand{\mdsnum}{k}

\newcommand{\fcpc}{c}
\newcommand{\fcpdel}{\delta}
\newcommand{\fcps}{S}
\newcommand{\fcsource}{s}
\newcommand{\fcenc}{e}
% \RS expands to \rho, the same symbol as \codeprob above.
\newcommand{\RS}{\rho}
\newcommand{\numcomp}{C}
\newcommand{\workcomp}{B}
\newcommand{\tupleset}{\mathbb{S}}
% Time-related symbols.
\newcommand{\runtime}{T}
\newcommand{\worktime}{Y}
\newcommand{\queuetime}{Z}
\newcommand{\shifttime}{\tau}
\newcommand{\smalltime}{t}
\newcommand{\exprate}{\mu}
%\newcommmand{\x}{\mathbf{x}}

% Bold single-letter shorthands, defined with \def (which overwrites any
% existing meaning without warning).
% CAUTION: \H and \b are LaTeX's text accent commands (Hungarian umlaut and
% bar-under accent). After these definitions, input such as Erd\H{o}s -- e.g.
% in bibliography entries -- no longer produces the accent.
% NOTE(review): consider renaming \H and \b if accented names are needed.
\def\y{{\mathbf y}}
\def\A{{\mathbf A}}
\def\H{{\mathbf H}}
\def\b{{\mathbf b}}

% Bold upright shorthands for vectors and matrices, for use in math mode.
% A name of the form \bd<letter> is the same symbol under a \dot accent.
% \mathbf replaces the obsolete two-letter font switch \bf: the math output is
% identical, but \bf is not defined by every document class.
\newcommand{\bA}{\mathbf{A}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\be}{\mathbf{e}}
\newcommand{\bb}{\mathbf{b}}
\newcommand{\bg}{\mathbf{g}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bd}{\mathbf{d}}
\newcommand{\bdx}{\dot{\mathbf{x}}}
% \bp is \varpi, not the letter p (whereas \bdp below is a dotted bold p).
% As with \bf before, \mathbf does not embolden lowercase Greek, so \varpi is
% printed at normal weight; use \bm{\varpi} if a bold glyph is wanted.
\newcommand{\bp}{\mathbf{\varpi}}
\newcommand{\bdp}{\dot{\mathbf{p}}}
\newcommand{\bq}{\mathbf{q}}
\newcommand{\bdq}{\dot{\mathbf{q}}}
\newcommand{\bX}{\mathbf{X}}
\newcommand{\bdX}{\dot{\mathbf{X}}}
\newcommand{\bQ}{\mathbf{Q}}
\newcommand{\bdQ}{\dot{\mathbf{Q}}}
\newcommand{\bP}{\mathbf{P}}
\newcommand{\bdP}{\dot{\mathbf{P}}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bdy}{\dot{\mathbf{y}}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\bh}{\mathbf{h}}
\newcommand{\bdv}{\dot{\mathbf{v}}}
\newcommand{\bw}{\mathbf{w}}
\newcommand{\bdw}{\dot{\mathbf{w}}}
\newcommand{\bt}{\mathbf{t}}

% Indexed iterates built on the bold shorthands above. Naming pattern:
%   trailing "t"   -> superscript (t)
%   trailing "tp"  -> superscript (t+1)
%   "i" / "j"      -> subscript i / j
%   "tk" / "tj"    -> superscript (t,k) / (t,j)
\newcommand{\bwt}{\bw^{(t)}}
\newcommand{\bvt}{\bv^{(t)}}
\newcommand{\bwit}{\bw_i^{(t)}}
\newcommand{\bwjt}{\bw_j^{(t)}}
\newcommand{\bwitk}{\bw_i^{(t,k)}}
\newcommand{\bwitj}{\bw_i^{(t,j)}}
% Same index pattern applied to \xi.
\newcommand{\xiit}{\xi_i^{(t)}}
\newcommand{\xiitk}{\xi_i^{(t,k)}}
\newcommand{\xiitj}{\xi_i^{(t,j)}}
\newcommand{\bwtp}{\bw^{(t+1)}}
\newcommand{\byt}{\by^{(t)}}
\newcommand{\bytp}{\by^{(t+1)}}
\newcommand{\byit}{\by_i^{(t)}}
\newcommand{\byjt}{\by_j^{(t)}}
\newcommand{\byjtp}{\by_j^{(t+1)}}
% \bdit is \Delta_i^{(t)} (not a bold d); \deltai further below typesets the
% same symbol.
\newcommand{\bdit}{\Delta_i^{(t)}}
\newcommand{\bhit}{\bh_i^{(t)}}
\newcommand{\bhjt}{\bh_j^{(t)}}

\newcommand{\Cc}{{\mathcal{C}}}

% Calligraphic S and the letter M, plus their j-subscripted forms.
\newcommand{\set}{{\mathcal{S}}}
\newcommand{\ssize}{M}
\newcommand{\setj}{{\set_j}}
\newcommand{\ssizej}{{\ssize_j}}
% Hatted / barred scalar x with subscript j (non-bold, unlike \avgest and
% \avgtru above, which are built on the bold \gradvect).
\newcommand{\avgestj}{\hat{x}_j}
% \weightj typesets T(M_j); \weight is the bare letter T, the same letter as
% \runtime above.
\newcommand{\weightj}{{T(\ssizej)}}
\newcommand{\weight}{T}
\newcommand{\avgtruj}{\bar{x}_j}
\newcommand{\squares}{R_1}
\newcommand{\cross}{R_2}
% \Delta with superscript (t) and subscript i / j.
\newcommand{\deltai}{\Delta^{(t)}_i}
\newcommand{\deltaj}{\Delta^{(t)}_j}
\newcommand{\activec}{\mathcal{A}(t)}
% Primed versions of \gradcomp / \compsc. The trailing comment on the next
% line is the remainder of an earlier, commented-out superscript.
\newcommand{\gradcomph}{\mathbf{h}'} %^{\text{temp}}}

\newcommand{\scalcomph}{h'}

%^{\text{temp}}}


% Math-alphabet aliases. \mc, \mbb and \mbf expand to the bare alphabet command
% and therefore apply to whatever token or braced group follows them; the
% remaining macros are fixed symbols.
\newcommand{\mc}{\mathcal}
\newcommand{\mco}{\mathcal O}
\newcommand{\mbb}{\mathbb}
\newcommand{\mbf}{\mathbf}
\newcommand{\mbe}{\mathbb E}
\newcommand{\mbn}{\mathbb N}
\newcommand{\mbr}{\mathbb R}
\newcommand{\mcr}{\mathcal R}
\newcommand{\mcP}{\mathcal P}

% Auto-sized delimiter halves. Each \left... macro must be closed by a
% \right... macro within the same math cell (i.e. before any & or \\).
%   \lp \rp    ( )        \lcb \rcb  { }        \lnr \rnr  || ||
%   \lb \rb    [ ]        \lan \ran  < >        \ld  \rd   invisible "." delimiters
\newcommand{\lp}{\left(}
\newcommand{\rp}{\right)}
\newcommand{\ld}{\left.}
\newcommand{\rd}{\right.}
\newcommand{\lcb}{\left\{}
\newcommand{\rcb}{\right\}}
\newcommand{\lb}{\left[}
\newcommand{\rb}{\right]}
\newcommand{\lnr}{\left\|}
\newcommand{\rnr}{\right\|}
\newcommand{\lan}{\left\langle}
\newcommand{\ran}{\right\rangle}

% \G is the gradient operator symbol.
\newcommand{\G}{\nabla}
% Number of agents
\newcommand{\na}{N}
\newcommand{\nas}{M}

% Pre-built summation operators. The name encodes index and upper limit:
%   \sumin, \sumjn : i or j from 1 to \na
%   \sumkt, \sumjt : k or j from 0 to \tau-1
%   \sumjk         : j from 0 to k-1
%   \sumtT         : t from 0 to T-1
\newcommand{\sumin}{\sum_{i=1}^\na}
\newcommand{\sumjn}{\sum_{j=1}^\na}
\newcommand{\sumkt}{\sum_{k=0}^{\tau-1}}
\newcommand{\sumjt}{\sum_{j=0}^{\tau-1}}
\newcommand{\sumjk}{\sum_{j=0}^{k-1}}
\newcommand{\sumtT}{\sum_{t=0}^{T-1}}

% Learning-rate symbols: \lrc = \eta_c, \lrss = \eta_s, \lrs = \tilde{\eta}_s.
\newcommand{\lrc}{\eta_c}
\newcommand{\lrs}{\tilde{\eta}_s}
\newcommand{\lrss}{\eta_s}

% \nn abbreviates \nonumber (suppresses the equation number of the current
% line; do not use it inside a labelled single-number `equation`).
\newcommand{\nn}{\nonumber}
% \numclients / \selclients expand to N / M, the same letters as \na / \nas.
\newcommand{\numclients}{N}
\newcommand{\selclients}{M}
\newcommand{\numclusters}{K}
% Algorithm names as plain text (no font change is applied here).
\newcommand{\algoname}{FedVARP}
\newcommand{\clusteralgoname}{ClusterFedVARP}




% Coloured inline review comments, one macro per author. Each prints the
% author tag followed by the argument in that author's colour.
\newcommand{\GJ}[1]{ \textcolor{magenta}{\textsc{Gauri:} #1}}

\newcommand{\ps}[1]{ \textcolor{red}{\textsc{PS:} #1}}%Pranay

\newcommand{\DJH}[1]{ \textcolor{green}{Divyansh: #1}}

\newcommand{\AN}[1]{ \textcolor{blue}{\textsc{Aushim:} #1}}

% \swap[<sep>]{a}{b} prints b<sep>a; the separator defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% Starred form: in display math the subscript is set underneath the operator.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}


\title{FedVARP: Tackling the Variance Due to Partial Client Participation\\ in Federated Learning (Supplementary material)}

% Add authors
\author[1]{Divyansh Jhunjhunwala}
\author[1]{Pranay Sharma}
\author[1]{Aushim Nagarkatti}
\author[1]{Gauri Joshi}
% Add affiliations after the authors
\affil[1]{%
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
  
\begin{document}

\onecolumn

\maketitle

% \begin{abstract}
% Data-heterogeneous federated learning (FL) systems suffer from two significant sources of convergence error: 1) client drift error caused by performing multiple local optimization steps at clients, and 2) partial client participation error caused by the fact that only a small subset of the edge clients participate in every training round. We find that among these, only the former has received significant attention in the literature. To remedy this, we propose \texttt{FedVARP}, a novel variance reduction algorithm applied at the server that eliminates error due to partial client participation. 
% To do so, the server simply maintains in memory the most recent update for each client and uses these as surrogate updates for the non-participating clients in every round. Further, to alleviate the memory requirement at the server, we propose a novel clustering-based variance reduction algorithm \texttt{ClusterFedVARP}. Unlike previously proposed methods, both \texttt{FedVARP} and \texttt{ClusterFedVARP} do not require additional computation at clients or communication of additional optimization parameters. Through extensive experiments, we show that \texttt{FedVARP} outperforms state-of-the-art methods, and \texttt{ClusterFedVARP} achieves performance comparable to \texttt{FedVARP} with much less memory requirements.
% \end{abstract}



% \input{Sections/Introduction}
% \input{Sections/Problem_Setup}
% \input{Sections/ProposedAlgorithm}
% \input{Sections/ProposedClusterAlgo}
% % \input{Sections/Analysis}
% \input{Sections/Experiments}
% \input{Sections/RelatedWork}
% \input{Sections/Conclusion}


% Citations should include the author's last name and year.
% They should be part of the sentence.
% An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
% An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
% Do not use a parenthetical citation where a textual one is appropriate.
% An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

% We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
% The reference style you use should be compatible with the author-year citations.
% Both the citation style and reference style used should be consistent.

% For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
% For example, writing
% “I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
% Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

% \subsubsection{Footnotes}
% You can include footnotes in your text.\footnote{
%     Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
% }
% The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
%     A footnote is material put at the foot of a page.
% }
% for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.

% \section{Math}\label{sec:math}
% The class file does not load any math support package like \textsf{amsmath}\footnote{%
%   See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
% }.
% We advise using the \textsf{mathtools}\footnote{%
%   See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
% }
% package, which extends \textsf{amsmath} with fixes and even more useful commands.
% Feel free to load other support packages for symbols, theorems, etc.

% Use the \textsf{amsmath} environments for displayed equations.
% So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
% An \texttt{equation}:
% \begin{equation}\label{eq:example}
%   0 = 1 - 1.
% \end{equation}
% Two \texttt{align}'ed equations:
% \begin{align*} % no numbers with starred version
%   1 + 2 &= 3,\\
%   1 - 2 &= -1.
% \end{align*}
% Equations can also be put inline, of course.
% For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
% (Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

% The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.

% \section{Floats}\label{sec:floats}
% Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
% Please do not force them to go in the middle of a paragraph.
% They must respect the column width.

% Two-column floats are possible.
% They appear at the top of the next page, so strategic placement may be necessary.
% For an example, see Figure~\ref{fig:tikz}.
% They may not enter the margins.
% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% All material in floats should be legible and of good quality.
% So avoid very small or large text and pixelated or fuzzy lines.

% \subsection{Figures}\label{sec:figures}
% Figures should go in the \texttt{figure} environment and be centered therein.
% The caption should go below the figure.
% Use \verb|\includegraphics| for external graphics files but omit the file extension.
% Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
% Do not use \verb|\epsfig| or \verb|\psfig|.
% If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
% For example, see Figure~\ref{fig:Eindhoven}.
% \begin{figure}
%   \centering
%   \includegraphics[width=0.7\linewidth,page=3]{Eindhoven}
%   \caption{A View of a Nice City.}\label{fig:Eindhoven}
% \end{figure}

% Do not use \verb|\graphicspath|.
% If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

% \subsection{Tables}\label{sec:tables}
% Tables should go in the \texttt{table} environment and be centered therein.
% The caption should go above the table and be in title caps.
% For an example, see Table~\ref{tab:data}.
% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \subsection{Algorithms}\label{sec:algorithms}
% You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
% Use the environment defined in the package to create a centered float with an algorithm inside.





% \begin{contributions} % will be removed in pdf for initial submission,
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions.
%     This is a nice way of making clear who did what and to give proper credit.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \begin{acknowledgements} 
% This research was generously supported in part by the NSF Award (CNS-2112471), the NSF CAREER Award (CCF-2045694), and the David H. Barakat and LaVerne Owen-Barakat College of Engineering Dean's Fellowship at Carnegie Mellon University.
% \end{acknowledgements}

%\newpage

% \bibliography{uai2022-template}

% \newpage


% \appendix
% NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% \section{Math font exposition}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.


% \section{You \emph{can} have an appendix here.}
\section{Notations and Basic Results}

Let $\mathcal{S}^{(t)}$ be the subset of clients sampled in the $t$-th round. Let $\xi^{(t)}$ denote the randomness due to the stochastic sampling at round $t$.

\begin{equation}
    \begin{aligned}
        \text{ Normalized Stochastic Gradient: } \deltai &= \frac{1}{\tau} \sumkt \G f_i(\bwitk, \xiitk)\\
        \text{ Normalized Gradient: } \bhit &= \frac{1}{\tau} \sum_{k=0}^{\tau-1} \G f_i(\bwitk)\\
        \text{ Average Normalized Gradient: } \bar{\bh}^{(t)} &= \frac{1}{\numclients} \sumin \bhit \\
        \text{Server Updates: } \bwtp &= \bwt - \lrs \frac{1}{\selclients} \sum_{i \in \set^{(t)}} \deltai, \qquad \qquad \text{where } \lrs = \lrss \lrc \tau
    \end{aligned}
    \label{eq:FedAvg_update}
\end{equation}


\begin{lem}[Young's inequality]
Given two same-dimensional vectors $\bu, \bv \in \mbb R^d$, the Euclidean inner product can be bounded as follows:
\[ \lan \bu, \bv \ran \leq \frac{\norm{\bu}}{2 \gamma} + \frac{\gamma \norm{\bv}}{2} \]
for every constant $\gamma > 0$.
\end{lem}


\begin{lem}[Jensen's inequality]
% \label{lem:jensens}
Given a convex function $f$ and a random variable $X$, the following holds.
$$f \lp \mbe [X] \rp \leq \mbe \lb f(X) \rb.$$
\end{lem}

\begin{lem}[Sum of squares]
% \label{lem:sum_of_squares}
For a positive integer $K$, and a set of vectors $\bx_1, \hdots, \bx_K$, the following holds:
\begin{align*}
    \norm{\sum_{k=1}^K \bx_k} \leq K \sum_{k=1}^K \norm{\bx_k}.
\end{align*}
\end{lem}

\begin{lem}[Variance under uniform, without replacement sampling]
\label{lem:sample_WR}
Let $\bar{\bx} = \frac{1}{\numclients} \sumin \bx_i$. If $\bar{\bx}$ is approximated using a mini-batch $\mathcal{M}$ of size $\selclients$, sampled uniformly at random, and without replacement, then the following holds.
\begin{align*}
    \mbe \lb \frac{1}{\selclients} \sum_{i \in \mathcal{M}} \bx_i \rb &= \bar{\bx}, \\
    \mbe \lnr \frac{1}{\selclients} \sum_{i \in \mathcal{M}} \bx_i - \bar{\bx} \rnr^2 &= \frac{1}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \frac{1}{\numclients} \sumin \lnr \bx_i - \bar{\bx} \rnr^2.
\end{align*}
\end{lem}

\begin{proof}
\begin{align*}
    \mbe \lnr \frac{1}{\selclients} \sum_{i \in \mathcal{M}} \bx_i - \bar{\bx} \rnr^2 &= \mbe \lnr \frac{1}{\selclients} \sumin \mathbb I (i \in \mc M) \lp \bx_i - \bar{\bx} \rp \rnr^2 \\
    &= \frac{1}{\selclients^2} \mbe \lb \sumin \lp \mathbb I (i \in \mc M) \rp^2 \lnr \bx_i - \bar{\bx} \rnr^2 + \sum_{i \in [\numclients]} \sum_{\substack{j \in [\numclients] \\ j \neq i}} \mathbb I (i \in \mc M) \mathbb I (j \in \mc M) \lan \bx_i - \bar{\bx}, \bx_j - \bar{\bx} \ran \rb \\
    &= \frac{1}{\selclients^2} \sumin \frac{\selclients}{\numclients} \lnr \bx_i - \bar{\bx} \rnr^2 + \frac{1}{\selclients^2} \sum_{i \neq j} \frac{\selclients}{\numclients} \frac{(\selclients - 1)}{(\numclients - 1)} \lan \bx_i - \bar{\bx}, \bx_j - \bar{\bx} \ran \nn \\
    &= \frac{1}{\selclients^2} \sumin \lnr \bx_i - \bar{\bx} \rnr^2 \lb \frac{\selclients}{\numclients} - \frac{\selclients}{\numclients} \frac{(\selclients - 1)}{(\numclients - 1)} \rb + \frac{1}{\selclients^2} \frac{\selclients}{\numclients} \frac{(\selclients - 1)}{(\numclients - 1)} \underbrace{\lnr \sumin \lp \bx_i - \bar{\bx} \rp \rnr^2}_{=0} \\
    &= \frac{1}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \frac{1}{\numclients} \sumin \lnr \bx_i - \bar{\bx} \rnr^2.
\end{align*}
\end{proof}
















% \newpage
\section{Convergence Proof for \texttt{FedAvg} (Theorem 1)}
\label{app:FedAvg}

In this section we prove the convergence of \texttt{FedAvg}, and provide the complexity and communication guarantees.
We organize this section as follows. First, in Section~\ref{app:FedAvg_int_results} we present some intermediate results, which we use to prove the main theorem. Next, in Section~\ref{app:FedAvg_thm_proof}, we present the proof of Theorem 1, which is followed by the proofs of the intermediate results in Section~\ref{app:FedAvg_int_results_proofs}.

% \ps{I would suggest adding the pseudocode of \texttt{FedAvg} here for the sake of completeness}

\subsection{Intermediate Lemmas} \label{app:FedAvg_int_results}

\begin{lem}
\label{lem:FedAvg_f_decay_one_step}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumption 2, then the iterates $\{ \bwt \}_t$ generated by \texttt{FedAvg} satisfy
\begin{equation}
    \begin{aligned}
        \Eg{\set^{(t)}, \xi^{(t)}}{f(\bwtp)} & \leq f(\bwt) - \frac{\lrs}{2} \left[ \norm{\G f(\bwt)} + \mbe_{\xi^{(t)}} \norm{\bar{\bh}^{(t)}} - \mbe_{\xi^{(t)}} \norm{\G f(\bwt) - \bar{\bh}^{(t)}} \right] \\
        & \quad + \frac{\lrs^2 L}{2} \lb \frac{2 \sigma^2}{M \tau} + 2 \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \bhit} \rb,
    \end{aligned}
    \label{eq:lem:FedAvg_f_decay_one_step}
\end{equation}
where $\lrs$ is the server learning rate, and $\Eg{\set^{(t)}, \xi^{(t)}}{\cdot}$ is expectation over the randomness in the $t$-th round, conditioned on $\bwt$.
\end{lem}


\begin{lem}
\label{lem:FedAvg_grad_ht_err}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumptions 2, 3. Further, the client learning rate $\lrc$ is chosen such that $\lrc \leq \frac{1}{2 L \tau}$.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{FedAvg} satisfy
\begin{align}
    \mbe_{\xi^{(t)}} \norm{\G f(\bwt) - \bar{\bh}^{(t)}} & \leq \frac{1}{\numclients} \sumin \mbe_{\xi^{(t)}} \norm{\G f_i(\bwt) - \bhit} \nn \\
    & \leq 2 \lrc^2 L^2 (\tau - 1) \sigma^2 + 8 \lrc^2 L^2 \tau (\tau - 1) \lb \sigma_g^2 + \norm{\G f (\bwt)} \rb. \nn
\end{align}
\end{lem}


\begin{lem}
\label{lem:FedAvg_avg_ht_err}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumptions 2, 3.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{FedAvg} satisfy
\begin{align}
    \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \bhit} & \leq \frac{3}{\numclients} \sumin \mbe \norm{\bhit - \G f_i(\bwt)} + \frac{3(\numclients - \selclients)}{(\numclients - 1) \selclients} \sigma_g^2 + 3 \mbe \norm{\G f(\bwt)}. \nn
\end{align}
\end{lem}





\subsection{Proof of Theorem 1}
\label{app:FedAvg_thm_proof}
For the sake of completeness, we first state the complete theorem statement.
\begin{thm*}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumptions 2,3. Further, the client learning rate $\lrc$, and the server learning rate $\lrss$ are chosen such that $\lrc \leq \frac{1}{8 L \tau}$, $\lrss \lrc \leq \frac{1}{24 \tau L}$.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{FedAvg} satisfy
\begin{align*}
    \min_{t \in [T]} \mbe \norm{\G f(\bwt)} & \leq \underbrace{\mco \lp \frac{f(\bw^{(0)}) - f^*}{\lrss \lrc \tau T} \rp}_{\text{Effect of initialization}} + \underbrace{\mco \lp \frac{\lrss \lrc L \sigma^2}{\selclients} + \lrc^2 L^2 (\tau - 1) \sigma^2 \rp}_{\text{Stochastic Gradient Error}} \\
    & \qquad + \underbrace{\mco \lp \lrss \lrc \tau L \frac{(\numclients - \selclients)}{(\numclients - 1) \selclients} \sigma_g^2 \rp}_{\text{Error due to partial participation}} + \underbrace{\mco \lp \lrc^2 L^2 \tau (\tau - 1) \sigma_g^2 \rp}_{\text{Client Drift Error}},
\end{align*}
where $f^* = \min_{\mathbf{x}} f(\mathbf{x})$.
\end{thm*}

\begin{proof}
Note that for simplicity, we use the notation $\lrs = \lrss \lrc \tau$.
Substituting the bounds in Lemma \ref{lem:FedAvg_grad_ht_err} and Lemma \ref{lem:FedAvg_avg_ht_err} in \eqref{eq:lem:FedAvg_f_decay_one_step}, we get
\begin{align}
    & \mbe \lb f(\bwtp) - f(\bwt) \rb \nn \\
    & \leq - \frac{\lrs}{2} \norm{\G f(\bwt)} - \frac{\lrs}{2} \mbe \norm{\bar{\bh}^{(t)}} \nn \\
    & \quad + \frac{\lrs}{2} \lb 2 \lrc^2 L^2 (\tau - 1) \sigma^2 + 8 \lrc^2 L^2 \tau (\tau - 1) \lp \sigma_g^2 + \norm{\G f (\bwt)} \rp \rb \nn \\
    & \quad + \frac{\lrs^2 L}{2} \lb \frac{2 \sigma^2}{M \tau} + \frac{6(\numclients - \selclients)}{(\numclients - 1) \selclients} \sigma_g^2 + 6 \mbe \norm{\G f(\bwt)} \rb \nn \\
    & \quad + 3 \lrs^2 L \lb 2 \lrc^2 L^2 (\tau - 1) \sigma^2 + 8 \lrc^2 L^2 \tau (\tau - 1) \lp \sigma_g^2 + \norm{\G f (\bwt)} \rp \rb \nn \\
    & \leq - \frac{\lrs}{2} \lp 1 - 8 \lrc^2 L^2 \tau (\tau - 1) - 6 \lrs L - 48 \lrs L \lrc^2 L^2 \tau (\tau - 1) \rp \norm{\G f(\bwt)} - \frac{\lrs}{2} \mbe \norm{\bar{\bh}^{(t)}} \nn \\
    & \quad + \frac{\lrs}{2} \lp 1 + 6 \lrs L \rp 2 \lrc^2 L^2 (\tau - 1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{\lrs^2 L}{2} \lb \frac{2 \sigma^2}{M \tau} + \frac{6(\numclients - \selclients)}{(\numclients - 1) \selclients} \sigma_g^2 \rb \nn \\
    & \leq - \frac{\lrs}{4} \norm{\G f(\bwt)} + 2 \lrs \lrc^2 L^2 (\tau - 1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{\lrs^2 L}{2} \lb \frac{2 \sigma^2}{M \tau} + \frac{6(\numclients - \selclients)}{(\numclients - 1) \selclients} \sigma_g^2 \rb. \label{eq_proof:thm:FedAvg_1}
\end{align}
where \eqref{eq_proof:thm:FedAvg_1} follows because
\begin{align*}
    8 \lrc^2 L^2 \tau (\tau - 1) & \leq \frac{1}{8} \tag{$\because \lrc \leq \frac{1}{8 \tau L}$} \nn \\
    6 \lrs L & \leq \frac{1}{4} \tag{$\because \lrss \lrc \leq \frac{1}{24 \tau L}$} \nn \\
    48 \lrs L \lrc^2 L^2 \tau (\tau - 1) & \leq \frac{6 \lrs L}{8} \leq \frac{1}{32}. \tag{$\because 8 \lrc^2 L^2 \tau (\tau - 1) \leq \frac{1}{8}$, $6 \lrs L \leq \frac{1}{4}$}
\end{align*}
Rearranging the terms, and summing over $t=0, \hdots, T-1$, we get
\begin{align}
    & \frac{1}{T} \sumtT \mbe \norm{\G f(\bwt)} \nn \\
    & \leq \frac{4}{\lrs T} \sumtT \mbe \lb f(\bwt) - f(\bwtp) \rb + 8 \lrc^2 L^2 (\tau - 1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + 2 \lrs L \lb \frac{2 \sigma^2}{M \tau} + \frac{6(\numclients - \selclients)}{(\numclients - 1) \selclients} \sigma_g^2 \rb \nn \\
    & \leq \frac{4 \lb f(\bw^{(0)}) - f(\bw^{(T)}) \rb}{\lrss \lrc \tau T} + \frac{4 \lrss \lrc L \sigma^2}{\selclients} + 8 \lrc^2 L^2 (\tau - 1) \sigma^2 + 12 \lrss \lrc \tau L \frac{(\numclients - \selclients)}{(\numclients - 1) \selclients} \sigma_g^2 + 32 \lrc^2 L^2 \tau (\tau - 1) \sigma_g^2. \nn
\end{align}
\end{proof}




\subsection{Proofs of the Intermediate Lemmas}
\label{app:FedAvg_int_results_proofs}

\begin{proof}[Proof of Lemma \ref{lem:FedAvg_f_decay_one_step}]
Using $L$-smoothness (Assumption 1) of $f$, and only considering the randomness in the $t$-th round $\{ \set^{(t)}, \xi^{(t)} \}$,
\begin{align}
    \mbe_{\set^{(t)}, \xi^{(t)}} f(\bwtp) - f(\bwt) & \leq \mbe_{\set^{(t)}, \xi^{(t)}} \lan \G f(\bwt), \bwtp - \bwt \ran + \frac{L}{2} \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\bwtp - \bwt} \nn \\
    & = -\lrs \underbrace{\mbe_{\set^{(t)}, \xi^{(t)}} \lan \G f(\bwt), \frac{1}{\selclients} \sum_{i \in \set^{(t)}} \deltai \ran}_{T_1} + \frac{\lrs^2 L}{2} \underbrace{\mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \deltai}}_{T_2}. \label{eq_proof:lem:FedAvg_f_decay_one_step_1}
\end{align}
Next, we bound the terms $T_1$ and $T_2$ separately.
\begin{align}
    -T_1 &= -\mbe_{\xi^{(t)}} \lan \G f(\bwt), \frac{1}{\numclients} \sumin \bhit \ran \tag{from Assumption 2 and uniform sampling of clients} \\
    &= \frac{1}{2} \lb \mbe_{\xi^{(t)}} \norm{\G f(\bwt) - \frac{1}{\numclients} \sumin \bhit} - \norm{\G f(\bwt)} - \mbe_{\xi^{(t)}} \norm{\frac{1}{\numclients} \sumin \bhit} \rb. \label{eq_proof:lem:FedAvg_f_decay_one_step_2}
\end{align}
Next, we bound $T_2$.
\begin{align}
    T_2 & \leq 2 \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \lp \deltai - \bhit \rp} + 2 \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \bhit} \tag{Young's inequality} \\
    & \leq \frac{2}{\selclients} \frac{1}{\numclients} \sumin \mbe_{\xi^{(t)}} \norm{\deltai - \bhit} + 2 \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \bhit} \tag{uniform sampling of clients, $\mbe [ \deltai ] = \bhit$} \\
    & \leq \frac{2}{\selclients} \frac{1}{\numclients} \sumin \frac{\sigma^2}{\tau} + 2 \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \bhit}, \label{eq_proof:lem:FedAvg_f_decay_one_step_3}
\end{align}
where \eqref{eq_proof:lem:FedAvg_f_decay_one_step_3} follows from the following reasoning.
\begin{align*}
    \mbe_{\xi^{(t)}} \norm{\deltai -\bhit} &= \mbe_{\xi^{(t)}} \norm{\frac{1}{\tau} \sumkt \lp \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk) \rp} \tag{from \eqref{eq:FedAvg_update}} \\
    &= \frac{1}{\tau^2} \mbe_{\xi^{(t)}} \Bigg[ \sumkt \norm{\G f_i(\bwitk, \xiitk) - \G f_i(\bwitk)} \\
    & \ + 2 \sum_{j < k} \mbe_{\xi^{(t)}} \lan \underbrace{\mbe \lb \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk) \vert \bwitj \rb}_{=0}, \G f_i(\bwitj, \xiitj) - \G f_i(\bwitj) \ran \Bigg] \\
    & \leq \frac{\sigma^2}{\tau}. \tag{Assumption 2}
\end{align*}
Substituting the bounds on $T_1$ \eqref{eq_proof:lem:FedAvg_f_decay_one_step_2} and $T_2$ \eqref{eq_proof:lem:FedAvg_f_decay_one_step_3} in \eqref{eq_proof:lem:FedAvg_f_decay_one_step_1}, we get the result in the lemma.
\end{proof}


\begin{proof}[Proof of Lemma \ref{lem:FedAvg_grad_ht_err}]
We borrow some of the proof techniques from [Wang et al., 2020].
\begin{align}
    \mbe_{\xi^{(t)}} \norm{\G f(\bwt) - \bar{\bh}^{(t)}} & \leq \frac{1}{\numclients} \sumin \mbe_{\xi^{(t)}} \norm{\G f_i(\bwt) - \bhit} \tag{Jensen's inequality} \\
    &= \frac{1}{\numclients} \sumin \mbe_{\xi^{(t)}} \norm{\frac{1}{\tau} \sum_{k=0}^{\tau-1} \lp \G f_i(\bwt) - \G f_i(\bwitk) \rp} \tag{from \eqref{eq:FedAvg_update}} \\
    & \leq \frac{L^2}{\numclients}  \sumin \frac{1}{\tau} \sumkt \mbe_{\xi^{(t)}} \norm{\bwitk - \bwt}. \label{eq_proof:lem:FedAvg_grad_ht_err_1}
\end{align}
Next, we bound the individual difference $\mbe_{\xi^{(t)}} \norm{\bwitk - \bwt}$.
\begin{align}
    \mbe_{\xi^{(t)}} \norm{\bwitk - \bwt} &= \lrc^2 \mbe_{\xi^{(t)}} \norm{\sumjk \G f_i (\bwitj, \xiitj)} \nn \\
    &= \lrc^2 \lb \mbe_{\xi^{(t)}} \norm{\sumjk \lp \G f_i (\bwitj, \xiitj) - \G f_i (\bwitj) \rp} + \mbe_{\xi^{(t)}} \norm{\sumjk \G f_i (\bwitj)} \rb \nn \\
    & \leq \lrc^2 \lb \sumjk \mbe_{\xi^{(t)}} \norm{\G f_i (\bwitj, \xiitj) - \G f_i (\bwitj)} + k \sumjk \mbe_{\xi^{(t)}} \norm{\G f_i (\bwitj)} \rb \nn \\
    & \leq \lrc^2 \lb k \sigma^2 + k \sumjk \mbe_{\xi^{(t)}} \norm{\G f_i (\bwitj)} \rb. \label{eq_proof:lem:FedAvg_grad_ht_err_2}
\end{align}
Summing over $k = 0, \hdots, \tau-1$, we get
\begin{align}
    \frac{1}{\tau} \sumkt \mbe_{\xi^{(t)}} \norm{\bwitk - \bwt} & \leq \lrc^2 \frac{1}{\tau} \sumkt \lb k \sigma^2 + k \sumjk \mbe_{\xi^{(t)}} \norm{\G f_i (\bwitj) - \G f_i (\bwt) + \G f_i (\bwt)} \rb \nn \\
    & \leq \lrc^2 (\tau - 1) \sigma^2 + \frac{2 \lrc^2}{\tau} \sumkt k \sumjk \lb L^2 \mbe_{\xi^{(t)}} \norm{\bwitj - \bwt} + \norm{\G f_i (\bwt)} \rb \nn \\
    & \leq \lrc^2 (\tau - 1) \sigma^2 + 2 \lrc^2 L^2 \tau (\tau - 1) \lb \frac{1}{\tau} \sumkt \mbe_{\xi^{(t)}} \norm{\bwitk - \bwt} \rb \nn \\
    & \quad + 2 \lrc^2 \tau (\tau - 1) \norm{\G f_i (\bwt)}. \label{eq_proof:lem:FedAvg_grad_ht_err_3}
\end{align}
Define $D \triangleq 2 \lrc^2 L^2 \tau (\tau - 1)$. We choose $\lrc$ small enough such that $D \leq 1/2$. Then, rearranging the terms in \eqref{eq_proof:lem:FedAvg_grad_ht_err_3}, we get
\begin{align}
    \frac{1}{\tau} \sumkt \mbe_{\xi^{(t)}} \norm{\bwitk - \bwt} & \leq \frac{\lrc^2 (\tau - 1) \sigma^2}{1-D} + \frac{2 \lrc^2 \tau (\tau - 1)}{1-D} \norm{\G f_i (\bwt)}. \label{eq_proof:lem:FedAvg_grad_ht_err_4}
\end{align}
Substituting \eqref{eq_proof:lem:FedAvg_grad_ht_err_4} in \eqref{eq_proof:lem:FedAvg_grad_ht_err_1}, we get
\begin{align}
    \mbe_{\xi^{(t)}} \norm{\G f(\bwt) - \bar{\bh}^{(t)}} & \leq \frac{\lrc^2 L^2 (\tau - 1) \sigma^2}{1-D} + \frac{D}{1-D} \frac{1}{\numclients} \sumin \norm{\G f_i (\bwt) - \G f (\bwt) + \G f (\bwt)} \nn \\
    & \leq 2 \lrc^2 L^2 (\tau - 1) \sigma^2 + 4 D \sigma_g^2 + 4D \norm{\G f (\bwt)}. \tag{since $D \leq 1/2$}
\end{align}
\end{proof}

\begin{proof}[Proof of Lemma \ref{lem:FedAvg_avg_ht_err}]
Also, 
\begin{align}
    & \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \lp \bhit - \G f_i(\bwt) + \G f_i(\bwt) \rp - \G f(\bwt) + \G f(\bwt)} \nn \\
    & \leq 3 \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \lp \bhit - \G f_i(\bwt) \rp} + 3 \mbe_{\set^{(t)}} \norm{\frac{1}{\selclients} \sum_{i \in \set^{(t)}} \G f_i(\bwt) - \G f(\bwt)} + 3 \mbe \norm{\G f(\bwt)} \nn \\
    & \leq 3 \mbe_{\set^{(t)}, \xi^{(t)}} \lb \frac{1}{\selclients} \sum_{i \in \set^{(t)}} \norm{\bhit - \G f_i(\bwt)} \rb + 3 \frac{\numclients - \selclients}{(\numclients - 1) \selclients} \frac{1}{\numclients} \sumin \mbe \norm{\G f_i(\bwt) - \G f(\bwt)} + 3 \mbe \norm{\G f(\bwt)} \tag{sampling without replacement, see Lemma \ref{lem:sample_WR}} \\
    & \leq \frac{3}{\numclients} \sumin \mbe \norm{\bhit - \G f_i(\bwt)} + \frac{3(\numclients - \selclients)}{(\numclients - 1) \selclients} \sigma_g^2 + 3 \mbe \norm{\G f(\bwt)}. \nn
\end{align}
\end{proof}























% \newpage
\section{Convergence Result for \texttt{\algoname} (Theorem 2)}
\label{app:\algoname}

In this section we prove the convergence result for \texttt{\algoname} in Theorem 2, and provide the complexity and communication guarantees.

We organize this section as follows. First, in Section~\ref{app:\algoname_int_results} we present some intermediate results, which we use to prove the main theorem. Next, in Section~\ref{app:\algoname_thm_proof}, we present the proof of Theorem 2, which is followed by the proofs of the intermediate results in Section~\ref{app:\algoname_int_results_proofs}.



\subsection{Intermediate Lemmas} \label{app:\algoname_int_results}


\begin{lem}
\label{:algoname_f_decay_one_step}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumption 2, then the iterates $\{ \bwt \}_t$ generated by \texttt{\algoname} satisfy
\begin{align}
    \Eg{\set^{(t)}, \xi^{(t)}}{f(\bwtp)} & \leq f(\bwt) - \frac{\lrs}{2} \left[ \norm{\G f(\bwt)} + \mbe_{\xi^{(t)}} \norm{\bar{\bh}^{(t)}} - \mbe_{\xi^{(t)}} \norm{\G f(\bwt) - \bar{\bh}^{(t)}} \right] \nn \\
    & \quad + \frac{\lrs^2 L}{2}\Eg{\set^{(t)}, \xi^{(t)}}{\norm{\bvt}},
\end{align}
where $\lrs = \lrss \lrc \tau$ is the effective server learning rate, and $\Eg{\set^{(t)}, \xi^{(t)}}{\cdot}$ is expectation over the randomness in the $t$-th round, conditioned on $\bwt$.
\end{lem}


\begin{lem}
\label{:algoname_vt_ht_err}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumption 2.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{\algoname} satisfy
\begin{align*}
    \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\bvt-\bar{\bh}^{(t)}} & \leq \frac{\sigma^2}{\selclients \tau} + \frac{4 (\numclients - \selclients)}{\selclients (\numclients - 1)} \lb \frac{1}{\numclients} \sumin \Eg{\xi^{(t)}}{\norm{\bhit - \G f_i(\bwt)}} + \lrs^2 L^2 \norm{\bv^{(t-1)}} \rb \nn \\
    & \quad + \frac{2 (\numclients - \selclients)}{\selclients (\numclients - 1)} \frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(t-1)}) - \byit}.
\end{align*}
\end{lem}


\begin{lem}
\label{:algoname_grad_yt_err}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumptions 2, 3.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{\algoname} satisfy
\begin{align}
    & \Eg{\set^{(t)}, \xi^{(t)}}{\frac{1}{\numclients}\sum_{j=1}^{\numclients} \norm{\G f_j(\bwt) - \byjtp}} \leq \frac{\selclients}{\numclients} \left[ \frac{\sigma^2}{\tau} + \frac{1}{\numclients} \sumjn \mbe_{\xi^{(t)}} \norm{\G f_j(\bwt) - \bhjt} \right] \nn \\
    & \qquad + \left(1-\frac{\selclients}{\numclients}\right) \lb \left(1+\frac{1}{\beta}\right) \lrs^2 L^2\norm{\bv^{(t-1)}} + (1+\beta) \frac{1}{\numclients} \sum_{j=1}^{\numclients} \norm{{\G f_j(\bw^{(t-1)}) - \byjt}} \rb, \nn
\end{align}
for any positive scalar $\beta$.
\end{lem}
We also use the bound on $\frac{1}{\numclients} \sumjn \mbe_{\xi^{(t)}} \norm{\G f_j(\bwt) - \bhjt}$ from Lemma \ref{lem:FedAvg_grad_ht_err} in the previous section.

\subsection{Proof of Theorem 2}
\label{app:\algoname_thm_proof}
For the sake of completeness, first we state the complete theorem statement.

\begin{thm*}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumptions 2, 3. Further, the client learning rate $\lrc$, and the server learning rate $\lrss$ are chosen such that $\lrc \leq \frac{1}{10 L \tau}$, $\lrss \lrc \leq \min \lcb \frac{\selclients^{3/2}}{8 L \tau \numclients}, \frac{5 \selclients}{48 \tau L}, \frac{1}{4 L \tau} \rcb$.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{\algoname} satisfy
\begin{align*}
    \min_{t \in [T]} \mbe \norm{\G f(\bwt)} & \leq \underbrace{\mco \lp \frac{f(\bw^{(0)}) - f^*}{\lrss \lrc \tau T} \rp}_{\text{Effect of initialization}} + \underbrace{\mco \lp \frac{\lrss \lrc L \sigma^2}{\selclients} + \lrc^2 L^2 (\tau - 1) \sigma^2 \rp}_{\text{Stochastic Gradient Error}} + \underbrace{\mco \lp \lrc^2 L^2 \tau (\tau - 1) \sigma_g^2 \rp}_{\text{Client Drift Error}}
\end{align*}
where $f^* = \min_{\mathbf{x}} f(\mathbf{x})$.
\end{thm*}

\begin{coro}
% \label{corro_1}
Setting $\lrc = \frac{1}{\sqrt{T}\tau L}$ and $\lrss = \sqrt{\tau \selclients}$, \texttt{\algoname} converges to a stationary point of the global objective $f(\bw)$ at a rate given by,
\begin{align*}
    &\min_{t \in \{0, \hdots, T-1 \}} \mbe \norm{\nabla  f(\bw^{(t)})} \leq \underbrace{\bigO{\frac{1}{\sqrt{\selclients \tau T}}}}_{\text{stochastic gradient error}} + \underbrace{\bigO{\frac{1}{T}}}_{\text{client drift error}}.
\end{align*}
\end{coro}

\begin{proof}
We define the Lyapunov function as below for some $\alpha$ and $\frac{\lrs^2 L}{2} \leq \delta \leq \frac{\lrs}{2}$. A necessary condition for this to be satisfied is $\lrs = \lrss \lrc \tau \leq 1/L$. The precise choice of $\alpha, \delta$ will be discussed later.
\begin{align}
    R^{(t+1)} \triangleq \E{f(\bw^{(t+1)}) + \left(\delta - \frac{\lrs^2 L}{2}\right)\norm{\bvt} + \alpha\frac{1}{\numclients}\sum_{j=1}^{\numclients}\norm{\G f_j(\bwt) - \byjtp}}. \label{eq:lyapunov_defn}
\end{align}
Using Lemma \ref{:algoname_f_decay_one_step},
\begin{align}
    R^{(t+1)} & \leq \mathbb{E} \left[ f(\bwt) - \frac{\lrs}{2} \norm{\G f(\bwt)} + \frac{\lrs}{2} \frac{1}{\numclients} \sumjn \mbe_{\xi^{(t)}} \norm{\G f_j(\bwt) - \bhjt} - \frac{\lrs}{2} \mbe_{\xi^{(t)}} \norm{\bar{\bh}^{(t)}} + \delta \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\bvt} \right. \nn \\
    &\hspace{10pt}\left. + \alpha\frac{1}{\numclients}\sum_{j=1}^{\numclients}\Eg{\set^{(t)}, \xi^{(t)}}{\norm{\G f_j(\bwt) - \byjtp }}\right] \tag{Jensen's inequality} \\
    &\leq \mathbb{E}\left[f(\bwt) -\frac{\lrs}{2} \norm{\G f(\bwt)} + \frac{\lrs}{2} \frac{1}{\numclients} \sumjn \Eg{\xi^{(t)}}{\norm{\G f_j(\bwt) - \bhjt}} + \delta\Eg{\set^{(t)}, \xi^{(t)}}{\norm{\bvt-\bar{\bh}^{(t)}}}\right. \nn \\
    &\left.\hspace{20pt} + \alpha\frac{1}{\numclients}\sum_{j=1}^{\numclients}\Eg{\set^{(t)}, \xi^{(t)}}{\norm{\G f_j(\bwt) - \by_j^{(t+1)}}} \right], \label{eq_proof:thm:\algoname_1}
\end{align}
where for the last line we use that $\Eg{\set^{(t)}, \xi^{(t)}}{\norm{\bvt}} = \Eg{\set^{(t)}, \xi^{(t)}}{\norm{\bvt-\bar{\bh}^{(t)}}} + \Eg{\xi^{(t)}}{\norm{\bar{\bh}^{(t)}}}$ and $\delta \leq \frac{\lrs}{2}$. Next, define $C^{(t)} \triangleq \frac{1}{\numclients}\sumjn\Eg{\xi^{(t)}}{\norm{\G f_j(\bwt) - \bhjt}}$. Substituting the bounds from Lemma \ref{:algoname_vt_ht_err} and Lemma \ref{:algoname_grad_yt_err} in \eqref{eq_proof:thm:\algoname_1} we get,
\begin{align}
    R^{(t+1)} & \leq \E{f(\bwt) -\frac{\lrs}{2}\norm{\G f(\bwt)}+ \left(\frac{\lrs}{2} + \frac{4\delta}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \frac{\alpha \selclients}{\numclients} \right) C^{(t)} + \left(\frac{ \delta}{\selclients} + \frac{\alpha \selclients}{\numclients}\right) \frac{\sigma^2}{\tau}} \nn \\
    & \quad + \left(\frac{4\delta \lrs^2 L^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \alpha \left( 1 - \frac{\selclients}{\numclients} \right) \left( 1 + \frac{1}{\beta} \right) \lrs^2 L^2 \right) \mbe \norm{\bv^{(t-1)}} \nn \\
    & \quad + \left( \frac{2\delta}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \alpha \left( 1 - \frac{\selclients}{\numclients} \right) (1 + \beta) \right) \frac{1}{\numclients} \sum_{j=1}^{\numclients} \mbe \norm{\G f_j(\bw^{(t-1)}) - \byjt }. \label{eq_proof:thm:\algoname_2}
\end{align}

\paragraph{Choice of $\alpha, \delta$.}
Our goal is now to find a suitable $\delta$ and $\alpha$ such that,
\begin{align*}
    \left(\frac{4\delta \lrs^2 L^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \alpha\left(1-\frac{\selclients}{\numclients}\right)\left(1+\frac{1}{\beta}\right)\lrs^2 L^2\right) & \leq \delta - \frac{\lrs^2 L}{2}, \nn \\
    \left(\frac{2\delta}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \alpha \left(1-\frac{\selclients}{\numclients}\right)(1+\beta)\right) & \leq \alpha
\end{align*}
We define $A = \left(1-\frac{\selclients}{\numclients} \right)\left(1+\frac{1}{\beta} \right)$ and $B = \left(1-\frac{\selclients}{\numclients}\right)(1+\beta)$. In case of full client participation, $\selclients = \numclients$, and $A = B = 0$. The resulting condition on $\alpha$ is
\begin{align*}
    \alpha \geq \frac{2\delta}{\selclients (1-B)} \frac{(\numclients - \selclients)}{(\numclients - 1)}, \qquad \beta \leq \frac{\selclients}{\numclients - \selclients}.
\end{align*}
We set $\alpha = \frac{2\delta}{\selclients (1-B)} \frac{(\numclients - \selclients)}{(\numclients - 1)}$, and our condition on $\delta$ then reduces to,
\begin{align*}
    \delta \geq \frac{\lrs^2 L/2}{1 - \frac{4 \lrs^2 L^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} - \frac{2 A \lrs^2 L^2}{\selclients (1-B)} \frac{(\numclients - \selclients)}{(\numclients - 1)}}
\end{align*}
We want $\lrs$ such that,
\begin{align*}
    \frac{4 \lrs^2 L^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \frac{2 A \lrs^2 L^2}{\selclients (1-B)} \frac{(\numclients - \selclients)}{(\numclients - 1)} \leq \frac{1}{2}
\end{align*}
A sufficient condition for this is
% \ps{not sure of the reasoning for this} 
\begin{align}
    \frac{4 \lrs^2 L^2}{\selclients} \leq \frac{1}{4}, \qquad \text{ and } \qquad \frac{2 A \lrs^2 L^2}{\selclients (1-B)} \leq \frac{1}{4}. \label{eq:cond_lrs_3}
\end{align}
% \ps{This choice of $\beta$ won't work for full client participation. Need another condition for that simpler case.}
For $\beta = \frac{\selclients}{2(\numclients - \selclients)}$, $B = 1 - \frac{\selclients}{2 \numclients}$ and $A = \lp 1 - \frac{\selclients}{\numclients} \rp \lp \frac{2 \numclients}{\selclients} - 1 \rp \leq \frac{2 \numclients}{\selclients}$. A sufficient condition for \eqref{eq:cond_lrs_3} to be satisfied is
\begin{align*}
    \lrs \leq \min \lcb \frac{\sqrt{\selclients}}{4 L}, \frac{\selclients^{3/2}}{8 L \numclients} \rcb \quad \Rightarrow \quad \lrss \lrc \leq \min \lcb \frac{\sqrt{\selclients}}{4 \tau L}, \frac{\selclients^{3/2}}{8 L \tau \numclients} \rcb.
\end{align*}
With \eqref{eq:cond_lrs_3} we have $\delta \geq \lrs^2 L$. We set $\delta = 2\lrs^2 L$ which gives us $\alpha = \frac{8 \numclients \lrs^2 L}{\selclients^2} \frac{(\numclients - \selclients)}{(\numclients - 1)}$. Since $\delta \leq \frac{\lrs}{2}$, we also need $\lrs \leq \frac{1}{4L}$.

With this choice of $\alpha, \delta$, from \eqref{eq_proof:thm:\algoname_2} we get
\begin{align}
    R^{(t+1)} &\leq R^{(t)} - \frac{\lrs}{2} \mbe \norm{\G f(\bwt)} + \left( \frac{\lrs}{2} + \frac{4\delta}{\selclients} + \frac{\alpha \selclients}{\numclients} \right) C^{(t)} + \left(\frac{\delta}{\selclients} + \frac{\alpha \selclients}{\numclients} \right) \frac{\sigma^2}{\tau} \nn \\
    &\leq R^{(t)} - \frac{\lrs}{2}\norm{\G f(\bwt)} +  3\lrs C^{(t)} + \frac{10 \lrs^2 L}{\selclients} \frac{\sigma^2}{\tau}
\end{align}
where we use the condition that $\lrs \leq \frac{5 \selclients}{48 L}$.
Further, we can bound $C^{(t)} = \frac{1}{\numclients}\sumjn\Eg{\xi^{(t)}}{\norm{\G f_j(\bwt) - \bhjt}}$ using Lemma \ref{lem:FedAvg_grad_ht_err}, which gives us
\begin{align*}
    R^{(t+1)} \leq R^{(t)} - \frac{\lrs}{2} \lp 1 - 48 \lrc^2 L^2 \tau (\tau-1) \rp \norm{\G f(\bwt)} + 6 \lrs \lrc^2 L^2 (\tau-1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{10 \lrs^2 L \sigma^2}{\selclients \tau}.
\end{align*}
Using the condition on $\lrc$ that $\lrc \leq \frac{1}{10 L \tau}$, and unrolling the recursion we get,
\begin{align}
    R^{(t)} &\leq R^{(1)} + \sum_{s=1}^{t-1}\left(-\frac{\lrs}{4} \mbe \norm{\G f(\bw^{(s)})} + 6 \lrs \lrc^2 L^2 (\tau-1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{10 \lrs^2 L \sigma^2}{\selclients \tau} \right). \label{eq_proof:thm:\algoname_3}
\end{align}
Next, we bound $R^{(1)}$.
Using \eqref{eq_proof:thm:\algoname_1} and \eqref{:algoname_grad_yt_err_1} we can bound $R^{(1)}$ as follows,
\begin{align}
    R^{(1)} \leq & f(\bw^{(0)}) -\frac{\lrs}{2}\norm{\G f(\bw^{(0)})} +\left(\frac{\lrs}{2}+ \frac{\alpha \selclients}{\numclients} \right) C^{(0)} + \frac{\alpha \selclients}{\numclients} \frac{\sigma^2}{\tau} \nn \\
    & \hspace{10pt} + \delta \Eg{\xi^{(0)},\set^{(0)}}{\norm{\bv^{(0)}-\bar{\bh}^{(0)}}}+ \alpha\left(1-\frac{\selclients}{\numclients}\right)\frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(0)})} \nn \\
    & \leq f(\bw^{(0)}) -\frac{\lrs}{4}\norm{\G f(\bw^{(0)})} + 4 \lrs \lrc^2 L^2 (\tau-1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{8 \lrs^2 L \sigma^2}{\selclients \tau} \nn \\
    & \hspace{10pt} + \delta \Eg{\xi^{(0)},\set^{(0)}}{\norm{\bv^{(0)}-\bar{\bh}^{(0)}}}+ \alpha\left(1-\frac{\selclients}{\numclients}\right)\frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(0)})}. \label{eq_proof:thm:\algoname_4}
\end{align}
Substituting the bound on $R^{(1)}$ from \eqref{eq_proof:thm:\algoname_4} in \eqref{eq_proof:thm:\algoname_3}, and using $t = T$ we get
\begin{align*}
    R^{(T)} \leq & f(\bw^{(0)}) + \sum_{t=0}^{T-1}\left(-\frac{\lrs}{4} \mbe \norm{\G f(\bwt)} + 6 \lrs \lrc^2 L^2 (\tau-1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{10 \lrs^2 L \sigma^2}{\selclients \tau} \right)\\
    & \hspace{10pt} + 2\lrs^2 L \Eg{\xi^{(0)},\set^{(0)}}{\norm{\bv^{(0)}-\bar{\bh}^{(0)}}}+ \frac{8 \numclients \lrs^2 L}{\selclients^2} \left( 1-\frac{\selclients}{\numclients}\right)\frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(0)})}.
\end{align*}
Rearranging the terms, we get
\begin{align*}
    \min_{t \in [0,T-1]} \mbe \norm{\G f(\bwt)} & \leq \frac{1}{T} \sumtT \mbe \norm{\G f(\bwt)} \nn \\
    &\leq \frac{4(f(\bw^{(0)}) - f^{*})}{\lrs T} + \frac{40 \lrs L \sigma^2}{\selclients \tau} + 24\lrc^2L^2(\tau-1)\sigma^2 + 96\lrc^2L^2\tau(\tau-1)\sigma_g^2\\ 
    & \hspace{10pt} + \frac{1}{T} \lb 8 \lrs L \Eg{\xi^{(0)},\set^{(0)}}{\norm{\bv^{(0)}-\bar{\bh}^{(0)}}}+ \frac{32 \numclients \lrs L}{\selclients^2} \left( 1-\frac{\selclients}{\numclients}\right)\frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(0)})}  \rb \\
    & = \mathcal{O} \left( \frac{(f(\bw^{(0)})-f^*)}{\lrs T} \right) + \mathcal{O}\left(\frac{\lrs L \sigma^2}{\selclients \tau} \right) + \mathcal{O}( \lrc^2L^2(\tau-1)\sigma^2 + \lrc^2 L^2 \tau(\tau-1)\sigma_g^2)   
\end{align*}
\end{proof}


\subsection{Proofs of the Intermediate Lemmas}
\label{app:\algoname_int_results_proofs}

\begin{proof}[Proof of Lemma \ref{:algoname_f_decay_one_step}]
Using $L$-smoothness (Assumption 1) of $f$,
\begin{align}
    f(\bwtp) & \leq f(\bwt) - \lrs \lan \G f(\bwt), \bvt \ran + \frac{\lrs^2 L}{2} \norm{\bvt}. \nn
\end{align}
Taking expectation only over the randomness in the $t$-th round: due to client sampling (inherent in $\set^{(t)}$) and due to stochastic gradients (inherent in $\xi^{(t)} \triangleq \{ \xiitk \}_{i,k}$), we get
\begin{align}
    \Eg{\set^{(t)}, \xi^{(t)}}{f(\bwtp)} & \leq f(\bwt) - \lrs \Ex_{\set^{(t)}, \xi^{(t)}} \lan \G f(\bwt),\bvt \ran + \frac{\lrs^2 L}{2} \Ex_{\set^{(t)}, \xi^{(t)}} \norm{\bvt} \nn \\
    & \overset{(a)}{=} f(\bwt) - \lrs \mbe_{\xi^{(t)}} \lan \G f(\bwt), \bar{\bh}^{(t)} \ran + \frac{\lrs^2 L}{2} \Ex_{\set^{(t)}, \xi^{(t)}} \norm{\bvt} \nn \\
    & = f(\bwt) - \frac{\lrs}{2} \left[ \norm{\G f(\bwt)} + \mbe_{\xi^{(t)}} \norm{\bar{\bh}^{(t)}} - \mbe_{\xi^{(t)}} \norm{\G f(\bwt) - \bar{\bh}^{(t)}} \right]+ \frac{\lrs^2 L}{2}\Ex_{\set^{(t)}, \xi^{(t)}} \norm{\bvt}, \nn
\end{align}
where $(a)$ follows since
\begin{align*}
    \mbe_{\set^{(t)}, \xi^{(t)}} \lb \bvt \rb &= \mbe_{\set^{(t)}, \xi^{(t)}} \lb \frac{1}{\selclients} \sum_{i \in \mathcal{S}^{(t)}} \deltai \rb - \mbe_{\set^{(t)}} \lb \frac{1}{\selclients} \sum_{i \in \mathcal{S}^{(t)}} \byit \rb + \byt \\
    &= \frac{1}{\numclients} \sumin \mbe_{\xi^{(t)}} \lb \deltai \rb - \byt + \byt \tag{uniform sampling of clients} \\
    &= \frac{1}{\numclients} \sumin \mbe_{\xi^{(t)}} \lb \frac{1}{\tau} \sumkt \G f_i(\bwitk, \xiitk) \rb \\
    &= \mbe \lb \bar{\bh}^{(t)} \rb.
\end{align*}
\end{proof}


\begin{proof}[Proof of Lemma \ref{:algoname_vt_ht_err}]

\begin{align}
    & \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\bvt-\bar{\bh}^{(t)}} \nn \\
    &= \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}} \left(\deltai - \byit + \frac{1}{\numclients} \sum_{j=1}^{\numclients} \byjt - \bar{\bh}^{(t)}\right)} \tag{Server update direction in \texttt{\algoname}} \\
    & = \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\deltai - \bhit + \bhit - \byit + \frac{1}{\numclients}\sum_{j=1}^{\numclients}\byjt - \bar{\bh}^{(t)}\right)} \nn \\
    & \overset{(a)}{=} \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\deltai -\bhit\right)} + \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\bhit - \byit + \frac{1}{\numclients}\sum_{j=1}^{\numclients}\byjt - \bar{\bh}^{(t)}\right)} \nn \\
    & \overset{(b)}{=} \frac{1}{\selclients^2} \mbe_{\set^{(t)}, \xi^{(t)}} \lb \sum_{i \in \set^{(t)}} \norm{\deltai -\bhit} \rb + \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\bhit - \byit + \frac{1}{\numclients}\sum_{j=1}^{\numclients}\byjt - \bar{\bh}^{(t)}\right)} \nn \\
    & \overset{(c)}{\leq} \frac{\sigma^2}{\tau \selclients} + \underbrace{ \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\bhit - \byit + \frac{1}{\numclients}\sum_{j=1}^{\numclients}\byjt - \bar{\bh}^{(t)}\right)}}_{T_1}. \label{eq_proof::algoname_vt_ht_err_1}
\end{align}
where $(a)$ follows from the following reasoning.
\begin{align*}
    & \mbe_{\set^{(t)}, \xi^{(t)}} \lan \frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left( \deltai - \bhit \right), \frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left( \bhit - \byit + \frac{1}{\numclients}\sum_{j=1}^{\numclients}\byjt - \bar{\bh}^{(t)}\right) \ran \\
    &= \frac{1}{\selclients^2} \mbe_{\set^{(t)}} \lb \sum_{i \in \set^{(t)}} \mbe_{\xi^{(t)}} \lan \deltai - \bhit, \bhit - \byit + \byt - \bar{\bh}^{(t)} \ran + \sum_{i \in \set^{(t)}} \sum_{\substack{j \in \set^{(t)} \\ i \neq j}} \mbe_{\xi^{(t)}} \lan \deltai - \bhit, \bhjt - \byjt + \byt - \bar{\bh}^{(t)} \ran \rb \\
    &= \frac{1}{\selclients^2} \mbe_{\set^{(t)}} \lb \sum_{i \in \set^{(t)}} \mbe_{\xi^{(t)}} \lan \deltai - \bhit, \bhit - \byit + \byt - \bar{\bh}^{(t)} \ran \rb \tag{Assumption 2; independence of stochastic gradients across clients} \\
    &= \frac{1}{\selclients^2} \mbe_{\set^{(t)}} \lb \sum_{i \in \set^{(t)}} \mbe_{\xi^{(t)}} \lan \deltai - \bhit, \bhit - \bar{\bh}^{(t)} \ran \rb \tag{since $\{ \byit \}$ are independent of $\set^{(t)}, \xi^{(t)}$} \\
    &= \frac{1}{(\tau \selclients)^2} \mbe_{\set^{(t)}} \lb \sum_{i \in \set^{(t)}} \mbe_{\xi^{(t)}} \lan \sumkt \lp \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk) \rp, \sumjt \G f_i(\bwitj) - \frac{1}{\numclients} \sum_{\ell=1}^\numclients \sumjt \G f_\ell (\bw_\ell^{(t,j)}) \ran \rb \\
    &= \frac{1}{(\tau \selclients)^2} \mbe_{\set^{(t)}} \lb \sum_{i \in \set^{(t)}} \mbe_{\xi^{(t)}} \lb \sumkt \lan \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk), \G f_i(\bwitk) - \frac{1}{\numclients} \sum_{\ell=1}^\numclients \G f_\ell (\bw_\ell^{(t,k)}) \ran \rb \rb \\
    & \quad + \frac{1}{(\tau \selclients)^2} \mbe_{\set^{(t)}} \lb \sum_{i \in \set^{(t)}} \mbe_{\xi^{(t)}} \lb \sumkt \sum_{j \neq k} \lan \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk), \G f_i(\bwitj) - \frac{1}{\numclients} \sum_{\ell=1}^\numclients \G f_\ell (\bw_\ell^{(t,j)}) \ran \rb \rb \\
    &= \frac{1}{(\tau \selclients)^2} \mbe_{\set^{(t)}} \lb \sum_{i \in \set^{(t)}} \mbe_{\xi^{(t)}} \lb \sumkt \lan \mbe \lb \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk) \vert \bwitk \rb, \G f_i(\bwitk) - \frac{1}{\numclients} \sum_{\ell=1}^\numclients \G f_\ell (\bw_\ell^{(t,k)}) \ran \rb \rb \\
    & \quad + \frac{2}{(\tau \selclients)^2} \mbe_{\set^{(t)}} \lb \sum_{i \in \set^{(t)}} \mbe_{\xi^{(t)}} \lb \sum_{j < k} \lan \mbe \lb \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk) \vert \bwitj \rb, \G f_i(\bwitj) - \frac{1}{\numclients} \sum_{\ell=1}^\numclients \G f_\ell (\bw_\ell^{(t,j)}) \ran \rb \rb \\
    &= 0.
\end{align*}
% \ps{We can even have equality instead of the inequality and removes the 2's, since $\mbe_{\xi^{(t)}} \lb \deltai \rb = \bhit$}
% Note that $\bhit$ is also random in this case i.e, $\E[\bhit] = \E[\deltai]$ so the cross-term will not necessarily be zero.

Further, $(b)$ follows since $\Eg{\xi^{(t)}} {\langle \deltai-\bhit, \deltaj-\bhjt \rangle} = 0$
for $i \neq j$. 
% and $\Eg{\xi^{(t)}}{\norm{\deltai - \bhit}} \leq \frac{\sigma^2}{\tau}$.
Finally, $(c)$ follows from the following reasoning.
\begin{align*}
    \mbe \norm{\deltai -\bhit} &= \frac{1}{\tau^2} \mbe \norm{\sumkt \lp \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk) \rp} \\
    &= \frac{1}{\tau^2} \mbe \Bigg[ \sumkt \norm{\G f_i(\bwitk, \xiitk) - \G f_i(\bwitk)} \\
    & \qquad + 2 \sum_{j < k} \lan \mbe \lb \G f_i(\bwitk, \xiitk) - \G f_i(\bwitk) \vert \bwitj \rb, \G f_i(\bwitj, \xiitj) - \G f_i(\bwitj) \ran \Bigg] \\
    & \leq \frac{\sigma^2}{\tau}. \tag{Assumption 2}
\end{align*}


Next, we bound $T_1$ in \eqref{eq_proof::algoname_vt_ht_err_1}.
\begin{align}
    & \mbe_{\set^{(t)}, \xi^{(t)}}{\norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\bhit - \byit + \byt -\bar{\bh}^{(t)}\right)}} \nn \\
    % & =\Eg{t}{\norm{\left(\bhit - \bh_i^{(t-1)}\right) - \left(\bar{\bh}^{(t)} - \bar{h}^{(t-1)}\right) +\left( \bh_i^{(t-1)} - \by_i^{(t-1)} + y^{(t-1)} -\bar{h}^{(t-1)}\right)}}\\
    & = \mbe_{\set^{(t)}, \xi^{(t)}}{\norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\bhit - \G f_i(\bw^{(t-1)}) - (\bar{\bh}^{(t)} - \G f(\bw^{(t-1)})) +\left( \G f_i(\bw^{(t-1)}) - \byit + \byt -\G f(\bw^{(t-1)})\right)\right)}} \nn \\
    % & = 2 \Eg{t}{\norm{\left(\bhit - \bh_i^{(t-1)}\right) - \left(\bar{\bh}^{(t)} -\bar{h}^{(t-1)}\right)}}+ 2 \Eg{t}{\norm{h_i^{(t-1)} - \by_i^{(t-1)} + \frac{1}{\numclients}\sum_{j=1}^{\numclients}y_j^{(t-1)} -\bar{h}^{(t-1)}}}\\
    & \leq 2 \mbe_{\set^{(t)}, \xi^{(t)}}{\norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\bhit -\G f_i(\bw^{(t-1)}) - \left(\bar{\bh}^{(t)} -\G f(\bw^{(t-1)})\right)\right)}} \nn \\
    & \qquad + 2 \Eg{\set^{(t)}}{\norm{\frac{1}{\selclients}\sum_{i \in \set^{(t)}}\left(\G f_i(\bw^{(t-1)})  - \byit + \byt -\G f(\bw^{(t-1)})\right)}} \nonumber\\
    & = \frac{2(\numclients - \selclients)}{\selclients (\numclients - 1) \numclients} \sumin \Eg{\xi^{(t)}}{\norm{\bhit - \G f_i(\bw^{(t-1)})-\bar{\bh}^{(t)} +\G f(\bw^{(t-1)})}} \nn \\
    & \qquad + \frac{2(\numclients - \selclients)}{\selclients (\numclients - 1) \numclients} \sumin \norm{\G f_i(\bw^{(t-1)}) - \byit + \byt -\G f(\bw^{(t-1)})} \tag{Lemma \ref{lem:sample_WR}} \\
    & \overset{(d)}{\leq} \frac{2(\numclients - \selclients)}{\selclients (\numclients - 1) \numclients} \lb \sumin \Eg{\xi^{(t)}}{\norm{\bhit - \G f_i(\bw^{(t-1)})}}
    + \sumin \norm{\G f_i(\bw^{(t-1)}) - \byit} \rb \tag{$\because \text{Var}(X) \leq \E{X^2}$} \\
    & = \frac{2(\numclients - \selclients)}{\selclients (\numclients - 1) \numclients} \lb \sumin\Eg{\xi^{(t)}}{\norm{\bhit - \G f_i(\bwt) + \G f_i(\bwt)-\G f_i(\bw^{(t-1)})}} +  \sumin \norm{\G f_i(\bw^{(t-1)}) - \byit} \rb \nn \\
    & \leq \frac{2(\numclients - \selclients)}{\selclients (\numclients - 1) \numclients} \lb 2 \sumin \Eg{\xi^{(t)}}{\norm{\bhit - \G f_i(\bwt)}} + 2 \numclients \lrs^2 L^2 \norm{\bv^{(t-1)}} + \sumin \norm{\G f_i(\bw^{(t-1)}) - \byit} \rb. \label{eq_proof::algoname_vt_ht_err_2}
\end{align}
Finally, substituting \eqref{eq_proof::algoname_vt_ht_err_2} in \eqref{eq_proof::algoname_vt_ht_err_1}, we get the result in the lemma.
\end{proof}


\begin{proof}[Proof of Lemma \ref{:algoname_grad_yt_err}]
\begin{align}
    & \Eg{\set^{(t)}, \xi^{(t)}}{\frac{1}{\numclients}\sum_{j=1}^{\numclients} \norm{\G f_j(\bwt) - \byjtp}} \nn \\
    & = \frac{1}{\numclients}\sum_{j=1}^{\numclients} \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\G f_j(\bwt) - \byjtp} \nn \\
    & = \frac{1}{\numclients}\sum_{j=1}^{\numclients}\left[\frac{\selclients}{\numclients} \Eg{\xi^{(t)}}{\norm{\G f_j(\bwt) - \deltaj}} + \left(1-\frac{\selclients}{\numclients}\right)\norm{\G f_j(\bwt) - \byjt}\right] \label{:algoname_grad_yt_err_1} \\
    & = \frac{1}{\numclients}\sum_{j=1}^{\numclients}\left[\frac{\selclients}{\numclients} \Eg{\xi^{(t)}}{\norm{\G f_j(\bwt) - \deltaj}} + \left(1-\frac{\selclients}{\numclients}\right)\norm{\G f_j(\bwt) - \G f_j(\bw^{(t-1)}) + \G f_j(\bw^{(t-1)}) - \byjt}\right] \nonumber \\
    & \leq \frac{1}{\numclients}\sum_{j=1}^{\numclients} \left[ \frac{\selclients \sigma^2}{\numclients \tau} + \frac{\selclients}{\numclients} \mbe_{\xi^{(t)}} \norm{\G f_j(\bwt) - \bhjt} + \left(1-\frac{\selclients}{\numclients}\right)\norm{\G f_j(\bwt) - \G f_j(\bw^{(t-1)}) + \G f_j(\bw^{(t-1)}) - \byjt }\right] \nonumber \\
    & \leq \frac{\selclients}{\numclients} \left[ \frac{\sigma^2}{\tau} + \frac{1}{\numclients} \sumjn \mbe_{\xi^{(t)}} \norm{\G f_j(\bwt) - \bhjt} \right] \nn \\
    & + \left(1-\frac{\selclients}{\numclients}\right) \lb \left(1+\frac{1}{\beta}\right) \lrs^2 L^2\norm{\bv^{(t-1)}} + (1+\beta) \frac{1}{\numclients} \sum_{j=1}^{\numclients} \norm{{\G f_j(\bw^{(t-1)}) - \byjt}} \rb. \tag{$\beta$ is a positive constant}
\end{align}
\end{proof}











% \newpage
\section{Convergence Result for \texttt{\clusteralgoname} (Theorem 3)}
\label{app:clusteralgoname}

In this section we prove the convergence result for \texttt{\clusteralgoname} in Theorem 3, and provide the complexity and communication guarantees.

We organize this section as follows. First, in Section~\ref{app:clusteralgoname_int_results} we present some intermediate results, which we use to prove the main theorem. Next, in Section~\ref{app:clusteralgoname_thm_proof}, we present the proof of Theorem 3, which is followed by the proofs of the intermediate results in Section~\ref{app:clusteralgoname_int_results_proofs}.

\begin{algorithm}
\caption{\texttt{\clusteralgoname} }
\label{clusteralgoname}
\begin{algorithmic}[1]
\State \textbf{Input:} initial model $\bw^{(0)}$, server learning rate $\lrss$, client learning rate $\lrc$, local SGD steps $\tau$, $\lrs = \lrss \lrc \tau $, number of rounds $T$, number of clusters $K$, initial cluster states $\by_k^{(0)} = \mathbf{0}$ for all $k \in [K]$, cluster identities $c_i \in [K]$ for all $i \in [\numclients]$, cluster sets $\mathcal{C}_k = \{ i: c_i = k\} \text{ for all } k \in [K]$

\For {$t = 0,1,\dots, T-1$}
\State Sample $\set^{(t)} \subseteq [\numclients]$ uniformly without replacement
\For{$i \in \set^{(t)}$}
\State $\deltai \gets \texttt{LocalSGD}(i,\bw^{(t)},\tau,\lrc)$
\EndFor
\State // At Server:
\State {\small$\bvt = \frac{1}{|\set^{(t)}|}\sum_{i \in \set^{(t)}}\brac{\deltai - \by_{c_i}^{(t)}} + \frac{1}{\numclients}\sum_{j=1}^\numclients \by_{c_j}^{(t)}$}
\State $\bw^{(t+1)} = \bw^{(t)} - \lrs \bvt$
\State //State update
\For {$k \in [\numclusters]$}
\State $\by_k^{(t+1)} = 
    \begin{cases}
        \dfrac{\sum_{ i \in \set^{(t)} \cap \mathcal{C}_k} \deltai}{|\set^{(t)} \cap \mathcal{C}_k|} & \text{ if } |\set^{(t)} \cap \mathcal{C}_k| \neq 0\\
        \by_k^{(t)} & \text{ otherwise}
    \end{cases}$
\EndFor
\EndFor

\Procedure{\texttt{LocalSGD}}{$i,\bw^{(t)},\tau, \lrc$}
  \State Set $\bw_i^{(t,0)} = \bw^{(t)}$
  \For{$k = 0,1\dots,\tau-1$}
  \State Compute stochastic gradient $\nabla f_i(\bw_i^{(t,k)},\xi_i^{(t,k)})$
  \State $\bw_i^{(t,k+1)} = \bw_i^{(t,k)} - \lrc \nabla f_i(\bw_i^{(t,k)}, \xi_i^{(t,k)})$
  \EndFor
  \State Return $(\bw^{(t)} - \bw_i^{(t,\tau)})/(\lrc \tau)$
\EndProcedure
\end{algorithmic}
\end{algorithm}

\subsection{Intermediate Lemmas} \label{app:clusteralgoname_int_results}
The proof of \texttt{\clusteralgoname} follows closely the proof of \texttt{\algoname}. We borrow Lemma \ref{:algoname_f_decay_one_step} from Section \ref{app:\algoname}, and the next lemma is analogous to Lemma \ref{:algoname_vt_ht_err}.

\begin{lem}
\label{lem:clusteralgoname_vt_ht_err}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumption 2.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{\clusteralgoname} satisfy
\begin{align*}
    \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\bvt-\bar{\bh}^{(t)}} & \leq \frac{\sigma^2}{\selclients \tau} + \frac{4 (\numclients - \selclients)}{\selclients (\numclients - 1)} \lb \frac{1}{\numclients}\sumin\Eg{\xi^{(t)}}{\norm{\bhit - \G f_i(\bwt)}} + \lrs^2 L^2 \norm{\bv^{(t-1)}} \rb \nn \\
    & \quad + \frac{2 (\numclients - \selclients)}{\selclients (\numclients - 1)} \frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(t-1)}) - \by_{c_i}^{(t)}}.
\end{align*}
\end{lem}


\begin{lem}
\label{lem:clusteralgoname_grad_yt_err}
Suppose the function $f$ satisfies Assumption 1, and the stochastic oracles at the clients satisfy Assumptions 2, 3.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{\clusteralgoname} satisfy
\begin{align*}
    & \Eg{\xi^{(t)}, \set^{(t)}}{\frac{1}{\numclients}\sumjn \norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t+1)}}} \leq 4 (1-p) \left[ \sigma_K^2 + \frac{\sigma^2}{\tau} + \frac{1}{\numclients}\sumjn \mbe_t \norm{\nabla f_j(\bw^{(t)}) - \bh_j^{(t)}} \right]\\
    & \qquad \qquad + p \lb \left( 1 + \frac{1}{\beta} \right) \lrs^2 L^2 \norm{\bv^{(t-1)}} + (1+\beta) \frac{1}{\numclients} \sum_{j=1}^{\numclients} \norm{{\nabla f_j(\bw^{(t-1)}) - \by_{c_j}^{(t)}}} \rb. \nonumber
\end{align*}
for any positive scalar $\beta$.
Note that setting $r=1$ (which implies $\sigma_K^2 = 0$) we recover our earlier result in Lemma \ref{:algoname_grad_yt_err} (up to multiplicative constants).
\end{lem}
We also use the bound on $\frac{1}{\numclients} \sumjn \mbe_{\xi^{(t)}} \norm{\G f_j(\bwt) - \bhjt}$ from Lemma \ref{lem:FedAvg_grad_ht_err} in the previous section.

\subsection{Proof of Theorem 3}
\label{app:clusteralgoname_thm_proof}
For the sake of completeness, we first restate the full theorem.

\begin{thm*}
Suppose the function $f$ satisfies Assumption 1, and the individual client functions satisfy Assumptions 2, 4. Further, the client learning rate $\lrc$, and the server learning rate $\lrss$ are chosen such that $\lrc \leq \frac{1}{10 L \tau}$, $\lrss \lrc \leq \min \lcb \frac{\sqrt{\selclients} (1-p)}{8 L \tau}, \frac{\selclients}{16 \tau L}, \frac{1}{4 L \tau} \rcb$.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{\clusteralgoname} satisfy
\begin{align*}
    \min_{t \in [T]} \mbe \norm{\G f(\bwt)} & \leq \underbrace{\mco \lp \frac{f(\bw^{(0)}) - f^*}{\lrss \lrc \tau T} \rp}_{\text{Effect of initialization}} + \underbrace{\mco \lp \frac{\lrss \lrc L \sigma^2}{\selclients} + \lrc^2 L^2 (\tau - 1) \sigma^2 \rp}_{\text{Stochastic Gradient Error}} \\
    & \qquad + \underbrace{\mathcal{O}\left(\frac{\lrss \lrc \tau L \sigma_K^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \right)}_{\text{Error due to clustering}} + \underbrace{\mco \lp \lrc^2 L^2 \tau (\tau - 1) \sigma_g^2 \rp}_{\text{Client Drift Error}},
\end{align*}
where $f^* = \min_{\mathbf{x}} f(\mathbf{x})$.
\end{thm*}

\begin{coro}
% \label{corro_1}
Setting $\lrc = \frac{1}{\sqrt{T}\tau L}$ and $\lrss = \sqrt{\tau \selclients}$, \texttt{\clusteralgoname} converges to a stationary point of the global objective $f(\bw)$ at a rate given by,
\begin{align*}
    &\min_{t \in \{0, \hdots, T-1 \}} \mbe \norm{\nabla  f(\bw^{(t)})} \leq \underbrace{\bigO{\frac{1}{\sqrt{\selclients \tau T}}}}_{\text{stochastic gradient error}} + \underbrace{\bigO{\frac{(\numclients - \selclients)}{(\numclients - 1)} \sqrt{\frac{\tau}{\selclients T}}}}_{\text{partial participation error}}+ \underbrace{\bigO{\frac{1}{T}}}_{\text{client drift error}}    
\end{align*}
\end{coro}

% Therefore we see a expected trade-off when using clustering: our error floor increases but we can potentially keep a larger learning rate implying faster convergence.
% \ps{Need some explanation here.}

\begin{proof}
The proof is analogous to the proof of Theorem 2 in Section \ref{app:\algoname}, with $1- \frac{\selclients}{\numclients}$ replaced by $p = \frac{\binom{\numclients - r}{\selclients}}{\binom{\numclients}{\selclients}}$.
We use the same Lyapunov function defined in \eqref{eq:lyapunov_defn}, with $\frac{\lrs^2 L}{2} \leq \delta \leq \frac{\lrs}{2}$.
Using Lemma \ref{:algoname_f_decay_one_step}, we get
\begin{align}
    R^{(t+1)} &\leq \mathbb{E}\left[f(\bwt) -\frac{\lrs}{2} \norm{\G f(\bwt)} + \frac{\lrs}{2} A^{(t)} + \delta \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\bvt-\bar{\bh}^{(t)}} \right. \nn \\
    & \qquad \left. + \alpha\frac{1}{\numclients}\sum_{j=1}^{\numclients} \mbe_{\set^{(t)}, \xi^{(t)}} \norm{\G f_j(\bwt) - \by_{c_j}^{(t+1)}} \right], \label{eq_proof:thm:clusteralgoname_1}
\end{align}
where $A^{(t)} \triangleq \frac{1}{\numclients} \sumjn \Eg{\xi^{(t)}}{\norm{\G f_j(\bwt) - \bhjt}}$. Substituting the bounds from Lemma \ref{lem:clusteralgoname_vt_ht_err} and Lemma \ref{lem:clusteralgoname_grad_yt_err} in \eqref{eq_proof:thm:clusteralgoname_1} we get,
\begin{align}
    R^{(t+1)} & \leq \E{f(\bwt) -\frac{\lrs}{2}\norm{\G f(\bwt)}+ \left(\frac{\lrs}{2} + \frac{4\delta}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + 4 \alpha (1-p) \right) A^{(t)} + \frac{ \delta \sigma^2}{\selclients \tau} + 4 \alpha (1 - p) \lp  \frac{\sigma^2}{\tau} + \sigma^2_K \rp} \nn \\
    & \quad + \left(\frac{4\delta \lrs^2 L^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \alpha p \left( 1 + \frac{1}{\beta} \right) \lrs^2 L^2 \right) \mbe \norm{\bv^{(t-1)}} \nn \\
    & \quad + \left( \frac{2\delta}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \alpha p (1 + \beta) \right) \frac{1}{\numclients} \sum_{j=1}^{\numclients} \mbe \norm{\G f_j(\bw^{(t-1)}) - \by_{c_j}^{(t)} }. \label{eq_proof:thm:clusteralgoname_2}
\end{align}

\paragraph{Choice of $\alpha, \delta$.}
Our goal is now to find a suitable $\delta$ and $\alpha$ such that,
\begin{align*}
    \left(\frac{4\delta \lrs^2 L^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \alpha p \left(1+\frac{1}{\beta}\right)\lrs^2 L^2\right) & \leq \delta - \frac{\lrs^2 L}{2}, \nn \\
    \left(\frac{2\delta}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)}  + \alpha p (1+\beta)\right) & \leq \alpha
\end{align*}
We define $A = p \left(1+\frac{1}{\beta} \right)$ and $B = p (1+\beta)$. The resulting condition on $\alpha$ is
\begin{align*}
    \alpha \geq \frac{2\delta}{\selclients (1-B)} \frac{(\numclients - \selclients)}{(\numclients - 1)} , \qquad \beta \leq \frac{1}{p} - 1.
\end{align*}
We set $\alpha = \frac{2\delta}{\selclients (1-B)} \frac{(\numclients - \selclients)}{(\numclients - 1)} $, and our condition on $\delta$ then reduces to,
\begin{align*}
    \delta \geq \frac{\lrs^2 L/2}{1 - \frac{4 \lrs^2 L^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} - \frac{2 A \lrs^2 L^2}{\selclients (1-B)} \frac{(\numclients - \selclients)}{(\numclients - 1)} }
\end{align*}
We want $\lrs$ such that,
\begin{align*}
    \frac{4 \lrs^2 L^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} + \frac{2 A \lrs^2 L^2}{\selclients (1-B)} \frac{(\numclients - \selclients)}{(\numclients - 1)} \leq \frac{1}{2}
\end{align*}
A sufficient condition for this is 
\begin{align}
    \frac{4 \lrs^2 L^2}{\selclients} \leq \frac{1}{4}, \qquad \frac{2 A \lrs^2 L^2}{\selclients (1-B)} \leq \frac{1}{4}. \label{eq:cond_lrs_4}
\end{align}
% \ps{This choice of $\beta$ won't work for full client participation. Need another condition for that simpler case.}
For $\beta = \frac{1}{2p} - \frac{1}{2}$, $B = \frac{p}{2} + \frac{1}{2}$ and $A \leq \frac{2}{1-p}$. Hence, a sufficient condition for \eqref{eq:cond_lrs_4} to be satisfied is
\begin{align*}
    \lrs \leq \frac{\sqrt{\selclients}(1-p)}{8 L} \quad \Rightarrow \quad \lrss \lrc \leq \frac{\sqrt{\selclients}(1-p)}{8 L \tau}
\end{align*}
With \eqref{eq:cond_lrs_4} we have $\delta \geq \lrs^2 L$. We set $\delta = 2\lrs^2 L$ which gives us $\alpha = \frac{8 \lrs^2 L}{\selclients (1-p)} \frac{(\numclients - \selclients)}{(\numclients - 1)} $. Since $\delta \leq \frac{\lrs}{2}$, we also need $\lrs \leq \frac{1}{4L}$.

With this choice of $\alpha, \delta$, from \eqref{eq_proof:thm:clusteralgoname_2} we get
\begin{align}
    R^{(t+1)} &\leq R^{(t)} - \frac{\lrs}{2} \mbe \norm{\G f(\bwt)} + \left(\frac{\lrs}{2} + \frac{4\delta}{\selclients} + 4 \alpha (1-p) \right) A^{(t)} + \frac{ \delta \sigma^2}{\selclients \tau} + 4 \alpha (1 - p) \lp  \frac{\sigma^2}{\tau} + \sigma^2_K \rp \nn \\
    &\leq R^{(t)} - \frac{\lrs}{2}\norm{\G f(\bwt)} +  3\lrs A^{(t)} + \frac{40 \lrs^2 L}{\selclients} \frac{\sigma^2}{\tau} + \frac{32 \lrs^2 L}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \sigma_K^2,
\end{align}
where we use the condition that $\lrs \leq \frac{\selclients}{16 L}$.
Further, we can bound $A^{(t)} = \frac{1}{\numclients}\sumjn\Eg{\xi^{(t)}}{\norm{\G f_j(\bwt) - \bhjt}}$ using Lemma \ref{lem:FedAvg_grad_ht_err}, which gives us
\begin{align*}
    R^{(t+1)} & \leq R^{(t)} - \frac{\lrs}{2} \lp 1 - 48 \lrc^2 L^2 \tau (\tau-1) \rp \norm{\G f(\bwt)} + 6 \lrs \lrc^2 L^2 (\tau-1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb \\
    & \quad + \frac{40 \lrs^2 L}{\selclients} \frac{\sigma^2}{\tau} + \frac{32 \lrs^2 L}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \sigma_K^2.
\end{align*}
Using the condition on $\lrc$ that $\lrc \leq \frac{1}{10 L \tau}$, and unrolling the recursion we get,
\begin{align}
    R^{(t)} &\leq R^{(1)} + \sum_{s=1}^{t-1}\left(-\frac{\lrs}{4}\norm{\G f(\bw^{(s)})} + 6 \lrs \lrc^2 L^2 (\tau-1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{40 \lrs^2 L}{\selclients} \frac{\sigma^2}{\tau} + \frac{32 \lrs^2 L}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \sigma_K^2 \right). \label{eq_proof:thm:clusteralgoname_3}
\end{align}
Next, we bound $R^{(1)}$.
Using \eqref{eq_proof:thm:clusteralgoname_1} and \eqref{eq_proof:lem:clusteralgoname_grad_yt_err_4} we can bound $R^{(1)}$ as follows,
\begin{align}
    R^{(1)} \leq & f(\bw^{(0)}) - \frac{\lrs}{2} \norm{\G f(\bw^{(0)})} + \left( \frac{\lrs}{2} + 4 \alpha (1-p) \right) A^{(0)} + 4 \alpha (1-p) \lp \frac{\sigma^2}{\tau} + \sigma_K^2 \rp \nn \\
    & \hspace{10pt} + \delta \Eg{\xi^{(0)}, \set^{(0)}}{\norm{\bv^{(0)} - \bar{\bh}^{(0)}}}+ \alpha p \frac{1}{\numclients} \sumin \norm{\G f_i(\bw^{(0)})} \nn \\
    & \leq f(\bw^{(0)}) -\frac{\lrs}{4}\norm{\G f(\bw^{(0)})} + 4 \lrs \lrc^2 L^2 (\tau-1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{32 \lrs^2 L}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \lp \frac{\sigma^2}{\tau} + \sigma_K^2 \rp \nn \\
    & \hspace{10pt} + \delta \Eg{\xi^{(0)},\set^{(0)}}{\norm{\bv^{(0)}-\bar{\bh}^{(0)}}}+ \alpha p \frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(0)})}. \label{eq_proof:thm:clusteralgoname_4}
\end{align}
Substituting the bound on $R^{(1)}$ from \eqref{eq_proof:thm:clusteralgoname_4} in \eqref{eq_proof:thm:clusteralgoname_3}, and using $t = T$ we get
\begin{align*}
    R^{(T)} \leq & f(\bw^{(0)}) + \sum_{t=0}^{T-1}\left(-\frac{\lrs}{4}\norm{\G f(\bwt)} + 6 \lrs \lrc^2 L^2 (\tau-1) \lb \sigma^2 + 4 \tau \sigma_g^2 \rb + \frac{40 \lrs^2 L}{\selclients} \frac{\sigma^2}{\tau} + \frac{32 \lrs^2 L}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \sigma_K^2 \right) \\
    & \hspace{10pt} + 2\lrs^2 L \Eg{\xi^{(0)},\set^{(0)}}{\norm{\bv^{(0)}-\bar{\bh}^{(0)}}}+ \frac{8 p \lrs^2 L}{\selclients (1-p)} \frac{(\numclients - \selclients)}{(\numclients - 1)} \frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(0)})}.
\end{align*}
Rearranging the terms, we get
\begin{align*}
    & \min_{t \in [0,T-1]} \mbe \norm{\G f(\bwt)} \leq \frac{1}{T} \sumtT \mbe \norm{\G f(\bwt)} \nn \\
    & \leq \frac{4(f(\bw^{(0)}) - f^{*})}{\lrs T} + \frac{160 \lrs L}{\selclients} \frac{\sigma^2}{\tau} + \frac{128 \lrs L}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \sigma_K^2 + 24\lrc^2L^2(\tau-1)\sigma^2 + 96\lrc^2L^2\tau(\tau-1)\sigma_g^2\\ 
    & \hspace{10pt} + \frac{1}{T} \lb 8 \lrs L \Eg{\xi^{(0)},\set^{(0)}}{\norm{\bv^{(0)}-\bar{\bh}^{(0)}}}+ \frac{32 p \lrs L}{\selclients (1-p)} \frac{(\numclients - \selclients)}{(\numclients - 1)} \frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(0)})}  \rb \\
    & = \mathcal{O} \left( \frac{f(\bw^{(0)})-f^*}{\lrs T} \right) + \mathcal{O}\left(\frac{\lrs L \sigma^2}{\selclients \tau} \right) + \mathcal{O}\left(\frac{\lrs L \sigma_K^2}{\selclients} \frac{(\numclients - \selclients)}{(\numclients - 1)} \right) + \mathcal{O}( \lrc^2L^2(\tau-1)\sigma^2 + \lrc^2 L^2 \tau(\tau-1)\sigma_g^2),
\end{align*}
which concludes the proof.
\end{proof}

% \newpage
\subsection{Proofs of the Intermediate Lemmas}
\label{app:clusteralgoname_int_results_proofs}

\begin{proof}[Proof of Lemma \ref{lem:clusteralgoname_vt_ht_err}]
The proof is analogous to the proof of Lemma \ref{:algoname_vt_ht_err}. The only difference is that in \eqref{eq_proof::algoname_vt_ht_err_2}, we do not bound the term $\frac{\numclients - \selclients}{\numclients - 1}$ by $1$.
\end{proof}


\begin{proof}[Proof of Lemma \ref{lem:clusteralgoname_grad_yt_err}]
\begin{align}
    \Eg{\xi^{(t)},\set^{(t)}}{\frac{1}{\numclients} \sum_{j=1}^{\numclients} \norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t+1)}}} & = \frac{1}{\numclients}\sum_{j=1}^{\numclients} \Eg{\xi^{(t)},\set^{(t)}}{\norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t+1)}}}.
    % & \leq p\left[2\sigma^2_K + \frac{4\sigma^2}{\tau} + \frac{4}{\numclients}\sumjn\Eg{t}{\norm{\nabla f_j(\bw^{(t)}) - \bh_j^{(t)}}} \right]\\
    % & + \left(1-p\right)\left(1+\frac{1}{\beta}\right)\lrs^2\tau^2L^2\norm{v^{(t-1)}} + \left(1-p\right)(1+\beta)\frac{1}{\numclients}\sum_{j=1}^{\numclients}\norm{{\nabla f_j(\bw^{(t-1)}) - \by_j^{(t)}}} \nonumber
    \label{eq_proof:lem:clusteralgoname_grad_yt_err}
\end{align}
% where $p = 1-\frac{\binom{n-r}{\selclients}}{\binom{\numclients}{\selclients}}$

Let $\Cc_k^{(t)} = \{ i: c_i=k \text{ and } i \in \set^{(t)}\}$, i.e., the set of sampled clients which belong to the $k$-th cluster. For a specific cluster $c_j \in [K]$
\begin{align*}
    & \Eg{\set^{(t)}}{\norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t+1)}}} \nn \\
    & = \Eg{\Cc_{c_j}^{(t)}}{ \mathbb{I} \lp |\Cc_{c_j}^{(t)}|\neq 0 \rp \norm{\nabla f_j(\bw^{(t)}) - \frac{\sum_{l \in \Cc_{c_j}^{(t)}}  \Delta_l^{(t)}}{|\Cc_{c_j}^{(t)}|}}+ \mathbb{I} \lp |\Cc_{c_j}^{(t)}| = 0 \rp\norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t)}}} \tag{Cluster center update in \texttt{\clusteralgoname}} \\
    &= \Eg{\Cc_{c_j}^{(t)}}{ \mathbb{I} \lp |\Cc_{c_j}^{(t)}|\neq 0 \rp \norm{\nabla f_j(\bw^{(t)}) -  \frac{1}{|\Cc_{c_j}^{(t)}|} \sum_{l \in \Cc_{c_j}^{(t)}} \lp \Delta_l^{(t)} - \nabla f_l(\bw^{(t)}) + \nabla f_l(\bw^{(t)}) \rp}}\\
    & \hspace{10pt} + \Eg{\Cc_{c_j}^{(t)}}{\mathbb{I} \lp |\Cc_{c_j}^{(t)}| = 0 \rp\norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t)}}}\\
    & \leq \Eg{\Cc_{c_j}^{(t)}}{ \mathbb{I} \lp |\Cc_{c_j}^{(t)}|\neq 0 \rp \left( \frac{2}{|\Cc_{c_j}^{(t)}|} \sum_{l \in \Cc_{c_j}^{(t)}} \norm{\nabla f_j (\bw^{(t)}) - \nabla f_l (\bw^{(t)})} + \frac{2}{|\Cc_{c_j}^{(t)}|} \sum_{l \in \Cc_{c_j}^{(t)}} \norm{\nabla f_l(\bw^{(t)}) - \Delta_l^{(t)}} \right)} \\
    & \hspace{10pt} + \Eg{\Cc_{c_j}^{(t)}}{\mathbb{I} \lp |\Cc_{c_j}^{(t)}| = 0 \rp\norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t)}}} \tag{Jensen's inequality; Young's inequality} \\
    &\leq\Eg{\Cc_{c_j}^{(t)}}{ \mathbb{I} \lp |\Cc_{c_j}^{(t)}|\neq 0 \rp\left(4\sigma_K^2+\frac{2}{|\Cc_{c_j}^{(t)}|}\sum_{l \in \Cc_{c_j}^{(t)}}\norm{\nabla f_l(\bw^{(t)})-\Delta_l^{(t)}}\right)}\\
    &\hspace{10pt} + \Eg{\Cc_{c_j}^{(t)}}{\mathbb{I} \lp |\Cc_{c_j}^{(t)}| = 0 \rp\norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t)}}} \tag{Intra-cluster heterogeneity bound $\sigma_K^2$}
\end{align*}
Substituting in \eqref{eq_proof:lem:clusteralgoname_grad_yt_err} we get
\begin{align}
    & \sumjn \Eg{\set^{(t)}}{\norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t+1)}}} \nn \\
    & \leq \sum_{k=1}^K  \left(4r\Eg{\Cc_k^{(t)}}{\mathbb{I}(|\Cc_k^{(t)}|\neq 0)}\sigma_K^2 + 2 \Eg{\Cc_k^{(t)}}{\mathbb{I}(|\Cc_k^{(t)}| \neq 0) \frac{r}{|\Cc_k^{(t)}|}\sum_{l \in \Cc_k^{(t)}}\norm{\nabla f_l(\bw^{(t)})-\Delta_l^{(t)}}} \right) \nn \\
    & \hspace{10pt} + \sumjn \Eg{\Cc_{c_j}^{(t)}}{\mathbb{I}(|\Cc_{c_j}^{(t)}| = 0)} \norm{\nabla f_j(\bw^{(t)}) - \by_{c_j}^{(t)}} \tag{$| \Cc_k^{(t)} | \leq r$} \nn \\
    & = \sum_{k=1}^{K} \left( 4 r (1-p) \sigma_K^2 + 2 \Eg{\Cc_k^{(t)}}{\mathbb{I}(|\Cc_k^{(t)}| \neq 0) \frac{r}{|\Cc_k^{(t)}|} \sum_{l \in \Cc_k^{(t)}} \norm{\nabla f_l(\bw^{(t)})-\Delta_l^{(t)}}} \right) \nn \\
    & \hspace{10pt} + p\sumjn\norm{\nabla f_j(\bw^{(t)})-\by_{c_j}^{(t)}}, \label{eq_proof:lem:clusteralgoname_grad_yt_err_2}
\end{align}
where $p = \Eg{\Cc_{c_j}^{(t)}}{\mathbb{I}(|\Cc_{c_j}^{(t)}| = 0)} = \frac{\binom{\numclients - r}{\selclients}}{\binom{\numclients}{\selclients}}$ is the probability that no client from a particular cluster is sampled in $\set^{(t)}$ (same for all $j$ since we assumed equal number of devices in each cluster). Note that,
\begin{align}
    \Eg{\Cc_k^{(t)}}{\mathbb{I}(|\Cc_k^{(t)}| \neq 0) \frac{r}{|\Cc_k^{(t)}|} \sum_{l \in \Cc_k^{(t)}} \norm{\nabla f_l(\bw^{(t)})-\Delta_l^{(t)}}} &= \Eg{\Cc_k^{(t)}}{ \frac{r}{|\Cc_k^{(t)}|} \sum_{l \in \Cc_k} \mathbb{I}(|\Cc_k^{(t)}| \neq 0, l \in \Cc_k^{(t)}) \norm{\nabla f_l(\bw^{(t)})-\Delta_l^{(t)}}}\\
    & = \sum_{l \in \Cc_k} \norm{\nabla f_l(\bw^{(t)}) - \Delta_l^{(t)}}\sum_{z=1}^r \frac{r}{z}  \mathbb{P}(|\Cc_k^{(t)}| = z, l \in \Cc_k^{(t)})   \nn \\
    & = \sum_{l \in \Cc_k} \norm{\nabla f_l(\bw^{(t)}) - \Delta_l^{(t)}}\sum_{z=1}^r \frac{r}{z}\frac{\binom{r-1}{z-1}\binom{\numclients - r}{\selclients-z}}{\binom{\numclients}{\selclients}} \nn \\
    & = \sum_{l \in \Cc_k}\norm{\nabla f_l(\bw^{(t)}) - \Delta_l^{(t)}}\sum_{z=1}^r \frac{\binom{r}{z}\binom{\numclients-r}{\selclients-z}}{\binom{\numclients}{\selclients}} \nn \\
    & = (1-p)\sum_{l \in \Cc_k}\norm{\nabla f_l(\bw^{(t)}) - \Delta_l^{(t)}}. \label{eq_proof:lem:clusteralgoname_grad_yt_err_3}
\end{align}

Substituting the bounds from \eqref{eq_proof:lem:clusteralgoname_grad_yt_err_2}, \eqref{eq_proof:lem:clusteralgoname_grad_yt_err_3} in \eqref{eq_proof:lem:clusteralgoname_grad_yt_err}, we get
\begin{align}
    & \frac{1}{\numclients}\sumjn \Eg{\xi^{(t)}, \set^{(t)}}{\norm{\nabla f_j(\bw^{(t)})-\by_{c_j}^{(t+1)}}} \nn \\
    & \leq 4(1-p)\sigma_K^2 + 2(1-p)\frac{1}{\numclients}\sumjn\Eg{\xi^{(t)}}{\norm{\nabla f_j(\bw^{(t)})-\Delta_j^{(t)}}} + p\frac{1}{\numclients}\sumjn\norm{\nabla f_j(\bw^{(t)})-\by_{c_j}^{(t)}} \label{eq_proof:lem:clusteralgoname_grad_yt_err_4}\\
    &\leq (1-p)\left[4\sigma_K^2 + \frac{4\sigma^2}{\tau} + \frac{4}{\numclients}\sumjn\Eg{t}{\norm{\nabla f_j(\bw^{(t)}) - \bh_j^{(t)}}} \right] \nn \\
    & \quad + p\left(1+\frac{1}{\beta}\right) \lrs^2 L^2 \norm{\bv^{(t-1)}} + p (1 + \beta) \frac{1}{\numclients} \sum_{j=1}^{\numclients} \norm{{\nabla f_j(\bw^{(t-1)}) - \by_{c_j}^{(t)}}}, \nonumber
\end{align}
for any positive constant $\beta$.
\end{proof}

% \newpage

% \section{Experimental Details}
% \label{appendix:exps}

% \subsection{Data Preprocessing} 

% The CIFAR-10 training dataset was augmented with Horizontally Flipped and Randomly Cropped data. Both the training and testing images were normalized according to their mean and standard deviation values as $(x-\mu)/\sigma$, where $\mu$ is the average of the pixel values x and $\sigma$ is the standard deviation. In Shakespeare, each speaker's dialogues were split into an input length of 20 characters. ASCII characters outside our defined vocabulary were discarded. Each training sequence of character length 20 was shifted by one and appended with the target character to form subsequent input sequences.

% \subsection{Models} 
% ResNet-18 model was used with CIFAR-10 to evaluate the performance of the proposed algorithm in a highly non-convex setting. The standard ResNet-18 model's Batch Normalization layers were replaced by group normalization layers containing two groups. 

% Standard LenNet-5 without any alterations was used with CIFAR-10 in the other experiment. 

% The Shakespeare data was used to train a single layer GRU layer, with a hidden dimension of 128. The output of this layer was fed to a fully connected layer, followed by a softmax. 

% \subsection{Training}
% All three experiments were based on classification tasks, with the test accuracy and training loss defining performance. Cross Entropy Loss was used as the criterion with SGD as the optimizer. Training was performed in batch sizes of 64, where each batch was randomly sampled without replacement from the training data. 
% Each client was trained for five local epochs ($\tau$), where an epoch was a whole pass on the local data. At the server, a constant learning rate of 1 was maintained for the aggregation steps in all implemented algorithms. Both CIFAR-10 experiments were carried out over 1500 global communication rounds, while the Shakespeare experiment was run for 1000 global communication rounds. The training loss was calculated for each communication round, averaged over 5 participating clients. A global training loss was also calculated over the entire training dataset. The model was tested on the entire test data, with the accuracy and test loss averaged over all mini-batches.

% \subsection{Hyperparameters}
% As described in Appendix <E.3>, mini-batch SGD was used throughout the experiments with a batch size of 64. 5 Clients were randomly picked every round and trained for 5 local epochs. 

% The client learning rate $\eta$ for all tasks was found using grid search. The grid was designed as follows: $\eta \in \{10^{-1},10^{-1.5},10^{-2},10^{-2.5}, 10^{-3}\}$.  For the best performing learning rate in the grid $\eta_i$, we picked the best learning rate by tuning between $min(0,\eta_{i-1})$ and $max(len(grid),\eta_{i+1})$ in intervals of 0.01 or 0.001, depending on the case. 
% A step learning rate decay was used every 600 rounds in the CIFAR-10 experiments as described in <table>.

% 55 clusters were ClusterFedVAR, to account for the distribution of data across clients, as $^{10}C_2$. 36 clusters were chosen in case of Shakespeare, with each cluster center representing a play.



% \begin{table}[h]
% \centering{
% \begin{tabular}{cccc} 
% \midrule
% & \texttt{FedAvg} & \texttt{\algoname} & \texttt{\clusteralgoname} \\ 
% \midrule
% CIFAR-10/LeNet-5& 1 & 1 & 1\\
% (50\% accuracy)\\
% \midrule
% CIFAR-10/ResNet-18 & 0 & 0 & 0\\
% (60\% accuracy) \\
% \midrule
% Shakespeare/RNN & 0 & 0 & 0\\
% (52\% accuracy)\\
% \end{tabular}
% }
% \label{default}
% \caption{A table}
% \end{table}


% \begin{tabular}{best performing learning rate $\eta$}
% % 
% \end{tabular}

% \textbf{ Additional assumptions:}

% 1. Let there be $K$ clusters and $r$ devices in each cluster, i.e, $rK = \numclients$. Let $c_i$ denote the cluster identity of the $i$-th device. Let $\mathcal{C}_k = \{ i: c_i = k\}$

% 2. \textbf{Cluster Heterogeneity bound:}
% We have, $\max_{i,j \in \mathcal{C}_k} \norm{\nabla f_i(x) - \nabla f_j(x)} \leq \sigma_K^2$ for all $k \in [K]$.


% Note that the only change in the proof from SAGA without clustering is a change in Lemma 3. More specifically, $y_j^{(t)}$ can be updated in round $t$ although client $j$ did not participate in round $t$. We derive the new lemma below






% $\Eg_{\xi^{(t)}}[\deltai] =\Eg_{\xi^{(t)}}[\bhit]$,

% so the cross-term will not necessarily be zero.

% \textbf{Q. How to bound $\norm{\bhit - \bh_i^{(t-1)}}$}

% We have,
% \begin{align*}
%     \norm{\bhit - \bh_i^{(t-1)}} &= \frac{1}{\tau^2}\norm{\sum_{j=0}^{\tau-1}\G f_i(\bw^{(t,j)}) - \G f_i(\bw^{(t-1,j)})}\\
%     & \leq \frac{1}{\tau}\sum_{j=0}^{\tau-1} \norm{\G f_i(\bw^{(t,j)}) - \G f_i(\bw^{(t-1,j)})}\\
%     & \leq \frac{1}{\tau}L^2\sum_{j=0}^{\tau-1} \norm{w^{(t,j)} - w^{(t-1,j)}}\\
% \end{align*}

% We have,
% \begin{align*}
%     \frac{\Eg{t}{f(\bw^{(t+1)})} - f(\bwt)}{\eta \tau} &\leq -\frac{1}{2} \norm{\G f(\bwt)} + \left(1+2\tau\eta L \right) \frac{1}{\numclients}\sum_{i=1}^{\numclients}\Eg{t}{\norm{\bhit - \G f_i(\bwt)}}\\
%     & + \frac{2\sigma^2}{\tau} + \frac{\eta \tau L}{2} \left( 4\eta^2L^2\norm{v^{(t-1)}}+ 2 \frac{1}{\numclients}\sumin \norm{\G f_i(\bw^{(t-1)}) - \by_i^{(t-1)}}\right)
% \end{align*}
% \clearpage


\end{document}
