% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{subcaption}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{bbm}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{mathtools}
% Upright "argmax"/"argmin" operators; the starred declaration lets
% subscripts be set as limits underneath the operator in display math.
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}


% Theorem-like environments. `theorem` and `proposition` are declared before
% the first \theoremstyle switch, so they use the style active by default
% (amsthm's default is `plain`); each gets its own counter.
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
% Definitions use the `definition` style and are numbered within sections.
\theoremstyle{definition}
\newtheorem{definition}{Definition}[section]

% \norm{x} and \abs{x}: double / single vertical bars around x, sized
% automatically through \left ... \right.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\abs}[1]{\left\lvert#1\right\rvert}
% End-of-proof mark: a filled black square.
\renewcommand{\qedsymbol}{$\blacksquare$}
% \ts is a short alias for \textsuperscript (takes the same argument).
\newcommand{\ts}{\textsuperscript}
\usepackage{subcaption,times}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>, i.e. the two mandatory
% arguments in reverse order joined by <sep> (default "-").
% NOTE(review): marked as an example macro; no use is visible in this part
% of the file -- confirm before removing.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{import}
%\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage{csquotes}
% \ceil{x}: x enclosed in \lceil ... \rceil, declared as a mathtools paired
% delimiter.
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
% \printfnsymbol{<n>}: prints the <n>-th footnote symbol as a superscript
% without creating a footnote. The author block uses \printfnsymbol{1} to
% repeat the mark attached to an earlier \thanks note.
% \makeatletter / \makeatother are needed because \@fnsymbol has "@" in its name.
\makeatletter
\newcommand{\printfnsymbol}[1]{%
  \textsuperscript{\@fnsymbol{#1}}%
}
\makeatother

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}
\usepackage{framed}
\colorlet{shadecolor}{pink}
\usepackage{authblk}
\usepackage{adjustbox}
\usepackage{bbm}

\usepackage{graphicx}
\usepackage{soul}
\usepackage{subcaption}
\usepackage{booktabs} % for professional tables
\usepackage{tablefootnote}

\usepackage{amsmath,amsthm,amssymb,amsfonts}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{enumerate}
\usepackage{cleveref}
\usepackage{comment}
\usepackage{bm}
\usepackage{pifont}
% Check mark and cross glyphs, taken from pifont's \ding table (slots 51
% and 55). Used as yes/no cell entries in the comparison table.
\newcommand*{\cmark}{\ding{51}}
\newcommand*{\xmark}{\ding{55}}



% Statement environments typeset in the `plain` theorem style.
\theoremstyle{plain}
% Corollaries are numbered within their parent theorem.
\newtheorem{corollary}{Corollary}[theorem]
% NOTE(review): the empty optional arguments "[]" on the declarations below
% look like no-ops (no shared counter, no parent counter), so lemma,
% assumption and remark each get an independent counter -- confirm, then
% consider dropping the empty brackets.
\newtheorem{lemma}[]{Lemma}
\newtheorem{assumption}{Assumption}[]

% Remarks use the `definition` style.
\theoremstyle{definition}
\newtheorem{remark}{Remark}[]

% \ra{<factor>} sets \arraystretch to <factor>, i.e. the row-height
% multiplier for tables that follow in the current scope.
\newcommand{\ra}[1]{\renewcommand{\arraystretch}{#1}}
% End-of-proof mark: a filled black square. The same redefinition already
% appears earlier in the preamble, so this line is redundant but harmless.
\renewcommand{\qedsymbol}{$\blacksquare$}

% Single-letter shorthands for bold vectors/matrices and for common sets
% and operators.
% NOTE(review): \def overwrites existing commands without any warning.
% \a, \P, \r, \u and \O are standard LaTeX text commands (tabbing accent
% helper, pilcrow, ring accent, breve accent, slashed O). After these lines,
% text in the body or in the generated bibliography that relies on them
% (e.g. accented author names written with \r{..} or \u{..}) will no longer
% typeset as intended -- verify against the .bbl file.
\def\x{{\mathbf x}}
\def\z{{\mathbf z}}
\def\a{{\mathbf a}}
\def\A{{\mathbf A}}
\def\P{{\mathbf P}}
\def\r{{\mathbf r}}
\def\s{{\mathbf s}}
\def\u{{\mathbf u}}
\def\y{{\mathbf y}}
\def\I{{\mathbf I}}
% Note: \e expands to a bold nu, not to a bold "e".
\def\e{{\boldsymbol{\nu}}}
% \Rb and \R (defined below) both expand to blackboard-bold R.
\def\Rb{{\mathbb{R}}}
\def\C{{\mathbf C}}

% Calligraphic / blackboard-bold symbols: \G (calligraphic G), \E
% (blackboard-bold E), \O (calligraphic O), \R (blackboard-bold R).
\def\G{{\mathcal G}}
\def\E{{\mathbb E}}
\def\O{{\mathcal O}}
\def\R{{\mathbb{R}}}


% \highlight{<math>}: typesets <math> in display style inside a solid yellow
% \colorbox. The argument is placed in math mode by the macro itself, so
% pass bare math content (no surrounding $...$).
\newcommand{\highlight}[1]{\colorbox{yellow!100}{$\displaystyle #1$}}


\title{Faster Non-Convex Federated Learning via Global and Local Momentum (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<rdas@utexas.edu>?Subject=Your UAI 2022 paper}{Rudrajit Das}{}}
\author[1]{Anish Acharya \thanks{Equal Contribution}}
\author[2]{Abolfazl Hashemi \printfnsymbol{1}}
\author[1]{Sujay Sanghavi}
\author[1]{Inderjit S. Dhillon}
\author[1]{Ufuk Topcu}
% Add affiliations after the authors
\affil[1]{%
    University of Texas at Austin\\
    %Austin, Texas, USA
    USA
}
\affil[2]{%
    Purdue University\\
    West Lafayette, Indiana, USA%\\
}
  
\begin{document}
\maketitle

\begin{abstract}
We propose \texttt{FedGLOMO}, a novel federated learning (FL) algorithm with an iteration complexity of $\mathcal{O}(\epsilon^{-1.5})$ to converge to an $\epsilon$-stationary point (i.e., $\mathbb{E}[\|\nabla f(x)\|^2] \leq \epsilon$) for smooth non-convex functions -- under arbitrary client heterogeneity and compressed communication -- compared to the $\mathcal{O}(\epsilon^{-2})$ complexity of most prior works. Our key algorithmic idea that enables achieving this improved complexity is based on the observation that the convergence in FL is hampered by two sources of high variance: (i) the global server aggregation step with multiple local updates, exacerbated by client heterogeneity, and (ii) the noise of the local client-level stochastic gradients. The first issue is particularly detrimental to FL algorithms that perform plain averaging at the server. By modeling the server aggregation step as a generalized gradient-type update, we propose a variance-reducing momentum-based global update at the server, which when applied in conjunction with variance-reduced local updates at the clients, enables \texttt{FedGLOMO} to enjoy an improved convergence rate. Our experiments illustrate the intrinsic variance reduction effect of \texttt{FedGLOMO}, which implicitly suppresses client-drift in heterogeneous data distribution settings and promotes communication efficiency.
\end{abstract}

\section{Introduction}
\label{sec:intro}
Federated learning (FL) is a new edge-computing approach that advocates training statistical models directly on remote devices by leveraging enhanced local resources on each device (\cite{mcmahan2017communication}). In a standard FL setting, there are $n$ clients, each having its own training data, and a central server that is trying to train a model, parameterized by $\bm{w} \in \mathbb{R}^d$, using the clients' data.
Suppose the data distribution of the $i^{\text{th}}$ client is $\mathcal{D}_i$.
Then the $i^{\text{th}}$ client has an objective function $f_i(\bm{w})$ which is the expected loss, with respect to some loss function $\ell$, over data drawn from $\mathcal{D}_i$, and the goal of the central server is to optimize the average 
\footnote{In general this may be a weighted average, but here we only consider uniform weights, i.e., each weight is ${1}/{n}$.} 
loss $f(\bm{w})$, over the $n$ clients, i.e.,
\begin{equation}
    \label{eq:fl-intro-1}
    f(\bm{w}) := \frac{1}{n} \sum_{i=1}^{n} {f_i}(\bm{w}) \text{ \& } f_i(\bm{w}) = 
    \mathbb{E}_{\bm{x} \sim \mathcal{D}_i}[\ell(\bm{x},\bm{w})].
\end{equation}
The setting where the data distributions of all the clients are identical, i.e. $\mathcal{D}_1 = \ldots = \mathcal{D}_n$, is typically known as the \enquote{homogeneous} setting. Otherwise, the settings where the data distributions are \textit{not} identical are referred to as the  \enquote{heterogeneous} settings.

The core algorithmic idea of FL -- in the form of \texttt{FedAvg} -- was introduced in \cite{mcmahan2017communication}. In \texttt{FedAvg} (summarized in \Cref{alg:fed-avg}), a \textit{subset} of the clients perform \textit{multiple} steps of gradient descent based updates on their local data and then communicate back their respective updates to the server, which then averages them to update the global model (hence the name \texttt{FedAvg}).
This idea of performing multiple local updates before averaging once reduces the communication cost required for training. Another essential strategy in FL to cut down the communication cost is to have the clients send compressed/quantized messages to the server in every round -- this is of particular significance for training deep learning models where the number of model parameters is in millions or more.

In practice however, performing multiple local updates on clients with \textit{heterogeneous} data distributions leads to the so-called phenomenon of \enquote{{client drift}}, wherein the individual client updates do not align well (due to over-fitting on the local client data) inhibiting the convergence of \texttt{FedAvg} to the optimum of the average loss over all the clients. In this paper, we identify the high variance associated with the simple averaging step of \texttt{FedAvg} for the global update to be at the heart of this issue.

Ever since the development of FL, significant attention has been devoted to analyzing \texttt{FedAvg} under different settings, modifying \texttt{FedAvg} using ideas from centralized optimization to accelerate the training or to reduce the communication cost; we discuss these works in \Cref{rel-work}. Compared to centralized optimization, a formidable challenge in the theoretical analysis of FL algorithms is the use of multiple local updates in the clients which is compounded by the \textit{heterogeneous} nature of data distribution among the clients. To limit the extent of client heterogeneity, a standard assumption in FL theory is the \textit{bounded client dissimilarity (BCD) assumption}, i.e.,
\begin{equation}
    \label{eq:bcd}
    \frac{1}{n}\sum_{i=1}^{n}\|\nabla f_i(\bm{w}) - \nabla f(\bm{w})\|^2 \leq G^2 \text{ } \forall \text{ } \bm{w},
\end{equation}
for some large enough constant $G < \infty$ (e.g., see A1 in \cite{karimireddy2020mime}). But this assumption is limiting as it does not allow for \textit{arbitrarily large client heterogeneity}.

{Recently, \cite{arjevani2019lower} showed that the {stochastic first-order complexity} of any
algorithm in the \textit{centralized setting} to reach an $\epsilon$-stationary point (i.e., $\mathbb{E}[\|\nabla f(\bm{x})\|^2] \leq \epsilon$) for \textit{smooth non-convex functions} is $\Omega(\epsilon^{-1.5})$. It is well known that vanilla SGD has a suboptimal complexity of $\mathcal{O}(\epsilon^{-2})$ as it cannot mitigate the high variance of the stochastic gradient noise. Recognizing this issue, \textit{variance-reducing} techniques for SGD (\cite{fang2018spider,zhou2018stochastic,cutkosky2019momentum,liu2020optimal}) have been proposed that attain the optimal complexity of $\mathcal{O}(\epsilon^{-1.5})$. 
Coming to the federated setting, as we discuss in this paper, in addition to the noise in the \textit{local} client-level stochastic gradients, one has to also contend with the high variance associated with the \textit{global} server aggregation step which depends on the client heterogeneity and the number of local update steps. In this case, as we argue in the subsequent sections, applying only local client-level variance-reduction is not enough for improving the iteration complexity of vanilla \texttt{FedAvg} beyond $\mathcal{O}(\epsilon^{-2})$ for smooth, non-convex losses.

To alleviate the issue of variance due to heterogeneity, we propose a novel FL algorithm with \textit{compressed communication} called \texttt{FedGLOMO} (\Cref{alg:2} and \ref{alg:2-local}) which applies \texttt{G}\textit{lobal} as well as \texttt{LO}\textit{cal} \textit{variance-reducing} \texttt{MO}\textit{mentum} to the server update and client updates, respectively.
We prove that the iteration complexity of \texttt{FedGLOMO} is $\mathcal{O}(\epsilon^{-1.5})$ in the smooth non-convex case, which is better than the $\mathcal{O}(\epsilon^{-2})$ complexity of related works in the FL setting; see \Cref{tb:comp} and \Cref{nov4-thm1}.
Further, our theory does not use the BCD assumption, i.e. \cref{eq:bcd}, which is a standard assumption in related works. Instead, we propose and use \Cref{as-het}, which is a more realistic and \textit{empirically verified} assumption on the client drift, even allowing for arbitrary client heterogeneity. 
It is worth mentioning here that for FL, \cite{karimireddy2020mime} also propose an algorithm (\texttt{MimeMVR}) which is shown to attain this improved complexity of $\mathcal{O}(\epsilon^{-1.5})$ but \textit{with} the BCD assumption and \textit{no} compressed communication; we %discuss more 
talk about this at the end of \Cref{rel-work}.}

We summarize our \textbf{contributions} next:
    
\textbf{(a)} We propose \texttt{FedGLOMO} (Alg. \ref{alg:2} and \ref{alg:2-local}), in which we apply a \textit{novel global momentum term at the server} in addition to  \textit{local momentum at the clients}. The design of \texttt{FedGLOMO} is motivated by two critical issues that need to be alleviated to accelerate convergence in FL; these are the high variances associated with: (i) the \textit{global} server aggregation step due to heterogeneity of clients when there are multiple local updates, and (ii) the noise of \textit{local} client-level stochastic gradients. Global and local momentum result in \textit{variance reduction} for the global server update and the local client updates, allowing us to tackle (i) and (ii), respectively. This enables \texttt{FedGLOMO} to converge to an $\epsilon$-stationary point (i.e., $\mathbb{E}[\|\nabla f(\bm{x})\|^2] \leq \epsilon$) for smooth non-convex functions in $\mathcal{O}(\epsilon^{-1.5})$ gradient-based updates, which is better than the $\mathcal{O}(\epsilon^{-2})$ complexity of most related works in the FL setting; see \Cref{tb:comp} and \Cref{nov4-thm1}.
    
\textbf{(b)} Unlike prior work, our theory does not use the limiting {bounded client dissimilarity assumption} (i.e., \cref{eq:bcd}). Instead, to tighten our result, we propose and use \Cref{as-het} -- which is a novel assumption on the client drift, even allowing for \textit{arbitrary client heterogeneity} in the worst case. We empirically verify that \Cref{as-het} holds for \texttt{FedGLOMO} as well as \texttt{FedAvg}. Theoretically, we also show that \Cref{as-het} holds for \textit{any} FL algorithm in the case of linear regression and also with networks whose training dynamics follow that of a linearized model (a.k.a. the \enquote{NTK} regime). Refer to the discussion after \Cref{as-het} and \Cref{rem-sep21-2} for details. 
    
\textbf{(c)} \texttt{FedGLOMO} is the \textit{first FL algorithm} achieving $\mathcal{O}(\epsilon^{-1.5})$ complexity while allowing \textit{compressed client-to-server communication}. We emphasize that from the theory perspective, applying compression in \texttt{FedGLOMO} is not trivial and the most obvious approach does not work; see \Cref{rem-sep21-3}. 
    
\textbf{(d)} In \Cref{sec:exp}, experiments on CIFAR-10 and Fashion-MNIST (\cite{xiao2017fashion}) show that in a highly heterogeneous setting of at most two (out of ten) classes per client, \texttt{FedGLOMO} requires only about \textit{one-third} the number of bits used by \texttt{FedAvg} with PyTorch's default momentum applied to the local client updates; see \Cref{fig:1}. Our experiments also illustrate the variance reduction provided by our scheme which implicitly mitigates client-drift under heterogeneous data distribution and in turn promotes communication-efficiency.
    
\section{Related Work}
\label{rel-work}
\textbf{\texttt{FedAvg} and related methods:}
\cite{reisizadeh2020fedpaq} propose \texttt{FedPAQ} which is basically \texttt{FedAvg} (\cite{mcmahan2017communication}) with quantized client-to-server communication, and establish its convergence for the homogeneous case. \cite{li2019convergence} establish the convergence of \texttt{FedAvg} for strongly convex functions with heterogeneity (assuming bounded client dissimilarity) but without any compressed communication. \cite{haddadpour2020federated} propose \texttt{FedCOMGATE} which incorporates gradient tracking (\cite{pu2020distributed}) and derive results with data heterogeneity and quantized communication. 
\cite{karimireddy2019scaffold} propose \texttt{SCAFFOLD} which uses control-variates to mitigate the client-drift owing to the heterogeneity of clients.
\cite{li2018federated} present \texttt{FedProx} which adds a proximal term to control the deviation of the client parameters from the global server parameter in the previous round. \cite{reddi2020adaptive} propose federated versions of commonly used adaptive optimization methods and prove their convergence under heterogeneity. 
Local SGD (\cite{zinkevich2010parallelized,stich2018local,yu2018parallel,wang2018cooperative,basu2019qsparse,stich2019error,patel2019communication,woodworth2020local,bayoumi2020tighter,liang2019variance,koloskova2020unified}) is very similar to FL and is essentially based on the same principle as \texttt{FedAvg}. However, in local SGD, there is usually no data heterogeneity and all the clients participate in each round (known as \enquote{full device participation}), both of which do not hold in FL and simplify the derivation of convergence results.
\\
\cite{wang2019slowmo, huo2020faster} present momentum-based updates at the server without any improvement in the %order-wise 
convergence rate as compared to momentum-free updates. \cite{qu2020federated} present Nesterov accelerated \texttt{FedAvg} for convex objectives. \cite{karimireddy2020mime} propose \texttt{Mime}(\texttt{MVR}) which applies momentum at the client-level based on globally computed statistics to control client-drift. \cite{khanduri2021stem} propose \texttt{STEM} which applies momentum globally and locally for local SGD; however, their server aggregation step is just plain averaging as they do not have to deal with server-side variance reduction, since all the clients participate in local SGD.

\begin{table*}[t]
\caption{Number of gradient updates, i.e., $T$, required to achieve $\mathbb{E}[\|\nabla f(\bm{w})\|^2] \leq \epsilon$ on smooth non-convex functions. Here, $n$ is the total number of clients and $r$ is the number of clients participating in each round. \enquote{Client Participation} asks whether all ($r=n$) or only a subset ($r<n$) of the clients participate in each round.
\enquote{BCD?} asks if the bounded client dissimilarity assumption (\cref{eq:bcd}) is used or not. \enquote{Compression?} asks whether compressed communication is involved or not.
\\
$*1$: $\alpha \leq n$ is a problem-dependent quantity; in practice, {we expect $\alpha \ll n$} as confirmed in our experiments.
}
\label{tb:comp}
\ra{1}
\begin{adjustbox}{width=\textwidth}
\begin{tabular*}{\linewidth}{@{}ccccc@{}}\toprule
Ref. & $T$ & Client Participation & BCD? & Compression? \\ \midrule
\cite{koloskova2020unified,wang2019slowmo} &$\mathcal{O}(\frac{1}{n\epsilon^2})$&Full ($r=n$)&Yes&{\color{red}\xmark}
\\\midrule
\cite{haddadpour2020federated} &$\mathcal{O}(\frac{1}{n\epsilon^2})$&Full %Device 
($r=n$)&Yes&{\color{black}\cmark}
\\\midrule
\cite{khanduri2021stem} &$\mathcal{O}(\frac{1}{n\epsilon^{1.5}})$&Full ($r=n$)&Yes&{\color{red}\xmark}
\\\midrule
\cite{karimireddy2019scaffold}&$\O(\frac{1}{r \epsilon^2})$&Partial ($r<n$)&Yes&{\color{red}\xmark}
\\\midrule
\cite{karimireddy2020mime}&$\mathcal{O}\big(\frac{1}{\sqrt{r}\epsilon^{1.5}}\big)$&Partial ($r<n$)&Yes&{\color{red}\xmark}
\\\midrule
\textbf{This work} (\texttt{FedGLOMO})
&$\mathcal{O}\big(\max\big(\sqrt{\frac{\alpha}{n}}, {\frac{1}{\sqrt{r}}}\big)\frac{1}{\epsilon^{1.5}}\big)^{*1}$&Partial ($r<n$)&\textbf{No}&{\color{black}\cmark}
\\
\bottomrule
\end{tabular*}
\end{adjustbox}
\end{table*}

\textbf{Distributed optimization with compression:} References \cite{alistarh2017qsgd,suresh2017distributed,reisizadeh2020fedpaq,haddadpour2020federated,tang2018communication,wu2018error,bernstein2018signsgd,alistarh2018convergence,lin2017deep,stich2018sparsified,basu2019qsparse,hashemi2020delicoco,chen2020communication,chen2021communication} aim to minimize the communication bottleneck in distributed optimization by transmitting compressed messages to the central server, and establish the convergence of the resulting methods. \cite{horvath2019stochastic,gorbunov2021marina} provide distributed algorithms with improved convergence rates by also applying variance reduction and periodically using full gradients; however, there are no multiple local updates in these works. In \Cref{sec:marina}, we compare our work's complexity against that of \cite{gorbunov2021marina}. In this work, we employ the quantization operator of \cite{alistarh2017qsgd}.

\textbf{Complexity for smooth non-convex stochastic optimization:} 
\cite{arjevani2019lower} show that the optimal stochastic first-order complexity to reach an $\epsilon$-stationary point (i.e., $\mathbb{E}[\|\nabla f(\bm{x})\|^2] \leq \epsilon$) is $\mathcal{O}(\frac{\sigma}{\epsilon^{1.5}})$ where $\sigma^2$ is the variance of the stochastic gradients. 
Unfortunately, vanilla SGD is suboptimal and \textit{variance-reducing} techniques must be applied to attain the optimal complexity; some noteworthy works on variance-reduction for SGD are \texttt{SVRG} (\cite{johnson2013accelerating}), \texttt{SAGA} (\cite{defazio2014saga}) and \texttt{SARAH} (\cite{nguyen2017sarah}). SVRG-style algorithms such as \texttt{SPIDER} (\cite{fang2018spider}) and \texttt{SNVRG} (\cite{zhou2018stochastic}) attain this optimal complexity by periodically using giant batch sizes. \cite{cutkosky2019momentum} propose \texttt{STORM} which also attains this optimal complexity with adaptive learning rates, but without using any large batches. The key idea of \texttt{STORM} is momentum-based variance reduction, obtained by using the stochastic gradient at the previous point \textit{computed over the same batch} on which the stochastic gradient at the current point is computed. \cite{liu2020optimal} present a much simpler proof for essentially the same algorithm by employing a constant learning rate and requiring a large batch size only at the first iteration. Our key idea of global and local momentum is \texttt{STORM}-like \textit{variance-reducing} momentum applied to the aggregation step at the server, interpreted as a generalized gradient-type update, and the local client updates, respectively; see \Cref{sec:main}.

\Cref{tb:comp} compares the complexities of the most relevant related works in FL ($r<n$) and local SGD ($r=n$) with ours on smooth non-convex functions. Note that under the more challenging FL setting with partial-device participation, only \texttt{FedGLOMO} and \texttt{MimeMVR} (\cite{karimireddy2020mime}) attain the improved iteration complexity of $\mathcal{O}(\epsilon^{-1.5})$ with respect to $\epsilon$. 
However, unlike \cite{karimireddy2020mime}, our work does not rely on the bounded client dissimilarity assumption (\cref{eq:bcd}) and allows for compressed client-to-server communication, in which case  maintaining the improved complexity is not trivial; for details, see Remarks \ref{rem-sep21-2} and \ref{rem-sep21-3}, respectively. There are meaningful algorithmic differences between our work and \cite{karimireddy2020mime} too. The biggest one is that while we explicitly apply momentum in the server aggregation step (global momentum) as well as in the client updates (local momentum), \cite{karimireddy2020mime} only apply \textit{globally computed} momentum in the local client updates. 
For a detailed discussion of the differences of our work from \cite{karimireddy2020mime}, see \Cref{sec:disc}.
Since \texttt{Mime} is designed to deal with client drift, we empirically compare it against \texttt{FedGLOMO} without compression in a highly heterogeneous setting in \Cref{sec:exp}.

\section{Preliminaries}
\label{sec:prelim}
Recall the setting and the optimization problem that the server is trying to solve as defined in \cref{eq:fl-intro-1}. We assume that the clients have access to unbiased stochastic gradients of their individual losses. We denote the stochastic gradient of $f_i$ at $\bm{w}$ computed over a batch of samples $\mathcal{B}$, by $\widetilde{\nabla} f_i(\bm{w};\mathcal{B})$. Also in this paper, $K$ is the number of communication rounds, $E$ is the number of local updates per round or the period, and $T = KE$ is the total number of local updates or the (order-wise) number of gradient-based updates. Further, $r$ is the number of clients that the server accesses in each round, i.e., the global batch size.

Vectors and matrices are written in boldface. For any positive integer $m$, the set $\{1,\ldots,m\}$ is denoted by $[m]$, and the uniform distribution over the set  $\{0,\ldots,m\}$ is denoted by $\text{Unif}[0,m]$.
$\mathbbm{1}(\cdot)$ is the indicator function. Next, we recap smooth functions.
\begin{definition}[\textbf{Smoothness}]
A function $g\colon\Theta \to \mathbb{R}$ is said to be $L$-smooth if for all $\bm{\theta}, \bm{\theta}' \in \Theta$, $\|\nabla g(\bm{\theta}) - \nabla g(\bm{\theta}')\| \leq L\|\bm{\theta} - \bm{\theta}'\|$. For all $\bm{\theta}, \bm{\theta}' \in \Theta$, we also have: $g(\bm{\theta}') \leq g(\bm{\theta}) + \langle \nabla g(\bm{\theta}), \bm{\theta}' - \bm{\theta} \rangle + \frac{L}{2}\|\bm{\theta}' - \bm{\theta}\|^2$.
\end{definition}
\section{\texttt{FedGLOMO}: \texttt{G}lobal and \texttt{LO}cal \texttt{MO}mentum-Based Variance Reduction}
\label{sec:main}
\begin{algorithm}[t]
	\caption{\texttt{FedGLOMO} - Server Update}
	\label{alg:2}
	\begin{algorithmic}[1]
		\STATE {\bfseries Input:} Initial point $\bm{w}_0$, \# of rounds of communication $K$, period $E$, learning rates  $\{\eta_{k}\}_{k=0}^{K-1}$ and global batch size $r$. $Q_D$ is the quantization operator. Set $\bm{w}_{-1} = \bm{w}_0$.
		\FOR{$k =0,\dots, K-1$}
		\STATE 
		Server sends $\bm{w}_k$, $\bm{w}_{k-1}$ to a set $\mathcal{S}_k$ of $r$ clients chosen uniformly at random w/o replacement.
		%\vspace{0.1 cm}
		\FOR{client $i \in \mathcal{S}_k$}
		\STATE Set $\bm{w}_{k,0}^{(i)} = \bm{w}_k$ and $\widehat{\bm{w}}_{k-1,0}^{(i)} = \bm{w}_{k-1}$. Run \Cref{alg:2-local} for client $i$.
		\ENDFOR
		%\vspace{0.1 cm}
		%\paragraph{}
		\IF{$k = 0$}
		\label{step-0}
		\STATE Set
		$\bm{u}_{k} = \frac{1}{r}\sum_{i \in \mathcal{S}_k}Q_D({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}})$.
		\label{glob-mom-0}
		\ELSE
		%\vspace{0.1 cm}
		\STATE Set
		$\bm{u}_{k} = \frac{\beta_k}{r} \sum_{i \in \mathcal{S}_k}Q_{D}(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}}) + 
		(1-\beta_k)\bm{u}_{k-1} + \frac{(1-\beta_k)}{r} \sum_{i \in \mathcal{S}_k} Q_{D}((\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}}))$.
		{\color{blue} // \texttt{(Global Momentum)}}\label{glob-mom}
		%\vspace{0.1 cm}
		%\paragraph{}
		\ENDIF
		%\vspace{0.1 cm}
		\STATE Update $\bm{w}_{k+1} = \bm{w}_{k} - \bm{u}_k$.
		%\vspace{0.1 cm}
		\ENDFOR
	\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t]
	\caption{\texttt{FedGLOMO} - Client Update}
	\label{alg:2-local}
	\begin{algorithmic}[1]
		\FOR{$\tau = 0,\ldots,E-1$}
		%\vspace{0.1 cm}
		\IF{$\tau = 0$}
		%\vspace{0.1 cm}
		\STATE Set $\bm{v}_{k,\tau}^{(i)} = {\nabla} f_i(\bm{w}_{k,\tau}^{(i)})$,  $\widehat{\bm{v}}_{k-1,\tau}^{(i)} = {\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau}^{(i)})$.
		\label{l1}
		\ELSE
		%\vspace{0.1 cm}
		\STATE Pick a random batch of samples 
		in client $i$, say $\mathcal{B}_{k,\tau}^{(i)}$. Compute the stochastic gradients 
		of $f_i$ at $\bm{w}_{k,\tau}^{(i)}$, $\widehat{\bm{w}}_{k-1,\tau}^{(i)}$, $\bm{w}_{k,\tau-1}^{(i)}$ and $\widehat{\bm{w}}_{k-1,\tau-1}^{(i)}$ over $\mathcal{B}_{k,\tau}^{(i)}$ viz.
		$\widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$, $\widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$, $\widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$ and $\widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$. 
		%\vspace{0.2 cm}
		\paragraph{}
		\STATE 
		Update: $\bm{v}_{k,\tau}^{(i)} = \widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + \big(\bm{v}_{k,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})\big)$ and
		\\
		$\widehat{\bm{v}}_{k-1,\tau}^{(i)} = \widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + \big(\widehat{\bm{v}}_{k-1,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})\big)$. 
		{\color{blue} // \texttt{(Local Mom.)}} \label{l2}
		%\vspace{0.1 cm}
		\ENDIF
		%\vspace{0.1 cm}
		\STATE Update $\bm{w}_{k,\tau+1}^{(i)} = \bm{w}_{k,\tau}^{(i)} - \eta_{k}\bm{v}_{k,\tau}^{(i)}$ and  $\widehat{\bm{w}}_{k-1,\tau+1}^{(i)} = \widehat{\bm{w}}_{k-1,\tau}^{(i)} - \eta_{k}\widehat{\bm{v}}_{k-1,\tau}^{(i)}$.
		\label{l3}
		%\vspace{0.1 cm}
		\ENDFOR
		%\vspace{0.1 cm}
		\paragraph{}
		\STATE Send $Q_{D}(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}})$ and $Q_{D}((\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}}))$ 
		to the server.
		\label{comp}
	\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t]
	\caption{\texttt{FedAvg} \cite{mcmahan2017communication}
	}
	\label{alg:fed-avg}
	\begin{algorithmic}[1]
		\STATE {\bfseries Input:} 
		Initial point $\bm{w}_0$, \# of communication rounds $K$, period $E$, learning rates  $\{\eta_{k}\}_{k=0}^{K-1}$ and global batch size $r$.
		%\vspace{0.1 cm}
		\FOR{$k =0,\dots, K-1$}
		%\vspace{0.1 cm}
		\STATE Server sends $\bm{w}_k$ to a set $\mathcal{S}_k$ of $r$ clients chosen uniformly at random w/o replacement.
		%\vspace{0.1 cm}
		\FOR{client $i \in \mathcal{S}_k$}
		%\vspace{0.1 cm}
		\STATE Set $\bm{w}_{k,0}^{(i)} = \bm{w}_k$.
		%\vspace{0.1 cm}
		\FOR{$\tau = 0,\ldots,E-1$}
		%\vspace{0.1 cm}
		\STATE Pick a random batch of samples 
		in client $i$, $\mathcal{B}_{k,\tau}^{(i)}$.
		Compute the stochastic gradient of $f_i$ at $\bm{w}_{k,\tau}^{(i)}$ over $\mathcal{B}_{k,\tau}^{(i)}$, viz. $\widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$.
		%\vspace{0.1 cm}
		\STATE Update $\bm{w}_{k,\tau+1}^{(i)} = \bm{w}_{k,\tau}^{(i)} - \eta_{k} \widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$.
		%\vspace{0.1 cm}
		\ENDFOR
		%\vspace{0.1 cm}
		\STATE Send $(\bm{w}_k - \bm{w}_{k,E}^{(i)})$ to the server.
		\label{line:fedavg-1}
		%\vspace{0.1 cm}
		\ENDFOR
		%\vspace{0.1 cm}
		\STATE Update $\bm{w}_{k+1} = \bm{w}_k -  \frac{1}{r}\sum_{i \in \mathcal{S}_k}(\bm{w}_k - \bm{w}_{k,E}^{(i)})$.
		\label{line:fedavg-2}
		%\vspace{0.1 cm}
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
There are two issues that need to be alleviated for improving the convergence rate in FL: (i) the high variance of simple averaging used in the \textit{global} server aggregation step (of \texttt{FedAvg}), when there are multiple local updates, which is exacerbated by heterogeneity of the clients, and (ii) the high variance associated with the noise of \textit{local} client-level stochastic gradients. The key idea of \texttt{FedGLOMO} (\Cref{alg:2} and \ref{alg:2-local}) is to apply \textit{variance-reducing} \textbf{global} and \textbf{local} momentum to combat (i) and (ii), respectively. We now describe {global} and {local} momentum in detail.

\textbf{Global} momentum is applied to the server aggregation step, which is line \ref{glob-mom} in \Cref{alg:2}. To understand it better, let us revisit \texttt{FedAvg} (summarized in \Cref{alg:fed-avg}, although in a slightly different way than usual) and its server aggregation step (line \ref{line:fedavg-2}), which is just simple averaging. Similar to the update of SGD suffering from high variance, this naive averaging step -- which we think of as the average of a batch of generalized stochastic gradients -- is characterized by high variance stemming from heterogeneity and multiple local updates. So, this way of server aggregation slows down the convergence rate of \texttt{FedAvg} (and other related methods).

In this paper, we re-envision the server aggregation as a generalized gradient-based update by thinking of $(\bm{w}_k - \bm{w}_{k,E}^{(i)})$ as the generalized  gradient. Then, we wish to incorporate the style of variance-reducing momentum applied in \texttt{STORM} (\cite{cutkosky2019momentum,liu2020optimal}) to our generalized gradient-based update; note that their method is for stochastic gradients in the case of centralized optimization. 
To that end, let us briefly recap \texttt{STORM}'s update rule. For a function $h(\bm{z})$, \texttt{STORM}'s update for the $j^{\text{th}}$ iteration is:
\begin{multline}
    \label{apr20-1}
    \bm{z}_{j+1} = \bm{z}_j - \eta_j \bm{v}_j, \text{ where } 
    \bm{v}_j = 
    \{\widetilde{\nabla} h(\bm{z}_j;\xi_j) + 
    \\
    (1-\beta_j) (\bm{v}_{j-1} - \widetilde{\nabla} h(\bm{z}_{j-1};\xi_j))\mathbbm{1}(j>0)\}.
\end{multline}
In \cref{apr20-1}, $\xi_j$ denotes the source of randomness in the $j^{\text{th}}$ iteration and $\beta_j \in [0,1)$ is the momentum parameter. Note the use of the stochastic gradient at $\bm{z}_{j-1}$ computed on $\xi_j$.
Coming back to \Cref{alg:2}, the quantity $\bm{u}_k$ plays the role of $\bm{v}_j$ in \cref{apr20-1}. To see this clearly, let us analyze $\mathbb{E}_{Q_D}[\bm{u}_k]$ (see lines \ref{glob-mom-0} and \ref{glob-mom} in \Cref{alg:2}).
Under \Cref{as5}, the compression operator $Q_D$ produces an unbiased estimate of the input. Then defining ${g}(\bm{w}_k;\mathcal{S}_k) \triangleq \frac{1}{r}\sum_{i \in \mathcal{S}_k}({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}})$ and $\widehat{g}(\bm{w}_{k-1};\mathcal{S}_k) \triangleq \frac{1}{r}\sum_{i \in \mathcal{S}_k}({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})$, we have:
\begin{multline}
    \label{apr20-3}
    \mathbb{E}_{Q_D}[\bm{u}_k] = \{{g}(\bm{w}_k;\mathcal{S}_k) + 
    \\
    (1-\beta_k)\big(\bm{u}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)\big) \mathbbm{1}(k > 0)\}.
\end{multline}
In \cref{apr20-3}, ${g}(\bm{w}_k;\mathcal{S}_k)$ and $\widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)$ play the roles of $\widetilde{\nabla} h(\bm{z}_j;\xi_j)$ and $\widetilde{\nabla} h(\bm{z}_{j-1};\xi_j)$, respectively. 
With this, one can clearly see that \cref{apr20-3} is the analogue of \cref{apr20-1} for the global server aggregation in FL. However, this equivalence is not so apparent without looking at the expected value of $\bm{u}_k$ with respect to ${Q}_D$; in fact, the choice of quantities that are compressed in {line \ref{comp} of Alg. \ref{alg:2-local}} and used in line \ref{glob-mom} of Alg. \ref{alg:2} is crucial for establishing provable guarantees (also see \Cref{rem-sep21-3}).

Now that we understand global momentum, let us move on to \textbf{local} momentum. For this see lines \ref{l1}, \ref{l2} and \ref{l3} in \Cref{alg:2-local}; these give us $(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}})$ and $({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})$ after running for $E$ steps. But notice that these lines are the same as \cref{apr20-1} with $\beta_j = 0$ and the stochastic gradient at the first iteration replaced by the full gradient. It is worth mentioning here that these local updates are also similar to \texttt{SPIDER} which is an SVRG-style update proposed in \cite{fang2018spider}. However, recognizing that this is also a special case of the \texttt{STORM} update with $\beta_j = 0$, we prefer calling it momentum in order to have a unifying terminology for both the global and local updates. 

One might wonder what the role of global momentum is, as \texttt{SPIDER} can be extended to improve the complexity in distributed optimization \textit{without multiple local updates}. For this, {in \Cref{sec:lomo}}, we consider \texttt{FedLOMO} (Algorithm \ref{alg:1} and \ref{alg:1-local}) which is a simpler version of \texttt{FedGLOMO} with only {lo}cal {mo}mentum and \textit{no} global momentum (i.e., plain averaging at the server which is equivalent to setting $\beta_k = 1$ in \Cref{alg:2}),
and show that it does not achieve $\mathcal{O}(\epsilon^{-1.5})$ complexity under partial-device participation and compression ({see \Cref{fl-thm3}}). The root cause of this is client heterogeneity which amplifies its effect under \textit{multiple local updates}; without incorporating some form of variance reduction in the server aggregation step, the complexity cannot be improved.

Let us try to provide some intuition as to how incorporating global momentum helps. Suppose we keep $\eta_k = \eta$ and $\beta_k = \beta < 1$ for all $k$. Theoretically, we get a lower bound for $\beta$ which is $\mathcal{O}(\eta^2)$. Then with this momentum-based aggregation strategy, the variance reduces by a factor of $\mathcal{O}(\beta/\eta) = \mathcal{O}(\eta)$ as compared to aggregation by plain averaging.
(There are some other terms too but these are sufficiently small.)
This reduction in the variance by a factor of $\mathcal{O}(\eta)$ is what improves the convergence rate of \texttt{FedGLOMO}.

It is true that \texttt{FedGLOMO} has to communicate twice the amount of information per round as compared to \texttt{FedAvg} or \texttt{FedPAQ} (\cite{reisizadeh2020fedpaq}) which is just \texttt{FedAvg} with compressed communication. One can set the precision of the quantizer sufficiently low to account for the extra per-round communication cost of \texttt{FedGLOMO} -- we adopt this approach in our experiments. Also, we only assume access to the full client gradient in line \ref{l1} of Alg. \ref{alg:2-local} for simplicity of analysis, but our main result (i.e., \Cref{nov4-thm1}) can be readily extended to the case of large enough batch sizes.
\section{Main Result for \texttt{FedGLOMO}}
\label{sec:result:glomo}
First, we state our assumptions.
\begin{assumption}[\textbf{Smoothness}]
\label{as1} 
$\ell(\bm{x},\bm{w})$ is $L$-smooth with respect to $\bm{w}$, for all $\bm{x}$. Thus, each $f_i(\bm{w})$ ($i \in [n]$) is $L$-smooth, and so is $f(\bm{w})$.
\end{assumption}

\begin{assumption}[\textbf{Non-negativity}]
\label{as-may15}
Each $f_i(\bm{w})$ is non-negative and therefore, $f_i^{*} \triangleq \min f_i(\bm{w}) \geq 0$.
\end{assumption}
Most loss functions used in practice satisfy this anyway and, if not, we can just add a constant offset to achieve non-negativity.

\begin{assumption}[\textbf{Quantization}]\label{as5}
The quantization operator $Q_D$ in Alg. \ref{alg:2} and \ref{alg:2-local} is unbiased, i.e., $\mathbb{E}[Q_D(\bm{x}) | \bm{x}] = \bm{x}$, and its variance satisfies $\mathbb{E}[\|Q_D(\bm{x})-\bm{x}\|^2 | \bm{x}] \leq q\|\bm{x}\|^2$ for some $q > 0$. The \enquote{qsgd} operator proposed in Section 3.1 of \cite{alistarh2017qsgd} satisfies \Cref{as5}.
\end{assumption}

\begin{assumption}[\textbf{Client Drift/Heterogeneity}]\label{as-het}
Let $\mathcal{A}$ be an FL algorithm with $E$ local update steps and $K$ communication rounds. Let $\bm{w}_{k,\tau}^{(i)}$ be the $i^{\text{th}}$ client's local parameter at the start of the $(\tau+1)^{\text{st}}$ local step of the $(k+1)^{\text{st}}$ round of $\mathcal{A}$, for $i \in [n]$ (similar to the notation in Alg. \ref{alg:2}, \ref{alg:2-local}, and \ref{alg:fed-avg}).
Define $\widetilde{\bm{e}}_{k,\tau}^{(i)} \triangleq \nabla f_i(\bm{w}_{k,\tau}^{(i)}) - \nabla f_i\big(\frac{1}{n}\sum_{j \in [n]} \bm{w}_{k,\tau}^{(j)}\big)$.
Then for some $\alpha \ll n$, the following holds:
\begin{equation}
    \label{het-eq}
    \mathbb{E}\Big[\Big\|\sum_{i \in [n]}\widetilde{\bm{e}}_{k, \tau}^{(i)}\Big\|^2\Big] \leq \alpha \sum_{i \in [n]} \mathbb{E}\Big[\Big\|\widetilde{\bm{e}}_{k,\tau}^{(i)}\Big\|^2\Big],
\end{equation}
$\forall$ $\tau \in \{0,\ldots,E-1\}$ and $k \in \{0,\ldots,K-1\}$. The expectation above is w.r.t. any stochasticity in the local updates.
\end{assumption}
\Cref{het-eq} in the above assumption always holds with $\alpha = n$ for any FL algorithm; this follows from the fact that for any $m > 1$ vectors $\{\bm{a}_j\}_{j=1}^m$, $\|\sum_{j=1}^m \bm{a}_j\|^2 \leq m \sum_{j=1}^m \|\bm{a}_j\|^2$ (this can be obtained by using the Cauchy-Schwarz inequality). However, we empirically observe $\alpha \ll n$ in practice for \texttt{FedGLOMO} as well as \texttt{FedAvg}; see \Cref{sec:het-asm-expt} and \Cref{sec:fed_avg_conv}, respectively. 
The value of $\alpha$ in \Cref{as-het} is a measure of the amount of client drift induced by the algorithm which also depends on the degree of heterogeneity in the system -- as the heterogeneity increases (decreases), we observe $\alpha$ to also increase (decrease). 

From \Cref{fig:het0} (in \Cref{sec:het-asm-expt}), we see that for the highly heterogeneous setting that we consider for our experiments in \Cref{sec:exp}, $\alpha < 0.06 n$ for most of the trajectory of \texttt{FedGLOMO} on both CIFAR-10 and Fashion-MNIST (abbreviated as FMNIST).
In the homogeneous case, $\alpha < 0.03 n$ and $\alpha < 0.02 n$ for most of the trajectory on CIFAR-10 and FMNIST, respectively. We observe a similar trend of $\alpha$ for \texttt{FedAvg} in \Cref{sec:fed_avg_conv}. Additionally, we derive a convergence result for \texttt{FedAvg} under \Cref{as-het} and without the bounded client dissimilarity assumption (i.e., \cref{eq:bcd}) in \Cref{sec:fed_avg_conv}. 

{\textbf{Some theoretical motivation for \Cref{as-het}:} Let us consider \textit{linear regression} to provide a scenario where $\alpha = 0$ provably for \textbf{any} FL algorithm. Suppose in client $i$, we have feature and label pairs $(\bm{x}, y) \sim (\mathcal{X}_i, \mathcal{Y}_i)$, where the label
\[y = \langle \bm{w}_i^{\ast}, \bm{x} \rangle + \xi,\]
with $\xi \sim \mathcal{N}_i$ being {independent zero-mean} client-dependent random noise. Obviously, the label distribution $\mathcal{Y}_i$ here depends on the feature distribution $\mathcal{X}_i$, noise distribution $\mathcal{N}_i$ and $\bm{w}_i^{\ast}$. We assume that the covariance matrix of the feature vectors is the same across all the clients, i.e., $\mathbb{E}_{\bm{x} \sim \mathcal{X}_i}[\bm{x} {\bm{x}}^T] = \bm{Q}$ for all $i \in [n]$; this is possible, e.g., by normalization or whitening of the features. Note that by assuming the same covariance matrix across all the clients, we are \textit{not} assuming that the feature distributions are the same across clients, but even if they are, there is heterogeneity through the different label distributions. Then, with the squared loss, our per-client objective function is: 
\[f_i(\bm{w}) = \mathbb{E}_{(\bm{x}, y) \sim (\mathcal{X}_i, \mathcal{Y}_i)}\Big[\frac{1}{2}(y - \langle \bm{w}, \bm{x} \rangle)^2\Big].\]
With the aforementioned conditions, it can be verified that $\nabla f_i(\bm{w}) = \bm{Q}(\bm{w} - \bm{w}_i^{\ast})$. Thus,
\[\widetilde{\bm{e}}_{k, \tau}^{(i)} = \bm{Q} \Big(\bm{w}_{k, \tau}^{(i)} - \frac{1}{n}\sum_{j \in [n]}\bm{w}_{k, \tau}^{(j)}\Big),\]
and so $\sum_{i \in [n]} \widetilde{\bm{e}}_{k, \tau}^{(i)} = \vec{0}$. So, \textit{\Cref{as-het} holds here with $\alpha = 0$ for any FL algorithm}. 

In fact, the above analysis and result (i.e., $\alpha = 0$) can be extended to networks whose training dynamics follow that of a linearized model, which has been shown to be the case for infinite-width networks (see, e.g., \cite{lee2019wide} and \cite{jacot2018neural}) and has also been used in applications for finite-width networks (e.g., in \cite{mu2020gradients}).}

We now present the abridged version of the convergence result of \texttt{FedGLOMO}, followed by some important remarks. Its full version and detailed proof are in \Cref{sec:full-glomo} and \Cref{sec-pf-2}, respectively.

\begin{theorem} [\textbf{Smooth non-convex}]
\label{nov4-thm1}
Let Assumptions \ref{as1}, \ref{as-may15} and \ref{as5} hold. Further, suppose \Cref{as-het} is true for \texttt{FedGLOMO}. 
In \texttt{FedGLOMO}, for each round $k$, set $\eta_{k} = \eta =  \mathcal{O}(\frac{1}{L E K^{1/3} C^{1/3}})$, where $C = \mathcal{O}\big(\max\big(\frac{\alpha}{n}, \frac{E^2 (1+q)^2}{r}\big)\big)$, and $\beta_k = \mathcal{O}({(1+q) \eta^2 L^2 E^4})$. Suppose we use full-device participation (i.e., the global batch size is $n$) \textbf{only at} $k = 0$. Then, \texttt{FedGLOMO} can achieve $\mathbb{E}_{k^{*} \sim \textup{Unif}[0,K-1]}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \epsilon$ in 
$K = \mathcal{O}\big( \max\big(\sqrt{\frac{\alpha}{n}}, \frac{1+q}{\sqrt{r}}\big){\epsilon^{-1.5}}\big)$ rounds of communication and $E = \mathcal{O}(1)$ local steps.
\end{theorem}
\begin{remark}[\textbf{Better iteration complexity}]
\label{rem-sep21-1}
{As per \Cref{nov4-thm1}, for converging to an $\epsilon$-stationary point, \texttt{FedGLOMO} needs $T = KE$ to be $\mathcal{O}\big(\max\big(\sqrt{\frac{\alpha}{n}}, \frac{1}{\sqrt{r}}\big){\epsilon}^{-1.5}\big)$. {This iteration complexity is the same as that of \texttt{MimeMVR} (\cite{karimireddy2020mime}) \textit{but without using the bounded client dissimilarity assumption}, i.e. \cref{eq:bcd}, (also see the next remark for more details on this) and better than other related works in the federated setting; see \Cref{tb:comp}.}
We underscore the significance of global momentum here by comparing this complexity of \texttt{FedGLOMO} to that of \texttt{FedLOMO} (recall this is a simpler version of \texttt{FedGLOMO} with only local momentum and \textit{no} global momentum, described in \Cref{sec:lomo}) under partial-device participation and compression which is $\mathcal{O}\big(\frac{1}{r} \epsilon^{-2}\big)$; see \Cref{fl-thm3}.}
\end{remark}

\begin{remark}[\textbf{No requirement of bounded client dissimilarity (BCD) assumption}]
\label{rem-sep21-2}
{Divergent from related works, \Cref{nov4-thm1} \textit{does not use} the commonly used BCD assumption, i.e., \cref{eq:bcd}.
This is achieved by utilizing the smoothness and non-negativity of the $f_i$'s, specifically $\frac{1}{n}\sum_{i \in [n]}\|\nabla f_i(\bm{w})\|^2 \leq \frac{1}{n}\sum_{i \in [n]} 2L(f_i(\bm{w}) - f_i^{*}) \leq 2L f(\bm{w})$; see the proof outline of \Cref{nov4-thm1} in \Cref{sec:full-glomo}. 
Instead of the BCD assumption, we use our empirically verified \Cref{as-het} to provide a tighter (when $\alpha \ll n$) and data-dependent convergence result.
Note that \Cref{as-het} will always hold for some $\alpha \leq n$, regardless of the degree of client heterogeneity. Thus, \Cref{nov4-thm1} allows for \textit{arbitrary client heterogeneity}.}
\end{remark}

\begin{remark}[\textbf{Compressed communication}]
\label{rem-sep21-3}
To our knowledge, \texttt{FedGLOMO} is the \textit{first algorithm} that attains the aforementioned improved iteration complexity for FL on smooth non-convex functions \textit{with compressed communication}. We emphasize that the choice of quantities compressed in line \ref{comp} of \Cref{alg:2-local}
is important. This particular choice enables deriving the improved complexity by first deriving a result analogous to smoothness, i.e.,
$\|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\| \leq \widehat{L}\|\bm{w}_{k} - \bm{w}_{k-1}\|$ ({see \Cref{nov-1-lem3} in \Cref{sec-pf-2}}). The straightforward choice of sending $Q_{D}(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}})$ and $Q_{D}({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})$ prohibits us from deriving the improved rate, unless 
we also assume $Q_D(.)$ to be a Lipschitz operator.
\\
In \Cref{sec:red_bits}, for $r \ll n$, we show that using the quantization scheme of \cite{alistarh2017qsgd} with $s = \sqrt{d}$, \texttt{FedGLOMO} achieves more than a five-fold saving in the \textit{total} communication cost as compared to when there is full-precision communication in \texttt{FedGLOMO}.
\end{remark}

\begin{remark}[\textbf{A limitation}]
\label{rem-sep29-5}
Even though our iteration complexity of $T = \mathcal{O}(\epsilon^{-1.5})$ is better than that of \texttt{FedCOMGATE} proposed by \cite{haddadpour2020federated} (which is $\mathcal{O}(\epsilon^{-2})$), our communication complexity of $K = \mathcal{O}(\epsilon^{-1.5})$ is higher than theirs, which is $K = \mathcal{O}(\epsilon^{-1})$ (albeit under an extra assumption on the quantizer, namely Assumption 5 in their paper). Ideally, we would like to have $E = \mathcal{O}(\epsilon^{-p})$ and $K = \mathcal{O}(\epsilon^{-(1.5-p)})$ for some $p > 0$, in order to reduce \texttt{FedGLOMO}'s communication complexity. Exploring whether such a result is obtainable with our proposed style of momentum is an interesting future direction.
\end{remark}
\section{Experiments}
\label{sec:exp}
To show the efficacy of \textit{global} momentum in \texttt{FedGLOMO}, we compare it against \texttt{FedLOMO} (recall this has only local momentum and no global momentum; see \Cref{sec:lomo}) and \texttt{FedAvg} (\cite{mcmahan2017communication}) with the standard momentum available in PyTorch applied to (i) only its local updates, and (ii) both local and global updates -- all with compressed client-to-server communication. We denote (i) and (ii) by \texttt{FedAvg}-lm and \texttt{FedAvg}-glm (\enquote{lm} and \enquote{glm} stand for local momentum, and global + local momentum), respectively. \texttt{FedAvg} \textit{with compression} is referred to as \texttt{FedPAQ} (\cite{reisizadeh2020fedpaq}). Similarly, we call \texttt{FedAvg}-lm and \texttt{FedAvg}-glm \textit{with compression}, as \texttt{FedPAQ}-lm and \texttt{FedPAQ}-glm. We also compare against \texttt{FedCOMGATE} (\cite{haddadpour2020federated}) which uses gradient tracking to \textit{theoretically} derive a better communication-complexity than us (see \Cref{rem-sep29-5}).
For compression, the \enquote{qsgd} operator proposed in \cite{alistarh2017qsgd} is used.

We consider the task of classification on CIFAR-10 and Fashion-MNIST (\cite{xiao2017fashion}) abbreviated as FMNIST henceforth. The model used is a two-layer neural network with ReLU activation in the hidden layers. The size of both the hidden layers is 300/600 for FMNIST/CIFAR-10. We train the models using the categorical cross-entropy loss with $\ell_2$-regularization. 
The weight decay value in PyTorch (to apply $\ell_2$-regularization) is set to 1e-4. 
We consider both homogeneous 
and heterogeneous data distribution among the clients. Similar to \cite{mcmahan2017communication}, for the heterogeneous case, we distribute the data among the clients such that each client can have data from either one or (at most) two classes -- note that this is a high degree of heterogeneity. {The exact procedure is described in \Cref{sec:extra-exp}.} The number of clients ($n$) in all the experiments is set to 50, with each client having the same number of samples. The global batch-size $r$ is 25, and the number of local updates per round (i.e., $E$) is 10. 
All full gradients are replaced by stochastic gradients computed on a (per-client) batch size of 256. The learning rates, momentum parameters of the algorithms, and some other experimental details are in \Cref{sec:extra-exp}.

In Fig. \ref{fig:1}, we compare \texttt{FedPAQ}-lm, \texttt{FedPAQ}-glm, \texttt{FedLOMO} and \texttt{FedCOMGATE} with 4 (resp., 8) bits per-round against \texttt{FedGLOMO} with 2 (resp., 4) bits per-round on FMNIST (resp., CIFAR-10) in the heterogeneous and homogeneous cases. We set the number of per-round bits used by \texttt{FedGLOMO} to be half the number used by all other algorithms, so that each one has the same \textit{per-round} communication budget. All plots depict results over 3 independent runs; the shaded regions represent $\pm 1$ standard deviation whereas the solid lines are the respective means. Please see the discussion in the figure caption. These results illustrate the \textit{power of global momentum}.

Next, in the \textit{no-compression heterogeneous} case, we compare against \texttt{Mime} (specifically, \enquote{\texttt{MimeSGDm}}) of \cite{karimireddy2020mime} which also attains a complexity of $\mathcal{O}(\epsilon^{-1.5})$ but without compressed communication, and is tailored to handle client heterogeneity. Having shown the suboptimality of \texttt{FedLOMO} and \texttt{FedPAQ}-lm in Fig. \ref{fig:1}, we only compare \texttt{FedAvg}-glm, \texttt{FedGLOMO} without compression and \texttt{MimeSGDm} in the heterogeneous case in Fig. \ref{fig:2}. The plots in Fig. \ref{fig:2} show that the implicit client-drift controlling ability of our proposed global momentum is on par with the explicit client-drift controlling mechanism of \texttt{Mime}. The test error values averaged over the last five rounds for the plots in Figs. \ref{fig:1} and \ref{fig:2} are in Tables \ref{tab1} and \ref{tab2}, respectively.

We also provide some more empirical results on CIFAR-100 in \Cref{cifar100}.

\begin{table}[!htb]
\begin{center}
\begin{tabular}{|l|c|c|}
%\toprule
\hline
\textbf{Algo.} & \textbf{CIFAR-10} \textbf{Het.} & \textbf{FMNIST} \textbf{Het.}
\\
\hline
\texttt{FedPAQ}-lm & 50.26 $\pm$ 0.85 & 16.17 $\pm$ 0.53 
\\
\hline
\texttt{FedPAQ}-glm & 49.88 $\pm$ 1.15 & 15.87 $\pm$ 1.10
\\
\hline
\texttt{FedLOMO} & 53.74 $\pm$ 0.17 & 18.95 $\pm$ 0.19
\\
\hline
\texttt{FedGLOMO} & \textbf{46.42 $\pm$ 0.05} & \textbf{13.55 $\pm$ 0.32}
\\
\hline
\texttt{FedCOMGATE} & \textbf{46.26 $\pm$ 0.25} & 15.32 $\pm$ 0.09
\\
\hline
\hline
\textbf{Algo.} & \textbf{CIFAR-10} \textbf{Hom.} & \textbf{FMNIST} \textbf{Hom.} 
\\
\hline
\texttt{FedPAQ}-lm & \textbf{45.13 $\pm$ 0.07} & 13.08 $\pm$ 0.05
\\
\hline
\texttt{FedPAQ}-glm & 45.70 $\pm$ 0.10 & 11.76 $\pm$ 0.06
\\
\hline
\texttt{FedLOMO} & 45.96 $\pm$ 0.01 & 14.22 $\pm$ 0.01
\\
\hline
\texttt{FedGLOMO} & \textbf{44.97 $\pm$ 0.05} & \textbf{10.98 $\pm$ 0.05}
\\
\hline
\texttt{FedCOMGATE} & 45.46 $\pm$ 0.03 & 12.24 $\pm$ 0.01
\\
\hline
\end{tabular}
\end{center}
\caption{Average \textbf{test error} \% ($\pm$ standard deviation) over the last five rounds for the plots in the \textit{heterogeneous} (\textit{top}) and \textit{homogeneous} (\textit{bottom}) cases in \Cref{fig:1}.}
\label{tab1}
\end{table}

\begin{table}[!htb]
\begin{center}
%\begin{small}
%\begin{sc}
\begin{tabular}{|l|c|c|}
%\toprule
\hline
\textbf{Algo.} & \textbf{CIFAR-10 Het.} & \textbf{FMNIST Het.} \\
\hline
\texttt{FedAvg}-glm & 50.26 $\pm$ 0.74 & 16.17 $\pm$ 0.53
\\
\hline
\texttt{MimeSGDm} & 46.10 $\pm$ 0.13 & \textbf{13.34 $\pm$ 0.25}
\\
\hline
\texttt{FedGLOMO} & \textbf{45.41 $\pm$ 0.15} & \textbf{13.48 $\pm$ 0.26}
\\
\hline
\end{tabular}
%\end{sc}
%\end{small}
\end{center}
\caption{Average \textbf{test error} \% ($\pm$ standard deviation) over the last five rounds for the plots in \Cref{fig:2}.}
\label{tab2}
\end{table}


\begin{figure*}[t]
\centering 
\subfloat[Het. FMNIST train loss]{
    \label{fig:1_a}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_het_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[Het. FMNIST test err]{
    \label{fig:1_b}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_het_test_2.pdf}
	} 
%\\
%\hspace{-0.5cm}
\subfloat[Het. CIFAR10 train loss]{
    \label{fig:1_c}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_het_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[Het. CIFAR10 test err]{
    \label{fig:1_d}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_het_test_2.pdf}
	} 
\\
\subfloat[Hom. FMNIST train loss]{
    \label{fig:11_a}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_hom_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[Hom. FMNIST test err]{
    \label{fig:11_b}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_hom_test_2.pdf}
	} 
%\\
%\hspace{-0.5cm}
\subfloat[Hom. CIFAR10 train loss]{
    \label{fig:11_c}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_hom_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[Hom. CIFAR10 test err]{
    \label{fig:11_d}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_hom_test_2.pdf}
	} 
\caption{Comparison of \texttt{FedPAQ}-lm, \texttt{FedPAQ}-glm, \texttt{FedLOMO}, \texttt{FedGLOMO} and \texttt{FedCOMGATE} (\cite{haddadpour2020federated}) with the same per-round communication budget on FMNIST and CIFAR-10 in the heterogeneous (top four figs.) and homogeneous (bottom four figs.) settings, respectively. The x-axis is the total number of communicated bits divided by the dimension $d$ and the global batch-size $r$. \texttt{FedGLOMO} is the \textbf{fastest} and most \textbf{communication-efficient} algorithm in almost all the cases; e.g., in the heterogeneous case for both datasets, \texttt{FedGLOMO} attains the final test error of \texttt{FedPAQ}-glm (resp., \texttt{FedPAQ}-lm) with less than a \textbf{half} (resp., only about a \textbf{third}) of the number of bits used by \texttt{FedPAQ}-glm (resp., \texttt{FedPAQ}-lm). Further, \texttt{FedGLOMO} and \texttt{FedLOMO} have a smoother trajectory than other algorithms in the heterogeneous case due to variance-reducing momentum. Observe that \texttt{FedLOMO} and \texttt{FedPAQ}-lm (with only local momentum) are slower than \texttt{FedGLOMO} and \texttt{FedPAQ}-glm (with both local and global momentum), showing the ineffectiveness of only local momentum and \textbf{the power of combining both local and global momentum}. Also, note that \texttt{FedGLOMO} performs much better than \texttt{FedCOMGATE} in the homogeneous case.
}
\label{fig:1}
\end{figure*}

\begin{figure*}[t]
\centering 
\subfloat[FMNIST train loss]{
    \label{fig:2_a}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_mime_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[FMNIST test err]{
    \label{fig:2_b}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_mime_test_2.pdf}
	} 
%\\
%\hspace{-0.5cm}
\subfloat[CIFAR-10 train loss]{
    \label{fig:2_c}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_mime_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[CIFAR-10 test err]{
    \label{fig:2_d}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_mime_test_2.pdf}
	} 
%\hspace{0.2 in}  
\caption{Comparison of \texttt{FedAvg}-glm, \texttt{FedGLOMO} (without compression) and \texttt{MimeSGDm} on FMNIST and CIFAR-10 in the \textbf{heterogeneous} case. On both datasets, \texttt{FedAvg}-glm is the slowest while \texttt{FedGLOMO} is somewhat faster than \texttt{MimeSGDm}. While \texttt{Mime} has an explicit client-drift control mechanism, we do not have that in \texttt{FedGLOMO}, but still \textbf{our proposed global momentum implicitly mitigates client-drift} as well as \texttt{Mime}.}
\label{fig:2}
\end{figure*}

\paragraph{Verifying Assumption \ref{as-het} for \texttt{FedGLOMO}:}
\label{sec:het-asm-expt}
For each round $k$, we compute $\alpha = \max_{\tau \in [E]} 
\frac{\|\sum_{i \in [n]}\widetilde{\bm{e}}_{k,\tau}^{(i)}\|^2}{\sum_{i \in [n]} \|\widetilde{\bm{e}}_{k,\tau}^{(i)}\|^2}$, where $\widetilde{\bm{e}}_{k,\tau}^{(i)}$ is as defined in \Cref{as-het}, for 4 and 2 bit \texttt{FedGLOMO} on CIFAR-10 and FMNIST, respectively. 
Note that we remove the expectation (w.r.t. the stochastic gradients) while computing $\alpha$ for empirical verification. In Fig. \ref{fig:het0}, we plot $(\alpha/n)$ over different rounds for the heterogeneous as well as homogeneous case on both datasets; see the discussion in the figure caption.
\begin{figure*}[!htb]
\centering 
\subfloat[CIFAR-10]{
    \label{fig:het_a}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/CIFAR10_2_alpha.pdf}
	} 
\subfloat[FMNIST]{
    \label{fig:het_b}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/FMNIST_2_alpha.pdf}
	} 
\caption{Variation of $(\frac{\alpha}{n})$ over different rounds of $4$ and $2$ bit \texttt{FedGLOMO} for CIFAR-10 (Fig. \ref{fig:het_a}) and FMNIST (Fig. \ref{fig:het_b}) in the heterogeneous and homogeneous cases. In both cases, notice that $\alpha \ll n$ throughout training. {Also, as discussed after the statement of 
\Cref{as-het}, note that $(\frac{\alpha}{n})$ is higher for the heterogeneous case (except at the end of training for FMNIST). See \Cref{fig:het1} for the same on \texttt{FedAvg}.}
}
\label{fig:het0}
\end{figure*}

\section{Conclusion}
We presented \texttt{FedGLOMO}, a communication-efficient algorithm for faster federated learning via the application of variance-reducing momentum, both in the aggregation step at the server as well as local client updates. We showed that \texttt{FedGLOMO} has better iteration complexity than prior work on smooth non-convex functions with compressed communication. Further, unlike prior work, our result does not use the bounded client dissimilarity assumption, even holding under arbitrary client heterogeneity. We also demonstrated the efficacy of \texttt{FedGLOMO} via extensive experiments.

\begin{acknowledgements}
This work is supported by NSF grants CCF-1564000, IIS-1546452 and HDR-1934932, AFOSR grant FA9550-19-1-0005, and NASA grant 80NSSC21M0071.
\end{acknowledgements}

\bibliography{das_303}

\onecolumn
\appendix

\begin{center}
    \textbf{\LARGE Appendix}
\end{center}

{\Large \textbf{Contents}}
\\
\\
{\large \textbf{\Cref{sec:full-glomo}:} Full Statement of Theorem~\ref{nov4-thm1} and Proof Outline}
\\
\\
{\large \textbf{\Cref{sec:red_bits}:} Reduction in Total Communication
Cost when \texorpdfstring{$r \ll n$}{Lg}}
\\
\\
{\large \textbf{\Cref{sec:disc}:} Algorithmic and Theoretical Comparison with MIME ({\cite{karimireddy2020mime}})}
\\
\\
{\large \textbf{\Cref{sec:marina}:} Comparison with \cite{gorbunov2021marina}}
\\
\\
{\large \textbf{\Cref{sec:extra-exp}:} Experimental Details and Some More Results}
\begin{itemize}
    \item \textbf{\Cref{cifar100}:} Results on CIFAR-100
\end{itemize}
{\large \textbf{\Cref{sec:lomo}:} \texttt{FedLOMO}: A Simpler Version of \texttt{FedGLOMO}}
\begin{itemize}
    \item \textbf{\Cref{sec:result:lomo}:} Main Result for {\texttt{FedLOMO}}
\end{itemize}
{\large \textbf{\Cref{sec-res-pf}:} Detailed Proofs}
\begin{itemize}
    \item \textbf{\Cref{sec-pf-2}:} Detailed  Proof of the Result of \texttt{FedGLOMO}
    \item \textbf{\Cref{sec-pf-1}:} Detailed Proof of the Result of \texttt{FedLOMO}
\end{itemize}
{\large \textbf{\Cref{sec:fed_avg_conv}:} Convergence of \texttt{FedAvg} under Assumption \ref{as-het}}

\section{Full Statement of Theorem~\ref{nov4-thm1} and Proof Outline}
\label{sec:full-glomo}
Here we present the full version of \Cref{nov4-thm1} as \Cref{apr20-thm1}. The detailed proof of this result can be found in \Cref{sec-pf-2}. We also provide a brief proof outline after the theorem statement.

\begin{theorem} [\textbf{Expanded version of \Cref{nov4-thm1}}]
\label{apr20-thm1}
Let Assumptions \ref{as1}, \ref{as-may15} and \ref{as5} hold. Further, suppose \Cref{as-het} is true for \texttt{FedGLOMO}.
In \texttt{FedGLOMO}, for each round $k$, set: 
\[\text{$\eta_{k} = \eta = \frac{1}{6 L E K^{1/3} (\frac{1}{n}(\alpha + \frac{4}{E}) + 800 e^2 (1+q) (E+1)^2 (\frac{q}{n} + \frac{(1+q)(n-r)}{r (n-1)}))^{1/3}}$ and}\] 
\[\beta_k = \beta = 160 e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2.\]
Suppose we use full-device participation (i.e., the global batch size is $n$) \textbf{only at} $k = 0$. Then if $\frac{K^{-1/3}}{1200e^2(1+q) \big(\frac{q}{n} + \frac{(1+q)(n-r)}{r (n-1)}\big)} \leq E+1 \leq \frac{\sqrt{1+q} (n-r)}{3 r(n-1)} K$, we have:
\begin{flalign*}
   \frac{1}{K} \sum_{k=0}^{K-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] \leq \frac{39 L f(\bm{w}_{0})}{K^{2/3}} \Big({\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big)} + 800 e^2 (1+q) (E+1)^2 \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big)^{1/3}.
\end{flalign*}
Thus, \texttt{FedGLOMO} can achieve $\mathbb{E}_{k^{*} \sim \textup{Unif}[0,K-1]}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \epsilon$ in $K = \mathcal{O}\Big( \max\Big(\sqrt{\frac{\alpha}{n}}, (1+q)\sqrt{\frac{(n-r)}{r(n-1)}}\Big){\epsilon^{-1.5}}\Big)$ rounds of communication and $E = \mathcal{O}(1)$ local steps.
\end{theorem}
{Note that the above result is independent of the variance of local stochastic gradients (of the clients). In short, this happens because we use local full gradients at $\tau=0$ and because the local stochastic gradients are Lipschitz.}

\subsection*{{Proof Outline}:}
\label{sec:pf-out-3}
Before getting to the proof outline, we would like to mention that the key technical challenge in deriving the improved convergence result with global momentum-based variance reduction is obtaining an analogue of the Lipschitzness of stochastic gradients to the change in local parameters over $E$ local steps. More specifically, for pure stochastic optimization, a key step in proving convergence of momentum-based variance reduction methods is using the Lipschitzness of the stochastic gradients or the update quantities (see \cite{cutkosky2019momentum,liu2020optimal}), i.e., \[\|\nabla\widetilde{f}(\bm{x}_t,\xi_t) - \nabla\widetilde{f}(\bm{x}_{t-1},\xi_t)\| \leq L \|\bm{x}_t - \bm{x}_{t-1}\|.\] In the FL setting where aggregation is performed at the server, we need an analogue of this at the server, i.e., something like
\[\|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\| \leq \widetilde{L}\|\bm{w}_{k} - \bm{w}_{k-1}\|.\] Deriving this result is a part of our contribution and is done in  \Cref{nov-1-lem3} (in \Cref{sec-pf-2}).

\begin{proof}
We set $\eta_k = \eta$ and $\beta_k = \beta$ $\forall$ $k \in \{0,\ldots,K-1\}$. Then, using \Cref{nov-1-lem0} with full global as well as local batch sizes at $k=0$ (by which $\bm{u}_{0} = \overline{\bm{\delta}}_{0}$ in the statement of \Cref{nov-1-lem0}), we have at any $k' > 0$:
\begin{multline}
    \label{eq-feb9-11}
    \mathbb{E}[f(\bm{w}_{k'})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{k'-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4) }{n^2} \sum_{k=0}^{k'-1}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + 
    160\eta E \beta \Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)
    \sum_{k=0}^{k'-1}\sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2],
\end{multline}
for $4\eta L E^2 \leq 1$ and $\beta \geq \frac{80 e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2}{(1 - 4\eta L E)}$. 

Also, since the $f_i$'s are $L$-smooth and non-negative, using \Cref{lem1-oct20}, we have that: 
\[\sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2] \leq \sum_{i \in [n]} 2L(\mathbb{E}[f_i(\bm{w}_k)] - f_i^{*}) \leq 2n L \mathbb{E}[f(\bm{w}_k)] - 2L \sum_{i \in [n]} f_i^{*} \leq 2n L \mathbb{E}[f(\bm{w}_k)].\]
This step allows us to circumvent the need for the bounded client dissimilarity assumption. Using this in (\ref{eq-feb9-11}), we get:
\begin{multline}
    \label{eq:may23-1}
    \mathbb{E}[f(\bm{w}_{k'})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{k'-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    \\
    + \underbrace{64 \eta L E \Big(\frac{\eta^2 L^2 E (\alpha E + 4)}{n} + 5 \beta \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big)}_{=\gamma}\sum_{k=0}^{k'-1} \mathbb{E}[f(\bm{w}_{k})].
\end{multline}
Unfolding the above recursion and simplifying a bit, we get:
\begin{flalign}
    \label{eq:may23-2}
    \sum_{k=0}^{k'-1}\mathbb{E}[f(\bm{w}_{k})] \leq k' f(\bm{w}_{0}) - \frac{\eta E}{4}\sum_{k=0}^{k'-1} \mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] + \gamma k' \sum_{k=0}^{k'-1} \mathbb{E}[f(\bm{w}_{k})].
\end{flalign}
Let us now ensure that $\gamma k' \leq \frac{1}{2}$ for all $k' \in \{1,\ldots,K\}$, so that we can simplify (\ref{eq:may23-2}) to:
\begin{equation}
    \label{eq:may23-3}
    \sum_{k=0}^{k'-1}\mathbb{E}[f(\bm{w}_{k})] \leq 2k' f(\bm{w}_{0}) - \frac{\eta E}{2}\sum_{k=0}^{k'-1} \mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] \leq 2k' f(\bm{w}_{0}).
\end{equation}
Now for $8 \eta L E^2 \leq 1$, it can be verified that $\beta = 160e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2$ is a valid choice. Using this, we get that:
\begin{equation}
    \label{eq:may23-4}
    \gamma k' \leq \gamma K = \underbrace{64 \eta^3 L^3 E^3 K \Big(\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big) + 800 e^2 (1+q) (E+1)^2 \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big)}_{\text{(A)}}.
\end{equation}
Setting
\begin{equation*}
    \eta = \frac{1}{6 L E K^{1/3} (\frac{1}{n}(\alpha + \frac{4}{E}) + 800 e^2 (1+q) (E+1)^2 (\frac{q}{n} + \frac{(1+q)(n-r)}{r (n-1)}))^{1/3}},
\end{equation*}
we have (A) $ < \frac{1}{2}$. We also need to ensure that $8 \eta L E^2 \leq 1$ and $\beta = 160e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2 < 1$. The range of $E$ in the theorem statement is obtained by combining the constraints (on $E$) that we get from these two requirements.

Finally, using (\ref{eq:may23-3}) in (\ref{eq:may23-1}) with $k' = K$, substituting our choice of $\eta$ and $\beta$, and then simplifying a bit more, we get the final convergence result.
\end{proof}

\section{Reduction in total communication
cost when \texorpdfstring{$r \ll n$}{Lg}}
\label{sec:red_bits}
Here, we derive the claim made in the second paragraph of \Cref{rem-sep21-3}.

We consider the practical regime of $r \ll n$ (as well as $\alpha \ll n$).

First, consider the case where the clients communicate at full precision using 32 bits, i.e., $q=0$. The number of rounds of communication $K_1$ needed to reach an $\epsilon$ stationary point is:
\begin{flalign*}
    K_1 \approx \Big(\frac{39L f(\bm{w}_0)}{\epsilon}\Big)^{1.5} \Big(800 e^2 (E+1)^2 \frac{(n-r)}{r(n-1)}\Big)^{0.5}.
\end{flalign*}
Since the communication cost per-round is proportional to $r \times (32d)$ bits (recall $d$ is the model dimension), the total communication cost $C_1$ in this case is:
\begin{flalign*}
    C_1 & = 32dr \times K_1 
    \\
    & = 32dr \Big(\frac{39L f(\bm{w}_0)}{\epsilon}\Big)^{1.5} \Big(800 e^2 (E+1)^2 \frac{(n-r)}{r(n-1)}\Big)^{0.5}.
\end{flalign*}
Now, let us consider the QSGD quantizer of \cite{alistarh2017qsgd} with $s = \sqrt{d}$.\footnote{\cite{alistarh2017qsgd} use $n$ to denote the dimension.} With this choice, $q=1$. Here, the number of rounds of communication $K_2$ needed to reach an $\epsilon$ stationary point is:
\begin{flalign*}
    K_2 \approx \Big(\frac{39L f(\bm{w}_0)}{\epsilon}\Big)^{1.5} \Big(3200 e^2 (E+1)^2 \frac{(n-r)}{r(n-1)} \Big)^{0.5}.
\end{flalign*}
Now using Theorem 3.4 of \cite{alistarh2017qsgd}, under the special case of $s = \sqrt{d}$, the communication cost per-round can be reduced to $r \times (2.8d + 32)$ bits. Hence, the total communication cost $C_2$ in this case is:
\begin{flalign*}
    C_2 & \approx (2.8d + 32) r \times K_2 
    \\
    & \approx (2.8d + 32) r \Big(\frac{39L f(\bm{w}_0)}{\epsilon}\Big)^{1.5} \Big(3200 e^2 (E+1)^2 \frac{(n-r)}{r(n-1)} \Big)^{0.5}.
\end{flalign*}
Therefore,
\[\frac{C_1}{C_2} \approx \frac{32}{2.8 \times 4^{0.5}} \approx 5.7.\]

\section{Algorithmic and Theoretical Comparison with MIME (\texorpdfstring{\cite{karimireddy2020mime}}{Lg})}
\label{sec:disc}
We now discuss the major algorithmic and theoretical differences of our work from  \cite{karimireddy2020mime}. 

\begin{itemize}
    \item Algorithmically, \cite{karimireddy2020mime} do not explicitly \textit{apply} any momentum at the server.  
    Instead, they apply globally computed momentum in the local updates of the clients.
    On the other hand, \texttt{FedGLOMO} has an explicit momentum-based update at the server to enable global variance reduction, apart from local momentum applied in the client updates.
    \item Unlike \texttt{FedGLOMO}, the algorithms of \cite{karimireddy2020mime} do not have any quantized/compressed communication. As we discussed in \Cref{rem-sep21-3}, 
    maintaining the improved complexity of $\mathcal{O}(\epsilon^{-1.5})$ with compressed communication is not trivial.
    \item Even in the absence of any compressed communication, \texttt{FedGLOMO} is more communication-efficient than Mime: per round, it requires three-fourths of the bits that Mime requires when counting both server-to-clients and clients-to-server communication, and half the bits when counting only clients-to-server communication (which is typically the bottleneck in FL). This is because in Mime, the server needs to send $\bm{x}$ (sending some other statistics $\bm{s}$ would require even more bits) and $\bm{c}$ to the clients, and the clients need to send back $(\bm{y}_i,\nabla f_i(\bm{x}))$ to the server (please see their notation). In \texttt{FedGLOMO}, the server needs to send $\bm{w}_k$ and $\bm{w}_{k-1}$ to the clients, but the clients can just send back $\{{(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}})}  - (1-\beta_k){({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})}\}$ to the server in the absence of any quantization -- this can be verified by just removing the quantization operator $Q_D$ and expanding the update rule of $\bm{u}_k$ (line 10 of \Cref{alg:2}) for $k>0$.
    \item Our theory for \texttt{FedGLOMO} does not use the bounded client dissimilarity (BCD) assumption, i.e., \cref{eq:bcd}.
    Instead, we propose and use \Cref{as-het}, which allows for arbitrary client heterogeneity; in the worst case, \Cref{as-het} will hold with $\alpha = n$. In contrast, the results of MimeMVR use the BCD assumption.
    \item See the full version of Theorem V (on page 39 of the latest arXiv draft) of \cite{karimireddy2020mime} for MimeMVR. Their result is in terms of the gradient of $f$ at the local client parameters and not the actual server parameters, which is not ideal. Our result for \texttt{FedGLOMO} is completely in terms of the gradient of $f$ at the server parameters.
\end{itemize}


\section{Comparison with \texorpdfstring{\cite{gorbunov2021marina}}{Lg}}
\label{sec:marina}
As mentioned in \Cref{rel-work}, \cite{gorbunov2021marina} also propose algorithms with improved complexity in the \textit{distributed setting without multiple local update steps}. 
Since our work is under partial-device participation, we compare against their algorithm for the same case, i.e., PP-MARINA. Note that PP-MARINA has a probability $p$ of using \textit{full gradients from all clients} (i.e., full device participation) in each iteration. For a fair comparison against \texttt{FedGLOMO}, which uses gradients from all the clients \textit{only in the first round} (see \Cref{nov4-thm1}), $p$ should be set to $\frac{1}{K}$ ($K$ being the number of rounds) -- in which case, their complexity is $\mathcal{O}(\epsilon^{-2})$ which is worse than ours. See Theorem 4.1 in their paper for this.


\section{Experimental Details and some more results}
\label{sec:extra-exp}
We first describe the procedure we have used to generate heterogeneous data distribution (among the clients). First, the training data (of both CIFAR-10 and FMNIST) was sorted based on labels and then divided into 100 equal data-shards. Splitting the data into 100 equal shards (after sorting) ensures that each shard contains data from only one class for both CIFAR-10 and FMNIST. Since the number of clients in our experiments is fixed to 50, each client is assigned 2 shards chosen uniformly at random without replacement -- this ensures that each client can have data belonging to either just one class or two classes at the most. For the homogeneous case, we distribute the training data uniformly at random among the clients.

In all our experiments, we use the learning rate schedule suggested in \cite{bottou2012stochastic} where we decay the client learning rate by 1\% after every round, i.e., $\eta_k = (0.99)^k \eta_0$, $\eta_0$ being the initial learning rate. Note that this learning rate schedule has been used earlier for FL experiments in \cite{haddadpour2020federated}.
We search the initial learning rates over $\{10^{-3}, 5 \times 10^{-3}, 10^{-2}, 5 \times 10^{-2}, 10^{-1}\}$; the best performance is obtained with an initial learning rate of $10^{-2}$ in almost all the cases. 

For \texttt{FedGLOMO}, we use a constant value of $\beta_k = 0.2$. For \texttt{FedAvg}-lm, \texttt{FedAvg}-glm, \texttt{FedPAQ}-lm and \texttt{FedPAQ}-glm, the local (client-level) momentum parameter (in the PyTorch optimizer) is set to 0.9 (which is also the default value used). For \texttt{FedAvg}-glm and \texttt{FedPAQ}-glm,
we just implement the server update as a PyTorch optimizer update with momentum, and this momentum parameter is searched over $\{0.9,0.7,0.5\}$. For \texttt{MimeSGDm}, we search its momentum hyper-parameter over $\{0,0.9,0.99\}$ as suggested in \cite{karimireddy2020mime}.

We make a small modification to \texttt{FedGLOMO} in our experiments for the heterogeneous case. Specifically, we modify line \ref{l2} (which is the local momentum application step) of \Cref{alg:2-local} as follows:
\[\text{Update: } \bm{v}_{k,\tau}^{(i)} = \widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + {\color{blue} 0.8}\big(\bm{v}_{k,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})\big) \text{ and}\]
\[\widehat{\bm{v}}_{k-1,\tau}^{(i)} = \widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + {\color{blue} 0.8}\big(\widehat{\bm{v}}_{k-1,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})\big).\]
Without applying the above damping factor of 0.8, \texttt{FedGLOMO} seems to diverge -- this is probably because we have chosen the number of local updates to be too large.

All experiments are run on a single NVIDIA TITAN Xp GPU.

\subsection{Results on CIFAR-100}
\label{cifar100}
We consider the task of classification on CIFAR-100 (100 classes). We use 512-dimensional features extracted from the last layer of a ResNet-34 model  pretrained on ImageNet. The architecture is a two-layered ReLU neural network with the size of hidden layers being 300 and 150, respectively. As we have in the main paper, the total number of clients  is 50 and 50\% of the clients participate in each round (i.e., $r = 0.5n$). Also, we consider heterogeneous and homogeneous (i.i.d.) settings similar to the main paper; in the heterogeneous setting, each client has data from 2 classes. The number of rounds and the number of local steps per round are 50 and 20, respectively. We compare 3 different runs of \texttt{FedPAQ}-lm, \texttt{FedPAQ}-glm, \texttt{FedLOMO} and \texttt{FedCOMGATE} with 8 bits per round against \texttt{FedGLOMO} with 4 bits per round (so that the total number of bits communicated per round of all algorithms is the same). All other details (about learning rates, momentum parameters, etc.) are the same as in the main paper. The
test error (error $= 100 - \text{accuracy}$) values averaged over the last five rounds for this case are listed in \Cref{tab-cifar100}.

\begin{table}[!htb]
\begin{center}
%\begin{small}
%\begin{sc}
\begin{tabular}{|l|c|c|}
%\toprule
\hline
\textbf{Algo.} & \textbf{CIFAR-100 Het.} & \textbf{CIFAR-100 Hom.} 
\\
\hline
\texttt{FedPAQ}-lm & 31.36 $\pm$ 0.05 & 31.02 $\pm$ 0.12
\\
\hline
\texttt{FedPAQ}-glm & 31.76 $\pm$ 0.09 & 30.78 $\pm$ 0.16
\\
\hline
\texttt{FedLOMO} & 31.26 $\pm$ 0.16 & 30.73 $\pm$ 0.05
\\
\hline
\texttt{FedGLOMO} & \textbf{30.76 $\pm$ 0.10} & \textbf{30.05 $\pm$ 0.06}
\\
\hline
\texttt{FedCOMGATE} & 34.27 $\pm$ 0.09 & 33.59 $\pm$ 0.09
\\
\hline
\end{tabular}
%\end{sc}
%\end{small}
\end{center}
\caption{Average \textbf{test error} \% ($\pm$ standard deviation) over the last five rounds for CIFAR-100.}
\label{tab-cifar100}
\end{table}
In both the heterogeneous and homogeneous settings, notice that \texttt{FedGLOMO} converges to the lowest test error; this is consistent with the results in the main paper on CIFAR-10 and FMNIST.


\section{\texttt{FedLOMO}: A Simpler Version of \texttt{FedGLOMO}}
\label{sec:lomo}
Now we consider a simpler version of \texttt{FedGLOMO}, which we call \texttt{FedLOMO}, that applies only local momentum in the client updates and does simple averaging at the server (like \texttt{FedAvg}), i.e., there is no global momentum (and hence the name of this variant does not have a \enquote{\texttt{G}}). \texttt{FedLOMO} is summarized in \Cref{alg:1,alg:1-local}.
Notice that the momentum application occurs in {line \ref{line:mom}} of \Cref{alg:1-local}. 

As mentioned in the main paper, \texttt{FedLOMO} does not achieve the optimal convergence rate for smooth non-convex functions due to the absence of global momentum; see \Cref{fl-thm3} and the subsequent remarks. 


Just like the results of \texttt{FedGLOMO}, we do not use the BCD assumption (i.e., \cref{eq:bcd}) to derive the results of \texttt{FedLOMO}.


\begin{algorithm}[!htb]
	\caption{\texttt{FedLOMO} - Server Update}
	\label{alg:1}
	\begin{algorithmic}[1]
		\STATE {\bfseries Input:} 
		Initial point $\bm{w}_0$, \# of rounds of communication $K$, period $E$, learning rates  $\{\eta_{k}\}_{k=0}^{K-1}$, per-client batch size $b$, and  global batch size $r$. $Q_D$ is the quantization operator.
		%\vspace{0.1 cm}
		\FOR{$k =0,\dots, K-1$}
		%\vspace{0.1 cm}
		\STATE Server chooses a set $\mathcal{S}_k$ of $r$ clients uniformly at random without replacement and sends $\bm{w}_k$ to them.
		%\vspace{0.1 cm}
		\FOR{client $i \in \mathcal{S}_k$}
		%\vspace{0.1 cm}
		\STATE Set $\bm{w}_{k,0}^{(i)} = \bm{w}_k$ and run \Cref{alg:1-local} for client $i$.
		%\vspace{0.1 cm}
		\ENDFOR
		%\vspace{0.1 cm}
		\STATE Update $\bm{w}_{k+1} = \bm{w}_{k} + \frac{1}{r}\sum_{i \in \mathcal{S}_k}Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})$.
		\label{line:fedavg}
		%\vspace{0.1 cm}
		\ENDFOR
	\end{algorithmic}
\end{algorithm}


\begin{algorithm}[!htb]
	\caption{\texttt{FedLOMO} - {Client Update}}
	\label{alg:1-local}
	\begin{algorithmic}[1]
		\FOR{$\tau = 0,\ldots,E-1$}
		%\vspace{0.1 cm}
		\IF{$\tau = 0$}
		%\vspace{0.1 cm}
		\STATE $\bm{v}_{k,\tau}^{(i)} = \nabla f_i(\bm{w}_{k,\tau}^{(i)})$. 
		\label{line:full-grad}
		%\vspace{0.1 cm}
		\ELSE
		%\vspace{0.1 cm}
		\STATE Pick a random batch of $b$ samples in client $i$, say $\mathcal{B}_{k,\tau}^{(i)}$. Compute the stochastic gradients of $f_i$ at $\bm{w}_{k,\tau}^{(i)}$ and $\bm{w}_{k,\tau-1}^{(i)}$ over $\mathcal{B}_{k,\tau}^{(i)}$ viz. $\widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$ and $\widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$, respectively.
		%\vspace{0.2 cm}
		\STATE \label{line:mom}
		Update 
		$\bm{v}_{k,\tau}^{(i)} = \widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + \big(\bm{v}_{k,\tau-1}^{(i)}  - \widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})\big)$.
		%\\
		\text{// {\color{blue} \texttt{(Local Momentum)}}} 
		%\vspace{0.1 cm}
		\ENDIF
		%\vspace{0.1 cm}
		\STATE Update $\bm{w}_{k,\tau+1}^{(i)} = \bm{w}_{k,\tau}^{(i)} - \eta_{k}\bm{v}_{k,\tau}^{(i)}$.
		%\vspace{0.1 cm}
		\ENDFOR
		%\vspace{0.1 cm}
		\STATE Send $Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})$ to the server.
		%\vspace{0.1 cm}
	\end{algorithmic}
\end{algorithm}
\subsection{Main Result for {\texttt{FedLOMO}}}
\label{sec:result:lomo}
Now, we present the convergence result of \texttt{FedLOMO} for the smooth non-convex case in \Cref{fl-thm3}. Its proof is in \Cref{sec-pf-1}. 

\begin{theorem}[\textbf{Smooth non-convex case for \texttt{FedLOMO}}]
\label{fl-thm3}
Let Assumptions \ref{as1}, \ref{as-may15} and \ref{as5} hold. Further, suppose \Cref{as-het} holds for \texttt{FedLOMO}. 
Define a distribution $\mathbb{P}$ for $k \in \{0,\ldots,K-1\}$ such that $\mathbb{P}(k) = \frac{(1+\zeta)^{(K-1-k)}}{\sum_{k=0}^{K-1}(1+\zeta)^k}$ where $\zeta$ will be defined later. Sample $k^{*}$ from $\mathbb{P}$. 
\\
\textbf{1. With compression ($q > 0$) and partial-device participation ($r < n$):}
\\
In \texttt{FedLOMO}, set $\eta_{k} = \frac{1}{8 L E \sqrt{B K}}$ for all $k$, where $B = \frac{q}{n} + \frac{4(1+q)(n-r)}{r(n-1)}$. Then for $K > \frac{1}{64 B^3} (\frac{1}{n}(\alpha + \frac{4}{E}))$:
\begin{flalign*}
    \nonumber
    \mathbb{E}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \frac{64 \sqrt{B} L f(\bm{w}_0)}{K^{1/2}} \text{ with }\zeta := \frac{1}{4K} + \frac{1}{16 (B K)^{1.5}} \Big(\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big)\Big). 
\end{flalign*}
So \texttt{FedLOMO} needs $K = \mathcal{O}(\frac{1}{r \epsilon^2})$ rounds of communication to achieve $\mathbb{E}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \epsilon$, for $\epsilon < \mathcal{O}(\frac{n B^2}{\alpha}) = \mathcal{O}(\frac{n/\alpha}{r^2})$.
\\
\\
\textbf{2. No compression ($q = 0$) and full-device participation ($r = n$):}
\\
In \texttt{FedLOMO}, set $\eta_k = \frac{1}{4 L E} \Big(\frac{n}{(\alpha + \frac{4}{E}) K}\Big)^{1/3}$ for all $k$. Then for $K > \Big(\frac{n}{\alpha + \frac{4}{E}}\Big)$:
\begin{flalign*}
    \nonumber
    \mathbb{E}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \frac{16 L f(\bm{w}_0)}{K^{2/3}} \Bigg(\frac{\alpha + \frac{4}{E}}{n}\Bigg)^{1/3}.
\end{flalign*}
So \texttt{FedLOMO} needs $K = \mathcal{O}(\frac{1}{\epsilon^{1.5}} \sqrt{\frac{\alpha}{n}})$ rounds of communication to achieve $\mathbb{E}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \epsilon$, for $\epsilon < \mathcal{O}(\frac{\alpha}{n})$.
\end{theorem}
We make some remarks to discuss implications of this result and establish connections to some claims made in the main paper.
\begin{remark}[\textbf{Worse
iteration complexity than \texttt{FedGLOMO} under compression and partial-device participation}]
Under compression (i.e., $q > 0$) and partial-device participation (i.e., $r < n$), the number of iterations $T = KE$ of \texttt{FedLOMO} is $\mathcal{O}(\frac{1}{r\epsilon^{2}})$ as per the above theorem (since we do not have any constraint on $E$ depending on $\epsilon$). So the iteration complexity of \texttt{FedLOMO} is poorer than that of \texttt{FedGLOMO} under the same setting. However, it is on a par with the results of \cite{haddadpour2020federated, koloskova2020unified, wang2019slowmo, karimireddy2019scaffold}.
\label{r3-1}
\end{remark}

\begin{remark}[\textbf{Same
iteration complexity as \texttt{FedGLOMO} under no compression and full-device participation}]
When there is no compression (i.e., $q = 0$) and full-device participation (i.e., $r = n$), the number of iterations $T = KE$ of \texttt{FedLOMO} turns out to be $\mathcal{O}(\frac{1}{\epsilon^{1.5}}\sqrt{\frac{\alpha}{n}})$; this is exactly the same as that of \texttt{FedGLOMO} under the same setting. Intuitively this makes sense because under full-device participation (and no compression), the global momentum of \texttt{FedGLOMO} becomes redundant.
\label{r3-100}
\end{remark}

\begin{remark}[\textbf{High variance of simple averaging at the server}]
At a high level, \texttt{FedLOMO} fails to attain the complexity of \texttt{FedGLOMO} under partial-device participation and compression because of the high variance of the \texttt{FedAvg}-like plain averaging step at the server. The high variance is itself due to the amplified effect of client heterogeneity with multiple local updates. Without the application of some \textit{global variance-reduction} technique (like the one in \texttt{FedGLOMO}), the complexity cannot be improved.
More precisely, in \Cref{fl-thm3}, $B$ is a constant that is not $\mathcal{O}(\eta L E)$ in general, due to which \texttt{FedLOMO} does not achieve the improved convergence rate of $\mathcal{O}(K^{-2/3})$ that \texttt{FedGLOMO} attains; see the proof of \Cref{fl-thm3} in \Cref{sec-pf-1} for more details.
However, in the case of full-device participation and no compression, $B$ is 0 which allows \texttt{FedLOMO} to also achieve $\mathcal{O}(K^{-2/3})$ convergence by choosing $\eta = \mathcal{O}(\frac{1}{LEK^{1/3}})$.
\label{r3-1-4}
\end{remark}


\section{Detailed Proofs}
\label{sec-res-pf}

\subsection{Detailed  Proof of the Result of \texttt{FedGLOMO}}
\label{sec-pf-2}
\textbf{Some definitions used in the proofs}: 
\[\bm{\delta}_k^{(i)} \triangleq \mathbb{E}_{\mathcal{B}_{1}^{(i)},\ldots,\mathcal{B}_{E-1}^{(i)}}[\bm{w}_k - \bm{w}_{k,E}^{(i)}] \text{ for any $E-1$ batches $\{\mathcal{B}_{1}^{(i)},\ldots,\mathcal{B}_{E-1}^{(i)}\}$ in client $i$, and } \overline{\bm{\delta}}_k \triangleq \frac{1}{n}\sum_{i \in [n]}\bm{\delta}_k^{(i)}.\]
\[{g}_Q(\bm{w}_k;\mathcal{S}_k) \triangleq \frac{1}{r}\sum_{i \in \mathcal{S}_k}Q_D({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}})\]
\[\Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k) \triangleq \frac{1}{r}\sum_{i \in \mathcal{S}_k}Q_D(({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}}))\]
\[{g}(\bm{w}_k;\mathcal{S}_k) \triangleq \frac{1}{r}\sum_{i \in \mathcal{S}_k}({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) = \mathbb{E}_{Q_D}[{g}_Q(\bm{w}_k;\mathcal{S}_k)]\]
\[\widehat{g}(\bm{w}_{k-1};\mathcal{S}_k) \triangleq \frac{1}{r}\sum_{i \in \mathcal{S}_k}({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\]
\[\overline{\bm{w}}_{k,\tau} \triangleq \frac{1}{n}\sum_{i \in [n]} \bm{w}_{k,\tau}^{(i)} \text{ and } \overline{\bm{v}}_{k,\tau} \triangleq \frac{1}{n}\sum_{i \in [n]} \bm{v}_{k,\tau}^{(i)}\]
\[{\bm{e}}_{k,\tau}^{(i)} \triangleq \bm{v}_{k,\tau}^{(i)} - \nabla f_i(\bm{w}_{k, \tau}^{(i)}) \text{ and } \widetilde{\bm{e}}_{k,\tau}^{(i)} \triangleq \nabla f_i(\bm{w}_{k, \tau}^{(i)}) - \nabla f_i(\overline{\bm{w}}_{k,\tau})\]


\noindent \textbf{Proof of \Cref{apr20-thm1} (recall that this is the full version of \Cref{nov4-thm1}):}
\begin{proof}
We set $\eta_k = \eta$ and $\beta_k = \beta$ $\forall$ $k \in \{0,\ldots,K-1\}$.
\\
Then using \Cref{nov-1-lem0}, we have that:
\begin{multline}
    \label{eq:nov-4-thm1-1}
    \mathbb{E}[f(\bm{w}_{k'})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{k'-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4) }{n^2} \sum_{k=0}^{k'-1}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \frac{5}{4 \eta E \beta }{\mathbb{E}[\|\bm{u}_{0} - \overline{\bm{\delta}}_{0}\|^2]} 
    + 
    160\eta E \beta \Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)
    \sum_{k=0}^{k'-1}\sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2],
\end{multline}
for $4\eta L E^2 \leq 1$ and $\beta \geq \frac{80 e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2}{(1 - 4\eta L E)}$. 

Suppose we use full batch sizes for the local updates as well as the server update at $k = 0$ (the latter means $r=n$ only for $k=0$). Then, $\bm{u}_{0} = \overline{\bm{\delta}}_{0}$ above. Also, since the $f_i$'s are $L$-smooth, using \Cref{lem1-oct20}, we have that: 
\[\sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2] \leq \sum_{i \in [n]} 2L(\mathbb{E}[f_i(\bm{w}_k)] - f_i^{*}) \leq 2n L \mathbb{E}[f(\bm{w}_k)] - 2L \sum_{i \in [n]} f_i^{*} \leq 2n L \mathbb{E}[f(\bm{w}_k)].\]
The last step above follows because the $f_i^{*}$'s are non-negative. This trick allows us to circumvent the need for the bounded client dissimilarity assumption.

Using these in (\ref{eq:nov-4-thm1-1}), we get:
\begin{multline}
    \label{eq:may15-1}
    \mathbb{E}[f(\bm{w}_{k'})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{k'-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    \\
    + \underbrace{64 \eta L E \Big(\frac{\eta^2 L^2 E (\alpha E + 4)}{n} + 5 \beta \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big)}_{=\gamma}\sum_{k=0}^{k'-1} \mathbb{E}[f(\bm{w}_{k})].
\end{multline}
Using (\ref{eq:may15-1}) recursively, we get:
\begin{flalign}
    \sum_{k=0}^{k'-1}\mathbb{E}[f(\bm{w}_{k})] & \leq k' f(\bm{w}_{0}) - \frac{\eta E}{4}\sum_{k=0}^{k'-2} (k'-1-k)\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] + \gamma \sum_{k=0}^{k'-2} (k'-1-k) \mathbb{E}[f(\bm{w}_{k})]
    \\
    \label{eq:may15-2}
    & \leq k' f(\bm{w}_{0}) - \frac{\eta E}{4}\sum_{k=0}^{k'-1} \mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] + \gamma k' \sum_{k=0}^{k'-1} \mathbb{E}[f(\bm{w}_{k})].
\end{flalign}
Let us now ensure that $\gamma k' \leq \frac{1}{2}$ for all $k' \in \{1,\ldots,K\}$, in which case we can simplify (\ref{eq:may15-2}) to:
\begin{equation}
    \label{eq:may15-3}
    \sum_{k=0}^{k'-1}\mathbb{E}[f(\bm{w}_{k})] \leq 2k' f(\bm{w}_{0}) - \frac{\eta E}{2}\sum_{k=0}^{k'-1} \mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] \leq 2k' f(\bm{w}_{0}).
\end{equation}
Now:
\begin{equation}
    \gamma k' \leq \gamma K = 64 \eta L E \Big(\frac{\eta^2 L^2 E (\alpha E + 4)}{n} + 5 \beta \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big)K.
\end{equation}

{Now if we set $8 \eta L E^2 \leq 1$, then it can be verified that $\beta = 160e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2$ is a valid choice. Using this above, we get that:
\begin{equation}
    \gamma k' \leq \gamma K = \underbrace{64 \eta^3 L^3 E^3 K \Big(\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big) + 800 e^2 (1+q) (E+1)^2 \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big)}_{\text{(A)}}.
\end{equation}
Setting $\eta = \frac{1}{6 L E K^{1/3} (\frac{1}{n}(\alpha + \frac{4}{E}) + 800 e^2 (1+q) (E+1)^2 (\frac{q}{n} + \frac{(1+q)(n-r)}{r (n-1)}))^{1/3}}$, we have (A) $ < \frac{1}{2}$. But we must also have
\begin{equation}
    \label{eq:may15-4}
    8 \eta L E^2 = \frac{4 E}{3 K^{1/3} \big(\frac{1}{n}\big(\alpha + \frac{4}{E}\big) + 800 e^2 (1+q) (E+1)^2 \big(\frac{q}{n} + \frac{(1+q)(n-r)}{r (n-1)}\big)\big)^{1/3}} \leq 1.
\end{equation}
This holds for $K^{1/3} (E+1) \geq \frac{1}{1200e^2(1+q) \big(\frac{q}{n} + \frac{(1+q)(n-r)}{r (n-1)}\big)}$.

Further $\beta$ must be smaller than 1, so
\begin{equation}
    \beta = 160 e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2 = \frac{160 e^2 (1+q) (E+1)^2}{36 K^{2/3} \big(\frac{1}{n}\big(\alpha + \frac{4}{E}\big) + 800 e^2 (1+q) (E+1)^2 \big(\frac{q}{n} + \frac{(1+q)(n-r)}{r (n-1)}\big)\big)^{2/3}} < 1.
\end{equation}
This holds for $E+1 \leq \frac{\sqrt{1+q} (n-r)}{3 r(n-1)} K$.

Now using (\ref{eq:may15-3}) in (\ref{eq:may15-1}) with $k' = K$ and our choice of $\beta = 160 e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2$ and $\eta = \frac{1}{6 L E K^{1/3} (\frac{1}{n}(\alpha + \frac{4}{E}) + 800 e^2 (1+q) (E+1)^2 (\frac{q}{n} + \frac{(1+q)(n-r)}{r (n-1)}))^{1/3}}$, we get:
\begin{multline}
    \label{eq:may15-5}
    \mathbb{E}[f(\bm{w}_{K})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{K-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    \\
    + {128 \eta^3 L^3 E^3 \Big(\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big) + 800 e^2 (1+q)(E+1)^2 \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big)}K f(\bm{w}_0).
\end{multline}
Rearranging the above a bit and using the fact that $f(\bm{w}_K) \geq 0$, we get:
%\small
\begin{equation}
    \label{eq:may15-6}
    \frac{1}{K} \sum_{k=0}^{K-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] \leq \frac{4 f(\bm{w}_{0})}{\eta E K} + 512 \eta^2 L^3 E^2 \Big(\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big) + 800 e^2 (1+q) (E+1)^2 \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big) f(\bm{w}_0).
\end{equation}
%\normalsize
Substituting the value of $\eta$ above, we get:
\begin{equation}
    \label{eq:may15-7}
    \frac{1}{K} \sum_{k=0}^{K-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] \leq \frac{39 L f(\bm{w}_{0})}{K^{2/3}} \Big({\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big)} + 800 e^2 (1+q) (E+1)^2 \Big(\frac{q}{n} + \frac{(1+q)(n-r)}{r(n-1)}\Big)\Big)^{1/3}.
\end{equation}
This concludes the proof.
}
\end{proof}

\noindent \textbf{Lemmas used in the proof of \Cref{apr20-thm1}:}

\begin{lemma}
\label{nov-1-lem0}
Suppose $4\eta L E^2 \leq 1$ and $\beta \geq \frac{80 e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2}{(1 - 4\eta L E)}$. 
Then for any $k' \in \{1,\ldots,K\}$, we have:
\begin{multline*}
    \mathbb{E}[f(\bm{w}_{k'})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{k'-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + {\frac{16 \eta^3 L^2 E^2 (\alpha E + 4)}{n^2}} \sum_{k=0}^{k'-1}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \frac{5}{4 \eta E \beta }{\mathbb{E}[\|\bm{u}_{0} - \overline{\bm{\delta}}_{0}\|^2]} 
    + 
    160\eta E \beta \Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)
    \sum_{k=0}^{k'-1}\sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2].
\end{multline*}
\end{lemma}
\begin{proof}
Per the previous definitions:
\begin{equation}
    \label{eq:nov-1-thm1-1}
    \bm{u}_k = \beta {g}_Q(\bm{w}_k;\mathcal{S}_k) + (1-\beta)\bm{u}_{k-1} + (1-\beta)\Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k)
\end{equation}
By $L$-smoothness of $f$, we have for $k \geq 1$:
\begin{flalign}
    \nonumber
    \mathbb{E}[f(\bm{w}_{k+1})] & \leq \mathbb{E}[f(\bm{w}_{k})] + \mathbb{E}[\langle \nabla f(\bm{w}_{k}), \bm{w}_{k+1} - \bm{w}_{k} \rangle] + \frac{L}{2}\mathbb{E}[\|\underbrace{\bm{w}_{k+1} - \bm{w}_{k}}_{=-\bm{u}_k}\|^2]
    \\
    \label{eq:nov-1-thm1-1-0}
    & = \mathbb{E}[f(\bm{w}_{k})] + \underbrace{\mathbb{E}[\langle \nabla f(\bm{w}_{k}), -\bm{u}_{k}\rangle]}_\text{(I*)} 
    + \underbrace{\frac{1}{8\eta E}\mathbb{E}[\|\bm{u}_{k}\|^2]}_\text{(II*)}
    - \Big(\frac{1}{8\eta E} - \frac{L}{2}\Big)\mathbb{E}[\|\bm{w}_{k+1} - \bm{w}_{k}\|^2].
\end{flalign}
Let us analyze (I*) first. 
\begin{flalign}
    %\nonumber
    \label{eq:nov6-1}
    \mathbb{E}[\langle \nabla f(\bm{w}_{k}), -\bm{u}_{k}\rangle] & = \mathbb{E}[\langle \nabla f(\bm{w}_{k}), -{g}(\bm{w}_k;\mathcal{S}_k) - (1 - \beta)(\bm{u}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)) \rangle]
    \\
    \nonumber
    & = \mathbb{E}[\langle \nabla f(\bm{w}_{k}), -{g}(\bm{w}_k;\mathcal{S}_k) \rangle] - (1 - \beta)\mathbb{E}[\langle \nabla f(\bm{w}_{k}), \bm{u}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k) \rangle]
    \\
    \nonumber
    & = \underbrace{\mathbb{E}[\langle \nabla f(\bm{w}_{k}), \frac{1}{n}\sum_{i \in [n]}({\bm{w}_{k,E}^{(i)}} - \bm{w}_{k}) \rangle]}_\text{(III*)} + \underbrace{(1-\beta)\mathbb{E}[\langle - \nabla f(\bm{w}_{k}), \bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}
    \rangle]}_\text{(IV*)}
\end{flalign}
(\ref{eq:nov6-1}) follows by taking expectation with respect to $Q_D$. (III*) is obtained by taking expectation with respect to $\mathcal{S}_k$ above. (IV*) is obtained by taking expectation with respect to $\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}$ and $\mathcal{S}_k$ above.


From \Cref{lem1-may11}, for $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\text{min}\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$, we can bound (III*) as:
%\small
\begin{multline}
    \label{eq:nov-1-thm1-2}
    \text{(III*)} \leq -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] - \frac{\eta}{2}{\Big(1 - \eta^2 L^2 E^2 \Big)}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] 
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4)}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2].
\end{multline}
%\normalsize
Note that for $\eta L \ll 1$ (which is going to be the case eventually), we can combine all the above constraints on $\eta$ and $E$ into $4 \eta L E < 1$.

As for (IV*):
%\small
\begin{flalign}
    %\label{eq:nov-1-thm1-3-i}
    \text{(IV*)}
    & \leq (1-\beta)\mathbb{E}\big[\|\nabla f(\bm{w}_{k})\| \|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|\big]
    \\
    \label{eq:nov-1-thm1-3-i}
    & \leq \frac{(1-\beta)}{2}\Big(\frac{\eta E}{2(1-\beta)}{\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]} + \frac{2(1-\beta)\mathbb{E}[\|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|^2]}{\eta E}\Big) 
    \\
    \label{eq:nov-1-thm1-3}
    & = \frac{\eta E}{4}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] + \frac{(1-\beta)^2}{\eta E}\mathbb{E}[\|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|^2].
\end{flalign}
%\normalsize
(\ref{eq:nov-1-thm1-3-i}) above follows by the AM-GM inequality. 
\\
Adding (\ref{eq:nov-1-thm1-2}) and (\ref{eq:nov-1-thm1-3}), we get:
\begin{multline}
    \label{eq:nov-1-thm1-4}
    \text{(I*)} \leq -\frac{\eta E}{4}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}(1 - \eta^2  L^2 E^2)\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \frac{(1-\beta)^2}{\eta E}\mathbb{E}[\|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|^2]
    \\
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4)}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2].
\end{multline}
Now, let us analyze (II*). We have:
\begin{flalign}
    %\nonumber
    \label{eq:nov-1-thm1-4-2}
    \mathbb{E}[\|\bm{u}_k\|^2] \leq 2 \mathbb{E}[\|\overline{\bm{\delta}}_k\|^2] + 2 \mathbb{E}[\|\bm{u}_k - \overline{\bm{\delta}}_k\|^2]
\end{flalign}
Notice that:
\begin{equation}
    \label{eq:nov-1-thm1-4-2-0}
    \overline{\bm{\delta}}_k = \mathbb{E}_{\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}}\Big[\frac{1}{n}\sum_{i \in [n]}(\bm{w}_k - \bm{w}_{k,E}^{(i)})\Big] = \mathbb{E}_{\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}}[\sum_{\tau=0}^{E-1} \eta \overline{\bm{v}}_{k,\tau}].
\end{equation}
Thus:
\begin{equation}
    \label{eq:nov-1-thm1-5}
    \mathbb{E}[\|\overline{\bm{\delta}}_k\|^2] \leq  \eta^2 \mathbb{E}_{}\Big[\Big\|\sum_{\tau=0}^{E-1} \overline{\bm{v}}_{k,\tau}\Big\|^2\Big] \leq E \eta^2 \sum_{\tau=0}^{E-1} \mathbb{E}_{}[\| \overline{\bm{v}}_{k,\tau}\|^2].
\end{equation}
The expectation above is with respect to all the randomness in the algorithm so far.
\\
Using (\ref{eq:nov-1-thm1-5}) and the result of \Cref{nov-1-lem2} in (\ref{eq:nov-1-thm1-4-2}) with $2 \eta L E^2 \leq 1$, we have that:
\begin{multline}
    \label{eq:nov-1-thm1-6}
    \mathbb{E}[\|\bm{u}_k\|^2] \leq 2 E \eta^2 \sum_{\tau=0}^{E-1} \mathbb{E}_{}[\| \overline{\bm{v}}_{k,\tau}\|^2] + 2 \Big\{(1-\beta)^2\mathbb{E}[\|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|^2] + 2 \beta^2 \mathbb{E}[\|{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2] 
    \\
    + 8 e^2 (1+q) (1-\beta)^2 \eta^2 L^2 E^2 (E+1)^2 \mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2]\Big\}.
\end{multline}
Recalling that (II*) = $\frac{1}{8\eta E} \mathbb{E}[\|\bm{u}_k\|^2]$, we get:
\begin{multline}
    \label{eq:nov-1-thm1-8}
    \text{(II*)} \leq \frac{\eta}{4} \sum_{\tau=0}^{E-1} \mathbb{E}_{}[\| \overline{\bm{v}}_{k,\tau}\|^2] + \frac{1}{4\eta E} \Big\{(1-\beta)^2\mathbb{E}[\|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|^2] 
    + 2 \beta^2 \mathbb{E}[\|{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2] 
    \\
    + 8 e^2 (1+q) (1-\beta)^2 \eta^2 L^2 E^2 (E+1)^2 \mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2]\Big\}.
\end{multline}
Adding (\ref{eq:nov-1-thm1-4}) and (\ref{eq:nov-1-thm1-8}):
\begin{multline}
    \label{eq:nov-1-thm1-9}
    \text{(I*)} + \text{(II*)} \leq -\frac{\eta E}{4}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}\underbrace{\Big(1 - \eta^2  L^2 E^2 - \frac{1}{2}\Big)}_\text{$>0$ for $4 \eta L E \leq 1$}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] 
    \\
    + \frac{16 \eta^3 L^2 E^2}{n^2}%\underbrace{(\beta E + 4)}_{<(\beta + 4)E}
    (\alpha E + 4)
    \sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    + \frac{5(1-\beta)^2}{4\eta E}\underbrace{\mathbb{E}[\|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|^2]}_\text{from \Cref{nov-1-lem2}}
    \\
    + \frac{\beta^2}{2 \eta E} \mathbb{E}[\|{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2] 
    + 2 e^2 (1+q) (1-\beta)^2 \eta L^2 E (E+1)^2 \mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2].
\end{multline}
Therefore, using %(\ref{eq:nov-1-thm1-10}),
\Cref{nov-1-lem2} %and \Cref{nov-1-lem3} (as done before in (\ref{eq:nov-1-thm1-7}))
recursively, we get:
\begin{multline}
    \label{eq:nov-1-thm1-11}
    \text{(I*)} + \text{(II*)} \leq -\frac{\eta E}{4}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4) }{n^2}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \frac{5(1-\beta)^{2k}}{4\eta E}{\mathbb{E}[\|\bm{u}_{0} - \overline{\bm{\delta}}_{0}\|^2]} 
    + \frac{5\beta^2}{2\eta E}%\beta^2
    \sum_{l=1}^{k}(1-\beta)^{2(k-l)}\underbrace{\mathbb{E}[\|{g}_Q(\bm{w}_l;\mathcal{S}_l) - \overline{\bm{\delta}}_l\|^2]}_\text{(V*)}
    \\
    + 10 e^2 (1+q) \eta L^2 E (E+1)^2 \sum_{l=1}^{k}(1-\beta)^{2(k-l+1)}\mathbb{E}[\|\bm{w}_{l} - \bm{w}_{l-1}\|^2].
\end{multline}
Using \Cref{lem-may11-n1}, we get:
\begin{equation}
    \text{(V*)} \leq 4 \eta^2 E\Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{l,\tau}^{(i)}\|^2].
\end{equation}

Putting this back in (\ref{eq:nov-1-thm1-11}), we get:
\begin{multline}
    \label{eq:nov-1-thm1-12}
    \text{(I*)} + \text{(II*)} \leq -\frac{\eta E}{4}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4) }{n^2} \sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \frac{5(1-\beta)^{2k}}{4\eta E}{\mathbb{E}[\|\bm{u}_{0} - \overline{\bm{\delta}}_{0}\|^2]}
    + %\frac{10(1+q)\eta \beta^2}{r(n-1)}\Big(1 - \frac{r}{n}\Big)
    10\eta \beta^2 \Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)
    \sum_{l=1}^{k}(1-\beta)^{2(k-l)}\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{l,\tau}^{(i)}\|^2]
    %\\
    %+ 10(1+q) e^{8\eta L(E+1)^2} \eta L^2 E (E+1)^2 \sum_{l=1}^{k}(1-\beta)^{2(k-l+1)}\mathbb{E}[\|\bm{w}_{l} - \bm{w}_{l-1}\|^2].
    \\
    + 10 e^2 (1+q) \eta L^2 E (E+1)^2 \sum_{l=1}^{k}(1-\beta)^{2(k-l+1)}\mathbb{E}[\|\bm{w}_{l} - \bm{w}_{l-1}\|^2].
\end{multline}
Next, using (\ref{eq:nov-1-thm1-12}) in (\ref{eq:nov-1-thm1-1-0}), we get that:
\begin{multline}
    \label{eq:nov-1-thm1-13}
    \mathbb{E}[f(\bm{w}_{k+1})] \leq 
    \mathbb{E}[f(\bm{w}_{k})] -\frac{\eta E}{4}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4) }{n^2} \sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \frac{5(1-\beta)^{2k}}{4\eta E}{\mathbb{E}[\|\bm{u}_{0} - \overline{\bm{\delta}}_{0}\|^2]} 
    + %\frac{10 (1+q)\eta \beta^2}{r(n-1)}\Big(1 - \frac{r}{n}\Big)
    10\eta \beta^2 \Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)
    \sum_{l=1}^{k}(1-\beta)^{2(k-l)}\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{l,\tau}^{(i)}\|^2]
    \\
    + 10 e^2 (1+q) \eta L^2 E (E+1)^2 \sum_{l=1}^{k}(1-\beta)^{2(k-l+1)}\mathbb{E}[\|\bm{w}_{l} - \bm{w}_{l-1}\|^2]
    - \Big(\frac{1}{8\eta E} - \frac{L}{2}\Big)\mathbb{E}[\|\bm{w}_{k+1} - \bm{w}_{k}\|^2].
\end{multline}
Summing the above from $k=0$ through $(k'-1)$ for any $k' \in \{1,\ldots,K\}$, we get:
\begin{multline}
    \label{eq:nov-1-thm1-14}
    \mathbb{E}[f(\bm{w}_{k'})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{k'-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4) }{n^2} \sum_{k=0}^{k'-1}\sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \sum_{l=0}^{\infty}\frac{5(1-\beta)^{2l}}{4\eta E}{\mathbb{E}[\|\bm{u}_{0} - \overline{\bm{\delta}}_{0}\|^2]} 
    + %\frac{10(1+q)\eta \beta^2}{r(n-1)}\Big(1 - \frac{r}{n}\Big)
    10 \eta \beta^2 \Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)
    \sum_{k=0}^{k'-1}\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]\sum_{l=0}^{\infty}{(1-\beta)^{2l}}
    \\
    + 10 e^2 (1+q) \eta L^2 E (E+1)^2 (1-\beta)^2
    \sum_{k=1}^{k'-1}\mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2]
    \sum_{l=0}^{\infty}(1-\beta)^{2l} 
    - \Big(\frac{1}{8\eta E} - \frac{L}{2}\Big)\sum_{k=0}^{k'-1}\mathbb{E}[\|\bm{w}_{k+1} - \bm{w}_{k}\|^2].
\end{multline}
Simplifying the above by noting that $\sum_{l=0}^{\infty}(1-\beta)^{2l} \leq \sum_{l=0}^{\infty}(1-\beta)^{l} = 1/\beta$, we get:
\begin{multline}
    \label{eq:nov-1-thm1-15}
    \mathbb{E}[f(\bm{w}_{k'})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{k'-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4) }{n^2} \sum_{k=0}^{k'-1}\sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \frac{5}{4 \eta E \beta }{\mathbb{E}[\|\bm{u}_{0} - \overline{\bm{\delta}}_{0}\|^2]} 
    + %\frac{10(1+q) \eta \beta}{r(n-1)}\Big(1 - \frac{r}{n}\Big)
    10\eta \beta \Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)
    \sum_{k=0}^{k'-1}\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]
    \\
    + \underbrace{\frac{10 e^2 (1+q) \eta L^2 E (E+1)^2}{\beta} 
    \sum_{k=1}^{k'-1}\mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2] - \frac{(1-4\eta L E)}{8\eta E}\sum_{k=0}^{k'-1}\mathbb{E}[\|\bm{w}_{k+1} - \bm{w}_{k}\|^2]}_\text{(VI*) -- want this to be $\leq$ 0}
\end{multline}
We want (VI*) to be $\leq 0$. For this, we must have:
\begin{equation}
    \label{eq:nov-1-thm1-16}
    \beta \geq \frac{80 e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2}{(1 - 4\eta L E)}.
\end{equation}
Note that the denominator above is positive since we already have the constraint $4\eta L E < 1$.

With $\beta$ satisfying the above constraint, and using the result of \Cref{fl-lem-new1} for $\sum_{\tau=0}^{E-1} \mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]$, we get:
\begin{multline}
    \label{eq:nov-1-thm1-17}
    \mathbb{E}[f(\bm{w}_{k'})] \leq 
    f(\bm{w}_{0}) -\frac{\eta E}{4}\sum_{k=0}^{k'-1}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4) }{n^2} \sum_{k=0}^{k'-1}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \frac{5}{4 \eta E \beta }{\mathbb{E}[\|\bm{u}_{0} - \overline{\bm{\delta}}_{0}\|^2]} 
    + 
    160\eta E \beta \Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)
    \sum_{k=0}^{k'-1}\sum_{i \in [n]} \mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2].
\end{multline}
Finally, note that we have two constraints namely: 
$4 \eta L E \leq 1$ and $2 \eta L E^2 \leq 1$. We can merge these constraints into $4 \eta L E^2 \leq 1$ for $E \geq 1$ (which is the case).

This gives us the desired result.
\end{proof}

\begin{lemma}
\label{lem1-may11}
For $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\text{min}\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$, (III*) in the proof of \Cref{nov-1-lem0} can be bounded as:
\begin{multline*}
    \text{(III*) } = \mathbb{E}[\langle \nabla f(\bm{w}_{k}), \frac{1}{n}\sum_{i \in [n]}(\bm{w}_{k,E}^{(i)} - \bm{w}_{k})\rangle] \leq -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] - \frac{\eta}{2}{\Big(1 - \eta^2 L^2 E^2 \Big)}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] 
    \\
    + {\frac{16 \eta^3 L^2 E^2 (\alpha E + 4)}{n^2}\sum_{i \in [n]}\|\nabla f_i(\bm{w}_k)\|^2}.
\end{multline*}
\end{lemma}

\begin{proof}
$\text{(III*)} = \mathbb{E}[\langle \nabla f(\bm{w}_{k}), \frac{1}{n}\sum_{i \in [n]}(\bm{w}_{k,E}^{(i)} - \bm{w}_{k})\rangle]$. Then:
%\small
\begin{flalign}
    \nonumber
    \text{(III*)} & = \mathbb{E}[\langle \nabla f(\bm{w}_{k}), -\frac{1}{{n}}\sum_{i \in [n]} \sum_{\tau=0}^{E-1}\eta \bm{v}_{k,\tau}^{(i)}\rangle]
    \\
    \nonumber
    & = -{\eta}\sum_{\tau=0}^{E-1} \mathbb{E}[\langle \nabla f(\bm{w}_{k}), \underbrace{\frac{1}{n}\sum_{i \in [n]} \bm{v}_{k,\tau}^{(i)}}_{=\overline{\bm{v}}_{k,\tau}}\rangle]
    \\
    \label{eq:may11-1}
    & = \sum_{\tau=0}^{E-1}\Big\{-\frac{\eta}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \frac{\eta}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k}) - \overline{\bm{v}}_{k,\tau}\|^2]\Big\}
    \\
    \nonumber
    & = \sum_{\tau=0}^{E-1}\Big\{-\frac{\eta}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \frac{\eta}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k}) - \nabla f(\overline{\bm{w}}_{k,\tau}) + \nabla f(\overline{\bm{w}}_{k,\tau}) - \overline{\bm{v}}_{k,\tau}\|^2]\Big\}
    \\
    \label{eq:may11-2}
    & \leq \sum_{\tau=0}^{E-1}\Big\{-\frac{\eta}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + {\eta}\mathbb{E}[\underbrace{\|\nabla f(\bm{w}_{k}) - \nabla f(\overline{\bm{w}}_{k,\tau})\|^2}_{\leq L^2 \|\bm{w}_{k} - \overline{\bm{w}}_{k,\tau}\|^2}] +  \eta \mathbb{E}[\|\nabla f(\overline{\bm{w}}_{k,\tau}) - \overline{\bm{v}}_{k,\tau}\|^2]\Big\}
    \\
    \label{eq:may11-3}
    & \leq \sum_{\tau=0}^{E-1}\Big\{-\frac{\eta}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + {\eta L^2}\mathbb{E}[\|\bm{w}_{k} - \overline{\bm{w}}_{k,\tau}\|^2] +  \eta \mathbb{E}[\|\nabla f(\overline{\bm{w}}_{k,\tau}) - \overline{\bm{v}}_{k,\tau}\|^2]\Big\}
\end{flalign}
%\normalsize
(\ref{eq:may11-1}) above follows by using the fact that for any two vectors $\bm{a}$ and $\bm{b}$, $\langle \bm{a}, \bm{b} \rangle = \frac{1}{2}(\|\bm{a}\|^2 + \|\bm{b}\|^2 - \|\bm{a}-\bm{b}\|^2)$. Also, (\ref{eq:may11-2}) follows from the fact that for any two vectors $\bm{a}$ and $\bm{b}$, $\|\bm{a} + \bm{b}\|^2 \leq 2\|\bm{a}\|^2 + 2\|\bm{b}\|^2$.
\\
Per definitions, observe that:
\begin{equation}
    \label{eq:may11-4}
    \overline{\bm{w}}_{k,\tau+1} = \overline{\bm{w}}_{k,\tau} - \eta \overline{\bm{v}}_{k,\tau}.
\end{equation}
From this, we have that $\bm{w}_{k} - \overline{\bm{w}}_{k,\tau} = \eta \sum_{t=0}^{\tau-1}\overline{\bm{v}}_{k,t}$. Hence, $\|\bm{w}_{k} - \overline{\bm{w}}_{k,\tau}\|^2 = \eta^2 \|\sum_{t=0}^{\tau-1}\overline{\bm{v}}_{k,t}\|^2 \leq \eta^2 \tau \sum_{t=0}^{\tau-1} \|\overline{\bm{v}}_{k,t}\|^2$ -- this follows from the fact that for any $p > 1$ vectors $\{\bm{u}_1,\ldots,\bm{u}_p\}$, $\|\sum_{i=1}^p \bm{u}_i\|^2 \leq p \sum_{i=1}^p \|\bm{u}_i\|^2$. Using all this in (\ref{eq:may11-3}), we get:
\begin{flalign}
    \nonumber
    \text{(III*)} & \leq -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] +  \sum_{\tau=0}^{E-1}\Big\{-\frac{\eta}{2}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + {\eta^3  L^2}\tau \sum_{t=0}^{\tau-1} \mathbb{E}[\|\overline{\bm{v}}_{k,t}\|^2] +  \eta \mathbb{E}[\|\nabla f(\overline{\bm{w}}_{k,\tau}) - \overline{\bm{v}}_{k,\tau}\|^2]\Big\}
    \\
    %\nonumber
    \label{eq:may11-5}
    & \leq -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \frac{\eta^3  L^2 E^2}{2}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \eta \underbrace{\sum_{\tau=0}^{E-1}\mathbb{E}[\|\nabla f(\overline{\bm{w}}_{k,\tau}) - \overline{\bm{v}}_{k,\tau}\|^2]}_\text{from \Cref{fl-lem2}}
\end{flalign}
Using \Cref{fl-lem2} to bound the last term above gives us:
%\small
\begin{equation}
    \text{(III*)} \leq -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] - \frac{\eta}{2}{\Big(1 - \eta^2 L^2 E^2 \Big)}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4)}{n^2}\sum_{i \in [n]}\|\nabla f_i(\bm{w}_k)\|^2.
\end{equation}
This gives us the desired result.
\end{proof}

\begin{lemma}
\label{fl-lem2}
For $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\text{min}\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$, we have:
\begin{equation*}
    \sum_{\tau=0}^{E-1} \mathbb{E}[\|\overline{\bm{v}}_{k,\tau} - \nabla f(\overline{\bm{w}}_{k,\tau})\|^2] 
    \leq
    {\frac{16 \eta^2 L^2 E^2 (\alpha E + 4)}{n^2}}\sum_{i \in [n]}\|\nabla f_i(\bm{w}_k)\|^2,
\end{equation*}
where the expectation is with respect to the randomness due to $\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}$.
\end{lemma}
\begin{proof}
Let $\overline{\bm{e}}_{k,\tau} = \overline{\bm{v}}_{k,\tau} - \nabla f(\overline{\bm{w}}_{k,\tau})$. Then:
\begin{flalign}
    \nonumber
    \|\overline{\bm{e}}_{k,\tau}\|^2 & =  \|\overline{\bm{v}}_{k,\tau} - \nabla f(\overline{\bm{w}}_{k,\tau})\|^2
    \\
    \nonumber
    & = \Big\|\frac{1}{n}\sum_{i \in [n]}(\bm{v}_{k,\tau}^{(i)} - \nabla f_i(\overline{\bm{w}}_{k,\tau}))\Big\|^2
    \\
    %\label{eq:fl-4}
    \nonumber
    & = 
    \Big\|\frac{1}{n}\sum_{i \in [n]}(\bm{e}_{k,\tau}^{(i)} + \widetilde{\bm{e}}_{k,\tau}^{(i)})\Big\|^2
    \\
    \label{eq:fl-4}
    & \leq \frac{2}{n^2}\Big\|\sum_{i \in [n]}\bm{e}_{k,\tau}^{(i)}\Big\|^2 + \frac{2}{n^2}\Big\|\sum_{i \in [n]}\widetilde{\bm{e}}_{k,\tau}^{(i)}\Big\|^2
\end{flalign}
So:
\begin{equation}
    \label{eq:fl-5}
    \mathbb{E}[\|\overline{\bm{e}}_{k,\tau}\|^2] \leq
    \frac{2}{n^2}\mathbb{E}\Big[\Big\|\sum_{i \in [n]}\bm{e}_{k,\tau}^{(i)}\Big\|^2\Big] + 
    \frac{2}{n^2}\mathbb{E}\Big[\Big\|\sum_{i \in [n]}\widetilde{\bm{e}}_{k,\tau}^{(i)}\Big\|^2\Big]
\end{equation}
But:
\[\mathbb{E}\Big[\Big\|\sum_{i \in [n]}\bm{e}_{k,\tau}^{(i)}\Big\|^2\Big] = \sum_{i \in [n]}\mathbb{E}\Big[\Big\|\bm{e}_{k,\tau}^{(i)}\Big\|^2\Big] + \sum_{i \ne j:i,j \in [n]}\langle \mathbb{E}[\bm{e}_{k,\tau}^{(i)}], \mathbb{E}[\bm{e}_{k,\tau}^{(j)}] \rangle\]
In the cross-term above, we can take expectations individually as $\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}$ and $\{\mathcal{B}_{k,1}^{(j)},\ldots,\mathcal{B}_{k,E-1}^{(j)}\}$ are independent for $i \ne j$. Next, from \Cref{fl-lem0}, $\mathbb{E}[\bm{e}_{k,\tau}^{(i)}] = \vec{0}$ $\forall$ $i,k,\tau$. Hence:
\[\mathbb{E}\Big[\Big\|\sum_{i \in [n]}\bm{e}_{k,\tau}^{(i)}\Big\|^2\Big] = \sum_{i \in [n]}\mathbb{E}\Big[\Big\|\bm{e}_{k,\tau}^{(i)}\Big\|^2\Big].\]
Using the above result and {\Cref{as-het}}
in (\ref{eq:fl-5}), we get that:
\begin{equation}
    \label{eq:fl-6}
    \mathbb{E}[\|\overline{\bm{e}}_{k,\tau}\|^2] \leq
    \frac{2}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2] + 
    \frac{2 \alpha}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\widetilde{\bm{e}}_{k,\tau}^{(i)}\|^2].
\end{equation}
Now:
\begin{flalign}
    %\label{eq:fl-7}
    \nonumber
    \mathbb{E}\Big[\Big\|\widetilde{\bm{e}}_{k,\tau}^{(i)}\Big\|^2\Big] & = \mathbb{E}[\|\nabla f_i(\bm{w}_{k, \tau}^{(i)}) - \nabla f_i(\overline{\bm{w}}_{k,\tau})\|^2]
    \\
    \nonumber
    & \leq L^2 \mathbb{E}[\|\bm{w}_{k, \tau}^{(i)} - \overline{\bm{w}}_{k,\tau}\|^2]
    \\
    \nonumber
    & = L^2 \mathbb{E}[\|(\bm{w}_{k, 0}^{(i)} - \eta \sum_{t=0}^{\tau-1} \bm{v}_{k,t}^{(i)}) - (\overline{\bm{w}}_{k,0} - \eta \sum_{t=0}^{\tau-1} \overline{\bm{v}}_{k,t})\|^2]
\end{flalign}
But since $\bm{w}_{k,0}^{(i)} = \bm{w}_{k}$ $\forall$ $i$, we have $\overline{\bm{w}}_{k,0} = \bm{w}_{k}$. Hence:
\begin{flalign}
    \nonumber
    \mathbb{E}\Big[\Big\|\widetilde{\bm{e}}_{k,\tau}^{(i)}\Big\|^2\Big] & \leq \eta^2 L^2 \mathbb{E}[\| \sum_{t=0}^{\tau-1} \overline{\bm{v}}_{k,t} - \sum_{t=0}^{\tau-1} \bm{v}_{k,t}^{(i)}\|^2] 
    \\
    \nonumber
    & \leq \eta^2 L^2 \tau \sum_{t=0}^{\tau-1} \mathbb{E}[\|\overline{\bm{v}}_{k,t} - \bm{v}_{k,t}^{(i)}\|^2]
    \\
    \nonumber
    & = \eta^2 L^2 \tau \sum_{t=0}^{\tau-1} \mathbb{E}[\|\overline{\bm{v}}_{k,t}\|^2 + \|\bm{v}_{k,t}^{(i)}\|^2 - 2\langle \overline{\bm{v}}_{k,t}, \bm{v}_{k,t}^{(i)} \rangle]
\end{flalign}
Substituting the above in (\ref{eq:fl-6}), we get:
\begin{flalign}
    %\label{eq:fl-7}
    \nonumber
    \mathbb{E}[\|\overline{\bm{e}}_{k,\tau}\|^2] & \leq
    \frac{2}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2] + 
    \frac{2 \alpha}{n^2}\sum_{i \in [n]}\eta^2 L^2 \tau \sum_{t=0}^{\tau-1} \mathbb{E}[\|\overline{\bm{v}}_{k,t}\|^2 + \|\bm{v}_{k,t}^{(i)}\|^2 - 2\langle \overline{\bm{v}}_{k,t}, \bm{v}_{k,t}^{(i)} \rangle]
    \\
    %\nonumber
    \label{eq:fl-7}
    & = \frac{2}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2] + \frac{2 \alpha \eta^2 L^2 \tau}{n^2}\sum_{t=0}^{\tau-1}\{n\mathbb{E}[\|\overline{\bm{v}}_{k,t}\|^2] + \sum_{i \in [n]}\mathbb{E}[\|\bm{v}_{k,t}^{(i)}\|^2] - 2 \mathbb{E}[\langle \overline{\bm{v}}_{k,t}, \sum_{i \in [n]} \bm{v}_{k,t}^{(i)} \rangle] \}
    \\
    %\nonumber
    \label{eq:fl-8}
    & = \frac{2}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2] + \frac{2 \alpha \eta^2 L^2 \tau}{n^2} \sum_{t=0}^{\tau-1}\sum_{i \in [n]}\Big(\mathbb{E}[\|\bm{v}_{k,t}^{(i)}\|^2] {- \mathbb{E}[\|\overline{\bm{v}}_{k,t}\|^2]}\Big)
    \\
    \label{eq:oct13-lem2-1}
    & \leq \frac{2}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2] + \frac{2 \alpha \eta^2 L^2 \tau}{n^2} \sum_{t=0}^{\tau-1}\sum_{i \in [n]}\mathbb{E}[\|\bm{v}_{k,t}^{(i)}\|^2].
\end{flalign}
To get (\ref{eq:fl-8}) from (\ref{eq:fl-7}), we use the fact $\sum_{i \in [n]} \bm{v}_{k,t}^{(i)} = n\overline{\bm{v}}_{k,t}$. Now summing up (\ref{eq:oct13-lem2-1}) from $\tau=0$ through to $E-1$, we get:
\begin{flalign}
    \label{eq:new-ref-2}
    \sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{e}}_{k,\tau}\|^2] & \leq \frac{2}{n^2}\sum_{i \in [n]}\underbrace{\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2]}_\text{from \Cref{fl-lem-new2}} + \frac{2 \alpha \eta^2 L^2 E^2}{2n^2}\sum_{i \in [n]}\underbrace{\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]}_\text{from \Cref{fl-lem-new1}}.
\end{flalign}
Now using \Cref{fl-lem-new2} and \Cref{fl-lem-new1} above with $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\text{min}\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$, we get:
\begin{flalign*}
    \sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{e}}_{k,\tau}\|^2] & \leq \frac{2}{n^2}\sum_{i \in [n]} 32 E^2 \eta^2 L^2 \|\nabla f_i(\bm{w}_k)\|^2
    \\
    & + \frac{\alpha \eta^2 L^2 E^2}{n^2}\sum_{i \in [n]}16 E\|\nabla f_i(\bm{w}_{k})\|^2.
\end{flalign*}
This gives us the desired result.
\end{proof}

\begin{lemma}
\label{fl-lem0}
$\mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau}^{(i)}}[\bm{e}_{k,\tau}^{(i)}] = \vec{0}$ $\forall$ %$i \in [n], 
$k \in \{0,\ldots,K-1\}, \tau \in \{1,\ldots,E-1\}$.
\end{lemma}
\begin{proof}
Note that: 
\[\bm{e}_{k,0}^{(i)} = \bm{v}_{k,0}^{(i)} - \nabla f_i(\bm{w}_{k,0}^{(i)}) = \vec{0}.\]
For $\tau > 0$:
\begin{flalign*}
    &\mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau}^{(i)}}[\bm{e}_{k,\tau}^{(i)}]  = \mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau}^{(i)}}[\bm{v}_{k,\tau}^{(i)} - \nabla f_i(\bm{w}_{k,\tau}^{(i)})]
    \\
    & = \mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau}^{(i)}}[\widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + (\bm{v}_{k,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})) - \nabla f_i(\bm{w}_{k,\tau}^{(i)})]
    \\
    & = \mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau-1}^{(i)}}[\mathbb{E}_{\mathcal{B}_{k,\tau}^{(i)}}[\widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + (\bm{v}_{k,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})) - \nabla f_i(\bm{w}_{k,\tau}^{(i)})|\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau-1}^{(i)}]]
    \\
    & = \mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau-1}^{(i)}}[(\bm{v}_{k,\tau-1}^{(i)} - \nabla f_i(\bm{w}_{k,\tau-1}^{(i)}))]
    \\
    & = \mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau-1}^{(i)}}[\bm{e}_{k,\tau-1}^{(i)}].
\end{flalign*}
Doing this recursively, we get:
\begin{equation}
    \label{eq:fl-01}
    \mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,\tau}^{(i)}}[\bm{e}_{k,\tau}^{(i)}] = %\mathbb{E}_{\mathcal{B}_{k,0}^{(i)}}[\bm{e}_{k,0}^{(i)}] = 
    \bm{e}_{k,0}^{(i)} = \vec{0}.
\end{equation}
Note that this result also holds if we use full gradients at $\tau=0$.
\end{proof}

\begin{lemma}
\label{fl-lem-new1}
For $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\text{min}\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$, we have:
%\small
\[\sum_{\tau=0}^{E-1} \mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2] \leq 16 E\|\nabla f_i(\bm{w}_{k})\|^2.\]
Note that in this lemma, the expectation is with respect to the randomness only due to $\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}$.
%\normalsize
\end{lemma}
\begin{proof}
First, recall that ${\bm{e}}_{k,\tau}^{(i)} = \bm{v}_{k,\tau}^{(i)} - \nabla f_i(\bm{w}_{k,\tau}^{(i)})$. {Note that $\bm{e}_{k,0}^{(i)} = \vec{0}$, as we are using clients' full gradients at $\tau=0$.}
We have:
\begin{equation}
    \label{may11-2-1}
    \mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2] \leq 2\mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2] + 2\mathbb{E}[\|\nabla f_i(\bm{w}_{k,\tau}^{(i)})\|^2].
\end{equation}

Using Lemma 2.1 of \cite{liu2020optimal} with $\beta = 0$, we have:
\begin{flalign}
    %\label{eq:fl-13}
    \nonumber
    \mathbb{E}[\|{\bm{e}}_{k,\tau}^{(i)}\|^2] & \leq
    \mathbb{E}[\|{\bm{e}}_{k,0}^{(i)}\|^2] + 2L^2\sum_{t=0}^{\tau-1}\mathbb{E}[\|\bm{w}_{k,t+1}^{(i)} - \bm{w}_{k,t}^{(i)}\|^2]
    \\
    \label{eq:fl-13}
    & \leq 2L^2\sum_{t=0}^{\tau-1}\mathbb{E}[\|\bm{w}_{k,t+1}^{(i)} - \bm{w}_{k,t}^{(i)}\|^2].
\end{flalign}
{The last step follows because $\bm{e}_{k,0}^{(i)} = \vec{0}$.}


Summing the above from $\tau = 0$ through to $E-1$, we get:
\begin{flalign}
    %\label{eq:fl-14}
    \nonumber
    \sum_{\tau=0}^{E-1}\mathbb{E}[\|{\bm{e}}_{k,\tau}^{(i)}\|^2] & \leq 
    2L^2\sum_{\tau=0}^{E-1}\sum_{t=0}^{\tau-1}\mathbb{E}[\|\bm{w}_{k,t+1}^{(i)} - \bm{w}_{k,t}^{(i)}\|^2]
    \\
    \label{eq:fl-14}
    & \leq {2 E L^2}\sum_{\tau=0}^{E-2}\mathbb{E}[\|\bm{w}_{k,\tau+1}^{(i)} - \bm{w}_{k,\tau}^{(i)}\|^2].
\end{flalign}
%\\
Next, re-arranging equation (11) in Lemma 2.2 of \cite{liu2020optimal} (observe that in our case, $G_{\eta}(.)$ is simply the gradient), we get:
\begin{equation}
    \label{eq:fl-15}
    \mathbb{E}[\|\nabla f_i(\bm{w}_{k,\tau}^{(i)})\|^2] \leq \frac{2}{\eta}\mathbb{E}[f_i(\bm{w}_{k,\tau}^{(i)}) - f_i(\bm{w}_{k,\tau+1}^{(i)})] -\frac{1}{\eta^2}(1 - \eta{L})\mathbb{E}[\|\bm{w}_{k,\tau+1}^{(i)} - \bm{w}_{k,\tau}^{(i)}\|^2] + \mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2]
\end{equation}
Summing (\ref{eq:fl-15}) from $\tau=0$ to $E-1$ and using (\ref{eq:fl-14}), we get:
\begin{multline}
    \label{eq:fl-16}
    \sum_{\tau=0}^{E-1}\mathbb{E}[\|\nabla f_i(\bm{w}_{k,\tau}^{(i)})\|^2] \leq \frac{2}{\eta}(f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})]) 
    -\frac{(1-\eta L)}{\eta^2}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{w}_{k,\tau+1}^{(i)} - \bm{w}_{k,\tau}^{(i)}\|^2] 
    \\
    + {2 E L^2}\sum_{\tau=0}^{E-2}\mathbb{E}[\|\bm{w}_{k,\tau+1}^{(i)} - \bm{w}_{k,\tau}^{(i)}\|^2].
\end{multline}
Next, summing (\ref{eq:fl-14}) and (\ref{eq:fl-16}) gives us:
\begin{multline}
    \label{eq:fl-17}
    \sum_{\tau=0}^{E-1}\{\mathbb{E}[\|{\bm{e}}_{k,\tau}^{(i)}\|^2]+\mathbb{E}[\|\nabla f_i(\bm{w}_{k,\tau}^{(i)})\|^2]\} \leq \frac{2}{\eta}(f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})])
    \\
    - \underbrace{\Big(\frac{1-\eta L}{\eta^2}\Big)}_\text{$> 0$ for $\eta < \frac{1}{L}$}\mathbb{E}[\|\bm{w}_{k,E}^{(i)} - \bm{w}_{k,E-1}^{(i)}\|^2] - \underbrace{\Big(\frac{(1-\eta L)}{\eta^2} - {4 E L^2}\Big)}_\text{$> 0$ for $E < \frac{(1-\eta L)}{4\eta^2 L^2}$}\sum_{\tau=0}^{E-2}\mathbb{E}[\|\bm{w}_{k,\tau+1}^{(i)} - \bm{w}_{k,\tau}^{(i)}\|^2].
\end{multline}
So if we have $\eta < \frac{1}{L}$ and $E < \frac{1}{4}(\frac{1}{\eta^2 L^2} - \frac{1}{\eta L})$, we get:
\begin{equation}
    \label{eq:fl-18}
    \sum_{\tau=0}^{E-1}\{\mathbb{E}[\|{\bm{e}}_{k,\tau}^{(i)}\|^2]+\mathbb{E}[\|\nabla f_i(\bm{w}_{k,\tau}^{(i)})\|^2]\} \leq \frac{2}{\eta}(f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})]).
\end{equation}
Now from \Cref{fl-lem3}, for $E < \frac{1}{4}\text{min}\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$, we have that:
\begin{equation}
\label{eq:fl-add-19}
f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})] \leq 4 \eta E\|\nabla f_i(\bm{w}_{k})\|^2.
\end{equation}
Putting (\ref{eq:fl-add-19}) in (\ref{eq:fl-18}) and then using it in (\ref{may11-2-1}) gives us the desired result.
\end{proof}

\begin{lemma}
\label{fl-lem3}
For $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\text{min}\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$, we have:
\[f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})] \leq 4\eta E\|\nabla f_i(\bm{w}_{k})\|^2.\]
The expectation above is with respect to the randomness only due to $\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}$.
\end{lemma}
\begin{proof}
By $L$-smoothness of each $f_i$, we have:
\[f_i(\bm{w}_{k,E}^{(i)}) \geq f_i(\bm{w}_{k}) + \langle \nabla f_i(\bm{w}_{k}), \bm{w}_{k,E}^{(i)} - \bm{w}_{k} \rangle - \frac{L}{2}\|\bm{w}_{k,E}^{(i)} - \bm{w}_{k}\|^2\]
\begin{flalign*}
    \implies f_i(\bm{w}_{k}) - f_i(\bm{w}_{k,E}^{(i)}) & \leq \langle \nabla f_i(\bm{w}_{k}), \bm{w}_{k} - \bm{w}_{k,E}^{(i)} \rangle + \frac{L}{2}\|\bm{w}_{k,E}^{(i)} - \bm{w}_{k}\|^2
    \\
    & \leq \underbrace{\frac{\alpha'}{2}\|\nabla f_i(\bm{w}_{k})\|^2 + \frac{1}{2\alpha'} \|\bm{w}_{k,E}^{(i)} - \bm{w}_{k}\|^2}_\text{follows by Young's inequality} + \frac{L}{2}\|\bm{w}_{k,E}^{(i)} - \bm{w}_{k}\|^2 \text{ for } \alpha' > 0.
\end{flalign*}
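Recall that $\bm{w}_{k,E}^{(i)} - \bm{w}_{k} = -\eta \sum_{\tau=0}^{E-1}\bm{v}_{k,\tau}^{(i)}$, so that $\|\bm{w}_{k,E}^{(i)} - \bm{w}_{k}\|^2 \leq \eta^2 E \sum_{\tau=0}^{E-1}\|\bm{v}_{k,\tau}^{(i)}\|^2$. Hence taking expectation above with $\alpha' = 2\eta E$, we get that: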
\begin{flalign}
    f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})] & \leq 
    \eta E\|\nabla f_i(\bm{w}_{k})\|^2 +\eta^2 E \Big(\frac{1}{4\eta E} + \frac{L}{2}\Big)\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]
    \\
    \label{eq:fl-1-1}
    & \leq \eta E\|\nabla f_i(\bm{w}_{k})\|^2 + \frac{3\eta}{8}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2].
\end{flalign}
(\ref{eq:fl-1-1}) follows from the fact that $\eta L E < \frac{1}{4}$. Next, from the proof of \Cref{fl-lem-new1}, for $E < \frac{(1-\eta L)}{4\eta^2 L^2}$: 
\[\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2] %= \sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{e}_{k,\tau}^{(i)}\|^2 + \|\nabla f_i(\bm{w}_{k,\tau}^{(i)})\|^2] 
\leq \frac{2}{\eta}(f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})]).\]
Putting this in (\ref{eq:fl-1-1}), we get:
\[f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})] \leq \eta E\|\nabla f_i(\bm{w}_{k})\|^2 +\frac{3}{4}(f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})]).\]
\begin{equation}
    \label{eq:fl-1-2}
     \implies f_i(\bm{w}_{k}) - \mathbb{E}[f_i(\bm{w}_{k,E}^{(i)})] \leq 4\eta E\|\nabla f_i(\bm{w}_{k})\|^2.
\end{equation}
%This concludes the proof of \Cref{fl-lem3}.
\end{proof}

%\begin{comment}
\begin{lemma}
\label{fl-lem-new2}
For $\eta < \frac{1}{L}$ 
and $E < \frac{1}{4}\text{min}\Big( \frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$, we have:
\[\sum_{\tau=0}^{E-1}\mathbb{E}[\|{\bm{e}}_{k,\tau}^{(i)}\|^2] \leq 32 E^2 \eta^2 L^2 \|\nabla f_i(\bm{w}_k)\|^2.\]
The expectation above is with respect to the randomness only due to $\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}$.
\end{lemma}
\begin{proof}
Note that in \Cref{fl-lem-new1}, we have already bounded $\sum_{\tau=0}^{E-1}\mathbb{E}[\|{\bm{e}}_{k,\tau}^{(i)}\|^2]$ (see (\ref{eq:fl-14})) -- but here we expand it more for use in \Cref{fl-lem2}.
\\
First, from (\ref{eq:fl-14}), we have:
\begin{flalign*}
    %\label{eq:fl-13}
    \sum_{\tau=0}^{E-1}\mathbb{E}[\|{\bm{e}}_{k,\tau}^{(i)}\|^2] \leq {2 E L^2}\sum_{\tau=0}^{E-2}\mathbb{E}[\|\bm{w}_{k,\tau+1}^{(i)} - \bm{w}_{k,\tau}^{(i)}\|^2].
\end{flalign*}
Next, using the fact that $\bm{w}_{k,\tau+1}^{(i)} = \bm{w}_{k,\tau}^{(i)} - \eta\bm{v}_{k,\tau}^{(i)}$, we get:
\begin{flalign}
    \label{eq:fl-lem-new2-1}
    \nonumber
    \sum_{\tau=0}^{E-1}\mathbb{E}[\|{\bm{e}}_{k,\tau}^{(i)}\|^2] & \leq {2 E \eta^2 L^2}\sum_{\tau=0}^{E-2}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]
    %\\
    %\nonumber
    %& 
    \leq {2 E \eta^2 L^2}\underbrace{\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]}_\text{from \Cref{fl-lem-new1}}
    %\\
    %\label{eq:fl-lem-new2-1}
    %& 
    \leq {2 E \eta^2 L^2}(16 E\|\nabla f_i(\bm{w}_{k})\|^2).
\end{flalign}
This gives us the desired result.
\end{proof}

\begin{lemma}
\label{nov-1-lem2}
Suppose $2 \eta L E^2 \leq 1$. Then:
\begin{multline*}
    \mathbb{E}[\|\bm{u}_k - \overline{\bm{\delta}}_k\|^2] \leq (1-\beta)^2\mathbb{E}[\|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|^2] + 2 \beta^2 \mathbb{E}[\|{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2] 
    \\
    + 8 e^2 (1+q) (1-\beta)^2 \eta^2 L^2 E^2 (E+1)^2 \mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2]. 
\end{multline*}
\end{lemma}
\begin{proof}
First, note that for each $i \in [n]$, $\mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}}[\bm{w}_{k} - \widehat{\bm{w}}_{k,E}^{(i)}] = \bm{\delta}_{k}^{(i)}$. So:
\begin{equation}
    \label{eq:nov-1-lem1-2}
    \mathbb{E}_{\mathcal{S}_k,\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}}[{g}(\bm{w}_k;\mathcal{S}_k)] = \overline{\bm{\delta}}_k.
\end{equation}
%Also, observe that 
Similarly, for each $i \in [n]$, $\mathbb{E}_{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}}[\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}] = %\mathbb{E}_{\mathcal{B}_{k-1,0}^{(i)},\ldots,\mathcal{B}_{k-1,E-1}^{(i)}}[{\bm{w}}_{k-1,E}^{(i)}]
\bm{\delta}_{k-1}^{(i)}$. Hence:
\begin{equation}
    \label{eq:nov-1-lem1-3}
    \mathbb{E}_{\mathcal{S}_k,\{\mathcal{B}_{k,1}^{(i)},\ldots,\mathcal{B}_{k,E-1}^{(i)}\}_{i=1}^{n}%,\{\mathcal{B}_{k-1,0}^{(i)},\ldots,\mathcal{B}_{k-1,E-1}^{(i)}\}_{i=1}^{n}
    }[\widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)] = \overline{\bm{\delta}}_{k-1}.
\end{equation}
We have:
\begin{flalign}
    \nonumber
    \mathbb{E}&[\|\bm{u}_k - \overline{\bm{\delta}}_k\|^2]  = 
    \mathbb{E}[\|\beta {g}_Q(\bm{w}_k;\mathcal{S}_k) + (1-\beta)\bm{u}_{k-1} + (1-\beta)\Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2]
    %\mathbb{E}[\|{g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k + (1 - \beta)(\bm{u}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k))\|^2]
    \\
    \nonumber
    & = 
    \mathbb{E}[\|(1 - \beta)(\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}) + \beta{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k + (1 - \beta)(\overline{\bm{\delta}}_{k-1} + \Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k))\|^2]
    %\mathbb{E}[\|(1 - \beta)(\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}) + {g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k + (1 - \beta)(\overline{\bm{\delta}}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k))\|^2]
    \\
    \label{eq:nov-1-lem2-1}
    & = (1-\beta)^2\mathbb{E}[\|\bm{u}_{k-1} - \overline{\bm{\delta}}_{k-1}\|^2] + 
    \mathbb{E}[\|\beta{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k + (1 - \beta)(\overline{\bm{\delta}}_{k-1} + \Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k))\|^2]
    %\mathbb{E}[\|{g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k + (1 - \beta)(\overline{\bm{\delta}}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k))\|^2].
\end{flalign}
The cross-term in (\ref{eq:nov-1-lem2-1}) vanishes by taking expectation with respect to $Q_D$ and $\mathcal{S}_k$. Next:
\begin{flalign}
    \nonumber
    & %\mathbb{E}[\|{g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k + (1 - \beta)(\overline{\bm{\delta}}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k))\|^2] 
    \mathbb{E}[\|\beta{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k + (1 - \beta)(\overline{\bm{\delta}}_{k-1} + \Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k))\|^2]
    \\
    \nonumber
    & = 
    \mathbb{E}[\|\beta({g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k) + (1 - \beta)(\overline{\bm{\delta}}_{k-1} + \Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k) - \overline{\bm{\delta}}_k)\|^2]
    %\mathbb{E}[\|\beta({g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k) + (1 - \beta)(\overline{\bm{\delta}}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k) + {g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k)\|^2]
    \\
    \label{eq:nov-1-lem2-2}
    & \leq 
    2 \beta^2 \mathbb{E}[\|{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2] + 2 (1-\beta)^2\mathbb{E}[\|\overline{\bm{\delta}}_{k-1} + \Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2]
    %2 \beta^2 \mathbb{E}[\|{g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2] + 2(1-\beta)^2 \mathbb{E}[\|\overline{\bm{\delta}}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k) + {g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2].
\end{flalign}
Next, note that:
\begin{flalign}
    \nonumber
    & \mathbb{E}[\|\overline{\bm{\delta}}_{k-1} + \Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2]
    %\mathbb{E}[\|\overline{\bm{\delta}}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k) + {g}(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2]
    \\
    \nonumber
    & = \mathbb{E}[\|\Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k)\|^2] + \mathbb{E}[\|\overline{\bm{\delta}}_k - \overline{\bm{\delta}}_{k-1}\|^2] - 2 \mathbb{E}[\langle \Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k), \overline{\bm{\delta}}_k - \overline{\bm{\delta}}_{k-1} \rangle]
    %\mathbb{E}[\| {g}(\bm{w}_k;\mathcal{S}_k) - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)\|^2] + \mathbb{E}[\|\overline{\bm{\delta}}_k - \overline{\bm{\delta}}_{k-1}\|^2] - 2 \mathbb{E}[ \langle {g}(\bm{w}_k;\mathcal{S}_k) - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k), \overline{\bm{\delta}}_k - \overline{\bm{\delta}}_{k-1} \rangle]
    \\
    \label{eq:nov-1-lem2-3}
    & = %\mathbb{E}[\| {g}(\bm{w}_k;\mathcal{S}_k) - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)\|^2] 
    \mathbb{E}[\|\Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k)\|^2] + \mathbb{E}[\|\overline{\bm{\delta}}_k - \overline{\bm{\delta}}_{k-1}\|^2] - 2\mathbb{E}[\|\overline{\bm{\delta}}_k - \overline{\bm{\delta}}_{k-1}\|^2]
    \\
    \label{eq:nov-1-lem2-4}
    & \leq \mathbb{E}[\|\Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k)\|^2].
    %\mathbb{E}[\| {g}(\bm{w}_k;\mathcal{S}_k) - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)\|^2].
\end{flalign}
(\ref{eq:nov-1-lem2-3}) follows by first taking expectation with respect to $Q_D$ and then using (\ref{eq:nov-1-lem1-2}) and (\ref{eq:nov-1-lem1-3}). 

Further:
\begin{flalign}
    \nonumber
    \mathbb{E}[\|\Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k)\|^2] & = \mathbb{E}\Big[\Big\|\frac{1}{r}\sum_{i \in \mathcal{S}_k}Q_D(({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}}))\Big\|^2\Big]
    \\
    \nonumber
    & \leq \mathbb{E}_{\mathcal{S}_k}\Big[\frac{r}{r^2} \sum_{i \in \mathcal{S}_k} \mathbb{E}\Big[\| Q_D(({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}}))\|^2\Big]\Big]
    \\
    \label{nov6-2}
    & \leq \mathbb{E}_{\mathcal{S}_k}\Big[\frac{1}{r} \sum_{i \in \mathcal{S}_k} (1+q) \mathbb{E}\Big[\|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\|^2\Big]\Big]
    \\
    \label{eq:may10-3}
    & = \frac{1}{n} \sum_{i \in [n]} (1+q) \mathbb{E}\Big[\|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\|^2\Big].
\end{flalign}
(\ref{nov6-2}) follows from \Cref{as5} on the variance of $Q_D$. Further, using \Cref{nov-1-lem3}, we get
\begin{equation}
    \label{nov6-3}
    \mathbb{E}[\|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\|^2] \leq
    4 e^2 \eta^2 L^2 E^2 (E+1)^2 \mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2],
\end{equation}
for $2 \eta L E^2 \leq 1$. 

Using this in (\ref{eq:may10-3}):
\begin{flalign}
    \label{nov6-4}
    %\nonumber
    \mathbb{E}[\|\Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k)\|^2] & \leq 
    4 e^2 (1+q) \eta^2 L^2 E^2 (E+1)^2 \mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2].
\end{flalign}
Now using (\ref{nov6-4}) in (\ref{eq:nov-1-lem2-4}) and then using it in (\ref{eq:nov-1-lem2-2}), we get:
%\small
\begin{multline}
    \label{eq:nov-1-lem2-5}
    \mathbb{E}[\|\beta{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k + (1 - \beta)(\overline{\bm{\delta}}_{k-1} + \Delta{g}_Q(\bm{w}_k,\bm{w}_{k-1};\mathcal{S}_k))\|^2]
    \\
    \leq 
    2 \beta^2 \mathbb{E}[\|{g}_Q(\bm{w}_k;\mathcal{S}_k) - \overline{\bm{\delta}}_k\|^2] +
    8 e^2 (1+q) (1-\beta)^2 \eta^2 L^2 E^2 (E+1)^2 \mathbb{E}[\|\bm{w}_{k} - \bm{w}_{k-1}\|^2].
\end{multline}
%\normalsize
Finally, putting (\ref{eq:nov-1-lem2-5}) back in (\ref{eq:nov-1-lem2-1}) gives us the desired result.
\end{proof}


\begin{lemma}
\label{nov-1-lem3}
Suppose $2 \eta L E^2 \leq 1$. Then $\forall$ $k \geq 0$ and $i \in [n]$, we have:
\begin{equation*}
    \mathbb{E}[\|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\|] \leq
    2 e (\eta L E (E+1)) \|\bm{w}_{k} - \bm{w}_{k-1}\|.
\end{equation*}
\end{lemma}
\begin{proof}
We have for any $i \in [n]$:
\begin{flalign}
    \nonumber
    \|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\| & = \Big\|\sum_{\tau=0}^{E-1}\eta \bm{v}_{k,\tau}^{(i)} - \sum_{\tau=0}^{E-1}\eta \widehat{\bm{v}}_{k-1,\tau}^{(i)}\Big\|
    %\|{g}(\bm{w}_k;\mathcal{S}_k) - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)\| &= \Big\|\frac{1}{r}\sum_{i \in \mathcal{S}_k}({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - \frac{1}{r}\sum_{i \in \mathcal{S}_k}({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\Big\|
    %\\
    %\nonumber
    %& = \Big\|\frac{1}{r}\sum_{i \in \mathcal{S}_k}\sum_{\tau=0}^{E-1}\eta \bm{v}_{k,\tau}^{(i)} - \frac{1}{r}\sum_{i \in \mathcal{S}_k}\sum_{\tau=0}^{E-1}\eta \widehat{\bm{v}}_{k-1,\tau}^{(i)}\Big\|
    \\
    %\nonumber
    \label{eq:nov-1-lem3-1}
    & \leq %\frac{1}{r}\sum_{i \in \mathcal{S}_k} 
    \sum_{\tau=0}^{E-1} \eta \|\bm{v}_{k,\tau}^{(i)} - \widehat{\bm{v}}_{k-1,\tau}^{(i)}\|.
\end{flalign}
The last step follows by the triangle inequality.
\\
Next, we have:
%\small
\begin{multline*}
    \|\bm{v}_{k,\tau}^{(i)} - \widehat{\bm{v}}_{k-1,\tau}^{(i)}\| = \|\{\widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + (\bm{v}_{k,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)}))\} 
    \\
    - \{\widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) 
    + (\widehat{\bm{v}}_{k-1,\tau-1}^{(i)}
    - \widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)}))\}\|
\end{multline*}
%\normalsize
Note that $\mathcal{B}_{k,\tau}^{(i)}$ can be the full batch too. 

Re-arranging the above, using the triangle inequality and the smoothness of the stochastic gradients, we get:
\begin{equation}
    \label{eq:nov-1-lem3-2}
    \|\bm{v}_{k,\tau}^{(i)} - \widehat{\bm{v}}_{k-1,\tau}^{(i)}\| \leq L\|\bm{w}_{k,\tau}^{(i)} - \widehat{\bm{w}}_{k-1,\tau}^{(i)}\| +
    \|\bm{v}_{k,\tau-1}^{(i)} - \widehat{\bm{v}}_{k-1,\tau-1}^{(i)}\| +
    L\|\bm{w}_{k,\tau-1}^{(i)} - \widehat{\bm{w}}_{k-1,\tau-1}^{(i)}\|.
\end{equation}
Unfolding the above recursion, we get:
\begin{equation}
    \label{eq:nov-1-lem3-3}
    \|\bm{v}_{k,\tau}^{(i)} - \widehat{\bm{v}}_{k-1,\tau}^{(i)}\| \leq 2L\sum_{t=0}^{\tau} \|\bm{w}_{k,t}^{(i)} - \widehat{\bm{w}}_{k-1,t}^{(i)}\|.
\end{equation}
Just as a sanity check for (\ref{eq:nov-1-lem3-3}), observe that $\|\bm{v}_{k,0}^{(i)} - \widehat{\bm{v}}_{k-1,0}^{(i)}\| = \|\nabla f_i(\bm{w}_k) - \nabla f_i(\bm{w}_{k-1})\| \leq L \|\bm{w}_k - \bm{w}_{k-1}\|$. Next:
\begin{flalign}
    \nonumber
    \|\bm{w}_{k,\tau+1}^{(i)} - \widehat{\bm{w}}_{k-1,\tau+1}^{(i)}\|
    & = \|\bm{w}_{k,\tau}^{(i)} - \widehat{\bm{w}}_{k-1,\tau}^{(i)} - \eta (\bm{v}_{k,\tau}^{(i)} - \widehat{\bm{v}}_{k-1,\tau}^{(i)})\|
    \\
    \nonumber
    & \leq \|\bm{w}_{k,\tau}^{(i)} - \widehat{\bm{w}}_{k-1,\tau}^{(i)}\| + \eta \|\bm{v}_{k,\tau}^{(i)} - \widehat{\bm{v}}_{k-1,\tau}^{(i)}\|
    \\
    %\label{eq:nov-1-lem3-4}
    \nonumber
    & \leq \|\bm{w}_{k,\tau}^{(i)} - \widehat{\bm{w}}_{k-1,\tau}^{(i)}\| + 2 \eta L \sum_{t=0}^{\tau} \|\bm{w}_{k,t}^{(i)} - \widehat{\bm{w}}_{k-1,t}^{(i)}\|.
\end{flalign}
The last step follows by using (\ref{eq:nov-1-lem3-3}). Thus:
\begin{equation}
    \label{eq:nov-1-lem3-4}
    \|\bm{w}_{k,\tau}^{(i)} - \widehat{\bm{w}}_{k-1,\tau}^{(i)}\| \leq \|\bm{w}_{k,\tau-1}^{(i)} - \widehat{\bm{w}}_{k-1,\tau-1}^{(i)}\| + 2 \eta L \sum_{t=0}^{\tau-1} \|\bm{w}_{k,t}^{(i)} - \widehat{\bm{w}}_{k-1,t}^{(i)}\|.
\end{equation}
%For simplicity, let us define
%\[\Delta_w = \]
%\\
Based on (\ref{eq:nov-1-lem3-4}), we claim that:
\begin{equation}
    \label{eq:nov-1-lem3-5}
    \|\bm{w}_{k,\tau}^{(i)} - \widehat{\bm{w}}_{k-1,\tau}^{(i)}\| \leq (1 + 2 \eta L E)^{\tau}\|\bm{w}_k - \bm{w}_{k-1}\|.
\end{equation}

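We prove this by induction. The claim holds trivially (with equality) for $\tau = 0$, since $\bm{w}_{k,0}^{(i)} = \bm{w}_k$ and $\widehat{\bm{w}}_{k-1,0}^{(i)} = \bm{w}_{k-1}$. Let us next examine the case of $\tau = 1$. We have: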
\begin{flalign*}
    \|\bm{w}_{k,1}^{(i)} - \widehat{\bm{w}}_{k-1,1}^{(i)}\| & = \|\bm{w}_{k} - {\bm{w}}_{k-1} - \eta (\bm{v}_{k,0}^{(i)} - \widehat{\bm{v}}_{k-1,0}^{(i)})\|
    \\
    & = \|\bm{w}_{k} - {\bm{w}}_{k-1} - \eta(\nabla f_i(\bm{w}_{k}) - \nabla f_i({\bm{w}}_{k-1}))\|
    \\
    & \leq \|\bm{w}_{k} - {\bm{w}}_{k-1}\| + \eta L \|\bm{w}_{k} - {\bm{w}}_{k-1}\|
    \\
    & \leq (1 + 2 \eta L E)^{1}\|\bm{w}_{k} - {\bm{w}}_{k-1}\|.
\end{flalign*}
For ease of notation, let us define ${d}_{k} \triangleq \|\bm{w}_{k} - \bm{w}_{k-1}\|$. Now suppose the claim is true for $\tau \leq t$. Then using (\ref{eq:nov-1-lem3-4}), we have for $\tau= t+1$:
%\small
\begin{flalign}
    \nonumber
    \|\bm{w}_{k,t+1}^{(i)} - \widehat{\bm{w}}_{k-1,t+1}^{(i)}\|
    & \leq  \Big\{(1 + 2 \eta L E)^{t} + 2\eta L \sum_{t_2=0}^{t}(1 + 2 \eta L E)^{t_2}\Big\}{d}_{k}
    \\
    \nonumber
    & \leq \Big\{(1 + 2 \eta L E)^{t} + 2 \eta L (t+1) (1 + 2 \eta L E)^{t} \Big\}{d}_{k}
    \\
    \label{eq:nov-1-lem3-6}
    & \leq (1 + 2 \eta L E)^{t} (1 + 2 \eta L (t+1)) {d}_{k} \leq (1 + 2 \eta L E)^{t+1} {d}_{k}.
\end{flalign}
%\normalsize
The last inequality uses $t + 1 \leq E$. This proves our claim.
\\
Now, using our claim, i.e., (\ref{eq:nov-1-lem3-5}) in (\ref{eq:nov-1-lem3-3}), we get:
%\small
\begin{flalign}
    %\label{eq:nov-1-lem3-8}
    %\nonumber
    \label{eq:nov-1-lem3-8}
    \|\bm{v}_{k,\tau}^{(i)} - \widehat{\bm{v}}_{k-1,\tau}^{(i)}\| & \leq 2L\sum_{t=0}^{\tau}(1 + 2 \eta L E)^{t}\|\bm{w}_{k} - \bm{w}_{k-1}\| \leq 2 L (\tau+1) (1 + 2 \eta L E)^{\tau}\|\bm{w}_{k} - \bm{w}_{k-1}\|.
\end{flalign}
%\normalsize
Note that this bound is independent of $i$.
\\
Finally, using (\ref{eq:nov-1-lem3-8}) in (\ref{eq:nov-1-lem3-1}), we get:
\begin{flalign}
    \nonumber
    \|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\|
    & \leq \sum_{\tau=0}^{E-1} \eta \|\bm{v}_{k,\tau}^{(i)} - \widehat{\bm{v}}_{k-1,\tau}^{(i)}\|
    \\
    \nonumber
    & \leq \sum_{\tau=0}^{E-1} 2 \eta L (\tau + 1) (1 + 2 \eta L E)^{\tau}\|\bm{w}_{k} - \bm{w}_{k-1}\|
    \\
    \nonumber
    & \leq 2 \eta L E (E+1) (1 + 2 \eta L E)^{E}\|\bm{w}_{k} - \bm{w}_{k-1}\|
    \\
    %\nonumber
    \label{eq:nov-1-lem3-9}
    & \leq 2 \eta L E (E+1) e^{2 \eta L E^2} \|\bm{w}_{k} - \bm{w}_{k-1}\|.
\end{flalign}
The last step follows from the fact that $1+z \leq e^z$ $\forall$ $z$.

Finally, using $2 \eta L E^2 \leq 1$ gives us the desired result.
\end{proof}


\begin{lemma}
\label{lem-may11-n1}
(V*) in the proof of \Cref{nov-1-lem0} can be bounded as:
\begin{equation*}
    \text{(V*)} \leq 4 \eta^2 E\Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2].
\end{equation*}
\end{lemma}

\begin{proof}
We have (V*) = $\mathbb{E}[\|{g}_Q(\bm{w}_l;\mathcal{S}_l) - \overline{\bm{\delta}}_l\|^2]$. Note that:
\begin{multline}
    \label{eq:may11-n10}
    \mathbb{E}[\|{g}_Q(\bm{w}_l;\mathcal{S}_l) - \overline{\bm{\delta}}_l\|^2] \leq \eta^2 \underbrace{\mathbb{E}\Big[\Big\|\frac{1}{r}\sum_{i \in \mathcal{S}_l}\frac{Q_D({\bm{w}_{l} - \bm{w}_{l,E}^{(i)}})}{\eta} - \frac{1}{n}\sum_{i \in [n]}\frac{Q_D({\bm{w}_{l} - \bm{w}_{l,E}^{(i)}})}{\eta}\Big\|^2\Big]}_\text{(A)} \\
    +
    \eta^2\underbrace{\mathbb{E}\Big[\Big\|\frac{1}{n}\sum_{i \in [n]}\Big\{\frac{Q_D({\bm{w}_{l} - \bm{w}_{l,E}^{(i)}})}{\eta} - \frac{({\bm{w}_{l} - \bm{w}_{l,E}^{(i)}})}{\eta}\Big\}\Big\|^2\Big]}_\text{(B)}
\end{multline}

In (A), we take expectation with respect to $\mathcal{S}_k$ and $Q_D(.)$ (from here on, we write the round index as $k$ instead of $l$, to match the lemma statement) -- for that, we use Lemma 4 of \cite{reisizadeh2020fedpaq}. Note that $\x_{k,\tau}^{(i)} - \x_k$ in their lemma corresponds to $({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})$ in our case. Specifically, using eqns. (59) and (60) in \cite{reisizadeh2020fedpaq} (they also assume \Cref{as5}), we get:
\begin{flalign}
    \nonumber
    \eta^2\,\text{(A)} & \leq \frac{1}{r(n-1)}\Big(1 - \frac{r}{n}\Big)4(1+q)\sum_{i \in [n]}\mathbb{E}[\|{\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}\|^2] 
    \\
    \nonumber
    & = \frac{1}{r(n-1)}\Big(1 - \frac{r}{n}\Big) 4(1+q)\sum_{i \in [n]}\mathbb{E}[\|\sum_{\tau=0}^{E-1}\eta \bm{v}_{k,\tau}^{(i)}\|^2] 
    \\
    \label{eq:oct13-7}
    & \leq \frac{\eta^2}{r(n-1)}\Big(1 - \frac{r}{n}\Big) 4(1+q)E\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]
\end{flalign}
Next, we deal with (B). We have:
\begin{flalign}
    \nonumber
    \eta^2\,\text{(B)} & = \mathbb{E}\Big[\mathbb{E}_{Q_D}\Big[\Big\|\frac{1}{n}\sum_{i \in [n]}\Big\{Q_{D}\Big({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}\Big) - \Big({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}\Big)\Big\}\Big\|^2\Big]\Big]
    \\
    \nonumber
    & \leq \frac{q}{n^2}\sum_{i \in [n]}\mathbb{E}\Big[\Big\|{\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}\Big\|^2\Big]
    \\
    \label{eq:oct13-8}
    & \leq \frac{q E \eta^2 }{n^2}\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2].
\end{flalign}
Now using (\ref{eq:oct13-7}) and (\ref{eq:oct13-8}) in (\ref{eq:may11-n10}), we get:
\begin{flalign}
    \nonumber
    \text{(V*)} & \leq \frac{\eta^2}{r(n-1)}\Big(1 - \frac{r}{n}\Big) 4 (1+q) E\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2] + \frac{\eta^2 q E}{n^2}\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]
    \\
    \label{eq:nov-1-thm1-10}
    & \leq
    4 \eta^2 E\Big(\frac{q}{n^2} + \frac{(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big)\Big)\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2].
\end{flalign}
This gives us the desired result.
\end{proof}


\begin{lemma}
\label{lem1-oct20}
For any $L$-smooth function $h(\bm{x})$, we have $\forall$ $\bm{x}$:
\[\|\nabla h(\bm{x})\|^2 \leq 2L(h(\bm{x}) - h^{*}) \text{ where } h^{*} = \min_{\bm{x}}h(\bm{x}).\]

\end{lemma}
\begin{proof}
For any $\bm{y}$, we have that:
\begin{equation}
    \label{eq1-lem1-oct20}
    h^{*} \leq h(\bm{y}) \leq \underbrace{h(\bm{x}) + \langle \nabla h(\bm{x}), \bm{y} - \bm{x} \rangle + \frac{L}{2}\|\bm{y} - \bm{x}\|^2}_\text{$:= h_2(\bm{y})$}
\end{equation}
%Minimizing the RHS over $\bm{y}$
Setting $\nabla h_2(\bm{y}) = \vec{0}$, we get that $\widehat{\bm{y}} = \bm{x} - \frac{1}{L} \nabla h(\bm{x})$ is the minimizer of $h_2(\bm{y})$ (which is a quadratic with respect to $\bm{y}$). Plugging this back in (\ref{eq1-lem1-oct20}) gives us:
\begin{equation}
    \label{eq2-lem1-oct20}
    h^{*} \leq {h(\bm{x}) + \Big \langle \nabla h(\bm{x}), -\frac{1}{L} \nabla h(\bm{x}) \Big  \rangle  + \frac{L}{2}\Big \|-\frac{1}{L} \nabla h(\bm{x})\Big \|^2} = h(\bm{x}) - \frac{1}{2L}\|\nabla h(\bm{x})\|^2.
\end{equation}
This gives us the desired result.
\end{proof}

%\subsection{PLC Case}
\subsection{Detailed Proof of the Result of \texttt{FedLOMO}}
\label{sec-pf-1}
%\noindent \textbf{Some definitions used in the proofs}: 
Let us redefine the quantities needed to prove the results of \texttt{FedLOMO}.
\[\overline{\bm{w}}_{k,\tau} \triangleq \frac{1}{n}\sum_{i \in [n]} \bm{w}_{k,\tau}^{(i)} \text{ and } \overline{\bm{v}}_{k,\tau} \triangleq \frac{1}{n}\sum_{i \in [n]} \bm{v}_{k,\tau}^{(i)}\]
\[{\bm{e}}_{k,\tau}^{(i)} \triangleq \bm{v}_{k,\tau}^{(i)} - \nabla f_i(\bm{w}_{k, \tau}^{(i)}) \text{ and } \widetilde{\bm{e}}_{k,\tau}^{(i)} \triangleq \nabla f_i(\bm{w}_{k, \tau}^{(i)}) - \nabla f_i(\overline{\bm{w}}_{k,\tau})\]

\noindent \textbf{Proof of \Cref{fl-thm3}}:
\begin{proof}
Let us set $\eta_k = \eta$.
\\
Using \Cref{oct-13-lem1}, with $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\min\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$:
\begin{multline}
    \label{sept25-eq1}
    \mathbb{E}[f(\bm{w}_{k+1})] \leq \mathbb{E}[f(\bm{w}_{k})] -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] - \frac{\eta}{2}(1 - \eta^2  L^2 E^2 - \eta L E )\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2]
    \\
    + 16\eta L E^2\Big\{\frac{\eta^2 L (\alpha E + 4)}{n^2} + \frac{\eta}{2}\Big(\frac{q}{n^2} + \frac{4(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big) \Big)\Big\}\sum_{i \in [n]}{\mathbb{E}[\|\nabla f_i(\bm{w}_{k})\|^2]}.
    %16 \eta L E^2\Big\{\frac{\eta^2 L}{n}{\Big(E + \frac{4}{n}\Big)} + \frac{\eta}{2}\Big(\frac{q}{n^2} + \frac{4(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big) \Big)\Big\}\sum_{i \in [n]}{\mathbb{E}[\|\nabla f_i(\bm{w}_{k})\|^2]}
\end{multline}
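Note here that for $\eta < \frac{1}{2L}$, $\frac{1}{\eta L} < \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}$, and so the condition on $E$ reduces to $E < \frac{1}{4\eta L}$, i.e., $\eta L E < \frac{1}{4}$. Since $E \geq 1$, $\eta L E < \frac{1}{4}$ already implies $\eta < \frac{1}{4L} < \frac{1}{2L}$, so we are just left with the single condition $\eta L E < \frac{1}{4}$.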

Next, we circumvent the need for %\Cref{as-nov4} 
the bounded client dissimilarity assumption by using the fact that each $f_i$ is $L$-smooth and so $\|\nabla f_i(\bm{w}_k)\|^2 \leq 2L (f_i(\bm{w}_k) - f_i^{*})$ using \Cref{lem1-oct20}. Hence:
\begin{equation}
    \label{sept25-eq2}
    \sum_{i \in [n]}{\mathbb{E}[\|\nabla f_i(\bm{w}_{k})\|^2]} \leq 2L\sum_{i \in [n]}\mathbb{E}[(f_i(\bm{w}_k) - f_i^{*})] = 2nL \mathbb{E}[(f(\bm{w}_k) - f^{*} + \Delta^{*})],
\end{equation}
where $\Delta^{*} := f^{*} - \frac{1}{n}\sum_{i=1}^n f_i^{*}$. %Further:
%\[-\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] \leq -\mathbb{E}[\|\overline{\bm{v}}_{k,0}\|^2] = - \mathbb{E}\Big[\Big\|\frac{1}{n}\sum_{i=1}^n \nabla f_i(\bm{w}_k)\Big\|^2\Big] = - \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2].\]
Using all this in (\ref{sept25-eq1}), we get:
\begin{multline}
    \label{sept25-eq3}
    \mathbb{E}[f(\bm{w}_{k+1})] \leq \mathbb{E}[f(\bm{w}_{k})] -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] - \frac{\eta}{2}\underbrace{(1 - \eta^2  L^2 E^2 - \eta L E )}_\text{$> 0$ for $\eta L E < \frac{1}{4}$}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2]
    %\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]
    \\
    + 32 \eta L^2 E^2\Big\{{\eta^2 L}{\Big( \frac{\alpha E +4}{n}\Big)}%_{=A < 2E} 
    + \frac{\eta}{2}\underbrace{\Big(\frac{q}{n} + \frac{4(1+q)(n-r)}{r(n-1)} \Big)}_{:=B}\Big\}\mathbb{E}[(f(\bm{w}_k) - f^{*} + \Delta^{*})].
\end{multline}
Note that $(1 - \eta^2  L^2 E^2 - \eta L E ) > \frac{11}{16}$ for $\eta L E < \frac{1}{4}$.
Further, $-f^{*} + \Delta^{*} = -f^{*} + f^{*} - \frac{1}{n}\sum_{i=1}^n f_i^{*} = - \frac{1}{n}\sum_{i=1}^n f_i^{*}$; hence, we can ignore the corresponding term when the $f_i^{*}$'s are non-negative. Re-writing the above equation, we get:
\begin{flalign}
    %\label{thm3-eq1}
    \nonumber
    \mathbb{E}[f(\bm{w}_{k+1})] & \leq \mathbb{E}[f(\bm{w}_{k})] -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] 
    + 32\eta L^2 E^2\Big\{{\eta^2 L}\Big(\frac{\alpha E +4}{n}\Big) + \frac{\eta B}{2}\Big\}\mathbb{E}[f(\bm{w}_k)]
    \\
    & \leq \mathbb{E}[f(\bm{w}_{k})]\Big\{1+\underbrace{\Big(\frac{32 \eta^3 L^3 E^3}{n}\Big(\alpha + \frac{4}{E}\Big) + 16 B\eta^2 L^2 E^2\Big)}_{=\zeta}\Big\} - \frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]. 
\end{flalign}
Let us denote $\frac{32 \eta^3 L^3 E^3}{n}\Big(\alpha + \frac{4}{E}\Big) + 16 B\eta^2 L^2 E^2$ as $\zeta$ for brevity. 
\\
Unfolding the above recursion from $k=0$ through $K-1$, we get:
\begin{flalign}
    \label{thm3-eq2}
    \mathbb{E}[f(\bm{w}_{K})] \leq f(\bm{w}_{0})(1+\zeta)^K - \frac{\eta E}{2}\sum_{k=0}^{K-1}(1+\zeta)^{(K-1-k)}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]. 
\end{flalign}
Re-arranging the above, we get:
\begin{flalign}
    \label{thm3-eq3}
    \sum_{k=0}^{K-1}p_k \mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] \leq \frac{2}{\eta E} \frac{f(\bm{w}_0)(1+\zeta)^K}{\sum_{k=0}^{K-1}(1+\zeta)^k}, \text{ where } p_k = \frac{(1+\zeta)^{(K-1-k)}}{\sum_{k=0}^{K-1}(1+\zeta)^k}.
\end{flalign}
Notice that $p_k$ defines a distribution over $k$. Hence, the LHS is $\mathbb{E}_{k \sim \mathbb{P}(k)}[\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]]$ with $\mathbb{P}(k) = p_k$. Incorporating this and simplifying further, we get:
\begin{flalign}
    \label{thm3-eq4}
    \mathbb{E}_{k \sim \mathbb{P}(k)}[\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]] \leq \frac{2}{\eta E} \Big\{\frac{f(\bm{w}_0)\zeta}{1 - (1+\zeta)^{-K}}\Big\}, \text{ where } \mathbb{P}(k) = \frac{(1+\zeta)^{(K-1-k)}}{\sum_{k=0}^{K-1}(1+\zeta)^k}.
\end{flalign}
Also note that $(1+\zeta)^{-K} < 1 - \zeta K + {\zeta^2}\frac{K(K+1)}{2} \leq 1 - \zeta K + {\zeta^2}K^2$. Hence, $1 - (1+\zeta)^{-K} > \zeta K (1 - \zeta K)$. Using this in (\ref{thm3-eq4}), we have for $\zeta K < 1$:
\begin{flalign}
    \label{thm3-eq5}
    \mathbb{E}_{k \sim \mathbb{P}(k)}[\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]] \leq \frac{2 f(\bm{w}_0)}{\underbrace{\eta E K (1 - \zeta K)}_{=d(\eta)}}, \text{ where } \mathbb{P}(k) = \frac{(1+\zeta)^{(K-1-k)}}{\sum_{k=0}^{K-1}(1+\zeta)^k}.
\end{flalign}
Plugging in the value of $\zeta$ in (\ref{thm3-eq5}), the denominator, $d(\eta) = \eta E K \Big(1 - 16 \eta^2 L^2 E^2 \Big(\frac{2\eta L E}{n}\Big(\alpha + \frac{4}{E}\Big) + B\Big)K\Big)$. 
\\
\\
\textbf{Case 1, $q \ne 0$ (compression) and $r < n$ (partial-device participation):}
\\
Before going ahead, we would like to highlight that the reason \texttt{FedLOMO} does not achieve the optimal rate here is that $B$ is a constant that is not $\mathcal{O}(\eta L E)$. % in general; if we were to consider the special case of no compression and full-device participation (i.e., $r=n$), then $B$ would be 0 which would allow \texttt{FedLOMO} to achieve the optimal rate.

Let us choose $\eta = \frac{1}{8 L E \sqrt{B K}}$. Note that:
\begin{equation}
    \label{may14-1}
    \eta L E \leq \frac{1}{4} \text{ for } K \geq \frac{1}{4B}.
\end{equation}
Thus, for sufficiently large $K$, this choice of $\eta$ is valid. Also:
\begin{equation}
    \label{may14-2}
    \zeta K = \frac{1}{4} + \frac{1}{16 B^{1.5}\sqrt{K}}\Big(\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big)\Big) < \frac{3}{4} \text{ for } K > \frac{1}{64 B^3} \Big(\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big)\Big)^2.
\end{equation}
So for $K \geq \frac{1}{64 B^3} (\frac{1}{n}(\alpha + \frac{4}{E}))^2$,
\begin{equation}
    \label{may14-3}
    d(\eta) = \eta E K (1 - \zeta K) \geq \frac{\sqrt{K}}{8L\sqrt{B}}(1 - \frac{3}{4}) = \frac{\sqrt{K}}{32 L\sqrt{B}}.
\end{equation}
Plugging this in (\ref{thm3-eq5}), we get:
\begin{flalign}
    \nonumber
    \mathbb{E}_{k \sim \mathbb{P}(k)}[\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]] \leq & \frac{64 \sqrt{B} L f(\bm{w}_0)}{K^{1/2}}, \text{ where } \mathbb{P}(k) = \frac{(1+\zeta)^{(K-1-k)}}{\sum_{k=0}^{K-1}(1+\zeta)^k} \text{ for } k \in \{0,\ldots,K-1\},
    \\
    \label{thm3-eq12}
    & \zeta = \frac{1}{4K} + \frac{1}{16 B^{1.5}K^{1.5}} \Big(\frac{1}{n}\Big(\alpha + \frac{4}{E}\Big)\Big) \text{ and } B = \frac{q}{n} + \frac{4(1+q)(n-r)}{r(n-1)}.
\end{flalign}
\\
\\
\textbf{Case 2, $q=0$ (no compression) and $r=n$ (full-device participation):}
\\
Here, $B=0$ and $\zeta = \frac{32 \eta^3 L^3 E^3}{n}\Big(\alpha + \frac{4}{E}\Big)$. 

Let us choose $\eta = \frac{1}{4 L E} \Big(\frac{n}{(\alpha + \frac{4}{E}) K}\Big)^{1/3}$. Now note that $\eta L E \leq \frac{1}{4}$ for $K > \Big(\frac{n}{\alpha + \frac{4}{E}}\Big)$. Also, $\zeta K = \frac{1}{2}$ with our choice of $\eta$. Plugging this in (\ref{thm3-eq5}), we get:
\begin{multline}
    \nonumber
    \mathbb{E}_{k \sim \mathbb{P}(k)}[\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2]] \leq \frac{16 L f(\bm{w}_0)}{K^{2/3}} \Bigg(\frac{\alpha + \frac{4}{E}}{n}\Bigg)^{1/3}, 
    \\
    \text{ where } \mathbb{P}(k) = \frac{(1+\zeta)^{(K-1-k)}}{\sum_{k=0}^{K-1}(1+\zeta)^k} \text{ for } k \in \{0,\ldots,K-1\} \text{ and } \zeta = \frac{1}{2K}.
\end{multline}
This concludes the proof. 
\end{proof}

\textbf{Key lemma used in the proof of \Cref{fl-thm3}}:

\begin{lemma}
\label{oct-13-lem1}
For $\eta_k = \eta$ where $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\min\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$ in \texttt{FedLOMO}, we have:
\begin{multline*}
    \mathbb{E}[f(\bm{w}_{k+1})] \leq \mathbb{E}[f(\bm{w}_{k})] -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] - \frac{\eta}{2}(1 - \eta^2  L^2 E^2 - \eta L E )\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2]
    \\
    + 16\eta L E^2\Big\{\frac{\eta^2 L (\alpha E + 4)}{n^2} + \frac{\eta}{2}\Big(\frac{q}{n^2} + \frac{4(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big) \Big)\Big\}\sum_{i \in [n]}{\mathbb{E}[\|\nabla f_i(\bm{w}_{k})\|^2]}.
    %16 \eta L E^2\Big\{\frac{\eta^2 L}{n}\Big(E + \frac{4}{n}\Big) + \frac{\eta}{2}\Big(\frac{q}{n^2} + \frac{4(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big) \Big)\Big\}\sum_{i \in [n]}{\mathbb{E}[\|\nabla f_i(\bm{w}_{k})\|^2]}
\end{multline*}
\end{lemma}
\begin{proof}
By the $L$-smoothness of $f$, we have:
\begin{equation}
    \label{eq:oct13-1}
    \mathbb{E}[f(\bm{w}_{k+1})] \leq \mathbb{E}[f(\bm{w}_{k})] + \underbrace{\mathbb{E}\Big[\Big\langle \nabla f(\bm{w}_{k}), \frac{1}{r}\sum_{i \in \mathcal{S}_k} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})\Big\rangle \Big]}_\text{(I)} + \underbrace{\frac{L}{2}\mathbb{E}\Big[\Big\|\frac{1}{r}\sum_{i \in \mathcal{S}_k} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})\Big\|^2\Big]}_\text{(II)}
\end{equation}
Let us analyze (I) first -- taking expectation with respect to $\mathcal{S}_k$ and $Q_D(.)$ (recall that $Q_D(.)$ is unbiased from \Cref{as5}), we get:
\begin{flalign}
    \nonumber
    \text{(I)} & = \mathbb{E}[\langle \nabla f(\bm{w}_{k}), \frac{1}{n}\sum_{i \in [n]}(\bm{w}_{k,E}^{(i)} - \bm{w}_{k})\rangle]
\end{flalign}
But this is the same as (III*) in the proof of \Cref{nov-1-lem0}; using \Cref{lem1-may11} and \Cref{as-het}, we get:
\begin{flalign}
    \label{eq:oct13-5}
    \text{(I)} \leq -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}(1 - \eta^2  L^2 E^2)\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4)}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2],
\end{flalign}
when $\eta < \frac{1}{L}$ and $E < \frac{1}{4}\min\Big(\frac{1}{\eta L}, \frac{1}{\eta^2 L^2} - \frac{1}{\eta L}\Big)$.


Let us now analyze (II). Recall that:
\begin{flalign}
    \nonumber
    \text{(II)} & = \frac{L}{2}\mathbb{E}\Big[\Big\|\frac{1}{r}\sum_{i \in \mathcal{S}_k} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})\Big\|^2\Big].
\end{flalign}
Observe that:
\[\mathbb{E}_{\mathcal{S}_k}\Big[\frac{1}{r}\sum_{i \in \mathcal{S}_k} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})\Big] = \frac{1}{n}\sum_{i \in [n]} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}).\]
Hence:
\begin{multline}
    \label{eq:oct13-6}
    \text{(II)} = \frac{L}{2}\Big\{\underbrace{\mathbb{E}\Big[\Big\|\frac{1}{n}\sum_{i \in [n]} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})\Big\|^2\Big]}_\text{(III)} 
    \\
    +
    \underbrace{\mathbb{E}\Big[\Big\|\frac{1}{r}\sum_{i \in \mathcal{S}_k} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}) - \frac{1}{n}\sum_{i \in [n]} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})\Big\|^2\Big]}_\text{(IV)}
    \Big\}.
\end{multline}
Note that in (III), the expectation is not taken with respect to $\mathcal{S}_k$. In (IV), we take expectation with respect to $\mathcal{S}_k$ and $Q_D(.)$ -- for that, we use Lemma 4 of \cite{reisizadeh2020fedpaq}. Note that $\x_{k,\tau}^{(i)} - \x_k$ in their lemma corresponds to $({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})$ in our case. Specifically, using eqn. (59) and (60) in \cite{reisizadeh2020fedpaq} (they also assume \Cref{as5}), we get:
\begin{flalign}
    \nonumber
    \text{(IV)} & \leq \frac{1}{r(n-1)}\Big(1 - \frac{r}{n}\Big)4(1+q)\sum_{i \in [n]}\mathbb{E}[\|{\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}\|^2] 
    \\
    \nonumber
    & = \frac{1}{r(n-1)}\Big(1 - \frac{r}{n}\Big) 4(1+q)\sum_{i \in [n]}\mathbb{E}[\|\sum_{\tau=0}^{E-1}\eta \bm{v}_{k,\tau}^{(i)}\|^2] 
    \\
    \label{eq:oct13-7-1}
    & \leq \frac{\eta^2}{r(n-1)}\Big(1 - \frac{r}{n}\Big) 4(1+q)E\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]
\end{flalign}
Next, we deal with (III). Noting that $\mathbb{E}_{Q_D}\Big[\frac{1}{n}\sum_{i \in [n]} Q_{D}({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}})\Big] = ({\overline{\bm{w}}_{k,E} - \bm{w}_{k}})$, we get:
\begin{flalign}
    \nonumber
    \text{(III)} & = \mathbb{E}[\|{\overline{\bm{w}}_{k,E} - \bm{w}_{k}}\|^2] + \mathbb{E}\Big[\mathbb{E}_{Q_D}\Big[\Big\|\frac{1}{n}\sum_{i \in [n]}\Big\{Q_{D}\Big({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}\Big) - \Big({\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}\Big)\Big\}\Big\|^2\Big]\Big]
    \\
    \nonumber
    & \leq \mathbb{E}\Big[\Big\|\sum_{\tau=0}^{E-1}\eta \overline{\bm{v}}_{k,\tau}\Big\|^2\Big] + \frac{q}{n^2}\sum_{i \in [n]}\mathbb{E}\Big[\Big\|{\bm{w}_{k,E}^{(i)} - \bm{w}_{k}}\Big\|^2\Big]
    \\
    \label{eq:oct13-8-1}
    & \leq \eta^2 E\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \frac{q E \eta^2 }{n^2}\sum_{i \in [n]}\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]
\end{flalign}
Now, using (\ref{eq:oct13-7-1}) and (\ref{eq:oct13-8-1}) in (\ref{eq:oct13-6}) gives us:
\begin{flalign}
    %\label{eq:oct13-9}
    \nonumber
    \text{(II)} & \leq \frac{L E \eta^2}{2}\Big\{\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \Big(\frac{q}{n^2} + \frac{4(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big) \Big)\sum_{i \in [n]}\underbrace{\sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{v}_{k,\tau}^{(i)}\|^2]}_\text{from \Cref{fl-lem-new1}} \Big\}
    \\
    \label{eq:oct13-9}
    & \leq \frac{L E \eta^2}{2}\Big\{\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \Big(\frac{q}{n^2} + \frac{4(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big) \Big)16 E \sum_{i \in [n]}{\mathbb{E}[\|\nabla f_i(\bm{w}_{k})\|^2]}\Big\}.
\end{flalign}
Therefore, using (\ref{eq:oct13-5}) and (\ref{eq:oct13-9}) in (\ref{eq:oct13-1}), we get:
\begin{multline*}
    \mathbb{E}[f(\bm{w}_{k+1})] \leq \mathbb{E}[f(\bm{w}_{k})] 
    \\
    -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] -\frac{\eta}{2}(1 - \eta^2  L^2 E^2)\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \frac{16 \eta^3 L^2 E^2 (\alpha E + 4)}{n^2}\sum_{i \in [n]}\mathbb{E}[\|\nabla f_i(\bm{w}_k)\|^2]
    \\
    + \frac{L E \eta^2}{2}\Big\{\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2] + \Big(\frac{q}{n^2} + \frac{4(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big) \Big)16 E \sum_{i \in [n]}{\mathbb{E}[\|\nabla f_i(\bm{w}_{k})\|^2]}\Big\}
\end{multline*}
\begin{multline}
    \implies \mathbb{E}[f(\bm{w}_{k+1})] \leq \mathbb{E}[f(\bm{w}_{k})] -\frac{\eta E}{2}\mathbb{E}[\|\nabla f(\bm{w}_{k})\|^2] - \frac{\eta}{2}(1 - \eta^2  L^2 E^2 - \eta L E )\sum_{\tau=0}^{E-1}\mathbb{E}[\|\overline{\bm{v}}_{k,\tau}\|^2]
    \\
    + 16\eta L E^2\Big\{\frac{\eta^2 L (\alpha E + 4)}{n^2} + \frac{\eta}{2}\Big(\frac{q}{n^2} + \frac{4(1+q)}{r(n-1)}\Big(1 - \frac{r}{n}\Big) \Big)\Big\}\sum_{i \in [n]}{\mathbb{E}[\|\nabla f_i(\bm{w}_{k})\|^2]}
\end{multline}
This completes the proof.
\end{proof}

\section{Convergence of \texttt{FedAvg} under Assumption \ref{as-het}}
\label{sec:fed_avg_conv}
Here, we provide a convergence result for \texttt{FedAvg} (\Cref{alg:fed-avg}) in the absence of the bounded client dissimilarity assumption (i.e. \cref{eq:bcd}) and instead assuming that \Cref{as-het} holds for \texttt{FedAvg}. 

Before presenting the convergence result, we provide empirical evidence that \Cref{as-het} holds for \texttt{FedAvg}. For this, we compute and plot $\alpha$ (as we did in \Cref{sec:het-asm-expt}) for 8 and 4 bit \texttt{FedAvg} on CIFAR-10 and FMNIST, respectively; the results are in \Cref{fig:het1}.

\begin{figure}[!htb]
\centering 
\subfloat[CIFAR-10]{
    \label{fig:het_c}
	\includegraphics[width=0.45\textwidth]{UAI_figs2/CIFAR10_alpha_FedAvg.pdf}
	} 
\subfloat[FMNIST]{
    \label{fig:het_d}
	\includegraphics[width=0.45\textwidth]{UAI_figs2/FMNIST_alpha_FedAvg.pdf}
	} 
\caption{Variation of $(\frac{\alpha}{n})$ over different rounds of $8$ and $4$ bit \texttt{FedAvg} for CIFAR-10 (Fig. \ref{fig:het_c}) and FMNIST (Fig. \ref{fig:het_d}) in the heterogeneous and homogeneous cases. In both cases, notice that $\alpha \ll n$ throughout training. {Also, as expected, observe that $(\frac{\alpha}{n})$ is higher for the heterogeneous case (except towards the end of training for FMNIST).}
}
\label{fig:het1}
\end{figure}

\begin{theorem}[\textbf{Smooth non-convex case for \texttt{FedAvg}}]
\label{thm-fedavg}
Let Assumptions \ref{as1} and \ref{as-may15} hold. Further, suppose \Cref{as-het} holds for \texttt{FedAvg} (\Cref{alg:fed-avg}). 
Let $\sigma^2$ be the maximum variance of the local (client-level) stochastic gradients.
In \texttt{FedAvg}, set $\eta_{k} = \frac{1}{L E}\sqrt{\frac{r}{K}}$ for all $k$. 
Define a distribution $\mathbb{P}$ for $k \in \{0,\ldots,K-1\}$ such that $\mathbb{P}(k) = \frac{(1+\zeta)^{(K-1-k)}}{\sum_{k'=0}^{K-1}(1+\zeta)^{k'}}$ where $\zeta := \eta^2 L^2 E^2 \Big(\frac{(n-r)}{6r(n-1)} + {\frac{8 \alpha \eta L E}{9 n}}\Big)$. Sample $k^{*}$ from $\mathbb{P}$. Then for $K \geq \max\Big(\frac{64r^3}{9}(\frac{\alpha}{n})^2, 4r\Big)$:
\begin{equation}
    \label{eq:fedavg-convg}
    \mathbb{E}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \frac{4 L f(\bm{w}_0)}{\sqrt{r K}} +
    \frac{\sigma^2}{\sqrt{r K}} {\Big(\frac{1}{E} + \frac{(n-r)}{3(n-1)}\Big)} + \frac{8 \sigma^2 r}{9 K} \Big(\frac{\alpha}{n}\Big) + \frac{\sigma^2}{E K} \Big(\frac{r}{n}\Big).
    %\frac{4 (L f(\bm{w}_0) + \frac{\sigma^2}{3})}{\sqrt{r K}} + \frac{8 \sigma^2 r}{9 K} \Big(\frac{\alpha}{n}\Big) + \frac{\sigma^2}{E K} \Big(\frac{r}{n}\Big).
\end{equation}
So \texttt{FedAvg} needs $K = \mathcal{O}(\frac{1}{r \epsilon^2})$ rounds of communication to achieve $\mathbb{E}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \epsilon$, for $\epsilon < \mathcal{O}\big(\min\big(\frac{1}{r}, \frac{n/\alpha}{r^2}\big)\big)$.
\\
%Also note that the above convergence result holds with $\alpha = n$, in which case \Cref{as-het} holds unconditionally.
Note that if we plug in $\alpha = n$ in \cref{eq:fedavg-convg}, then we get a convergence result for \texttt{FedAvg} without making use of \Cref{as-het}.
\end{theorem}
Thus, we recover the same complexity for \texttt{FedAvg}/Local SGD (which is basically \texttt{FedAvg} with full-device participation) as \cite{karimireddy2019scaffold,koloskova2020unified} -- but without the bounded client dissimilarity assumption.

Note that the iteration complexity of \texttt{FedAvg} is $\mathcal{O}(\epsilon^{-2})$, even with $r = n$. In contrast, note that the iteration complexity of \texttt{FedLOMO} improves to $\mathcal{O}(\epsilon^{-1.5})$ with $r = n$ (and no compression) as per \Cref{fl-thm3}.

In the convergence result of \Cref{thm-fedavg}, we see that $\alpha$ only shows up in the non-dominant term. So unlike \texttt{FedGLOMO}, \Cref{as-het} holding with $\alpha \ll n$ does not improve the \textit{order-wise} convergence rate/complexity of \texttt{FedAvg}.

\begin{proof}
Using \Cref{sep26-lem3}, for $\eta_k L E \leq \frac{1}{2}$, we can bound the per-round progress as:
\begin{multline}
    \label{eq:sept26-18}
    \mathbb{E}[f(\bm{w}_{k+1})] 
    \leq \mathbb{E}[f(\bm{w}_k)]
    - \frac{\eta_k E}{2} \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2] + \eta_k^2 L E^2 \Big(\frac{(n-r)}{6r(n-1)} + {\frac{8 \alpha \eta_k L E}{9 n}}\Big)\Big(\frac{1}{n}\sum_{i \in [n]} \mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2]\Big) 
    \\
    + \frac{\eta_k^2 L E}{2} \Big(\frac{\eta_k L E}{n}\Big(1 + \frac{8\alpha E}{9}\Big) + \frac{1}{r} + \frac{(n-r)E}{3r(n-1)}\Big)\sigma^2.
\end{multline}
Now applying our earlier trick of using the $L$-smoothness and non-negativity of the $f_i$'s, we get:
\[\sum_{i \in [n]} \|\nabla f_i(\bm{w}_k)\|^2 \leq \sum_{i \in [n]} 2L(f_i(\bm{w}_k) - f_i^{*}) \leq 2n L f(\bm{w}_k) - 2L \sum_{i \in [n]} f_i^{*} \leq 2n L f(\bm{w}_k).\]
Putting this in \cref{eq:sept26-18}, we get for a constant learning rate of $\eta_k = \eta$:
\begin{multline}
    \label{eq:sept26-18-1}
    \mathbb{E}[f(\bm{w}_{k+1})] 
    \leq 
    \Big(1 + \eta^2 L^2 E^2 \Big(\frac{(n-r)}{6r(n-1)} + {\frac{8 \alpha \eta L E}{9 n}}\Big)\Big) \mathbb{E}[f(\bm{w}_k)]
    - \frac{\eta E}{2} \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2]
    \\
    + \frac{\eta^2 L E}{2} \Big(\frac{\eta L E}{n}\Big(1 + \frac{8\alpha E}{9}\Big) + \frac{1}{r} + \frac{(n-r)E}{3r(n-1)}\Big)\sigma^2.
\end{multline}
For ease of notation, define $\zeta := \eta^2 L^2 E^2 \Big(\frac{(n-r)}{6r(n-1)} + {\frac{8 \alpha \eta L E}{9 n}}\Big)$ and $\zeta_2 := \Big(\frac{\eta L E}{n}\Big(1 + \frac{8\alpha E}{9}\Big) + \frac{1}{r} + \frac{(n-r)E}{3r(n-1)}\Big)$. Then, unfolding the recursion of \cref{eq:sept26-18-1} from  $k=0$ through to $k=K-1$, we get:
\begin{multline}
    \label{eq:sept26-19}
    \mathbb{E}[f(\bm{w}_{K})] 
    \leq 
    (1 + \zeta)^K f(\bm{w}_0)
    - \frac{\eta E}{2} \sum_{k=0}^{K-1}(1+\zeta)^{(K-1-k)} \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2]
    + \frac{\eta^2 L E}{2} \zeta_2 \sigma^2 \sum_{k=0}^{K-1}(1+\zeta)^{(K-1-k)}.
\end{multline}
Let us define $p_k := \frac{(1+\zeta)^{(K-1-k)}}{\sum_{k'=0}^{K-1}(1+\zeta)^{(K-1-k')}}$. Then, re-arranging \cref{eq:sept26-19} and using the fact that $\mathbb{E}[f(\bm{w}_{K})] \geq 0$, we get:
\begin{flalign}
    \sum_{k=0}^{K-1}p_k \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2] & \leq \frac{2 (1 + \zeta)^K f(\bm{w}_0)}{\eta E \sum_{k'=0}^{K-1}(1+\zeta)^{k'}} + {\eta L}\zeta_2 \sigma^2 
    \\
    \label{eq:sep27-1}
    & = \frac{2 \zeta f(\bm{w}_0)}{\eta E
    (1 - (1+\zeta)^{-K})} + \eta L E \Big(\frac{\eta L}{n}\Big(1 + \frac{8\alpha E}{9}\Big) + \frac{1}{r E} + \frac{(n-r)}{3r(n-1)}\Big)\sigma^2,
\end{flalign}
where the last step follows by using the fact that $\sum_{k'=0}^{K-1}(1+\zeta)^{k'} = \frac{(1+\zeta)^{K} - 1}{\zeta}$ and plugging in the value of $\zeta_2$. Now as we did in the proof of \Cref{fl-thm3}:
\begin{equation*}
    (1+\zeta)^{-K} < 1 - \zeta K + {\zeta^2}\frac{K(K+1)}{2} < 1 - \zeta K + {\zeta^2}K^2 \implies 1 - (1+\zeta)^{-K} > \zeta K (1 - \zeta K).
\end{equation*}
Plugging this in \cref{eq:sep27-1}, we have for $\zeta K < 1$:
\begin{flalign}
    \label{eq:sep27-2}
    \sum_{k=0}^{K-1}p_k \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2] & \leq \frac{2 f(\bm{w}_0)}{\eta E K (1 - \zeta K)} + \eta L E \Big(\frac{\eta L}{n}\Big(1 + \frac{8\alpha E}{9}\Big) + \frac{1}{r E} + \frac{(n-r)}{3r(n-1)}\Big)\sigma^2.
\end{flalign}
In this case, note that the optimal step size will be $\eta = \mathcal{O}(\frac{1}{L E \sqrt{K}})$, even for $r = n$. This is in contrast to \texttt{FedLOMO} for which the optimal step size is $\eta = \mathcal{O}(\frac{1}{L E K^{1/3}})$ for $r = n$. 

So let us pick $\eta = \frac{1}{L E}\sqrt{\frac{r}{K}}$. Note that we need to have $\eta L E \leq \frac{1}{2}$; this happens for $K \geq 4r$. Further, let us ensure $\zeta K < \frac{1}{2}$; this happens for $K \geq \frac{64r^3}{9}(\frac{\alpha}{n})^2$. Thus, we should have $K \geq \max\Big(\frac{64r^3}{9}(\frac{\alpha}{n})^2, 4r\Big)$. Putting $\eta = \frac{1}{L E}\sqrt{\frac{r}{K}}$ in \cref{eq:sep27-2} and also using $1 - \zeta K \geq \frac{1}{2}$, we get:
\begin{equation}
    \sum_{k=0}^{K-1}p_k \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2] \leq \frac{4 L f(\bm{w}_0)}{\sqrt{r K}} +
    \frac{\sigma^2}{\sqrt{r K}} {\Big(\frac{1}{E} + \frac{(n-r)}{3(n-1)}\Big)} + \frac{8 \sigma^2 r}{9 K} \Big(\frac{\alpha}{n}\Big) + \frac{\sigma^2}{E K} \Big(\frac{r}{n}\Big).
\end{equation}
%Using the fact that $\frac{1}{E} + \frac{(n-r)}{3(n-1)} \leq \frac{4}{3}$ gives us the final result.
This finishes the proof.
\end{proof}

\begin{lemma}
\label{sep26-lem3}
For $\eta_k L E \leq \frac{1}{2}$, we have:
\begin{multline*}
    \mathbb{E}[f(\bm{w}_{k+1})] 
    \leq \mathbb{E}[f(\bm{w}_k)]
    - \frac{\eta_k E}{2} \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2] + \eta_k^2 L E^2 \Big(\frac{(n-r)}{6r(n-1)} + \frac{8 \alpha \eta_k L E}{9 n}\Big)\Big(\frac{1}{n}\sum_{i \in [n]} \mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2]\Big) 
    \\
    + \frac{\eta_k^2 L E}{2} \Big(\frac{\eta_k L E}{n}\Big(1 + \frac{8\alpha E}{9}\Big) + \frac{1}{r} + \frac{(n-r)E}{3r(n-1)}\Big)\sigma^2.
\end{multline*}
\end{lemma}
\begin{proof}
Define
\[
\widehat{\bm{u}}_{k,\tau}^{(i)} := \nabla \widetilde{f}_i(\bm{w}^{(i)}_{k, \tau}; \mathcal{B}^{(i)}_{k, \tau}) 
\text{, }
\widehat{\bm{u}}_{k,\tau} := \frac{1}{n}\sum_{i \in [n]} \widehat{\bm{u}}_{k,\tau}^{(i)}
\text{, }
\bm{u}_{k,\tau} := \frac{1}{n}\sum_{i \in [n]}\nabla f_i(\bm{w}^{(i)}_{k, \tau}) \text{, } \]
\[\overline{\bm{w}}_{k,\tau} := \frac{1}{n}\sum_{i \in [n]}\bm{w}^{(i)}_{k, \tau} \text{ and } \widetilde{\bm{e}}_{k,\tau}^{(i)} = \nabla f_i(\bm{w}^{(i)}_{k, \tau}) - \nabla f_i(\overline{\bm{w}}_{k,\tau}).\]
Then:
\begin{equation}
    \label{eq:feb28-1}
    \bm{w}_{k+1} = \bm{w}_k - \eta_k \sum_{\tau=0}^{E-1}\Big(\frac{1}{r}\sum_{i \in \mathcal{S}_k}\widehat{\bm{u}}_{k,\tau}^{(i)}\Big).
\end{equation}
\begin{equation}
    \label{eq:feb28-1-0}
    \overline{\bm{w}}_{k,\tau} = \bm{w}_k - \eta_k \sum_{t=0}^{\tau-1}\widehat{\bm{u}}_{k,t}.
\end{equation}
\begin{equation}
    \label{eq:feb28-2}
    \mathbb{E}_{\{\mathcal{B}^{(i)}_{k, \tau}\}_{i=1}^n}[\widehat{\bm{u}}_{k,\tau}] = \bm{u}_{k,\tau}.
\end{equation}
\begin{equation}
    \label{eq:feb28-3}
    \mathbb{E}\Big[\Big\|\sum_{t=0}^{\tau-1}\widehat{\bm{u}}_{k,t}\Big\|^2\Big] \leq \tau \sum_{t=0}^{\tau-1}\mathbb{E}[\|\bm{u}_{k,t}\|^2] + \frac{\tau \sigma^2}{n}.
\end{equation}
\begin{equation}
    \label{eq:feb28-3-1}
    \mathbb{E}\Big[\Big\|\sum_{t=0}^{\tau-1}\widehat{\bm{u}}_{k,t}^{(i)}\Big\|^2\Big] \leq \tau \sum_{t=0}^{\tau-1}\mathbb{E}[\|\nabla f_i(\bm{w}^{(i)}_{k, t})\|^2] + {\tau \sigma^2}.
\end{equation}
Recall that $\sigma^2$ is the maximum variance of the local (client-level) stochastic gradients.
In \cref{eq:feb28-3}, the expectation is w.r.t. $\{\mathcal{B}^{(i)}_{k, t}\}_{i=1, t=0}^{n, \tau-1}$ and it follows due to the independence of the noise in each local update of each client. 
Similarly, in \cref{eq:feb28-3-1}, the expectation is w.r.t. $\{\mathcal{B}^{(i)}_{k, t}\}_{t=0}^{\tau-1}$ and it follows due to the independence of the noise in each local update.


Next, using the $L$-smoothness of $f$ and \cref{eq:feb28-1}, we get
%\small
\begin{flalign}
    \label{eq:feb28-4-0-1}
    \mathbb{E}[f(\bm{w}_{k+1})] & \leq 
    \mathbb{E}[f(\bm{w}_k)] - \mathbb{E}\Big[ \Big\langle \nabla f(\bm{w}_k), \eta_k \sum_{\tau=0}^{E-1} \Big(\frac{1}{r}\sum_{i \in \mathcal{S}_k}\widehat{\bm{u}}_{k,\tau}^{(i)}\Big) \Big\rangle\Big] + \frac{L}{2}\mathbb{E}\Big[\Big\|\eta_k \sum_{\tau=0}^{E-1} \Big(\frac{1}{r}\sum_{i \in \mathcal{S}_k}\widehat{\bm{u}}_{k,\tau}^{(i)}\Big)\Big\|^2\Big]
    \\
    \label{eq:feb28-4-0}
    & = \mathbb{E}[f(\bm{w}_k)] - \mathbb{E}[ \langle \nabla f(\bm{w}_k), \sum_{\tau=0}^{E-1} \eta_k \widehat{\bm{u}}_{k,\tau} \rangle] + \frac{\eta_k^2 L}{2}\Big\{\frac{n(r-1)}{r(n-1)}\mathbb{E}[\|\sum_{\tau=0}^{E-1} \widehat{\bm{u}}_{k,\tau}\|^2] + \frac{(n-r)}{r(n-1)}\Big(\frac{1}{n}\sum_{i \in [n]} \mathbb{E}[\|\sum_{\tau=0}^{E-1} \widehat{\bm{u}}_{k,\tau}^{(i)}\|^2]\Big)\Big\}
    \\
    \label{eq:feb28-4}
    & \leq \mathbb{E}[f(\bm{w}_k)] - \eta_k \mathbb{E}[\langle \nabla f(\bm{w}_k), 
    \sum_{\tau=0}^{E-1}
    {\bm{u}}_{k,\tau}\rangle] +
    \frac{\eta_k^2 L E}{2}\Big\{
    \frac{n(r-1)}{r(n-1)} \Big(\sum_{\tau=0}^{E-1}\mathbb{E}[\|{\bm{u}}_{k,\tau}\|^2] + \frac{\sigma^2}{n} \Big) 
    \\
    \nonumber
    & + \frac{(n-r)}{r(n-1)}
    \Big(\frac{1}{n}\sum_{i \in [n]} \sum_{\tau=0}^{E-1}\mathbb{E}[\|\nabla f_i(\bm{w}^{(i)}_{k, \tau})\|^2] + {\sigma^2}\Big)\Big\}
\end{flalign}
%\normalsize
Note that \cref{eq:feb28-4-0} follows by taking expectation w.r.t. $\mathcal{S}_k$ in \cref{eq:feb28-4-0-1}, while \cref{eq:feb28-4} follows from \cref{eq:feb28-2}, \cref{eq:feb28-3} and \cref{eq:feb28-3-1}.

For any 2 vectors $\bm{a}$ and $\bm{b}$, we have that $\langle \bm{a}, \bm{b} \rangle = \frac{1}{2}(\|\bm{a}\|^2 + \|\bm{b}\|^2 - \|\bm{a} - \bm{b}\|^2)$. Using this:
\begin{flalign}
\label{eq:feb28-5}
\langle \nabla f(\bm{w}_k), \sum_{\tau=0}^{E-1} \bm{u}_{k,\tau} \rangle & = \sum_{\tau=0}^{E-1} \langle \nabla f(\bm{w}_k), \bm{u}_{k,\tau} \rangle = \frac{1}{2}\sum_{\tau=0}^{E-1}(\|\nabla f(\bm{w}_k)\|^2 + \|\bm{u}_{k,\tau} \|^2 - \|\nabla f(\bm{w}_k) - \bm{u}_{k,\tau}\|^2).
\end{flalign}
Putting this in \cref{eq:feb28-4}, we get:
\begin{multline}
    \label{eq:feb28-6}
    \mathbb{E}[f(\bm{w}_{k+1})] 
    \leq \mathbb{E}[f(\bm{w}_k)] - \frac{\eta_k E}{2} \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2] - \frac{\eta_k}{2}
    \Big(1 - \eta_k L E \frac{n(r-1)}{r(n-1)}\Big)
    \sum_{\tau=0}^{E-1} \mathbb{E}[\|{\bm{u}}_{k,\tau}\|^2]
    \\
    + \frac{\eta_k}{2} \underbrace{\sum_{\tau=0}^{E-1} \mathbb{E}[\|\nabla f(\bm{w}_k) - \bm{u}_{k,\tau}\|^2]}_\text{(A)}
    + \frac{\eta_k^2 L E}{2 r} \sigma^2 + \frac{(n-r)}{r(n-1)} \frac{\eta_k^2 L E}{2}
    \underbrace{\Big(\frac{1}{n}\sum_{i \in [n]} \sum_{\tau=0}^{E-1}\mathbb{E}[\|\nabla f_i(\bm{w}^{(i)}_{k, \tau})\|^2]\Big)}_\text{(B)}.
\end{multline}
We upper bound (A) and (B) using \Cref{sept26-lem2} and \Cref{sept26-lem1}, respectively. Plugging in these bounds, we get:
\begin{multline}
    \label{eq:sep26-15}
    \mathbb{E}[f(\bm{w}_{k+1})] 
    \leq \mathbb{E}[f(\bm{w}_k)] - \frac{\eta_k E}{2} \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2] - \frac{\eta_k}{2}
    \underbrace{\Big(1 - \eta_k L E \frac{n(r-1)}{r(n-1)} - \eta_k^2 L^2 E^2 \Big)}_\text{(C)}
    \sum_{\tau=0}^{E-1} \mathbb{E}[\|{\bm{u}}_{k,\tau}\|^2]
    \\
    + \eta_k^2 L E^2 \Big(\frac{(n-r)}{6r(n-1)} + \frac{8 \alpha \eta_k L E}{9 n}\Big)\Big(\frac{1}{n}\sum_{i \in [n]} \mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2]\Big) + \frac{\eta_k^2 L E}{2} \Big(\frac{\eta_k L E}{n}\Big(1 + \frac{8\alpha E}{9}\Big) + \frac{1}{r} + \frac{(n-r)E}{3r(n-1)}\Big)\sigma^2,
\end{multline}
for $\eta_k L E \leq \frac{1}{2}$. Note that $\text{(C)} \geq 0$ for $\eta_k L E \leq \frac{1}{2}$. Thus, for $\eta_k L E \leq \frac{1}{2}$, we have:
\begin{multline}
    \label{eq:sep26-16}
    \mathbb{E}[f(\bm{w}_{k+1})] 
    \leq \mathbb{E}[f(\bm{w}_k)]
    - \frac{\eta_k E}{2} \mathbb{E}[\|\nabla f(\bm{w}_k)\|^2] + \eta_k^2 L E^2 \Big(\frac{(n-r)}{6r(n-1)} + \frac{8 \alpha \eta_k L E}{9 n}\Big)\Big(\frac{1}{n}\sum_{i \in [n]} \mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2]\Big) 
    \\
    + \frac{\eta_k^2 L E}{2} \Big(\frac{\eta_k L E}{n}\Big(1 + \frac{8\alpha E}{9}\Big) + \frac{1}{r} + \frac{(n-r)E}{3r(n-1)}\Big)\sigma^2.
\end{multline}
\end{proof}


\begin{lemma}
\label{sept26-lem2}
For $\eta_k L E \leq \frac{1}{2}$:
\begin{equation*}
    \sum_{\tau=0}^{E-1} \mathbb{E}[\|\nabla f(\bm{w}_k) - \bm{u}_{k,\tau}\|^2] \leq \eta_k^2 L^2 E^2 \sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{u}_{k,\tau}\|^2] + \frac{16 \alpha \eta_k^2 L^2 E^3}{9 n^2} \sum_{i \in [n]} \mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2] + \frac{\eta_k^2 L^2 E^2}{n}\Big(1 + \frac{8\alpha E}{9}\Big) \sigma^2.
\end{equation*}
\end{lemma}
\begin{proof}
We have:
\begin{flalign}
    \mathbb{E}[\|\nabla f(\bm{w}_k) - \bm{u}_{k,\tau}\|^2] & = \mathbb{E}[\|\nabla f(\bm{w}_k) - \nabla f(\overline{\bm{w}}_{k,\tau}) + \nabla f(\overline{\bm{w}}_{k,\tau}) - \bm{u}_{k,\tau}\|^2]
    \\
    & \leq 2\mathbb{E}[\|\nabla f(\bm{w}_k) - \nabla f(\overline{\bm{w}}_{k,\tau})\|^2] + 2\mathbb{E}[\|\nabla f(\overline{\bm{w}}_{k,\tau}) - \bm{u}_{k,\tau}\|^2]
    \\
    \label{eq:sep26-1}
    & \leq 2L^2\mathbb{E}[\|\bm{w}_k - \overline{\bm{w}}_{k,\tau}\|^2] + 2\mathbb{E}\Big[\Big\|\frac{1}{n}\sum_{i \in [n]} \underbrace{(\nabla f_i(\overline{\bm{w}}_{k,\tau})- \nabla f_i(\bm{w}^{(i)}_{k, \tau}))}_{=-\widetilde{\bm{e}}_{k,\tau}^{(i)}}\Big\|^2\Big]
    \\
    \label{eq:sep26-2}
    & \leq 2 \eta_k^2 L^2 \mathbb{E}\Big[\Big\|\sum_{t=0}^{\tau-1}\widehat{\bm{u}}_{k,t}\Big\|^2\Big] + \frac{2 \alpha}{n^2} \sum_{i \in [n]} \mathbb{E}[\|\widetilde{\bm{e}}_{k,\tau}^{(i)}\|^2]
    \\
    \label{eq:sep26-3}
    & \leq 2 \eta_k^2 L^2 \Big(\tau \sum_{t=0}^{\tau-1}\mathbb{E}[\|\bm{u}_{k,t}\|^2] + \frac{\tau \sigma^2}{n}\Big) + \frac{2 \alpha L^2}{n^2} \sum_{i \in [n]} \mathbb{E}[\|\bm{w}^{(i)}_{k, \tau} - \overline{\bm{w}}_{k,\tau}\|^2].
\end{flalign}
\Cref{eq:sep26-1} follows from the $L$-smoothness of $f$ and the definition of $\bm{u}_{k,\tau}$. 
\Cref{eq:sep26-2} follows from \cref{eq:feb28-1-0} and \Cref{as-het}. \Cref{eq:sep26-3} follows from \cref{eq:feb28-3} and the $L$-smoothness of $f_i$.

But:
\begin{flalign}
    \sum_{i \in [n]} \mathbb{E}[\|\bm{w}_{k, \tau}^{(i)} - \overline{\bm{w}}_{k,\tau}\|^2]
    & = \sum_{i \in [n]} \mathbb{E}[\|(\bm{w}_{k, 0}^{(i)} - \eta_k \sum_{t=0}^{\tau-1} \widehat{\bm{u}}_{k,t}^{(i)}) - (\overline{\bm{w}}_{k,0} - \eta_k \sum_{t=0}^{\tau-1} \widehat{\bm{u}}_{k,t})\|^2]
    \\
    \label{eq:sept26-4}
    & = \eta_k^2 \sum_{i \in [n]} \mathbb{E}[\| \sum_{t=0}^{\tau-1} \widehat{\bm{u}}_{k,t} - \sum_{t=0}^{\tau-1} \widehat{\bm{u}}_{k,t}^{(i)}\|^2] 
    \\
    & \leq \eta_k^2 \tau \sum_{i \in [n]} \sum_{t=0}^{\tau-1} \mathbb{E}[\|\widehat{\bm{u}}_{k,t} - \widehat{\bm{u}}_{k,t}^{(i)}\|^2]
    \\
    \label{eq:sept26-5}
    & = \eta_k^2 \tau \sum_{t=0}^{\tau-1} \sum_{i \in [n]} \mathbb{E}[\|\widehat{\bm{u}}_{k,t}\|^2 + \|\widehat{\bm{u}}_{k,t}^{(i)}\|^2 - 2\langle \widehat{\bm{u}}_{k,t}, \widehat{\bm{u}}_{k,t}^{(i)} \rangle]
\end{flalign}
\Cref{eq:sept26-4} follows because  $\bm{w}_{k,0}^{(i)} = \bm{w}_{k}$ $\forall$ $i \in [n]$, due to which  $\overline{\bm{w}}_{k,0} = \bm{w}_{k}$. Next, using the fact that $\widehat{\bm{u}}_{k,\tau} = \frac{1}{n}\sum_{i \in [n]} \widehat{\bm{u}}_{k,\tau}^{(i)}$, we can simplify \cref{eq:sept26-5} to:
\begin{flalign}
    \sum_{i \in [n]} \mathbb{E}[\|\bm{w}_{k, \tau}^{(i)} - \overline{\bm{w}}_{k,\tau}\|^2]
    & \leq \eta_k^2 \tau \sum_{t=0}^{\tau-1} \sum_{i \in [n]} (\mathbb{E}[\|\widehat{\bm{u}}_{k,t}^{(i)}\|^2] - \mathbb{E}[\|\widehat{\bm{u}}_{k,t}\|^2])
    \\
    & \leq \eta_k^2 \tau \sum_{t=0}^{\tau-1} \sum_{i \in [n]} \mathbb{E}[\|{\widehat{\bm{u}}_{k,t}^{(i)}}\|^2]
    \\
    \label{eq:sept26-10}
    & \leq \eta_k^2 \tau \sum_{t=0}^{\tau-1} \sum_{i \in [n]} (\mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] + \sigma^2).
\end{flalign}
Next, using \Cref{sept26-lem1} for $\eta_k L E \leq \frac{1}{2}$ in \cref{eq:sept26-10}, we get:
\begin{flalign}
    \label{eq:sept26-11}
    \sum_{i \in [n]} \mathbb{E}[\|\bm{w}_{k, \tau}^{(i)} - \overline{\bm{w}}_{k,\tau}\|^2]
    \leq \frac{4 \eta_k^2 \tau^2}{3} \sum_{i \in [n]} (2\mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2] + \sigma^2).
\end{flalign}
Plugging \cref{eq:sept26-11} back in \cref{eq:sep26-3}, we get:
\begin{flalign}
    \mathbb{E}[\|\nabla f(\bm{w}_k) - \bm{u}_{k,\tau}\|^2] & \leq 2 \eta_k^2 L^2 \Big(\tau \sum_{t=0}^{\tau-1}\mathbb{E}[\|\bm{u}_{k,t}\|^2] + \frac{\tau \sigma^2}{n}\Big) + \frac{8 \alpha \eta_k^2 L^2 \tau^2}{3 n^2} \sum_{i \in [n]} (2\mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2] + \sigma^2)
    \\
    \label{eq:sept26-12}
    & = 2 \eta_k^2 L^2 \tau \sum_{t=0}^{\tau-1}\mathbb{E}[\|\bm{u}_{k,t}\|^2] + \frac{16 \alpha \eta_k^2 L^2 \tau^2}{3 n^2} \sum_{i \in [n]} \mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2] + \frac{\eta_k^2 L^2 \tau \sigma^2}{n}\Big(2 + \frac{8\alpha}{3} \tau\Big).
\end{flalign}
Summing up \cref{eq:sept26-12} for $\tau \in \{0, \ldots, E-1\}$, we get:
\begin{equation}
    \sum_{\tau=0}^{E-1} \mathbb{E}[\|\nabla f(\bm{w}_k) - \bm{u}_{k,\tau}\|^2] \leq \eta_k^2 L^2 E^2 \sum_{\tau=0}^{E-1}\mathbb{E}[\|\bm{u}_{k,\tau}\|^2] + \frac{16 \alpha \eta_k^2 L^2 E^3}{9 n^2} \sum_{i \in [n]} \mathbb{E}[\|\nabla {f}_i(\bm{w}_{k})\|^2] + \frac{\eta_k^2 L^2 E^2}{n}\Big(1 + \frac{8\alpha E}{9}\Big) \sigma^2.
\end{equation}

\end{proof}


\begin{lemma}
\label{sept26-lem1}
For $\eta_k L E \leq \frac{1}{2}$, we have:
\begin{equation*}
    \sum_{t=0}^{\tau-1} \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] \leq \frac{\tau}{3}(8 \mathbb{E}[\|\nabla {f}_i(\bm{w}_k)\|^2] + \sigma^2).
\end{equation*}
\end{lemma}
\begin{proof}
\begin{flalign}
    \nonumber
    \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] & = \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t}) - \nabla {f}_i(\bm{w}_k) + \nabla {f}_i(\bm{w}_k)\|^2]
    \\
    \nonumber
    & \leq 2 \mathbb{E}[\|\nabla {f}_i(\bm{w}_k)\|^2] + 2 \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t}) - \nabla {f}_i(\bm{w}_k)\|^2]
    \\
    \label{eq:feb28-7}
    & \leq 2 \mathbb{E}[\|\nabla {f}_i(\bm{w}_k)\|^2] + 2L^2 \mathbb{E}[\|\bm{w}^{(i)}_{k, t} - \bm{w}_k\|^2].
\end{flalign}
But:   
\begin{flalign}
    \label{eq:feb28-8}
    \mathbb{E}[\|\bm{w}_k - \bm{w}^{(i)}_{k, t}\|^2] = \mathbb{E}\Big[\Big\|\eta_k \sum_{t'=0}^{t-1} \nabla \widetilde{f}_i(\bm{w}^{(i)}_{k, t'}; \mathcal{B}^{(i)}_{k, t'})\Big\|^2\Big]
    & \leq \eta_k^2 t  \sum_{t'=0}^{t-1} \mathbb{E}\Big[\|\nabla \widetilde{f}_i(\bm{w}^{(i)}_{k, t'}; \mathcal{B}^{(i)}_{k, t'})\|^2\Big]
    \leq \eta_k^2 t \sum_{t'=0}^{t-1} (\mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t'})\|^2] + \sigma^2).
\end{flalign}    
Putting this back in \cref{eq:feb28-7}, we get:
\begin{flalign}
    \label{eq:feb28-9}
    \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] \leq 
    2\mathbb{E}[\|\nabla {f}_i(\bm{w}_k)\|^2] + 2 \eta_k^2 L^2 t \sum_{t'=0}^{t-1} (\mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t'})\|^2] + \sigma^2).
\end{flalign}
Now summing up \cref{eq:feb28-9} for all $t \in \{0,\ldots,\tau-1\}$, we get:
\begin{flalign}
    \nonumber
    \sum_{t=0}^{\tau-1} \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] & \leq 2\tau (\mathbb{E}[\|\nabla {f}_i(\bm{w}_k)\|^2]) + 2 \eta_k^2 L^2 \sum_{t=0}^{\tau-1} t \sum_{t'=0}^{t-1} (\mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t'})\|^2] + \sigma^2)
    \\
    \label{eq:feb28-10}
    & \leq 2\tau(\mathbb{E}[\|\nabla {f}_i(\bm{w}_k)\|^2]) + \eta_k^2 L^2 \tau^2 \sum_{t=0}^{\tau-1} (\mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] + \sigma^2).
\end{flalign}
Let us set $\eta_k L E \leq 1/2$. Then:
\begin{equation*}
    \sum_{t=0}^{\tau-1} \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] \leq 2 \tau (\mathbb{E}[\|\nabla {f}_i(\bm{w}_k)\|^2]) + \frac{1}{4} \sum_{t=0}^{\tau-1} \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] + \frac{\sigma^2 \tau}{4}.
\end{equation*}
Simplifying, we get:
\begin{equation}
    \label{eq:feb28-11}
    \sum_{t=0}^{\tau-1} \mathbb{E}[\|\nabla {f}_i(\bm{w}^{(i)}_{k, t})\|^2] \leq \frac{\tau}{3}(8 \mathbb{E}[\|\nabla {f}_i(\bm{w}_k)\|^2] + \sigma^2).
\end{equation}
\end{proof}

\end{document}
