% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}
\usepackage{subcaption}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{bbm}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{mathtools}
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}


\newtheorem{theorem}{Theorem}
\newtheorem{proposition}{Proposition}
\theoremstyle{definition}
\newtheorem{definition}{Definition}[section]

\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\newcommand{\abs}[1]{\left\lvert#1\right\rvert}
\renewcommand{\qedsymbol}{$\blacksquare$}
\newcommand{\ts}{\textsuperscript}
\usepackage{subcaption,times}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{import}
%\usepackage{enumitem}
\usepackage{amsmath}
\usepackage{mathtools}
\usepackage{dsfont}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage{csquotes}
\DeclarePairedDelimiter\ceil{\lceil}{\rceil}
\makeatletter
\newcommand{\printfnsymbol}[1]{%
  \textsuperscript{\@fnsymbol{#1}}%
}
\makeatother

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}
\usepackage{framed}
\colorlet{shadecolor}{pink}
\usepackage{authblk}
\usepackage{adjustbox}
\usepackage{bbm}

\usepackage{graphicx}
\usepackage{soul}
\usepackage{subcaption}
\usepackage{booktabs} % for professional tables
\usepackage{tablefootnote}

\usepackage{amsmath,amsthm,amssymb,amsfonts}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{enumerate}
\usepackage{cleveref}
\usepackage{comment}
\usepackage{bm}
\usepackage{pifont}
\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%



\theoremstyle{plain}
\newtheorem{corollary}{Corollary}[theorem]
\newtheorem{lemma}[]{Lemma}
\newtheorem{assumption}{Assumption}[]

\theoremstyle{definition}
\newtheorem{remark}{Remark}[]

\newcommand{\ra}[1]{\renewcommand{\arraystretch}{#1}}
\renewcommand{\qedsymbol}{$\blacksquare$}

\def\x{{\mathbf x}}
\def\z{{\mathbf z}}
\def\a{{\mathbf a}}
\def\A{{\mathbf A}}
\def\P{{\mathbf P}}
\def\r{{\mathbf r}}
\def\s{{\mathbf s}}
\def\u{{\mathbf u}}
\def\y{{\mathbf y}}
\def\I{{\mathbf I}}
\def\e{{\boldsymbol{\nu}}}
\def\Rb{{\mathbb{R}}}
\def\C{{\mathbf C}}

\def\G{{\mathcal G}}
\def\E{{\mathbb E}}
\def\O{{\mathcal O}}
\def\R{{\mathbb{R}}}


\newcommand{\highlight}[1]{%
  \colorbox{yellow!100}{$\displaystyle#1$}}


\title{Faster Non-Convex Federated Learning via Global and Local Momentum}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:rdas@utexas.edu?Subject=Your UAI 2022 paper}{Rudrajit Das}{}}
\author[1]{Anish Acharya \thanks{Equal Contribution}}
\author[2]{Abolfazl Hashemi \printfnsymbol{1}}
\author[1]{Sujay Sanghavi}
\author[1]{Inderjit S. Dhillon}
\author[1]{Ufuk Topcu}
% Add affiliations after the authors
\affil[1]{%
    University of Texas at Austin\\
    %Austin, Texas, USA
    USA
}
\affil[2]{%
    Purdue University\\
    West Lafayette, Indiana, USA%\\
}
  
\begin{document}
\maketitle

\begin{abstract}
We propose \texttt{FedGLOMO}, a novel federated learning (FL) algorithm with an iteration complexity of $\mathcal{O}(\epsilon^{-1.5})$ to converge to an $\epsilon$-stationary point (i.e., $\mathbb{E}[\|\nabla f(x)\|^2] \leq \epsilon$) for smooth non-convex functions -- under arbitrary client heterogeneity and compressed communication -- compared to the $\mathcal{O}(\epsilon^{-2})$ complexity of most prior works. Our key algorithmic idea that enables achieving this improved complexity is based on the observation that the convergence in FL is hampered by two sources of high variance: (i) the global server aggregation step with multiple local updates, exacerbated by client heterogeneity, and (ii) the noise of the local client-level stochastic gradients. The first issue is particularly detrimental to FL algorithms that perform plain averaging at the server. By modeling the server aggregation step as a generalized gradient-type update, we propose a variance-reducing momentum-based global update at the server, which when applied in conjunction with variance-reduced local updates at the clients, enables \texttt{FedGLOMO} to enjoy an improved convergence rate. Our experiments illustrate the intrinsic variance reduction effect of \texttt{FedGLOMO}, which implicitly suppresses client-drift in heterogeneous data distribution settings and promotes communication efficiency.
\end{abstract}

\section{Introduction}
\label{sec:intro}
Federated learning (FL) is a new edge-computing approach that advocates training statistical models directly on remote devices by leveraging enhanced local resources on each device (\cite{mcmahan2017communication}). In a standard FL setting, there are $n$ clients, each having its own training data, and a central server that is trying to train a model, parameterized by $\bm{w} \in \mathbb{R}^d$, using the clients' data.
Suppose the data distribution of the $i^{\text{th}}$ client is $\mathcal{D}_i$.
Then the $i^{\text{th}}$ client has an objective function $f_i(\bm{w})$ which is the expected loss, with respect to some loss function $\ell$, over data drawn from $\mathcal{D}_i$, and the goal of the central server is to optimize the average 
\footnote{In general this may be a weighted average, but here we only consider uniform weights, i.e., each weight is ${1}/{n}$.} 
loss $f(\bm{w})$, over the $n$ clients, i.e.,
\begin{equation}
    \label{eq:fl-intro-1}
    f(\bm{w}) := \frac{1}{n} \sum_{i=1}^{n} {f_i}(\bm{w}) \text{ \& } f_i(\bm{w}) = 
    \mathbb{E}_{\bm{x} \sim \mathcal{D}_i}[\ell(\bm{x},\bm{w})].
\end{equation}
The setting where the data distributions of all the clients are identical, i.e. $\mathcal{D}_1 = \ldots = \mathcal{D}_n$, is typically known as the \enquote{homogeneous} setting. Otherwise, the settings where the data distributions are \textit{not} identical are referred to as the  \enquote{heterogeneous} settings.

The core algorithmic idea of FL -- in the form of \texttt{FedAvg} -- was introduced in \cite{mcmahan2017communication}. In \texttt{FedAvg} (summarized in \Cref{alg:fed-avg}), a \textit{subset} of the clients perform \textit{multiple} steps of gradient descent based updates on their local data and then communicate back their respective updates to the server, which then averages them to update the global model (hence the name \texttt{FedAvg}).
This idea of performing multiple local updates before averaging once reduces the communication cost required for training. Another essential strategy in FL to cut down the communication cost is to have the clients send compressed/quantized messages to the server in every round -- this is of particular significance for training deep learning models where the number of model parameters is in millions or more.

In practice however, performing multiple local updates on clients with \textit{heterogeneous} data distributions leads to the so-called phenomenon of \enquote{{client drift}}, wherein the individual client updates do not align well (due to over-fitting on the local client data) inhibiting the convergence of \texttt{FedAvg} to the optimum of the average loss over all the clients. In this paper, we identify the high variance associated with the simple averaging step of \texttt{FedAvg} for the global update to be at the heart of this issue.

Ever since the development of FL, significant attention has been devoted to analyzing \texttt{FedAvg} under different settings, and to modifying \texttt{FedAvg} using ideas from centralized optimization to accelerate the training or to reduce the communication cost; we discuss these works in \Cref{rel-work}. Compared to centralized optimization, a formidable challenge in the theoretical analysis of FL algorithms is the use of multiple local updates in the clients, which is compounded by the \textit{heterogeneous} nature of data distribution among the clients. To limit the extent of client heterogeneity, a standard assumption in FL theory is the \textit{bounded client dissimilarity (BCD) assumption}, i.e.,
\begin{equation}
    \label{eq:bcd}
    \frac{1}{n}\sum_{i=1}^{n}\|\nabla f_i(\bm{w}) - \nabla f(\bm{w})\|^2 \leq G^2 \text{ } \forall \text{ } \bm{w},
\end{equation}
for some large enough constant $G < \infty$ (e.g., see A1 in \cite{karimireddy2020mime}). But this assumption is limiting as it does not allow for \textit{arbitrarily large client heterogeneity}.

{Recently, \cite{arjevani2019lower} showed that the {stochastic first-order complexity} of any
algorithm in the \textit{centralized setting} to reach an $\epsilon$-stationary point (i.e., $\mathbb{E}[\|\nabla f(\bm{x})\|^2] \leq \epsilon$) for \textit{smooth non-convex functions} is $\Omega(\epsilon^{-1.5})$. It is well known that vanilla SGD has a suboptimal complexity of $\mathcal{O}(\epsilon^{-2})$ as it cannot mitigate the high variance of the stochastic gradient noise. Recognizing this issue, \textit{variance-reducing} techniques for SGD (\cite{fang2018spider,zhou2018stochastic,cutkosky2019momentum,liu2020optimal}) have been proposed that attain the optimal complexity of $\mathcal{O}(\epsilon^{-1.5})$. 
Coming to the federated setting, as we discuss in this paper, in addition to the noise in the \textit{local} client-level stochastic gradients, one has to also contend with the high variance associated with the \textit{global} server aggregation step which depends on the client heterogeneity and the number of local update steps. In this case, as we argue in the subsequent sections, applying only local client-level variance-reduction is not enough for improving the iteration complexity of vanilla \texttt{FedAvg} beyond $\mathcal{O}(\epsilon^{-2})$ for smooth, non-convex losses.

To alleviate the issue of variance due to heterogeneity, we propose a novel FL algorithm with \textit{compressed communication} called \texttt{FedGLOMO} (\Cref{alg:2} and \ref{alg:2-local}) which applies \texttt{G}\textit{lobal} as well as \texttt{LO}\textit{cal} \textit{variance-reducing} \texttt{MO}\textit{mentum} to the server update and client updates, respectively.
We prove that the iteration complexity of \texttt{FedGLOMO} is $\mathcal{O}(\epsilon^{-1.5})$ in the smooth non-convex case, which is better than the $\mathcal{O}(\epsilon^{-2})$ complexity of related works in the FL setting; see \Cref{tb:comp} and \Cref{nov4-thm1}.
Further, our theory does not use the BCD assumption, i.e. \cref{eq:bcd}, which is a standard assumption in related works. Instead, we propose and use \Cref{as-het}, which is a more realistic and \textit{empirically verified} assumption on the client drift, even allowing for arbitrary client heterogeneity. 
It is worth mentioning here that for FL, \cite{karimireddy2020mime} also propose an algorithm (\texttt{MimeMVR}) which is shown to attain this improved complexity of $\mathcal{O}(\epsilon^{-1.5})$ but \textit{with} the BCD assumption and \textit{no} compressed communication; we %discuss more 
talk about this at the end of \Cref{rel-work}.}

We summarize our \textbf{contributions} next:
    
\textbf{(a)} We propose \texttt{FedGLOMO} (Alg. \ref{alg:2} and \ref{alg:2-local}), in which we apply a \textit{novel global momentum term at the server} in addition to  \textit{local momentum at the clients}. The design of \texttt{FedGLOMO} is motivated by two critical issues that need to be alleviated to accelerate convergence in FL; these are the high variances associated with: (i) the \textit{global} server aggregation step due to heterogeneity of clients when there are multiple local updates, and (ii) the noise of \textit{local} client-level stochastic gradients. Global and local momentum result in \textit{variance reduction} for the global server update and the local client updates, allowing us to tackle (i) and (ii), respectively. This enables \texttt{FedGLOMO} to converge to an $\epsilon$-stationary point (i.e., $\mathbb{E}[\|\nabla f(\bm{x})\|^2] \leq \epsilon$) for smooth non-convex functions in $\mathcal{O}(\epsilon^{-1.5})$ gradient-based updates, which is better than the $\mathcal{O}(\epsilon^{-2})$ complexity of most related works in the FL setting; see \Cref{tb:comp} and \Cref{nov4-thm1}.
    
\textbf{(b)} Unlike prior work, our theory does not use the limiting {bounded client dissimilarity assumption} (i.e., \cref{eq:bcd}). Instead, to tighten our result, we propose and use \Cref{as-het} -- which is a novel assumption on the client drift, even allowing for \textit{arbitrary client heterogeneity} in the worst case. We empirically verify that \Cref{as-het} holds for \texttt{FedGLOMO} as well as \texttt{FedAvg}. Theoretically, we also show that \Cref{as-het} holds for \textit{any} FL algorithm in the case of linear regression and also with networks whose training dynamics follow that of a linearized model (a.k.a. the \enquote{NTK} regime). Refer to the discussion after \Cref{as-het} and \Cref{rem-sep21-2} for details. 
    
\textbf{(c)} \texttt{FedGLOMO} is the \textit{first FL algorithm} achieving $\mathcal{O}(\epsilon^{-1.5})$ complexity while allowing \textit{compressed client-to-server communication}. We emphasize that from the theory perspective, applying compression in \texttt{FedGLOMO} is not trivial and the most obvious approach does not work; see \Cref{rem-sep21-3}. 
    
\textbf{(d)} In \Cref{sec:exp}, experiments on CIFAR-10 and Fashion-MNIST (\cite{xiao2017fashion}) show that in a highly heterogeneous setting of at most two (out of ten) classes per client, \texttt{FedGLOMO} requires only about \textit{one-third} the number of bits used by \texttt{FedAvg} with PyTorch's default momentum applied to the local client updates; see \Cref{fig:1}. Our experiments also illustrate the variance reduction provided by our scheme which implicitly mitigates client-drift under heterogeneous data distribution and in turn promotes communication-efficiency.
    
\section{Related Work}
\label{rel-work}
\textbf{\texttt{FedAvg} and related methods:}
\cite{reisizadeh2020fedpaq} propose \texttt{FedPAQ} which is basically \texttt{FedAvg} (\cite{mcmahan2017communication}) with quantized client-to-server communication, and establish its convergence for the homogeneous case. \cite{li2019convergence} establish the convergence of \texttt{FedAvg} for strongly convex functions with heterogeneity (assuming bounded client dissimilarity) but without any compressed communication. \cite{haddadpour2020federated} propose \texttt{FedCOMGATE} which incorporates gradient tracking (\cite{pu2020distributed}) and derive results with data heterogeneity and quantized communication. 
\cite{karimireddy2019scaffold} propose \texttt{SCAFFOLD} which uses control-variates to mitigate the client-drift owing to the heterogeneity of clients.
\cite{li2018federated} present \texttt{FedProx} which adds a proximal term to control the deviation of the client parameters from the global server parameter in the previous round. \cite{reddi2020adaptive} propose federated versions of commonly used adaptive optimization methods and prove their convergence under heterogeneity. 
Local SGD (\cite{zinkevich2010parallelized,stich2018local,yu2018parallel,wang2018cooperative,basu2019qsparse,stich2019error,patel2019communication,woodworth2020local,bayoumi2020tighter,liang2019variance,koloskova2020unified}) is very similar to FL and is essentially based on the same principle as \texttt{FedAvg}. However, in local SGD, there is usually no data heterogeneity and all the clients participate in each round (known as \enquote{full device participation}); neither of these holds in FL, and both simplify the derivation of convergence results.
\\
\cite{wang2019slowmo, huo2020faster} present momentum-based updates at the server without any improvement in the %order-wise 
convergence rate as compared to momentum-free updates. \cite{qu2020federated} present Nesterov accelerated \texttt{FedAvg} for convex objectives. \cite{karimireddy2020mime} propose \texttt{Mime}(\texttt{MVR}) which applies momentum at the client-level based on globally computed statistics to control client-drift. \cite{khanduri2021stem} propose \texttt{STEM} which applies momentum globally and locally for local SGD; however, their server aggregation step is just plain averaging as they do not have to deal with server-side variance reduction, since all the clients participate in local SGD.

\begin{table*}[t]
\caption{Number of gradient updates, i.e., $T$, required to achieve $\mathbb{E}[\|\nabla f(\bm{w})\|^2] \leq \epsilon$ on smooth non-convex functions. Here, $n$ is the total number of clients and $r$ is the number of clients participating in each round. \enquote{Client Participation} asks whether all ($r=n$) or only a subset ($r<n$) of the clients participate in each round.
\enquote{BCD?} asks if the bounded client dissimilarity assumption (\cref{eq:bcd}) is used or not. \enquote{Compression?} asks whether compressed communication is involved or not.
\\
$*1$: $\alpha \leq n$ is a problem-dependent quantity; in practice, {we expect $\alpha \ll n$} as confirmed in our experiments.
}
\label{tb:comp}
\ra{1}
\begin{adjustbox}{width=\textwidth}
\begin{tabular*}{\linewidth}{@{}ccccc@{}}\toprule
Ref. & $T$ & Client Participation & BCD? & Compression? \\ \midrule
\cite{koloskova2020unified,wang2019slowmo} &$\mathcal{O}(\frac{1}{n\epsilon^2})$&Full ($r=n$)&Yes&{\color{red}\xmark}
\\\midrule
\cite{haddadpour2020federated} &$\mathcal{O}(\frac{1}{n\epsilon^2})$&Full %Device 
($r=n$)&Yes&{\color{black}\cmark}
\\\midrule
\cite{khanduri2021stem} &$\mathcal{O}(\frac{1}{n\epsilon^{1.5}})$&Full ($r=n$)&Yes&{\color{red}\xmark}
\\\midrule
\cite{karimireddy2019scaffold}&$\O(\frac{1}{r \epsilon^2})$&Partial ($r<n$)&Yes&{\color{red}\xmark}
\\\midrule
\cite{karimireddy2020mime}&$\mathcal{O}\big(\frac{1}{\sqrt{r}\epsilon^{1.5}}\big)$&Partial ($r<n$)&Yes&{\color{red}\xmark}
\\\midrule
\textbf{This work} (\texttt{FedGLOMO})
&$\mathcal{O}\big(\max\big(\sqrt{\frac{\alpha}{n}}, {\frac{1}{\sqrt{r}}}\big)\frac{1}{\epsilon^{1.5}}\big)^{*1}$&Partial ($r<n$)&\textbf{No}&{\color{black}\cmark}
\\
\bottomrule
\end{tabular*}
\end{adjustbox}
\end{table*}

\textbf{Distributed optimization with compression:} References \cite{alistarh2017qsgd,suresh2017distributed,reisizadeh2020fedpaq,haddadpour2020federated,tang2018communication,wu2018error,bernstein2018signsgd,alistarh2018convergence,lin2017deep,stich2018sparsified,basu2019qsparse,hashemi2020delicoco,chen2020communication,chen2021communication} aim to minimize the communication bottleneck in distributed optimization by transmitting compressed messages to the central server, and establish the convergence of the resulting methods. \cite{horvath2019stochastic,gorbunov2021marina} provide distributed algorithms with improved convergence rates by also applying variance reduction and periodically using full gradients; however, there are no multiple local updates in these works. In {Appendix D}, we compare our work's complexity against that of \cite{gorbunov2021marina}. In this work, we employ the quantization operator of \cite{alistarh2017qsgd}.

\textbf{Complexity for smooth non-convex stochastic optimization:} 
\cite{arjevani2019lower} show that the optimal stochastic first-order complexity to reach an $\epsilon$-stationary point (i.e., $\mathbb{E}[\|\nabla f(\bm{x})\|^2] \leq \epsilon$) is $\mathcal{O}(\frac{\sigma}{\epsilon^{1.5}})$ where $\sigma^2$ is the variance of the stochastic gradients. 
Unfortunately, vanilla SGD is suboptimal and \textit{variance-reducing} techniques must be applied to attain the optimal complexity; some noteworthy works on variance-reduction for SGD are \texttt{SVRG} (\cite{johnson2013accelerating}), \texttt{SAGA} (\cite{defazio2014saga}) and \texttt{SARAH} (\cite{nguyen2017sarah}). SVRG-style algorithms such as \texttt{SPIDER} (\cite{fang2018spider}) and \texttt{SNVRG} (\cite{zhou2018stochastic}) attain this optimal complexity by periodically using giant batch sizes. \cite{cutkosky2019momentum} propose \texttt{STORM} which also attains this optimal complexity with adaptive learning rates, but without using any large batches. The key idea of \texttt{STORM} is momentum-based variance reduction, obtained by using the stochastic gradient at the previous point \textit{computed over the same batch} on which the stochastic gradient at the current point is computed. \cite{liu2020optimal} present a much simpler proof for essentially the same algorithm by employing a constant learning rate and requiring a large batch size only at the first iteration. Our key idea of global and local momentum is \texttt{STORM}-like \textit{variance-reducing} momentum applied to the aggregation step at the server, interpreted as a generalized gradient-type update, and the local client updates, respectively; see \Cref{sec:main}.

\Cref{tb:comp} compares the complexities of the most relevant related works in FL ($r<n$) and local SGD ($r=n$) with ours on smooth non-convex functions. Note that under the more challenging FL setting with partial-device participation, only \texttt{FedGLOMO} and \texttt{MimeMVR} (\cite{karimireddy2020mime}) attain the improved iteration complexity of $\mathcal{O}(\epsilon^{-1.5})$ with respect to $\epsilon$. 
However, unlike \cite{karimireddy2020mime}, our work does not rely on the bounded client dissimilarity assumption (\cref{eq:bcd}) and allows for compressed client-to-server communication, in which case  maintaining the improved complexity is not trivial; for details, see Remarks \ref{rem-sep21-2} and \ref{rem-sep21-3}, respectively. There are meaningful algorithmic differences between our work and \cite{karimireddy2020mime} too. The biggest one is that while we explicitly apply momentum in the server aggregation step (global momentum) as well as in the client updates (local momentum), \cite{karimireddy2020mime} only apply \textit{globally computed} momentum in the local client updates. 
For a detailed discussion of the differences of our work from \cite{karimireddy2020mime}, see {Appendix C}.
Since \texttt{Mime} is designed to deal with client drift, we empirically compare it against \texttt{FedGLOMO} without compression in a highly heterogeneous setting in \Cref{sec:exp}.

\section{Preliminaries}
\label{sec:prelim}
Recall the setting and the optimization problem that the server is trying to solve as defined in \cref{eq:fl-intro-1}. We assume that the clients have access to unbiased stochastic gradients of their individual losses. We denote the stochastic gradient of $f_i$ at $\bm{w}$ computed over a batch of samples $\mathcal{B}$, by $\widetilde{\nabla} f_i(\bm{w};\mathcal{B})$. Also in this paper, $K$ is the number of communication rounds, $E$ is the number of local updates per round or the period, and $T = KE$ is the total number of local updates or the (order-wise) number of gradient-based updates. Further, $r$ is the number of clients that the server accesses in each round, i.e., the global batch size.

Vectors and matrices are written in boldface. For any positive integer $m$, the set $\{1,\ldots,m\}$ is denoted by $[m]$, and the uniform distribution over the set  $\{0,\ldots,m\}$ is denoted by $\text{Unif}[0,m]$.
$\mathbbm{1}(\cdot)$ is the indicator function. Next, we recap smooth functions.
\begin{definition}[\textbf{Smoothness}]
A function $g:\Theta \to \mathbb{R}$ is said to be $L$-smooth if for all $\bm{\theta}, \bm{\theta}' \in \Theta$, $\|\nabla g(\bm{\theta}) - \nabla g(\bm{\theta}')\| \leq L\|\bm{\theta} - \bm{\theta}'\|$. For all $\bm{\theta}, \bm{\theta}' \in \Theta$, we also have: $g(\bm{\theta}') \leq g(\bm{\theta}) + \langle \nabla g(\bm{\theta}), \bm{\theta}' - \bm{\theta} \rangle + \frac{L}{2}\|\bm{\theta}' - \bm{\theta}\|^2$.
\end{definition}
\section{\texttt{FedGLOMO}: \texttt{G}lobal and \texttt{LO}cal \texttt{MO}mentum-Based Variance Reduction}
\label{sec:main}
\begin{algorithm}[t]
	\caption{\texttt{FedGLOMO} - Server Update}
	\label{alg:2}
	\begin{algorithmic}[1]
		\STATE {\bfseries Input:} Initial point $\bm{w}_0$, \# of rounds of communication $K$, period $E$, learning rates  $\{\eta_{k}\}_{k=0}^{K-1}$ and global batch size $r$. $Q_D$ is the quantization operator. Set $\bm{w}_{-1} = \bm{w}_0$.
		\FOR{$k =0,\dots, K-1$}
		\STATE 
		Server sends $\bm{w}_k$, $\bm{w}_{k-1}$ to a set $\mathcal{S}_k$ of $r$ clients chosen uniformly at random w/o replacement.
		\FOR{client $i \in \mathcal{S}_k$}
		\STATE Set $\bm{w}_{k,0}^{(i)} = \bm{w}_k$ and $\widehat{\bm{w}}_{k-1,0}^{(i)} = \bm{w}_{k-1}$. Run \Cref{alg:2-local} for client $i$.
		\ENDFOR
		\IF{$k = 0$}
		\label{step-0}
		\STATE Set
		$\bm{u}_{k} = \frac{1}{r}\sum_{i \in \mathcal{S}_k}Q_D({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}})$.
		\label{glob-mom-0}
		\ELSE
		\STATE Set
		$\bm{u}_{k} = \frac{\beta_k}{r} \sum_{i \in \mathcal{S}_k}Q_{D}(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}}) + 
		(1-\beta_k)\bm{u}_{k-1} + \frac{(1-\beta_k)}{r} \sum_{i \in \mathcal{S}_k} Q_{D}((\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}}))$.
		{\color{blue} // \texttt{(Global Momentum)}}\label{glob-mom}
		\ENDIF
		\STATE Update $\bm{w}_{k+1} = \bm{w}_{k} - \bm{u}_k$.
		\ENDFOR
	\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t]
	\caption{\texttt{FedGLOMO} - Client Update}
	\label{alg:2-local}
	\begin{algorithmic}[1]
		\FOR{$\tau = 0,\ldots,E-1$}
		\IF{$\tau = 0$}
		\STATE Set $\bm{v}_{k,\tau}^{(i)} = {\nabla} f_i(\bm{w}_{k,\tau}^{(i)})$,  $\widehat{\bm{v}}_{k-1,\tau}^{(i)} = {\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau}^{(i)})$.
		\label{l1}
		\ELSE
		\STATE Pick a random batch of samples 
		in client $i$, say $\mathcal{B}_{k,\tau}^{(i)}$. Compute the stochastic gradients 
		of $f_i$ at $\bm{w}_{k,\tau}^{(i)}$, $\widehat{\bm{w}}_{k-1,\tau}^{(i)}$, $\bm{w}_{k,\tau-1}^{(i)}$ and $\widehat{\bm{w}}_{k-1,\tau-1}^{(i)}$ over $\mathcal{B}_{k,\tau}^{(i)}$ viz.
		$\widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$, $\widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$, $\widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$ and $\widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$. 
		\paragraph{}
		\STATE 
		Update: $\bm{v}_{k,\tau}^{(i)} = \widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + \big(\bm{v}_{k,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\bm{w}_{k,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})\big)$ and
		\\
		$\widehat{\bm{v}}_{k-1,\tau}^{(i)} = \widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)}) + \big(\widehat{\bm{v}}_{k-1,\tau-1}^{(i)} - \widetilde{\nabla} f_i(\widehat{\bm{w}}_{k-1,\tau-1}^{(i)};\mathcal{B}_{k,\tau}^{(i)})\big)$. 
		{\color{blue} // \texttt{(Local Mom.)}} \label{l2}
		\ENDIF
		\STATE Update $\bm{w}_{k,\tau+1}^{(i)} = \bm{w}_{k,\tau}^{(i)} - \eta_{k}\bm{v}_{k,\tau}^{(i)}$ and  $\widehat{\bm{w}}_{k-1,\tau+1}^{(i)} = \widehat{\bm{w}}_{k-1,\tau}^{(i)} - \eta_{k}\widehat{\bm{v}}_{k-1,\tau}^{(i)}$.
		\label{l3}
		\ENDFOR
		\paragraph{}
		\STATE Send $Q_{D}(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}})$ and $Q_{D}((\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}}))$ 
		to the server.
		\label{comp}
	\end{algorithmic}
\end{algorithm}

\begin{algorithm}[t]
	\caption{\texttt{FedAvg} \cite{mcmahan2017communication}
	}
	\label{alg:fed-avg}
	\begin{algorithmic}[1]
		\STATE {\bfseries Input:} 
		Initial point $\bm{w}_0$, \# of communication rounds $K$, period $E$, learning rates  $\{\eta_{k}\}_{k=0}^{K-1}$ and global batch size $r$.
		\FOR{$k =0,\dots, K-1$}
		\STATE Server sends $\bm{w}_k$ to a set $\mathcal{S}_k$ of $r$ clients chosen uniformly at random w/o replacement.
		\FOR{client $i \in \mathcal{S}_k$}
		\STATE Set $\bm{w}_{k,0}^{(i)} = \bm{w}_k$.
		\FOR{$\tau = 0,\ldots,E-1$}
		\STATE Pick a random batch of samples 
		in client $i$, $\mathcal{B}_{k,\tau}^{(i)}$.
		Compute the stochastic gradient of $f_i$ at $\bm{w}_{k,\tau}^{(i)}$ over $\mathcal{B}_{k,\tau}^{(i)}$, viz. $\widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$.
		\STATE Update $\bm{w}_{k,\tau+1}^{(i)} = \bm{w}_{k,\tau}^{(i)} - \eta_{k} \widetilde{\nabla} f_i(\bm{w}_{k,\tau}^{(i)};\mathcal{B}_{k,\tau}^{(i)})$.
		\ENDFOR
		\STATE Send $(\bm{w}_k - \bm{w}_{k,E}^{(i)})$ to the server.
		\label{line:fedavg-1}
		\ENDFOR
		\STATE Update $\bm{w}_{k+1} = \bm{w}_k -  \frac{1}{r}\sum_{i \in \mathcal{S}_k}(\bm{w}_k - \bm{w}_{k,E}^{(i)})$.
		\label{line:fedavg-2}
		\ENDFOR
	\end{algorithmic}
\end{algorithm}
There are two issues that need to be alleviated for improving the convergence rate in FL: (i) the high variance of simple averaging used in the \textit{global} server aggregation step (of \texttt{FedAvg}), when there are multiple local updates, which is exacerbated by heterogeneity of the clients, and (ii) the high variance associated with the noise of \textit{local} client-level stochastic gradients. The key idea of \texttt{FedGLOMO} (\Cref{alg:2} and \ref{alg:2-local}) is to apply \textit{variance-reducing} \textbf{global} and \textbf{local} momentum to combat (i) and (ii), respectively. We now describe {global} and {local} momentum in detail.

\textbf{Global} momentum is applied to the server aggregation step which is line \ref{glob-mom} in \Cref{alg:2}. To understand it better, let us revisit \texttt{FedAvg} (summarized in \Cref{alg:fed-avg}, although in a slightly different way than usual) and its server aggregation step (line \ref{line:fedavg-2}) which is just simple averaging. Similar to the update of SGD suffering from high variance, this naive averaging step -- which we think of as the average of a batch of generalized stochastic gradients -- is characterized by high variance stemming from heterogeneity and multiple local updates. So, this way of server aggregation slows down the convergence rate of \texttt{FedAvg} (and other related methods).

In this paper, we re-envision the server aggregation as a generalized gradient-based update by thinking of $(\bm{w}_k - \bm{w}_{k,E}^{(i)})$ as the generalized  gradient. Then, we wish to incorporate the style of variance-reducing momentum applied in \texttt{STORM} (\cite{cutkosky2019momentum,liu2020optimal}) to our generalized gradient-based update; note that their method is for stochastic gradients in the case of centralized optimization. 
To that end, let us briefly recap \texttt{STORM}'s update rule. For a function $h(\bm{z})$, \texttt{STORM}'s update for the $j^{\text{th}}$ iteration is:
\begin{multline}
    \label{apr20-1}
    \bm{z}_{j+1} = \bm{z}_j - \eta_j \bm{v}_j, \text{ where } 
    \bm{v}_j = 
    \{\widetilde{\nabla} h(\bm{z}_j;\xi_j) + 
    \\
    (1-\beta_j) (\bm{v}_{j-1} - \widetilde{\nabla} h(\bm{z}_{j-1};\xi_j))\mathbbm{1}(j>0)\}.
\end{multline}
In \cref{apr20-1}, $\xi_j$ denotes the source of randomness in the $j^{\text{th}}$ iteration and $\beta_j \in [0,1)$ is the momentum parameter. Note the use of the stochastic gradient at $\bm{z}_{j-1}$ computed on $\xi_j$.
Coming back to \Cref{alg:2}, the quantity $\bm{u}_k$ plays the role of $\bm{v}_j$ in  \cref{apr20-1}. To see this clearly, let us analyze $\mathbb{E}_{Q_D}[\bm{u}_k]$ (see lines \ref{glob-mom-0} and \ref{glob-mom} in \Cref{alg:2}).
Under \Cref{as5}, the compression operator $Q_D$ produces an unbiased estimate of the input. Then defining ${g}(\bm{w}_k;\mathcal{S}_k) \triangleq \frac{1}{r}\sum_{i \in \mathcal{S}_k}({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}})$ and $\widehat{g}(\bm{w}_{k-1};\mathcal{S}_k) \triangleq \frac{1}{r}\sum_{i \in \mathcal{S}_k}({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})$, we have:
\begin{multline}
    \label{apr20-3}
    \mathbb{E}_{Q_D}[\bm{u}_k] = \{{g}(\bm{w}_k;\mathcal{S}_k) + 
    \\
    (1-\beta_k)\big(\bm{u}_{k-1} - \widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)\big) \mathbbm{1}(k > 0)\}.
\end{multline}
In \cref{apr20-3}, ${g}(\bm{w}_k;\mathcal{S}_k)$ and $\widehat{g}(\bm{w}_{k-1};\mathcal{S}_k)$ play the roles of $\widetilde{\nabla} h(\bm{z}_j;\xi_j)$ and $\widetilde{\nabla} h(\bm{z}_{j-1};\xi_j)$, respectively. 
With this, one can clearly see that \cref{apr20-3} is the analogue of \cref{apr20-1} for the global server aggregation in FL. However, this equivalence is not so apparent without looking at the expected value of $\bm{u}_k$ with respect to ${Q}_D$; in fact, the choice of quantities that are compressed in {line \ref{comp} of Alg. \ref{alg:2-local}} and used in line \ref{glob-mom} of Alg. \ref{alg:2} is crucial for establishing provable guarantees (also see \Cref{rem-sep21-3}).

Now that we understand global momentum, let us move on to \textbf{local} momentum. For this see lines \ref{l1}, \ref{l2} and \ref{l3} in \Cref{alg:2-local}; these give us $(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}})$ and $({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})$ after running for $E$ steps. But notice that these lines are the same as \cref{apr20-1} with $\beta_j = 0$ and the stochastic gradient at the first iteration replaced by the full gradient. It is worth mentioning here that these local updates are also similar to \texttt{SPIDER} which is an SVRG-style update proposed in \cite{fang2018spider}. However, recognizing that this is also a special case of the \texttt{STORM} update with $\beta_j = 0$, we prefer calling it momentum in order to have a unifying terminology for both the global and local updates. 

One might wonder what is the role of global momentum as \texttt{SPIDER} can be extended to improve the complexity in distributed optimization \textit{without multiple local updates}. For this, in {Appendix F}, we consider \texttt{FedLOMO} (Algorithms {4 and 5 in the Appendix}) which is a simpler version of \texttt{FedGLOMO} with only local momentum and \textit{no} global momentum (i.e., plain averaging at the server which is equivalent to setting $\beta_k = 1$ in \Cref{alg:2}),
and show that it does not achieve $\mathcal{O}(\epsilon^{-1.5})$ complexity under partial-device participation and compression ({see Theorem 3 in Appendix}). The root cause of this is client heterogeneity which amplifies its effect under \textit{multiple local updates}; without incorporating some form of variance reduction in the server aggregation step, the complexity cannot be improved.

Let us try to provide some intuition as to how incorporating global momentum helps. Suppose we keep $\eta_k = \eta$ and $\beta_k = \beta < 1$ for all $k$. Theoretically, we get a lower bound for $\beta$ which is $\mathcal{O}(\eta^2)$. Then with this momentum-based aggregation strategy, the variance reduces by a factor of $\mathcal{O}(\beta/\eta) = \mathcal{O}(\eta)$ as compared to aggregation by plain averaging.
(There are some other terms too but these are sufficiently small.)
This reduction in the variance by a factor of $\mathcal{O}(\eta)$ is what improves the convergence rate of \texttt{FedGLOMO}.

It is true that \texttt{FedGLOMO} has to communicate twice the amount of information per round as compared to \texttt{FedAvg} or \texttt{FedPAQ} (\cite{reisizadeh2020fedpaq}) which is just \texttt{FedAvg} with compressed communication. One can set the precision of the quantizer sufficiently low to account for the extra per-round communication cost of \texttt{FedGLOMO} -- we adopt this approach in our experiments. Also, we only assume access to the full client gradient in line \ref{l1} of Alg. \ref{alg:2-local} for simplicity of analysis, but our main result (i.e., \Cref{nov4-thm1}) can be readily extended to the case of large enough batch sizes.

\section{Main Result for \texttt{FedGLOMO}}\label{sec:result:glomo}
First, we state our assumptions.
\begin{assumption}[\textbf{Smoothness}]
\label{as1} 
$\ell(\bm{x},\bm{w})$ is $L$-smooth with respect to $\bm{w}$, for all $\bm{x}$. Thus, each $f_i(\bm{w})$ ($i \in [n]$) is $L$-smooth, and so is $f(\bm{w})$.
\end{assumption}

\begin{assumption}[\textbf{Non-negativity}]
\label{as-may15}
Each $f_i(\bm{w})$ is non-negative and therefore, $f_i^{*} \triangleq \min_{\bm{w}} f_i(\bm{w}) \geq 0$.
\end{assumption}
Most loss functions used in practice satisfy this anyway, and if not, we can just add a constant offset to achieve non-negativity. 

\begin{assumption}[\textbf{Quantization}]\label{as5}
The quantization operator $Q_D$ in Alg. \ref{alg:2} and \ref{alg:2-local} is unbiased, i.e., $\mathbb{E}[Q_D(\bm{x}) | \bm{x}] = \bm{x}$, and its variance satisfies $\mathbb{E}[\|Q_D(\bm{x})-\bm{x}\|^2 | \bm{x}] \leq q\|\bm{x}\|^2$ for some $q > 0$. The \enquote{qsgd} operator proposed in Section 3.1 of \cite{alistarh2017qsgd} satisfies \Cref{as5}.
\end{assumption}

\begin{assumption}[\textbf{Client Drift/Heterogeneity}]\label{as-het}
Let $\mathcal{A}$ be an FL algorithm with $E$ local update steps and $K$ communication rounds. Let $\bm{w}_{k,\tau}^{(i)}$ be the $i^{\text{th}}$ client's local parameter at the start of the $(\tau+1)^{\text{st}}$ local step of the $(k+1)^{\text{st}}$ round of $\mathcal{A}$, for $i \in [n]$ (similar to the notation in Alg. \ref{alg:2}, \ref{alg:2-local}, and \ref{alg:fed-avg}).
Define $\widetilde{\bm{e}}_{k,\tau}^{(i)} \triangleq \nabla f_i(\bm{w}_{k,\tau}^{(i)}) - \nabla f_i\big(\frac{1}{n}\sum_{j \in [n]} \bm{w}_{k,\tau}^{(j)}\big)$.
Then for some $\alpha \ll n$, the following holds:
\begin{equation}
    \label{het-eq}
    \mathbb{E}\Big[\Big\|\sum_{i \in [n]}\widetilde{\bm{e}}_{k, \tau}^{(i)}\Big\|^2\Big] \leq \alpha \sum_{i \in [n]} \mathbb{E}\Big[\Big\|\widetilde{\bm{e}}_{k,\tau}^{(i)}\Big\|^2\Big],
\end{equation}
$\forall$ $\tau \in \{0,\ldots,E-1\}$ and $k \in \{0,\ldots,K-1\}$. The expectation above is w.r.t. any stochasticity in the local updates.
\end{assumption}
\Cref{het-eq} in the above assumption always holds with $\alpha = n$ for any FL algorithm; this follows from the fact that for any $m > 1$ vectors $\{\bm{a}_j\}_{j=1}^m$, $\|\sum_{j=1}^m \bm{a}_j\|^2 \leq m \sum_{j=1}^m \|\bm{a}_j\|^2$ (this can be obtained by using the Cauchy-Schwarz inequality). However, we empirically observe $\alpha \ll n$ in practice for \texttt{FedGLOMO} as well as \texttt{FedAvg}; see \Cref{sec:het-asm-expt} and {Appendix H}, respectively. 
The value of $\alpha$ in \Cref{as-het} is a measure of the amount of client drift induced by the algorithm which also depends on the degree of heterogeneity in the system -- as the heterogeneity increases (decreases), we observe $\alpha$ to also increase (decrease). 

From \Cref{fig:het0} (in \Cref{sec:het-asm-expt}), we see that for the highly heterogeneous setting that we consider for our experiments in \Cref{sec:exp}, $\alpha < 0.06 n$ for most of the trajectory of \texttt{FedGLOMO} on both CIFAR-10 and Fashion-MNIST (abbreviated as FMNIST). In the homogeneous case, $\alpha < 0.03 n$ and $\alpha < 0.02 n$ for most of the trajectory on CIFAR-10 and FMNIST, respectively. We observe a similar trend of $\alpha$ for \texttt{FedAvg} in {Appendix H}. Additionally, we derive a convergence result for \texttt{FedAvg} under \Cref{as-het} and without the bounded client dissimilarity assumption (i.e., \cref{eq:bcd}) in {Appendix H}. 

{\textbf{Some theoretical motivation for \Cref{as-het}:} Let us consider \textit{linear regression} to provide a scenario where $\alpha = 0$ provably for \textbf{any} FL algorithm. Suppose in client $i$, we have feature and label pairs $(\bm{x}, y) \sim (\mathcal{X}_i, \mathcal{Y}_i)$, where the label
\[y = \langle \bm{w}_i^{\ast}, \bm{x} \rangle + \xi,\]
with $\xi \sim \mathcal{N}_i$ being {independent zero-mean} client-dependent random noise. Obviously, the label distribution $\mathcal{Y}_i$ here depends on the feature distribution $\mathcal{X}_i$, noise distribution $\mathcal{N}_i$ and $\bm{w}_i^{\ast}$. We assume that the covariance matrix of the feature vectors is the same across all the clients, i.e., $\mathbb{E}_{\bm{x} \sim \mathcal{X}_i}[\bm{x} {\bm{x}}^T] = \bm{Q}$ for all $i \in [n]$; this is possible for e.g., by normalization or whitening of the features. Note that by assuming the same covariance matrix across all the clients, we are \textit{not} assuming that the feature distributions are the same across clients, but even if they are, there is heterogeneity through the different label distributions. Then, with the squared loss, our per-client objective function is: 
\[f_i(\bm{w}) = \mathbb{E}_{(\bm{x}, y) \sim (\mathcal{X}_i, \mathcal{Y}_i)}\Big[\frac{1}{2}(y - \langle \bm{w}, \bm{x} \rangle)^2\Big].\]
With the aforementioned conditions, it can be verified that $\nabla f_i(\bm{w}) = \bm{Q}(\bm{w} - \bm{w}_i^{\ast})$. Thus,
\[\widetilde{\bm{e}}_{k, \tau}^{(i)} = \bm{Q} \Big(\bm{w}_{k, \tau}^{(i)} - \frac{1}{n}\sum_{j \in [n]}\bm{w}_{k, \tau}^{(j)}\Big),\]
and so $\sum_{i \in [n]} \widetilde{\bm{e}}_{k, \tau}^{(i)} = \vec{0}$. So, \textit{\Cref{as-het} holds here with $\alpha = 0$ for any FL algorithm}. 

In fact, the above analysis and result (i.e., $\alpha = 0$) can be extended to networks whose training dynamics follow that of a linearized model, which has been shown to be the case for infinite-width networks (see, e.g., \cite{lee2019wide} and \cite{jacot2018neural}) and has also been used in applications for finite-width networks (e.g., in \cite{mu2020gradients}).}

We now present the abridged version of the convergence result of \texttt{FedGLOMO}, followed by some important remarks. Its full version and detailed proof are in {Appendix A and G.1}, respectively.

\begin{theorem} [\textbf{Smooth non-convex}]
\label{nov4-thm1}
Let Assumptions \ref{as1}, \ref{as-may15} and \ref{as5} hold. Further, suppose \Cref{as-het} is true for \texttt{FedGLOMO}. 
In \texttt{FedGLOMO}, for each round $k$, set $\eta_{k} = \eta = \mathcal{O}(\frac{1}{L E K^{1/3} C^{1/3}})$, where $C = \mathcal{O}\big(\max\big(\frac{\alpha}{n}, \frac{E^2 (1+q)^2}{r}\big)\big)$, and $\beta_k = \mathcal{O}({(1+q) \eta^2 L^2 E^4})$. Suppose we use full-device participation (i.e., the global batch size is $n$) \textbf{only at} $k = 0$. Then, \texttt{FedGLOMO} can achieve $\mathbb{E}_{k^{*} \sim \textup{Unif}[0,K-1]}[\|\nabla f(\bm{w}_{k^{*}})\|^2] \leq \epsilon$ in 
$K = \mathcal{O}\big( \max\big(\sqrt{\frac{\alpha}{n}}, \frac{1+q}{\sqrt{r}}\big){\epsilon^{-1.5}}\big)$ rounds of communication and $E = \mathcal{O}(1)$ local steps.
\end{theorem}
\begin{remark}[\textbf{Better iteration complexity}]
\label{rem-sep21-1}
{As per \Cref{nov4-thm1}, for converging to an $\epsilon$-stationary point, \texttt{FedGLOMO} needs $T = KE$ to be $\mathcal{O}\big(\max\big(\sqrt{\frac{\alpha}{n}}, \frac{1}{\sqrt{r}}\big){\epsilon}^{-1.5}\big)$. {This iteration complexity is the same as that of \texttt{MimeMVR} (\cite{karimireddy2020mime}) \textit{but without using the bounded client dissimilarity assumption}, i.e. \cref{eq:bcd}, (also see the next remark for more details on this) and better than other related works in the federated setting; see \Cref{tb:comp}.}
We underscore the significance of global momentum here by comparing this complexity of \texttt{FedGLOMO} to that of \texttt{FedLOMO} (recall this is a simpler version of \texttt{FedGLOMO} with only local momentum and \textit{no} global momentum, described in {Appendix F}) under partial-device participation and compression which is $\mathcal{O}\big(\frac{1}{r} \epsilon^{-2}\big)$; see {Theorem 3 in the Appendix}.}
\end{remark}

\begin{remark}[\textbf{No requirement of bounded client dissimilarity (BCD) assumption}]
\label{rem-sep21-2}
{Divergent from related works, \Cref{nov4-thm1} \textit{does not use} the commonly used BCD assumption, i.e., \cref{eq:bcd}.
This is achieved by utilizing the smoothness and non-negativity of the $f_i$'s, specifically $\frac{1}{n}\sum_{i \in [n]}\|\nabla f_i(\bm{w})\|^2 \leq \frac{1}{n}\sum_{i \in [n]} 2L(f_i(\bm{w}) - f_i^{*}) \leq 2L f(\bm{w})$; see the proof outline of \Cref{nov4-thm1} in {Appendix A}. 
Instead of the BCD assumption, we use our empirically verified \Cref{as-het} to provide a tighter (when $\alpha \ll n$) and data-dependent convergence result. Note that \Cref{as-het} will always hold for some $\alpha \leq n$, regardless of the degree of client heterogeneity. Thus, \Cref{nov4-thm1} allows for \textit{arbitrary client heterogeneity}.}
\end{remark}

\begin{remark}[\textbf{Compressed communication}]
\label{rem-sep21-3}
To our knowledge, \texttt{FedGLOMO} is the \textit{first algorithm} that attains the aforementioned improved iteration complexity for FL on smooth non-convex functions \textit{with compressed communication}. We emphasize that the choice of quantities compressed in line \ref{comp} of \Cref{alg:2-local}
is important. This particular choice enables deriving the improved complexity by first deriving a result analogous to smoothness, i.e.,
$\|({\bm{w}_{k} - \bm{w}_{k,E}^{(i)}}) - ({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})\| \leq \widehat{L}\|\bm{w}_{k} - \bm{w}_{k-1}\|$ (see {Lemma 9 in Appendix G.1}). The straightforward choice of sending $Q_{D}(\bm{w}_{k} - {\bm{w}_{k,E}^{(i)}})$ and $Q_{D}({\bm{w}_{k-1} - \widehat{\bm{w}}_{k-1,E}^{(i)}})$ prohibits us from deriving the improved rate, unless 
we also assume $Q_D(\cdot)$ to be a Lipschitz operator.
\\
In {Appendix B}, for $r \ll n$, we show that using the quantization scheme of \cite{alistarh2017qsgd} with $s = \sqrt{d}$, \texttt{FedGLOMO} achieves more than a five-fold saving in the \textit{total} communication cost as compared to when there is full-precision communication in \texttt{FedGLOMO}.
\end{remark}

\begin{remark}[\textbf{A limitation}]
\label{rem-sep29-5}
Even though our iteration complexity of $T = \mathcal{O}(\epsilon^{-1.5})$ is better than that of \texttt{FedCOMGATE} proposed by  \cite{haddadpour2020federated} (which is $\mathcal{O}(\epsilon^{-2})$), our communication complexity of $K = \mathcal{O}(\epsilon^{-1.5})$ is higher than theirs, which is $K = \mathcal{O}(\epsilon^{-1})$ (albeit under an extra assumption on the quantizer, namely Assumption 5 in their paper). Ideally, we would like to have $E = \mathcal{O}(\epsilon^{-p})$ and $K = \mathcal{O}(\epsilon^{-(1.5-p)})$ for some $p > 0$, in order to reduce \texttt{FedGLOMO}'s communication complexity. Exploring whether such a result is obtainable with our proposed style of momentum is an interesting future direction.
\end{remark}
\section{Experiments}
\label{sec:exp}
To show the efficacy of \textit{global} momentum in \texttt{FedGLOMO}, we compare it against \texttt{FedLOMO} (recall this has only local momentum and no global momentum; see {Appendix F}) and \texttt{FedAvg} (\cite{mcmahan2017communication}) with the standard momentum available in PyTorch applied to (i) only its local updates, and (ii) both local and global updates -- all with compressed client-to-server communication. We denote (i) and (ii) by \texttt{FedAvg}-lm and \texttt{FedAvg}-glm (\enquote{lm} and \enquote{glm} stand for local momentum, and global + local momentum), respectively. \texttt{FedAvg} \textit{with compression} is referred to as \texttt{FedPAQ} (\cite{reisizadeh2020fedpaq}). Similarly, we call \texttt{FedAvg}-lm and \texttt{FedAvg}-glm \textit{with compression}, as \texttt{FedPAQ}-lm and \texttt{FedPAQ}-glm. We also compare against \texttt{FedCOMGATE} (\cite{haddadpour2020federated}) which uses gradient tracking to \textit{theoretically} derive a better communication-complexity than us (see \Cref{rem-sep29-5}).
For compression, the \enquote{qsgd} operator proposed in \cite{alistarh2017qsgd} is used.

We consider the task of classification on CIFAR-10 and Fashion-MNIST (\cite{xiao2017fashion}) abbreviated as FMNIST henceforth. The model used is a two-layer neural network with ReLU activation in the hidden layers. The size of both the hidden layers is 300/600 for FMNIST/CIFAR-10. We train the models using the categorical cross-entropy loss with $\ell_2$-regularization. 
The weight decay value in PyTorch (to apply $\ell_2$-regularization) is set to 1e-4. 
We consider both homogeneous 
and heterogeneous data distribution among the clients. Similar to \cite{mcmahan2017communication}, for the heterogeneous case, we distribute the data among the clients such that each client can have data from either one or (at most) two classes -- note that this is a high degree of heterogeneity. The exact procedure is described in {Appendix E}. The number of clients ($n$) in all the experiments is set to 50, with each client having the same number of samples. The global batch-size $r$ is 25, and the number of local updates per round (i.e., $E$) is 10. All full gradients are replaced by stochastic gradients computed on a (per-client) batch size of 256. The learning rates, momentum parameters of the algorithms, and some other experimental details are in {Appendix E}.

In Fig. \ref{fig:1}, we compare \texttt{FedPAQ}-lm, \texttt{FedPAQ}-glm, \texttt{FedLOMO} and \texttt{FedCOMGATE} with 4 (resp., 8) bits per-round against \texttt{FedGLOMO} with 2 (resp., 4) bits per-round on FMNIST (resp., CIFAR-10) in the heterogeneous and homogeneous cases. We set the number of per-round bits used by \texttt{FedGLOMO} to be half the number used by all other algorithms, so that each one has the same \textit{per-round} communication budget. All plots depict results over 3 independent runs; the shaded regions represent $\pm 1$ standard deviation whereas the solid lines are the respective means. Please see the discussion in the figure caption. These results illustrate the \textit{power of global momentum}.

Next, in the \textit{no-compression heterogeneous} case, we compare against \texttt{Mime} (specifically, \enquote{\texttt{MimeSGDm}}) of \cite{karimireddy2020mime} which also attains a complexity of $\mathcal{O}(\epsilon^{-1.5})$ but without compressed communication, and is tailored to handle client heterogeneity. Having shown the suboptimality of \texttt{FedLOMO} and \texttt{FedPAQ}-lm in Fig. \ref{fig:1}, we only compare \texttt{FedAvg}-glm, \texttt{FedGLOMO} without compression and \texttt{MimeSGDm} in the heterogeneous case in Fig. \ref{fig:2}. The plots in Fig. \ref{fig:2} show that the implicit client-drift controlling ability of our proposed global momentum is on par with the explicit client-drift controlling mechanism of \texttt{Mime}. The test error values averaged over the last five rounds for the plots in Figures \ref{fig:1} and \ref{fig:2} are in Tables \ref{tab1} and \ref{tab2}, respectively.

We also provide some more empirical results on CIFAR-100 in {Appendix E.1}.

\begin{table}[!htb]
\begin{center}
\begin{tabular}{|l|c|c|}
%\toprule
\hline
\textbf{Algo.} & \textbf{CIFAR-10} \textbf{Het.} & \textbf{FMNIST} \textbf{Het.}
\\
\hline
\texttt{FedPAQ}-lm & 50.26 $\pm$ 0.85 & 16.17 $\pm$ 0.53 
\\
\hline
\texttt{FedPAQ}-glm & 49.88 $\pm$ 1.15 & 15.87 $\pm$ 1.10
\\
\hline
\texttt{FedLOMO} & 53.74 $\pm$ 0.17 & 18.95 $\pm$ 0.19
\\
\hline
\texttt{FedGLOMO} & \textbf{46.42 $\pm$ 0.05} & \textbf{13.55 $\pm$ 0.32}
\\
\hline
\texttt{FedCOMGATE} & \textbf{46.26 $\pm$ 0.25} & 15.32 $\pm$ 0.09
\\
\hline
\hline
\textbf{Algo.} & \textbf{CIFAR-10} \textbf{Hom.} & \textbf{FMNIST} \textbf{Hom.} 
\\
\hline
\texttt{FedPAQ}-lm & \textbf{45.13 $\pm$ 0.07} & 13.08 $\pm$ 0.05
\\
\hline
\texttt{FedPAQ}-glm & 45.70 $\pm$ 0.10 & 11.76 $\pm$ 0.06
\\
\hline
\texttt{FedLOMO} & 45.96 $\pm$ 0.01 & 14.22 $\pm$ 0.01
\\
\hline
\texttt{FedGLOMO} & \textbf{44.97 $\pm$ 0.05} & \textbf{10.98 $\pm$ 0.05}
\\
\hline
\texttt{FedCOMGATE} & 45.46 $\pm$ 0.03 & 12.24 $\pm$ 0.01
\\
\hline
\end{tabular}
\end{center}
\caption{Average \textbf{test error} \% ($\pm$ standard deviation) over the last five rounds for the plots in the \textit{heterogeneous} (\textit{top}) and \textit{homogeneous} (\textit{bottom}) cases in \Cref{fig:1}.}
\label{tab1}
\end{table}

\begin{table}[!htb]
\begin{center}
%\begin{small}
%\begin{sc}
\begin{tabular}{|l|c|c|}
%\toprule
\hline
\textbf{Algo.} & \textbf{CIFAR-10 Het.} & \textbf{FMNIST Het.} \\
\hline
\texttt{FedAvg}-glm & 50.26 $\pm$ 0.74 & 16.17 $\pm$ 0.53
\\
\hline
\texttt{MimeSGDm} & 46.10 $\pm$ 0.13 & \textbf{13.34 $\pm$ 0.25}
\\
\hline
\texttt{FedGLOMO} & \textbf{45.41 $\pm$ 0.15} & \textbf{13.48 $\pm$ 0.26}
\\
\hline
\end{tabular}
%\end{sc}
%\end{small}
\end{center}
\caption{Average \textbf{test error} \% ($\pm$ standard deviation) over the last five rounds for the plots in \Cref{fig:2}.}
\label{tab2}
\end{table}


\begin{figure*}[t]
\centering 
\subfloat[Het. FMNIST train loss]{
    \label{fig:1_a}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_het_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[Het. FMNIST test err]{
    \label{fig:1_b}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_het_test_2.pdf}
	} 
%\\
%\hspace{-0.5cm}
\subfloat[Het. CIFAR10 train loss]{
    \label{fig:1_c}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_het_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[Het. CIFAR10 test err]{
    \label{fig:1_d}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_het_test_2.pdf}
	} 
\\
\subfloat[Hom. FMNIST train loss]{
    \label{fig:11_a}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_hom_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[Hom. FMNIST test err]{
    \label{fig:11_b}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_hom_test_2.pdf}
	} 
%\\
%\hspace{-0.5cm}
\subfloat[Hom. CIFAR10 train loss]{
    \label{fig:11_c}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_hom_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[Hom. CIFAR10 test err]{
    \label{fig:11_d}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_hom_test_2.pdf}
	} 
\caption{Comparison of \texttt{FedPAQ}-lm, \texttt{FedPAQ}-glm, \texttt{FedLOMO}, \texttt{FedGLOMO} and \texttt{FedCOMGATE} (\cite{haddadpour2020federated}) with the same per-round communication budget on FMNIST and CIFAR-10 in the heterogeneous (top four figs.) and homogeneous (bottom four figs.) settings, respectively. The x-axis is the total number of communicated bits divided by the dimension $d$ and the global batch-size $r$. \texttt{FedGLOMO} is the \textbf{fastest} and most \textbf{communication-efficient} algorithm in almost all the cases; e.g., in the heterogeneous case for both datasets, \texttt{FedGLOMO} attains the final test error of \texttt{FedPAQ}-glm (resp., \texttt{FedPAQ}-lm) with less than a \textbf{half} (resp., only about a \textbf{third}) of the number of bits used by \texttt{FedPAQ}-glm (resp., \texttt{FedPAQ}-lm). Further, \texttt{FedGLOMO} and \texttt{FedLOMO} have a smoother trajectory than other algorithms in the heterogeneous case due to variance-reducing momentum. Observe that \texttt{FedLOMO} and \texttt{FedPAQ}-lm (with only local momentum) are slower than \texttt{FedGLOMO} and \texttt{FedPAQ}-glm (with both local and global momentum), showing the ineffectiveness of only local momentum and \textbf{the power of combining both local and global momentum}. Also, note that \texttt{FedGLOMO} performs much better than \texttt{FedCOMGATE} in the homogeneous case.
}
\label{fig:1}
\end{figure*}

\begin{figure*}[t]
\centering 
\subfloat[FMNIST train loss]{
    \label{fig:2_a}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_mime_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[FMNIST test err]{
    \label{fig:2_b}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/fmnist_mime_test_2.pdf}
	} 
%\\
%\hspace{-0.5cm}
\subfloat[CIFAR-10 train loss]{
    \label{fig:2_c}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_mime_train_2.pdf}
	} 
%\hspace{-0.5cm}
\subfloat[CIFAR-10 test err]{
    \label{fig:2_d}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/cifar10_mime_test_2.pdf}
	} 
%\hspace{0.2 in}  
\caption{Comparison of \texttt{FedAvg}-glm, \texttt{FedGLOMO} (without compression) and \texttt{MimeSGDm} on FMNIST and CIFAR-10 in the \textbf{heterogeneous} case. On both datasets, \texttt{FedAvg}-glm is the slowest while \texttt{FedGLOMO} is somewhat faster than \texttt{MimeSGDm}. While \texttt{Mime} has an explicit client-drift control mechanism, we do not have that in \texttt{FedGLOMO}, but still \textbf{our proposed global momentum implicitly mitigates client-drift} as well as \texttt{Mime}.}
\label{fig:2}
\end{figure*}

{\paragraph{Verifying Assumption \ref{as-het} for \texttt{FedGLOMO}:}
\label{sec:het-asm-expt}
For each round $k$, we compute $\alpha = \max_{\tau \in [E]} 
\frac{\|\sum_{i \in [n]}\widetilde{\bm{e}}_{k,\tau}^{(i)}\|^2}{\sum_{i \in [n]} \|\widetilde{\bm{e}}_{k,\tau}^{(i)}\|^2}$, where $\widetilde{\bm{e}}_{k,\tau}^{(i)}$ is as defined in \Cref{as-het}, for 4 and 2 bit \texttt{FedGLOMO} on CIFAR-10 and FMNIST, respectively. 
Note that we remove the expectation (w.r.t. the stochastic gradients) while computing $\alpha$ for empirical verification. In Fig. \ref{fig:het0}, we plot $(\alpha/n)$ over different rounds for the heterogeneous as well as homogeneous case on both datasets; see the discussion in the figure caption.}

\begin{figure*}[!htb]
\centering 
\subfloat[CIFAR-10]{
    \label{fig:het_a}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/CIFAR10_2_alpha.pdf}
	} 
\subfloat[FMNIST]{
    \label{fig:het_b}
	\includegraphics[width=0.23\textwidth]{UAI_figs2/FMNIST_2_alpha.pdf}
	} 
\caption{Variation of $(\frac{\alpha}{n})$ over different rounds of $4$ and $2$ bit \texttt{FedGLOMO} for CIFAR-10 (Fig. \ref{fig:het_a}) and FMNIST (Fig. \ref{fig:het_b}) in the heterogeneous and homogeneous cases. In both cases, notice that $\alpha \ll n$ throughout training. {Also, as discussed after the statement of 
\Cref{as-het}, note that $(\frac{\alpha}{n})$ is higher for the heterogeneous case (except at the end of training for FMNIST). See Figure {4 in the Appendix} for the same on \texttt{FedAvg}.}
}
\label{fig:het0}
\end{figure*}

\section{Conclusion}
We presented \texttt{FedGLOMO}, a communication-efficient algorithm for faster federated learning via the application of variance-reducing momentum, both in the aggregation step at the server as well as local client updates. We showed that \texttt{FedGLOMO} has better iteration complexity than prior work on smooth non-convex functions with compressed communication. Further, unlike prior work, our result does not use the bounded client dissimilarity assumption, even holding under arbitrary client heterogeneity.
We also demonstrated the efficacy of \texttt{FedGLOMO} via extensive experiments.


\begin{acknowledgements}
This work is supported by NSF grants CCF-1564000, IIS-1546452 and HDR-1934932, AFOSR grant FA9550-19-1-0005, and NASA grant 80NSSC21M0071.
\end{acknowledgements}

\bibliography{das_303}

\end{document}
