
%\documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)

%% Choose your variant of English; be consistent

\usepackage[american]{babel}
\usepackage{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools, cuted}
\usepackage{booktabs}
\usepackage{tikz}
\usepackage{hyperref}
\usepackage{cleveref}
\usepackage{amsthm, amssymb, bm, amsmath}
\usepackage{amsfonts}
\usepackage{subcaption}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{float}
\usepackage{booktabs}
% \usepackage{array}

% \input{macros}
\allowdisplaybreaks

%% Self-defined macros
% \TODO{text}: inline draft note (light-blue box) that is also written to the
% index under "To Do".
% NOTE(review): \todo is provided by the todonotes package, which is not among
% the \usepackage lines above -- load it (or confirm the class does) before
% using \TODO.
\newcommand{\TODO}[1]{\todo[color=blue!25, inline]{ TODO: #1} \index{To Do: !#1}}

% Theorem-like environments, all declared in the amsthm `plain' style.
% No shared counter is passed, so every environment numbers independently
% (Lemma 1, Theorem 1, Assumption 1, ...). `thm*' is the unnumbered variant.
\theoremstyle{plain}
\newtheorem{lem}{Lemma}
\newtheorem{thm}{Theorem}
\newtheorem*{thm*}{Theorem}
\newtheorem{defn}{Definition}
\newtheorem{coro}{Corollary}
\newtheorem{clm}{Claim}
\newtheorem{conj}{Conjecture}
\newtheorem{exple}{Example}
\newtheorem{prop}{Proposition}
\newtheorem{propty}{Property}
\newtheorem{rem}{Remark}
\newtheorem{assum}{Assumption}

% Probability / distribution notation (math mode).
%   \Ex           bare expectation symbol
%   \Eg{sub}{X}   expectation of X with subscript `sub', auto-sized brackets
%   \E{X}, \V{X}  expectation / variance of X, auto-sized brackets
%   \posfunc{x}   |x|^{+}
% NOTE(review): \ding (used by \cmark and \xmark) comes from the pifont
% package, which is not among the \usepackage lines above -- confirm it is
% loaded before using these two macros.
\newcommand{\Ex}{\mathbb{E}}
\newcommand{\Eg}[2]{\mathbb{E}_{#1}\left[{#2}\right]}
\newcommand{\E}[1]{\mathbb{E}\left[{#1}\right]}
\newcommand{\V}[1]{\mathrm{Var}\left[{#1}\right]}
\newcommand{\posfunc}[1]{\lvert {#1}\rvert ^{+}}
\newcommand{\HyperExp}{\textit{HyperExp}}
\newcommand{\SExp}{\textit{ShiftedExp}}
\newcommand{\Exp}{\textit{Exp}}
\newcommand{\Pareto}{\textit{Pareto}}
\newcommand{\xor}{\oplus}
\newcommand{\cmark}{\ding{51}}
\newcommand{\xmark}{\ding{55}}


% NOTE: \norm{x} typesets the SQUARED norm \|x\|^2 -- the exponent is part of
% the macro, so write \norm{x}, never \norm{x}^2.
\newcommand{\norm}[1]{\left\lVert#1\right\rVert^{2}}
% \matsq{A} = A^T A; \bigO{.} and \brac{.} wrap their argument in auto-sized
% parentheses (with a calligraphic O in front for \bigO).
\newcommand{\matsq}[1]{#1^{T}#1}
\newcommand{\bigO}[1]{\mathcal{O}\left({#1}\right)}
\newcommand{\brac}[1]{\left({#1}\right)}


\graphicspath{{Figures/}}

% cleveref reference names. \crefname sets the mid-sentence singular/plural
% forms, \Crefname the sentence-initial ones; where only one of the pair is
% given, cleveref derives the other by changing the case of the first letter.
% Equations are deliberately given an empty name.
% NOTE(review): the types sec, app, fact, algo, tbl and obs do not match any
% counter or theorem environment declared in this preamble -- presumably
% leftovers; they are kept in case labels elsewhere rely on them.
\crefname{equation}{}{}
\Crefname{equation}{}{}
\crefname{thm}{theorem}{theorems}
\Crefname{thm}{Theorem}{Theorems}
\crefname{clm}{claim}{claims}
\Crefname{clm}{Claim}{Claims}
\Crefname{coro}{Corollary}{Corollaries}
\Crefname{lem}{Lemma}{Lemmas}
\Crefname{sec}{Section}{Sections}
\crefname{app}{appendix}{appendices}
\Crefname{app}{Appendix}{Appendices}
\crefname{prop}{proposition}{propositions}
\Crefname{prop}{Proposition}{Propositions}
\Crefname{propty}{Property}{Properties}
\crefname{figure}{fig.}{figures}
\Crefname{figure}{Fig.}{Figures}
\crefname{defn}{definition}{definitions}
\Crefname{defn}{Definition}{Definitions}
\crefname{fact}{fact}{facts}
\Crefname{fact}{Fact}{Facts}
\crefname{appendix}{appendix}{appendices}
\Crefname{appendix}{Appendix}{Appendices}
\crefname{algo}{algorithm}{algorithms}
\Crefname{algo}{Algorithm}{Algorithms}
\crefname{algorithm}{algorithm}{algorithms}
\Crefname{algorithm}{Algorithm}{Algorithms}
% Plural forms fixed: previously the plural argument repeated the singular,
% so multi-references printed "table 1 and 2".
\crefname{tbl}{table}{tables}
\Crefname{tbl}{Table}{Tables}
\crefname{table}{table}{tables}
\Crefname{table}{Table}{Tables}

\crefname{conj}{conjecture}{conjectures}
\Crefname{conj}{Conjecture}{Conjectures}
\crefname{obs}{observation}{observations}
\Crefname{obs}{Observation}{Observations}

% Defining new commands and environments
% NOTE(review): `remark' and `theorem' are separate environments, with their
% own counters, from `rem' and `thm' declared earlier in this preamble; using
% both families in one document would produce duplicate numbers.
\newtheorem{remark}{Remark}
\newtheorem{theorem}{\textbf{Theorem}}
% Symbol macros (math mode). Each expands to a single glyph, so the notation
% can be changed in one place.
\newcommand{\gradvect}{\mathbf{x}}
\newcommand{\gradsc}{x}
\newcommand{\wtsc}{w}
\newcommand{\wtvect}{\mathbf{w}}
\newcommand{\gradcomp}{\mathbf{h}}
\newcommand{\compsc}{h}
\newcommand{\ind}{Z}
\newcommand{\avgtru}{\bar{\gradvect}}
\newcommand{\avgest}{\hat{\gradvect}}
\newcommand{\idmat}{\mathbf{I}}
\newcommand{\codemat}{\mathbf{E}}
\newcommand{\codepinv}{\mathbf{W}}
\newcommand{\codeprob}{\rho}
\newcommand{\gradnum}{n}
\newcommand{\graddim}{d}
\newcommand{\var}{\gamma}
\newcommand{\gradnoise}{\bm{\eta}}
\newcommand{\compdim}{k}
\newcommand{\scale}{\alpha}
\newcommand{\prob}{p}
\newcommand{\covmat}{\mathbf{C}}
\newcommand{\eig}{\lambda}
\newcommand{\eigmat}{\bm{\Lambda}}
\newcommand{\eigvects}{\mathbf{U}}
%\newcommmand{\randscale}{\omega}

% Further symbol macros (math mode).
% NOTE(review): several of these expand to the same glyph as a macro defined
% above: \onedata/\gradvect -> \mathbf{x}, \cols/\gradnum -> n,
% \fcdeg/\graddim -> d, \mdsnum/\compdim -> k, \redd/\eig -> \lambda,
% \RS/\codeprob -> \rho, \queuetime/\ind -> Z. Using both members of a pair in
% one formula gives visually identical symbols.
\newcommand{\encmat}{\mathbf{G_e}}
\newcommand{\recmat}{\mathbf{G}}
\newcommand{\checkmat}{\mathbf{H}}
\newcommand{\onedata}{\mathbf{x}}
\newcommand{\numfunc}{m}
\newcommand{\numencfunc}{m_e}
\newcommand{\numrecfunc}{M'}
\newcommand{\decrows}{m_d}
\newcommand{\fcdeg}{d}
\newcommand{\lndd}{\Omega}
\newcommand{\ledd}{\omega}
\newcommand{\rndd}{\Lambda}
\newcommand{\redd}{\lambda}
\newcommand{\params}{\bm{\theta}}
\newcommand{\loss}{\mathcal{L}}
\newcommand{\cols}{n}

\newcommand{\mdsmat}{\mathbf{F}}
\newcommand{\mdsnum}{k}

\newcommand{\fcpc}{c}
\newcommand{\fcpdel}{\delta}
\newcommand{\fcps}{S}
\newcommand{\fcsource}{s}
\newcommand{\fcenc}{e}
\newcommand{\RS}{\rho}
\newcommand{\numcomp}{C}
\newcommand{\workcomp}{B}
\newcommand{\tupleset}{\mathbb{S}}
\newcommand{\runtime}{T}
\newcommand{\worktime}{Y}
\newcommand{\queuetime}{Z}
\newcommand{\shifttime}{\tau}
\newcommand{\smalltime}{t}
\newcommand{\exprate}{\mu}
%\newcommmand{\x}{\mathbf{x}}

% Bold shorthands defined with plain \def, which overwrites any existing
% macro without warning.
% NOTE(review): \H and \b are LaTeX kernel text accents (Hungarian umlaut and
% bar-under). After these lines, input such as \H{o} -- e.g. in an author name
% in the bibliography -- no longer produces an accent and will fail outside
% math mode. Consider renaming these two if accented names are needed; check
% later uses of \H and \b first.
\def\y{{\mathbf y}}
\def\A{{\mathbf A}}
\def\H{{\mathbf H}}
\def\b{{\mathbf b}}

% Bold vector/matrix shorthands for math mode: \bX is an upright bold X and
% \bdX the same letter under a dot accent.
% Written with \mathbf instead of the obsolete two-letter switch {\bf ...};
% the math-mode output is the same.
% NOTE(review): \bp expands to \varpi, not p, and \mathbf does not embolden
% lower-case Greek, so it likely prints a regular-weight \varpi (as it did
% with {\bf ...}) -- confirm this is intended.
\newcommand{\bA}{\mathbf{A}}
\newcommand{\bx}{\mathbf{x}}
\newcommand{\be}{\mathbf{e}}
\newcommand{\bb}{\mathbf{b}}
\newcommand{\bg}{\mathbf{g}}
\newcommand{\bu}{\mathbf{u}}
\newcommand{\bd}{\mathbf{d}}
\newcommand{\bdx}{\dot{\mathbf{x}}}
\newcommand{\bp}{\mathbf{\varpi}}
\newcommand{\bdp}{\dot{\mathbf{p}}}
\newcommand{\bq}{\mathbf{q}}
\newcommand{\bdq}{\dot{\mathbf{q}}}
\newcommand{\bX}{\mathbf{X}}
\newcommand{\bdX}{\dot{\mathbf{X}}}
\newcommand{\bQ}{\mathbf{Q}}
\newcommand{\bdQ}{\dot{\mathbf{Q}}}
\newcommand{\bP}{\mathbf{P}}
\newcommand{\bdP}{\dot{\mathbf{P}}}
\newcommand{\by}{\mathbf{y}}
\newcommand{\bdy}{\dot{\mathbf{y}}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\bh}{\mathbf{h}}
\newcommand{\bdv}{\dot{\mathbf{v}}}
\newcommand{\bw}{\mathbf{w}}
\newcommand{\bdw}{\dot{\mathbf{w}}}
\newcommand{\bt}{\mathbf{t}}

% Indexed-iterate shorthands built on \bw, \bv, \by, \bh and \xi.
% Naming pattern: a trailing `t' adds superscript (t), `tp' adds (t+1),
% `tk'/`tj' add (t,k)/(t,j), and an `i'/`j' before it adds that subscript.
% \bdit is the exception: it expands to \Delta_i^{(t)}, not to a bold d.
\newcommand{\bwt}{\bw^{(t)}}
\newcommand{\bvt}{\bv^{(t)}}
\newcommand{\bwit}{\bw_i^{(t)}}
\newcommand{\bwjt}{\bw_j^{(t)}}
\newcommand{\bwitk}{\bw_i^{(t,k)}}
\newcommand{\bwitj}{\bw_i^{(t,j)}}
\newcommand{\xiit}{\xi_i^{(t)}}
\newcommand{\xiitk}{\xi_i^{(t,k)}}
\newcommand{\xiitj}{\xi_i^{(t,j)}}
\newcommand{\bwtp}{\bw^{(t+1)}}
\newcommand{\byt}{\by^{(t)}}
\newcommand{\bytp}{\by^{(t+1)}}
\newcommand{\byit}{\by_i^{(t)}}
\newcommand{\byjt}{\by_j^{(t)}}
\newcommand{\byjtp}{\by_j^{(t+1)}}
\newcommand{\bdit}{\Delta_i^{(t)}}
\newcommand{\bhit}{\bh_i^{(t)}}
\newcommand{\bhjt}{\bh_j^{(t)}}

% Set and per-index shorthands (math mode).
% \set is the calligraphic S used in the text for the sampled client set
% \set^{(t)}; the `j'-suffixed macros attach a subscript j.
% \deltai and \deltaj expand to \Delta^{(t)}_i and \Delta^{(t)}_j.
\newcommand{\Cc}{{\mathcal{C}}}

\newcommand{\set}{{\mathcal{S}}}
\newcommand{\ssize}{M}
\newcommand{\setj}{{\set_j}}
\newcommand{\ssizej}{{\ssize_j}}
\newcommand{\avgestj}{\hat{x}_j}
\newcommand{\weightj}{{T(\ssizej)}}
\newcommand{\weight}{T}
\newcommand{\avgtruj}{\bar{x}_j}
\newcommand{\squares}{R_1}
\newcommand{\cross}{R_2}
\newcommand{\deltai}{\Delta^{(t)}_i}
\newcommand{\deltaj}{\Delta^{(t)}_j}
\newcommand{\activec}{\mathcal{A}(t)}
\newcommand{\gradcomph}{\mathbf{h}'} %^{\text{temp}}}

\newcommand{\scalcomph}{h'}

%^{\text{temp}}}


% Math-alphabet abbreviations. \mc, \mbb and \mbf take the following letter
% (or braced group) as their argument, e.g. \mc{D}; the others are fixed
% symbols (\mbe = blackboard E, \mbr = blackboard R, ...).
\newcommand{\mc}{\mathcal}
\newcommand{\mco}{\mathcal O}
\newcommand{\mbb}{\mathbb}
\newcommand{\mbf}{\mathbf}
\newcommand{\mbe}{\mathbb E}
\newcommand{\mbn}{\mathbb N}
\newcommand{\mbr}{\mathbb R}
\newcommand{\mcr}{\mathcal R}
\newcommand{\mcP}{\mathcal P}

% Auto-sized delimiter halves. Each \left... opener must be closed by a
% \right... closer within the same formula line: \lp..\rp ( ), \lcb..\rcb { },
% \lb..\rb [ ], \lnr..\rnr || ||, \lan..\ran < >. \ld and \rd are the
% invisible delimiters \left. and \right.
% Note that \lnr ... \rnr is a plain norm, unlike \norm, which adds a square.
\newcommand{\lp}{\left(}
\newcommand{\rp}{\right)}
\newcommand{\ld}{\left.}
\newcommand{\rd}{\right.}
\newcommand{\lcb}{\left\{}
\newcommand{\rcb}{\right\}}
\newcommand{\lb}{\left[}
\newcommand{\rb}{\right]}
\newcommand{\lnr}{\left\|}
\newcommand{\rnr}{\right\|}
\newcommand{\lan}{\left\langle}
\newcommand{\ran}{\right\rangle}

% \G is the gradient symbol \nabla.
\newcommand{\G}{\nabla}
% Number of agents
\newcommand{\na}{N}
\newcommand{\nas}{M}

% Frequently used sums: over agents i or j = 1..\na, over local steps
% k or j = 0..\tau-1, over j = 0..k-1, and over rounds t = 0..T-1.
\newcommand{\sumin}{\sum_{i=1}^\na}
\newcommand{\sumjn}{\sum_{j=1}^\na}
\newcommand{\sumkt}{\sum_{k=0}^{\tau-1}}
\newcommand{\sumjt}{\sum_{j=0}^{\tau-1}}
\newcommand{\sumjk}{\sum_{j=0}^{k-1}}
\newcommand{\sumtT}{\sum_{t=0}^{T-1}}

% Learning rates: \lrc = \eta_c (client), \lrss = \eta_s (server), and
% \lrs = \tilde{\eta}_s, which the text sets to \lrss \lrc \tau.
\newcommand{\lrc}{\eta_c}
\newcommand{\lrs}{\tilde{\eta}_s}
\newcommand{\lrss}{\eta_s}

% \nn suppresses an equation number. \numclients/\selclients print the same
% letters as \na/\nas (N and M). The algorithm names are stored once here so
% they can also be used inside labels (e.g. sec:\algoname).
\newcommand{\nn}{\nonumber}
\newcommand{\numclients}{N}
\newcommand{\selclients}{M}
\newcommand{\numclusters}{K}
\newcommand{\algoname}{FedVARP}
\newcommand{\clusteralgoname}{ClusterFedVARP}




% Coloured in-text author comments for drafting; each prints the author's tag
% followed by the comment text. They rely on \textcolor (xcolor, which tikz
% loads). None is used in the part of the body visible here.
% NOTE(review): \DJH prints its tag without \textsc, unlike the other three.
\newcommand{\GJ}[1]{ \textcolor{magenta}{\textsc{Gauri:} #1}}

\newcommand{\ps}[1]{ \textcolor{red}{\textsc{PS:} #1}}%Pranay

\newcommand{\DJH}[1]{ \textcolor{green}{Divyansh: #1}}

\newcommand{\AN}[1]{ \textcolor{blue}{\textsc{Aushim:} #1}}

% \swap[sep]{a}{b} prints b, then sep (default "-"), then a.
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% Starred form: in display style, subscripts are set underneath the operator,
% e.g. \argmin_{x}.
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}


\title{FedVARP: Tackling the Variance Due to Partial Client Participation\\ in Federated Learning}

% Add authors
\author[1]{Divyansh Jhunjhunwala}
\author[1]{Pranay Sharma}
\author[1]{Aushim Nagarkatti}
\author[1]{Gauri Joshi}
% Add affiliations after the authors
\affil[1]{%
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
  
\begin{document}
\maketitle

\begin{abstract}
Data-heterogeneous federated learning (FL) systems suffer from two significant sources of convergence error: 1) client drift error caused by performing multiple local optimization steps at clients, and 2) partial client participation error caused by the fact that only a small subset of the edge clients participate in every training round. We find that among these, only the former has received significant attention in the literature. To remedy this, we propose \texttt{FedVARP}, a novel variance reduction algorithm applied at the server that eliminates error due to partial client participation. 
To do so, the server simply maintains in memory the most recent update for each client and uses these as surrogate updates for the non-participating clients in every round. Further, to alleviate the memory requirement at the server, we propose a novel clustering-based variance reduction algorithm \texttt{ClusterFedVARP}. Unlike previously proposed methods, both \texttt{FedVARP} and \texttt{ClusterFedVARP} do not require additional computation at clients or communication of additional optimization parameters. Through extensive experiments, we show that \texttt{FedVARP} outperforms state-of-the-art methods, and \texttt{ClusterFedVARP} achieves performance comparable to \texttt{FedVARP} with much lower memory requirements.
\end{abstract}



% \input{Sections/Introduction}
\section{Introduction}


% In conventional distributed learning applications, the entire dataset, often collected by numerous disparate and remote edge-devices, is transferred to a \textit{central} server, where all the computations are carried out. With the advent of Big Data applications, and the proliferation of computationally powerful edge-devices, transferring the private data of edge-devices or \textit{clients} to a remote server is neither desirable (due to privacy concerns \citep{leaute2013protecting}), nor always necessary. 
Large-scale machine learning applications rely on numerous edge-devices to contribute their data, to learn better performing models. Federated Learning (FL) is a recent paradigm \citep{konecny2016federated, mcmahan2017communication} for distributed learning in which a \textit{central server} offloads some of the computation to the edge-devices or \textit{clients}, and the clients in return get to retain their private data, while only communicating the locally learned model to the server. For instance, when training a next-word prediction model \citep{hard2018federated}, FL allows a client to enjoy suggestions supplied by thousands of other clients in the same federation without ever explicitly revealing its own personal text history. 
% Further, unlike other existing approaches \citep{li2014scaling}, the clients in FL carry out \textit{multiple} local model updates, between successive communications with the server.

Typical FL applications are targeted towards low-power mobile phones that have severely limited uplink (client to server) bandwidth. This necessitates novel algorithms that reduce the \textit{frequency} of communication required to train FL models. The first and the most popular algorithm in this setting is \texttt{FedAvg}
\citep{mcmahan2017communication}, which reduces communication frequency by requiring clients to perform \textit{multiple} local computations in each round. In each round of \texttt{FedAvg}, clients first download the current global model, and run several steps of SGD on their private data before sending back their local updates to the server. The server then updates the global model using the average of the local updates sent by the clients.

A subtle yet important feature that distinguishes FL systems from traditional data-center settings is the presence of \textit{heterogeneity} in local data across clients. While \texttt{FedAvg} improves communication-efficiency at the clients, it also leads to an additional error caused by this heterogeneity, colloquially known as \textit{client drift} error \citep{karimireddy2019scaffold}. Informally, allowing clients to perform multiple local steps causes local models to drift towards their individual local minimizers, which is inconsistent with the server objective of minimizing the global empirical loss \citep{khaled2020tighter, wang2018cooperative, stich2018local}. Despite recent advances \citep{pathak2020fedsplit, woodworth2020local}, a comprehensive theory regarding the usefulness of local steps remains elusive. Nonetheless, performing multiple local steps remains the most popular option for clients participating in FL due to its superior performance in practice.

Another defining characteristic of FL systems is \textit{partial client participation}. Given the scale of FL \citep{kairouz2019advances}, it is unrealistic to expect \textit{all} the clients to participate in every single round of FL training. For instance, clients may participate only when they are plugged into a power source and have access to a reliable wifi connection \citep{mcmahan2017communication}. 
In practice, we observe that only a small fraction of the total number of clients participate in any given round. This variance in client participation gives rise to what we term as \textit{partial client participation error}. This error further compounds the effect of data heterogeneity as the global model is consistently skewed towards the data distributions of the participating clients in every round. 

While error due to client drift has been well-established \citep{karimireddy2019scaffold, acar2021federated, khaled2020tighter}, we find that partial client participation error has not received similar attention. This is seen by the fact that several methods for mitigating client drift such as \citep{pathak2020fedsplit, zhang2020fedpd} cannot be directly extended to the partial client participation case. 
This is surprising, as our results indicate that error due to partial participation, rather than client drift, \textit{dominates} the convergence rate of \texttt{FedAvg} (Theorem \ref{thm:FedAvg}). For smooth non-convex functions, we quantify the effect of the various noise sources (stochastic gradient noise, partial client participation, and data heterogeneity across clients) on the error floor of \texttt{FedAvg}, and observe that the dominant error is contributed by partial client participation.

\paragraph{Our Contributions.} Keeping in mind the observation that partial client participation is the dominant source of error, we design a novel aggregation strategy at the server that completely eliminates partial client participation error. \emph{Our algorithm keeps the local SGD procedure unchanged and only modifies the server aggregation strategy}. As a result, our approach does not introduce any extra computation at the clients or lead to any additional communication between the clients and the aggregating server. Furthermore, we also design a more server-friendly approach to our algorithm that allows the server to flexibly choose the amount of error reduction based on its system constraints. We summarize our main contributions below.


\begin{itemize}[leftmargin=*]
    \item We analyze the convergence of \texttt{FedAvg} and highlight that the dominant term in the asymptotic error floor comes from the partial participation of clients.
    \item In \Cref{sec:\algoname}, we propose \texttt{\algoname} (Federated \underline{VA}riance \underline{R}eduction for \underline{P}artial Client participation), a novel aggregation strategy applied at the server to eliminate partial participation variance. \texttt{\algoname} uses the fact that the server can store and reuse the \textit{most recent update} for each client as an approximation of its current update. This allows the server to factor in contributions even from the non-participating clients when updating the global model.
    \item To relax the storage requirements of \texttt{\algoname}, we devise a novel clustering-based aggregation strategy called \texttt{\clusteralgoname} in \Cref{sec:cluster}. \texttt{\clusteralgoname} is based on the observation that instead of storing unique latest updates for each client, we can cluster clients and store a single unified update that applies to \textit{all} the clients in that cluster. We show that as long as the heterogeneity within a cluster is sufficiently bounded, \texttt{\clusteralgoname} can significantly reduce partial client participation error, while being more storage-efficient.
    \item We conduct extensive experiments on vision and language modeling FL tasks that demonstrate the superior performance of \texttt{\algoname} over existing state-of-the-art methods. Further, we show that \texttt{\clusteralgoname} performs comparably to \texttt{\algoname}, with much lower storage requirements in practice. 
\end{itemize}


For the purpose of theoretical analysis, throughout this paper we assume that in each round, the server uniformly selects a subset of clients from the total pool of clients. In practice, our algorithms can also be combined with non-uniform and biased client sampling strategies \citep{cho2020client,chen2020optimal} for greater empirical benefits. Furthermore we note that the idea of reusing client updates has also been considered in a recent work \texttt{MIFA} \citep{gu21mifa_neurips}, albeit in the context of dealing with arbitrary client participation. Owing to this similarity, we have a detailed comparison of our algorithm with \texttt{MIFA} in Section \ref{mifa_comp}. While outside the scope of this work, we believe designing server aggregation strategies to deal with arbitrary client participation is an open and challenging direction for future work. 

\section{Problem Setup}
We use the following notation in the remainder of the paper. Given a positive integer $m$, the set of numbers $\{ 1, 2, \hdots, m \}$ is denoted by $[m]$. Lowercase bold letters, e.g., $\bx, \by$, are used for vectors. Vectors at client $i$ are denoted with subscript $i$, e.g., $\bx_i$. Vectors at time $t$ are denoted with superscript $(t)$, e.g., $\by^{(t)}$.

We consider optimizing the following finite sum of functions in a Federated Learning (FL) setting.
{\small
\begin{align}
    \min_{\bw \in \mathbb{R}^d} f(\bw) = \frac{1}{\numclients}\sum_{i=1}^\numclients f_i(\bw)
\label{eq:prob_form}
\end{align}}%
where $f_i(\bw) \triangleq \Eg{\xi_i \sim \mathcal{D}_i}{\ell(\bw,\xi_i)}$ is the local objective of the $i$-th client. Here $\ell(\cdot, \cdot)$ is the loss function, and $\xi_i$ represents a random data sample from the local data distribution $\mathcal{D}_i$.
$\numclients$ is the total number of clients in the FL system. Note that our formulation can be easily extended to the case where client objectives $\{ f_i(\cdot) \}$ are unequally weighted.

We begin by recalling the \texttt{FedAvg} algorithm. At round $t$, the server selects a random subset of clients $\set^{(t)}$ and sends the global model $\bw^{(t)}$ to these clients. The selected clients run \texttt{LocalSGD} (Algorithm \ref{FedAvg}) for $\tau$ steps.
These clients then send back their updates $\Delta_i^{(t)} = (\bw^{(t)} - \bw_i^{(t,\tau)})/(\lrc \tau)$ to the server ($\lrc$ is the client learning rate), which aggregates them to update the global model as follows:
{\small
\begin{align}
\label{fedavg_update}
    \bw^{(t+1)} = \bw^{(t)} - \lrs \frac{1}{|\set^{(t)}|}\sum_{i \in \set^{(t)}} \Delta_i^{(t)}
\end{align}}%
where $\lrs = \lrss \lrc \tau$, with $\lrss$ being the server learning rate.

\begin{algorithm}[h]
\caption{\texttt{LocalSGD}$(i,\bw^{(t)},\tau, \lrc)$}
\label{FedAvg}
\begin{algorithmic}[1]

% \Procedure{\texttt{LocalSGD}}{$i,\bw^{(t)},\tau, \lrc$}
\State Set $\bw_i^{(t,0)} = \bw^{(t)}$
\For{$k = 0, 1, \dots, \tau-1$}
\State Compute stochastic gradient $\nabla f_i(\bw_i^{(t,k)},\xi_i^{(t,k)})$
\State $\bw_i^{(t,k+1)} = \bw_i^{(t,k)} - \lrc \nabla f_i(\bw_i^{(t,k)},\xi_i^{(t,k)})$
\EndFor
\State Return $(\bw^{(t)} - \bw_i^{(t,\tau)})/(\lrc \tau)$
% \EndProcedure
\end{algorithmic}
\end{algorithm}

Note that due to the data heterogeneity, randomly sampling $\set^{(t)}$ inherently introduces some variance within our FL system, which we term as the \textit{partial participation error}. We characterize the effect of this partial participation error on the convergence bound of \texttt{FedAvg} in the next section. 



\subsection{Convergence Analysis of FedAvg}

Before stating our convergence bound, we make the following standard assumptions.
% used in our analysis.

\begin{assum}
\label{smooth_assump}
(Smoothness). Each local objective function is $L$-Lipschitz smooth, that is, $\lnr \nabla f_i(\bx)-\nabla f_i(\by) \rnr \leq L\lnr \bx-\by \rnr$, for all $\bx, \by \in \mathbb{R}^d$ and all $i \in [\numclients]$.
\end{assum}


\begin{assum}
\label{stochastic}
(Unbiased gradient and bounded local variance). The stochastic gradient at each client is an unbiased estimator of the local gradient, i.e., $\Eg{\xi_i \sim \mathcal{D}_i}{ \nabla f_i(\bw,\xi_i)} = \nabla f_i(\bw)$, and its variance is bounded, i.e., $\mbe_{\xi_i \sim \mathcal{D}_i} \norm{ \nabla f_i(\bw,\xi_i)-\nabla f_i(\bw)} \leq \sigma^2$, for all $i \in [\numclients]$.

\end{assum}

\begin{assum}
\label{global_var_assum}
(Bounded global variance). There exists a constant $\sigma_g>0$ such that the difference between the local gradient at the $i$-th client and the global gradient is bounded as follows: $\norm{\nabla f_i(\bw)-\nabla f(\bw)} \leq \sigma_g^2$, for all $i \in [\numclients]$.
\end{assum}

Following previous work \citep{mcmahan2017communication,karimireddy2019scaffold, wang2020tackling}, we model partial client participation as uniformly sampling a subset of clients \textit{without replacement} from the total pool of clients.

\begin{thm}[FedAvg Error Decomposition]
\label{thm:FedAvg}
Under Assumptions \ref{smooth_assump}, \ref{stochastic}, \ref{global_var_assum}, suppose in each round the server randomly selects $\selclients$ out of $\numclients$ clients without replacement to perform $\tau$ steps of local SGD. If the client learning rate $\lrc$, and the server learning rate $\lrss$ are chosen such that $\lrc \leq \frac{1}{8 L \tau}$, $\lrss \lrc \leq \frac{1}{24 \tau L}$, then the iterates $\{ \bwt \}$ generated by \texttt{FedAvg} satisfy
{\small\begin{align*}
    & \min_{t \in \{0, \hdots, T-1 \}} \mbe \norm{\nabla  f(\bw^{(t)})} \\
    & \leq \bigO{\frac{f(\bw^{(0)})-f^*}{\lrss \lrc \tau T}}+
    \underbrace{\bigO{\frac{\lrss \lrc L \sigma^2}{\selclients} + \lrc^2 L^2 (\tau-1)\sigma^2}}_{{\text{stochastic gradient error}}}\\
    & + \underbrace{\bigO{\frac{\lrss \lrc \tau L (\numclients-\selclients)\sigma_g^2}{\selclients(\numclients-1)}}}_{\text{partial participation error}} + \underbrace{\bigO{\lrc^2 L^2 \tau(\tau-1)\sigma_g^2}}_{\text{client drift error}},
\end{align*}}
where $f^* = \min_{\bw} f(\bw)$.
\label{fedavgconv}
\end{thm}



\begin{rem}
\label{decomp_rem}
Our result shows that the total error floor of \texttt{FedAvg} can be decomposed into three distinct sources of error: 1) stochastic gradients; 2) partial client participation; and 3) client drift. Stochastic gradient error arises due to the variance of local gradients (quantified by $\sigma^2$ in Assumption \ref{stochastic}) and is unavoidable unless each local objective has a finite sum structure.
The cause for both partial participation error and the client drift error lies in data-heterogeneity present among clients (quantified by $\sigma_g$ in Assumption \ref{global_var_assum}). Setting $\selclients = \numclients$ (full participation) gets rid of the error due to partial participation. Similarly, setting $\tau = 1$ (\texttt{FedSGD}) eliminates the client drift error.  
\end{rem}

Our analysis closely follows \citep{wang2020tackling} with the difference that we sample clients without replacement instead of sampling with replacement. A full proof is provided in the supplementary material for completeness. 

\begin{coro}
\label{corro_1}
Setting $\lrc = \frac{1}{\sqrt{T}\tau L}$ and $\lrss = \sqrt{\tau \selclients}$, \texttt{FedAvg} converges to a stationary point of the global objective $f(\bw)$ at a rate given by,
\begin{align*}
&\min_{t \in \{0, \hdots, T-1 \}} \mbe \norm{\nabla  f(\bw^{(t)})} \\
&\leq \underbrace{\bigO{\frac{1}{\sqrt{\selclients \tau T}}}}_{\text{stochastic gradient error}}+ \underbrace{\bigO{\sqrt{\frac{\tau}{\selclients T}}}}_{\text{partial participation error}}+ \underbrace{\bigO{\frac{1}{T}}}_{\text{client drift error}}    
\end{align*}
\end{coro}

\begin{rem}
\label{corro_rem}
Note that in this case the convergence rate of \texttt{FedAvg} is dominated by the error due to partial participation resulting in the leading $\bigO{\sqrt{\frac{\tau}{\selclients T}}}$ term whereas client drift error decays at a much faster $\bigO{\frac{1}{T}}$ rate. This is primarily due to the fact that client drift error is scaled by $\lrc^2$ whereas the partial participation error is scaled by $\lrss \lrc \tau$ as seen in Theorem \ref{fedavgconv}. In practice, $\lrc$ is usually set much smaller than $\lrss$ and hence the total error due to data-heterogeneity is dominated by the variance due to partial client participation rather than client drift. 
\end{rem}


Previous works such as \citep{karimireddy2019scaffold,li2020federated,acar2021federated} have proposed regularizing the local objectives at clients with a global correction term that prevents client models from drifting towards their local minima. In effect, this regularization \textit{artificially} enforces similarity among the modified client objectives such that the effect of data-heterogeneity ($\sigma_g$) is completely eliminated. 
% causing the FL system to behave in a more and more i.i.d fashion as the number of training rounds proceeds \ps{Why is ``i.i.d.''ness increasing with $\tau$?}. As a result when client models converge, they converge to the same model, which is a stationary point of the global loss. Note that in the truly i.i.d setting $(\sigma_g = 0)$, the error due to partial participation as well as client drift is zero. Consequently these algorithms are robust to partial participation, although they are originally motivated by a client-drift perspective. This allows these algorithms to obtain faster rates of convergence as mitigating client drift itself does not ensure faster convergence, as discussed in Remark \ref{corro_rem}. \ps{This paragraph is confusing to read.}
However, doing so requires clients to \textit{modify} the local procedures that they run on their devices to incorporate the global correction term. This either requires additional computation at devices (as in \citep{acar2021federated}) or additional communication between client and server (as in \citep{karimireddy2019scaffold}).  Our goal, on the other hand is to just tackle the variance arising from partial client participation in FL. As a result, our proposed algorithm only modifies the \textit{server update procedure} without requiring clients to perform any additional computation or communication. Since partial participation variance dominates the convergence rate of \texttt{FedAvg}, eliminating this variance allows us to enjoy the same rates of convergence as \texttt{FedDyn} \citep{acar2021federated} and \texttt{SCAFFOLD} \citep{karimireddy2019scaffold}. We discuss our proposed algorithm and its benefits in greater detail in the next section.
% \input{Sections/Problem_Setup}
% \input{Sections/ProposedAlgorithm}
% \input{Sections/ProposedClusterAlgo}
\section{The FedVARP Algorithm and its Convergence Analysis}
\label{sec:\algoname}

\subsection{Proposed FedVARP algorithm}
\label{subsec:proposed \algoname}
\texttt{SAGA} \citep{defazio2014saga} was one of the first variance-reduced SGD algorithms that achieved exponential convergence rate for single node strongly convex optimization by maintaining in memory previously computed gradients for each data point.
Inspired by the \texttt{SAGA} algorithm \citep{defazio2014saga}, we propose a novel algorithm \texttt{\algoname} (Algorithm \ref{alg_\algoname}) to tackle variance arising due to partial client participation in FL.
% Similar to \texttt{SAGA}, \texttt{\algoname} maintains a memory of client states $\{\by_i\}_{i=1}^\numclients$ used to store the update sent by a client the last time it participated in FL training. 
The main novelty in \texttt{\algoname} lies in applying the variance reduction correction \textit{globally} at the server without adding any additional computation or communication at clients. We elaborate on further details below. 


Similar to \texttt{FedAvg}, in each round of \texttt{\algoname}, the server selects a random subset $\set^{(t)}$ of clients that perform \texttt{LocalSGD} and send back their updates $\Delta_i^{(t)}$ to the server.
Recall that in \texttt{FedAvg} the global model is updated just using the average of the $\{ \Delta_i^{(t)} \}_{i \in \set^{(t)}}$ (see \eqref{fedavg_update}). However, this adds a large variance to the \texttt{FedAvg} update as client data is heterogeneous and the number of selected clients could be much smaller than the total number of clients $\numclients$.
% Consider a hypothetical situation where all clients were selected for training in round $t$ and sent their updates to the server. In this case, we define our \textit{true} aggregated update $\bh^{(t)}$ as follows,
% \begin{align}
%     \bh^{(t)} = \frac{1}{\numclients}\sum_{j=1}^\numclients \deltaj
% \end{align}
The key to reducing this variance is to \textit{approximate} the updates of the clients that do not participate. 
We propose that the server use the \textit{latest observed update} for each client as the approximation for its current update.
Let $\{\by_i^{(t)}\}_{i=1}^\numclients$ represent a state for each client maintained at the server. 
% \ps{Should we call $\{\byit\}$ ``memory'' rather than ``states''? State might be misinterpreted to mean the model. I think Divyansh had this question earlier, I somehow missed it.} 
After every round, we perform the following update (we initialize $\by_i^{(0)} = \mathbf{0} \text{ for all } i \in [N]$),
{\small\begin{align}
    \by_j^{(t+1)} = 
    \begin{cases}
    \deltaj & \text{ if } j \in \set^{(t)} \\
    \by_j^{(t)} & \text{ otherwise}
    \end{cases}, \text{ for all } j \in [\numclients].
\end{align}}%
This ensures that $\by_i^{(t)}$ holds the latest update received from the $i$-th client prior to round $t$. 
% Intuitively we expect that as long as clients participate frequently and sufficiently small learning rates, $\by^{(t)}$ will be
% The client states are initially set to zero, that is $\by_i^{(1)} = \mathbf{0}$ for all $i \in [\numclients]$. 
Note that this implementation requires the server to maintain $\bigO{Nd}$ memory which can be expensive in a federated setting. In Section \ref{sec:cluster} we outline a more practical algorithm \texttt{\clusteralgoname} to reduce the storage requirement. 

% \GJ{Might be good to talk about the distributed implementation of $y's$ here as Divyansh had commented earlier}

Given $\{\by_i^{(t)}\}_{i=1}^\numclients$, we can \textit{reuse} the latest observed updates of \textit{all} clients and $\Delta_i^{(t)}$'s of participating clients to compute a \textit{variance reduced} aggregated update,
{\small
\begin{align}
    \bv^{(t)} = \frac{1}{|\set^{(t)}|}\sum_{i \in \set^{(t)}}\brac{\deltai - \by_i^{(t)}} +\frac{1}{\numclients}\sum_{j=1}^\numclients \by_j^{(t)},
\label{update_eq}
\end{align}}%
which is used to update the global model as follows,
{\small
\begin{align}
    \bw^{(t+1)} = \bw^{(t)} - \lrs \bv^{(t)}.
\end{align}
}%
% Note that $\Eg{\set^{(t)}}{\bv^{(t)}} = \bh^{(t)}$ and therefore $\bv^{(t)}$ is an \textit{unbiased} estimator of $\bh^{(t)}$.
% which is used to update the global model as follows $\bwtp = \bwt - \lrs \bv^{(t)}$. 

\begin{algorithm}[h]
\caption{\texttt{\algoname}}
\label{alg_\algoname}
\begin{algorithmic}[1]
\State \textbf{Input:} initial model $\bw^{(0)}$, server learning rate $\lrss$, client learning rate $\lrc$, number of local SGD steps $\tau$, $\lrs = \lrss \lrc \tau$,  number of rounds $T$, initial states $\by_i^{(0)} = \mathbf{0}$ for all $i \in [\numclients]$, $\by^{(0)} = \mathbf{0}$
\For {$t = 0, 1, \dots, T-1$}
\State Sample $\set^{(t)} \subseteq [\numclients]$ uniformly without replacement
\For{$i \in \set^{(t)}$}
\State $\deltai \gets \texttt{LocalSGD}(i,\bw^{(t)},\tau,\lrc)$
\EndFor
\State // At Server:
\State $\bv^{(t)} = \by^{(t)}+\frac{1}{|\set^{(t)}|}\sum_{i \in \set^{(t)}}\brac{\deltai - \by_i^{(t)}}$
\State $\bwtp = \bwt - \lrs \bv^{(t)}$
\State $\by^{(t+1)} = \by^{(t)} + \frac{1}{\numclients}\sum_{i \in \set^{(t)}}\brac{\deltai - \by_i^{(t)}}$
\State //State update
\For {$j \in [\numclients]$}
\State $\by_j^{(t+1)} = 
    \begin{cases}
    \deltaj \hspace{5pt} \text{ if } j \in \set^{(t)}\\
    \by_j^{(t)} \hspace{5pt} \text{ otherwise }
    \end{cases}$
\EndFor
\EndFor



\end{algorithmic}
\end{algorithm}


Note that \texttt{\algoname} gives higher weight to current client updates as compared to previous client updates which allows it to enjoy the additional \textit{unbiased} property,
{\small
\begin{align}
\label{scala_unbias_prop}
    \Eg{\set^{(t)}}{\bv^{(t)}} = \Eg{\set^{(t)}}{\frac{1}{|\set^{(t)}|}\sum_{i \in \set^{(t)}}\Delta_i^{(t)}}.
\end{align}}%
This implies that in expectation \texttt{\algoname} performs the same update as \texttt{FedAvg}. This simplifies our analysis considerably and allows us to set $\by_i^{(0)} = \mathbf{0}$ without any complications in theory or practice. We further highlight the importance of server-based \texttt{SAGA} in comparison to related work.
% We prove this formally by deriving the exact conditions on $\lrss$ and $\lrc$ for this to happen as illustrated in our theorem below.  

% \textbf{Distributed implementation}
\paragraph{Comparison with MIFA.} 
\label{mifa_comp}
Closely related to this work, \citet{gu21mifa_neurips} proposed the \texttt{MIFA} algorithm to deal with arbitrary device unavailability in FL. \texttt{MIFA} also maintains in memory the latest observed updates for each client but instead applies a \texttt{SAG}-like \citep{schmidt2017minimizing} aggregation of these updates. Unlike \texttt{\algoname}, \texttt{MIFA} assigns equal weights to both the current and previous updates, making it a biased scheme. 
% As a result, property \ref{scala_unbias_prop} does not hold for \texttt{MIFA}. 
This complicates their analysis significantly, which requires additional assumptions such as almost surely bounded gradient noise and Hessian Lipschitzness. Furthermore, due to this bias, \texttt{MIFA} requires all the clients to participate in the first round, which is unrealistic in many FL settings. We compare the performance of \texttt{\algoname} with \texttt{MIFA} in our experiments (see Section \ref{sec:experiments}) and show that \texttt{\algoname} consistently outperforms \texttt{MIFA}.


\paragraph{Comparison with SCAFFOLD.}
\texttt{SCAFFOLD} \citep{karimireddy2019scaffold} is one of the first works to identify the client drift error and it proposes the use of control variates to correct it. This requires clients to apply a \texttt{SAGA}-like variance reduction correction at \textit{every local} step. This leads to a 2x rise in communication as the clients now need to communicate both the global model as well as the global correction vector to the server. In \texttt{\algoname}, clients perform \texttt{LocalSGD} and are \textit{agnostic} to any aspect of how the variance reduction is applied at the server. This saves the cost of communicating the update to the global correction vector while maintaining the same rate of convergence as \texttt{SCAFFOLD}. 

Hence, we see that server-based SAGA variance reduction is especially suited for the federated setting. It avoids extra computation or communication at the clients (as in \texttt{SCAFFOLD}) or unrealistic client participation scenarios (as in \texttt{MIFA}).




\subsection{Convergence Analysis of \algoname}

% \GJ{I generally recommend naming all Theorems to give an idea of what the main result is. See suggested title below}
\begin{thm}[Convergence of \texttt{\algoname}]
\label{thm:\algoname}
Suppose the functions $\{ f_i \}$ satisfy Assumptions \ref{smooth_assump}, \ref{stochastic}, \ref{global_var_assum}. In each round of \texttt{\algoname}, the server randomly selects $|\set^{(t)}| = \selclients$ (out of $\numclients$) clients, for all $t$, without replacement, to perform $\tau$ steps of local SGD. If the server and client learning rates, $\lrss, \lrc$ respectively, are chosen such that $\lrss \lrc \leq \min \lcb \frac{\selclients^{3/2}}{8 L \tau \numclients}, \frac{5 \selclients}{48 \tau L}, \frac{1}{4 L \tau} \rcb$ and $\lrc \leq \frac{1}{10 L \tau}$,
% then the expected gradient norm of the global model after $T$ rounds of \texttt{\algoname} is bounded as follows:
% \ps{We can replace the last phrase, which is too verbose with the much simpler ``''}
then the iterates $\{ \bwt \}$ generated by \texttt{\algoname} satisfy
{\small
\begin{align*}
    & \min_{t \in \{0, \hdots, T-1 \}} \mbe \norm{\nabla  f(\bw^{(t)})} \leq \bigO{\frac{f(\bw^{(0)})-f^*}{\lrss \lrc \tau T}} \\
    & +
    \underbrace{\bigO{\frac{\lrss \lrc L \sigma^2}{\selclients} + \lrc^2 L^2 (\tau-1)\sigma^2}}_{{\text{stochastic sampling error}}} + \underbrace{\bigO{\lrc^2 L^2 \tau(\tau-1)\sigma_g^2}}_{\text{client drift error}},
\end{align*}}
where $f^* = \min_{\mathbf{x}} f(\mathbf{x})$.
\end{thm}


We defer the proof and the exact convergence rate of \texttt{\algoname} to our supplementary material.
We observe that \texttt{\algoname} successfully eliminates the partial participation error, while retaining the stochastic sampling error and client drift error. This is to be expected as we do not modify the \texttt{LocalSGD} procedure at the clients to control these errors.

% \begin{coro}
% \label{corro_1}
% Setting $\lrc = \frac{1}{\sqrt{T} L\tau}$ and $\lrss = \sqrt{\tau \selclients}$, \texttt{\algoname} converges to a stationary point of the global objective $f(\bw)$ at a rate given by,
% \begin{flalign*}
% &\hspace{-40pt}\min_{0\leq t \leq T-1} \E{\norm{\nabla  f(\bw^{(t)})} }\\
% &\leq \underbrace{\bigO{\frac{1}{\sqrt{\selclients \tau T}}}}_{\text{stochastic gradient error}}
% + \underbrace{\bigO{\frac{1}{T}}}_{\text{client drift error}}    
% \end{flalign*}
% \end{coro}

% \ps{Need a corollary to show the convergence rate, and largest possible $\tau$}

\paragraph{Reduction to SAGA.} Note that in the case when $\sigma=0$, $\tau=1$ and $\selclients=1$ our algorithm reduces exactly to the \texttt{SAGA} algorithm \citep{defazio2014saga}. Setting $\lrc = \frac{1}{8LN}$ and $\lrss = 1$ we get a rate of $\bigO{\frac{N}{T}}$ for non-convex loss functions. Our rate is slightly worse than the rate of $\bigO{\frac{N^{2/3}}{T}}$ obtained by \citet{reddi2016fast} because we use the \textit{same} sample $i^{(t)}$ to update both $\bw^{(t)}$ and $\by_{i^{(t)}}$. \citet{reddi2016fast} instead draw \textit{two independent} samples $i^{(t)}$ and $j^{(t)}$, where $i^{(t)}$ is used to update the model $\bw^{(t)}$ and $j^{(t)}$ is used to update $ \by_{j^{(t)}}$. For a fixed $\bw^{(t)}$, this effectively ensures independence between $\bwtp$ and $\{\by_j^{(t+1)}\}_{j=1}^N$ which we believe leads to the theoretical improvement in their convergence rates.
% \begin{figure}
%     \centering
%     \includegraphics[width=200pt]{Images/uai_draft_image.pdf}
%     \caption{Some caption}
%     \label{fig:my_label}
% \end{figure}


% Keeping this in mind, we also outline a strategy for reducing the storage cost in \texttt{\algoname} as discussed in Section 5.  
% State corollary

% \DJH{Should we also talk about a distributed way of storing $\by_i^{(t)}$ at client $i$ itself?}
\section{\clusteralgoname{} and its Convergence Analysis}
\label{sec:cluster}
% A potential drawback of implementing \texttt{\algoname} in practice is the price of maintaining a $\bigO{nd}$ memory of previous client updates at the server. Even for a powerful server this storage cost can quickly become prohibitive since both $n$ and $d$ can be large in federated settings. This leads us to understand the trade-off between storage whether \texttt{\algoname} can be made more storage-efficient while still

% \ps{Do we mention in this section that the algo is in Appendix?}
While \texttt{\algoname} successfully eliminates partial client participation variance, it does so at the expense of maintaining a $\bigO{\numclients d}$ memory of latest client updates at the server. This storage cost can quickly become prohibitive since both $\numclients$ and $d$ can be large in federated settings \citep{kairouz2019advances, reddi2020adaptive}. To remedy this, we propose \texttt{\clusteralgoname}, a novel server-based aggregation strategy to reduce partial client participation variance while being storage-efficient. 
% This leads us to question if \texttt{\algoname} can be made more storage-efficient, while still achieving a variance-reduction effect.

\texttt{\clusteralgoname} is based on the simple observation that we can reduce storage cost by partitioning our set of $\numclients$ clients into $\numclusters$ disjoint clusters and maintaining a \textit{single} state for all the clients in the same cluster. In other words, instead of maintaining $N$ states for $N$ clients, we maintain just $\numclusters$ cluster states with clients in the same cluster \textit{sharing} the same state. Assuming that there exists such a clustering of clients, our algorithm proceeds as follows. Let $c_i \in [K]$ be the cluster identity of the $i$-th client. We initialize all cluster states to zero, that is, $\by_k^{(0)} = \mathbf{0}$ for all $k \in [K]$.  Different from \texttt{\algoname}, we now use the cluster states of clients to compute $\bvt$, i.e.,
{\small
\begin{align}
    \bvt = \frac{1}{|\set^{(t)}|}\sum_{i \in \set^{(t)}}\brac{\deltai - \by_{c_i}^{(t)}} + \frac{1}{\numclients}\sum_{j=1}^\numclients \by_{c_j}^{(t)}.
\end{align}}%
We observe that $\bvt$ still enjoys the unbiased property outlined in \eqref{scala_unbias_prop} since
{\small
\begin{align}
    \Eg{\set^{(t)}}{\frac{1}{|\set^{(t)}|}\sum_{i \in \set^{(t)}}\by_{c_i}^{(t)}} = \frac{1}{N}\sum_{j=1}^N \by_{c_j}^{(t)}
\end{align}}%
% Note that we are still effectively maintaining $\numclients$ states but under the constraint that clients in the same cluster share the same state. 
% \ps{Do we need this statement? This might confuse the readers, because so far we've led them to believe that we're going to save on storage}
%  
% Note that we still have that $\bvt$ is an unbiased estimator of $\bh^{(t)}$ since 
The major algorithmic difference lies in how we update the cluster states,
{\small
\begin{align}
    \by_k^{(t+1)} = 
    \begin{cases}
        \dfrac{\sum_{ i \in \set^{(t)} \cap \mathcal{C}_k} \deltai}{|\set^{(t)} \cap \mathcal{C}_k|} & \text{ if } |\set^{(t)} \cap \mathcal{C}_k| \neq 0, \\
        \by_k^{(t)} & \text{ otherwise,}
    \end{cases}
\end{align}}%
for all $k \in [K]$. For the $k$-th cluster $\mathcal{C}_k$, the cluster state is the \textit{average} update of the participating clients that belong to cluster $k$, i.e., $\set^{(t)} \cap \mathcal{C}_k$. If this set is empty, the cluster state remains unchanged.

% \DJH{write about more complicated updating setups} 
\begin{algorithm}
\caption{\texttt{\clusteralgoname} }
\label{alg_\clusteralgoname }
\begin{algorithmic}[1]
\State \textbf{Input:} initial model $\bw^{(0)}$, server learning rate $\lrss$, client learning rate $\lrc$, local SGD steps $\tau$, $\lrs = \lrss \lrc \tau $, number of rounds $T$, number of clusters $K$, initial cluster states $\by_k^{(0)} = \mathbf{0}$ for all $k \in [K]$, cluster identities $c_i \in [K]$ for all $i \in [\numclients]$, cluster sets $\mathcal{C}_k = \{ i: c_i = k\} \text{ for all } k \in [K]$

\For {$t = 0, 1, \dots, T-1$}
\State Sample $\set^{(t)} \subseteq [\numclients]$ uniformly without replacement
\For{$i \in \set^{(t)}$}
\State $\deltai \gets \texttt{LocalSGD}(i,\bw^{(t)},\tau,\lrc)$
\EndFor
\State // At Server:
\State {\small$\bvt = \frac{1}{|\set^{(t)}|}\sum_{i \in \set^{(t)}}\brac{\deltai - \by_{c_i}^{(t)}} + \frac{1}{\numclients}\sum_{j=1}^\numclients \by_{c_j}^{(t)}$}
\State $\bw^{(t+1)} = \bw^{(t)} - \lrs \bvt$
\State //State update
\For {$k \in [\numclusters]$}
\State {\small$\by_k^{(t+1)} = 
    \begin{cases}
        \dfrac{\sum_{ i \in \set^{(t)} \cap \mathcal{C}_k} \deltai}{|\set^{(t)} \cap \mathcal{C}_k|} & \text{ if } |\set^{(t)} \cap \mathcal{C}_k| \neq 0\\
        \by_k^{(t)} & \text{ otherwise}
    \end{cases}$}
\EndFor
\EndFor
\end{algorithmic}
\end{algorithm}

Note that the dissimilarity in client data across clusters is already bounded in Assumption~\ref{global_var_assum}. Our motivation behind using a clustering approach is to utilize a tighter bound on the data dissimilarity \textit{within} a cluster. We quantify this precisely via the following assumption.

\begin{assum} 
\label{cluster_var_assump}
(Bounded cluster variance). Let $K$ be the total number of clusters and $\mathcal{C}_k$ be the set of clients belonging to the $k$-th cluster. There exists a constant $\sigma_K \geq 0$ such that the difference between the average gradient of clients in the $k$-th cluster and the local gradient of the $i$-th client in the $k$-th cluster is bounded as follows: {\small$\norm{\nabla f_i(\bw) - \frac{1}{|\mathcal{C}_{k}|}\sum_{j \in \mathcal{C}_{k}} \nabla f_j(\bw)} \leq \sigma^2_K$}, for all $k \in [K]$, for all $i \in \mathcal{C}_k$.
\end{assum}

We see that $\sigma^2_K$ acts as a measure of the efficacy of our clustering with the goal being to achieve $\sigma^2_K \ll \sigma^2_g$. In practice, there often exists metadata about clients that can be used to naturally partition clients into well-structured clusters. For instance, when training a next-word prediction model \citep{hard2018federated}, clients could be grouped by geographical location depending on the local dialect. Another example is training recommender systems for social media platforms \citep{jalalirad2019simple}  where we expect connected users to have similar interests.


Intuitively, we expect that for $\numclusters < \numclients$ we will suffer an error of $\bigO{\sigma^2_K}$ when trying to approximate a client's update by its cluster state.
This intuition is captured precisely in our convergence result for \texttt{\clusteralgoname} as stated below. 
% \DJH{also need to assume equal number of clients in each cluster}

\begin{thm}[Convergence of \texttt{\clusteralgoname}]
\label{thm:clusteralgoname}
Suppose the functions $\{ f_i \}$ satisfy Assumptions \ref{smooth_assump}, \ref{stochastic}, \ref{global_var_assum}, \ref{cluster_var_assump}. Further, suppose all the clients are partitioned into $K$ clusters, each with $r$ clients, such that $\numclients = r K$. In each round of \texttt{\clusteralgoname}, the server randomly selects $|\set^{(t)}| = \selclients$ (out of $\numclients$) clients, for all $t$, without replacement, to perform $\tau$ steps of local SGD. Further, the client learning rate $\lrc$, and the server learning rate $\lrss$ are chosen such that {\small$\lrc \leq \frac{1}{10 L \tau}$, $\lrss \lrc \leq \min \lcb \frac{\sqrt{\selclients} (1-p)}{8 L \tau}, \frac{\selclients}{16 \tau L}, \frac{1}{4 L \tau} \rcb$}, where {\small$p = \frac{\binom{\numclients - r}{\selclients}}{\binom{\numclients}{\selclients}}$}.
Then, the iterates $\{ \bwt \}_t$ generated by \texttt{\clusteralgoname} satisfy
{\small
\begin{align*}
    & \min_{t \in \{0, \hdots, T-1 \}} \mbe \norm{\nabla  f(\bw^{(t)})} \\
    & \leq \bigO{\frac{f(\bw^{(0)})-f^*}{\lrss \lrc \tau T}}+
    \underbrace{\bigO{\frac{\lrss \lrc L \sigma^2}{\selclients} + \lrc^2 L^2 (\tau-1)\sigma^2}}_{{\text{stochastic sampling error}}}\\
    & + \underbrace{\bigO{\frac{\lrss \lrc L \tau (\numclients-\selclients)\sigma_K^2}{\selclients (\numclients-1)} }}_{{\text{cluster heterogeneity error}}} + \underbrace{\bigO{\lrc^2 L^2 \tau(\tau-1)\sigma_g^2}}_{\text{client drift error}}.
\end{align*}}
\end{thm}
% \ps{And how did you get the $M-N$ in this term? I understand that this term is zero if $N=M$. But, I didn't get the linear dependence?}
We defer the proof and exact convergence rate to our supplementary material.
For $\numclusters = \numclients$ (one client per cluster) we recover the convergence rate of \texttt{\algoname} ($\sigma^2_{K=N} = 0$). On the other hand, for $\numclusters =1 $ we get back the \texttt{FedAvg} algorithm since all clients share the same state and there is no variance-reduction ($\sigma^2_{K=1} = \sigma^2_g$). Thus, we see a natural trade-off between storage and variance-reduction as we vary the number of cluster states $\numclusters$.  In practice, \texttt{\clusteralgoname} gives the server the flexibility to set $\numclusters$ based on its storage constraints.


We see that \texttt{\clusteralgoname} also allows an interesting trade-off between the server learning rate and cluster approximation error as we vary $\numclusters$. Our analysis shows the bound on the server learning rate comes from trying to control the ``staleness'' of a client's state, which measures the frequency with which a client's state is updated. In \texttt{\algoname}, a client's state is updated only when the client participates, which happens with probability $\frac{\selclients}{\numclients}$. In \texttt{\clusteralgoname} a client's state is updated as long as \textit{any} client from the same cluster participates, which dramatically reduces staleness. However this comes at the cost of the additional cluster heterogeneity error implying a trade-off between convergence speed and error floor.
 




% \input{Sections/Analysis}
% \input{Sections/Experiments}
\section{Experiments}
\label{sec:experiments}

\subsection{Experimental Setup}
\begin{figure*}[ht]
 \centering
    \subfloat[]{\includegraphics[width=0.33\linewidth]{Images/lenet_train_loss.pdf}\label{fig:pow_dme}}
    \subfloat[]{\includegraphics[width=0.33\linewidth]{Images/resnet_train_loss.pdf}\label{fig:kme_dme}}
    \subfloat[]{\includegraphics[width=0.33\linewidth]{Images/shake_train_loss.pdf}\label{fig:log_dme}}
    \\
    \subfloat[]{\includegraphics[width=0.33\linewidth]{Images/lenet_test_acc.pdf}\label{fig:pow_loss}}
    \subfloat[]{\includegraphics[width=0.33\linewidth]{Images/resnet_test_acc.pdf}\label{fig:kme_loss}}
    \subfloat[]{\includegraphics[width=0.33\linewidth]{Images/shake_test_acc.pdf}\label{fig:log_loss}}
    \caption{Experimental Results showing Training Loss and Test Accuracy for: CIFAR-10 on LeNet-5 (a,d), CIFAR-10 on ResNet-18 (b,e), Shakespeare on RNN (c,f). For \texttt{\clusteralgoname} we keep $K = 55$ for CIFAR-10 experiments (4.5x storage reduction) and $K=36$ for Shakespeare experiments (30x storage reduction).  \texttt{\algoname} outperforms baselines in all cases while \texttt{\clusteralgoname} outperforms baselines in most cases. We see greater empirical benefits for CIFAR-10 experiments due to the higher data-heterogeneity across clients.}
    \label{fig:expts}
\end{figure*}

To support our theoretical findings we evaluate our proposed algorithms on the following FL tasks: i) image classification on CIFAR-10 \citep{krizhevsky2009cifar} with LeNet-5 \citep{lecun2015lenet}, ii)
image classification on CIFAR-10 with ResNet-18 \citep{he2016deep}, and iii) next character prediction on Shakespeare \citep{caldas2018leaf} with an RNN model.
% Our setups incorporate both natural vision and language modelling across different model architectures to showcase the broad applicability of our algorithms. 
In all setups, we compare the performance of our algorithms with \texttt{FedAvg}, \texttt{MIFA} \citep{gu21mifa_neurips} and \texttt{SCAFFOLD} \citep{karimireddy2019scaffold} (see Section \ref{subsec:proposed \algoname} for discussion of the algorithms).
We briefly describe the datasets and the natural clustering of clients that we utilize in these datasets. 

\textbf{CIFAR-10.} The CIFAR-10 dataset is a natural image dataset consisting of 60000 32x32 colour images, with each
image assigned to one of 10 classes (6000 images per class). We create a federated non-iid split of the CIFAR-10 dataset among 250 clients using a similar procedure as \citep{mcmahan2017communication}. The data is first sorted by labels and
divided into 500 shards with each shard corresponding to data of a particular label. Clients are randomly assigned 2 such shards which implies each client has a data distribution corresponding to either 1 or 2 classes.  For \texttt{\clusteralgoname}, we group clients having the same data distribution in the same cluster giving us 55 unique clusters.



\textbf{Shakespeare.}
Shakespeare is a language modelling task where each client is a role from one of the plays in \textit{The Complete Works of William Shakespeare} \citep{shakespeare2014complete}. We pick clients that have lines corresponding to at least 120 characters which leaves us with 1089 unique clients. The task is to predict the next character given an input sequence of 20 characters from a client's text.  For \texttt{\clusteralgoname}, we group clients belonging to the same play in the same cluster giving us a total of 36 clusters.

\textbf{Experimental Details.} To simulate partial client participation we uniformly sample $\selclients = 5$ clients without replacement in every round for all algorithms. This gives us a participation rate of $2\%$ for CIFAR-10 experiments and $<1 \%$ for Shakespeare as seen in practice for typical FL settings \citep{kairouz2019advances}. We allow clients to perform 5 local epochs before sending their updates. We use a batch size of 64 in all experiments. 
We fix the server learning rate $\lrss$ to 1 and tune the client learning rate $\lrc$ over the grid $\{10^{-1},10^{-1.5},10^{-2},10^{-2.5}, 10^{-3}\}$ for all algorithms. For ResNet-18 we replace the batch normalization layers by group normalization \citep{hsieh2020non}. Our Shakespeare RNN is a single-layer Gated Recurrent Unit (GRU) with 128 hidden units and an embedding dimension of 8. 

\subsection{Comparison with Baselines}
 
% Find the best learning rate for each algo, keep the server learning to be 1. Comment on batch size?? Average across 3 random seeds.





% \begin{center}
% \begin{tabular}{ | c | c | c | } 
%   \hline
%   cell1 dummy text dummy text dummy text& cell2 & cell3 \\ 
%   \hline
%   cell1 dummy text dummy text dummy text & cell5 & cell6 \\ 
%   \hline
%   cell7 & cell8 & cell9 \\ 
%   \hline
% \end{tabular}
% \end{center}




Our experiments clearly demonstrate that our proposed algorithms consistently outperform other baselines without requiring additional communication or computation at clients. \texttt{\clusteralgoname} closely matches the performance of \texttt{\algoname} in all experiments thereby highlighting the practical gains of clustering-based storage reduction. For instance, to achieve 50\% test accuracy on CIFAR-10 classification with LeNet-5 our algorithms take less than 536 rounds while \texttt{FedAvg} takes 1158 rounds giving us up to 2.1x speedup. 
% \texttt{\algoname} achieves up to 2.1x speedup over \texttt{FedAvg} in CIFAR-10 experiments and 1.1x speedup for Shakespeare to achieve 50 \% test accuracy. 
The benefits are especially pronounced for CIFAR-10 as the artificial data partitioning leads to greater heterogeneity across clients thereby accentuating the effect of partial participation. 


Our algorithms also outperform competing variance-reduction methods \texttt{MIFA} and \texttt{SCAFFOLD} in all experiments. The performance of \texttt{MIFA} is severely affected by its bias in the initial rounds of training since we do not assume that all clients participate in the first round of training. This again highlights the practical usefulness of the unbiased variance-reduction applied in \texttt{\algoname} and \texttt{\clusteralgoname}. 
While theoretically appealing, we find that modifying the \texttt{LocalSGD} procedure using \texttt{SCAFFOLD} to mitigate client drift actually hurts performance in practical FL settings. Our findings are consistent with those of \citet{reddi2020adaptive}
and make the case for reducing client drift using carefully tuned local learning rates while focusing on server-based optimization techniques to reduce variance. 
% Hence our results comprehensively support our observations that mitigating partial client participation error using server-based variance reduction techniques \algoname and \clusteralgoname can lead to significant speedup in FL settings. 

% With full client participation we observe that \texttt{MIFA} performs similarly. Something about \texttt{SCAFFOLD}. Our observation for \texttt{SCAFFOLD} is consistent with those found in \citep{reddi2020adaptive}

% \texttt{\algoname} and \texttt{\clusteralgoname} outperform others across a suite of tasks. \texttt{\algoname} significantly outperforms competing methods in

% Say something about \texttt{\clusteralgoname} doing better initially. variance reduction doesn't usually work??
% the practically relevant massively distributed scenario. We report the performance of \texttt{\algoname} on CIFAR-10 and CIFAR-100 with moderate and large number of devices in Table 1, while keeping
% the participation level constant 10 and the data amounts balanced. Specifically, the moderately distributed setting has 100 devices with 500 images per device. The massively distributed setting has
% 1000 devices with 50 images per device for CIFAR-10, as well as 500 devices with 100 images per
% device for CIFAR-100. In each distributed setting, the data is partitioned in both IID and non-IID
% (Dirichlet 0.3) fashion. \texttt{\algoname} leads to substantial transmission reduction in each of the regimes. First, the communication saving in the massive setting is significantly larger relative to the moderate
% setting. Compared to \texttt{SCAFFOLD}, \texttt{\algoname} leads to 4.8× and 2.9× gains respectively on CIFAR-10
% IID setting. SCAFFOLD is not able to achieve 80
% within 2000 rounds in the massive setting (shown
% in Figure 4a), thus actual saving is more than 4.8×. Similar trend is observed in the non-IID setting of CIFAR-10 and CIFAR-100. Second, all the methods require more communications to achieve
% a reasonable accuracy in the massive setting as the dataset is more decentralized. For instance, it takes \texttt{\algoname} 637 rounds to achieve 84.5 with 100 devices, while it takes 840 rounds to achieve
% 80.0 with 1000 devices. Similar trend is observed for CIFAR-100 and other methods. \texttt{\algoname}
% always achieves the target accuracy with fewer rounds and thus leads to significant saving. Third, a
% higher target accuracy may result in a greater saving. For instance, the saving relative to \texttt{SCAFFOLD} increases from 3× to 4.8× in the CIFAR-10 IID massive setting. We may attribute this to the fact that \texttt{\algoname} aligns device functions to global loss and efficiently optimizes the problem.




%

% \begin{figure*}[h]
%      \centering
%      \begin{subfigure}[c]{0.33\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{Images/lenet_acc.png}
%          \caption{LeNet on CIFAR-10}
%         %  \label{fig:y equals x}
%      \end{subfigure}
%      \hfill
%      \begin{subfigure}[c]{0.33\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{Images/resnet_acc.png}
%         \caption{ResNet on CIFAR 10}
%         \label{fig:case_study_dme}
%      \end{subfigure}
%      \hfill
%       \begin{subfigure}[c]{0.33\textwidth}
%          \centering
%          \includegraphics[width=\textwidth]{Images/shake_acc.png}
%         \caption{RNN on Shakespeare}
%         \label{fig:case_study_dme}
%      \end{subfigure}
%      \hfill
% \caption{
% % Simulated results for our objective defined in \Cref{obj:case study}, where $\be_i \sim \mathcal{N} (0,\mathbf{I}_d)$, $n=15$ and $ d=1000$. We set $\eta = 0.1$ and $k/d = 0.1$. We compare with more sophisticated schemes than Rand-$k$-Temporal such as Rand-$k$(Wangni) and \citep{wangni2018gradient} and Induced Compressor(Horváth) \citep{horvath2020better} that take into account coordinate magnitude during sparsification. \textbf{(a)} We see that  Rand-$k$-Temporal effectively converges to the true optima while other sparsification schemes are unable to do so. \textbf{(b)} The mean estimation error for Rand-$k$-Temporal decreases sharply as iterations proceed while other sparsification schemes continue to have a high error floor.
% }
% \end{figure*}

% \twocolumn

% \newpage




% \input{Sections/RelatedWork}
\section{Related Work}   

\textbf{Convergence Analysis of \texttt{FedAvg}:}
The original \texttt{FedAvg} \citep{mcmahan2017communication} work inspired a rich line of work trying to analyze \texttt{FedAvg} in various settings  \citep{khaled2020tighter, yu2018parallel,li2019on}. The convergence results closest to our setting are found in \citep{wang2020tackling, karimireddy2019scaffold, yang2021achieving} that analyze \texttt{FedAvg} in the presence of non-iid data as well as partial client participation for non-convex objectives.
We refer readers to \citep{kairouz2019advances, wang2021field} for a comprehensive review of convergence results in FL. 
% \citep{charles2021large} conduct empirical experiments to understand the effect of cohort size in FL. 

\paragraph{Variance Reduction.} Since the inception of SAG \citep{schmidt2017minimizing} and SAGA \citep{defazio2014saga}, several variance-reduction methods for centralized stochastic problems have been proposed that do not require additional storage. We divide these works into two broad categories and discuss applying them in a federated context to reduce partial client participation variance. 

1) \textbf{SVRG-style Variance Reduction.} SVRG \citep{johnson2013accelerating} and related methods like SCSG \citep{lei2017non}, SARAH \citep{nguyen2017sarah}, and SPIDER \citep{fang2018spider} trade off storage with computation and need to compute the full (or a large-minibatch) gradient at regular intervals. While these methods achieve theoretically better rates than SAGA, applying them in a federated context would require \textit{all} clients to participate in some rounds of training which we believe is unrealistic.
 
 2) \textbf{Momentum-based Variance Reduction.} A recent line of work explores the connection between SGD with momentum and variance-reduction and proposes new algorithms STORM \citep{cutkosky2019momentum} and HybridSARAH \citep{tran2019hybrid}, that do not require full-batch gradient computation at any iteration. This has inspired federated counterparts \citep{das2020faster, khanduri2021stem, li2021zerosarah}. \citet{das2020faster} and \citet{li2021zerosarah} propose to use such approaches to reduce client participation variance. However, there are two drawbacks. First, the central server needs to communicate two sets of global models $\bw^{(t)}$ and $\bw^{(t-1)}$ to the participating clients, doubling server to client communication. Second, participating clients need to run local SGD for both sets of global models, thereby doubling computation. Again, while theoretically attractive, we believe such approaches are not suitable for practical FL settings.

% However, utilizing these methods to mitigate the partial client participation has additional challenges.

% As discussed earlier, these methods require computation of the full gradients at regular intervals. Adapting this idea for partial client participation would require \textit{all} clients to participate in a single round, which we believe is too stringent.

% \citep{cutkosky2019momentum} and HybridSARAH \citep{tran2019hybrid} are recently proposed methods that do not require full gradient computation. However applying such techniques in a federated setting would require clients to compute their local updates on \textit{both} $\bw^{(t)}$ and $\bw^{(t-1)}$.
% thereby doubling computation at clients. 

% STEM \citep{khanduri2021stem}.






\textbf{Clustered Federated Learning and Variance Reduction.}
The idea of utilizing cluster structure among clients has given rise to the paradigm of \textit{clustered federated learning} \citep{ghosh2020cfl, sattler2020clustered}, where \textit{separate} global models are learned for each cluster. On the other hand, we propose to learn a \textit{single} global model and use the cluster structure for reducing the variance arising due to partial client participation. A similar idea of sharing gradient information while reducing variance has been explored in $\mathcal{N}$-SAGA \citep{hofmann2015variance}, but their focus is on a single-node centralized setting and the analysis is restricted to strongly convex functions. An interesting direction for future work is to linearly combine a client's previous state with its cluster state to reduce staleness, as done by \citet{allen2016exploiting}.



% \paragraph{Non-Uniform Client Sampling.}
% An orthogonal line of work  \citep{cho2020client, chen2020optimal} considers biased client sampling where clients with higher training losses are picked more frequently to encourage faster convergence. We believe such schemes can be combined with variance reduction for greater empirical benefits.

% \paragraph{Client Subsampling} An orthogonal approach


% \newpage
% \input{Sections/Conclusion}
\section{Conclusion}
We consider the problem of eliminating variance arising due to partial client participation in large-scale FL systems. We first show that partial participation variance dominates the convergence rate of \texttt{FedAvg} for smooth non-convex loss functions. 
We propose \texttt{\algoname}, a novel aggregation strategy applied at the server to completely eliminate this variance without requiring any additional computation or communication at the clients. Next, we propose a more practical clustering-based strategy, \texttt{\clusteralgoname}, that reduces variance while being storage-efficient. Our theoretical findings are comprehensively supported by our experimental results, which show that our proposed algorithms consistently outperform existing baselines.

% Citations should include the author's last name and year.
% They should be part of the sentence.
% An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
% An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
% Do not use a parenthetical citation where a textual one is appropriate.
% An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

% We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
% The reference style you use should be compatible with the author-year citations.
% Both the citation style and reference style used should be consistent.

% For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
% For example, writing
% “I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
% Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

% \subsubsection{Footnotes}
% You can include footnotes in your text.\footnote{
%     Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
% }
% The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
%     A footnote is material put at the foot of a page.
% }
% for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.

% \section{Math}\label{sec:math}
% The class file does not load any math support package like \textsf{amsmath}\footnote{%
%   See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
% }.
% We advise using the \textsf{mathtools}\footnote{%
%   See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
% }
% package, which extends \textsf{amsmath} with fixes and even more useful commands.
% Feel free to load other support packages for symbols, theorems, etc.

% Use the \textsf{amsmath} environments for displayed equations.
% So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
% An \texttt{equation}:
% \begin{equation}\label{eq:example}
%   0 = 1 - 1.
% \end{equation}
% Two \texttt{align}'ed equations:
% \begin{align*} % no numbers with starred version
%   1 + 2 &= 3,\\
%   1 - 2 &= -1.
% \end{align*}
% Equations can also be put inline, of course.
% For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
% (Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

% The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.

% \section{Floats}\label{sec:floats}
% Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
% Please do not force them to go in the middle of a paragraph.
% They must respect the column width.

% Two-column floats are possible.
% They appear at the top of the next page, so strategic placement may be necessary.
% For an example, see Figure~\ref{fig:tikz}.
% They may not enter the margins.
% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% All material in floats should be legible and of good quality.
% So avoid very small or large text and pixelated or fuzzy lines.

% \subsection{Figures}\label{sec:figures}
% Figures should go in the \texttt{figure} environment and be centered therein.
% The caption should go below the figure.
% Use \verb|\includegraphics| for external graphics files but omit the file extension.
% Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
% Do not use \verb|\epsfig| or \verb|\psfig|.
% If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
% For example, see Figure~\ref{fig:Eindhoven}.
% \begin{figure}
%   \centering
%   \includegraphics[width=0.7\linewidth,page=3]{Eindhoven}
%   \caption{A View of a Nice City.}\label{fig:Eindhoven}
% \end{figure}

% Do not use \verb|\graphicspath|.
% If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

% \subsection{Tables}\label{sec:tables}
% Tables should go in the \texttt{table} environment and be centered therein.
% The caption should go above the table and be in title caps.
% For an example, see Table~\ref{tab:data}.
% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \subsection{Algorithms}\label{sec:algorithms}
% You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
% Use the environment defined in the package to create a centered float with an algorithm inside.





% \begin{contributions} % will be removed in pdf for initial submission,
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions.
%     This is a nice way of making clear who did what and to give proper credit.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

\begin{acknowledgements} 
This research was generously supported in part by the NSF Award (CNS-2112471), the NSF CAREER Award (CCF-2045694), and the David H. Barakat and LaVerne Owen-Barakat College of Engineering Dean's Fellowship at Carnegie Mellon University.
\end{acknowledgements}

%\newpage

\bibliography{uai2022-template}

\newpage


\appendix
% NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% \section{Math font exposition}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

% \onecolumn
% \section{You \emph{can} have an appendix here.}
% \input{Appendix}

\end{document}
