\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
 % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; the optional <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{multirow}

\usepackage{mathtools, nccmath, textcomp} 
\usepackage{xcolor} %bbb
\usepackage{amsfonts} %bbb
\usepackage{macros} %bbb

\usepackage{subcaption}
\usepackage{makecell}
\usepackage{enumitem}
\usepackage[textsize=scriptsize, textwidth=0.5in]{todonotes}
% Inline review-comment macros: each typesets its argument in a distinct colour
% behind a one-letter author tag ("B:" or "C:").
\newcommand{\psa}[1]{\textcolor{green}{B: #1}} %bbb answer ps s
\newcommand{\psq}[1]{\textcolor{red}{B: #1}} %bbb followup ps s
\newcommand{\ba}[1]{\textcolor{purple}{B: #1}}
\newcommand{\carlee}[1]{\textcolor{magenta}{C: #1}}
% Margin-note variant built on the \todo command of the todonotes package.
\newcommand{\tdcarlee}[1]{\todo[color=blue!10, linecolor=black!50]{\textbf{C}: #1}}
% \camera wraps camera-ready changes. The active definition passes its argument
% through unchanged; the commented-out one would colour those changes blue.
% \newcommand{\camera}[1]{\textcolor{blue}{#1}} %camera ready updates
\newcommand{\camera}[1]{#1}

% ulem is loaded once; `normalem` keeps \emph as italics instead of underlining.
\usepackage[normalem]{ulem}
% Algorithm float and the algorithmicx layouts used by the algorithmic blocks.
\usepackage{algorithm, algorithmicx, algcompatible}
\usepackage{algpseudocode} %bbb
% Set the per-level indentation inside algorithmic environments to 1em.
\algrenewcommand\algorithmicindent{1em}

\usepackage{stfloats}
\usepackage{hyperref, tabularx}
\usepackage[capitalise]{cleveref}
\makeatletter
% \multiline{<text>}: wraps a long algorithmic statement in a top-aligned
% tabularx whose single X column is \linewidth minus algorithmicx's internal
% \ALG@thistlm length (presumably the current indentation -- confirm against
% the algorithmicx sources), so that wrapped lines stay inside the statement.
% The trailing % signs stop the line ends from adding space tokens to the
% replacement text.
\newcommand{\multiline}[1]{%
  \begin{tabularx}{\dimexpr\linewidth-\ALG@thistlm}[t]{@{}X@{}}%
    #1%
  \end{tabularx}%
}
\makeatother

\makeatletter
% Define \theHALG@line as <algorithm number>.<line number>; hyperref consults
% \theH<counter> when naming link anchors, so this is presumably meant to keep
% the anchors of algorithmic line numbers distinct across algorithms.
% \newcounter creates \theHALG@line first so that \renewcommand can override
% it. The former trailing \providecommand was dead code: the macro is already
% defined at that point, so \providecommand did nothing.
\newcounter{HALG@line}
\renewcommand{\theHALG@line}{\thealgorithm.\arabic{ALG@line}}
\makeatother
% \algnewcommand{\Initialize}[1]{%
%   \State \textbf{Initialize:}
%   \Statex \hspace*{\raggedright #1}
% }
% %{\algorithmicindent}\parbox[t]{.8\linewidth}
% \Initialize{<text>}: starts a new algorithmic statement (\State) that reads
% "Initialize:" in bold, followed by <text>.
\algnewcommand{\Initialize}[1]{%
  \State \textbf{Initialize:} #1}

\title{FedAST: Federated Asynchronous Simultaneous Training}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<baskin@andrew.cmu.edu>?Subject=About FedAST paper in UAI 2024}{Baris~Askin}{}}
% \author[1]{Baris~Askin}
\author[1]{Pranay~Sharma}
\author[1]{Carlee~Joe-Wong}
\author[1]{Gauri~Joshi}
% Add affiliations after the authors
\affil[1]{%
    Carnegie Mellon University\\
    Pittsburgh, Pennsylvania, USA
}
  % NOTE(review): hyperref is already loaded by \usepackage{hyperref, tabularx}
  % earlier in this preamble, and \PassOptionsToPackage only affects packages
  % loaded after it, so this line looks like a no-op. If the option is wanted,
  % \hypersetup{hypertexnames=false} is presumably the way to set it -- confirm.
  \PassOptionsToPackage{hypertexnames=false}{hyperref}
  \begin{document}
\maketitle

\begin{abstract}
%\GJ{Algorithm name suggestion: FedAST -- federated simultaneous training} \ba{I changed all algorithm names: FedAMT -> FedAST. Also, instead of "multi model", we may use federated simultaneous training (FST). So, I changed baseline abbreviations accordingly as well.}
Federated Learning (FL) enables edge devices or \textit{clients} to collaboratively train machine learning (ML) models without sharing their private data. Much of the existing work in FL focuses on efficiently learning a model for a single task. In this paper, we study \textit{simultaneous training} of multiple FL models using a common set of clients. The few existing simultaneous training methods employ synchronous aggregation of client updates, which can cause significant delays because large models and/or slow clients can bottleneck the aggregation. On the other hand, a na\"ive asynchronous aggregation is adversely affected by stale client updates. We propose \(\nameofthealgorithm\), a buffered asynchronous federated simultaneous training algorithm that overcomes bottlenecks from slow models and adaptively allocates client resources across heterogeneous tasks. We provide theoretical convergence guarantees of \(\nameofthealgorithm\) for smooth non-convex objective functions. Extensive experiments over multiple real-world datasets demonstrate that our proposed method outperforms existing simultaneous FL approaches, achieving up to $46.0\%$ reduction in time to train multiple tasks to completion. 
\end{abstract}

\section{Introduction}\label{sec:intro}
Federated Learning (FL) is a distributed learning paradigm where edge devices or \textit{clients} collaboratively train machine learning (ML) models using privately held local data \citep{fedavg, kairouz2021advances}. Clients iteratively update their local models, which are periodically sent to a central server for aggregation. The aggregated model is then sent to the clients to begin the next round of local updates. 
% In FL, a global model is trained on local datasets collected by clients via successive communication rounds between the clients and a central server. The server orchestrates local training at the clients and periodically aggregates their updates to form a common global model.
% \carlee{emphasize that this is iterative--e.g., ``Clients iteratively update local models, which are periodically sent to a central server for aggregation; the aggregated model is then used to begin the next round of local client updates.''}
% \paragraph{Limitation of Single-Model FL}
Since its introduction by \citet{fedavg}, various practical and theoretical aspects of FL, including client selection \citep{clientSelectionInHet, powerofchoice}\camera{, communication challenges \citep{after_review1,after_review2},} and scalability and fast training \citep{AsyncFL, fednova}, have been extensively studied.
%communication efficiency \citep{improvingComm},
%efficient aggregation of client updates \citep{fednova, fedvarp}
% \carlee{what does ``efficient'' mean? that sounds more like compression or over-the-air updates, which is not what the papers you have cited work on}
%privacy concerns \citep{feddiffpriv, secureAggr}.
% \carlee{Why are all of these challenges relevant to this work? I don't think we need to talk about specific single-model FL challenges unless we later solve them for the MMFL case. E.g., privacy and efficient aggregation aren't mentioned in the paper again}
% \ps{Please list a few more and cite 1-2 references for each?}
However, these works almost exclusively assume that the server aims to learn model(s) for a \textit{single task}. Some FL frameworks attempt to learn models personalized to each client~\citep{mansour2020three_FL, li2021ditto_FL, tan2022personalized_FL}, but these models are still intended for the same learning task, e.g., next-word prediction on keyboards.
% Although personalization in FL \citep{mansour2020three_FL, li2021ditto_FL, tan2022personalized_FL} has received significant attention, the goal is still to learn client-specific models for a single machine learning task, for example, next-word prediction on smart keyboards.% specialized towards user's writing style. %\carlee{explain/define a task. Or give an example so people know what you mean.}
%
% Single-model federated learning (FL), where the server aims to train one model, has been extensively discussed in the literature since the introduction of , Federated Averaging (FedAvg), by \citep{fedavg}. `Many works seek to optimize/analyze/study various aspects of single-model FL, including client selection schemes \citep{clientSelectionInHet} and optimal aggregation methods \citep{fednova}. %\carlee{I don't know if ``problems'' is the right terminology: maybe say that ``Many works seek to optimize/analyze/study various aspects of single-model FL, including...'' and then list aspects that we also consider for multi-model.} %\ps{We should call them problems of FL, rather than FedAvg, since they've been studied in more general context as well. Also, client selection or optimal aggregation or sync vs async are problems, while theory is not a problem per se, it's a means to better understand the underlying problem. Does that make sense? \psq{It totally makes sense, changing the wording.}}

Many practical applications need devices to perform a wide range of learning tasks, which require training of multiple ML models. For instance, our phones need language models for keyboard next-word prediction as well as image recommendation models to highlight images more likely to be shared \citep{fedavg}. \camera{\citet{mmcars} propose training multiple models in federated smart car networks for different tasks, such as pothole detection and maneuver prediction. Another example is a chat application that concurrently requires speech recognition and response text generation models, while \citet{airquality} suggest federated learning of multiple models for air quality index forecasting.} Thus, in this paper, we seek to answer the following question:
% \vspace{-5}
\begin{center}
\emph{How can we efficiently train models for multiple tasks in a federated setting using a shared pool of clients?}
\end{center}
% \vspace{-5}
% \ps{Maybe we can say ``models for multiple tasks'' rather than ``multiple models'' to distinguish from the personalization work we mention above?} \GJ{Edited}

\paragraph{Simple Solutions that Extend FedAvg.} 
% \sout{Let us look at some initial solution approaches that directly extend the standard federated averaging (FedAvg) algorithm employed in single-model training.} 
A na\"ive approach to training multiple models is \textit{sequential training}, where the models corresponding to different tasks are trained one at a time, each utilizing all the clients. The total training runtime of this approach scales linearly with the number of tasks. An alternative is for all the clients to train all tasks at the same time. However, with this approach each client will have to keep all models in memory, which is infeasible for resource-limited edge clients such as smartphones. To preserve memory, clients will have to queue the training requests and process them sequentially, again resulting in the runtime linearly increasing with the number of tasks.
%\sout{An important fact tthat underlines the inefficiency of sequential training is that increasing the number of participating clients brings diminishing marginal benefits for the speed of achieving the target accuracy (e.g., doubling the number of clients does not halve the number of communication rounds required). \ps{Maybe what we want to say is ``increasing the number of clients decreases the number of communication rounds required. However, each round also takes longer, since the server needs to wait for the slowest client.''} \carlee{citation here?}}
On the other hand, \textit{parallel} or \textit{simultaneous training} (ST) of all the models with \camera{time-varying subsets}
% a subset
of clients assigned to each task can strike a better trade-off between accuracy and runtime. 
%\sout{The simplest approach to perform simultaneous training is to partition the clients into disjoint subsets and assign one subset to each model. The model for each task is then trained solely using the assigned subset of clients. However, due to potentially high heterogeneity across the clients' local data, each model will not see diverse data and the trained global models may not generalize well to all clients. An alternative approach that allows each task to see data from the entire client pool proposed in recent work}
%\ps{I have crossed out this previous approach, because this approach is obviously bad, so I don't expect a reviewer asking about it.}\ba{In addition to what I said about the reason to add this parts in the office, showing that multi-model is not trivial, we discuss/compare them in later parts of the text. We should remove those parts if we want to remove here.}
\citet{BhuyanMM}'s approach assigns a disjoint subset of clients to each model in each round, which significantly improves the time taken to reach a target accuracy as compared to sequential training. However, these federated simultaneous training (FST) approaches leave room for significant improvement. There are two particular drawbacks: 1) \textit{straggler delays} due to synchronous aggregation, and 2) the \textit{lack of adaptation} to 
%the model complexity and 
the training progress of heterogeneous tasks,  which we address in this work. 

\paragraph{Synchronous Aggregation and Straggler Delays.} Conventional FL employs synchronous aggregation, where, in each round, the server waits to receive updates from all the participating clients before aggregating them. However, when the clients have diverse hardware and communication capabilities, faster clients must remain idle until slow or \textit{straggling} clients finish, resulting in a large wall-clock runtime for each communication round. This problem is further exacerbated in FL with multiple simultaneous models \citep{BhuyanMM, MM_bobs}, where the aggregation is synchronized across tasks as well. Therefore, the server has to wait for the slowest client across \textit{all} the parallel tasks. Solutions proposed to alleviate the straggler problem in the single-model context include allowing faster clients to run more local steps \citep{fednova}, aggregating only the client updates that arrive before a timeout \citep{fedsysdesign}, and sub-sampling from the set of available clients \citep{optimalClientSampling}. Although these approaches perform well when stragglers appear uniformly at random, they do not work well in the simultaneous training setting because some models (e.g., larger ones) are naturally slower to train. When the multiple models have inherently different training times, synchronized global aggregation rounds are bottlenecked by the slowest client assigned to the most computationally intensive model, leading to large idle times.

\paragraph{Asynchronous Aggregation and Staleness Issues.} Another solution to the straggler problem is asynchronous aggregation at the server, as proposed in AsyncFL \citep{AsyncFL}, where the server updates the global model whenever it receives any client update. While asynchronous aggregation has been extensively studied in single-model federated learning \citep{chen2020asynchronous, wang2022asynchronous, xu2023asynchronous,after_review3}, it has not been well-explored for simultaneous federated training. Although AsyncFL addresses the straggler issue, it suffers from undesired \textit{staleness} even in the standard FL setting, since the received client updates are often based on outdated 
%global
models. To alleviate the staleness problem in single-model FL, \citet{fedbuff} proposed storing the incoming client updates in a buffer at the server and aggregating when the buffer is full. 
% A large buffer size reduces staleness but delays the aggregation of the global model. 

%Training multiple models, asynchronous simultaneous training presents unique challenges as it allocates limited resources across tasks whose processes are not synchronized, unlike in synchronized FST. \GJ{I don't understand the point being made in the above sentence. Why is this a unique challenge?} Furthermore, 
 
\paragraph{Adaptive Allocation of Clients to Heterogeneous Tasks.} In this work, we employ asynchronous buffered aggregation to overcome the straggler issue while controlling staleness.
% \carlee{while controlling staleness.} % without suffering from the staleness problem. 
However, extending single-model FL algorithms \citep{AsyncFL, fedbuff} to the simultaneous training of multiple models is not straightforward---running multiple independent instances of asynchronous FL can be suboptimal. This is because the tasks can have heterogeneous computational complexities and different levels of data heterogeneity, which affect both the number of rounds required to achieve a given target accuracy and the wall-clock time taken to complete each round. Since a shared set of clients is used to train the models, the training processes are coupled---assigning more resources to one task leaves fewer for the others. Moreover, the optimal resource requirement for each task can change over time according to its data heterogeneity and training progress and may be difficult to predict before training.
% \carlee{and may be difficult to predict before training}. 
% \ps{Previous two sentences motivate non-uniform distribution of clients across tasks, but not dynamic allocation.} \GJ{Edited. PTAL} 
Therefore, we propose an adaptive algorithm that \textit{dynamically} reallocates clients across tasks depending on their training progress, and also adapts the buffer size used for asynchronous aggregation of updates.


\paragraph{Our Contributions.}
We formalize the FST setting in Section~\ref{sec:formulation} and then make the following main contributions:
\begin{itemize}[leftmargin=*]
    \item We introduce  $\nameofthealgorithm$, a \underline{Fed}erated \underline{A}synchronous \underline{S}imultaneous \underline{T}raining algorithm\footnote{\camera{Our code is provided at \camera{\url{https://github.com/askinb/FedAST}}.}} to simultaneously train models for multiple tasks (Section~\ref{sec:alg_conv}). Our work is one of the first to mitigate the straggler problem faced by synchronous FST methods that extend vanilla FedAvg. 
    \item The proposed algorithm addresses the problem of balancing resources across heterogeneous tasks, a unique challenge to the FST framework, using novel dynamic client allocation, and it also dynamically adjusts the buffer size used in asynchronous aggregation to strike the best trade-off between staleness and runtime. 
    \item We provide a theoretical convergence analysis of $\nameofthealgorithm$ (Section~\ref{sec:convergence}), which improves upon previous analyses even in the single-model FL setting. It improves upon \citet{sharper} by considering multiple local updates and the buffer, and upon \citet{fedbuff} by relaxing their restrictive assumptions.
    \item We experimentally validate $\nameofthealgorithm$'s performance (Section~\ref{sec:exp}) in terms of its wall-clock training time and model accuracy on multiple real-world datasets compared to synchronous and asynchronous FL baselines.
\end{itemize}
% \carlee{We conclude and discuss future work in Section~\ref{sec:conclusion}.}
We conclude and discuss future work in Section~\ref{sec:conclusion}.

\paragraph{Related Work.}
Only a few recent works \citep{MM_bobs, MM_ucb, marieMM, BhuyanMM} consider federated simultaneous training of multiple models.
\citet{MM_bobs} select clients using either Bayesian optimization or reinforcement learning to minimize training time and unfairness in participation. \citet{MM_ucb} formulate client assignment in FL as a bandit problem, leveraging local training losses as scores. \citet{marieMM} introduce biased client sampling, favoring the clients with higher local losses. These methods lack convergence guarantees. \citet{BhuyanMM} assign clients uniformly at random or in a round-robin fashion and analyze the convergence assuming convex objective functions and bounded gradients. While these works only consider synchronous aggregation, \citet{asyncMM} propose a fully asynchronous FST algorithm. Their approach entails solving a non-convex optimization problem to optimize client assignment, which requires information about delays and models that may be difficult to obtain in practice. Also, their convergence bound does not guarantee convergence to a stationary point
in the presence of data heterogeneity
and suffers from increased staleness when the number of clients increases. \camera{Lastly, \citet{asfl_paper} propose an extension of their single-model adaptive asynchronous approach to the multi-model setting. However, they do not carefully handle heterogeneous data distributions across clients and the staleness of updates in their theoretical guarantees for a single model, and they lack these guarantees for multiple models. Also, their method under-utilizes client resources, since after sending a model update to the server, the clients are idle until the next training round.}
%the clients sending an update wait idly until the next round in their algorithm.}


\section{PROBLEM FORMULATION}\label{sec:formulation}
%%%
\paragraph{Notations.}
For a positive integer $c$, we define $[c] \triangleq \{1,\dots,c\}$. $\widetilde{\G}$ denotes stochastic gradients. Bold lowercase letters (e.g., $\bx$) denote vectors. $|A|$ denotes the cardinality of set $A$. $\|\cdot\|$ denotes the Euclidean norm.

We now formally introduce the federated simultaneous training (FST) setting, where $\N$ clients train $\M$ models $\mathbf{x}_1, \dots, \mathbf{x}_\M$ corresponding to $\M$ independent tasks. For each task $m \in [M]$, our goal is to find the model that solves the following optimization problem:
\begin{align}
    \min_{\x\in \mathbb{R}^{d_m}} \left\{ \fglobalj(\x) := \frac{1}{N}\sum_{i=1}^{N}\fclientij(\x) \right\},
\end{align}
where $\fglobalj$ is the global loss function for task $m$, and $\fclientij$ is the local loss for task $m$ at client $i$. 
% In each communication round of the FedAvg algorithm \citep{fedavg}, only a subset of clients is available for training, that can be used to perform local model updates. The local models are then aggregated by the server to update the global model. %The overall goal is to minimize the loss of each task, subject to the limited resources available at the clients. 

%As discussed in Section~\ref{sec:intro}, perhaps the simplest approach to solve this problem is to train the models sequentially, using all the clients for a single task at any time.
% \carlee{It isn't clear here that the disjoint sets stay the same in each round for this strawman}
% Although these solutions may seem straightforward to implement in practice, the former
%However, this approach is time-inefficient.
% , while the latter could lead to models with poor generalization. 
%Therefore, we need methods specialized to FST.
% settings are needed.

First, we examine a simple extension of FedAvg \citep{fedavg} to simultaneous training of models for $\M$ tasks. At the start of each round, the server randomly partitions the available set of clients across the tasks \citep{BhuyanMM}. The server sends the current models $\{\xjt\}_{m=1}^M$ for all the tasks to the corresponding subset of clients. The clients perform local training (Algorithm~\ref{alg:cap}) and return their updates to the server, which \textit{synchronously} aggregates the updates for each task. %Note that the server has to wait for updates from all the clients across all the tasks before it can begin the next round. 
This na\"ive simultaneous training extension of FedAvg performs poorly due to stragglers. The time it takes for a client to return its updates depends on its resources and the size of the model assigned. Since the server waits for the slowest update across all the tasks before commencing the next round, the server waits much longer if a large model is assigned to a slow client. We mitigate this problem via \textit{asynchronous} training in $\nameofthealgorithm$, discussed next.
% Note that the rounds are synchronized across tasks.

%Locally, clients perform consecutive mini-batch SGD iterations using their private data. Suppose client $i$ is among the clients selected by the server to update the model corresponding to task $m$ in the $t$-th round. Starting with the global model $\xjt$, client $i$ runs $\locitj$ consecutive mini-batch SGD steps (see Algorithm~\ref{alg:cap}). \ps{Do we need this description of Local-SGD here?}

\begin{algorithm}[t]
\caption{\texttt{LocalTrain$(m$,$\locitj$,$\xjt$,$\lrcj)$} at client $i$}\label{alg:cap}
\begin{algorithmic}[1]
\State\label{alg_1_line_set_dummy}\textbf{Set} $\xijtz\gets\xjt$
\For{$k = 1,\dots,\locitj$}
    \State  $\xijtk \gets \xijtkm-\lrcj \widetilde{\nabla} \fclientij(\xijtkm)$
    % \State $k\gets k+1$
\EndFor
\State \textbf{Return} $\del_m \gets (\xjt-\xijlocitj) / (\locitj\lrcj)$
\end{algorithmic}
\end{algorithm}



%\GJ{Currently, you are just qualitatively saying that asynchronous aggregation avoids stragglers and reduces wall-clock time. This can be made more concrete by a proper definition and explanation of the expected wall-clock time per round. You could give an example with unit exponential training times per client. With synchronous aggregation, the expected wall-clock time per round is $\log (\sum_{m=1}^{M} R_m))$. Instead, with asynchronous training, the expected time per round for each model is just $1/R_m$. Not sure where exactly such an explanation should be placed.. Maybe before or inside the convergence analysis section?}
%\GJ{I just noticed that you have mentioned the above example in a footnote after the convergence analysis. Would be good to make that more prominent by bringing it into the main paper. \psa{Thanks! I brought it into the main paper. Although it doesn't reflect the arrivals in our algorithm, I think it explains the reasoning.}}
%%%
\section{ALGORITHM DESCRIPTION}\label{sec:alg_conv}
%\GJ{Why is our proposed algorithm stated inside the problem setting section? The problem formulation or problem setting section is meant to introduce the notation and state the objective that we are trying to achieve. In the context of our problem it would be describing multi-model FL concretely, talking about synchronous algorithms and then discussing the straggler issue which motivates asynchronous algorithms. Please make the FedAMT description a separate section appearing after the problem setting section.}
Next, we describe $\nameofthealgorithm$ (Algorithm~\ref{alg:main}), our proposed \underline{Fed}erated \underline{A}synchronous \underline{S}imultaneous \underline{T}raining algorithm, illustrated in Figure~\ref{fig:server_vis} for $M=2$ tasks. 
For each task $m\in [M]$, the server maintains a round index $t_m$ that is initialized to $\tj = 0$, the number of active training requests $R_m^{(t_m)}$, and buffer size $b_m^{(t_m)}$. $R_m^{(t_m)}$ and $b_m^{(t_m)}$ quantify the resources (client computation and memory) allocated to task $m$ in round $t_m$.
% gives the pseudocode of $\nameofthealgorithm$, and we explain its key components below. 
% \ps{Can we define $t_m, R_m, b_m$ in the first para?}
We provide two versions of \(\nameofthealgorithm\) based on the value of \mbox{\textit{option} \(\in\{S,D\}\)}. When \textit{option} is \(S\) (\textit{static}), the resource allocation for each task remains the same throughout the training process (i.e., $R_m^{(t_m)} \equiv R_m$ and $b_m^{(t_m)} \equiv b_m$). With \textit{option} $=D$ (\textit{dynamic}), \(\nameofthealgorithm\) dynamically reallocates resources across tasks using the \texttt{Realloc} subroutine (\Cref{alg:calcRb}).
% \tdcarlee{It is a little confusing to talk about $R_m$ and $b_m$ here, before we know what a ``training request'' is. I would explain when they are introduced that adjusting these parameters is how we do dynamic client allocation}  
%\carlee{I think we need to define $t_m$ as the round counter for task $m$}

% \sout{, that is provably convergent under the common assumptions in the literature.}
%\ps{We can remove the next two para.}
%
\begin{algorithm}[t]
\caption{\nameofthealgorithm}
\label{alg:main}
\begin{algorithmic}[1]
\State{\textbf{Input}: Client and server learning rates $\{\lrcj,\lrsj\}_{m=1}^M$, \(\textit{option} \in \{S,D\}\), no. of local updates $\{ \locitj \}_{m=1}^M$}
\Initialize{$\forall m \in [M]$: $\tj\gets 0$ (round index), model $\xjz$, buffer $\buffj \gets \emptyset$. Total no. of updates \(c\gets0\)}
\For{Models $m = 1,\dots,M$ (in parallel)\label{alg:parallel_train}}
    \State \multiline{Randomly select $\awj^{(0)}$ clients and send \hspace{5mm} \texttt{LocalTrain$(m$, $\locitj$, $\xjz$, $\lrcj)$} requests\label{algline:init}}
    \While{$\tj < \Tj$}
        \State Wait until server receives an update $\del_m$ %\ps{``Server receives an update''}
            \State $\buffj \gets \buffj \cup \{\del_m\}$, \(c\gets c+1\) \label{algline:rcvupd} 
% \If{\(c\mod c_{period} = 0\)}
    \State\label{algline:adjustRb}\(\{(R_i^{(t_i+1)}, b_i^{(t_i+1)})\}_{i=1}^M\) \(\gets \texttt{Realloc(}\textit{option},c\texttt{)}\) 
% \EndIf
        \If{$|\buffj| = \bsj^{(t_m)}$\label{algline:fullbuff}}
            \State\label{algline:aggr}$\xjtp\gets \xjt - \lrsj\lrcj\locitj
            \frac{1}{b_m^{(t_m)}}\sum_{\del\in B_m}\del$
            \State $\tj\gets \tj+1$ and $\buffj \gets \emptyset$
    \EndIf
        \State \label{algline:newjob}\multiline{%
            Select \(K^{(t_m)}_{m}\) random client(s) and send \texttt{LocalTrain}$(m, \locitj, \xjt, \lrcj)$ request(s)}
    \EndWhile
\EndFor
\State \textbf{Output:} Trained models $\{\x_{m}^{(T_m)} \}_{m=1}^M$
\end{algorithmic}
\end{algorithm}


\begin{figure}[thb]
    \centering
    % \centering already centres the graphic, so the obsolete plain-TeX
    % \centerline wrapper is not needed.
    \includegraphics[width=0.43\textwidth]{figures/scheme.pdf}
    \caption{In our proposed algorithm $\nameofthealgorithm$, the server assigns local training requests (shown as striped and orange blocks for two simultaneous tasks), which are queued at the clients and processed in a first-come-first-served manner. Completed requests are aggregated asynchronously at the server. The figure shows snapshots of the process at two different times. By adjusting the number of requests, $\nameofthealgorithm$ periodically \textit{reallocates} the resources shared across models.}
    \label{fig:server_vis}
\end{figure}

\paragraph{Assignment of Local Training Requests to Clients and Their Execution.} Consider task $m \in [M]$. 
% We use $\tj$ to denote the round index for task $m$, and initialize it to $\tj = 0$. 
The server begins by sending out $\awj^{(0)}$ local training requests for task $m$ to clients selected uniformly at random, along with the initial model $\xjz$ (Algorithm~\ref{alg:main}, Line \ref{algline:init}). The number of local training requests $\awj^{(\tj)}$ is adapted over time using the \texttt{Realloc} function (\Cref{alg:calcRb}), enabling us to dynamically reallocate client resources across tasks.
% as we will describe below in the dynamic adaptation using \texttt{Realloc}.\tdcarlee{Continuing my comment on $R_m$ and $b_m$ above, replace this with: ``enabling us to dynamically reallocate client resources across tasks with the \texttt{Realloc} function''} 
Each client processes the training request by performing $\tau_m$ local mini-batch SGD iterations (see \Cref{alg:cap}) and sends the resulting model update $\del_m$ back to the server. If a client receives multiple requests, they are queued and processed in a first-come-first-served manner.\footnote{Processing the requests in parallel would require clients to keep all the $M$ models in local memory, which can be infeasible.} Therefore, the number of \textit{active clients} (clients working on training requests) at any time might be less than the number of \textit{active training requests} (that clients are working on or are stored in their queues).
% \tdcarlee{mention client availability here if there is space, e.g., ``We further assume that some clients (distributed uniformly at random) may be unavailable, in which case they simply reject the training requests.''}

\paragraph{Buffered Asynchronous Aggregation at the Server.} The updates $\del_m$ sent by the clients are aggregated at the server in an asynchronous manner as follows. To keep staleness in check, the server maintains a buffer $B_m$ for task $m$, which stores the received client updates for model $m$ (Algorithm~\ref{alg:main}, Line \ref{algline:rcvupd}). The buffer size $b_m^{(t_m)}$ can be adapted over time (using the \texttt{Realloc} function). Whenever the server receives an update for task $m$, it randomly selects $K^{(t_m)}_{m}$ client(s) to send a new training request along with the current global model (Algorithm~\ref{alg:main}, Line \ref{algline:newjob}). As we explain below, $K^{(t_m)}_{m}=1$ for $\textit{option}=S$, whereas $K^{(t_m)}_{m} \in \{0,1,2\}$ for $\textit{option}=D$.
% $1$ new client (in the dynamic case, it may be either $0$, $1$, or $2$ clients as we will explain later below) and sends it a training request along with the current global model (Algorithm~\ref{alg:main}, Line \ref{algline:newjob}). 
% Also, the server maintains a counter \(c\) (Line \ref{algline:rcvupd}) for the total number of received updates. \GJ{What is this counter used for? Can explain that here briefly} The counter is common across all the $M$ tasks to determine when
%
% To achieve more efficient client utilization, our approach allows the number of active training requests for each task to dynamically change (Line~\ref{alg:calcRb}). The server reevaluates this allocation periodically.
When the buffer for model $m$ gets full (formally, $|\buffj| = b_m^{(t_m)}$)
% (Algorithm~\ref{alg:main}, Line \ref{algline:fullbuff}), 
the server aggregates the updates stored in the buffer to update the global model (Algorithm~\ref{alg:main}, Line \ref{algline:aggr}). 

\paragraph{Dynamic Adaptation of the Number of Active Requests and Buffer Size using \texttt{Realloc} (Algorithm~\ref{alg:calcRb}).} \label{para:calcrb}
\camera{With the \textit{static} option ($\textit{option}=S$), the \texttt{Realloc} subroutine always runs its Line~\ref{alg3_line:set_same} to maintain the initial values of $\awj$ and $\bsj$ throughout the whole training process. The resource allocation across tasks does not change over time. On the other hand, with the \textit{dynamic} option, the \texttt{Realloc} subroutine adjusts the resource allocation during the training.}
The server maintains a counter \(c\), tracking the total number of updates received across all \(M\) tasks (Algorithm~\ref{alg:main}, Line \ref{algline:rcvupd}). If $\textit{option}=D$ (\textit{dynamic}), this counter is used to periodically trigger the dynamic adaptation of the number of active training requests $\awj$ and the buffer size $b_m$ across tasks (Algorithm~\ref{alg:main}, Line~\ref{algline:adjustRb}).
Intuitively, we should allocate more clients (and consequently, more training requests $R_m$) to tasks with larger inter-client data heterogeneity. To empirically estimate this heterogeneity, the server stores the last $\nbupdvar$ ($\nbupdvar$ is a tunable parameter) updates $\del_m$ for each task $m$ (denoted $\{ \del_{m,i}\}_{i=1}^{\nbupdvar}$) and computes
\begin{align}
    \label{eq:empirical_hetero}
    \hat{\sigma}_{g,m}^2 \propto \mfrac{1}{\nbupdvar}\times\sum\nolimits_{i=1}^\nbupdvar \mfrac{\normbs{\del_{m,i}-\overline{\del_{m}}}}{\normbs{\overline{\del_{m}}}},
\end{align}
where $\overline{\del_{m}}$ is the empirical mean of the $\del_{m,i}$'s.\footnote{We normalize by $\normbs{\overline{\del_{m}}}$ to account for different model sizes since larger models often have larger unnormalized variance.} Further, in our experiments, we empirically observe that the optimal choice of buffer size $b_m$ is proportional to the number of active requests $R_m$. See Appendix~\ref{app_sect:buffer_size} for our extensive experiments. Using \eqref{eq:empirical_hetero} and these empirical observations, the optimal resource allocation emerges as the solution to the following constraints.
\begin{equation}
    \begin{aligned}
        \sum\nolimits_{i=1}^M R_i^{(t_i+1)} &= \sum\nolimits_{i=1}^MR_i^{(t_i)}, \\
        \mfrac{R_1^{(t_1+1)}}{{\hat{\sigma}_{g,1}}} &= \mfrac{R_2^{(t_2+1)}}{{\hat{\sigma}_{g,2}}} = \dots = \mfrac{R_M^{(t_M+1)}}{{\hat{\sigma}_{g,M}}},
    \end{aligned}
    \label{eq:realloc_constr}
\end{equation}
where the first set of constraints maintains the total computation budget across tasks, and the second set ensures the allocation of a larger number of training requests to tasks with higher heterogeneity. We elaborate on the theoretical motivation for the second set of constraints 
%in the next section (\ref{sec:convergence})
in Section~\ref{sec:convergence}, once we establish our convergence results. We also refer the reader to Appendix~\ref{app_sect:CalcRb} for more details on \texttt{Realloc}.


\begin{algorithm}[t]
\caption{\texttt{Realloc(}\textit{option},\(c\)\texttt{)}}\label{alg:calcRb}
\begin{algorithmic}[1]
\If{\(\textit{option}=D\) and \(c \bmod c_{period}=0\) \label{alg3_line:firstif}}
\State \(\{{\hat{\sigma}_{g,m}^2}\}_{m=1}^M \gets \texttt{EstimateVariances()}\) \label{alg3_line:estimate}
\State\label{alg3_line:proportional_dist}\multiline{Find \(\{R_m^{(t_m+1)}\}_{m=1}^M\) that solves \eqref{eq:realloc_constr}}
\State \(b_m^{(t_m+1)} \gets \big( b_m^{(t_m)} R_m^{(t_m+1)} \big)/R_m^{(t_m)}\) for all $m \in [M]$
\Else: \textbf{ for all} $m\in$ $\{i:R_i^{(t_i+1) }\text{not defined}\}$ \textbf{ do} 
\State\label{alg3_line:set_same}\( (R_m^{(t_m+1)}, b_m^{(t_m+1)})\gets(R_m^{(t_m)}, b_m^{(t_m)})\)
% \ElsIf{\(\{(R_i^{t_i+1}, b_i^{t_i+1})\}_{i=1}^M\) are not defined before}
% \State \(\{(R_i^{t_i+1}, b_i^{t_i+1})\}_{i=1}^M\gets\{(R_i^{t_i}, b_i^{t_i})\}_{i=1}^M\)
\EndIf
\State \textbf{Return} \(\{(R_m^{(t_m+1)}, b_m^{(t_m+1)})\}_{m=1}^M\)
\end{algorithmic}
\end{algorithm} 

% \GJ{The content of the following remarks is good. But I suggest making these paragraphs with titles (similar to the 'adjusting the number of active requests...' paragraph above) rather than remarks. The italic text in remark environment makes it hard to read. \ba{done!}}

%\paragraph{Queuing Requests at Clients.}
%If a client receives new local training request(s) before it has finished its prior requests, they are all queued at the client and processed in a first-come-first-served manner. Due to possible queued requests, the number of \textit{active clients} (clients actively working on local training requests) at any time might be less than the number of \textit{active training requests} (requests that clients either work on or store in their queues).

%\paragraph{Dynamic Client Allocation across Tasks.}\label{rem:c}
%The server can adjust client allocation across tasks periodically. This happens if $\text{option}=D$ (dynamic). In this case, if \(c \mod c_{period} = 0\) (where recall, $c$ is the total number of updates received at the server across tasks), \(R\) and \(b\) parameters are updated. For more details, see  
%\hyperref[para:calcrb]{below} and Algorithm~\ref{alg:calcRb}. \carlee{Isn't this discussed above wehn you talk about adjusting the number of active requests?}

\paragraph{Sending out New Requests to Reach the New Resource Allocation.} To transition from one allocation $\{ R_m^{(t_m)} \}_m$ to another $\{ R_m^{(t_m+1)} \}_m$ in an asynchronous setting, we must adjust the number of new requests that are sent out every time the server receives a client update. The number of new requests \(K_{m}^{(t_m)}\) sent out on receiving any update $\del_m$ is always $1$ in the static ($\textit{option}=S$) case since $R_m$ remains constant throughout training. In the dynamic case ($\textit{option}=D$), \(K_{m}^{(t_m)}\) can be $0$ (when $R_m^{(t_m+1)} < R_m^{(t_m)}$), $1$ (when $R_m^{(t_m+1)} = R_m^{(t_m)}$), or $2$ (when $R_m^{(t_m+1)} > R_m^{(t_m)}$). 
% \sout{By adjusting the number of new requests in this way, the number of active training requests $R_m$ for each task $m$ will gradually reach the desired new $R_m$.}
% \ba{This strategy ensures as the number of active training requests $R_m$ for each task $m$ will gradually reach the desired new $R_m$ without queue }
% \ba{This strategy ensures a smooth transition to the desired new number of active training requests $R_m$ gradually for each task since }
% \ba{This strategy 
We employ this gradual 
% and smooth 
transition to the desired new number of active training requests $\{R_m^{(t_m+1)}\}_m$ for each task instead of a sudden change in allocation to avoid possible longer queues at the clients during the transition phase.
% }
% \tdcarlee{why not do this all at once? In general, why does the number of active training requests need to equal $R$ (to limit client queuing?) \ba{yes, to limit queues, we keep total \# of requests bounded.}} % It can be $2$ or $0$ depending on whether the algorithm decides to increase or decrease the number of active local training requests respectively at Line \ref{algline:adjustRb}. % It is \(1\) if there is just one missing update (which is actually the latest received one). 

%\ps{If we choose to follow the template I suggested, description of the basic algorithm, followed by remarks explaining things in detail, this paragraph should come before the remarks. \ba{thanks for the review, I like your suggestion and edited!}}

%\GJ{The two paragraphs below seem out of place in the algorithm description section. Some of their contents can be merged into the paragraphs above}

%$\nameofthealgorithm$ solves the straggler problem by allowing asynchronous local training across clients and asynchronous aggregation of local client updates across tasks. The expected wall-clock time is reduced since the global model updates take a much shorter time. However, asynchrony causes staleness in client updates. We measure the staleness of a client for a specific task in terms of the \textit{number of global model updates at the server} between when the client receives the training request for this task and when it sends the update back to the server.

%In $\nameofthealgorithm$, the server maintains a separate \textit{buffer} for each task to keep the staleness in check. Further, owing to the cross-client data heterogeneity in FL, individual client updates can have a large variance. Since the buffer enables averaging multiple client updates, the resulting server update better captures the underlying data heterogeneity. We compare $\nameofthealgorithm$ with the \textit{buffer-less} asynchronous implementation in Figures~\ref{fig:buff_hom_acc} and \ref{fig:buff_het_acc} to empirically show the benefits of using a buffer. Further, $\nameofthealgorithm$ ensures fairness across clients by requesting local training from randomly selected clients, irrespective of their speed. 


% \begin{algorithm}
% \caption{\nameofthealgorithm{} to train $\M$ models concurrently }\label{alg:main}%\ps{Can you add line numbers to the algorithm. Then we can add line numbers in the description above.}
% \begin{algorithmic}[1]
% %\Require $J$: Number of active works, $T$: target round, $N$: \# of total clients,  $b$: Buffer size, $\tau$: \# of local SGD iterates, $\eta_s$: Client learning rate, $\eta_s$: Server learning rate\\
% \Initialize{$\tj\gets 0$. Initialize $\xjz$, choose $\awj$ random clients and put \textit{LocalTraining(j, $\locitj$, $\xjz$, $\lrcj$)} into their job queues, and $\buffj \gets \emptyset$,  $\forall j\in [M]$\label{algline:init}.}
% \While{$\exists j \: \tj \neq \Tj$}
%     Clients asynchronously run their queued jobs. \ps{Don't follow the while condition}
%     \If{\textit{Server receives an update u for task j}}
%         \State $\buffj \gets \buffj \cup \{u\}$ \label{algline:rcvupd}
%         \If{$|\buffj| = \bsj$}
%             \State $\xjtp\gets \xjt + \lrsj\times\frac{1}{\bsj}\sum_i\buffj[i]$ \label{algline:aggr}
%             \State $t^{(j)}\gets t^{(j+1)}$ and $B^j \gets \{\}$
%         \EndIf
%         \If{$t^{(j)}\neq T^{(j)}$}
%             \State  Pick a client uniformly at random and put \textit{LocalTraining($j$, $\tau^{(j)}$, $x^{(j,t)}$, $\eta^{(j)}_c$)} into its queue \label{algline:newjob}\ps{Why is this happening? \psq{Randomly selecting a new client and sending the latest model to it. The aim of the if statement is to stop the training when a task reaches its own round limit.}}
%         \EndIf
%     \EndIf
% \EndWhile
% \end{algorithmic}
% \end{algorithm}


% \ps{Algorithm~\ref{alg:main} outlines the pseudocode of $\nameofthealgorithm$. The training of each model runs simultaneously. Figure~\ref{fig:server_vis} visualizes this process for two sample tasks. Consider some $m \in [M]$. The server begins the training by sending out $\awj^{0}$ local training requests for task $m$ to clients selected uniformly randomly, along with the initial model $\xjz$ (Line \ref{algline:init}). 
% The server maintains a buffer $B_m$ for each task $m$, which stores the received client updates for model $m$ (Line \ref{algline:rcvupd}). Also, the server maintains a counter \(c\) (Line \ref{algline:rcvupd}) for the total number of received updates, across all the $M$ tasks. To achieve more efficient client utilization, our approach allows the number of active training requests for each task to dynamically change. The server reevaluates this allocation periodically. When the buffer for model $m$ is full, i.e., $|\buffj| = \bsj$ (Line \ref{algline:fullbuff}), the server aggregates the client updates in the buffer to update the global model (Line \ref{algline:aggr}). Finally, whenever the server receives an update for task $m$, it randomly selects \(K_{m}^{(t_m)}\) new client(s) and sends training request(s) along with the current global model (Line \ref{algline:newjob}).}


% When \textit{option} \(=1\), \(K_{m}^{(t_m)}\) returns one of \(0\), \(1\), or \(2\) to adjust the actual total number of active local training requests according to \(R_m^{t_m}\) distribution. It returns \(1\) if there is just one missing update (which is actually the latest received one). It can also return \(2\)/\(0\) depending on whether the algorithm decides to increase/decrease the number of active local training requests at Line \ref{algline:adjustRb}.



\section{Convergence Analysis}
\label{sec:convergence} %\carlee{This is the only subsection in Section 3, so either make another one, take this as its own section, or make this an inline heading}

In this section, we provide the convergence result
% for $M$ models trained using 
for $\nameofthealgorithm$ with the static \textit{option} (\(S\)). Since \(R_m^{(t_m)}\) and \(b_m^{(t_m)}\) are constant when \textit{option} \(=S\), we drop time indices for simplicity. The convergence with dynamic allocation (\textit{option} $=D$) can be shown with an additional assumption. We relegate this to Appendix~\ref{app_sec:opt1_conv} due to space limitations.

Next, we discuss the assumptions used in our analysis.

\begin{assump}[Smoothness]
The loss functions are $\Li$-smooth, i.e., for all $i \in [N]$, for all $m \in [M]$, and for all $\x,\y \in \mathbb{R}^{d_m}$, $\norm{\G \fclientij (\x)-\G \fclientij (\y)} \leq \Li \norm{\x-\y}$.
\label{assump:smoothness}
\end{assump}
\vspace{-3mm}
\begin{assump}[Bounded Variance]
The stochastic gradient at each client is an unbiased, bounded-variance estimator of the true local gradient, i.e., for all $\x \in \mbb R^{d_m}$, $i \in [N]$, and $m \in [M]$, $\mathbb E [\tG \fclientij (\x)] = \G \fclientij (\x)$ and \mbox{$\mathbb E \| \tG \fclientij (\x) -\G \fclientij (\x) \|^2\leq \lhetsj$.} 
\label{assump:lochet}
\end{assump}
\vspace{-3mm}
\begin{assump}[Bounded Heterogeneity]
The local gradients are within bounded distance of the global gradient, such that for all $m \in [M]$ and $\x \in \mathbb{R}^{d_m}$, $\max\limits_{i \in [N]}\norms{\G \fclientij (\x) - \G \fglobalj (\x)} \leq \ghetsj$.
\label{assump:globhet}
\end{assump}
\vspace{-3mm}
\begin{assump}[Bounded Staleness]
The client updates of task $m$ are received within at most $\rdm_m$ server model updates after the server sends the training request. 
\label{assump:maxstale}
\end{assump}

\camera{These assumptions are standard in the literature. Assumptions~\ref{assump:smoothness}--\ref{assump:globhet} are commonly used in the synchronous \citep{fednova,fedvarp} and asynchronous \citep{sharper, fedbuff} FL analyses. Assumption~\ref{assump:maxstale} is used in the convergence proof to guarantee that none of the requested client updates takes an arbitrarily large time to return to the server and is also common in asynchronous FL works \citep{sharper, fedbuff}. Furthermore, the maximum staleness can be enforced by dropping over-delayed updates in practice during the training.}

\begin{theorem}[Convergence of $\nameofthealgorithm$] \label{thm:main}
Suppose that Assumptions \ref{assump:smoothness} - \ref{assump:maxstale} hold, and there are $\awj$ active local training requests corresponding to task $m \in [M]$, and the server and client learning rates, $\{ \lrsj, \lrcj \}$ %$\{ \lrsj, \lrcj \}_{m=1}^M$
respectively, satisfy $\lrsj \leq \sqrt{\locitj\bsj}$ and $\lrcj \leq \min \{%\frac{1}{4\Li\locitj},
(6\Li\locitj\sqrt{\locitj\bsj})^{-1}, (4\Li\locitj\sqrt{\locitj\awj\rdm_m})^{-1} \}$ for all tasks $m \in [M]$. Here, $\bsj$ is the buffer size, and $\locitj$ is the number of local training steps. Then, the iterates, $\{ \{ \x_m^{(t)} \}_{t=1}^{T_m} \}_{m=1}^M$, of Algorithm \ref{alg:main} satisfy:
% {\small
\begin{equation}
    \begin{aligned}
        & \avgtelj \mbe \| \G \fglobalj (\xt) \|^2 \leq \underbrace{\bOP{\mfrac{\gapTerm_m}{T_m \lrcj \lrsj \locitj}}}_{\text{FedAvg Error - I}} \\
        & + \underbrace{\bOP{\lp \mfrac{\Li \lrcj \lrsj}{\bsj} + \Li^2 [\lrcj]^2 \locitj \rp (\lhetsj + \locitj \ghetsj)}}_{\text{FedAvg Error - II}} \\
        & + \underbrace{\bOP{\mfrac{\Li^2 [\lrsj]^2 [\lrcj]^2 \locitj\awj}{\bsj^2 } (\lhetsj + \locitj \awj\ghetsj)}}_{\text{Asynchronous Aggregation Error}},
        % & + \bOP{\frac{\Li \lrsj \lrcj}{\bsj} + \Li^2 [\lrcj]^2 \locitj \lp \locitj - 1 \rp + \frac{\Li^2 [\lrsj]^2 \awj^2}{\bsj^2}} .
    \end{aligned}
    \label{eq:thm:main}
\end{equation}
% }%
where $\gapTerm_m = \fglobalj(\xjz) - \min_\x \fglobalj (\x)$.
%\ps{This $z$ in the first line is a typo, right? We should expand the theorem statement to remind the reader what all the symbols mean.}
\label{theorem:main}
\end{theorem}

\textbf{Proof.} See Section \ref{app_sect:main_proof} in the Appendix.

\paragraph{Comparison with Synchronous FL Analyses.}
The \mbox{\textit{FedAvg Error - I} and \textit{- II}} terms in \eqref{eq:thm:main} capture the error bound for synchronous FedAvg \citep[Theorem~1]{fedvarp}. 
Since the server updates for model $m$ involve aggregating $\bsj$ client updates, the buffer size $\bsj$ is analogous to the number of participating clients in FedAvg. 
The third error term in \eqref{eq:thm:main} arises due to asynchronous aggregation and increases with $\awj$, the number of active local training requests. Intuitively, given the same buffer size $\bsj$, increasing $\awj$ leads to higher worst-case staleness $\rdm_m$. However, as long as $\Li \lrsj \lrcj \awj^2 \locitj \leq \bsj$, asynchrony is not the dominant source of error in \eqref{eq:thm:main}, and we achieve the same rate of convergence as synchronous FedAvg (see Corollary \ref{cor:conv_rate}).
% \ps{I guess we can remove everything after this. Do we need the point about wall-clock time?\psq{ Just looking at the bound, decreasing $R_m$ to $1$ seems the best as it appears only in nominators. However, increasing $R_m$ shortens the round times. Highlighting this can be good. I am shortening the rest. We can keep it if it makes sense, or we can remove it.}}
%The number of active work, $\aw$, is unique to asynchronous algorithm and it makes the terms induced by the asynchrony larger. 

\paragraph{Comparison with Asynchronous FL Analyses.}
FedBuff \citep{fedbuff} considers buffered asynchronous aggregation for a single model. Still, comparing \cite[Corollary~1]{fedbuff} and the bound in \eqref{eq:thm:main} for $M=1$, their convergence result (i) depends on stronger assumptions (bounded gradient norm and uniform arrivals of client updates), 
% (ii) does not achieve any benefit of larger buffer size in the dominant error term, 
and (ii) has worse asynchronous aggregation error. Moreover, our analysis is more general than that of \citet{sharper}, as they do not consider multiple local SGD steps and the buffer. Simultaneous asynchronous training is considered by \citet{asyncMM}, but we observe that they do not achieve convergence unless the data distribution across clients is identical (see \cite[Eq.~(19)]{asyncMM}). We discuss the comparison of \(\nameofthealgorithm\) to single-model and simultaneous asynchronous federated training baselines in more detail in Appendix Section~\ref{app_sect:theory_comparison}.

% \GJ{Give a title to this corollary, something like 'Asymptotic convergence after setting learning rates'}
\begin{cor}[Asymptotic convergence after setting learning rates]
% \textbf{(Asymptotic convergence):}
\label{cor:conv_rate}
    Let $T_m \geq \locitj \max\lcb%16,
    36\bsj,16\awj\rdm_m\rcb$. Setting the learning rates $\lrcj = (\locitj \Li \sqrt{T_m})^{-1}, \lrsj = \sqrt{\locitj \bsj}$
    %\ps{Shouldn't we say $\lrs = \Theta(\sqrt{\bs})$ and $\lrc = \Theta \lp \sqrt{\frac{\locit}{\Li T}} \rp$?}
    , the bound in Theorem~\ref{theorem:main} reduces to:
\begin{align}
    & \avgtelj \mbe \| \G \fglobalj (\xt) \|^2 \leq \bOP{\mfrac{\gapTerm_m \Li}{\sqrt{\bsj \locitj T_m}}} \nn \\
    &+\bOP{\lp\mfrac{1}{\sqrt{T_m \bsj\locitj}}+\mfrac{1}{\locitj T_m}\rp(\lhetsj + \locitj \ghetsj)} \nn \\
    &+\bOP{\mfrac{\awj}{T_m\bsj}(\lhetsj + \locitj\awj\ghetsj)}. \label{eq:cor:conv_rate}
\end{align}
\end{cor}

Although the given bound in Corollary~\ref{cor:conv_rate} does not seem to depend on the staleness bound $\rdm_m$ (Assumption~\ref{assump:maxstale}), its effect is implicit in the number of active requests $\awj$ and buffer size $\bsj$. The maximum staleness is positively correlated with $\awj$ and negatively correlated with $\bsj$. In our experiments (Appendix~\ref{app_sect:buffer_size}), we tune the buffer size to maintain the update staleness at a reasonable level.

Looking at the bounds in \eqref{eq:thm:main} or \eqref{eq:cor:conv_rate}, increasing $\aw_m$ makes the bound worse because to reach the same accuracy in \eqref{eq:cor:conv_rate}, we need to run a higher number of server updates $T_m$. However, increasing $\aw_m$ also shortens the duration between two successive server updates, making the algorithm faster in wall-clock time. We illustrate this effect with a wall-clock comparison to FST baselines below. 

% \ba{We should keep one of two paragraphs below.} 
% \ba{\paragraph{Run-time Comparison of FST Methods.}
% Suppose we want to train $M$ identical models simultaneously using $K$ always active clients, and the arrival times of all the client updates are distributed as $Exp(\lambda)$. Let us say tasks share resources equally and assume the number of actively assigned clients is always more than $K/(2M)$ for each model in $\nameofthealgorithm$ (see Appendix Section~\ref{app_sect:runtime} for our empirical reasoning). Then, the time units required for all models to reach the same arbitrarily small gradient norm would be as follows. $\nameofthealgorithm$: $\bOP{\sqrt{\frac{M}{\lambda K}}}$, synchronous simultaneous training: $\bOP{\sqrt{\frac{M\log K}{\lambda K}}}$, synchronous sequential training: $\bOP{\frac{M\sqrt{\log K}}{\sqrt{\lambda K}}}$, client set-partitioned training: \textit{can never reach}. Note that these are rough calculations (see Appendix Section~\ref{app_sect:runtime}) expected run-times based on the best known convergence rate in the literature.}

\paragraph{Impact of $\awj$ on Wall-clock Time.}
Suppose the arrival times of all the client updates (assuming there is no queue on the clients) are distributed as $\mathrm{Exp}(\lambda)$. The expected time to fill the buffer corresponding to task $m$ is $\bsj/(\awj\lambda)$. Therefore, in $\nameofthealgorithm$, the expected time to complete one round at the server is inversely proportional to $\awj$.
On the other hand, the expected time to finish one round of synchronous simultaneous FedAvg training is $\frac{1}{\lambda}\sum_{k=1}^{R_1+\dots+R_M}\frac{1}{k}\approx\frac{1}{\lambda}\log(\sum_{k=1}^M{R_k})$, which increases with $\awj$. Also, the summation \camera{over simultaneously trained tasks} shows an exacerbated straggler effect since all the clients wait for the slowest client across all the tasks.
% index over summation should be k as m is used to indicate the specific task in the paragraph
\paragraph{Design of \texttt{Realloc} (Algorithm~\ref{alg:calcRb}).} Next, we theoretically justify the dynamic allocation of resources across tasks described in Section~\ref{sec:alg_conv} (\Cref{alg:calcRb}, with \(\textit{option}=D\)), which adjusts the number of active requests ($\{R_m\}_{m=1}^M$). Given the limited number of available clients (which limits the total number of active training requests), to achieve the best possible allocation, we minimize the sum of the \camera{most} dominant terms in the bounds (FedAvg Error-II in \eqref{eq:thm:main}) across tasks. We also use the empirical observation that the optimal choice of buffer-size $b_m$ scales linearly with $R_m$ (Appendix~\ref{app_sect:buffer_size}). The resulting optimization problem is
% that our aim in FST problem is obtaining a convergence as fast as possible for each model with a limited number of available clients (this limits the total number of active training requests without increasing the staleness a lot), we formulate an optimization problem where we minimize the sum of dominant terms of each task's bound (Eq.~\ref{eq:thm:main}) subject to the total number of active training requests is limited at $R$:
% \[\]
\begin{equation}
    \min_{\{R_m,b_m\}_{m=1}^M}\sum_{m=1}^M\mfrac{\lrsj\lrcj\locitj}{R_m}\ghetsj \text{ s.t. } \sum_{m=1}^MR_m=R,\label{eq:realloc_theory}
\end{equation}
\camera{where $R$ is the budget for the total number of training requests across all tasks in the system depending on the number of available clients.} The \texttt{Realloc} function (Algorithm~\ref{alg:calcRb}) solves the optimization problem \eqref{eq:realloc_theory}. 
% using an empirical observation between $\bsj$ and $\awj$ to allocate resources across clients.
% \ps{Coupling of $R_m, b_m$; empirical estimate of $\ghetsj$ in terms of local updates.}
See Appendix~\ref{app_sect:CalcRb} for more details.
% }

% \sout{As the {\color{blue}concurrency} in the system increases, the server updates the global model multiple times before a client can complete an update request.}
% On the other hand, the buffer size has an inverse relation with this frequency. 
% Therefore, one must tune the learning rate as described in Theorem~\ref{theorem:main} to have the guarantee. 
%\ps{Why on buffer size? Won't $\rdm$ be the same for fixed $R$, whatever $k$ be? \psq{I think it directly depends on the buffer size as well. We measure the staleness in terms of round and buffer size affects the round frequency}} 

% \ps{We should instead explain the dependence on staleness through $\awj$, as we discussed on slack. \psa{Done.} \psq{Also, could you check the discussion below?}}

% \GJ{The discussion below is a bit confusing to me. Can you control the number of clients that accept training requests?}
% In Theorem~\ref{theorem:main} and Corollary~\ref{cor:conv_rate}, we assume that the set of clients which accept local training requests is a uniformly random subset of all clients at any time (remaining clients reject new requests). The size of this set does not directly affect the bounds. However, having 
% %more active clients 
% a larger set of clients that accept local training requests enables us to increase the number of active clients ($\awj$)
% without extending the client queues. 
% % So, we can employ more local training requests at the same time.
% Adjusting the buffer size accordingly, this also does not lead to a staleness problem.
% % , but lets us utilize the variance reduction benefits of larger buffers. 
% % \ps{We should discuss this once.}

%Finally

\section{EXPERIMENTAL RESULTS}\label{sec:exp}
% Next, we complement our theoretical analysis of simultaneous federated training with experiments. 
We outline our experimental setup in Section~\ref{sect:implementation}, discuss the existing baselines in Section~\ref{sect:comp_methods}, and compare the baselines with $\nameofthealgorithm$ under varied settings in Section~\ref{sect:exp_res}.

\subsection{Datasets and Implementation}\label{sect:implementation}

We consider image classification tasks with the MNIST \citep{mnist}, Fashion-MNIST \citep{fashionmnist} and CIFAR-10 \citep{cifar10} datasets, and next character prediction with the Shakespeare \citep{leaf} dataset using the same models as in previous works \citep{feddyn,asynchfl,lenet}.
We compare the wall-clock time required by different algorithms to reach some predetermined target test accuracy levels (see \Cref{table:exp_summary}). In Appendix~\ref{app_sec:diff_acc}, we present experiments with other target accuracy levels to show the consistency of our results. \camera{We also validate our results with ResNet-18, a larger model, trained for the CIFAR-100 classification task in Appendix~\ref{app_sec:resnet_experiments}.} In all experiments, we conduct three Monte Carlo runs with different random seeds and report the average results.

\begin{table}[h]
\centering
\caption{The datasets and models used in experiments, along with corresponding target test accuracy levels.}
\label{table:exp_summary}
\begin{tabular}{|c|c|c|}
\hline
Dataset & Model & Target Accuracy \\ \hline
MNIST & MLP & $93\%$ \\ \hline
Fashion-MNIST & LeNet-5 & $82\%$ \\ \hline
CIFAR-10 & CNN & $63\%$ \\ \hline
Shakespeare & LSTM & $42\%$ \\ \hline
\end{tabular}
\end{table}

For image classification tasks, we partition the training data across clients using the Dirichlet distribution with \mbox{$\alpha = 0.1$} to \camera{create inter-client data heterogeneity} \citep{firstDirichlet}. The Shakespeare dataset is \textit{naturally} heterogeneous as the lines of each role in the plays of Shakespeare are assigned to a different client. There are a total of $1000$ clients, $30\%$ of which are available to accept new training requests, independent of the past.

\paragraph{Modeling Client Delays.}
As suggested in \citep{shiftedexp2, slowandstale, shiftedexp1, MM_bobs}, we use \textit{shifted-exponential} (exponential plus constant) random variables to model the time taken by a client to complete a local training request and return the update to the server. We pick the run-time generation parameters of each task according to real measurements on NVIDIA GeForce GTX TITAN X GPUs. To simulate hardware heterogeneity across clients, we divide them into $25\%$ slow, $50\%$ normal-speed, and $25\%$ fast clients \citep{favano}. We relegate additional implementation details to the Appendix. 

\begin{figure}[t]
% \vspace{.3in}
\centerline{\includegraphics[width=0.45\textwidth]{figures/comp_1_3.pdf}}
% \vspace{.3in}
\vspace{-2mm}
\caption{Mean test accuracy for compared algorithms on six identical CIFAR-10 tasks trained simultaneously. $\nameofthealgorithm$ trains faster than synchronous methods. The synchronous method without straggler mitigation is by far the slowest.
}
\label{fig:comp1_3}
\end{figure}

\subsection{Baseline Algorithms}\label{sect:comp_methods}
\camera{We explain the synchronous and asynchronous baseline methods to which we compare $\nameofthealgorithm$:}
\paragraph{Synchronous Simultaneous Training.}
The following synchronous methods differ only in client selection.
\vspace{-3mm}
\begin{enumerate}[leftmargin=*]
\item $\mmsync$ \citep{BhuyanMM}: randomly partition the client set across tasks at each round;
\item $\mmbobs$  \citep{MM_bobs}: Bayesian optimization-based assignment of clients to tasks;
% in each iteration %to minimize training time while also promoting fairness in device participation, and 
\item $\mmucb$ \citep{MM_ucb}: client selection as a multi-armed bandit problem.
% with local training losses as scores.
\end{enumerate}
\vspace{-2mm}
\camera{In Figure~\ref{fig:comp1_3}, we first simultaneously train six CIFAR-10 models and compare the performance of all synchronous baselines and $\nameofthealgorithm$. As synchronous methods perform poorly due to a severe straggler issue, we augment them with a straggler mitigation method by aggregating only the first $\firstk$ client updates for each task and discarding the rest \citep{fedsysdesign} as the default option. We choose  $\firstk = 30$ by validation experiments across datasets 
% \carlee{this sentence doesn't make sense}
in Appendix~\ref{app_sec:tuning_first_k}. This extra augmentation \textit{makes the baselines more competitive}. In Figure~\ref{fig:comp1_3}, we also add the result of \mbox{$\mmsync$ \texttt{-NoStrag.Mit.}}, which is the $\mmsync$ without our augmented straggler mitigation. It shows that the synchronous baselines have a large straggler effect without our extra augmentation.}

% As the simple synchronous method ($\mmsync$ \texttt{-NoStrag.Mit.}) experiences a severe straggler issue. To make these baselines competitive with our proposed $\nameofthealgorithm$, we use a default straggler mitigation by
% where the server 
% aggregating only the first $\firstk$ client updates for each task and discarding the rest \citep{fedsysdesign}. %Also, in the experiment in Figure~\ref{fig:comp1_3}, the positive effect of the straggler mitigation we added for the competitors is evident. 

\begin{figure}[t]
    \centering
    \centerline{\includegraphics[width=0.4\textwidth]{figures/acc_hom_both.pdf}}
    % \vspace{.3in}
    \vspace{-2mm}
    \caption{The mean final test accuracy values of {\color{blue}$\nameofthealgorithm$ (blue)}, {\color{olive}$\nobuffer$ (olive green)} and {\color{violet}centralized training (violet)}  with varying active client ratio, when training $3$ identical models. The left (right) figure is for CIFAR-10 (Fashion-MNIST) dataset. With more active clients, the importance of buffer increases due to increasing staleness.  
    % \carlee{Put the conclusion in the figure: more active clients means more need for a buffer due to stragglers}
    }
    \label{fig:buff_hom_acc}
\end{figure}

\begin{figure}[t]
    \centering
    \centerline{\includegraphics[width=0.45\textwidth]{figures/acc_het_both.pdf}}
    \vspace{-2mm}
    \caption{The mean test accuracy values of $\nameofthealgorithm$ and $\nobuffer$, when simultaneously training one model for CIFAR-10 and one for Fashion-MNIST. $\nameofthealgorithm$ achieves higher and more stable accuracy levels. }
    \label{fig:buff_het_acc}
\end{figure}

\paragraph{Asynchronous Federated Simultaneous Training.}
To our knowledge, \citet{asyncMM} is the only other work that \camera{mainly} studies asynchronous simultaneous FL. However, their client selection scheme requires the knowledge of network-wide staleness and smoothness constants, which are hard to estimate. 
If the tasks have similar model complexity and task difficulty, their client selection is similar to that of $\nameofthealgorithm$ with a buffer size of $1$.
We thus include this \textit{no-buffer} version of $\nameofthealgorithm$ (we call it $\nobuffer$) as a baseline.

\subsection{Results and Insights} 

We assess the performance of $\nameofthealgorithm$ under various scenarios. In \textit{homogeneous-task} experiments, where multiple independent copies of the same model are trained simultaneously using the same dataset, we report the average accuracy over time. In \textit{heterogeneous-task} experiments involving differing tasks and models, efficiently distributing resources to accelerate the completion of all tasks is the main challenge. For homogeneous tasks, we use $\nameofthealgorithm$ with the static \textit{option} ($S$) and uniform client distribution across tasks. In heterogeneous-task experiments, we use dynamic allocation (\textit{option} $=D$) to enhance resource allocation efficiency. To show the benefits of dynamic allocation over static allocation, we also explore heterogeneous-task scenarios with \textit{option} $=S$. Dynamic allocation reduces overall training time by up to \(11.9\%\), with comprehensive results shown in Appendix~\ref{app_sect:var_comp}.


To quantify the time \textit{saved} by using $\nameofthealgorithm$ over some competing baseline, we define \textit{time gain} as
% \vspace{-1mm}
%{\small
\begin{align*}
    \text{Gain} \triangleq \mfrac{T_{\texttt{Baseline}} - T_{\nameofthealgorithm}}{T_{\texttt{Baseline}}} \times 100 \%,
\end{align*}%
% \vspace{-1mm}
where $T_{\texttt{Baseline}}$ $(T_{\nameofthealgorithm})$ is the simulated time for \texttt{Baseline} $(\nameofthealgorithm)$ to reach the target accuracy. 

\label{sect:exp_res}
\paragraph{Comparison with All Synchronous FST Methods.}
First, we compare the synchronous baselines discussed in \Cref{sect:comp_methods} on the CIFAR-10 dataset (Figure~\ref{fig:comp1_3}), where we simultaneously train six identical models. We observe that synchronous methods without straggler mitigation converge very slowly. Among the straggler-mitigated synchronous variants that we implement, Figure~\ref{fig:comp1_3} shows that $\mmbobs$ has similar performance to $\mmsync$ because it struggles due to the large search space of the optimization problem, stemming from the exponential number of possible client schedules. 
Further, we do not observe any performance gains from using $\mmucb$ over $\mmsync$. Given that $\mmbobs$ and $\mmucb$ have similar performance to $\mmsync$, in subsequent experiments, we choose $\mmsync$ as the sole synchronous baseline. 


\begin{figure*}[t]
\centering\centerline{\includegraphics[width=0.95\textwidth]{figures/all_larger_bar.pdf}}
    \caption{Mean training times of $\nameofthealgorithm$ and $\mmsync$ to attain the target accuracy levels in \Cref{table:exp_summary} on $2$/$4$/$6$ tasks with CIFAR-10, Fashion-MNIST, MNIST, and Shakespeare datasets. $\nameofthealgorithm$ requires consistently lower wall-clock time for training compared to $\mmsync$; the percentages represent these time gains.
    }
    \label{fig:hom_all}
\end{figure*}

\paragraph{Need for Buffer.} 
As discussed earlier, incorporating the buffer mitigates the negative impact of highly stale updates. Since staleness increases with the number of active clients, asynchronous FL methods without a buffer exhibit limited scalability as the number of clients grows. To demonstrate this, in Figure~\ref{fig:buff_hom_acc}, we conduct two experiments: 1) training three models for CIFAR-10 simultaneously, and 2) training three models for Fashion-MNIST simultaneously. We plot the final accuracy values varying the ratio of the active clients. We observe that for small active client ratios, $\nameofthealgorithm$ and $\nobuffer$ have comparable performance. However, with more active clients, the staleness of the updates increases, resulting in significantly worse performance of the fully asynchronous $\nobuffer$ algorithm. Then, in Figure~\ref{fig:buff_het_acc}, we simultaneously train two models, one each for CIFAR-10 and Fashion-MNIST. \camera{Observing the unsteady learning curves of $\nobuffer$, we conclude that the buffer makes the system more robust to stale updates.}


\paragraph{Comparison with $\mmsync$.}
Next, we compare $\nameofthealgorithm$ with the chosen synchronous method, $\mmsync$.

\begin{figure}[tbh]
    \centering  \centerline{\includegraphics[width=0.45\textwidth]{figures/het_fig.pdf}}
    \caption{Training curves of a single Monte Carlo run of the \textit{heterogeneous experiment}. Dashed vertical lines show times when tasks reach their target accuracy, with $\nameofthealgorithm$ reaching it faster than $\mmsync$.
    }
    \label{fig:het_exp_curves}
\end{figure}

\begin{figure}[tbh]
    \centering    \centerline{\includegraphics[width=0.4\textwidth]{figures/het_bar.pdf}}
    \caption{Mean time required to reach target accuracy and time gain of $\nameofthealgorithm$ over $\mmsync$ in \textit{the heterogeneous experiment}. While $\nameofthealgorithm$ does not require manual fine-tuning, the client allocation in $\mmsync$ is tuned at $100$, $84$, $48$, and $68$ clients for MNIST, Fashion-MNIST, CIFAR-10, and Shakespeare tasks respectively. $\nameofthealgorithm$ has notable time gain ($40.1\%$) over $\mmsync$ to finish all tasks.
    }
    \label{fig:het_bar}
\end{figure}

1) \textit{Homogeneous Tasks:} We conduct experiments training $2$, $4$, and $6$ identical models for each of MNIST, Fashion-MNIST, CIFAR-10, and Shakespeare datasets. Figure~\ref{fig:hom_all} shows the average finish times of the algorithms, and the significant time gains of our algorithm $\nameofthealgorithm$ over synchronous $\mmsync$ (even after incorporating straggler mitigation). We observe that the gain increases with the number of simultaneously trained tasks because $\mmsync$ is especially vulnerable to the straggler problem.


2) \textit{Heterogeneous Tasks:} 
The heterogeneous experiment trains $4$ models simultaneously, one each for the MNIST, Fashion-MNIST, CIFAR-10, and Shakespeare datasets. Once one model reaches its target accuracy, its training stops, and its clients are reallocated to other tasks. We use $\nameofthealgorithm$ with \textit{option} $=D$ for dynamic client allocation. For the synchronous baseline $\mmsync$, we ran $30$ different client allocation schemes, including our proposed allocation scheme and uniform allocation across tasks. We report the results achieved with the best-performing scheme.


Figure~\ref{fig:het_exp_curves} shows learning curves for $\nameofthealgorithm$ and $\mmsync$ from a single Monte Carlo run. The dashed vertical lines denote the time instants when a model reaches its target accuracy, following which the clients training this model get reallocated to other tasks. Figure~\ref{fig:het_bar} shows the average finish times for $4$ simultaneously trained models with $\nameofthealgorithm$ and $\mmsync$. For example, with $\nameofthealgorithm$, the model for MNIST dataset hits its target accuracy at $99$, after which the clients training this model get reallocated to the other models. At $619$, the training of the final model (for CIFAR-10) is complete.  Comparing the finish times for the last model, $\nameofthealgorithm$ provides a $40.1\%$ time gain over $\mmsync$.


We observe that, thanks to dynamic client allocation, $\nameofthealgorithm$ automatically detects which tasks have higher heterogeneity across clients and need a larger buffer. We notice that the Shakespeare task is allocated fewer clients because it is estimated to be less heterogeneous, which is true based on the label distribution of data samples across clients. We also repeat this experiment (Appendix~\ref{app_sect:var_comp}) using $\nameofthealgorithm$ with static \textit{option} ($S$) to validate the proposed dynamic client allocation strategy. Dynamic client allocation consistently yields time gains of up to $11.9\%$ compared to the static version.

\section{CONCLUSION}\label{sec:conclusion}
In this paper, we present $\nameofthealgorithm$, a federated learning framework to simultaneously train multiple models using buffered asynchronous aggregations. We theoretically prove the convergence of our algorithm for smooth non-convex objective functions. Experiments across multiple datasets demonstrate $\nameofthealgorithm$'s superiority over existing simultaneous FL baselines, achieving up to $46.0\%$ reduction in training time. In future work, we plan to enhance $\nameofthealgorithm$ by incorporating client selection based on local data distributions and computational powers of clients.


\section*{Acknowledgements}

This work was partially supported by the US National Science Foundation under grants CNS-1751075 and CNS-2106891 to CJW and NSF CCF 2045694, CNS-2112471, CPS-2111751, and ONR N00014-23-1-2149 to GJ and the Ben Cook Presidential Graduate Fellowship to BA.
% \subsubsection*{References}

% References follow the acknowledgements.  Use an unnumbered third level
% heading for the references section.  Please use the same font
% size for references as for the body of the paper---remember that
% references do not count against your page length total. 



% References
\bibliography{References}

\newpage

\onecolumn

\title{FedAST: Federated Asynchronous Simultaneous Training\\(Appendix)}
\maketitle
\appendix

\section{Adjusting the number of active requests and \texttt{Realloc}}\label{app_sect:CalcRb}
Before designing the \texttt{Realloc} algorithm for dynamic client allocation \textit{option} ($D$), we conduct initial validation experiments with the static \textit{option} ($S$), wherein the allocation of active local training requests across clients and buffer sizes remain unchanged throughout the training. Note that with the static \textit{option}\nolinebreak, Algorithm~\ref{alg:calcRb} only executes its Line~\ref{alg3_line:set_same} and always returns the previous round's values. We \textit{empirically} observe that keeping the ratio of the number of active local training requests to buffer size fixed and below $37$ works well. Refer to Section~\ref{app_sect:buffer_size} for the validation experiments. Then, incorporating this ratio (\(R_m\approx 37b_m\)) within the convergence bound in Equation~\ref{eq:thm:main} of Theorem~\ref{thm:main}, we find that the dominant term (excluding smoothness constants) becomes: \(\bOP{\frac{\lrsj\lrcj\locitj}{\awj}}\ghetsj\). Further, given the limited number of available clients, we cannot increase the total number of active local training requests arbitrarily without increasing the staleness. Thus, we employ \(\sum_{m=1}^MR_m=R\) where \(R\) is a constant of how many active training requests we assign in total depending on the number of available clients in the setting. Since the goal of federated simultaneous training is to minimize the objective functions of all tasks concurrently, we propose to adjust $\{R_m\}_{m=1}^M$ by solving,
    \begin{align}
    \min_{\{R_m\}_{m=1}^M}\sum_{m=1}^M\frac{\lrsj\lrcj\locitj}{\awj}\ghetsj \text{ subject to } \sum_{m=1}^MR_m=R.\label{eq:constrained_optim_problem}
\end{align}
 The solution of the minimization problem in (\ref{eq:constrained_optim_problem}) suggests allocating local training requests, $\{R_m\}_{m=1}^M$, in proportion to \(\sigma_{g,m}\sqrt{\lrsj\lrcj\locitj}\) for each model \(m\).  
Using this approach, Algorithm~\ref{alg:calcRb} adjusts resource allocation. Further, as the update variance across clients may vary in time during training, we employ adaptive periodical reallocation of resources across models (Line~\ref{alg3_line:firstif} in Algorithm~\ref{alg:calcRb}). Therefore, we use round indices to denote the changing number of active training requests and buffer sizes. 

\begin{algorithm}[t]
\caption{\texttt{EstimateVariances()}}\label{alg:estimatevariances}

\begin{algorithmic}[1]
\Require{\multiline{The set of latest $\nbupdvar$ updates $\{\del_{m,1}, \del_{m,2},\dots,\del_{m,\nbupdvar}\}_{m=1}^M$, server-side learning rates \(\{\lrsj\}_{m=1}^M\), client-side\\learning rates  \(\{\lrcj\}_{m=1}^M\), and the number of local SGD steps of all models \(\{\locitj\}_{m=1}^M\).}}
\State \(\{\overline{\del_{m}}\}_{m=1}^M \gets \{\frac{1}{\nbupdvar}\times\sum_{i=1}^\nbupdvar\del_{m,i}\}_{m=1}^M\) \Comment{Calculate the means of the latest updates}
\State \(\{{\widetilde{\sigma}_{g,m}^2}\}_{m=1}^M \gets \{\frac{1}{\nbupdvar}\times\sum_{i=1}^\nbupdvar \normbs{\del_{m,i}-\overline{\del_{m}}}/\normbs{\overline{\del_{m}}}\}_{m=1}^M\) \Comment{Calculate the normalized sample variances}
\State \(\{{\hat{\sigma}_{g,m}^2}\}_{m=1}^M \gets \{{\lrcj\lrsj\locitj\widetilde{\sigma}_{g,m}^2}\}_{m=1}^M\) \Comment{Multiply with other constants suggested by the convergence guarantee (\ref{eq:constrained_optim_problem})}
\State \textbf{Return} $\{{\hat{\sigma}_{g,m}^2}\}$ 
\end{algorithmic}
\end{algorithm} 

As we do not have access to true data heterogeneity levels, we need to estimate it (Line~\ref{alg3_line:estimate} in Algorithm~\ref{alg:calcRb}). When $\nameofthealgorithm$ is run with \textit{option} $=D$ (dynamic client allocation option), the server keeps the latest $\nbupdvar$ updates of each model. This requires constant and small memory space kept in the server. To present how variance estimation (\texttt{EstimateVariances()}) works, assume that \(\{\del_{m,1}, \del_{m,2},\dots,\del_{m,\nbupdvar}\}_{m=1}^M\) are the sets of latest received updates of tasks $m\in[M]$ where each $\del_{m,k}$ for $k\in[V]$ is the output of $k$\textsuperscript{th} latest local training (Algorithm~\ref{alg:cap}). As the output of any local training is the average of all calculated stochastic gradients during that local training, we use those outputs as approximations of the gradients calculated on the local data of clients. Algorithm~\ref{alg:estimatevariances} describes \texttt{EstimateVariances()}. It first calculates the mean of the latest updates for each task, 
\(\{\overline{\del_{m}}\}_{m=1}^M = \{\frac{1}{\nbupdvar}\times\sum_{i=1}^\nbupdvar\del_{m,i}\}_{m=1}^M\).
Then, \texttt{EstimateVariances()} returns sample variance multiplied with other terms (\(\lrsj\lrcj\locitj\)) suggested by (\ref{eq:constrained_optim_problem}) and normalized by the mean update norm (to prevent large models or models with inherently large weights from dominating others),
\mbox{\(\{{\hat{\sigma}_{g,m}^2}\}_{m=1}^M = \{\frac{\lrsj\lrcj\locitj}{\nbupdvar}\times\sum_{i=1}^\nbupdvar \normbs{\del_{m,i}-\overline{\del_{m}}}/\normbs{\overline{\del_{m}}}\}_{m=1}^M\)}. Then, \texttt{Realloc} algorithm allocates the number of active training requests proportionally to the square root of these values (Algorithm~\ref{alg:calcRb}, Line~\ref{alg3_line:proportional_dist}).

This approach is sensible both theoretically and intuitively. Based on our experimental observations regarding the relationship between the number of active training requests and buffer size, increasing the number of active local training requests necessitates an increase in buffer size. Moreover, a larger buffer proves beneficial in reducing the variance across updates, as the buffered updates are averaged during the aggregation. \texttt{Realloc} aims to allocate more clients and provide a larger buffer size for tasks with higher heterogeneity. We choose the number of stored latest updates $V=8$ and the period of number of total updates from all clients to trigger reallocation in \texttt{Realloc} subroutine $c_{period}=0.75\times M\times\sum_{m=1}^MR_m$ in our experiments. The benefits of dynamic allocation \mbox{(\textit{option} $=D$)} over static and uniform resource allocation \mbox{(\textit{option} $=S$)} are demonstrated when tasks/models are heterogeneous, as shown in \mbox{Figures \ref{fig:var_unif_comp_high}-\ref{fig:var_unif_comp_low_bar}}.

\section{Theoretical Comparison of  $\nameofthealgorithm$ with Baselines}
\label{app_sect:theory_comparison}
We also compare $\nameofthealgorithm$ with single-model FL methods. The algorithm of \citet{fedbuff} is the most similar to $\nameofthealgorithm$ (in the single-model case). However, even for the single-task case, $\nameofthealgorithm$ differs by employing a uniform client assignment to ensure unbiased participation of clients irrespective of their hardware speeds. This allows us to relax the assumptions to prove the convergence guarantee. \citet{fedbuff} rely on a strong assumption that the server receives updates from clients uniformly at random and that the norm of gradients is bounded. Moreover, compared to \citet{sharper}, our analysis is more general as $\nameofthealgorithm$ uses multiple SGD steps in local training and a buffer. Some other recent single-model asynchronous FL works, \citet{quafl} and \citet{favano}, do not have straightforward and efficient simultaneous federated training extensions for multiple models. 

The method of \citet{asyncMM} is another asynchronous simultaneous federated learning method. However, it fails to converge to a stationary point asymptotically unless data is homogeneous, and its assumptions include Bounded Gradient Norm and Weak Convexity.

\begin{table}[h!] 
\centering 
\caption{Comparison of $\nameofthealgorithm$'s convergence guarantees to  \cite{fedbuff} (single-task asynchronous buffered FL algorithm) and \cite{asyncMM} (an asynchronous FST algorithm). $T$: \#global rounds, $\tau$: \#local steps, $b$: buffer size.}
\label{tab:theo_comp}
\begin{tabular}{@{}ccc@{}}
\toprule
\textbf{Algorithm} &
  \textbf{\begin{tabular}[c]{@{}c@{}}Non-standard assumptions\end{tabular}} &
  \textbf{\begin{tabular}[c]{@{}c@{}}Convergence\end{tabular}} \\ \midrule
\cite{fedbuff} &
  Bounded Gradient \& Receiving Updates Uniformly &
  $\bOP{{\sqrt{\locit/(Tb)}}}$\textsuperscript{\textcolor{red}{(a)}}\\
\cite{asyncMM} &
  \begin{tabular}[c]{@{}c@{}}Bounded Gradient \& Weak Convexity\end{tabular} &
  {Does not converge}\\
$\nameofthealgorithm$ &
  ---&
  $\bOP{{\sqrt{\locit/(Tb)}}}$ \\ \bottomrule
\end{tabular}
\\[10pt] % Adjust space between the table and the note
\footnotesize{\textcolor{red}{(a)} Although the convergence guarantee in the published paper \citep{fedbuff} seems to have a better rate, we pointed out a mistake in their proof. Here, we use the corrected version we received via private communication.}
\end{table}


% \section{Run-time Comparison of Federated Simultaneous Training Algorithms} \label{app_sect:runtime}
% We provide intuition to understand how  theoretical run-time of mentioned algorithms under a specific setting with some simplifying assumptions including that each client's local training time for each model is from exponential distribution, \textit{Exp($\lambda$)}. Let us say that we have total $K$ active clients anytime and $M$ identical tasks with identical models that we want to train.
% \begin{enumerate}
%     \item \textit{Na\"ive Sequential Synchronous Training:} We train models one by one, using all $K$ active clients. We know the most dominant term in the convergence analysis will be \(\bOP{1/\sqrt{KT}}\) where \(T\) is the number of total rounds for each task  \citep[Corollary 1]{fedvarp}. Further, the expected round time for any model will be the expected value of the maximum of $K$ exponential random variables, which is \(\frac{1}{\lambda}\sum_{n=1}^K\frac{1}{n}\leq\frac{1}{\lambda}\log K\). Therefore, total \textit{run-time} required to reach arbitrarily small $\epsilon$-norm gradients for all of \(K\) models is bounded by \(\bOP{\frac{M\sqrt{\log K}}{\epsilon^2\sqrt{\lambda K}}}\).
%     \item \textit{Client set-partitioned training:} We first divide the client set into \(M\) non-overlapping subsets. Then, each model is trained with one of the subsets. This method is not guaranteed to converge a model with an arbitrarily small gradient-norm due to heterogeneity.
%     \item \textit{Synchronous Simultaneous Training:} We train models by distributing \(K/M\) clients to each task at each round. However, as the algorithm requires synchrony, again, the expected round-time is the maximum of $K$ exponential random variables, which is bounded by \(\frac{1}{\lambda}\log K\). Using the most dominant term from the bound in \cite[Corollary 1]{fedvarp}, $\bOP{\sqrt{M/(KT)}}$, the total \textit{run-time} required to reach arbitrarily small $\epsilon$-norm gradients for all of \(K\) models is bounded by \(\bOP{\frac{\sqrt{M\log K}}{\epsilon^2\sqrt{\lambda K}}}\).
%     \item \textit{$\nameofthealgorithm$ (Asynchronous Simultaneous Training):} Having \(K/M\) active local training request for each model for \(\nameofthealgorithm\), assume that there exists at least \(K/(2)\) total active clients working anytime independent of past and future. It is a hard assumption to prove rigorously; however, we observe that held in our experiments with $300$ active clients and $2-6$ simultaneous tasks all the time. Assuming this, the expected time for the server to receive an update is smaller than $2/(K\lambda)$. As the models are identical, the expected time to they 
% \end{enumerate}

\section{EXPERIMENTAL SETUP DETAILS}\label{suppl_sect:experiment_details}
In our study, we explore a simultaneous federated learning (FL) setting for multiple models. We present the details of our experiments in this section.

\subsection{Simulation Environment} We simulate the training with PyTorch on NVIDIA GeForce GTX TITAN X graphics processing units (GPUs) of our internal cluster. We build our code upon the public codes of \citep{flsim, asynchfl}.

\subsection{Setting Overview} We consider the federated training of $M$ models simultaneously using $\N$ clients. $\N$ is $1000$ in all experiments and $M$, specified for each experiment explicitly, varies between $2$ and $6$.

\subsection{Tasks and Models} We use $5$ different tasks across the experiments: 
MNIST \citep{mnist}, Fashion-MNIST \citep{fashionmnist}, CIFAR-10 and CIFAR-100 \citep{cifar10} image classification tasks, and Shakespeare \citep{leaf} next character prediction task. We use a multilayer perceptron for MNIST as in \citep{feddyn}, convolutional networks for Fashion-MNIST as in \citep{lenet} and for CIFAR-10 as in \citep{feddyn}, ResNet-18 model for CIFAR-100 as in \cite{feddyn}, and a long short-term memory network for Shakespeare as in \citep{asynchfl}.

\subsection{Datasets and Data Distribution} We consider the data heterogeneity across clients in FL frameworks. We download MNIST, Fashion-MNIST, and \mbox{CIFAR-10/100} datasets from PyTorch built-in library methods. The train and test splits provided by the library are used without any modifications. To simulate heterogeneous data distribution across clients, we use Dirichlet distribution with $\alpha=0.1$ following the approach suggested in \cite{firstDirichlet}. We ensure that each client has $300$ data points for MNIST, Fashion-MNIST, and CIFAR-10/100 tasks by repeating the train set if necessary. We obtain and preprocess the Shakespeare dataset as described in \citep{leaf}. This dataset has inherently heterogeneous distribution across clients as each client corresponds to a unique role from Shakespeare's plays.

\subsection{Design Parameters}
In this section, we explain how we choose the design parameters.

\paragraph{Client dataset sizes, batch sizes, and number of local steps.} While distributing CIFAR-10/100, MNIST, and Fashion-MNIST datasets across clients, each client is allocated $300$ data points from each dataset. The Shakespeare dataset, however, maintains its original distribution of data points across roles, so clients have different numbers of data samples in the Shakespeare task.
For CIFAR-10/100, MNIST, and Fashion-MNIST tasks, we set the batch size to $32$ while we employ a batch size of $64$ for the Shakespeare task. We fix the number of local steps in local training ($\locitj$ parameter in Algorithm~\ref{alg:cap} in the main text) of clients at $27$ for all tasks. This makes $3$ epochs for CIFAR-10/100, MNIST, and Fashion-MNIST tasks. As the number of data points varies across clients for the Shakespeare dataset, there is no fixed number of epochs.

\paragraph{Buffer size.} \label{app_sect:buffer_size}
The buffer in $\nameofthealgorithm$ is crucial for mitigating the negative impacts of highly stale updates, as extensively discussed in the main text. The staleness of updates is influenced by the number of active local training requests, denoted as $\awj$, and the buffer size, $\bsj$, associated with each model $m\in[M]$. When $\nameofthealgorithm$ is run with static \textit{option} ($S$), these numbers are kept constant during the training, but they may change (in which case we denote them by $R_m^{(t_m)}$ and $b_m^{(t_m)}$) when we use dynamic client allocation \textit{option} ($D$). A higher number of simultaneous local training requests leads to a higher staleness because it increases the global model's update frequency at the server. On the other hand, buffer size is inversely related to staleness, given its opposing effect on the aggregation frequency. Based on our experimental observations, selecting the number of active training requests and the buffer size of model $m$ such that their ratio is fixed and below $37$ ($\awj/\bsj \lesssim 37$ or $R_m^{(t_m)}/b_m^{(t_m)}\lesssim37$) works well. Selecting the buffer size of $\nameofthealgorithm$ based on this observation avoids the detrimental effects of stale updates while benefiting from fast training thanks to the asynchronous algorithm. We show two experimental results in Figures~\ref{fig:buff_size_1} and \ref{fig:buff_size_2}. In Figure~\ref{fig:buff_size_1}, we train one Fashion-MNIST and one CIFAR-10 model simultaneously by assigning $175$ active training requests to each task and observe that a buffer size of $5$ strikes a balance between high final test accuracy and fast training to achieve the target accuracy for both tasks. In Figure~\ref{fig:buff_size_2}, we repeat a similar experiment with MNIST and CIFAR-10 tasks by assigning $105$ active training requests to each. This time, we observe that a buffer size of $3$ performs the best for both tasks. These experimental results support our buffer size choice.

\begin{figure}[H]
    \centering
    % \vspace{.3in}
\centerline{\includegraphics[width=0.85\textwidth]{figures/buff_size_1.pdf}}
    % \vspace{.3in}
    \caption{ The final test accuracy and required time to get target accuracy (in Table~\ref{table:exp_summary}) for simultaneous training (using $\nameofthealgorithm$ with static \textit{option}) of one Fashion-MNIST and one CIFAR-10 model with different buffer sizes. We assign the same number of local training requests (\(175\)) to each task.}\label{fig:buff_size_1}
\end{figure}

\begin{figure}[H]
    \centering
    % \vspace{.3in}
\centerline{\includegraphics[width=0.85\textwidth]{figures/buff_size_2.pdf}}
    % \vspace{.3in}
    \caption{ The final test accuracy and required time to get target accuracy (in Table~\ref{table:exp_summary}) for simultaneous training (using $\nameofthealgorithm$ with static \textit{option}) of one MNIST and one CIFAR-10 model with different buffer sizes. We assign the same number of local training requests (\(105\)) to each task.}\label{fig:buff_size_2}
\end{figure}

\paragraph{Learning rate and weight decay.}
We search for the best learning rate and weight decay hyperparameters considering the training speed and final accuracy levels. We seek client-side learning rate within the range of $[1\times10^{-3},1\times10]$, server-side learning rate within $[3\times10^{-2},3]$, and weight decays within $[1\times10^{-7}, 1\times10^{-2}]$. We observe that client-side learning rates of $6\times10^{-2}$ and $7$ with weight decays of $3\times10^{-4}$ and $7\times10^{-5}$ work best respectively for Fashion-MNIST and Shakespeare tasks for all methods. For CIFAR-10 task, a client-side learning rate of $1\times10^{-1}$ with weight decays of $7\times10^{-4}$ and $3\times10^{-4}$ perform best for asynchronous and synchronous methods, respectively. For MNIST, we use client-side learning rates of $1\times10^{-1}$ and $2\times10^{-1}$ for asynchronous and synchronous methods, respectively, with a weight decay of $3\times10^{-4}$. For server-side learning rates, we observe that $1$ for synchronous methods ($\mmsync$, $\mmbobs$, and $\mmucb$), $0.1$ for $\nameofthealgorithm$, and $0.038$ for $\nobuffer$ perform well for all tasks.

\subsection{Modeling Training Times, Model Sizes, and Client Speed Heterogeneity}

In our experiments, following \citep{shiftedexp2, shiftedexp1, MM_bobs, slowandstale}, we employ the \textit{shifted-exponential} random variables to model the duration between when the server sends a local training request to a client, and when it receives the update of the local training. The exponential component of the distribution reflects the stochastic nature of the device speeds, while the shift component accounts for unavoidable delays such as disk I/O operations.

Whenever a client $i$ performs local training for task $m$, we draw a random number from the distribution with a cumulative distribution function (CDF) of,
\begin{align}
P(X\leq x)=\begin{cases} 
      1-\exp\{-\frac{x-\beta_{i,m}}{2\beta_{i,m}}\}, & x\geq\beta_{i,m} \\
      0, & \text{otherwise}
   \end{cases}, \nn
\end{align}
where $\beta_{i,m}$ depends on the speed of client $i$ and the size of the model associated with task $m$. Then, we multiply this random number by the number of local steps to calculate the simulation time between when the server requests the local training and when it receives the update back.

We quantify the effect of the model sizes based on the average time required to calculate one stochastic gradient for each model on the GPUs of our internal cluster. By our measurements, we set,
\begin{equation}
\frac{\beta_{i,\text{MNIST}}}{0.148}=\frac{\beta_{i,\text{Fashion-MNIST}}}{0.240}=\frac{\beta_{i,\text{CIFAR-10}}}{0.228}=\frac{\beta_{i,\text{Shakespeare}}}{0.555}=\frac{\beta_{i,\text{CIFAR-100}}}{2.071},\;\;\forall i \in [N].\nn
\end{equation}

In our experiments, we also take the heterogeneity in the speed of client devices into consideration. We categorize clients into three speed groups: slow ($25\%$), normal-speed ($50\%$), and fast ($25\%$). The speed rates for these categories are inversely proportional to $1.3$, $1$, and $0.7$, such that,
\begin{equation}
    \frac{\beta_{\text{slow client},m}}{1.3}=\frac{\beta_{\text{normal-speed client},m}}{1}=\frac{\beta_{\text{fast client},m}}{0.7},\;\;\forall m \in [M].\nn
\end{equation}




\section{ADDITIONAL EXPERIMENTS}
In this section, we present supplementary experiments.
\subsection{Tuning Parameter $\firstk$ of the Straggler Mitigation Technique Used for Synchronous Methods (Accepting only the First-$\firstk$ Updates)} \label{app_sec:tuning_first_k}
In our experiments, to mitigate the high straggler effect, the server in synchronous methods ($\mmsync$, $\mmbobs$, and $\mmucb$) only aggregates the first $\firstk$ client updates for each task and discards the rest, following \citep{fedsysdesign}. To tune parameter $\firstk$, we run validation experiments with $\mmsync$ on single CIFAR-10, MNIST, and Fashion-MNIST tasks and evaluate the training performance with respect to simulated time and number of global rounds. A larger $\firstk$ results in a longer simulated time per round since we wait for more clients. On the other hand, the variance in aggregated updates on each round becomes smaller since we average more updates. Therefore, the target accuracy is attained faster in terms of the number of global rounds. We also observe that keeping $\firstk$ too small yields lower final accuracy. Navigating these trade-offs, we find that $\firstk = 30$ strikes an effective balance.
\begin{figure}[H]
% \vspace{.3in}
\centerline{\includegraphics[width=0.99\textwidth]{figures/cifar.pdf}}
% \vspace{.3in}
\caption{Performance of $\mmsync$ with varying $\firstk$ in CIFAR-10 task. The chosen point is shown with a red star.}
\end{figure}

\begin{figure}[H]
% \vspace{.3in}
\centerline{\includegraphics[width=0.99\textwidth]{figures/fmnist.pdf}}
% \vspace{.3in}
\caption{Performance of $\mmsync$ with varying $\firstk$ in Fashion-MNIST task. The chosen point is shown with a red star.}
\end{figure}

\begin{figure}[H]
% \vspace{.3in}
\centerline{\includegraphics[width=0.99\textwidth]{figures/mnist.pdf}}
% \vspace{.3in}
\caption{Performance of $\mmsync$ with varying $\firstk$ in MNIST task. The chosen point is shown with a red star.}
\end{figure}


% \subsection{Synchronous Simultaneous Performance without Straggler Mitigation}
% We repeat the same experiment in Figure\ref{fig:comp1_3} in the main text, where we train \(3\) identical CIFAR-10 models simultaneously. We observe that the synchronous method requires more than \(5\times\) time to reach a similar accuracy level compared to the one with straggler mitigation we used for competitors in the presented results.
% \begin{figure}[H]
%     \centering
%     % \vspace{.3in}
% \centerline{\includegraphics[width=0.35\textwidth]{figures/sync_no_strag_mitig.pdf}}
%     % \vspace{.3in}
%     \caption{ The average test accuracy of \(3\) simultaneous CIFAR-10 task trained by $\mmsync$ without straggler mitigation technique we use for our competitors. \(68\%\) accuracy level is shown with a red star. Compared to Figure~\ref{fig:comp1_3}, $\mmsync$ takes more than \(5\times\) time to get this accuracy due to the exacerbated straggler effect.}\label{fig:sync_no_strag_mit}
% \end{figure}


\subsection{Test Loss Plots of Figures \ref{fig:buff_hom_acc} and \ref{fig:buff_het_acc} in the Main Text}
We illustrate test loss plots of the experiments in Figures \ref{fig:buff_hom_acc} and \ref{fig:buff_het_acc} in the main text. 

\begin{figure}[H]
    \centering
        % \vspace{.3in}
        \centerline{\includegraphics[width=0.5\textwidth]{figures/hom_both.pdf}}
        % \vspace{.3in}

    \caption{The mean final loss values of {\color{blue}$\nameofthealgorithm$ (blue)}, {\color{olive}$\nobuffer$ (olive green)} and {\color{violet}centralized training (violet)}  with varying active client ratio, when training $3$ identical models. The left figure is for CIFAR-10 dataset, while the right figure is for Fashion-MNIST dataset. With a higher number of active clients, thanks to the buffer, $\nameofthealgorithm$ maintains its performance while $\nobuffer$ gets worse.
    % Increasing the ratio of active clients significantly worsens the final loss of $\nobuffer$.
    }
    \label{fig:buff_hom}
\end{figure}

\begin{figure}[H]
    \centering
    % \vspace{.3in}
    \centerline{\includegraphics[width=0.58\textwidth]{figures/het_both.pdf}}
    % \vspace{.3in}
    \caption{The mean test loss values of $\nameofthealgorithm$ and $\nobuffer$, when simultaneously training one model for CIFAR-10 and one for Fashion-MNIST. $\nameofthealgorithm$ achieves lower and more stable loss levels. 
    %\ps{Modify the legend of centralized training to clarify that this is just the eventual loss achieved, it's not the training curve. Or remove the legend altogether and mention this in the caption.}
    }
    \label{fig:buff_het}
\end{figure}

\subsection{Training Curves of Homogeneous Experiments}
 In Figure~\ref{fig:hom_all_curves}, we provide the average training curves of the homogeneous-task experiment in Figure~\ref{fig:hom_all}.
\begin{figure}[H]
    \centering
    % \vspace{.3in}
    \centerline{\includegraphics[width=\textwidth]{figures/all_larger_3.pdf}}
    % \vspace{.3in}
    \caption{Training curves of $\nameofthealgorithm$ and $\mmsync$ on $2$/$4$/$6$ tasks with CIFAR-10, Fashion-MNIST, MNIST, and Shakespeare datasets. Time gains of $\nameofthealgorithm$ over $\mmsync$ to attain target accuracy are shown on the colored horizontal lines. Horizontal black lines indicate target accuracy levels, same as the ones stated in \Cref{table:exp_summary}. 
    }
    \label{fig:hom_all_curves}
\end{figure}

\camera{\subsection{An Additional experiment with a larger model (ResNet-18) on CIFAR-100} \label{app_sec:resnet_experiments}
We run a homogeneous-task experiment with a larger model, ResNet-18, as implemented by \cite{feddyn} on the CIFAR-100 dataset, a $100$-class image classification dataset \citep{cifar10}. We use the same experimental settings as those in other experiments except for a few differences elaborated here. We use the Dirichlet distribution with \mbox{$\alpha = 1$} to simulate heterogeneity following the approach suggested in \cite{firstDirichlet}. We use a client-side learning rate ($\lrs$) of $0.06$ and the number of local SGD steps ($\locit$) of $5$ for both $\nameofthealgorithm$ and $\mmsync$. We present the experimental results in Figure~\ref{fig:cifar100_exps}. Our experiments show that $\nameofthealgorithm$ outperforms the synchronous baseline with a ResNet-18 model on the CIFAR-100 dataset by providing time gains of $22.0\%$, $40.7\%$, and $56.3\%$ for $2$, $4$, and $6$ simultaneously trained models respectively.
}

\begin{figure}[ht]
    \centering
    \begin{subfigure}{.53\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/cifar100_new.pdf}
        \captionsetup{skip=10pt} % This sets the space below the image before the caption starts.
        \caption{Training curves of $\nameofthealgorithm$ and $\mmsync$ on $2$/$4$/$6$ simultaneous CIFAR-100 tasks. Time gains of $\nameofthealgorithm$ over $\mmsync$ to attain target accuracy are shown on the colored horizontal lines. Horizontal black lines indicate the target accuracy level, $50\%$.}
        \label{fig:resnet_curves}
    \end{subfigure}%
    \hfill % This ensures that there is a space between the two subfigures.
    \begin{subfigure}{.45\textwidth}
        \centering
        \includegraphics[width=0.70\linewidth]{figures/cifar100_bar.pdf}
        \captionsetup{skip=10pt} % Adjusts the space below the image for the second subfigure.
        \caption{Mean training times of $\nameofthealgorithm$ and $\mmsync$ to attain target accuracy level, $50\%$, on $2$/$4$/$6$ simultaneous tasks with CIFAR-100 dataset. $\nameofthealgorithm$ requires consistently lower wall-clock time for training compared to $\mmsync$; the percentages represent these time gains.}
        \label{fig:resnet_bars}
    \end{subfigure}
    \caption{Experimental results on simultaneous repeated tasks with  $\nameofthealgorithm$ and $\mmsync$ on CIFAR-100.}
    \label{fig:cifar100_exps}
\end{figure}


\subsection{Experiments with different target accuracy levels} \label{app_sec:diff_acc}
To see how $\nameofthealgorithm$ and the competitor $\mmsync$ work with different target accuracy levels, we conduct the experiment in Figure~\ref{fig:het_bar} with \(+3\%\) higher and \(-10\%\) lower target accuracy levels as presented in Table~\ref{table:diff_target_acc_updated}. We observe that the proposed $\nameofthealgorithm$ reduces the overall training time by \(55.9\%\) and \(16.3\%\), respectively for higher and lower target accuracy levels. We conclude that the advantage of $\nameofthealgorithm$ over $\mmsync$ increases with the difficulty of the task (i.e., reaching higher accuracy).

\begin{table}[H]
\centering
\caption{Different target accuracy levels used in experiments to validate the proposed methods, with lower and higher accuracy targets.}
\label{table:diff_target_acc_updated}
\begin{tabular}{|c|c|c|c|}
\hline
\textbf{Dataset} & \textbf{Lower Target Accuracy} & \textbf{Target Accuracy in the Main Text} & \textbf{Higher Target Accuracy} \\ \hline
MNIST & $83\%$ & $93\%$ & $96\%$ \\ \hline
Fashion-MNIST & $72\%$ & $82\%$ & $85\%$ \\ \hline
CIFAR-10 & $53\%$ & $63\%$ & $66\%$ \\ \hline
Shakespeare & $32\%$ & $42\%$ & $45\%$ \\ \hline
\end{tabular}
\end{table}

\begin{figure}[H]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/het_higher_fig.pdf} 
        \caption{Training curves of a single Monte Carlo run of the \textit{heterogeneous experiment with higher target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy. The setting is the same as the experiment in Figure~\ref{fig:het_bar}.}
        \label{fig:het_higher_curves}
    \end{minipage}
    \hfill
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering    \centerline{\includegraphics[width=\textwidth]{figures/het_higher_bar.pdf}}
    \caption{Mean time required to reach target accuracy and time gain of $\nameofthealgorithm$ over $\mmsync$ in \textit{the heterogeneous experiment with higher target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. The setting is the same as the experiment in Figure~\ref{fig:het_bar}.}
    \label{fig:het_higher_bar}
    \end{minipage}
\end{figure}

\begin{figure}[H]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/het_lower_fig.pdf} 
        \caption{Training curves of a single Monte Carlo run of the \textit{heterogeneous experiment with lower target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy. The setting is the same as the experiment in Figure~\ref{fig:het_bar}.}
        \label{fig:het_lower_curves}
    \end{minipage}
    \hfill
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering    \centerline{\includegraphics[width=\textwidth]{figures/het_lower_bar.pdf}}
    \caption{Mean time required to reach target accuracy and time gain of $\nameofthealgorithm$ over $\mmsync$ in \textit{the heterogeneous experiment with lower target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. The setting is the same as the experiment in Figure~\ref{fig:het_bar}.}
    \label{fig:het_lower_bar}
    \end{minipage}
\end{figure}


\subsection{Performance of $\nameofthealgorithm$ without static resource allocation } \label{app_sect:var_comp}
We conduct heterogeneous-task experiments to validate the performance gain of dynamic resource allocation \mbox{($\nameofthealgorithm$ (\texttt{D}))} over static option ($\uniform$) with uniform allocation across tasks in heterogeneous settings. For uniform resource allocation, we allocate the same number of active training requests to each task in $\uniform$. To show the consistency of our results, we run experiments at all target accuracy levels in Table~\ref{table:diff_target_acc_updated}. We present the results in Figure~\ref{fig:var_unif_comp_high_bar} (higher target accuracy), Figure~\ref{fig:var_unif_comp_mid_bar} (the target accuracy in the main text), and Figure~\ref{fig:var_unif_comp_low_bar} (lower target accuracy). We conclude that our dynamic client allocation based on the variance estimates of the updates reduces the total training time compared to the uniform static client allocation. The advantage of dynamic resource allocation becomes more prominent with more difficult tasks (i.e., higher target accuracy level).




\begin{figure}[!h]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/var_vs_uniform_higher.pdf} 
        \caption{Training curves of a single Monte Carlo run in the experiment with dynamic resource allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) and static \textit{option} with uniform resource allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with higher target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_high}
    \end{minipage}
    \hfill 
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering
    \centerline{\includegraphics[width=\textwidth]{figures/var_vs_uniform_higher_bar.pdf}}
    \caption{Mean training times required to reach target accuracy and time gain of dynamic resource allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) over static \textit{option} with uniform resource allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with higher target accuracy levels} in Table~\ref{table:diff_target_acc_updated}.}
    \label{fig:var_unif_comp_high_bar}
    \end{minipage}
\end{figure}

\begin{figure}[!h]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/var_vs_uniform_mid.pdf} 
        \caption{Training curves of a single Monte Carlo run in the experiment with dynamic resource allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) and static \textit{option} with uniform resource allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with the target accuracy levels used in the main text} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_mid}
    \end{minipage}
    \hfill 
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering
    \centerline{\includegraphics[width=\textwidth]{figures/var_vs_uniform_mid_bar.pdf}}
    \caption{Mean training times required to reach target accuracy and time gain of dynamic resource allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) over static \textit{option} with uniform resource allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with the target accuracy levels used in the main text} in Table~\ref{table:diff_target_acc_updated}.}
    \label{fig:var_unif_comp_mid_bar}
    \end{minipage}
\end{figure}


\begin{figure}[!h]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/var_vs_uniform_lower.pdf} 
        \caption{Training curves of a single Monte Carlo run in the experiment with dynamic resource allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) and static \textit{option} with uniform resource allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with lower target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_low}
    \end{minipage}
    \hfill 
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering
    \centerline{\includegraphics[width=\textwidth]{figures/var_vs_uniform_lower_bar.pdf}}
    \caption{Mean training times required to reach target accuracy and time gain of dynamic resource allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) over static \textit{option} with uniform resource allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with lower target accuracy levels} in Table~\ref{table:diff_target_acc_updated}.}
    \label{fig:var_unif_comp_low_bar}
    \end{minipage}
\end{figure}




\section{PROOFS OF THE CONVERGENCE ANALYSIS OF $\nameofthealgorithm$ WITH STATIC \textit{OPTION} ($S$)}  \label{app_sect:main_proof}
In this section, we present the proofs of the mathematical claims made in the paper. First, we define and explain the notations used in this section. After that, we introduce intermediate lemmas used in the main proof (Section \ref{sect:intermed_lemmas}). Then, we present the proofs of Theorem~\ref{thm:main} and Corollary~\ref{cor:conv_rate} (Section \ref{sect:proofmain}). Finally, we prove intermediate lemmas (Section \ref{sect:proofs_intermediate_lemmas}).

\subsection{Notations and Definitions}
$\nameofthealgorithm$ enables us to divide the convergence analyses of simultaneous tasks into individual ones. We focus on the convergence analysis of a single task within a simultaneous multi-model setting and the analysis holds for all tasks trained together. For brevity, we provide the proofs for a single task of multiple models trained simultaneously. Therefore, we drop all model indices in our analysis. We also drop time indices from the number of active requests ($\aw$) and buffer size ($\bs$) terms as they remain the same during the training with the static option of $\nameofthealgorithm$. Table~\ref{tab:notation_summary} summarizes all notation. Please note that the analysis presented here holds for every model $m\in[M]$ simultaneously trained within $\nameofthealgorithm$ framework.

\subsubsection{The Update Rules of $\nameofthealgorithm$}
We first revisit the local training and global update rules of $\nameofthealgorithm$. The notation may vary slightly from that in the main paper because we drop model indices, but it still accurately depicts the same algorithmic procedures, namely Algorithms~\ref{alg:cap} and \ref{alg:main} in the main text.

\paragraph{Local update rule.} During local training, clients perform $\locit$ consecutive local stochastic gradient steps and return the output to the server. When a client receives the $t$\textsuperscript{th} version of the global model, $\xt$, it takes $\locit$ mini-batch stochastic gradient descent steps (for $k=1,\dots,\locit$) with the following rule:
\begin{align}
    \xitk\gets\x_i^{(t,k-1)}-\lrc\tG\fii{\x_i^{(t,k-1)}},
\end{align}
where $\x_i^{(t,0)}\triangleq\xt$ and $\widetilde{\G}$ denotes stochastic gradients. We define the average of local stochastic gradients as \mbox{$\displaystyle\delit\triangleq\frac{1}{\locit}\sum_{k=0}^{\locit-1}\tG\fii{\x_i^{(t,k)}}$}. Then, the client returns $\displaystyle\frac{\xt-\x_i^{(t,\tau)}}{\locit\lrc} = \frac{1}{\locit}\sum_{k=0}^{\locit-1}\tG\fii{\x_i^{(t,k)}}=\delit$ to the server. The server stores the updates in a buffer.

\paragraph{Staleness.} The server receives the updates of local training requests asynchronously. It means that the received updates may come in a different order than local training requests sent to clients. Therefore, an aggregated update may have been calculated with an older version of the model, and this is called \textit{staleness}. We quantify the staleness of an update in terms of the number of global rounds passed between the times when the server sends the local training request and receives the update. The staleness is random for each update, depending on client selections for all tasks and all clients' availability, computation, and communication speeds. We denote the staleness of client $i$'s update received at the server at the $t$\textsuperscript{th} round as $\rdit$. Recall that Assumption~\ref{assump:maxstale} (Bounded Staleness) bounds this random value above at $\rdm$.

\paragraph{Global update rule.} On each global round $t$, when the buffer at the server, $\setB$, is full ($|\setB|=\bs$, where $\bs$ is the buffer size), the server aggregates the updates to proceed to the next global round. Here, $\setB$ is the set of clients whose updates are received after $(t-1)$\textsuperscript{th} and before $t$\textsuperscript{th} aggregation. The aggregation rule can be written as follows:
\begin{align} \label{line:globalupdate}
    \x^{(t+1)}\gets\xt-\locit\lrs\lrc\frac{1}{\bs}\sum_{i\in\setB}\delito&=\xt-\lrs\frac{1}{\bs}\sum_{i\in\setB}\lp\xtoi-\x_i^{(t-\rdit,\tau)}\rp\\
    &=\xt-\lrs\lrc\frac{1}{\bs}\sum_{i\in\setB}\sum_{k=0}^{\locit-1}\tG\fii{\x_i^{(t-\rdit,k)}}.\nn
\end{align}



\subsubsection{Virtual Sequence and Set Definitions\label{sect:virtual_seq_and_set_defns}}
We utilize the perturbed iterate idea from \cite{sharper,perturbedIterate}. 

First, let us introduce some helpful sets and notations. Consider $\setA$, which represents the set of clients chosen by the server to receive the $t$\textsuperscript{th} version of the model. Recall that the server in $\nameofthealgorithm$ selects the clients uniformly at random with replacement from all clients. The size of this set, $|\setA|$, is always equal to the buffer size, $\bs$, (except initialization, $t=0$) because $\bs$ new local training requests are made on each round. For instance, if $\bs$ is set to $3$, and the server selects the 2\textsuperscript{nd}, 16\textsuperscript{th}, and 31\textsuperscript{st} clients during the 4\textsuperscript{th} aggregation round, then $\setAt{4}$ is $\{2, 16, 31\}$. The server sends $\x^{(4)}$ to the 2\textsuperscript{nd}, 16\textsuperscript{th}, and 31\textsuperscript{st} clients and requests local training with this model. In practical terms, $\setA$ is a multiset, allowing multiple occurrences of the same client if a client is selected more than once. Throughout the proof, we consider each occurrence of the same client in multiset as a distinct update calculated on that particular client. While we acknowledge a slight abuse of notation, this does not lead to any mathematical flaw, and we believe that this significantly enhances the clarity and comprehensibility of the proof.

Now, let us define $\setC$ as the set of clients that have incomplete local training requests at the time of the $t$\textsuperscript{th} aggregation because of the asynchronous nature of $\nameofthealgorithm$. The size of this set, $|\setC|$, is always equal to the number of active local training requests, $\aw$, because the server sends a new local training request for every update it receives. For instance, if $\aw$ is $4$, and the server has sent local training requests to the 12\textsuperscript{th}, 27\textsuperscript{th}, 41\textsuperscript{st}, and 55\textsuperscript{th} clients prior to the 5\textsuperscript{th} aggregation, yet these clients are still processing their updates, then $\setCt{5}$ would be $\{12, 27, 41, 55\}$. Note that $\setCt{0}$ is an empty set, as there are no active local training requests before the algorithm starts. It is worth noting that $\setC$ is a multiset, allowing multiple occurrences of the same client if a client has more than one active local training request (recall that multiple requests are queued at the client side). Each occurrence of a client within this multiset represents a different local training calculated on that client. We again acknowledge a slight abuse of notation, but this does not lead to any mathematical flaw, and we believe that this makes the flow of proof significantly easier.

Next, we define the virtual sequence $\vst$ for $t=0,1,\dots, T$ as the model that receives local training updates of the global model, $\xt$ for $t=0,1,\dots, T$, in the correct order. Namely, unlike $\xt$, $\vst$ receives the local training updates in the order in which the server sends those requests. However, it is crucial to note that the local training updates are still calculated with the global model, $\xt$. The update rule of the virtual sequence is:
\begin{align}\label{line:vsupdate}
    \vstp\gets\vst-\locit\lrs\lrc\frac{1}{\bs}\sumA\delit=\vst-\locit\lrs\lrc\frac{1}{\bs}\sumA\frac{1}{\locit}\sumLock\tG\fii{\xitk}, 
\end{align}
for $t=0,1,\dots,T-1$ where $\vs^{(0)}\triangleq\x^{(0)}.$

\begin{remark} \label{obs:diff_z_x}Now, we state an observation using the definitions of $\setC$, the virtual sequence, and the global model. When the $t$\textsuperscript{th} aggregation happens at the server, the virtual sequence, $\vst$, has received all the updates from all previous local training requests on rounds $0,1,\dots,t-1$. At the same time, the global model, $\xt$, has received the same updates except for the updates of clients in $\setC$. By the update rules in (\ref{line:globalupdate}) and (\ref{line:vsupdate}), note that each received update at the server contributes to the global model and virtual sequence equally. Therefore, we can express their difference as:
\begin{align}
    \vst-\xt=-\locit\lrs\lrc\frac{1}{\bs}\sumC\delito.
\end{align}
\end{remark}



\begin{remark} \label{obs:counting_max_updates}
    If we count the number of occurrences of the round index $y$ among the model versions of assigned updates in set $\setC$ over all rounds $t=0,\dots,T-1$, we can bound this value as:
    \begin{align} \label{line:obs2}
    \sumtel\sumC\mathbf{1}\{t-\rdit=y\}\leq\bs\rdm,\quad\forall y=0,\dots,T-1,
    \end{align}
    where $\mathbf{1}$ is an indicator function that returns $1$ if the statement is true, and returns $0$ otherwise. The reasoning for this observation is as follows. On each round, the server selects $\bs$ clients (since the server selects one new client for each received and buffered update where the buffer size is $\bs$) and sends them the up-to-date global model. We also know that all local training requests must be returned to the server within $\rdm$ rounds by Assumption~\ref{assump:maxstale} (Bounded Staleness). Therefore, over the rounds $t=0,\dots,T-1$, any round index can appear at most $\bs\rdm$ times in the summation on the left-hand side of the inequality in (\ref{line:obs2}). We will use this remark later in the proof.
\end{remark}

% \begin{remark}
%     When the norms of the gradients of global models that the clients in $\setC$ use are averaged over the rounds $t=0,\dots,T-1$, the obtained expression can be bounded as: 
%     % The sum of the norms of the global model's gradients calculated with the stale version that clients in $\setC$ receive over the rounds $t= 0,\dots,T-1$ can be bounded as:
%     \begin{align} \label{line:obs2}
%     \avgtelm\sumC\normbs{\G\f{\xtoi}}\leq\bs\rdm\avgtelm\normbs{\G\f{\xt}}.
%     \end{align}
    
%     The reasoning for this observation is as follows. On each round, the server selects $\bs$ clients (except for $t=0$, but $\mathcal{C}^{(0)}$ is already an empty set) and sends them the up-to-date global model. We also know that all local training requests must be returned to the server within $\rdm$ rounds by Assumption 4 (Bounded Staleness). Therefore, over the rounds $t=0,\dots,T-1$, the gradient's norm of any global model version $t\in\{0,1,\dots,T-1\}$ can appear at most $\bs\times\rdm$ times in the summation in the left-hand side of the inequality in (\ref{line:obs2}). We will use this remark later in the proof.
% \end{remark}

\subsubsection{Notation}
We define some useful variables used in the proof and present the notation used in $\nameofthealgorithm$ in Table~\ref{tab:notation_summary}. Also, we again want to remind the reader that we dropped all model indices in the proof as the theoretical results we present here hold for any of multiple tasks trained simultaneously, satisfying Assumptions \ref{assump:smoothness} - \ref{assump:maxstale}.


% Please add the following required packages to your document preamble:
% \usepackage{graphicx}
\renewcommand{\arraystretch}{1.7} % Change 1.5 to whatever factor you want
% Please add the following required packages to your document preamble:
% \usepackage{graphicx}
\begin{table}[H]
\centering
\caption{Summary of notations used in the mathematical analysis of $\nameofthealgorithm$.}
\label{tab:notation_summary}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{|l|l|}
\hline
$\fii{\cdot}$: The loss function at client $i$ &
  $\Li$: Smoothness constant in Assumption \ref{assump:smoothness} \\ \hline
$\f{\cdot}$: The global loss function &
  $\locit$: Number of local SGD steps \\ \hline
$\xt$: The global model at the $t^{\text{th}}$ round &
  $\lrc$: Client-side learning rate \\ \hline
\begin{tabular}[c]{@{}l@{}}$\xitk$: The local model of client $i$ at the $k^{\text{th}}$ local step of the\\ $t^{\text{th}}$ round\end{tabular} &
  $\lrs$: Server-side learning rate \\ \hline
$\vst$: The virtual sequence at the $t^{\text{th}}$ round (Section~\ref{sect:virtual_seq_and_set_defns})&
  $\bs$: Buffer size \\ \hline
$\nabla, \Tilde{\nabla}$: Gradient and stochastic gradient operators &
  $\lhets$: Maximum local variance in Assumption~\ref{assump:lochet} \\ \hline
\begin{tabular}[c]{@{}l@{}}$\sgitk=\tG\fii{\xitk}$: Local stochastic gradient of client $i$\\  at round $t$ and local step $k$\end{tabular} &
  $\ghets$: Maximum global variance in Assumption~\ref{assump:globhet} \\ \hline
$\displaystyle\delit=\frac{1}{\locit}\sumLock\sgitk$: The update of client $i$ at round $t$ &
  $\aw$: Total number of active local training requests at any time \\ \hline
$\edelit=\expb{\delit}$: The expected update of client $i$ at round $t$ &
  $\rdm$: Maximum staleness in Assumption~\ref{assump:maxstale} \\ \hline
\begin{tabular}[c]{@{}l@{}}$\lrst=\lrs\locit$: Server learning rate multiplied by the number \\ of local training steps\end{tabular} &
  $\rdit$: The staleness of client $i$'s update at round $t$ \\ \hline
\begin{tabular}[c]{@{}l@{}}$\setA$: The set of clients to which the server sends\\the $t^{\text{th}}$ version of the model (Section~\ref{sect:virtual_seq_and_set_defns})\end{tabular} &
  \begin{tabular}[c]{@{}l@{}}$\setC$: The set of clients which are requested local training, but\\ have not returned their updates to the server yet (Section~\ref{sect:virtual_seq_and_set_defns})\end{tabular} \\ \hline
\end{tabular}%
}
\end{table}
\renewcommand{\arraystretch}{1} % Change 1.5 to whatever factor you want

\subsection{Intermediate Lemmas}\label{sect:intermed_lemmas}

We present intermediate lemmas used throughout the proof.

{\allowdisplaybreaks \begin{lemma} For a set of $Q$ vectors, \(\mathbf{u}_1,\dots,\mathbf{u}_Q\), where $Q$ is a positive integer,
    {\allowdisplaybreaks \begin{align}
        \normbs{\sum_{q=1}^Q\mathbf{u}_q}\leq Q\sum_{q=1}^Q\normbs{\mathbf{u}_q}.\nn
    \end{align}}
    \label{lemma:sos}
\textit{Proof.} The lemma is a direct consequence of Jensen's inequality with a convex function \(\normbs{\cdot}\) and uniform random distribution over the set of vectors \(\mathbf{u}_1,\dots,\mathbf{u}_Q\).
\end{lemma}}

{\allowdisplaybreaks \begin{lemma} Suppose that $\fii{\cdot}$ satisfies Assumption \ref{assump:smoothness} (Smoothness) and Assumption~\ref{assump:lochet} (Bounded Variance) for all $i\in[N]$, and assume that $\lrc\leq\frac{1}{L\locit}$. Then the iterates of $\nameofthealgorithm$ satisfy,
\begin{align}
    \expns{\G\fii{\xt}-\edelit}\leq\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\fii{\xt}},\:\:\forall i\in[\N],\nn
\end{align}
where $\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$.

Further, suppose Assumption~\ref{assump:globhet} (Bounded Heterogeneity) holds. Then, the iterates of $\nameofthealgorithm$ satisfy,
{\allowdisplaybreaks \begin{align}
    \frac{1}{\N}\sumAll\expns{\edelit-\G\fii{\xt}}\leq\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\ghets.\nn
\end{align}}
\label{lemma:L1_1}
\end{lemma}

\begin{remark}
     The true gradient at any client using the global model is close to the local update of that client.
\end{remark}}


{\allowdisplaybreaks \begin{lemma} The iterates of $\nameofthealgorithm$ and defined virtual sequence satisfy,
%The negative inner product of the global loss function's gradient at virtual sequence and the population mean of all expected updates is bounded.
\begin{equation}
    \T{1}\triangleq-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAll\edelit}\leq-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}}+\frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}.\nn
\end{equation}
\label{lemma:T1}
\end{lemma}}


{\allowdisplaybreaks \begin{lemma} Suppose that $\fii{\cdot}$ satisfies Assumption~\ref{assump:lochet} (Bounded Variance and Unbiased Stochastic Gradients) for all $i\in[N]$, then the iterates of $\nameofthealgorithm$ satisfy, 
{\allowdisplaybreaks \begin{align}
    \T{2}\triangleq\expns{\frac{1}{\bs}\sumA\delit}\leq\expns{\frac{1}{\bs}\sumA\edelit}+\frac{\lhets}{\locit\bs}.\nn
\end{align}}
\label{lemma:T2}
\end{lemma}}


\begin{remark}
The noisy global update due to stochastic gradients is close to the expected update calculated with full gradients. The buffer and multiple local steps are useful to reduce the variance due to local SGD steps. 
\end{remark}

{\allowdisplaybreaks \begin{lemma} The iterates of $\nameofthealgorithm$ satisfy, 
%\ps{This description should come after the lemma statement as an explanation. It does not belong in the lemma statement.}
\begin{align}
    \expns{\frac{1}{\bs}\sumAP{\G\fii{\xt}-\G\f{\xt}}}=\frac{1}{\bs\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}}.\nn
\end{align}
Further, suppose Assumption~\ref{assump:globhet} (Bounded Heterogeneity) holds. Then, the iterates of $\nameofthealgorithm$ also satisfy,
{\allowdisplaybreaks \begin{align}
    \T{3}\triangleq\expns{\frac{1}{\bs}\sumA\edelit}\leq\frac{3}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{3\ghets}{\bs}+3\expns{\G\f{\xt}}.\nn
\end{align}}
\label{lemma:clientselection_and_T3}
\end{lemma}}

\begin{remark}
 $\nameofthealgorithm$ benefits from global variance reduction thanks to the buffer.
\end{remark}


{\allowdisplaybreaks \begin{lemma} The virtual sequence and the iterates of $\nameofthealgorithm$ satisfy,
    {\allowdisplaybreaks \begin{align}
        \avgtelm\expns{\G\f{\vst}-\G\f{\xt}}&\leq\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets\nn\\&+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}.\nn
    \end{align}}
    \label{lemma:sequence_diff}
\end{lemma}}
\begin{remark}
As discussed in Remark~\ref{obs:diff_z_x}, although the virtual sequence and global model get updates in a different order, they receive the same updates. Therefore, we can bound their difference.
\end{remark}




\subsection{Proofs of Main Statements}\label{sect:proofmain}
We present and prove Theorem~\ref{theorem:main} and Corollary~\ref{cor:conv_rate} here.
\subsubsection{Theorem 1 (Convergence bound)}
First we restate the theorem:

\textbf{Theorem 1. \textit{(Convergence bound):}}
\textit{Suppose Assumptions \ref{assump:smoothness} - \ref{assump:maxstale} hold, there are $\aw$ active local training requests, and the server and client learning rates, $\lrs, \lrc$ respectively, satisfy $\lrs\leq\sqrt{\locit\bs}$ and $\lrc\leq\min\lcb\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, where $\bs$ is the buffer size, and $\locit$ is the number of local training steps. Then, the iterations of Algorithm~\ref{alg:main} ($\nameofthealgorithm$) satisfy:}
\begin{align*}
    \avgtelm & \expns{\G\f{\xt}} \leq \bOP{\frac{\f{\x^{(0)}} - \min_\x \f{\x}}{T\lr\locit}}+\bOP{\lp\frac{\Li\lr}{\bs}+\Li^2\lrc^2\locit+\frac{\Li^2\lrsq\locit\aw}{\bs^2}\rp\lhets}\nn\\&+\bOP{\lp\frac{\Li\lr\locit}{\bs}+\Li^2\lrc^2\locit\lp\locit-1\rp+\frac{\Li^2\lrsq\locit^2\aw^2}{\bs^2}\rp\ghets}\nn.
\end{align*}


\textit{Proof.}
Using the update rule of the virtual sequence (\ref{line:vsupdate}) and Assumption ($\asmpt$) \ref{assump:smoothness} (Smoothness), and taking the conditional expectation with respect to $\vst$, we have,
{\allowdisplaybreaks \begin{align}
    \expb{\f{\vstp}}&\leq\f{\vst}+\inp{\G\f{\vst}}{\expb{\vstp-\vst}}+\frac{\Li}{2}\expns{\vstp-\vst}\nn
    \\
    &= \f{\vst}+\inp{\G\f{\vst}}{\expb{-\lrst\lrc\frac{1}{\bs}\sumA\delit}}+\frac{\Li}{2}\expns{\lrst\lrc\frac{1}{\bs}\sumA\delit}\nn\\
     & \overset{\asmpt~\ref{assump:lochet}}{=} \f{\vst}-\lrst\lrc\frac{1}{\bs}\inp{\G\f{\vst}}{\expb{\sumA\edelit}}+\frac{\Li}{2}\lrst^2\lrc^2\expns{\frac{1}{\bs}\sumA\delit}\nn\\
     % &= \f{\vst}-\lrst\lrc\expb{\inp{\G\f{\vst}}{\frac{1}{\N}\sumAll\edelit}}+\frac{\Li}{2}\lrst^2\lrc^2\expns{\frac{1}{\bs}\sumA\delit}\nn \tag{} \\
     &\overset{\substack{\textit{Uniform}\\\textit{client}\\\textit{selection}}}{=} \f{\vst}+\lrst\lrc \expb{ \underbrace{-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAll\edelit}}_{\triangleq\T{1}}}+\frac{\Li}{2}\lrst^2\lrc^2\underbrace{\expns{\frac{1}{\bs}\sumA\delit}}_{\triangleq\T{2}}\nn.
\end{align}}
Using Lemmas~\ref{lemma:T1} and \ref{lemma:T2}, we can bound $\T{1}$ and $\T{2}$. Then, dividing both sides by $\lrst\lrc$:
{\allowdisplaybreaks \begin{align}
    % &\frac{\expb{\f{\vstp}}-\f{\vst}}{\lrt} \nn \\
    % & \leq-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}} + \frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}} \nn\\
    % & \quad +\frac{\Li\lrt}{2}\lp\expns{\frac{1}{\bs}\sumA\edelit}+\frac{\lhets}{\locit\bs}\rp \nn  \\
    &\frac{\expb{\f{\vstp}}-\f{\vst}}{\lrt}\leq-\frac{1}{2}\expns{\G\f{\xt}}+\frac{1}{2}\expns{\G\f{\vst}-\G\f{\xt}}  \nn \\
    & \quad+ \frac{1}{2\N}\sumAllP{\expns{\edelit-\G\fii{\xt}}} +\frac{\Li\lrt}{2}\underbrace{\expns{\frac{1}{\bs}\sumA\edelit}}_{\triangleq\T{3}} + \frac{\Li\lrt}{2} \frac{\lhets}{\locit\bs}. \nn% \tag{\ps{expectations missing in this eq.}}
\end{align}}
Using Lemma~\ref{lemma:clientselection_and_T3}, we can bound $\T{3}$:
{\allowdisplaybreaks \begin{align}
    &\frac{\expb{\f{\vstp}}-\f{\vst}}{\lrt} \nn \\
    & \leq -\frac{1}{2}\expns{\G\f{\xt}}+\frac{1}{2} \expns{\G\f{\vst}-\G\f{\xt}}+\frac{1}{2\N}\sumAllP{\expns{\edelit-\G\fii{\xt}}}\nn\\
    & \quad +\Li\lrt\lp \frac{3}{2\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{3\ghets}{2\bs}+\frac{3}{2}\expns{\G\f{\xt}} +\frac{\lhets}{2\locit\bs}\rp\nn\\
    &=\lp-\frac{1}{2}+\frac{3\Li\lrt}{2}\rp\expns{\G\f{\xt}}+\frac{1}{2} \mbe \normbs{\G\f{\vst}-\G\f{\xt}}+\Li\lrt\lp\frac{3\ghets}{2\bs}+\frac{\lhets}{2\locit\bs}\rp\nn\\
    & \quad +\lp\frac{3\Li\lrt}{2}+\frac{1}{2}\rp\frac{1}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}\nn\\
    &\overset{\textit{Lemma}~\ref{lemma:L1_1}}{\leq}\lp-\frac{1}{2}+\frac{3\Li\lrt}{2}\rp\expns{\G\f{\xt}}+\frac{1}{2} \mbe \normbs{\G\f{\vst}-\G\f{\xt}}+\Li\lrt\lp\frac{3\ghets}{2\bs}+\frac{\lhets}{2\locit\bs}\rp\nn\\
    & \quad +\lp\frac{3\Li\lrt}{2}+\frac{1}{2}\rp\lp\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\ghets\rp\nn\\
    &=\lp-\frac{1}{2}+\frac{3\Li\lrt}{2}+\frac{\dd}{2\lp1-\dd\rp}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}\rp\expns{\G\f{\xt}}+\frac{1}{2} \mbe \normbs{\G\f{\vst}-\G\f{\xt}}\nn\\
    & \quad +\lp\frac{\Li\lrt}{2\locit\bs}+\frac{3\Li^3\lrc^3\lrst\locit}{4\lp1-\dd\rp}+\frac{\Li^2\lrc^2\locit}{4\lp1-\dd\rp}\rp\lhets+\lp\frac{3\Li\lrt}{2\bs}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}+\frac{\dd}{2\lp1-\dd\rp}\rp\ghets\nn,
\end{align}}
where $\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$. Using the tower property of conditional expectation, telescoping the inequality over the round indices $t=0,1,\dots, T-1$, and using Lemma \ref{lemma:sequence_diff}, we get,
{\allowdisplaybreaks \begin{align}
    &\avgtelPm{\frac{1}{2}-\frac{3\Li\lrt}{2}-\frac{\dd}{2\lp1-\dd\rp}-\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}}\expns{\G\f{\xt}}\leq
    \frac{1}{2T}\sumtel\expns{\G\f{\vst}-\G\f{\xt}}\nn\\&+\frac{\f{\vs^{(0)}}-\expb{\f{\vs^{\lp T\rp}}}}{T\lrt}+\lp\frac{\Li\lrt}{2\locit\bs}+\frac{3\Li^3\lrc^3\lrst\locit}{4\lp1-\dd\rp}+\frac{\Li^2\lrc^2\locit}{4\lp1-\dd\rp}\rp\lhets+\lp\frac{3\Li\lrt}{2\bs}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}+\frac{\dd}{2\lp1-\dd\rp}\rp\ghets\nn\\
    &\overset{\textit{Lemma}~\ref{lemma:sequence_diff}}{\leq}\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{2\bs^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{2\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{2\bs}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{\f{\vs^{(0)}}-\expb{\f{\vs^{\lp T\rp}}}}{T\lrt}+\lp\frac{\Li\lrt}{2\locit\bs}+\frac{3\Li^3\lrc^3\lrst\locit}{4\lp1-\dd\rp}+\frac{\Li^2\lrc^2\locit}{4\lp1-\dd\rp}\rp\lhets+\lp\frac{3\Li\lrt}{2\bs}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}+\frac{\dd}{2\lp1-\dd\rp}\rp\ghets\nn.
\end{align}}

When the learning rates satisfy $\lrs\leq\sqrt{\locit\bs}$ (which also makes $\lrst\leq\locit\sqrt{\locit\bs}$) and $\lrc\leq\min\lcb\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, the following inequality holds:
% https://www.wolframalpha.com/input?i=.5-3%2F12-1%2F30-1%2F60-51%2F480

{\allowdisplaybreaks \begin{align}
\frac{1}{2}-\frac{3\Li\lrt}{2}-\frac{\dd}{2\lp1-\dd\rp}-\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}-\frac{3\Li^2\lrsqt\aw\rdm}{2\bs}\frac{1+\dd}{1-\dd}\geq\agrc.
\label{proof:numeric_inequality}
\end{align}}

Also, notice that $\vs^{(0)}$ is equal to $\x^{(0)}$ by definitions (Section~\ref{sect:virtual_seq_and_set_defns}) of these sequences and $\min_\x \f{\x}\leq \f{\vs^{\lp T\rp}}$.
{\allowdisplaybreaks 
\begin{align}
    & \avgtelm \expns{\G \f{\xt}} \leq \agrci \frac{\f{\x^{(0)}} - \min_\x \f{\x}}{T\lrt}\tag{Using \eqref{proof:numeric_inequality}}\\
    &+\agrci\lp\frac{\Li\lrt}{2\locit\bs}+\frac{3\Li^3\lrc^3\lrst\locit}{4\lp1-\dd\rp}+\frac{\Li^2\lrc^2\locit}{4\lp1-\dd\rp}+\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{2\bs^2\locit}\rp\lhets\nn\\
    &+\agrci\lp \frac{3\Li\lrt}{2\bs}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}+\frac{\dd}{2\lp1-\dd\rp} + \frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{2\bs^2} \rp\ghets\nn.
\end{align}}

Define $\gapTerm \triangleq \f{\x^{(0)}} - \min_\x \f{\x}$. After reducing high-order terms using the assumptions, $\lrs\leq\sqrt{\locit\bs}$ (which also makes $\lrst\leq\locit\sqrt{\locit\bs}$) and $\lrc\leq\min\lcb\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, and incorporating the constants into the $\mco(\cdot)$ notation, we have:
{\allowdisplaybreaks \begin{align}
    \avgtelm & \expns{\G\f{\xt}} \leq \bOP{\frac{\gapTerm}{T\lr\locit}}+\bOP{\lp\frac{\Li\lr}{\bs}+\Li^2\lrc^2\locit+\frac{\Li^2\lrsq\locit\aw}{\bs^2}\rp\lhets}\nn\\&+\bOP{\lp\frac{\Li\lr\locit}{\bs}+\Li^2\lrc^2\locit\lp\locit-1\rp+\frac{\Li^2\lrsq\locit^2\aw^2}{\bs^2}\rp\ghets}\nn.
\end{align}}
This concludes the proof.
%\ps{This looks correct.}

\subsubsection{Proof of Corollary 1 (Convergence Rate)}
% Remind that the only assumptions we used throughout the proof are $\lrs\leq\sqrt{\bs}$ and $\lrc\leq\min\lcb\frac{1}{4\Li\locit},\frac{1}{8\Li\sqrt{\bs}},\frac{1}{4\Li\sqrt{\aw\rdm}}\rcb$.
First, notice that the learning rates $\lrs=\sqrt{\locit\bs}$ and $\lrc=\min\lcb\frac{1}{\locit\Li \sqrt{T}},\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$ satisfy the assumptions ($\lrs\leq\sqrt{\locit\bs}$ and $\lrc\leq\min\lcb\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$) used throughout the proof.


When $T\geq\max\lcb36\bs\locit,16\locit\aw\rdm\rcb$, set the learning rates $\lrs=\sqrt{\locit\bs}$ and $\lrc=\frac{1}{\locit\Li \sqrt{T}}$. Then, the bound in Theorem~\ref{theorem:main} reduces to:
{\allowdisplaybreaks \begin{align}
    \avgtelm&\expns{\G\f{\xt}}\leq\bOP{\frac{\Li}{\sqrt{T\bs\locit}}}\gapTerm+\bOP{\frac{1}{\sqrt{T\bs\locit}}+\frac{1}{\locit T}+\frac{\aw}{T\bs}}\lhets+\bOP{\sqrt{\frac{\locit}{T\bs}}+\frac{1}{T}+\frac{\locit\aw^2}{T\bs}}\ghets\nn.
\end{align}}

\subsection{Proofs of Intermediate Lemmas}
\label{sect:proofs_intermediate_lemmas}
\textit{Proof of Lemma \ref{lemma:L1_1}.} 
We borrow the proof technique from \citep[C.5]{fednova}.
{\allowdisplaybreaks \begin{align}
    &\expns{\G\fii{\xt}-\edelit}=\expns{\G\fii{\xt}-\frac{1}{\locit}\sumLock\G\fii{\xitk}}
    % =\expns{\frac{1}{\locit}\sumLockP{\G\fii{\xt}-\G\fii{\xitk}}}
    \nn\\
    &\overset{\textit{Lemma}~\ref{lemma:sos}}{\leq}\frac{1}{\locit}\sumLocklims{1}{-1}\expns{\G\fii{\xt}-\G\fii{\xitk}}\nn\\
    &\overset{\asmpt~\ref{assump:smoothness}}{\leq}\frac{\Li^2}{\locit}\underbrace{\sumLocklims{1}{-1}\expns{\xt-\xitk}}_{\triangleq\T{recursive}}=\frac{\Li^2\lrc^2}{\locit}\sumLocklims{1}{-1}\expns{\sumvlims{0}{-1}\sgitv}\label{lemma1_rec_line1}\\
    % &\leq\frac{\Li^2\lrc^2}{\locit}\sumLocklims{1}{-1}\expns{\sumvlimsP{0}{-1}{\sgitv-\G\fii{\xitv}+\G\fii{\xitv}}}\nn\\
    &\overset{\substack{\textit{Lemma 2 in}\\\text{\citep{fednova}}}}{=}\frac{\Li^2\lrc^2}{\locit}\sumLocklimsP{1}{-1}{\sumvlims{0}{-1}\expns{\sgitv-\G\fii{\xitv}}+\expns{\sumvlims{0}{-1}\G\fii{\xitv}}}\tag{Using Assumption~\ref{assump:lochet}}
    %\tag{\ps{refer to local unbiased assumption here}}
    \\
    &\overset{\textit{Lemma}~\ref{lemma:sos}}{\leq}\frac{\Li^2\lrc^2}{\locit}\sumLocklims{1}{-1}\sumvlimsP{0}{-1}{\expns{\sgitv-\G\fii{\xitv}} +\kk\expns{\G\fii{\xitv}}}\nn\\
    &\overset{\asmpt~\ref{assump:lochet}}{\leq}\frac{\Li^2\lrc^2}{\locit}\sumLocklimsP{1}
{-1}{\kk\lhets+\kk\sumvlims{0}{-1}\expns{\G\fii{\xitv}}}\nn\\
    &\leq\frac{\Li^2\lrc^2}{\locit}\lp\frac{\lp\locit-1\rp\locit}{2}\lhets+\frac{\lp\locit-1\rp\locit}{2}\sumLocklims{0}{-2}\expns{\G\fii\xitk}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+\sumLocklims{0}{-2}\expns{\G\fii\xitk}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+\sumLocklimsP{0}{-2}{2\expns{\G\fii\xitk-\G\fii\xt}+2\expns{\G\fii\xt}}\rp\nn\\
    &\overset{\asmpt~\ref{assump:smoothness}}{\leq}\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+\sumLocklimsP{0}{-2}{2\Li^2\expns{\xitk-\xt}+2\expns{\G\fii\xt}}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+\sumLocklimsP{1}{-1}{2\Li^2\expns{\xitk-\xt}+2\expns{\G\fii\xt}}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+2\Li^2\T{recursive} + 2 \locit \expns{\G\fii\xt}\rp\label{lemma1_rec_line2}.
\end{align}}
Using the recursive appearances of $\T{recursive}$ in \eqref{lemma1_rec_line1} and \eqref{lemma1_rec_line2}:
{\allowdisplaybreaks \begin{align}
    \frac{\T{recursive}}{\locit}&=\frac{1}{\locit}\sumLocklims{1}{-1}\expns{\xitk-\xt}\leq\lrc^2\frac{\locit-1}{2}\lhets+\lrc^2\locit\lp\locit-1\rp\expns{\G\fii{\xt}}+\lrc^2\Li^2\lp\locit-1\rp\T{recursive}\nn.
\end{align}}
Rearranging the terms, defining $\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$, and assuming $\lrc\leq\frac{1}{L\locit}$, which makes $\dd<1$,
{\allowdisplaybreaks \begin{align}
    \expns{\G\fii{\xt}-\edelit}&\leq\frac{\Li^2\T{recursive}}{\locit}\leq\frac{\Li^2\lrc^2\lp\locit-1\rp\lhets/2+\Li^2\lrc^2\locit\lp\locit-1\rp\expns{\G\fii\xt}}{1-\Li^2\lrc^2\locit\lp\locit-1\rp}\nn\\
    &\leq\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\fii{\xt}},\:\:\forall i\in[\N].\nn
\end{align}}
This proves the first part of Lemma~\ref{lemma:L1_1}. Now, averaging it across clients:
{\allowdisplaybreaks \begin{align}
    &\frac{1}{\N}\sumAll\expns{\edelit-\G\fii{\xt}}\leq\frac{1}{\N}\sumAllP{\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\fii{\xt}}}\nn\\
    &= \frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\frac{1}{\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}+\G\f{\xt}}\nn\\
    &= \frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\frac{1}{\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}}\nn\\&+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}\nn+\frac{\dd}{1-\dd}\frac{2}{\N}\sumAll\inp{\G\fii{\xt}-\G\f{\xt}}{\G\f{\xt}}\nn\\
    &\overset{\asmpt~\ref{assump:globhet}}{\leq}\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\ghets.\tag{Since $\mfrac{1}{N}\sum_{i=1}^N\G\fii{\xt}=\G\f{\xt}$}
\end{align}}
This concludes the proof of Lemma \ref{lemma:L1_1}. 

\textit{Proof of Lemma~\ref{lemma:T1}.}
{\allowdisplaybreaks \begin{align}
    \T{1}&\triangleq-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAll\edelit}=-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}+\G\f{\xt}}}\nn\\
    &=-\inp{\G\f{\vst}}{\G\f{\xt}}-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}\nn\\
    &=-\frac{1}{2}\normbs{\G\f{\vst}}-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}}-\frac{1}{2}\normbs{\G\f{\vst}}\nn\\
    &-\frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}+\frac{1}{2}\normbs{\G\f{\vst}-\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}\nn\\
    &\overset{\textit{Lemma}~\ref{lemma:sos}}{\leq}-\normbs{\G\f{\vst}}-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}}\nn\\
    &+\frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}+\normbs{\G\f{\vst}}\nn\\
    &=-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}}+\frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}\nn.
\end{align}}

\textit{Proof of Lemma~\ref{lemma:T2}.}
{\allowdisplaybreaks \begin{align}
    \T{2}&\triangleq\expns{\frac{1}{\bs}\sumA\delit}=\expns{\frac{1}{\bs}\sumA\edelit+\frac{1}{\bs}\sumAP{\delit-\edelit}}\nn\\
    &=\expns{\frac{1}{\bs}\sumA\edelit+\frac{1}{\bs}\sumAP{\frac{1}{\locit}\sumLock\lp\sgitk-\G\fii{\xitk}\rp}}\nn\\
    &= \expns{\frac{1}{\bs}\sumA\edelit}+\expns{\frac{1}{\bs}\sumAP{\frac{1}{\locit}\sumLock\lp\sgitk-\G\fii{\xitk}\rp}}\nn \tag{Using Assumption~\ref{assump:lochet}}\\
    &\overset{\substack{\textit{Lemma 2 in}\\\text{\citep{fednova}}}}{=} \expns{\frac{1}{\bs}\sumA\edelit}+\frac{1}{\bs\N}\sumAll{\frac{1}{\locit^2}\sumLock\expns{\lp\sgitk-\G\fii{\xitk}\rp}}\nn 
    %\tag{\ps{Shouldn't it be $\frac{1}{Nb}$ in 2nd term rather than $\frac{1}{N^2}$?}}
    \\
    &\leq\expns{\frac{1}{\bs}\sumA\edelit}+\frac{\lhets}{\locit\bs}.\nn
\end{align}}
% \ps{Final result looks good. Check the comments above.}

\textit{Proof of Lemma \ref{lemma:clientselection_and_T3}.}

\begin{align}
    &\expns{\frac{1}{\bs}\sumAP{\G\fii{\xt}-\G\f{\xt}}}\nn\\
    &=\frac{1}{\bs^2}\expb{\sumA{\normbs{\G\fii{\xt}-\G\f{\xt}}}+\sum_{\substack{i\textit{ and }r\textit{ are}\\\textit{two different}\\\textit{items in }\setA}}\inp{\G\fii{\xt}-\G\f{\xt}}{\G\frr{\xt}-\G\f{\xt}}}\nn\\
    &\overset{(a)}{=}\frac{1}{\bs\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}}+\expb{\frac{1}{\N^2}\sumAll\sumrlim{1}{\N}\inp{\G\fii{\xt}-\G\f{\xt}}{\G\frr{\xt}-\G\f{\xt}}}\nn\\
    &\overset{(b)}{=}\frac{1}{\bs\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}},\label{line:lemma4_first_part}
\end{align}
where \textit{(a)} follows from the fact that the clients in $\setA$ are selected uniformly at random with replacement among all clients (see Section~\ref{sect:virtual_seq_and_set_defns}), and \textit{(b)} follows from $\sum_{i=1}^N\G\fii{\xt}=N\G\f{\xt}$. This proves the first part of Lemma~\ref{lemma:clientselection_and_T3}.
% \ps{Check the reasoning here. The second term in 3rd line would be zero only if you have with-replacement sampling - emphasize that. Also, $\mbe$ is missing.}
\begin{align}
    \T{3}&\triangleq\expns{\frac{1}{\bs}\sumA\edelit}=\expns{\frac{1}{\bs}\sumAP{\edelit-\G\fii{\xt}+\G\fii{\xt}-\G\f{\xt}}+\G\f{\xt}}\nn\\
    &\overset{\textit{Lemma}~\ref{lemma:sos}}{\leq}3\expns{\frac{1}{\bs}\sumAP{\edelit-\G\fii{\xt}}}+3\expns{\frac{1}{\bs}\sumAP{\G\fii{\xt}-\G\f{\xt}}}+3\expns{\G\f{\xt}}\nn\\
    &\overset{\substack{\textit{Using (}\ref{line:lemma4_first_part}\textit{)}\\\textit{and}\\\textit{Lemma}~\ref{lemma:sos}}}{\leq}\frac{3}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{3}{\bs\N}\sumAll{\expns{\G\fii{\xt}-\G\f{\xt}}}+3\expns{\G\f{\xt}}\nn\\
    &\overset{\asmpt~\ref{assump:globhet}}{\leq}\frac{3}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{3\ghets}{\bs}+3\expns{\G\f{\xt}}\nn.
\end{align}

\textit{Proof of Lemma~\ref{lemma:sequence_diff}.} We start by using Assumption \ref{assump:smoothness} (Smoothness) and Remark \ref{obs:diff_z_x}.
{\allowdisplaybreaks \begin{align}
    &\expns{\G\f{\vst}-\G\f{\xt}}\leq\Li^2\expns{\vst-\xt}=\Li^2\expns{\lrt\frac{1}{\bs}\sumC\delito}\nn 
    %\tag{\ps{How did the 2nd eq. follow? We didn't state this anywhere. What is $\gamma_i^t$?}}
    \\
    &=\Li^2\expns{\frac{\lrt}{\bs}\sumCP{\delito-\edelito+\edelito}}\nn\\
    &\overset{\asmpt~\ref{assump:lochet}}{=}\Li^2\lrsqt\expns{\frac{1}{\bs}\sumCP{\delito-\edelito}}+\Li^2\lrsqt\expns{\frac{1}{\bs}\sumC{\edelito}}\nn\\
    &=\Li^2\lrsqt\expns{\frac{1}{\bs}\sumC\frac{1}{\locit}\sumLockP{\sgitok-\G\fii{\xitok}}}+\Li^2\lrsqt\expns{\frac{1}{\bs}\sumC{\edelito}}\nn\\
    &\overset{\asmpt~\ref{assump:lochet}}{\leq}\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{\Li^2\lrsqt\aw}{\bs^2}\expb{\sumC\normbs\edelito}\nn \nn\\
    &\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{\Li^2\lrsqt\aw}{\bs^2}\expb{\sumC\normbs{\edelito-\G\fii{\xtoi}+\G\fii{\xtoi}-\G\f{\xtoi}+\G\f{\xtoi}}}\nn\\
    &\overset{\textit{Lemma}~\ref{lemma:sos}}{\leq}\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets\nn\\
    &+\frac{3\Li^2\lrsqt\aw}{\bs^2}\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}+\normbs{\G\f{\xtoi}-\G\fii{\xtoi}}}}\nn\\
    &\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw}{\bs^2}\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}}}.\nn
\end{align}}
Averaging the inequality over $t=0,\dots,T-1$:
{\allowdisplaybreaks \begin{align}
    &\avgtelm\expns{\G\f{\vst}-\G\f{\xt}}\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}}}\nn\\
    & %\overset{(a)}{\leq}
    \overset{\textit{Remark}~\ref{obs:counting_max_updates}}{\leq}\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2} {\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn
    % \tag{\ps{This $\rdm$ in 3rd term needs some explanation}}
    \\
    & \quad +\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumC{\normbs{{\edelito-\G\fii{\xtoi}}}}} \nn 
    %\tag{\ps{Norm and sum over $\mc C^{(t)}$ missing}} 
    \\
    &\overset{\textit{Lemma}~\ref{lemma:L1_1}}{\leq}\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumCP{\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\normbs{\G\fii{\xtoi}}}} \nn \\
    % \label{line:L1_1used}\\
    &\leq\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumC\frac{\dd}{1-\dd}\normbs{\G\fii{\xtoi}}}\nn\\
    &\overset{\textit{Lemma}~\ref{lemma:sos}}{\leq}\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumCP{\frac{2\dd}{1-\dd}\normbs{\G\fii{\xtoi}-\G\f{\xtoi}}+\frac{2\dd}{1-\dd}\normbs{\G\f{\xtoi}}}}\nn\\
    &\overset{\asmpt~\ref{assump:globhet}}{\leq}\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumC\frac{2\dd}{1-\dd}\normbs{\G\f{\xtoi}}}\nn\\
    &\overset{\textit{Remark}~\ref{obs:counting_max_updates}}{\leq}\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\frac{2\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}\nn
    % \tag{\ps{Again, this $\rdm$ in 3rd term needs some explanation}} 
    \\
    &=\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}.\nn
\end{align}} 

\section{Convergence of $\nameofthealgorithm$ with Dynamic Client Allocation (\textit{option} $=D$)} \label{app_sec:opt1_conv}
With a similar approach to the proof of static client allocation, we can show the convergence of the $\nameofthealgorithm$ with dynamic client allocation (\textit{option} $=D$), too. Adopting all of the previously used notation, we also need some new definitions to analyze this version of the algorithm, as the number of active training requests and buffer size can change dynamically during the training.

\paragraph{Notation for changing buffer size and number of active training requests.}
Let us define $\bst$ and $\awt$ as the buffer size and the number of active local training requests at round $t$, respectively. Further, define $\bsmin$ and $\bsmax$ as the minimum and maximum values that the buffer size can take. Similarly, define $\awmin$ and $\awmax$ as the minimum and maximum number of active training requests. Moreover, we define $\bssk\triangleq\bsmax/\bsmin$ as the measure of skewness in buffer size.

\paragraph{Global update rule and virtual sequence definition.} \label{sect:virtual_seq_and_set_defn2} Although the local update rule remains the same, the global update rule slightly changes for dynamic client allocation due to changing buffer size:
\begin{align} \label{line:globalupdate2}
    \x^{(t+1)}\gets\xt-\locit\lrs\lrc\frac{1}{\bst}\sum_{i\in\setB}\delito&=\xt-\lrs\frac{1}{\bst}\sum_{i\in\setB}\lp\xtoi-\x_i^{(t-\rdit,\tau)}\rp\\
    &=\xt-\lrs\lrc\frac{1}{\bst}\sum_{i\in\setB}\sum_{k=0}^{\locit-1}\tG\fii{\x_i^{(t-\rdit,k)}},\nn
\end{align}
where $|\setB|=\bst$. Note that \eqref{line:globalupdate2} is almost identical to \eqref{line:globalupdate}, except for the varying buffer size $\bst$.

Next, we define $\fdit$ as the index of the global round when a local training request sent to client $i$ in round $t$ returns to the server. Basically, it is the current round index $t$, added to the future value of staleness that the requested update will have. We need to define a new virtual sequence \(\yst\), which is different from the $\vst$ defined earlier.
\begin{align}
\label{line:ysupdate}
    \ystp\gets\yst-\locit\lrs\lrc\sumA\frac{1}{\bsfdit}\delit=\yst-\locit\lrs\lrc\sumA\frac{1}{\bsfdit}\frac{1}{\locit}\sumLock\tG\fii{\xitk}, 
\end{align}
for $t=0,1,\dots,T-1$ where $\ys^{(0)}\triangleq\x^{(0)}.$ Here, $\setA$ is defined similarly as it was in Section~\ref{sect:virtual_seq_and_set_defns}. Note that the probability of being in $\setA$ is equal across clients due to uniform client selection. However, this time, the size of this set does not have to be equal to the buffer size at round $t$. Due to the new client selection rule (Line~\ref{algline:newjob} in Algorithm~\ref{alg:main}), the server may assign $0$, $1$, or $2$ clients for each received update. Therefore, we know that $0<\setAs\leq2\bst$.

Here, we need a simplifying assumption for the purpose of this proof:
\begin{assump}[$\bsfdit$ Values] \label{assump:for_dynamic_convergence}
     We assume that any $\bsfdit$ value is known at the time when a local training request is sent to client $i$ at round $t$, and these values are independent of any future information including the received updates. We further assume that ${\bsfdit}$ values are equal (denote ${\bsfdt}$) for all clients in $\setA$.
\end{assump}
\begin{remark}
    When we keep the period of dynamic client allocation long enough, we observe that most of the assigned local training requests at one round fall in the same window before the next dynamic client allocation happens (Line~\ref{algline:adjustRb} in Algorithm~\ref{alg:main}). Hence, based on our empirical observations, what the assumption implies holds for most of the local training requests. Further, \textbf{this assumption can be avoided} by taking an average of the updates during aggregation weighted inversely with the number of local training requests sent at the same global round. In other words, one may avoid this assumption by weighting an update from client $i$ with $1/|\mathcal{A}^{(t-\gamma_i^t)}|$ instead of taking the average over the buffer during aggregation at round $t$. However, we did not see any practical benefit of this type of weighting in our experiments, and this unusual weighting would serve theoretical purposes only. Therefore, we keep the current version.
\end{remark}

% Here, we need two simplifying assumptions for the purpose of this proof: \label{extra_assumptions}\\
% \textit{1) We assume that the expected value of ${1}/{\bsfdit}$ is equal to ${1}/{\bsfdt}$ across clients in $\setA$:} Since we keep the period of dynamic client allocation long enough, we observe that most of the assigned local requests at one round fall in the same window between two dynamic client allocation happens (Line~\ref{alg:calcRb} in Algorithm~\ref{alg:main}). Further, this assumption can be avoided by taking an average of the updates during aggregation weighted inversely with the number of local training requests sent at the same global round (i.e., weighting with $1/\setAs$). However, we did not see any practical benefit of this type of weighting in our experiments, and this strange weighting would be just for theoretical purposes.\\
% \textit{2) The updates of any round $t$ and buffer sizes at the round when those updates are received back are independent:} Although our adaptive buffer size and resource allocation take the update statistics into account, here, we assume that the effect of the training requests at one round is minimal on the buffer size at the rounds the updates are received back.
We first state the theorem showing the convergence of \(\nameofthealgorithm\) with dynamic client allocation option.
\begin{theorem}\textbf{(Convergence of $\nameofthealgorithm$ with \textit{option} $=D$):}
Suppose Assumptions \ref{assump:smoothness} - \ref{assump:for_dynamic_convergence} hold, and the learning rates satisfy $\lrs\leq\bssk^{-3/2}\sqrt{\locit\bs}$ and $\lrc\leq\min\lcb\frac{\bssk^{-3/2}}{24\Li\locit\sqrt{\locit\bs}},\frac{\bssk^{-3/2}}{16\Li\locit\sqrt{\locit\aw\rdm}}\rcb$. Then, the iterations of Algorithm~\ref{alg:main} ($\nameofthealgorithm$) with \textit{option} $=D$ satisfy:
\begin{align*}
        \avgtelm & \expns{\G\f{\xt}} \leq \bOP{\frac{\f{\x^{(0)}} - \min_\x \f{\x}}{T\lr\locit}}+\bOP{\lp\frac{\Li\lr\bssk^3}{\bsmin}+\Li^2\lrc^2\bssk^2\locit+\frac{\Li^2\lrsq\locit\awmax\bssk^2}{\bsmin^2}\rp\lhets}\nn\\&+\bOP{\lp\frac{\Li\lr\locit\bssk^3}{\bsmin}+\Li^2\lrc^2\locit\lp\locit-1\rp\bssk^2+\frac{\Li^2\lrsq\locit^2\awmax^2\bssk^2}{\bsmin^2}\rp\ghets}\nn.
\end{align*}
\label{theorem:convergence_dynamic}
\end{theorem}

\textbf{Proof.}\\
The proof relies on the following lemma, which is the counterpart of Lemma~\ref{lemma:sequence_diff} for the new virtual sequence.

{\allowdisplaybreaks \begin{lemma} The new virtual sequence $\lp\yst\rp$ and the iterates of $\nameofthealgorithm$ satisfy,
    {\allowdisplaybreaks \begin{align}
        \avgtelm\expns{\G\f{\yst}-\G\f{\xt}}&\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets\nn\\&+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}.\nn
    \end{align}}
    \label{lemma:sequence_diff_2}
\end{lemma}}


Now, using the update rule of the virtual sequence (\ref{line:ysupdate}) and Assumption \ref{assump:smoothness} (Smoothness), and taking the conditional expectation with respect to $\yst$, we have,
{\allowdisplaybreaks\begin{align}
&\expb{\f{\ystp}}\leq\f{\yst}+\inp{\G\f{\yst}}{\expb{\ystp-\yst}}+\frac{\Li}{2}\expns{\ystp-\yst}\nn\\
&= \f{\yst}+\inp{\G\f{\yst}}{\expb{-\lrst\lrc\sumA\frac{1}{\bsfdit}\delit}}+\frac{\Li}{2}\expns{\lrst\lrc\sumA\frac{1}{\bsfdit}\delit}\nn\\
 & \leq \f{\yst}-\lrst\lrc{\frac{\setAs}{\bsfdt}}\expb{\inp{\G\f{\yst}}{\frac{1}{\setAs}\sumA\edelit}}\tag{Using Assumption~\ref{assump:for_dynamic_convergence}}\\&
 +\frac{\Li\setAs^2}{2(\bsfdt)^2}\expns{\lrst\lrc\frac{1}{\setAs}\sumA\delit} \tag{$\setAs$ is not random with conditional expectation}\\
 & = \f{\yst}+\lrst\lrc{\frac{\setAs}{\bsfdt}}\expb{\underbrace{-\inp{\G\f{\yst}}{\frac{1}{\N}\sumAll\edelit}}_{\triangleq\T{1}}}+2\bssk^2{\Li}\lrst^2\lrc^2\expns{\frac{1}{\setAs}\sumA\delit}. \tag{$\frac{\setAs}{\bsfdt}\leq2\bssk$}
\end{align}}
Next, using Lemma~\ref{lemma:T1} (with $\yst$ sequence) and Lemma~\ref{lemma:T2} (with $\setAs$), using $1/\bssk\leq\setAs/\bsfdt\leq2\bssk$, and dividing both sides by $\lrst\lrc$ we obtain,
{\allowdisplaybreaks\begin{align}
&\frac{\expb{\f{\ystp}}-\f{\yst}}{\lrt}\leq-\frac{1}{2\bssk}\expns{\G\f{\xt}}+\bssk\expns{\G\f{\yst}-\G\f{\xt}}  \nn \\
& \quad+ \frac{\bssk}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}} +{2\bssk^2\Li\lrt}{\expns{\frac{1}{\setAs}\sumA\edelit}} + {2\bssk^2\Li\lrt} \frac{\lhets}{\locit\bsmin}. \nn
\end{align}}
Using Lemma~\ref{lemma:clientselection_and_T3}, we get,
% \ps{Do we need the assumption that buffer-sizes are same within a window so far?}
{\allowdisplaybreaks \begin{align}
    &\frac{\expb{\f{\ystp}}-\f{\yst}}{\lrt} \nn \\
    & \leq -\frac{1}{2\bssk}\expns{\G\f{\xt}}+\bssk\expns{\G\f{\yst}-\G\f{\xt}}+\frac{\bssk}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}\nn\\
    & \quad +\bssk^2\Li\lrt\lp \frac{6}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{6\ghets}{\bsmin}+{6}\expns{\G\f{\xt}} +\frac{2\lhets}{\locit\bsmin}\rp\nn\\
    &=\lp-\frac{1}{2\bssk}+{6\bssk^2\Li\lrt}\rp\expns{\G\f{\xt}}+ \bssk\mbe\normbs{\G\f{\yst}-\G\f{\xt}}+\bssk^2\Li\lrt\lp\frac{6\ghets}{\bsmin}+\frac{2\lhets}{\locit\bsmin}\rp\nn\\
    & \quad +\lp{6\bssk^2\Li\lrt}+\bssk\rp\frac{1}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}\nn\\
    &\leq\lp-\frac{1}{2\bssk}+{6\bssk^2\Li\lrt}\rp\expns{\G\f{\xt}}+ \bssk\mbe\normbs{\G\f{\yst}-\G\f{\xt}}+\bssk^2\Li\lrt\lp\frac{6\ghets}{\bsmin}+\frac{2\lhets}{\locit\bsmin}\rp\nn\\
    & \quad +\lp{6\bssk^2\Li\lrt}+\bssk\rp\lp\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\ghets\rp\tag{Using Lemma~\ref{lemma:L1_1}}\\
    &=\lp-\frac{1}{2\bssk}+{6\bssk^2\Li\lrt}+\frac{\bssk\dd}{\lp1-\dd\rp}+\frac{6\bssk^2 \Li\lrt\dd}{\lp1-\dd\rp}\rp\expns{\G\f{\xt}}+\bssk \mbe \normbs{\G\f{\yst}-\G\f{\xt}}\nn\\
    & \quad +\lp\frac{2\bssk^2\Li\lrt}{\locit\bsmin}+\frac{3\bssk^2\Li^3\lrc^3\lrst\locit}{\lp1-\dd\rp}+\frac{\bssk\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\rp\lhets+\lp\frac{6\bssk^2\Li\lrt}{\bsmin}+\frac{6\bssk^2\Li\lrt\dd}{\lp1-\dd\rp}+\frac{\bssk\dd}{\lp1-\dd\rp}\rp\ghets\nn,
\end{align}}
where $\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$. Using the tower property of conditional expectation, telescoping the inequality over the round indices $t=0,1,\dots, T-1$, and using Lemma~\ref{lemma:sequence_diff_2}, we get,
{\allowdisplaybreaks \begin{align}
    &\avgtelPm{\frac{1}{2\bssk}-{6\bssk^2\Li\lrt}-\frac{\bssk\dd}{\lp1-\dd\rp}-\frac{6\bssk^2\Li\lrt\dd}{\lp1-\dd\rp}}\expns{\G\f{\xt}}\leq
    \frac{\bssk}{T}\sumtel\expns{\G\f{\yst}-\G\f{\xt}}\nn\\&+\frac{\f{\ys^{(0)}}-\expb{\f{\ys^{\lp T\rp}}}}{T\lrt}+\lp\frac{2\bssk^2\Li\lrt}{\locit\bsmin}+\frac{3\bssk^2\Li^3\lrc^3\lrst\locit}{\lp1-\dd\rp}+\frac{\bssk\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\rp\lhets+\lp\frac{6\bssk^2\Li\lrt}{\bsmin}+\frac{6\bssk^2\Li\lrt\dd}{\lp1-\dd\rp}+\frac{\bssk\dd}{\lp1-\dd\rp}\rp\ghets\nn\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax\bssk}{\bsmin^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2\bssk}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk^2\rdm}{\bsmin}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{\f{\ys^{(0)}}-\expb{\f{\ys^{\lp T\rp}}}}{T\lrt}+\lp\frac{2\bssk^2\Li\lrt}{\locit\bsmin}+\frac{3\bssk^2\Li^3\lrc^3\lrst\locit}{\lp1-\dd\rp}+\frac{\bssk\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\rp\lhets+\lp\frac{6\bssk^2\Li\lrt}{\bsmin}+\frac{6\bssk^2\Li\lrt\dd}{\lp1-\dd\rp}+\frac{\bssk\dd}{\lp1-\dd\rp}\rp\ghets\nn.
\end{align}}

If the learning rates satisfy $\lrs\leq\bssk^{-3/2}\sqrt{\locit\bs}$ (which also makes $\lrst\leq\bssk^{-3/2}\locit\sqrt{\locit\bs}$) and $\lrc\leq\min\lcb\frac{\bssk^{-3/2}}{24\Li\locit\sqrt{\locit\bs}},\frac{\bssk^{-3/2}}{16\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, then the following inequality holds:
% https://www.wolframalpha.com/input?i=.5-3%2F12-1%2F30-1%2F60-51%2F480

{\allowdisplaybreaks \begin{align}
\frac{1}{2}-{6\bssk^3\Li\lrt}-\frac{\bssk^2\dd}{\lp1-\dd\rp}-\frac{6\bssk^3\Li\lrt\dd}{\lp1-\dd\rp}-\frac{6\Li^2\lrsqt\awmax\bssk^3\rdm}{\bsmin}\frac{1+\dd}{1-\dd}\geq\agrcD.
\label{proof:numeric_inequality2}
\end{align}}
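Also, notice that $\ys^{(0)}$ is equal to $\x^{(0)}$ by the definitions (Section~\ref{sect:virtual_seq_and_set_defn2}) of these sequences, and that $\min_\x \f{\x}\leq \f{\ys^{\lp T\rp}}$. Moving the gradient-norm term to the left-hand side of the previous inequality and multiplying both sides by $\bssk$, we obtain: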

{\allowdisplaybreaks 
\begin{align}
    & \avgtelm \expns{\G \f{\xt}} \leq \agrci \frac{\f{\x^{(0)}} - \min_\x \f{\x}}{T\lrt}\bssk\tag{Using \eqref{proof:numeric_inequality2}}\\
    &+\agrci\lp\frac{2\bssk^3\Li\lrt}{\locit\bsmin}+\frac{3\bssk^3\Li^3\lrc^3\lrst\locit}{\lp1-\dd\rp}+\frac{\bssk^2\Li^2\lrc^2\locit}{2\lp1-\dd\rp}+\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax\bssk^2}{\bsmin^2\locit}\rp\lhets\nn\\
    &+\agrci\lp \frac{6\bssk^3\Li\lrt}{\bsmin}+\frac{6\bssk^3\Li\lrt\dd}{\lp1-\dd\rp}+\frac{\bssk^2\dd}{\lp1-\dd\rp} +\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2\bssk^2}{\bsmin^2}  \rp\ghets\nn.
\end{align}}

Define $\gapTerm \triangleq \f{\x^{(0)}} - \min_\x \f{\x}$. After reducing high-order terms using the assumptions, $\lrs\leq\bssk^{-3/2}\sqrt{\locit\bs}$ (which also makes $\lrst\leq\bssk^{-3/2}\locit\sqrt{\locit\bs}$) and $\lrc\leq\min\lcb\frac{\bssk^{-3/2}}{24\Li\locit\sqrt{\locit\bs}},\frac{\bssk^{-3/2}}{16\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, and incorporating the constants into the $\mco(\cdot)$ notation, we have:
{\allowdisplaybreaks \begin{align}
    \avgtelm & \expns{\G\f{\xt}} \leq \bOP{\frac{\gapTerm\bssk}{T\lr\locit}}+\bOP{\lp\frac{\Li\lr\bssk^3}{\bsmin}+\Li^2\lrc^2\bssk^2\locit+\frac{\Li^2\lrsq\locit\awmax\bssk^2}{\bsmin^2}\rp\lhets}\nn\\&+\bOP{\lp\frac{\Li\lr\locit\bssk^3}{\bsmin}+\Li^2\lrc^2\locit\lp\locit-1\rp\bssk^2+\frac{\Li^2\lrsq\locit^2\awmax^2\bssk^2}{\bsmin^2}\rp\ghets}\nn.
\end{align}}
This concludes the proof.



\textbf{Proof of Lemma~\ref{lemma:sequence_diff_2}:} We start by using Assumption \ref{assump:smoothness} (Smoothness) and observing that Remark \ref{obs:diff_z_x} still holds with $\yst$ for the dynamic client allocation option. 
% \ps{Why?}
{\allowdisplaybreaks \begin{align}
    &\expns{\G\f{\yst}-\G\f{\xt}}\leq\Li^2\expns{\yst-\xt}=\Li^2\expns{\lrt\sumC\frac{1}{\bsfdito}\delito}\nn 
    %\tag{\ps{How did the 2nd eq. follow? We didn't state this anywhere. What is $\gamma_i^t$?}}
    \\
    &=\Li^2\expns{{\lrt}\sumC\frac{1}{\bsfdito}\lp{\delito-\edelito+\edelito}\rp}\nn\\
    &=\Li^2\lrsqt\expns{\sumC\frac{1}{\bsfdito}\lp{\delito-\edelito}\rp}+\Li^2\lrsqt\expns{\sumC\frac{1}{\bsfdito}{\edelito}}\nn\\
    &=\Li^2\lrsqt\expns{\sumC\frac{1}{\bsfdito}\frac{1}{\locit}\sumLockP{\sgitok-\G\fii{\xitok}}}+\Li^2\lrsqt\expns{\sumC{\frac{1}{\bsfdito}\edelito}}\nn \tag{Using Assumption~\ref{assump:lochet}}\\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{\Li^2\lrsqt\awmax}{\bsmin^2}\expb{\sumC\normbs\edelito}\nn \tag{Using $\norm{\sum_{i=1}^n x_i}^2 \leq n \sum_{i=1}^n \norm{x_i}^2 $ and $|\setC|\leq \awmax$}\\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{\Li^2\lrsqt\awmax}{\bsmin^2}\expb{\sumC\normbs{\edelito-\G\fii{\xtoi}+\G\fii{\xtoi}-\G\f{\xtoi}+\G\f{\xtoi}}}\nn\\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets\nn\\
    &+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}+\normbs{\G\f{\xtoi}-\G\fii{\xtoi}}}}\nn\\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}}}.\nn
\end{align}}
Averaging the inequality over $t=0,\dots,T-1$, we get:
{\allowdisplaybreaks \begin{align}
    &\avgtelm\expns{\G\f{\yst}-\G\f{\xt}}\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}}}\nn\\
    & %\overset{(a)}{\leq}
    \leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\tag{Using Remark \ref{obs:counting_max_updates}, however, this time,}
    % \tag{\ps{This $\rdm$ in 3rd term needs some explanation}}
    \\
    & \quad +\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumC{\normbs{{\edelito-\G\fii{\xtoi}}}}} \tag{the maximum number of appearances is $2\rdm\bsmax$}
    %\tag{\ps{Norm and sum over $\mc C^{(t)}$ missing}} 
    \\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumCP{\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\normbs{\G\fii{\xtoi}}}} \tag{Using Lemma~\ref{lemma:L1_1}} \\
    % \label{line:L1_1used}\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumC\frac{\dd}{1-\dd}\normbs{\G\fii{\xtoi}}}\nn\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumCP{\frac{2\dd}{1-\dd}\normbs{\G\fii{\xtoi}-\G\f{\xtoi}}+\frac{2\dd}{1-\dd}\normbs{\G\f{\xtoi}}}}\nn\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumC\frac{2\dd}{1-\dd}\normbs{\G\f{\xtoi}}}\nn\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\frac{2\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}\nn \tag{Using Remark \ref{obs:counting_max_updates}}\\
    &=\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}.\nn
\end{align}} 

% \bibliography{References}
\end{document}
