\documentclass{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
 % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>}: typesets <b><sep><a>, i.e., the two mandatory
% arguments in reverse order, joined by the optional <sep> (default: -).
\newcommand{\swap}[3][-]{#3#1#2} % just an example
\usepackage{multirow}

\usepackage{mathtools, nccmath, textcomp} 
\usepackage{xcolor} %bbb
\usepackage{amsfonts} %bbb
\usepackage{dsfont} %bbb
\usepackage{macros} %bbb

\usepackage{subcaption}
\usepackage{makecell}
\usepackage{enumitem}
\usepackage[textsize=scriptsize, textwidth=0.5in]{todonotes}
% Author/reviewer annotation macros for drafting.
%   \psa{<text>}, \psq{<text>}, \ba{<text>}: print "B: <text>" inline in
%       green, red, and purple, respectively.
%   \carlee{<text>}: prints "C: <text>" inline in magenta.
%   \tdcarlee{<text>}: places a todonotes \todo note (light-blue background,
%       gray connector line) whose text starts with a bold "C".
\newcommand{\psa}[1]{\textcolor{green}{B: #1}} %bbb answer ps s
\newcommand{\psq}[1]{\textcolor{red}{B: #1}} %bbb followup ps s
\newcommand{\ba}[1]{\textcolor{purple}{B: #1}}
\newcommand{\carlee}[1]{\textcolor{magenta}{C: #1}}
\newcommand{\tdcarlee}[1]{\todo[color=blue!10, linecolor=black!50]{\textbf{C}: #1}}


% ulem provides \sout/\uline; the normalem option keeps \emph as italics
% instead of letting ulem turn it into underlining. (Loaded once; the
% duplicate \usepackage line was redundant.)
\usepackage[normalem]{ulem}
% Algorithm float plus the algorithmicx-based pseudocode layouts.
\usepackage{algorithm, algorithmicx, algcompatible}
\usepackage{algpseudocode} %bbb
% Set the indentation of nested pseudocode blocks to 1em.
\algrenewcommand\algorithmicindent{1em}

\usepackage{hyperref, tabularx}
\usepackage[capitalise]{cleveref}
\makeatletter
% \multiline{<text>}: typesets <text> as a top-aligned, line-wrapping box for
% long statements inside an algorithmicx environment. The box is a one-column
% tabularx (no column padding) whose width is \linewidth minus \ALG@thistlm
% (presumably algorithmicx's current block indentation -- verify against the
% algorithmicx source), so wrapped lines stay within the text width.
\newcommand{\multiline}[1]{%
  \begin{tabularx}{\dimexpr\linewidth-\ALG@thistlm}[t]{@{}X@{}}
    #1
  \end{tabularx}% comment out the line end so no spurious space follows the box
}
\makeatother
% \algnewcommand{\Initialize}[1]{%
%   \State \textbf{Initialize:}
%   \Statex \hspace*{\raggedright #1}
% }
% %{\algorithmicindent}\parbox[t]{.8\linewidth}
% \Initialize{<text>}: starts a new algorithmicx statement consisting of a
% bold "Initialize:" label followed by <text>.
\algnewcommand{\Initialize}[1]{%
  \State
  \textbf{Initialize:} #1%
}

\title{FedAST: Federated Asynchronous Simultaneous Training}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Harry~Q.~Bovik}
\author[1,2]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[1]{Further~Coauthor}
\author[3]{Further~Coauthor}
\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computer Science Dept.\\
    Cranberry University\\
    Pittsburgh, Pennsylvania, USA
}
\affil[2]{%
    Second Affiliation\\
    Address\\
    …
}
\affil[3]{%
    Another Affiliation\\
    Address\\
    …
  }
  
  \begin{document}
\maketitle

\begin{abstract}
%\GJ{Algorithm name suggestion: FedAST -- federated simultaneous training} \ba{I changed all algorithm names: FedAMT -> FedAST. Also, instead of "multi model", we may use federated simultaneous training (FST). So, I changed baseline abbreviations accordingly as well.}
Federated Learning (FL) enables edge devices or \textit{clients} to collaboratively train machine learning (ML) models without sharing their private data. Much of the existing work in FL focuses on efficiently learning a model for a single task. In this paper, we study \textit{simultaneous training} of multiple FL models using a common set of clients. The few existing simultaneous training methods employ synchronous aggregation of client updates, which can cause significant delays because large models and/or slow clients can bottleneck the aggregation. On the other hand, na\"ive asynchronous aggregation is adversely affected by stale client updates. We propose \(\nameofthealgorithm\), a buffered asynchronous federated simultaneous training algorithm that overcomes bottlenecks from slow models and adaptively allocates client resources across heterogeneous tasks. We provide theoretical convergence guarantees for \(\nameofthealgorithm\) for smooth non-convex objective functions. Extensive experiments over multiple real-world datasets demonstrate that our proposed method outperforms existing simultaneous FL approaches, achieving up to a $46.0\%$ reduction in the time to train multiple tasks to completion. 
\end{abstract}

\section{Introduction}\label{sec:intro}
Federated Learning (FL) is a distributed learning paradigm where edge devices or \textit{clients} collaboratively train machine learning (ML) models using privately held local data \citep{fedavg, kairouz2021advances}. Clients iteratively update their local models, which are periodically sent to a central server for aggregation. The aggregated model is then sent to the clients to begin the next round of local updates. 
% In FL, a global model is trained on local datasets collected by clients via successive communication rounds between the clients and a central server. The server orchestrates local training at the clients and periodically aggregates their updates to form a common global model.
% \carlee{emphasize that this is iterative--e.g., ``Clients iteratively update local models, which are periodically sent to a central server for aggregation; the aggregated model is then used to begin the next round of local client updates.''}
% \paragraph{Limitation of Single-Model FL}
Since its introduction by \citet{fedavg}, various practical and theoretical aspects of FL, including client selection \citep{clientSelectionInHet, powerofchoice} and scalable and fast training \citep{AsyncFL, fednova}, have been extensively studied.
%communication efficiency \citep{improvingComm},
%efficient aggregation of client updates \citep{fednova, fedvarp}
% \carlee{what does ``efficient'' mean? that sounds more like compression or over-the-air updates, which is not what the papers you have cited work on}
%privacy concerns \citep{feddiffpriv, secureAggr}.
% \carlee{Why are all of these challenges relevant to this work? I don't think we need to talk about specific single-model FL challenges unless we later solve them for the MMFL case. E.g., privacy and efficient aggregation aren't mentioned in the paper again}
% \ps{Please list a few more and cite 1-2 references for each?}
However, these works almost exclusively assume that the server aims to learn model(s) for a \textit{single task}. Some FL frameworks attempt to learn models personalized to each client~\citep{mansour2020three_FL, li2021ditto_FL, tan2022personalized_FL}, but these models are still intended for the same learning task, e.g., next-word prediction on keyboards.
% Although personalization in FL \citep{mansour2020three_FL, li2021ditto_FL, tan2022personalized_FL} has received significant attention, the goal is still to learn client-specific models for a single machine learning task, for example, next-word prediction on smart keyboards.% specialized towards user's writing style. %\carlee{explain/define a task. Or give an example so people know what you mean.}
%
% Single-model federated learning (FL), where the server aims to train one model, has been extensively discussed in the literature since the introduction of , Federated Averaging (FedAvg), by \citep{fedavg}. `Many works seek to optimize/analyze/study various aspects of single-model FL, including client selection schemes \citep{clientSelectionInHet} and optimal aggregation methods \citep{fednova}. %\carlee{I don't know if ``problems'' is the right terminology: maybe say that ``Many works seek to optimize/analyze/study various aspects of single-model FL, including...'' and then list aspects that we also consider for multi-model.} %\ps{We should call them problems of FL, rather than FedAvg, since they've been studied in more general context as well. Also, client selection or optimal aggregation or sync vs async are problems, while theory is not a problem per se, it's a means to better understand the underlying problem. Does that make sense? \psq{It totally makes sense, changing the wording.}}

Many practical applications need devices to perform a wide range of learning tasks, which requires training
% \tdcarlee{Below we contrast sequential vs. simultaneous training: maybe just say we need to train multiple models here instead of specifying simultaneous training?} 
of multiple ML models. For instance, our phones need language models for keyboard next-word prediction as well as image recommendation models to highlight images more likely to be shared \citep{fedavg}. Thus, in this paper, we seek to answer the following question:
% \vspace{-5}
\begin{center}
\emph{How can we efficiently train models for multiple tasks in a federated setting using a shared pool of clients?}
\end{center}
% \vspace{-5}
% \ps{Maybe we can say ``models for multiple tasks'' rather than ``multiple models'' to distinguish from the personalization work we mention above?} \GJ{Edited}

\paragraph{Simple Solutions that Extend FedAvg.} 
% \sout{Let us look at some initial solution approaches that directly extend the standard federated averaging (FedAvg) algorithm employed in single-model training.} 
A na\"ive approach to training multiple models is \textit{sequential training}, where the models corresponding to different tasks are trained one at a time, each utilizing all the clients. The total training runtime of this approach scales linearly with the number of tasks. An alternative is for all the clients to train all tasks at the same time. But then each client will have to keep all models in memory, which is infeasible for resource-limited edge clients. To preserve memory, clients will have to queue the training requests and process them sequentially, again resulting in the runtime linearly increasing with the number of tasks.
%\sout{An important fact that underlines the inefficiency of sequential training is that increasing the number of participating clients brings diminishing marginal benefits for the speed of achieving the target accuracy (e.g., doubling the number of clients does not halve the number of communication rounds required). \ps{Maybe what we want to say is ``increasing the number of clients decreases the number of communication rounds required. However, each round also takes longer, since the server needs to wait for the slowest client.''} \carlee{citation here?}}
On the other hand, \textit{parallel} or \textit{simultaneous training} (ST) of all the models with a subset of clients assigned to each task can strike a better trade-off between accuracy and runtime. 
%\sout{The simplest approach to perform simultaneous training is to partition the clients into disjoint subsets and assign one subset to each model. The model for each task is then trained solely using the assigned subset of clients. However, due to potentially high heterogeneity across the clients' local data, each model will not see diverse data and the trained global models may not generalize well to all clients. An alternative approach that allows each task to see data from the entire client pool proposed in recent work}
%\ps{I have crossed out this previous approach, because this approach is obviously bad, so I don't expect a reviewer asking about it.}\ba{In addition to what I said about the reason to add this parts in the office, showing that multi-model is not trivial, we discuss/compare them in later parts of the text. We should remove those parts if we want to remove here.}
The approach of \citet{BhuyanMM} assigns a disjoint subset of clients to each model in each round, which significantly reduces the time taken to reach a target accuracy as compared to sequential training. However, such federated simultaneous training (FST) approaches leave room for significant improvement. There are two particular drawbacks: 1) \textit{straggler delays} due to synchronous aggregation, and 2) the \textit{lack of adaptation} to 
%the model complexity and 
the training progress of heterogeneous tasks,  which we address in this work. %\carlee{I think we may need to address the baseline of asking all clients to train all tasks at the same time. We can say that this runtime also scales linearly with the number of tasks, as the runtime of each iteration will grow with the number of tasks due to finite client computation resources} \GJ{Good point! We should explain that baseline and compare with it as well.}
%To mitigate these issues, the approach of \textit{federated simultaneous training (FST)} (also known as \textit{multi-model FL}), using a different set of clients for each model in each round, has been previously proposed, despite little existing work. Initial efforts, such as \citep{BhuyanMM}, demonstrate that simultaneously training multiple models with a common set of clients can improve training time compared to sequential training, while also resulting in more generalized models than when training separately with partitioned clients. 
%


%\GJ{Here, we can start off by first explaining step-by-step what makes multi-model FL different and non-trivial as compared to single-model FL. I suggest going through the following policies: 1) sequentially training the models -- this will cause large delays for models trained later, and the marginal benefit of having more clients to achieve target accuracy is diminishing, 2) parallel training by partitioning the clients into sets, one per model, and using them separately for training. This will cause larger data heterogeneity and each model will see less diverse data, 3) parallel training by sampling clients uniformly at random for each model in each round (Bhuyan et al) - this alleviates the data heterogeneity issue but the aggregation is synchronous across models and it can be bottlenecked by the slowest client in the system. This is even worse when the models have different complexities and they require different numbers of clients.} \GJ{After this we can go to asynchronous FL and its multi-model version. One drawback of our current results is that it is a na\"ive extension of previous asynchronous FL works to the multi-model setting. Can we boost the novelty in some way? One option is to avoid or reduce queueing at the clients using a accept/reject policy. Another option is to include runtime analyses of synchronous versus asynchronous FL algorithms.}

%\carlee{image classification is not a great example as it's hard to generate the classification labels on phones. Maybe say image recommendation instead}\GJ{+1 - You can say image tagging as an example application}%\ps{cite reference \psq{I cite other papers using the same examples, is it okay?}} 
%Another example is self-driving cars that require separate models for vehicle/pedestrian recognition, navigation and control \citep{fed_car_map,fed_car_6g} %\carlee{why is AV control a federated learning problem? I think you have plenty of examples, no need to mention these specifically}. 
%Examples like these abound, as FL is adopted across various domains such as smart industry, healthcare, and banking systems \citep{iotsurvey}. %\GJ{The previous sentence is incomplete. It can also be omitted without losing the message of this paragraph.} 
%\carlee{These na\"ive training algorithms, however, still leave room for significant improvement.}
%\carlee{can also add a survey on federated learning for IoT here, e.g., \url{https://ieeexplore.ieee.org/abstract/document/9460016}}
%\ps{This is a good example. Can we add a reference here?}
%The federated learning framework where a single server orchestrates training of multiple independent models for unrelated ML tasks simultaneously 
%\ps{should we say ``multiple independent models/tasks simultaneously''?} using the same set of clients is called Multi-model Federated Learning (MMFL) \citep{BhuyanMM}. 
% \ps{If possible, we should be consistent in our usage of ``client'' or ``worker.'' Also, based on our description above, \textit{model} is what helps us perform a \textit{task}. If possible, we should try to be consistent in this. \psq{I ll go over the full text to make all occurrences consistent}}%\carlee{Motivate here why we need concurrent training, e.g., ``Prior work has shown that interleaving the training of multiple models can accelerate the overall training time compared to sequentially training each model separately.''}

% \paragraph{Stragglers in FL}
%Single- and multi-model FL encounter some common problems such as long training times due to different-paced clients. \carlee{Rephrase this to keep the focus on MMFL and introduce our goal of reducing wall-clock training time: ``Like single-model FL training, MMFL training algorithms should aim to finish training all models within a reasonable wall-clock training time, which is particularly challenging when clients compute local model updates at different speeds. This challenge is particularly apparent in the conventional FL framework...''}
% Conventional FL framework employs synchronous aggregation, where the server waits to receive all the client updates before aggregating. However, when clients have diverse hardware and communication capabilities, faster clients must wait idly for slower clients or \textit{stragglers} to finish, impacting the FL wall-clock time. Solutions proposed to alleviate this problem include allowing clients to run unequal numbers of local steps based on their computational speed \citep{fednova}, aggregating only the updates that arrive before a timeout \citep{fedsysdesign}, and sub-sampling from the set of available clients \citep{optimalClientSampling}.
% Unlike these works, \citep{AsyncFL} employs asynchronous Federated Learning (AsyncFL) to address the straggler issue. \ps{This last sentence also needs to come after the next para.}


% In AsyncFL, the server updates the global model whenever it receives an update from a client local run. \carlee{This sentence interrupts the flow to the next paragraph. I would talk about asynchronous FL after discussing why synchronous FL won't work for MMFL.} 


%\ps{This para is good in that it sets the context for stragglers and introduces the async paradigm. The drawback is it breaks the flow from the previous para, which was on challenges of MMFL. Next para is again on challenge of MMFL due to stragglers. If we can shuffle the sequence somehow so that we can mention both challenges of MMFL together and also introduce stragglers and asynch. Too many constraints! :D \psq{Too many constraints but it will make the flow much better :D Can we cover it by adding some conjunctions? I am adding the sentences in purple. }}

\paragraph{Synchronous Aggregation and Straggler Delays.} Conventional FL employs synchronous aggregation, where, in each round, the server waits to receive updates from all participating clients before aggregating. However, when the clients have diverse hardware and communication capabilities, faster clients must remain idle until slow or \textit{straggling} clients finish, resulting in a long wall-clock time to complete each communication round. This problem is further exacerbated in FL with multiple simultaneous models \citep{BhuyanMM, MM_bobs}, where the aggregation is synchronized across tasks as well. Therefore, the server has to wait for the slowest client across \textit{all} the parallel tasks. Solutions proposed to alleviate the straggler problem in the single-model context include allowing faster clients to run more local steps \citep{fednova}, aggregating only the client updates that arrive before a timeout \citep{fedsysdesign}, and sub-sampling from the set of available clients \citep{optimalClientSampling}. Although these approaches perform well when stragglers appear uniformly at random, they do not work well in the simultaneous training setting because some models (e.g., larger ones) are inherently slower to train. When the multiple models have inherently different training times, %trained simultaneously are of different sizes, 
synchronized global aggregation rounds are bottlenecked by the slowest client assigned to the most computationally intensive model, leading to large idle times. %\GJ{Expand on the last sentence further -- the aggregation will be bottlenecked by the slowest client assigned to the largest and most computationally intensive model.}
%than others. %Further, these methods assume random stragglers. \ps{Do we need to mention this?} To alleviate the straggler problem, we employ asynchronous updates at the server for each task. 
%\carlee{Why can't we alleviate this with single-model techniques like sub-sampling and running different numbers of local steps? Explain why such techniques, which generally assume random stragglers, don't really help when some models are inherently slower to train for all clients.} 
%\ps{What if we flip the order of the two challenges? We first introduce stragglers, then mention the corresponding challenge in MMFL, and then talk about resource allocation problem in MMFL. \psq{This sounds good to me too. Changing the orders and add conjunction sentences in purple.}}

%\paragraph{Buffer in Asynch FL} %\carlee{I would move this above the resource allocation paragraph as we do more to address the staleness challenge than the client allocation one}
%\carlee{take out this paragraph title; you haven't introduced asynchronous FL yet}

%With asynchronous aggregation at the server as proposed in AsyncFL \citep{AsyncFL}, the server updates the global model whenever it receives a client model update. Although AsyncFL resolves the straggler issue, it suffers from undesired \textit{staleness} since the received client updates are often based on outdated global models. To alleviate the staleness problem in single-model FL, \citep{fedbuff} proposed storing the incoming client updates at the server in a buffer. Once the buffer gets full, the server updates the global model by aggregating the stored client updates. Compared to training without buffers, fewer global updates are then likely to occur between the time a given client starts and finishes its local update. However, this also delays the global model aggregation as the server waits for multiple client model updates.

%In our multi-model setting, the server employs separate \textit{buffers} to store clients' local training updates corresponding to each task. A larger buffer results in lower staleness, but it increases the aggregation time for each task independently. The frequency of updates also depends on how many clients we allocate to each task. Therefore, MMFL with buffered asynchronous updates presents unique challenges. We need to adjust the client allocation to models with various complexities and control the staleness-speed trade-off by tuning the buffer size for each model all together. For example, a model with higher complexity or data heterogeneity across clients may require more updates. Assigning more clients to this model, however, leads to higher staleness. So, we need to adjust its buffer size along with the client allocation for each model under the restriction of limited clients. Also, our solution entails fair training by giving each client an equal chance to contribute to each model's training as the final global models. 

%b%\GJ{The above two paragraphs (now commented out) were going into too much detail. They are almost giving away the whole summary of the paper. Below is the suggested new version of the first paragraph. I think the second paragraph belongs better in the our contributions section, and I have incorporated it (in a sentence or two) into that section.}

%\GJ{At the beginning of this paragraph, highlight that asynchronous FL methods have only been studied for single-model training. Currently, this is mentioned in the last sentence of the paragraph. And can you say something about why this is not a straightforward extension? \ba{I have edited. I hesitated to say there is no async. simultaneous FL work because of the ArXiv submission.}}

\paragraph{Asynchronous Aggregation and Staleness Issues.} Another solution to the straggler problem is asynchronous aggregation at the server, as proposed in AsyncFL \citep{AsyncFL}, where the server updates the global model whenever it receives any client update. While asynchronous aggregation has been extensively studied in single-model federated learning \citep{chen2020asynchronous, wang2022asynchronous, xu2023asynchronous}, it has not been well explored for simultaneous federated training. Although AsyncFL addresses the straggler issue, it suffers from undesired \textit{staleness} even in the standard FL setting, since the received client updates are often based on outdated global models. To alleviate the staleness problem in single-model FL, \citet{fedbuff} proposed storing the incoming client updates in a buffer at the server and aggregating when the buffer is full. 
% A large buffer size reduces staleness but delays the aggregation of the global model. 

%Training multiple models, asynchronous simultaneous training presents unique challenges as it allocates limited resources across tasks whose processes are not synchronized, unlike in synchronized FST. \GJ{I don't understand the point being made in the above sentence. Why is this a unique challenge?} Furthermore, 
 
\paragraph{Adaptive Allocation of Clients to Heterogeneous Tasks.} In this work, we employ asynchronous buffered aggregation to overcome the straggler issue while controlling staleness.
% \carlee{while controlling staleness.} % without suffering from the staleness problem. 
However, extending single-model FL algorithms \citep{AsyncFL, fedbuff} to the simultaneous training of multiple models is not straightforward---running multiple independent instances of asynchronous FL can be suboptimal. This is because the tasks can have heterogeneous computational complexities and different levels of data heterogeneity, which affect both the number of rounds required to achieve a given target accuracy and the wall-clock time taken to complete each round. Since a shared set of clients is used to train the models, the training processes are coupled---assigning more resources to one task leaves fewer for the others. Moreover, the optimal resource requirement for each task can change over time according to its data heterogeneity and training progress and may be difficult to predict before training.
% \carlee{and may be difficult to predict before training}. 
% \ps{Previous two sentences motivate non-uniform distribution of clients across tasks, but not dynamic allocation.} \GJ{Edited. PTAL} 
Therefore, we propose an adaptive algorithm that \textit{dynamically} reallocates clients across tasks depending on their training progress, and also adapts the buffer size used for asynchronous aggregation of updates.
%. Once the buffer gets full, the server updates the global model by aggregating the stored client updates. %Compared to training without buffers, fewer global updates are then likely to occur between the time a given client starts and finishes its local update.
%While it reduces staleness, this also delays the global model aggregation as the server waits for multiple client model updates.


%\carlee{This should be explained better. \psq{I added more explanation for FedBuff, do you think it needs more explanation for MMFL buffers?}} 
%This reduces the adverse staleness effect of asynchronous updates since the buffer leads to less frequent global updates. So, the difference between the versions of model used to calculate asynchronous client updates and the current global model at the time of an update is received is lower. \carlee{``Difference...is lower'' is a little vague. Instead you could say something like ``Thus, fewer global updates are likely occur between the time a given client starts its local update and the time it finishes, reducing staleness.''}
 %\carlee{what is a ``global round'' here?} 
%\carlee{The terminology in this paragraph is a little confusing, as ``update'' is used for both the global aggregation and the local client model update \psq{I tried using ``aggregation'' whenever possible instead of ``global update''. For fully-async, I couldn't find a synonym of ``update'' that fits well. As you suggested, I changed ``update''s with global model update/client model update.}}
 %\carlee{Say explicitly that a longer buffer results in lower staleness; otherwise it's not clear there is a tradeoff.}
%a%\ps{This para seems to suggest that we just borrowed the idea of buffer from \citep{fedbuff} and applied it to multi-model FL. But, we don't mention any additional challenges beyond single-model FL. \psa{Divided the paragraph into two and removed the next paragraph. Now, we emphasize the design challenges unique to MMFL setting as well.}}

%a%\carlee{This paragraph feels a little disconnected from the others. Maybe make this challenge about how to manage different tasks' buffers in the MMFL setting: which clients get to work on each model, how to set the buffer size for each model, etc. Then our solution is to use client queuing to enforce unbiased updates for each model (and we find buffer size heuristics that work well yet are flexible to heterogeneous models).\psa{I changed the structure of last two paragraphs a bit. Now, we emphasize the design }}

% \paragraph{Challenges of MMFL I - Resource Allocation}
% Moreover, MMFL presents unique challenges. Dealing with multiple models simultaneously, the server has  to manage the scheduling of a limited number of clients to train each model efficiently. %\carlee{I think the issue is a limited number of clients, not computation/comms resources: we don't go into allocating those.} 
% Similar to single-model FL, the server requires deciding when to assign clients with local training requests. %\carlee{what is a ``job''? } 
% \ps{In single model FL, there is not really an allocation problem, since server cannot control clients' availability. It uses clients whenever they appear. \psq{Here, I tried to refer to client selection papers, such as Yae Jee's Power-of-Choice Selection. Would it be okay if I cite it as an example?} In MMFL this problem appears since in each round, we can choose which client works with which model. Your next sentence is good, just the previous sentence needs to be rephrased/removed.} Further, the server needs to choose which task is assigned to a client in each training iteration of MMFL. \carlee{How do we address this challenge? By tuning the number of active clients and thus the size of the buffer for each task? \psq{Our allocation scheme in the heterogeneous experiment can be an example of the allocation challenge. However, it is limited to one experiment in the paper. We may consider removing this discussion.}}
% % \ps{Other than resource allocation, can we think of any other challenge unique to MMFL?}


\paragraph{Our Contributions.}
% \tdcarlee{Include the experimental results here as a contribution: it looks strange to not include them.}
We formalize the FST setting in Section~\ref{sec:formulation} and then make the following main contributions:
 % \carlee{We formalize the FST setting in Section~\ref{sec:formulation} and then make the following main contributions:} %The main contributions of this paper are summarized as follows:
\begin{itemize}[leftmargin=*]
    \item We introduce  $\nameofthealgorithm$, a \underline{Fed}erated \underline{A}synchronous \underline{S}imultaneous \underline{T}raining algorithm\footnote{Our code is provided in the Supplementary Materials.} to simultaneously train models for multiple tasks (Section~\ref{sec:alg_conv}). Our work is one of the first to mitigate the straggler problem faced by synchronous FST methods that extend vanilla FedAvg. %\ps{This reads like a weak statement. We should talk about novelty rather than nuance in the first point. I suggest making the FedAST para above as the first bullet.} \GJ{Edited}
    \item The proposed algorithm addresses the problem of balancing resources across heterogeneous tasks, a challenge unique to the FST framework, using a novel dynamic client allocation scheme; it also dynamically adjusts the buffer size used in asynchronous aggregation to strike the best trade-off between staleness and runtime. %\GJ{Edited this bullet}
    \item We provide a theoretical convergence analysis of $\nameofthealgorithm$ (Section~\ref{sec:convergence}), which improves upon previous analyses even in the single-model FL setting. It improves upon \citet{sharper} by considering multiple local updates and the buffer, and upon \citet{fedbuff} by relaxing their restrictive assumptions. 
    % \GJ{I added more details of the analysis comparison here. Please check if this is correct.}
    \item We experimentally validate $\nameofthealgorithm$'s performance (Section~\ref{sec:exp}) in terms of its wall-clock training time and model accuracy on multiple real-world datasets compared to synchronous and asynchronous FL baselines.
    % \item \carlee{We experimentally validate $\nameofthealgorithm$'s performance (Section~\ref{sec:exp}) in terms of its wall-clock training time and model accuracy on multiple real-world datasets compared to synchronous and asynchronous FL baselines.}
\end{itemize}
% \carlee{We conclude and discuss future work in Section~\ref{sec:conclusion}.}
We conclude and discuss future work in Section~\ref{sec:conclusion}.
% The main contributions of this paper are summarized as follows. We propose $\nameofthealgorithm$, a \underline{Fed}erated \underline{A}synchronous \underline{S}imultaneous \underline{T}raining algorithm for federated simultaneous training of multiple models. Our method mitigates the straggler problem faced by synchronous FST frameworks by allowing asynchronous updates across tasks. The work is among the first to explore the nuances of designing a runtime-efficient FST algorithm. Additionally, we are the first to provide a provably convergent asynchronous algorithm in this setting. The proposed algorithm addresses the resource sharing problem, a unique challenge to the FST framework, by employing dynamic client allocation across simultaneous tasks throughout the training, based on their heterogeneity levels. Moreover, $\nameofthealgorithm$ utilizes separate buffers at the server for each task to mitigate the detrimental effect of stale updates without increasing the runtime significantly.
% We demonstrate the theoretical convergence of $\nameofthealgorithm$ in non-convex smooth settings under standard assumptions. Comprehensive experiments validate our method against previously proposed synchronous FST methods.\footnote{Our code is provided in the Supplementary Materials.} \GJ{Can we say something more about how the analysis improves on previous works? \ba{I ll edit contributions}}
%\GJ{From the above paragraph, the novelty and impact of the work is not clear. Need to be more assertive by saying something like 'we are one of the first works to shed light on nuances and the error-runtime trade-offs that arise in simultaneous federated training of multiple models, and the first to analyze asynchronous algorithms in this setup..' } 

\paragraph{Related Work.}
%}\label{sect:related_work}

%
%\GJ{Overall, this section seems quite long. Let me try and cut it down}
%\GJ{Depending on how we edit the intro section, this section can be shortened or merged into the intro. } 

%\paragraph{Synchronous FL.} 
%The conventional FL framework with synchronous aggregation of client updates at the server has attracted significant attention since the introduction of the FedAvg algorithm \citep{fedavg, kairouz2021advances}. Numerous studies have established theoretical guarantees under full and partial client participation \citep{yang2021achieving, powerofchoice, wang2022unified} for strongly-convex \citep{Li2020On, NEURIPS2020_45713f6f}, convex \citep{NEURIPS2020_45713f6f}, and non-convex \citep{fednova, Yang2021AchievingLS} objectives with identical and heterogeneous client data distributions \citep{karimireddy2020scaffold}. To mitigate the problem of straggling clients in synchronous FL, \citep{fedsysdesign} proposes a timer-based method, where the server only aggregates the updates from the fastest clients, while \citep{optimalClientSampling} and \citep{Yang2021AchievingLS} propose subsampling the participating clients in every round.
% has been regarded as a remedy for the straggler problem and supported by theoretical guarantees \citep{fednova, Yang2021AchievingLS, fedvarp}.

 
%\paragraph{Asynchronous FL.} %Despite straggler mitigation methods, the server in synchronous FL algorithms must wait for the slowest participating client. \GJ{The above is not true for the Bonawitz paper, which considers only the fastest clients' updates as mentioned a couple of sentences ago.} Considering that clients may possess diverse hardware and network configurations, faster clients need to remain idle until the next round. \GJ{For brevity and clarity, I suggest omitting the above 2 sentences, and starting the paragraph here.}
%Asynchronous aggregation has been investigated in FL to enhance resource utilization and speed up the training \citep{AsyncFL, chen2020asynchronous, fedbuff, sharper}. The server incorporates client updates in the global model as soon as they arrive, and the available clients can initiate local training with a new version of the global model immediately. While asynchronous FL eliminates the idling of fast clients, the global model is updated possibly multiple times before a slow client completes its local training. Therefore, the updates sent by a client might be based on an outdated version of the global model, adversely affecting convergence. This problem of \textit{staleness} of client updates is amplified with an increase in the number of clients.
%and the number of models simultaneously being trained.Using a buffer at the server that stores multiple consecutive asynchronous client updates before aggregating them has been observed to reduce staleness in the single-model setting \citep{fedbuff}. 
%The buffer size can be tuned to strike a balance between the frequency of updates and the effect of staleness.

%In addition to the practical benefits of asynchronous aggregation, much effort has been put into its theoretical guarantees, although the stale updates pose a challenge for a rigorous analysis. Both \citep{fedbuff, sharper} present convergence bounds for non-convex smooth functions. 
% \GJ{The above description comparing assumptions and analyses is too detailed for this point in the paper. We can move this discussion to after the Theorem where you analyze the convergence of FedAST.}


%\paragraph{Federated Simultaneous Training.}
%\GJ{Put a period after each paragraph title}
Only a few recent works \citep{MM_bobs, MM_ucb, marieMM, BhuyanMM} consider federated simultaneous training of multiple models.
% and propose different ways to assign clients to the different ML tasks in each round. 
In \citet{MM_bobs}, clients are selected with either Bayesian optimization or reinforcement learning to minimize training time and unfairness in participation. \citet{MM_ucb} formulate client assignment in FL as a bandit problem, leveraging local training losses as scores. \citet{marieMM} introduce biased client sampling, favoring the clients with higher local losses. These methods lack convergence guarantees. \citet{BhuyanMM} assign clients uniformly at random or in a round-robin fashion and analyze the convergence assuming convex objective functions and bounded gradients. While these works only consider synchronous aggregation, \citet{asyncMM} propose a fully asynchronous FST algorithm. Their approach entails solving a non-convex optimization problem to optimize client assignment, which requires information about delays and models that may be difficult to obtain in practice. Also, the obtained bound does not guarantee convergence to a stationary point
% has a non-vanishing error term 
in the presence of data heterogeneity 
% \ps{Is it sufficient to mention ``non-vanishing error'' or should we clearly state they don't have convergence in heterogeneous setting?}
and suffers from increased staleness when the number of clients increases. 
% \GJ{Added 'non-vanishing error' to the last sentence. Please check if it looks correct.}

\section{PROBLEM FORMULATION}\label{sec:formulation}
%%%
\paragraph{Notations.}
For a positive integer $c$, we define $[c] \triangleq \{1,\dots,c\}$. $\widetilde{\G}$ denotes stochastic gradients. Bold lowercase letters (e.g., $\bx$) denote vectors. $|A|$ denotes the cardinality of a set $A$, and $\|\cdot\|$ denotes the Euclidean norm.

We now formally introduce the federated simultaneous training (FST) setting, where $\N$ clients train $\M$ models $\mathbf{x}_1, \dots, \mathbf{x}_\M$ corresponding to $\M$ independent tasks. For each task $m \in [M]$, our goal is to find the model that solves the following optimization problem:
\begin{align}
    \min_{\x\in \mathbb{R}^{d_m}} \left\{ \fglobalj(\x) := \frac{1}{N}\sum_{i=1}^{N}\fclientij(\x) \right\},
\end{align}
where $\fglobalj$ is the global loss function for task $m$, and $\fclientij$ is the local loss for task $m$ at client $i$. 
% In each communication round of the FedAvg algorithm \citep{fedavg}, only a subset of clients is available for training, that can be used to perform local model updates. The local models are then aggregated by the server to update the global model. %The overall goal is to minimize the loss of each task, subject to the limited resources available at the clients. 

%As discussed in Section~\ref{sec:intro}, perhaps the simplest approach to solve this problem is to train the models sequentially, using all the clients for a single task at any time.
% \carlee{It isn't clear here that the disjoint sets stay the same in each round for this strawman}
% Although these solutions may seem straightforward to implement in practice, the former
%However, this approach is time-inefficient.
% , while the latter could lead to models with poor generalization. 
%Therefore, we need methods specialized to FST.
% settings are needed.

First, we examine a simple extension of FedAvg \citep{fedavg} to simultaneous training of models for $\M$ tasks. At the start of each round, the server randomly partitions the available set of clients across the tasks \citep{BhuyanMM}. The server sends the current models $\{\xjt\}_{m=1}^M$ for all the tasks to the corresponding subset of clients. The clients perform local training and return their updates to the server, which \textit{synchronously} aggregates the updates for each task. %Note that the server has to wait for updates from all the clients across all the tasks before it can begin the next round. 
This na\"ive simultaneous training extension of FedAvg performs poorly due to stragglers. The time it takes for a client to return its updates depends on its resources and the size of the model assigned. Since the server waits for the slowest update across all the tasks before commencing the next round, the server waits much longer if a large model is assigned to a slow client. We mitigate this problem via \textit{asynchronous} training in $\nameofthealgorithm$, discussed next.
% Note that the rounds are synchronized across tasks.

%Locally, clients perform consecutive mini-batch SGD iterations using their private data. Suppose client $i$ is among the clients selected by the server to update the model corresponding to task $m$ in the $t$-th round. Starting with the global model $\xjt$, client $i$ runs $\locitj$ consecutive mini-batch SGD steps (see Algorithm~\ref{alg:cap}). \ps{Do we need this description of Local-SGD here?}

\begin{algorithm}[t]
\caption{\texttt{LocalTrain$(m$,$\locitj$,$\xjt$,$\lrcj)$} at client $i$}\label{alg:cap}
\begin{algorithmic}[1]
\State\label{alg_1_line_set_dummy}\textbf{Set} $\xijtz\gets\xjt$
\For{$k = 1,\dots,\locitj$}
    \State  $\xijtk \gets \xijtkm-\lrcj \widetilde{\nabla} \fclientij(\xijtkm)$
    % \State $k\gets k+1$
\EndFor
\State \textbf{Return} $\del_m \gets (\xjt-\xijlocitj) / (\locitj\lrcj)$
\end{algorithmic}
\end{algorithm}



%\GJ{Currently, you are just qualitatively saying that asynchronous aggregation avoids stragglers and reduces wall-clock time. This can be made more concrete by a proper definition and explanation of the expected wall-clock time per round. You could give an example with unit exponential training times per client. With synchronous aggregation, the expected wall-clock time per round is $\log (\sum_{m=1}^{M} R_m))$. Instead, with asynchronous training, the expected time per round for each model is just $1/R_m$. Not sure where exactly such an explanation should be placed.. Maybe before or inside the convergence analysis section?}
%\GJ{I just noticed that you have mentioned the above example in a footnote after the convergence analysis. Would be good to make that more prominent by bringing it into the main paper. \psa{Thanks! I brought it into the main paper. Although it doesn't reflect the arrivals in our algorithm, I think it explains the reasoning.}}
%%%
\section{ALGORITHM DESCRIPTION }\label{sec:alg_conv}
%\GJ{Why is our proposed algorithm stated inside the problem setting section? The problem formulation or problem setting section is meant to introduce the notation and state the objective that we are trying to achieve. In the context of our problem it would be describing multi-model FL concretely, talking about synchronous algorithms and then discussing the straggler issue which motivates asynchronous algorithms. Please make the FedAMT description a separate section appearing after the problem setting section.}
Next, we describe $\nameofthealgorithm$ (Algorithm~\ref{alg:main}), our proposed \underline{Fed}erated \underline{A}synchronous \underline{S}imultaneous \underline{T}raining algorithm, illustrated in Figure~\ref{fig:server_vis} for $M=2$ tasks. 
For each task $m\in [M]$, the server maintains a round index $t_m$ that is initialized to $\tj = 0$, the number of active training requests $R_m^{(t_m)}$, and buffer size $b_m^{(t_m)}$. $R_m^{(t_m)}$ and $b_m^{(t_m)}$ quantify the resources (client computation and memory) allocated to task $m$ in round $t_m$.
% gives the pseudocode of $\nameofthealgorithm$, and we explain its key components below. 
% \ps{Can we define $t_m, R_m, b_m$ in the first para?}
We provide two versions of \(\nameofthealgorithm\) based on the value of \mbox{\textit{option} \(\in\{S,D\}\)}. When \textit{option} is \(S\) (\textit{static}), the resource allocation for each task remains the same throughout the training process (i.e., $R_m^{(t_m)} \equiv R_m$ and $b_m^{(t_m)} \equiv b_m$). With \textit{option} $=D$ (\textit{dynamic}), \(\nameofthealgorithm\) dynamically reallocates resources across tasks using the \texttt{Realloc} subroutine (\Cref{alg:calcRb}).
% \tdcarlee{It is a little confusing to talk about $R_m$ and $b_m$ here, before we know what a ``training request'' is. I would explain when they are introduced that adjusting these parameters is how we do dynamic client allocation}  
%\carlee{I think we need to define $t_m$ as the round counter for task $m$}

% \sout{, that is provably convergent under the common assumptions in the literature.}
%\ps{We can remove the next two para.}
%
% \paragraph{Random client selection}
% We borrow the random client selection scheme from \citep{sharper}. Uniform sampling of clients enables us to have fair training across clients since each client has an equal chance to join the training.
% \ps{We shouldn't minimalize our contribution here - currently we're saying that we just added some bells and whistles to \citep{sharper}. \psq{It would be good to discuss what we can write here together.}} 
%
% \paragraph{Significance of Multiple Local Steps}
% Moreover, $\nameofthealgorithm$ employs multiple iterations in clients' local training. The problem with the single-local step is the communication burden due to too frequent download/upload of the model \citep{fednova}. Here, as we consider training of multiple models concurrently, the total communication between the server and clients is already more excessive than single-model training. Therefore, using multiple local steps to reduce the communication overhead is crucial. \ps{This para might be unnecessary.} 
% \begin{algorithm}
% \caption{\nameofthealgorithm}
% \label{alg:main}
% \begin{algorithmic}[1]
% %\Require $J$: Number of active works, $T$: target round, $N$: \# of total clients,  $b$: Buffer size, $\tau$: \# of local SGD iterates, $\eta_s$: Client learning rate, $\eta_s$: Server learning rate\\
% \Initialize{$\forall m \in [M]$: $\tj\gets 0$, initial global model $\xjz$, buffer $\buffj \gets \emptyset$\label{algline:init}.}
% \For{Models $m = 1,\dots,M$ (in parallel)}
%     \State Randomly select $\awj \subseteq [N]$ clients
%     \State \multiline{%
%     For all selected clients $j \in \awj$, run \hspace{5mm} \texttt{LocalTrain$(m$, $\locitj$, $\xjz$, $\lrcj)$}}
%     \While{$\tj < \Tj$}
%         % \\\textbf{$\Rightarrow$ Option $\nameofthealgorithm$} : (Clients asynchronously run their queued jobs.)
%         \If{\textit{Server receives update from client $i$}}
%             \State $\buffj \gets \buffj \cup \{i\}$ \label{algline:rcvupd}
%             \State \multiline{%
%             Select one of the available clients and run \texttt{LocalTrain$(m$, $\locitj$, $\xjt$, $\lrcj)$}
%             \ps{Should we consider queue here}
%             \ps{If an update happens, we still send the old model}} 
%         \EndIf
%         \If{$|\buffj| = \bsj$\label{algline:fullbuff}}
%             \State \multiline{%
%             For $i \in B_m$, $\zijt \gets$ \texttt{LocalTrain$(m$, $\locitj$, $\xjt$, $\lrcj)$} \ps{This is being synchronous I guess}}
%             \State \multiline{%
%             $\xjtp\gets \xjt + \lrsj\times\frac{1}{\bsj}\sum_{i \in \buffj} \zijt$ \label{algline:aggr}}
%             \State $\tj\gets \tj+1$ and $\buffj \gets \emptyset$
%         \EndIf
%     \EndWhile
% \EndFor
% \State \textbf{Output:} Trained models $\{ x_{m}^{(T_m)} \}_{m=1}^M$
% \end{algorithmic}
% \end{algorithm}
%
\begin{algorithm}[t]
\caption{\nameofthealgorithm}
\label{alg:main}
\begin{algorithmic}[1]
\State{\textbf{Input}: Client and server learning rates $\{\lrcj,\lrsj\}_{m=1}^M$, \(\textit{option} \in \{S,D\}\), no. of local updates $\{ \locitj \}_{m=1}^M$}
\Initialize{$\forall m \in [M]$: $\tj\gets 0$ (round index), model $\xjz$, buffer $\buffj \gets \emptyset$. Total no. of updates \(c\gets0\)}
\For{Models $m = 1,\dots,M$ (in parallel)\label{alg:parallel_train}}
    \State \multiline{Randomly select $\awj^{(0)}$ clients and send \hspace{5mm} \texttt{LocalTrain$(m$, $\locitj$, $\xjz$, $\lrcj)$} requests\label{algline:init}}
    \While{$\tj < \Tj$}
        \State Wait until server receives an update $\del_m$ %\ps{``Server receives an update''}
            \State $\buffj \gets \buffj \cup \{\del_m\}$, \(c\gets c+1\) \label{algline:rcvupd} 
% \If{\(c\mod c_{period} = 0\)}
    \State\label{algline:adjustRb}\(\{(R_i^{(t_i+1)}, b_i^{(t_i+1)})\}_{i=1}^M\) \(\gets \texttt{Realloc(}\textit{option},c\texttt{)}\) 
% \EndIf
        \If{$|\buffj| = \bsj^{(t_m)}$\label{algline:fullbuff}}
            \State\label{algline:aggr}$\xjtp\gets \xjt - \lrsj\lrcj\locitj
            \frac{1}{b_m^{(t_m)}}\sum_{\del\in B_m}\del$
            \State $\tj\gets \tj+1$ and $\buffj \gets \emptyset$
    \EndIf
        \State \label{algline:newjob}\multiline{%
            Select \(K^{(t_m)}_{m}\) random client(s) and send \texttt{LocalTrain}$(m, \locitj, \xjt, \lrcj)$ request(s)}
    \EndWhile
\EndFor
\State \textbf{Output:} Trained models $\{\x_{m}^{(T_m)} \}_{m=1}^M$
\end{algorithmic}
\end{algorithm}


\begin{figure}[thb]
    \centering
        % \vspace{.3in}  
  \centerline{\includegraphics[width=0.43\textwidth]{figures/scheme.pdf}}
        % \vspace{.3in}
        \caption{In our proposed algorithm $\nameofthealgorithm$, the server assigns local training requests (shown as striped and orange blocks for two simultaneous tasks), which are queued at the clients and processed in a first-come-first-served manner. Completed requests are aggregated asynchronously at the server. The figure shows snapshots of the process at two different times. By adjusting the number of active requests, $\nameofthealgorithm$ periodically \textit{reallocates} the resources shared across models.
        % reallocates resources by increasing (illustrated for Model $m$) or decreasing (illustrated for Model $m'$) the number of active requests for each model.
        % \carlee{put stripes on one of the model boxes so it can be read in black and white. I also didn't see how you are decreasing $m'$'s resources--do the $m'$ updates from the last two clients also finish before the dynamic reallocation and are not replaced?}
         }
    \label{fig:server_vis}
\end{figure}
% \ps{We need to clarify queueing of jobs. The difference between active jobs and active clients}
% Algorithm~\ref{alg:main} outlines the pseudocode of $\nameofthealgorithm$. The training of each model runs simultaneously. Figure~\ref{fig:server_vis} visualizes this process for two sample tasks. Consider some $m \in [M]$. The server begins the training by sending out $\awj^{0}$ local training requests for task $m$ to randomly selected clients, along with the initial model $\xjz$ (Line \ref{algline:init}).
% If a client receives new local training requests before it has finished its prior requests, they are all queued at the client and processed on a first-come-first-served basis. Due to possible queued requests, the number of \textit{active clients} (clients actively working on local training requests) at any time might be less than the number of \textit{active training requests} (requests that clients either work on or store in their queues).
% Local training at the clients follows \cref{alg:cap}. \ps{redundant sentence}
% The server maintains a buffer $B_m$ for each task $m$, which stores the received client updates for model $m$ (Line \ref{algline:rcvupd}). Also, we increase the counter, \(c\), for the total received updates across all tasks by one (Line \ref{algline:rcvupd}). \ps{Why is this counter needed?} To dynamically adjust client allocation across tasks, we call \texttt{Realloc} \ps{More intuitive name? ClientAllocation, etc.} function which inputs \textit{option}, determining which option we use for $\nameofthealgorithm$ \ps{Have we introduced opt yet?} and \(c\), the update counter (Line \ref{algline:adjustRb}). This function returns the number of active local training requests and buffer sizes for the next round. If \textit{option} is \(0\), we keep \(R\) and \(b\) values constant across all rounds for all models by always setting \(\{(R_i^{t_i+1}, b_i^{t_i+1})\}_{i=1}^M\) to \(\{(R_i^{t_i}, b_i^{t_i})\}_{i=1}^M\). If \textit{option} \(= 1\), we periodically (with period of \(c_{period}\)) update \(R\) and \(b\) parameters based on the number of total updates received from all models. Namely, when \textit{option} is \(1\) \texttt{Realloc} returns updated values if \(c \mod c_{period} = 0\), otherwise it returns the previous rounds' values similar to \textit{option}\(=2\).
% We provide more details about \texttt{Realloc}  \hyperref[para:calcrb]{below} and in Algorithm~\ref{alg:calcRb}. When the buffer is full, i.e., $|\buffj| = \bsj$ (Line \ref{algline:fullbuff}), the server aggregates the stored client updates to update the global model (Line \ref{algline:aggr}). 
% Whenever the server receives an update for task $m$, it randomly selects \(K_{m}^{(t_m)}\) new client(s) and sends training request(s) along with the current global model (Line \ref{algline:newjob}). \ps{Do we need ``opt'' in the subscript of $K$?} \(K_{m}^{(t_m)}\) is always \(1\) if \textit{option} is \(0\). Hence, the number of active local training requests for each task remains fixed at $R_m^{t_m}$ for \textit{option} of \(0\). When \textit{option} \(=1\), \(K_{m}^{(t_m)}\) returns one of \(0\), \(1\), or \(2\) to adjust the actual total number of active local training requests according to \(R_m^{t_m}\) distribution. It returns \(1\) if there is just one missing update (which is actually the latest received one). It can also return \(2\)/\(0\) depending on whether the algorithm decides to increase/decrease the number of active local training requests at Line \ref{algline:adjustRb}. If any selected client is already active, the request gets queued. \ps{This flow can be improved. We take a big detour from Line 6 to discuss line 7, and then come back to line 8. Instead, we should first mention the basic algorithm. Then in subsequent paragraphs, describe the nuances in more detail. Let me make an attempt. How about the following paragraphs in blue?}




%The training of each model runs simultaneously. 
 %\GJ{The paragraphs below go back and forth between describing the client-side, server-side aggregation, and dynamic reallocation operations. I think we write about each of these separately with one paragraph each, without switching.}

\paragraph{Assignment of Local Training Requests to Clients and their Execution.} Consider task $m \in [M]$. 
% We use $\tj$ to denote the round index for task $m$, and initialize it to $\tj = 0$. 
The server begins by sending out $\awj^{(0)}$ local training requests for task $m$ to clients selected uniformly at random, along with the initial model $\xjz$ (Algorithm~\ref{alg:main}, Line \ref{algline:init}). The number of local training requests $\awj^{(\tj)}$ is adapted over time using the \texttt{Realloc} function (\Cref{alg:calcRb}), enabling us to dynamically reallocate client resources across tasks.
% as we will describe below in the dynamic adaptation using \texttt{Realloc}.\tdcarlee{Continuing my comment on $R_m$ and $b_m$ above, replace this with: ``enabling us to dynamically reallocate client resources across tasks with the \texttt{Realloc} function''} 
Each client processes the training request by performing $\tau_m$ local mini-batch SGD iterations (see \Cref{alg:cap}) and sends the resulting model update $\del_m$ back to the server. If a client receives multiple requests, they are queued and processed in a first-come-first-served manner.\footnote{Processing the requests in parallel would require clients to keep all the $M$ models in local memory, which can be infeasible.} Therefore, the number of \textit{active clients} (clients working on training requests) at any time might be less than the number of \textit{active training requests} (that clients are working on or are stored in their queues).
% \tdcarlee{mention client availability here if there is space, e.g., ``We further assume that some clients (distributed uniformly at random) may be unavailable, in which case they simply reject the training requests.''}

\paragraph{Buffered Asynchronous Aggregation at the Server.} The updates $\del_m$ sent by clients are aggregated at the server in an asynchronous manner as follows. To keep staleness in check, the server maintains a buffer $B_m$ for task $m$, which stores the received client updates for model $m$ (Algorithm~\ref{alg:main}, Line~\ref{algline:rcvupd}). The buffer size $b_m^{(t_m)}$ can be adapted over time (using the \texttt{Realloc} function). Whenever the server receives an update for task $m$, it randomly selects $K^{(t_m)}_{m}$ client(s) to send a new training request along with the current global model (Algorithm~\ref{alg:main}, Line~\ref{algline:newjob}). As we explain below, $K^{(t_m)}_{m}=1$ when \textit{option} $=S$, and $K^{(t_m)}_{m} \in \{0,1,2\}$ when \textit{option} $=D$.
% $1$ new client (in the dynamic case, it may be either $0$, $1$, or $2$ clients as we will explain later below) and sends it a training request along with the current global model (Algorithm~\ref{alg:main}, Line \ref{algline:newjob}). 
% Also, the server maintains a counter \(c\) (Line \ref{algline:rcvupd}) for the total number of received updates. \GJ{What is this counter used for? Can explain that here briefly} The counter is common across all the $M$ tasks to determine when
%
% To achieve more efficient client utilization, our approach allows the number of active training requests for each task to dynamically change (Line~\ref{alg:calcRb}). The server reevaluates this allocation periodically.
When the buffer for model $m$ gets full ($|\buffj| = b_m^{(t_m)}$),
% (Algorithm~\ref{alg:main}, Line \ref{algline:fullbuff}), 
the server aggregates the updates stored in the buffer to update the global model (Algorithm~\ref{alg:main}, Line \ref{algline:aggr}). 

\paragraph{Dynamic Adaptation of the Number of Active Requests and Buffer Size using \texttt{Realloc} (Algorithm~\ref{alg:calcRb}).} \label{para:calcrb}
The server maintains a counter \(c\), tracking the total number of updates received across all \(M\) tasks (Algorithm~\ref{alg:main}, Line~\ref{algline:rcvupd}). If \textit{option} $=D$, this counter is used to periodically trigger the dynamic adaptation of the number of active training requests $\awj$ and the buffer size $b_m$ across tasks (Algorithm~\ref{alg:main}, Line~\ref{algline:adjustRb}).
Intuitively, we should allocate more clients (and consequently, more training requests $R_m$) to tasks with larger inter-client heterogeneity. To empirically estimate this heterogeneity, the server stores the last $\nbupdvar$ ($\nbupdvar$ is a tunable parameter) updates $\del_m$ for each task $m$ (denoted $\{ \del_{m,i}\}_{i=1}^V$) and computes
\begin{align}
    \label{eq:empirical_hetero}
    \hat{\sigma}_{g,m}^2 \propto \mfrac{1}{\nbupdvar}\times\sum\nolimits_{i=1}^\nbupdvar \mfrac{\normbs{\del_{m,i}-\overline{\del_{m}}}}{\normbs{\overline{\del_{m}}}},
\end{align}
where $\overline{\del_{m}}$ is the empirical mean of $\del_{m,i}$'s.\footnote{We normalize by $\normbs{\overline{\del_{m}}}$ to account for different model sizes since larger models often have larger unnormalized variance.} Further, in our experiments, we empirically observe that the optimal choice of buffer size $b_m$ is proportional to the number of active requests $R_m$. See Appendix~\ref{app_sect:buffer_size} for our extensive experiments. Using \eqref{eq:empirical_hetero} and these empirical observations, the optimal resource allocation emerges as the solution to the following constraints.
\begin{equation}
    \begin{aligned}
        \sum\nolimits_{i=1}^M R_i^{(t_i+1)} &= \sum\nolimits_{i=1}^MR_i^{(t_i)}, \\
        \mfrac{R_1^{(t_1+1)}}{{\hat{\sigma}_{g,1}}} &= \mfrac{R_2^{(t_2+1)}}{{\hat{\sigma}_{g,2}}} = \dots = \mfrac{R_M^{(t_M+1)}}{{\hat{\sigma}_{g,M}}},
    \end{aligned}
    \label{eq:realloc_constr}
\end{equation}
where the first constraint maintains the total computation budget across tasks, and the second set of constraints ensures the allocation of a larger number of training requests to tasks with higher heterogeneity. We elaborate on the theoretical motivation for the second set of constraints 
%in the next section (\ref{sec:convergence})
in Section~\ref{sec:convergence}, once we establish our convergence results. We also refer the reader to Appendix~\ref{app_sect:CalcRb} for more details on \texttt{Realloc}.
% according to the complexity and training progress of different tasks. 
%To achieve more efficient client utilization, the server reevaluates this allocation periodically.
%\carlee{This sentence is confusing as it's not clear how the server reallocates clients yet--maybe save it for the next paragraph or explain here that the buffer size and number of active clients can change over time for each task}
%
%We provide two versions of \(\nameofthealgorithm\) based on \mbox{\textit{option} \(\in\{S,D\}\)} value. When \textit{option} is \(S\) (\textit{static}), \(\nameofthealgorithm\) preserves the initial resource allocation (\(\{R_m^{(t_0)}\}_{m=0}^{M}\)) across \(M\) simultaneous tasks in the entire training process. \carlee{is the buffer size also preserved?} However, this approach requires prior knowledge of task complexities. % for a good client allocation, considering a limited number of active clients.
%Further, there may be a need to dynamically change client allocation across tasks depending on their training progress status in time. \carlee{Mention here, or somewhere, that changing the allocation means changing $R_m$, not choosing specific clients.} Therefore we propose \textit{option} $=D$ (\textit{dynamic}), where \(\nameofthealgorithm\) dynamically adjusts resource allocation across tasks during training. 
%

% The server stores the last $\nbupdvar$ (where $\nbupdvar$ is a tunable parameter) 
% updates $\del_m$ for each task and then computes their variance (see Appendix~\ref{app_sect:CalcRb} for more details), which serves as an empirical estimate of that task's inter-client data heterogeneity (see Assumption~\ref{assump:globhet}). Subsequently, \texttt{Realloc} reallocates resources (\(\{R_m^{(t_m)}\}_{m=0}^{M}\)) and buffer sizes (\(\{b_m^{(t_m)}\}_{m=0}^{M}\)) proportionally to the square root of estimated variances. The intuition behind this allocation strategy is to allocate more training requests $R_m$ to tasks that experience larger inter-client heterogeneity. This allows that task to see more diverse data from more clients in future rounds. The buffer size $b_m$ is set proportional to $R_m$ to similarly average out the variance due to high data heterogeneity. %Additionally, more resources can be assigned to tasks with larger buffer sizes while maintaining a similar update staleness. 
%This intuition is also supported by our theoretical analysis, too. \carlee{explain a little why this is the case, e.g., that these are chosen to minimize your convergence bounds} 
% Due to space limitations, we refer the reader to Appendix~\ref{app_sect:CalcRb} for a more detailed explanation and theoretical rationale for our choices of $R_m^{(t_m)}$. \ps{This para is not super clear.}

% \ps{How about the following structure: empirical estimate of inter-client data heterogeneity; 
% \begin{align}
%     \hat{\sigma}_{g,m}^2 \propto \mfrac{1}{\nbupdvar}\times\sum\nolimits_{i=1}^\nbupdvar \mfrac{\normbs{\del_{m,i}-\overline{\del_{m}}}}{\normbs{\overline{\del_{m}}}}
% \end{align}
% need for normalization - the empirical observation that larger models will have larger variance; another empirical observation - $b_m, R_m$ are proportional; solution emerges from the following optimization
% \begin{align}
%     \mfrac{R_1^{(t_1+1)}}{{\hat{\sigma}_{g,1}}} =  \dots = \mfrac{R_M^{(t_M+1)}}{{\hat{\sigma}_{g,M}}}, \sum_{i=1}^MR_i^{(t_i+1)}=\sum_{i=1}^MR_i^{(t_i)}
% \end{align}
% Motivation for this problem comes in next section
% }

% % \ba{The server stores the last $\nbupdvar$ (where $\nbupdvar$ is a tunable parameter) 
% % updates $\del_m$ for each task as approximations of gradients calculated on the dataset of clients sending those updates. Then it computes their sample variance (see Appendix~\ref{app_sect:CalcRb} for more details), which serves as an empirical estimate of that task's inter-client data heterogeneity (see Assumption~\ref{assump:globhet}). Subsequently, \texttt{Realloc} reallocates resources (\(\{R_m^{(t_m)}\}_{m=0}^{M}\)) and buffer sizes (\(\{b_m^{(t_m)}\}_{m=0}^{M}\)) proportionally to the square root of estimated variances. The intuition behind this allocation strategy is to allocate more training requests $R_m$ to tasks that experience larger inter-client heterogeneity. This allows that task to see more diverse data from more clients in future rounds. The buffer size $b_m$ is set proportional to $R_m$ to similarly average out the variance due to high data heterogeneity. We also use a theoretical rationale to design \texttt{Realloc}. Using our theoretical results (Section~\ref{sec:alg_conv}), we show that setting $R_m$ proportional to the heterogeneity level multiplied with some other constants is optimal. Due to space limitations, we refer the reader to Appendix~\ref{app_sect:CalcRb} for a more detailed explanation.} \ps{This para is not super clear.}

\begin{algorithm}[t]
\caption{\texttt{Realloc(}\textit{option},\(c\)\texttt{)}}\label{alg:calcRb}
\begin{algorithmic}[1]
\If{\(\textit{option}=D\) and \(c \bmod c_{period}=0\) \label{alg3_line:firstif}}
\State \(\{{\hat{\sigma}_{g,m}^2}\}_{m=1}^M \gets \texttt{EstimateVariances()}\) \label{alg3_line:estimate}
% \State\label{alg3_line:proportional_dist}\multiline{Find \(\{R_i^{(t_i+1)}\}_{i=1}^M\) such that \(R_1^{(t_1+1)}/{\hat{\sigma}_{g,1}} =  \dots = R_M^{(t_M+1)}/{\hat{\sigma}_{g,M}}\) and \(\sum_{i=1}^MR_i^{(t_i+1)}=\sum_{i=1}^MR_i^{(t_i)}\)}
\State\label{alg3_line:proportional_dist}\multiline{Find \(\{R_i^{(t_i+1)}\}_{i=1}^M\) that solves \eqref{eq:realloc_constr}}
\State \(b_i^{(t_i+1)} \gets \big( b_i^{(t_i)} R_i^{(t_i+1)} \big)/R_i^{(t_i)}\) for all $i \in [M]$
% \State{$b_i^{(t_i+1)} \gets \mfrac{b_i^{(t_i)} R_i^{(t_i+1)}}{R_i^{(t_i)}}$}
% \ElsIf{\( \exists i: \quad R_i^{(t_i+1)}\) is not defined}
\Else: \textbf{ for all} $i\in\{i:R_i^{(t_i+1)}\text{ is not defined}\}$ \textbf{ do} 
\State\label{alg3_line:set_same}\( (R_i^{(t_i+1)}, b_i^{(t_i+1)})\gets(R_i^{(t_i)}, b_i^{(t_i)})\)
% \ElsIf{\(\{(R_i^{t_i+1}, b_i^{t_i+1})\}_{i=1}^M\) are not defined before}
% \State \(\{(R_i^{t_i+1}, b_i^{t_i+1})\}_{i=1}^M\gets\{(R_i^{t_i}, b_i^{t_i})\}_{i=1}^M\)
\EndIf
\State \textbf{Return} \(\{(R_i^{(t_i+1)}, b_i^{(t_i+1)})\}_{i=1}^M\)
\end{algorithmic}
\end{algorithm} 

% \GJ{The content of the following remarks is good. But I suggest making these paragraphs with titles (similar to the 'adjusting the number of active requests...' paragraph above) rather than remarks. The italic text in remark environment makes it hard to read. \ba{done!}}

%\paragraph{Queuing Requests at Clients.}
%If a client receives new local training request(s) before it has finished its prior requests, they are all queued at the client and processed in a first-come-first-served manner. Due to possible queued requests, the number of \textit{active clients} (clients actively working on local training requests) at any time might be less than the number of \textit{active training requests} (requests that clients either work on or store in their queues).

%\paragraph{Dynamic Client Allocation across Tasks.}\label{rem:c}
%The server can adjust client allocation across tasks periodically. This happens if $\text{option}=D$ (dynamic). In this case, if \(c \mod c_{period} = 0\) (where recall, $c$ is the total number of updates received at the server across tasks), \(R\) and \(b\) parameters are updated. For more details, see  
%\hyperref[para:calcrb]{below} and Algorithm~\ref{alg:calcRb}. \carlee{Isn't this discussed above wehn you talk about adjusting the number of active requests?}

\paragraph{Sending out New Requests to Reach the New Resource Allocation.} To transition from one allocation $\{ R_m^{(t_m)} \}_m$ to another $\{ R_m^{(t_m+1)} \}_m$ in an asynchronous setting, we must adjust the number of new requests that are sent out every time the server receives a client update. The number of new requests \(K_{m}^{(t_m)}\) sent out on receiving any update $\del_m$ is always $1$ in the static ($\textit{option}=S$) case since $R_m$ remains constant throughout training. In the dynamic case ($\textit{option}=D$), \(K_{m}^{(t_m)}\) can be $0$ (when $R_m^{(t_m+1)} < R_m^{(t_m)}$), $1$ (when $R_m^{(t_m+1)} = R_m^{(t_m)}$), or $2$ (when $R_m^{(t_m+1)} > R_m^{(t_m)}$). 
% \sout{By adjusting the number of new requests in this way, the number of active training requests $R_m$ for each task $m$ will gradually reach the desired new $R_m$.}
% \ba{This strategy ensures as the number of active training requests $R_m$ for each task $m$ will gradually reach the desired new $R_m$ without queue }
% \ba{This strategy ensures a smooth transition to the desired new number of active training requests $R_m$ gradually for each task since }
% \ba{This strategy 
We employ this gradual 
% and smooth 
transition to the desired new number of active training requests $\{R_m^{(t_m+1)}\}_m$ for each task instead of a sudden change in allocation to avoid possible longer queues at the clients during the transition phase.
% }
% \tdcarlee{why not do this all at once? In general, why does the number of active training requests need to equal $R$ (to limit client queuing?) \ba{yes, to limit queues, we keep total \# of requests bounded.}} % It can be $2$ or $0$ depending on whether the algorithm decides to increase or decrease the number of active local training requests respectively at Line \ref{algline:adjustRb}. % It is \(1\) if there is just one missing update (which is actually the latest received one). 

%\ps{If we choose to follow the template I suggested, description of the basic algorithm, followed by remarks explaining things in detail, this paragraph should come before the remarks. \ba{thanks for the review, I like your suggestion and edited!}}

%\GJ{The two paragraphs below seem out of place in the algorithm description section. Some of their contents can be merged into the paragraphs above}

%$\nameofthealgorithm$ solves the straggler problem by allowing asynchronous local training across clients and asynchronous aggregation of local client updates across tasks. The expected wall-clock time is reduced since the global model updates take a much shorter time. However, asynchrony causes staleness in client updates. We measure the staleness of a client for a specific task in terms of the \textit{number of global model updates at the server} between when the client receives the training request for this task and when it sends the update back to the server.

%In $\nameofthealgorithm$, the server maintains a separate \textit{buffer} for each task to keep the staleness in check. Further, owing to the cross-client data heterogeneity in FL, individual client updates can have a large variance. Since the buffer enables averaging multiple client updates, the resulting server update better captures the underlying data heterogeneity. We compare $\nameofthealgorithm$ with the \textit{buffer-less} asynchronous implementation in Figures~\ref{fig:buff_hom_acc} and \ref{fig:buff_het_acc} to empirically show the benefits of using a buffer. Further, $\nameofthealgorithm$ ensures fairness across clients by requesting local training from randomly selected clients, irrespective of their speed. 


% \begin{algorithm}
% \caption{\nameofthealgorithm{} to train $\M$ models concurrently }\label{alg:main}%\ps{Can you add line numbers to the algorithm. Then we can add line numbers in the description above.}
% \begin{algorithmic}[1]
% %\Require $J$: Number of active works, $T$: target round, $N$: \# of total clients,  $b$: Buffer size, $\tau$: \# of local SGD iterates, $\eta_s$: Client learning rate, $\eta_s$: Server learning rate\\
% \Initialize{$\tj\gets 0$. Initialize $\xjz$, choose $\awj$ random clients and put \textit{LocalTraining(j, $\locitj$, $\xjz$, $\lrcj$)} into their job queues, and $\buffj \gets \emptyset$,  $\forall j\in [M]$\label{algline:init}.}
% \While{$\exists j \: \tj \neq \Tj$}
%     Clients asynchronously run their queued jobs. \ps{Don't follow the while condition}
%     \If{\textit{Server receives an update u for task j}}
%         \State $\buffj \gets \buffj \cup \{u\}$ \label{algline:rcvupd}
%         \If{$|\buffj| = \bsj$}
%             \State $\xjtp\gets \xjt + \lrsj\times\frac{1}{\bsj}\sum_i\buffj[i]$ \label{algline:aggr}
%             \State $t^{(j)}\gets t^{(j+1)}$ and $B^j \gets \{\}$
%         \EndIf
%         \If{$t^{(j)}\neq T^{(j)}$}
%             \State  Pick a client uniformly at random and put \textit{LocalTraining($j$, $\tau^{(j)}$, $x^{(j,t)}$, $\eta^{(j)}_c$)} into its queue \label{algline:newjob}\ps{Why is this happening? \psq{Randomly selecting a new client and sending the latest model to it. The aim of the if statement is to stop the training when a task reaches its own round limit.}}
%         \EndIf
%     \EndIf
% \EndWhile
% \end{algorithmic}
% \end{algorithm}


% \ps{Algorithm~\ref{alg:main} outlines the pseudocode of $\nameofthealgorithm$. The training of each model runs simultaneously. Figure~\ref{fig:server_vis} visualizes this process for two sample tasks. Consider some $m \in [M]$. The server begins the training by sending out $\awj^{0}$ local training requests for task $m$ to clients selected uniformly randomly, along with the initial model $\xjz$ (Line \ref{algline:init}). 
% The server maintains a buffer $B_m$ for each task $m$, which stores the received client updates for model $m$ (Line \ref{algline:rcvupd}). Also, the server maintains a counter \(c\) (Line \ref{algline:rcvupd}) for the total number of received updates, across all the $M$ tasks. To achieve more efficient client utilization, our approach allows the number of active training requests for each task to dynamically change. The server reevaluates this allocation periodically. When the buffer for model $m$ is full, i.e., $|\buffj| = \bsj$ (Line \ref{algline:fullbuff}), the server aggregates the client updates in the buffer to update the global model (Line \ref{algline:aggr}). Finally, whenever the server receives an update for task $m$, it randomly selects \(K_{m}^{(t_m)}\) new client(s) and sends training request(s) along with the current global model (Line \ref{algline:newjob}).}


% When \textit{option} \(=1\), \(K_{m}^{(t_m)}\) returns one of \(0\), \(1\), or \(2\) to adjust the actual total number of active local training requests according to \(R_m^{t_m}\) distribution. It returns \(1\) if there is just one missing update (which is actually the latest received one). It can also return \(2\)/\(0\) depending on whether the algorithm decides to increase/decrease the number of active local training requests at Line \ref{algline:adjustRb}.



\section{Convergence Analysis}
\label{sec:convergence} %\carlee{This is the only subsection in Section 3, so either make another one, take this as its own section, or make this an inline heading}

In this section, we provide the convergence result
% for $M$ models trained using 
for $\nameofthealgorithm$ with the static \textit{option} (\(S\)). Since \(R_m^{(t_m)}\) and \(b_m^{(t_m)}\) are constant when \textit{option} \(=S\), we drop time indices for simplicity. The convergence with dynamic allocation (\textit{option} $=D$) can be shown with an additional assumption. We relegate this to Appendix~\ref{app_sec:opt1_conv} due to space limitations.

%\carlee{did you put the proof of the dynamic case in the appendix?} \GJ{moving the sentence about dynamic allocation's proof from the end of this section to here} 

Next, we discuss the assumptions used in our analysis, which are standard in the literature \citep{fednova, sharper, fedbuff}.
% \tdcarlee{$x$ should be bolded in the assumptions, right?\ba{yes, thanks}}

\begin{assump}[Smoothness]
The loss functions are $\Li$-smooth, i.e., for all $i \in [N]$, for all $m \in [M]$, and for all $\x,\y \in \mathbb{R}^{d_m}$, $\norm{\G \fclientij (\x)-\G \fclientij (\y)} \leq \Li \norm{\x-\y}$.
\label{assump:smoothness}
\end{assump}
\vspace{-3mm}
\begin{assump}[Bounded Variance]
The stochastic gradient at each client is an unbiased, bounded-variance estimator of the true local gradient, i.e., for all $\x \in \mbb R^{d_m}$, $i \in [N]$, and $m \in [M]$, $\mathbb E [\tG \fclientij (\x)] = \G \fclientij (\x)$ and \mbox{$\mathbb E \| \tG \fclientij (\x) -\G \fclientij (\x) \|^2\leq \lhetsj$.} %\ps{Or $\expb{\widetilde{\G}\fii{x}}=\G\fii{x}$ and $\expns{\widetilde{\G}\fii{x}-\G\fii{x}}\leq{\lhets}$}
\label{assump:lochet}
\end{assump}
\vspace{-3mm}
\begin{assump}[Bounded Heterogeneity]
The local gradients are within bounded distance of the global gradient, such that for all $m \in [M]$ and $\x \in \mathbb{R}^{d_m}$, $\max\limits_{i \in [N]}\norms{\G \fclientij (\x) - \G \fglobalj (\x)} \leq \ghetsj$.
\label{assump:globhet}
\end{assump}
\vspace{-3mm}
\begin{assump}[Bounded Staleness]
The 
% staleness of updates is bounded above by $\rdm$ rounds, that is, 
client updates of task $m$ are received within at most $\rdm_m$ server model updates after the server sends the training request. 
%\ps{We have to clarify that staleness is measured in terms of server rounds. \psq{Is this definition unclear?}}
\label{assump:maxstale}
\end{assump}
\begin{theorem}[Convergence of $\nameofthealgorithm$] \label{thm:main}
% \textbf{(Convergence bound):}
Suppose that Assumptions~\ref{assump:smoothness}--\ref{assump:maxstale} hold, and there are $\awj$ active local training requests corresponding to task $m \in [M]$, and the server and client learning rates, $\{ \lrsj, \lrcj \}$ %$\{ \lrsj, \lrcj \}_{m=1}^M$
respectively, satisfy $\lrsj \leq \sqrt{\locitj\bsj}$ and $\lrcj \leq \min \{%\frac{1}{4\Li\locitj},
(6\Li\locitj\sqrt{\locitj\bsj})^{-1}, (4\Li\locitj\sqrt{\locitj\awj\rdm_m})^{-1} \}$ for all tasks $m \in [M]$. Here, $\bsj$ is the buffer size, and $\locitj$ is the number of local training steps. Then, the iterates, $\{ \{ \x_m^{(t)} \}_{t=1}^{T_m} \}_{m=1}^M$, of Algorithm \ref{alg:main} satisfy:
% {\small
\begin{equation}
    \begin{aligned}
        & \avgtelj \mbe \| \G \fglobalj (\xjt) \|^2 \leq \underbrace{\bOP{\mfrac{\gapTerm_m}{T_m \lrcj \lrsj \locitj}}}_{\text{FedAvg Error - I}} \\
        & + \underbrace{\bOP{\lp \mfrac{\Li \lrcj \lrsj}{\bsj} + \Li^2 [\lrcj]^2 \locitj \rp (\lhetsj + \locitj \ghetsj)}}_{\text{FedAvg Error - II}} \\
        & + \underbrace{\bOP{\mfrac{\Li^2 [\lrsj]^2 [\lrcj]^2 \locitj\awj}{\bsj^2 } (\lhetsj + \locitj \awj\ghetsj)}}_{\text{Asynchronous Aggregation Error}},
        % & + \bOP{\frac{\Li \lrsj \lrcj}{\bsj} + \Li^2 [\lrcj]^2 \locitj \lp \locitj - 1 \rp + \frac{\Li^2 [\lrsj]^2 \awj^2}{\bsj^2}} .
    \end{aligned}
    \label{eq:thm:main}
\end{equation}
% }%
where $\gapTerm_m = \fglobalj(\xjz) - \min_\x \fglobalj (\x)$.
%\ps{This $z$ in the first line is a typo, right? We should expand the theorem statement to remind the reader what all the symbols mean.}
\label{theorem:main}
\end{theorem}

\textbf{Proof.} See Section \ref{app_sect:main_proof} in the Appendix.

\paragraph{Comparison with Synchronous FL Analyses.}
The two \textit{FedAvg Error - I, II} terms in \eqref{eq:thm:main} capture the error bound for synchronous FedAvg \citep[Theorem~1]{fedvarp}. 
Since the server updates for model $m$ involve aggregating $\bsj$ client updates, the buffer size $\bsj$ is analogous to the number of participating clients in FedAvg. 
The third error term in \eqref{eq:thm:main} arises due to asynchronous aggregation and increases with $\awj$, the number of active local training requests. Intuitively, given the same buffer size $\bsj$, increasing $\awj$ leads to higher worst-case staleness $\rdm_m$. However, as long as $\Li \lrsj \lrcj \awj \locitj \leq \bsj$, asynchrony is not the dominant source of error in \eqref{eq:thm:main}, and we achieve the same rate of convergence as synchronous FedAvg (see Corollary \ref{cor:conv_rate}).
% \ps{I guess we can remove everything after this. Do we need the point about wall-clock time?\psq{ Just looking at the bound, decreasing $R_m$ to $1$ seems the best as it appears only in nominators. However, increasing $R_m$ shortens the round times. Highlighting this can be good. I am shortening the rest. We can keep it if it makes sense, or we can remove it.}}
%The number of active work, $\aw$, is unique to asynchronous algorithm and it makes the terms induced by the asynchrony larger. 

\paragraph{Comparison with Asynchronous FL Analyses.}
FedBuff \citep{fedbuff} considers buffered asynchronous aggregation for a single model. Still, comparing \cite[Corollary~1]{fedbuff} and the bound in \eqref{eq:thm:main} for $M=1$, their convergence result (i) depends on stronger assumptions (bounded gradient and uniform arrivals of client updates), 
% (ii) does not achieve any benefit of larger buffer size in the dominant error term, 
and (ii) has a worse asynchronous aggregation error. Moreover, our analysis is more general than that of \citet{sharper}, as they do not consider multiple local SGD steps and the buffer. Simultaneous asynchronous training is considered by \citet{asyncMM}, but we observe that they do not achieve convergence unless the data distribution across clients is identical (see \citealp[Eq.~(19)]{asyncMM}). We discuss the comparison of \(\nameofthealgorithm\) to single-model and simultaneous asynchronous federated training baselines in more detail in Appendix Section~\ref{app_sect:theory_comparison}.

% \GJ{Give a title to this corollary, something like 'Asymptotic convergence after setting learning rates'}
\begin{cor}[Asymptotic convergence after setting learning rates]
% \textbf{(Asymptotic convergence):}
\label{cor:conv_rate}
    Let $T_m \geq \locitj \max\lcb%16,
    36\bsj,16\awj\rdm_m\rcb$. Setting the learning rates $\lrcj = (\locitj \Li \sqrt{T_m})^{-1}, \lrsj = \sqrt{\locitj \bsj}$
    %\ps{Shouldn't we say $\lrs = \Theta(\sqrt{\bs})$ and $\lrc = \Theta \lp \sqrt{\frac{\locit}{\Li T}} \rp$?}
    , the bound in Theorem~\ref{theorem:main} reduces to:
\begin{align}
    & \avgtelj \mbe \| \G \fglobalj (\xjt) \|^2 \leq \bOP{\mfrac{\gapTerm_m \Li}{\sqrt{\bsj \locitj T_m}}} \nn \\
    &+\bOP{\lp\mfrac{1}{\sqrt{T_m \bsj\locitj}}+\mfrac{1}{\locitj T_m}\rp(\lhetsj + \locitj \ghetsj)} \nn \\
    &+\bOP{\mfrac{\awj}{T_m\bsj}(\lhetsj + \locitj\awj\ghetsj)}. \label{eq:cor:conv_rate}
\end{align}
\end{cor}

Although the given bound in Corollary~\ref{cor:conv_rate} does not seem to depend on the staleness bound $\rdm_m$ (Assumption~\ref{assump:maxstale}), its effect is implicit in the number of active requests $\awj$ and buffer size $\bsj$. The maximum staleness is positively correlated with $\awj$ and negatively correlated with $\bsj$. In our experiments, we tune the buffer size to maintain the update staleness at a reasonable level.

Looking at the bounds in \eqref{eq:thm:main} or \eqref{eq:cor:conv_rate}, increasing $\aw_m$ makes the bound worse because to reach the same accuracy in \eqref{eq:cor:conv_rate}, we need to run a higher number of server updates $T_m$. However, increasing $\aw_m$ also shortens the duration between two successive server updates, making the algorithm faster in wall-clock time. We illustrate this with a wall-clock comparison to FST baselines below. 

% \ba{We should keep one of two paragraphs below.} 
% \ba{\paragraph{Run-time Comparison of FST Methods.}
% Suppose we want to train $M$ identical models simultaneously using $K$ always active clients, and the arrival times of all the client updates are distributed as $Exp(\lambda)$. Let us say tasks share resources equally and assume the number of actively assigned clients is always more than $K/(2M)$ for each model in $\nameofthealgorithm$ (see Appendix Section~\ref{app_sect:runtime} for our empirical reasoning). Then, the time units required for all models to reach the same arbitrarily small gradient norm would be as follows. $\nameofthealgorithm$: $\bOP{\sqrt{\frac{M}{\lambda K}}}$, synchronous simultaneous training: $\bOP{\sqrt{\frac{M\log K}{\lambda K}}}$, synchronous sequential training: $\bOP{\frac{M\sqrt{\log K}}{\sqrt{\lambda K}}}$, client set-partitioned training: \textit{can never reach}. Note that these are rough calculations (see Appendix Section~\ref{app_sect:runtime}) expected run-times based on the best known convergence rate in the literature.}

\paragraph{Impact of $\awj$ on Wall-clock Time.}
Suppose the arrival times of all the client updates (assuming there is no queue on the clients) are distributed as $\mathrm{Exp}(\lambda)$. The expected time to fill the buffer corresponding to task $m$ is $\bsj/(\awj\lambda)$. Therefore, in $\nameofthealgorithm$, the expected time to complete one round at the server is inversely proportional to $\awj$.
On the other hand, the expected time to finish one round of synchronous simultaneous FedAvg training is $\frac{1}{\lambda}\sum_{k=1}^{R_1+\dots+R_M}\frac{1}{k}\approx\frac{1}{\lambda}\log(\sum_{k=1}^M{R_k})$, which increases with $\awj$. Also, the summation terms show an exacerbated straggler effect since all the clients wait for the slowest client across all the tasks.
% \ba{
\paragraph{Design of \texttt{Realloc} (Algorithm~\ref{alg:calcRb}).} Next, we theoretically justify the dynamic allocation of resources across tasks described in Section~\ref{sec:alg_conv} (\Cref{alg:calcRb}, with \(\textit{option}=D\)), which adjusts the number of active requests ($\{R_m\}_{m=1}^M$). Given the limited number of available clients (which limits the total number of active training requests), to achieve the best possible allocation, we minimize the sum of the dominant terms in the bounds (FedAvg Error-II in \eqref{eq:thm:main}) across tasks. We also use the empirical observation that the optimal choice of buffer size $b_m$ scales linearly with $R_m$ (Appendix~\ref{app_sect:buffer_size}). The resulting optimization problem is
% that our aim in FST problem is obtaining a convergence as fast as possible for each model with a limited number of available clients (this limits the total number of active training requests without increasing the staleness a lot), we formulate an optimization problem where we minimize the sum of dominant terms of each task's bound (Eq.~\ref{eq:thm:main}) subject to the total number of active training requests is limited at $R$:
% \[\]
\begin{equation}
    \min_{\{R_m,b_m\}_{m=1}^M}\sum_{m=1}^M\mfrac{\lrsj\lrcj\locitj}{R_m}\ghetsj \text{ s.t. } \sum_{m=1}^MR_m=R. \label{eq:realloc_theory}
\end{equation}
The \texttt{Realloc} function (Algorithm~\ref{alg:calcRb}) solves the optimization problem \eqref{eq:realloc_theory}. 
% using an empirical observation between $\bsj$ and $\awj$ to allocate resources across clients.
% \ps{Coupling of $R_m, b_m$; empirical estimate of $\ghetsj$ in terms of local updates.}
See Appendix~\ref{app_sect:CalcRb} for more details.
% }

% \sout{As the {\color{blue}concurrency} in the system increases, the server updates the global model multiple times before a client can complete an update request.}
% On the other hand, the buffer size has an inverse relation with this frequency. 
% Therefore, one must tune the learning rate as described in Theorem~\ref{theorem:main} to have the guarantee. 
%\ps{Why on buffer size? Won't $\rdm$ be the same for fixed $R$, whatever $k$ be? \psq{I think it directly depends on the buffer size as well. We measure the staleness in terms of round and buffer size affects the round frequency}} 

% \ps{We should instead explain the dependence on staleness through $\awj$, as we discussed on slack. \psa{Done.} \psq{Also, could you check the discussion below?}}

% \GJ{The discussion below is a bit confusing to me. Can you control the number of clients that accept training requests?}
% In Theorem~\ref{theorem:main} and Corollary~\ref{cor:conv_rate}, we assume that the set of clients which accept local training requests is a uniformly random subset of all clients at any time (remaining clients reject new requests). The size of this set does not directly affect the bounds. However, having 
% %more active clients 
% a larger set of clients that accept local training requests enables us to increase the number of active clients ($\awj$)
% without extending the client queues. 
% % So, we can employ more local training requests at the same time.
% Adjusting the buffer size accordingly, this also does not lead to a staleness problem.
% % , but lets us utilize the variance reduction benefits of larger buffers. 
% % \ps{We should discuss this once.}

%Finally

\section{Experimental Results}\label{sec:exp}
% Next, we complement our theoretical analysis of simultaneous federated training with experiments. 
We outline our experimental setup in Section~\ref{sect:implementation}, discuss the existing baselines in Section~\ref{sect:comp_methods}, and compare the baselines with $\nameofthealgorithm$ under varied settings in Section~\ref{sect:exp_res}.

\subsection{Datasets and Implementation}\label{sect:implementation}

We consider image classification tasks with the MNIST \citep{mnist}, Fashion-MNIST \citep{fashionmnist} and CIFAR-10 \citep{cifar10} datasets, and next character prediction with the Shakespeare \citep{leaf} dataset using the same models as in \citep{feddyn,asynchfl,lenet}.
%We measure performance in terms of final test loss/accuracy values. 
We compare the wall-clock time required by different algorithms to reach some predetermined target test accuracy levels (see \Cref{table:exp_summary}). In Appendix~\ref{app_sec:diff_acc}, we present experiments with other accuracy targets to show the consistency of our results. %(Appendix Section~\ref{app_sec:diff_acc}).

\begin{table}[h]
\centering
\caption{The datasets and models used in experiments, along with corresponding target test accuracy levels.}
\label{table:exp_summary}
\begin{tabular}{|c|c|c|}
\hline
Dataset & Model & Target Accuracy \\ \hline
MNIST & MLP & $93\%$ \\ \hline
Fashion-MNIST & LeNet-5 & $82\%$ \\ \hline
CIFAR-10 & CNN & $63\%$ \\ \hline
Shakespeare & LSTM & $42\%$ \\ \hline
\end{tabular}
\end{table}

For image classification tasks, we partition the training data across clients using a Dirichlet distribution with \mbox{$\alpha = 0.1$} \citep{firstDirichlet}. The Shakespeare dataset is \textit{naturally} heterogeneous as the lines of each role in the plays of Shakespeare are assigned to a different client. There are a total of $1000$ clients, $30\%$ of which are available to accept new training requests, independent of the past.

\paragraph{Modeling Client Delays.}
As suggested in \citep{shiftedexp2, slowandstale, shiftedexp1, MM_bobs}, we use \textit{shifted-exponential} (exponential plus constant) random variables to model the time taken by a client to complete a local training request and return the update to the server. We pick the run-time generation parameters of each task according to real measurements on NVIDIA GeForce GTX TITAN X GPUs. To simulate hardware heterogeneity across clients, we divide them into $25\%$ slow, $50\%$ normal-speed, and $25\%$ fast clients \citep{favano}. We relegate additional implementation details to the Appendix. 

\begin{figure}[t]
% \vspace{.3in}
\centerline{\includegraphics[width=0.45\textwidth]{figures/comp_1_3.pdf}}
% \vspace{.3in}
\vspace{-2mm}
\caption{Mean test accuracy for compared algorithms on six identical CIFAR-10 tasks trained simultaneously. $\nameofthealgorithm$ trains faster than synchronous methods. The synchronous method without straggler mitigation is by far the slowest.
}
\label{fig:comp1_3}
\end{figure}

\subsection{Baseline Algorithms}\label{sect:comp_methods}

\paragraph{Synchronous Simultaneous Training.}
The following synchronous methods differ only in client selection.
\vspace{-3mm}
\begin{enumerate}[leftmargin=*]
\item $\mmsync$ \citep{BhuyanMM}: randomly partition the client set across tasks at each round;
\item $\mmbobs$  \citep{MM_bobs}: Bayesian optimization-based assignment of clients to tasks;
% in each iteration %to minimize training time while also promoting fairness in device participation, and 
\item $\mmucb$ \citep{MM_ucb}: client selection as a multi-armed bandit problem.
% with local training losses as scores.
\end{enumerate}
\vspace{-2mm}
As seen in Figure~\ref{fig:comp1_3}, these methods experience a severe straggler issue. To make these baselines competitive with our proposed $\nameofthealgorithm$, we use straggler mitigation by
% where the server 
aggregating only the first $\firstk$ client updates for each task and discarding the rest \citep{fedsysdesign}. In our experiments (Appendix~\ref{app_sec:tuning_first_k}), $\firstk = 30$ performs best across datasets. %Also, in the experiment in Figure~\ref{fig:comp1_3}, the positive effect of the straggler mitigation we added for the competitors is evident. 

\begin{figure}[t]
    \centering
    \centerline{\includegraphics[width=0.4\textwidth]{figures/acc_hom_both.pdf}}
    % \vspace{.3in}
    \vspace{-2mm}
    \caption{The mean final test accuracy values of {\color{blue}$\nameofthealgorithm$ (blue)}, {\color{olive}$\nobuffer$ (olive green)} and {\color{violet}centralized training (violet)}  with varying active client ratio, when training $3$ identical models. The left (right) figure is for CIFAR-10 (Fashion-MNIST) dataset. With more active clients, the importance of buffer increases due to increasing staleness.  
    % \carlee{Put the conclusion in the figure: more active clients means more need for a buffer due to stragglers}
    }
    \label{fig:buff_hom_acc}
\end{figure}

\begin{figure}[t]
    \centering
    \centerline{\includegraphics[width=0.45\textwidth]{figures/acc_het_both.pdf}}
    \vspace{-2mm}
    \caption{The mean test accuracy values of $\nameofthealgorithm$ and $\nobuffer$, when simultaneously training one model for CIFAR-10 and one for Fashion-MNIST. $\nameofthealgorithm$ achieves higher and more stable accuracy levels. }
    \label{fig:buff_het_acc}
\end{figure}

\paragraph{Asynchronous Federated Simultaneous Training.}
To our knowledge, \citet{asyncMM} is the only other work that studies asynchronous simultaneous FL. However, 
% as mentioned in \Cref{sec:intro}, 
their client selection scheme requires the knowledge of network-wide staleness and smoothness constants, which are hard to estimate. 
If the tasks have similar model complexity and task difficulty, their client selection is similar to that of $\nameofthealgorithm$ with a buffer size of $1$.
We include this \textit{no-buffer} version of $\nameofthealgorithm$ (we call it $\nobuffer$) as a baseline.

\subsection{Results and Insights} 

We assess the performance of $\nameofthealgorithm$ under various scenarios. In \textit{homogeneous-task} experiments, where multiple independent copies of the same model are trained simultaneously, we report the average accuracy over time. In \textit{heterogeneous-task} experiments involving differing tasks and models, efficiently distributing resources to accelerate task completion is the main challenge. For homogeneous tasks, we use $\nameofthealgorithm$ with static \textit{option} ($S$) and uniform client distribution across tasks. In heterogeneous-task experiments, we use 
%variance-based
dynamic allocation (\textit{option} $=D$) to enhance resource allocation efficiency. To show the benefits of dynamic allocation over static allocation, we also explore heterogeneous-task scenarios with \textit{option} $=S$. Dynamic allocation reduces overall training time by up to \(11.9\%\), with comprehensive results shown in Appendix~\ref{app_sect:var_comp}.

To quantify the time \textit{saved} by using $\nameofthealgorithm$ over some competing baseline, we define \textit{time gain} as
% \vspace{-1mm}
{\small
\begin{align*}
    \text{Gain} \triangleq \mfrac{T_{\texttt{Baseline}} - T_{\nameofthealgorithm}}{T_{\texttt{Baseline}}} \times 100 \%,
\end{align*}}%
% \vspace{-1mm}
where $T_{\texttt{Baseline}}$ $(T_{\nameofthealgorithm})$ is the simulated time for \texttt{Baseline} $(\nameofthealgorithm)$ to reach the target accuracy. 
% Gain quantifies the time \textit{saved} by using $\nameofthealgorithm$ over the competing baseline.% \GJ{Changed the term 'competitor' to 'baseline' here}

\begin{figure*}[t]
\centering\centerline{\includegraphics[width=0.95\textwidth]{figures/all_larger_bar.pdf}}
    % \vspace{.3in}
    \caption{Mean training times of $\nameofthealgorithm$ and $\mmsync$ to attain the target accuracy levels (given in \Cref{table:exp_summary}) on $2$/$4$/$6$ tasks with CIFAR-10, Fashion-MNIST, MNIST, and Shakespeare datasets. $\nameofthealgorithm$ requires consistently lower wall-clock time for training compared to $\mmsync$; the percentages represent these time gains.
    % \carlee{$\nameofthealgorithm$ requires lower wall-clock time for training compared to $\mmsync$; the percentages represent these time gains.} %Time gains of $\nameofthealgorithm$ over $\mmsync$ are as written as percentages.
    % \GJ{The colors red and blue are used for MNIST and CIFAR-10 in other figures. Consider changing the colors of one of these two sets to avoid overlap.}
    % The time gain increases with the number of simultaneous tasks due to increasing stragglers of $\mmsync$. 
    %\ps{Label y-axis.}
    }
    \label{fig:hom_all}
\end{figure*}

\label{sect:exp_res}
\paragraph{Comparison with all Synchronous FST Methods.}
First, we compare the synchronous baselines discussed in \Cref{sect:comp_methods} on the CIFAR-10 dataset (Figure~\ref{fig:comp1_3}), where we simultaneously train six identical models. We observe that synchronous methods without straggler mitigation converge very slowly. Among the straggler-mitigated synchronous variants that we implement, Figure~\ref{fig:comp1_3} shows that $\mmbobs$ has similar performance to $\mmucb$ because it struggles due to the large search space of the optimization problem, stemming from the exponential number of possible client schedules. 
Further, we do not observe any performance gains from using $\mmucb$ over $\mmsync$. %Since we assume that only $30\%$ of all clients are available in our experimental setting, $\mmucb$ cannot always select the clients with the highest scores as in \citep{MM_ucb}.
Given that $\mmbobs$ and $\mmucb$ have similar performance to $\mmsync$, in subsequent experiments, we choose $\mmsync$ as the sole synchronous baseline. 


\paragraph{Need for Buffer.} 
% Next, we study the significance of the buffer in $\nameofthealgorithm$. 
As discussed earlier, incorporating the buffer mitigates the negative impact of highly stale updates. Since staleness increases with the number of active clients, asynchronous FL methods without a buffer exhibit limited scalability as the number of clients grows. To demonstrate this, in Figure~\ref{fig:buff_hom_acc}, we conduct two experiments: 1) training three models for CIFAR-10 in parallel, and 2) training three models for Fashion-MNIST in parallel. We plot the final accuracy values varying the ratio of the active clients. In Figure~\ref{fig:buff_het_acc}, we simultaneously train two models, one each for CIFAR-10 and Fashion-MNIST.
%, assigning half the total number of active clients to each task. 
Figures \ref{fig:buff_hom_acc} and \ref{fig:buff_het_acc} show that the buffer makes the system more robust to stale updates.
In Figure \ref{fig:buff_hom_acc}, for small active client ratios, $\nameofthealgorithm$ and $\nobuffer$ have comparable performance. 
However, with more active clients, the staleness of updates increases, resulting in significantly worse performance of the fully asynchronous $\nobuffer$ algorithm.
%\GJ{The active clients terminology is inconsistent between this section and Figure 3 and 4, and previous sections.}

 

\paragraph{Comparison with $\mmsync$.}
Next, we compare $\nameofthealgorithm$ with the chosen synchronous method, $\mmsync$.
% , under different settings:

1) \textit{Homogeneous Tasks:} We conduct experiments training $2$, $4$, and $6$ identical models for each of MNIST, Fashion-MNIST, CIFAR-10, and Shakespeare datasets. Figure~\ref{fig:hom_all} shows the average finish times of the algorithms, and the significant time gains of our algorithm $\nameofthealgorithm$ over synchronous $\mmsync$ (even after incorporating straggler mitigation). We observe that the gain increases with the number of simultaneously trained tasks because $\mmsync$ is especially vulnerable to the straggler problem.

2) \textit{Heterogeneous Tasks:} 
The heterogeneous experiment trains $4$ models simultaneously, one each for the MNIST, Fashion-MNIST, CIFAR-10, and Shakespeare datasets. Once one model reaches its target accuracy, its training stops, and its clients are reallocated to other tasks. We use $\nameofthealgorithm$ with \textit{option} $=D$ for dynamic client allocation. For the synchronous baseline $\mmsync$, we ran $30$ different client allocation schemes, including our proposed allocation scheme and uniform allocation across tasks. We report the results achieved with the best-performing scheme.

\begin{figure}[t]
    \centering
  \centerline{\includegraphics[width=0.45\textwidth]{figures/het_fig.pdf}}
  \vspace{-2mm}
    \caption{Training curves of a single Monte Carlo run of the \textit{heterogeneous experiment}. Dashed vertical lines show times when tasks reach their target accuracy, with $\nameofthealgorithm$ reaching it faster than $\mmsync$.
    % \carlee{, with $\nameofthealgorithm$ reaching it faster than $\mmsync$}. 
    % \GJ{The dashed lines can be made darker so that they are more clearly visible}
    }
    \label{fig:het_exp_curves}
\end{figure}
% \vspace{-5mm}

\begin{figure}[thb]
    \centering    \centerline{\includegraphics[width=0.4\textwidth]{figures/het_bar.pdf}}
    \vspace{-2mm}
    \caption{Mean time required to reach target accuracy and time gain of $\nameofthealgorithm$ over $\mmsync$ in \textit{the heterogeneous experiment}. While $\nameofthealgorithm$ does not require manual fine-tuning, the client allocation in $\mmsync$ is tuned at $100$, $84$, $48$, and $68$ clients for MNIST, Fashion-MNIST, CIFAR-10, and Shakespeare tasks respectively. $\nameofthealgorithm$ has notable time gain ($40.1\%$) over $\mmsync$ to finish all tasks.
% Although $\nameofthealgorithm$ and $\mmsync$ share the client resources differently on each task, $\nameofthealgorithm$ is faster for each task. 
% \carlee{can we add static FedAST here?}
    }
    \label{fig:het_bar}
\end{figure}

Figure~\ref{fig:het_exp_curves} shows learning curves for $\nameofthealgorithm$ and $\mmsync$ from a single Monte Carlo run. The dashed vertical lines denote the time instants when a model reaches its target accuracy, following which the clients training this model get reallocated to other tasks. Figure~\ref{fig:het_bar} shows the average finish times for $4$ simultaneously trained models with $\nameofthealgorithm$ and $\mmsync$. For example, with $\nameofthealgorithm$, the model for MNIST dataset hits its target accuracy at $99$, after which the clients training this model get reallocated to the other models. At $619$, the training of the final model (for CIFAR-10) is complete.  Comparing the finish times for the last model, $\nameofthealgorithm$ provides a $40.1\%$ time gain over $\mmsync$.

  % Dynamic client allocation enables $\nameofthealgorithm$ to automatically identify tasks with higher inter-client heterogeneity thereby necessitating larger buffers accordingly. For example, fewer clients are assigned to the Shakespeare task due to its lower estimated heterogeneity, aligning with its true label distribution across clients. Repeating this experiment using $\nameofthealgorithm$ with static \textit{option} confirms the effectiveness of dynamic allocation consistently offering time gain up to $11.9\%$ over the static version. We relegate the results of this experiment to Appendix Section~\ref{app_sect:var_comp}.

We observe that, thanks to dynamic client allocation, $\nameofthealgorithm$ automatically detects which tasks have higher heterogeneity across clients and need a larger buffer. Notice that the Shakespeare task is allocated fewer clients because it is estimated to be less heterogeneous, which is true based on the label distribution of data samples across clients. We also repeat this experiment using $\nameofthealgorithm$ with static \textit{option} ($S$) to validate the proposed dynamic client allocation strategy. Dynamic client allocation consistently yields time gains of up to $11.9\%$ compared to the static version.
  
 % We observe that thanks to dynamic client allocation, $\nameofthealgorithm$ automatically detects which tasks have higher heterogeneity across clients and need a larger buffer.
%  Dynamic client allocation enables $\nameofthealgorithm$ to automatically identify tasks with higher inter-client heterogeneity and needing larger buffers accordingly.
%  % Notice that the Shakespeare task is allocated fewer clients because it is estimated to be less heterogeneous, which is true based on the label distribution of data samples across clients.
%  For example, fewer clients are assigned to the Shakespeare task due to its lower estimated heterogeneity, aligning with its true label distribution across clients.
%  % We also repeat this experiment using $\nameofthealgorithm$ with static \textit{option} ($S$) to validate the proposed dynamic client allocation strategy.
%  Repeating this experiment using $\nameofthealgorithm$ with static \textit{option} 
%  %($S$) 
%  confirms the effectiveness of dynamic allocation
%  % Dynamic client allocation consistently has 
% consistently offering time gain up to $11.9\%$ over the static version.  
%  %\carlee{compared to sync-ST?}.
%  We relegate the results of this experiment to Appendix Section~\ref{app_sect:var_comp}.

\section{CONCLUSION}\label{sec:conclusion}
In this paper, we present $\nameofthealgorithm$, a federated learning framework to simultaneously train multiple models using buffered asynchronous aggregations.
%at the server. 
We theoretically prove the convergence of our algorithm for smooth non-convex objectives.
%functions.
% Using 
Experiments across multiple datasets
% we show that
demonstrate $\nameofthealgorithm$'s superiority over existing simultaneous FL baselines, achieving up to $46.0\%$ reduction in training time. 
% In the future, we plan to extend $\nameofthealgorithm$ with smarter client selection that considers the local data distribution of each client.
Future work will enhance $\nameofthealgorithm$ by incorporating smarter client selection based on local data distributions.

% \subsubsection*{Acknowledgements}
% All acknowledgments go at the end of the paper, including thanks to reviewers who gave useful comments, to colleagues who contributed to the ideas, and to funding agencies and corporate sponsors that provided financial support. 
% To preserve the anonymity, please include acknowledgments \emph{only} in the camera-ready papers.


% \subsubsection*{References}

% References follow the acknowledgements.  Use an unnumbered third level
% heading for the references section.  Please use the same font
% size for references as for the body of the paper---remember that
% references do not count against your page length total. 



% References
\bibliography{References}

\newpage

\onecolumn

\title{FedAST: Federated Asynchronous Simultaneous Training\\(Appendix)}
\maketitle
\appendix

\section{Adjusting the number of active requests and \texttt{Realloc}}\label{app_sect:CalcRb}
Before designing the \texttt{Realloc} algorithm for dynamic client allocation \textit{option} ($D$), we conduct initial validation experiments with the static \textit{option} ($S$), wherein the allocation of active local training requests across clients and the buffer sizes remain unchanged throughout the training. Note that with the static \textit{option}\nolinebreak, Algorithm~\ref{alg:calcRb} only executes Line~\ref{alg3_line:set_same} and always returns the previous round's values. We \textit{empirically} observe that keeping the ratio of the number of active local training requests to the buffer size fixed and below $37$ works well. Refer to Section~\ref{app_sect:buffer_size} for validation experiments. Then, incorporating this ratio (\(R_m\approx 37b_m\)) within the convergence bound in Equation~\ref{eq:thm:main} of Theorem~\ref{thm:main}, we find that the dominant term (excluding smoothness constants) becomes \(\bOP{\frac{\lrsj\lrcj\locitj}{\awj}}\ghetsj\). Further, given the limited number of available clients, we cannot increase the total number of active local training requests arbitrarily without increasing the staleness. Thus, we impose the constraint \(\sum_{m=1}^MR_m=R\), where \(R\) is a constant denoting the total number of active training requests we assign, chosen according to the number of available clients in the setting. Since the goal of federated simultaneous training is to minimize the objective functions of all tasks concurrently, we propose to adjust $\{R_m\}_{m=1}^M$ by solving
    \begin{align}
    \min_{\{R_m\}_{m=1}^M}\sum_{m=1}^M\frac{\lrsj\lrcj\locitj}{\awj}\ghetsj \text{ subject to } \sum_{m=1}^MR_m=R,\label{eq:constrained_optim_problem}
\end{align}
 whose solution suggests allocating local training requests in proportion to \(\sigma_{g,m}\sqrt{\lrsj\lrcj\locitj}\) for each model \(m\). %\carlee{I think this should actually be in proportion to the square roots of $\lrsj\lrcj\locitj\ghetsj$.} 
With this approach, we use Algorithm~\ref{alg:calcRb} to adjust resource allocation. Further, as the update variance across clients may vary in time during training, we employ adaptive reallocation of resources across models with some periods (Line~\ref{alg3_line:firstif} in Algorithm~\ref{alg:calcRb}). Therefore, we use round indices to denote the changing number of active training requests and buffer sizes. 

\begin{algorithm}[t]
\caption{\texttt{EstimateVariances()}}\label{alg:estimatevariances}

\begin{algorithmic}[1]
\Require{\multiline{The set of latest $\nbupdvar$ updates $\{\del_{m,1}, \del_{m,2},\dots,\del_{m,\nbupdvar}\}_{m=1}^M$, server-side learning rates \(\{\lrsj\}_{m=1}^M\), client-side\\learning rates  \(\{\lrcj\}_{m=1}^M\), and the number of local SGD steps of all models \(\{\locitj\}_{m=1}^M\).}}
\State \(\{\overline{\del_{m}}\}_{m=1}^M \gets \{\frac{1}{\nbupdvar}\times\sum_{i=1}^\nbupdvar\del_{m,i}\}_{m=1}^M\) \Comment{Calculate the means of the latest updates}
\State \(\{{\widetilde{\sigma}_{g,m}^2}\}_{m=1}^M \gets \{\frac{1}{\nbupdvar}\times\sum_{i=1}^\nbupdvar \normbs{\del_{m,i}-\overline{\del_{m}}}/\normbs{\overline{\del_{m}}}\}_{m=1}^M\) \Comment{Calculate the normalized sample variances}
\State \(\{{\hat{\sigma}_{g,m}^2}\}_{m=1}^M \gets \{{\lrcj\lrsj\locitj\widetilde{\sigma}_{g,m}^2}\}_{m=1}^M\) \Comment{Multiply with other constants suggested by the convergence guarantee (\ref{eq:constrained_optim_problem})}
\State \textbf{Return} $\{{\hat{\sigma}_{g,m}^2}\}$ 
\end{algorithmic}
\end{algorithm} 

As we do not have access to the actual data heterogeneity levels, we need to estimate them (Line~\ref{alg3_line:estimate} in Algorithm~\ref{alg:calcRb}). When $\nameofthealgorithm$ is run with \textit{option} $=D$ (dynamic client allocation option), the server keeps the latest $\nbupdvar$ updates of each model. This requires only a small, constant amount of memory at the server. To present how variance estimation (\texttt{EstimateVariances()}) works, assume that \(\{\del_{m,1}, \del_{m,2},\dots,\del_{m,\nbupdvar}\}_{m=1}^M\) are the sets of latest received updates of tasks $m\in[M]$, where each $\del_{m,k}$ for $k\in[\nbupdvar]$ is the output of the $k$\textsuperscript{th} latest local training (Algorithm~\ref{alg:cap}). As the output of any local training is the average of all calculated stochastic gradients during that local training, we use those outputs as approximations of the gradients calculated on the local data of clients. Algorithm~\ref{alg:estimatevariances} describes \texttt{EstimateVariances()}. It first calculates the mean of the latest updates for each task, 
\(\{\overline{\del_{m}}\}_{m=1}^M = \{\frac{1}{\nbupdvar}\times\sum_{i=1}^\nbupdvar\del_{m,i}\}_{m=1}^M\).
Then, \texttt{EstimateVariances()} returns sample variance multiplied with other terms (\(\lrsj\lrcj\locitj\)) and normalized by the mean update norm (this normalization prevents large models or models with inherently large weights from dominating others),
\mbox{\(\{{\hat{\sigma}_{g,m}^2}\}_{m=1}^M = \{\frac{\lrsj\lrcj\locitj}{\nbupdvar}\times\sum_{i=1}^\nbupdvar \normbs{\del_{m,i}-\overline{\del_{m}}}/\normbs{\overline{\del_{m}}}\}_{m=1}^M\)}. Then, \texttt{Realloc} algorithm allocates the number of active training requests proportionally to the square root of these values (Algorithm~\ref{alg:calcRb}, Line~\ref{alg3_line:proportional_dist}).

This approach is sensible both theoretically and intuitively. Based on our experimental observations regarding the relationship between the number of active training requests and buffer size, increasing the number of active local training requests necessitates an increase in buffer size. Moreover, a larger buffer proves beneficial in reducing the variance across updates, as aggregation occurs through the averaging of buffered updates. \texttt{Realloc} specifically aims to allocate more clients and provide a larger buffer size for tasks with greater heterogeneity. In our experiments, we set the number of stored latest updates to $V=8$ and the reallocation period, i.e., the total number of updates received from all clients that triggers \texttt{Realloc}, to $c_{period}=0.75\times M\times\sum_{m=1}^MR_m$. The benefits of dynamic allocation (\textit{option} $=D$) over static and uniform resource allocation (\textit{option} $=S$) are demonstrated when tasks/models are heterogeneous, as shown in \mbox{Figures~\ref{fig:var_unif_comp_high}--\ref{fig:var_unif_comp_low_bar}}.

\section{Theoretical Comparison of  $\nameofthealgorithm$ to Baselines}
\label{app_sect:theory_comparison}
We also compare $\nameofthealgorithm$ to single-model FL methods. The algorithm of \citet{fedbuff} is the most similar to $\nameofthealgorithm$ (with a single model). However, even for the single-task case, $\nameofthealgorithm$ differs by employing a uniform client assignment to ensure equal participation of clients irrespective of their hardware speeds. This allows us to relax the assumptions needed to prove the convergence guarantee. The analysis of \citet{fedbuff} relies on the strong assumptions that the server receives updates from clients uniformly at random and that the norm of gradients is bounded. Moreover, compared to \citet{sharper}, our analysis is more general as $\nameofthealgorithm$ employs multiple SGD steps in local training and a buffer. Some other recent single-model works, \citet{quafl} and \citet{favano}, do not have straightforward and efficient simultaneous federated training extensions for multiple models. 

\citet{asyncMM} propose another asynchronous simultaneous federated learning method. However, their method fails to converge to a stationary point asymptotically unless the data is homogeneous, even though their assumptions include Bounded Gradient and Weak Convexity.
% \begin{table}[h!] 
% \centering 
% \caption{Comparison of $\nameofthealgorithm$'s convergence guarantees to  \citep{fedbuff} (single task asynchronous buffered FL algorithm) and \citep{asyncMM} (an asynchronous FST algorithm). $T$: \#global rounds, $\tau$: \#local steps, $b$: buffer size.}
% \label{tab:theo_comp}
% \begin{tabular}{@{}ccc@{}}
% \toprule
% \textbf{Algorithm} &
%   \textbf{\begin{tabular}[c]{@{}c@{}}Non-standard assumptions\end{tabular}} &
%   \textbf{\begin{tabular}[c]{@{}c@{}}Convergence\end{tabular}} \\ \midrule
% \citep{fedbuff} &
%   Bounded Gradient \& Receiving Updates Uniformly &
%   $\bOP{{\sqrt{\locit/(Tb)}}}$\\
% \citep{asyncMM} &
%   \begin{tabular}[c]{@{}c@{}}Bounded Gradient \& Weak Convexity\end{tabular} &
%   {Not converge}\\
% $\nameofthealgorithm$ &
%   ---&
%   $\bOP{{\sqrt{\locit/(Tb)}}}$ \\ \bottomrule
% \end{tabular}
% \end{table}
\begin{table}[h!] 
\centering 
\caption{Comparison of $\nameofthealgorithm$'s convergence guarantees to those of \citet{fedbuff} (a single-task asynchronous buffered FL algorithm) and \citet{asyncMM} (an asynchronous FST algorithm). $T$: \#global rounds, $\tau$: \#local steps, $b$: buffer size.}
\label{tab:theo_comp}
\begin{tabular}{@{}ccc@{}}
\toprule
\textbf{Algorithm} &
  \textbf{\begin{tabular}[c]{@{}c@{}}Non-standard assumptions\end{tabular}} &
  \textbf{\begin{tabular}[c]{@{}c@{}}Convergence\end{tabular}} \\ \midrule
\citep{fedbuff} &
  Bounded Gradient \& Receiving Updates Uniformly &
  $\bOP{{\sqrt{\locit/(Tb)}}}$\textsuperscript{\textcolor{red}{(a)}}\\
\citep{asyncMM} &
  \begin{tabular}[c]{@{}c@{}}Bounded Gradient \& Weak Convexity\end{tabular} &
  {Does not converge}\\
$\nameofthealgorithm$ &
  ---&
  $\bOP{{\sqrt{\locit/(Tb)}}}$ \\ \bottomrule
\end{tabular}
\\[10pt] % Adjust space between the table and the note
\footnotesize{\textcolor{red}{(a)} Although the convergence guarantee in the published paper \citep{fedbuff} appears to have a better rate, we identified a mistake in its proof. Here, we use the corrected version we received via private communication.}
\end{table}


% \section{Run-time Comparison of Federated Simultaneous Training Algorithms} \label{app_sect:runtime}
% We provide intuition to understand how  theoretical run-time of mentioned algorithms under a specific setting with some simplifying assumptions including that each client's local training time for each model is from exponential distribution, \textit{Exp($\lambda$)}. Let us say that we have total $K$ active clients anytime and $M$ identical tasks with identical models that we want to train.
% \begin{enumerate}
%     \item \textit{Na\"ive Sequential Synchronous Training:} We train models one by one, using all $K$ active clients. We know the most dominant term in the convergence analysis will be \(\bOP{1/\sqrt{KT}}\) where \(T\) is the number of total rounds for each task  \citep[Corollary 1]{fedvarp}. Further, the expected round time for any model will be the expected value of the maximum of $K$ exponential random variables, which is \(\frac{1}{\lambda}\sum_{n=1}^K\frac{1}{n}\leq\frac{1}{\lambda}\log K\). Therefore, total \textit{run-time} required to reach arbitrarily small $\epsilon$-norm gradients for all of \(K\) models is bounded by \(\bOP{\frac{M\sqrt{\log K}}{\epsilon^2\sqrt{\lambda K}}}\).
%     \item \textit{Client set-partitioned training:} We first divide the client set into \(M\) non-overlapping subsets. Then, each model is trained with one of the subsets. This method is not guaranteed to converge a model with an arbitrarily small gradient-norm due to heterogeneity.
%     \item \textit{Synchronous Simultaneous Training:} We train models by distributing \(K/M\) clients to each task at each round. However, as the algorithm requires synchrony, again, the expected round-time is the maximum of $K$ exponential random variables, which is bounded by \(\frac{1}{\lambda}\log K\). Using the most dominant term from the bound in \cite[Corollary 1]{fedvarp}, $\bOP{\sqrt{M/(KT)}}$, the total \textit{run-time} required to reach arbitrarily small $\epsilon$-norm gradients for all of \(K\) models is bounded by \(\bOP{\frac{\sqrt{M\log K}}{\epsilon^2\sqrt{\lambda K}}}\).
%     \item \textit{$\nameofthealgorithm$ (Asynchronous Simultaneous Training):} Having \(K/M\) active local training request for each model for \(\nameofthealgorithm\), assume that there exists at least \(K/(2)\) total active clients working anytime independent of past and future. It is a hard assumption to prove rigorously; however, we observe that held in our experiments with $300$ active clients and $2-6$ simultaneous tasks all the time. Assuming this, the expected time for the server to receive an update is smaller than $2/(K\lambda)$. As the models are identical, the expected time to they 
% \end{enumerate}

\section{EXPERIMENTAL SETUP DETAILS}\label{suppl_sect:experiment_details}
In our study, we explore a simultaneous federated learning (FL) setting for multiple models. We present the details of our experiments in this section.

\subsection{Simulation Environment} We simulate the training with PyTorch on NVIDIA GeForce GTX TITAN X graphics processing units (GPUs) of our internal cluster. We build our code upon the public codes of \citep{flsim, asynchfl}.

\subsection{Setting Overview} We consider the federated training of $M$ models simultaneously using $\N$ clients. $\N$ is $1000$ in all experiments and $M$, specified for each experiment explicitly, varies between $2$ and $6$.

\subsection{Tasks and Models} We use $4$ different tasks across the experiments: 
MNIST \citep{mnist}, Fashion-MNIST \citep{fashionmnist}, CIFAR-10 \citep{cifar10} image classification tasks, and Shakespeare \citep{leaf} next character prediction task. We use a multilayer perceptron for MNIST as in \citep{feddyn}, convolutional networks for Fashion-MNIST as in \citep{lenet} and for CIFAR-10 as in \citep{feddyn}, and a long short-term memory network for Shakespeare as in \citep{asynchfl}.

\subsection{Datasets and Data Distribution} We consider the data heterogeneity across clients in FL frameworks. We download MNIST, Fashion-MNIST, and CIFAR-10 datasets from PyTorch built-in library methods. The train and test splits provided by the library are used without any modifications. To simulate heterogeneous data distribution across clients, we use a Dirichlet distribution with $\alpha=0.1$ following the approach suggested in \citep{firstDirichlet}. The label distribution of each client's local data is drawn at random from this Dirichlet distribution. We ensure that each client has $300$ data points for MNIST, Fashion-MNIST, and CIFAR-10 tasks by repeating the train set if necessary. We obtain and preprocess the Shakespeare dataset as described in \citep{leaf}. This dataset has inherently heterogeneous distribution across clients as each client corresponds to a unique role from Shakespeare's plays.

\subsection{Design Parameters}
In this section, we explain how we choose the design parameters.

\paragraph{Client dataset sizes, batch sizes, and number of local steps.} While distributing CIFAR-10, MNIST, and Fashion-MNIST datasets across clients, each client is allocated $300$ data points from each dataset. The Shakespeare dataset, however, maintains its original distribution of data points across roles, so clients have different numbers of data samples in the Shakespeare task.
For CIFAR-10, MNIST, and Fashion-MNIST tasks, we set the batch size to $32$ while we employ a batch size of $64$ for the Shakespeare task. We fix the number of local steps in local training ($\locitj$ parameter in Algorithm~\ref{alg:cap} in the main text) of clients at $27$ for all tasks. This makes $3$ epochs for CIFAR-10, MNIST, and Fashion-MNIST tasks. As the number of data points varies across clients for the Shakespeare dataset, there is no fixed number of epochs.

\paragraph{Buffer size.} \label{app_sect:buffer_size}
The buffer in $\nameofthealgorithm$ is crucial for mitigating the negative impacts of highly stale updates, as extensively discussed in the main text. The staleness of updates is influenced by the number of active local training requests, denoted as $\awj$, and the buffer size, $\bsj$, associated with each model $m\in[M]$. When $\nameofthealgorithm$ is run with static \textit{option} ($S$), these numbers are kept constant during the training, but they may change (in which case we denote them by $R_m^{(t_m)}$ and $b_m^{(t_m)}$) when we use dynamic client allocation \textit{option} ($D$). A higher number of simultaneous local training requests leads to a higher staleness because it increases the global model's update frequency at the server. On the other hand, buffer size is inversely related to staleness, given its opposing effect on the aggregation frequency. Based on our experimental observations, selecting the number of active training requests and the buffer size of model $m$ such that their ratio is fixed and below $37$ ($\awj/\bsj \lesssim 37$ or $R_m^{(t_m)}/b_m^{(t_m)}\lesssim37$) works well. Selecting the buffer size of $\nameofthealgorithm$ based on this observation avoids the detrimental effects of stale updates while benefiting from fast training thanks to the asynchronous algorithm. We show two experimental results in Figures~\ref{fig:buff_size_1} and \ref{fig:buff_size_2}. In Figure~\ref{fig:buff_size_1}, we train one Fashion-MNIST model and one CIFAR-10 model simultaneously by assigning $175$ active training requests to each task and observe that a buffer size of $5$ strikes a balance between high final test accuracy and fast training to achieve the target accuracy for both tasks. In Figure~\ref{fig:buff_size_2}, we repeat a similar experiment with MNIST and CIFAR-10 tasks by assigning $105$ active training requests to each. This time, we observe that a buffer size of $3$ performs the best for both tasks.
% As discussed in \ref{} in the main text, we experimentally see that choosing the buffer size proportional to the number of active local training requests, fixing \(b_m/R_m\) ratio at constant, work well for all tasks. We choose this ratio higher than \(1/37\) making both \(b_m\) and \(R_m\) integer. Below, we share two experimental results:

\begin{figure}[H]
    \centering
    % \vspace{.3in}
\centerline{\includegraphics[width=0.85\textwidth]{figures/buff_size_1.pdf}}
    % \vspace{.3in}
    \caption{ The final test accuracy and required time to get target accuracy (in Table~\ref{table:exp_summary}) for simultaneous training (using $\nameofthealgorithm$ with static \textit{option}) of one Fashion-MNIST and one CIFAR-10 model with different buffer sizes. We assign the same number of local training requests (\(175\)) to each task.}\label{fig:buff_size_1}
\end{figure}

\begin{figure}[H]
    \centering
    % \vspace{.3in}
\centerline{\includegraphics[width=0.85\textwidth]{figures/buff_size_2.pdf}}
    % \vspace{.3in}
    \caption{ The final test accuracy and required time to get target accuracy (in Table~\ref{table:exp_summary}) for simultaneous training (using $\nameofthealgorithm$ with static \textit{option}) of one MNIST and one CIFAR-10 model with different buffer sizes. We assign the same number of local training requests (\(105\)) to each task.}\label{fig:buff_size_2}
\end{figure}

\paragraph{Learning rate and weight decay.}
We search for the best learning rate and weight decay hyperparameters considering the training speed and final accuracy levels. We seek client-side learning rate within the range of $[1\times10^{-3},1\times10]$, server-side learning rate within $[3\times10^{-2},3]$, and weight decays within $[1\times10^{-7}, 1\times10^{-2}]$. We observe that client-side learning rates of $6\times10^{-2}$ and $7$ with weight decays of $3\times10^{-4}$ and $7\times10^{-5}$ work best respectively for Fashion-MNIST and Shakespeare tasks for all methods. For CIFAR-10 task, a client-side learning rate of $1\times10^{-1}$ with weight decays of $7\times10^{-4}$ and $3\times10^{-4}$ perform best for asynchronous and synchronous methods, respectively. For MNIST, we use client-side learning rates of $1\times10^{-1}$ and $2\times10^{-1}$ for asynchronous and synchronous methods, respectively, with a weight decay of $3\times10^{-4}$. For server-side learning rates, we observe that $1$ for synchronous methods ($\mmsync$, $\mmbobs$, and $\mmucb$), $0.1$ for $\nameofthealgorithm$, and $0.038$ for $\nobuffer$ perform well for all tasks.

\subsection{Modeling Training Times, Model Sizes, and Client Speed Heterogeneity}

In our experiments, following \citep{shiftedexp2, shiftedexp1, MM_bobs, slowandstale}, we employ \textit{shifted-exponential} random variables to model the duration between the moment the server sends a local training request to a client and the moment it receives the resulting update. The exponential component of the distribution reflects the stochastic nature of the device speeds, while the shift component accounts for unavoidable delays such as disk I/O operations.

Whenever a client $i$ performs local training for task $m$, we draw a random number from the distribution with a cumulative distribution function (CDF) of,
\begin{align}
P(X\leq x)=\begin{cases} 
      1-\exp\{-\frac{x-\beta_{i,m}}{2\beta_{i,m}}\}, & x\geq\beta_{i,m} \\
      0, & \text{otherwise}
   \end{cases}, \nn
\end{align}
where $\beta_{i,m}$ depends on the speed of client $i$ and the size of the model associated with task $m$. Then, we multiply this random number by the number of local steps to calculate the simulated time between the server's local training request and its receipt of the corresponding update.

We quantify the effect of the model sizes based on the average time required to calculate one stochastic gradient for each model on the GPUs of our internal cluster. By our measurements, we set,
\begin{equation}
\frac{\beta_{i,\text{MNIST}}}{0.148}=\frac{\beta_{i,\text{Fashion-MNIST}}}{0.240}=\frac{\beta_{i,\text{CIFAR-10}}}{0.228}=\frac{\beta_{i,\text{Shakespeare}}}{0.555},\;\;\forall i \in [N].\nn
\end{equation}

In our experiments, we also take the heterogeneity in the speed of client devices into consideration. We categorize clients into three speed groups: slow ($25\%$), normal-speed ($50\%$), and fast ($25\%$). The speed rates for these categories are inversely proportional to $1.3$, $1$, and $0.7$, such that,
\begin{equation}
    \frac{\beta_{\text{slow client},m}}{1.3}=\frac{\beta_{\text{normal-speed client},m}}{1}=\frac{\beta_{\text{fast client},m}}{0.7},\;\;\forall m \in [M].\nn
\end{equation}




\section{ADDITIONAL EXPERIMENTS}
In this section, we present supplementary experiments.
\subsection{Tuning Parameter $\firstk$ of the Straggler Mitigation Technique Used for Synchronous Methods (Accepting only the First-$\firstk$ Updates)} \label{app_sec:tuning_first_k}
In our experiments, to mitigate the high straggler effect, the server in synchronous methods ($\mmsync$, $\mmbobs$, and $\mmucb$) only aggregates the first $\firstk$ client updates for each task and discards the rest, following \citep{fedsysdesign}. To tune parameter $\firstk$, we run validation experiments with $\mmsync$ on single CIFAR-10, MNIST, and Fashion-MNIST tasks and evaluate the training performance with respect to simulated time and number of global rounds. A larger $\firstk$ results in a longer simulated time per round since we wait for more clients. On the other hand, the variance in aggregated updates on each round becomes smaller since we average more updates. Therefore, the target accuracy is attained faster in terms of the number of global rounds. We also observe that keeping $\firstk$ too small yields lower final accuracy. Navigating these trade-offs, we find that $\firstk = 30$ strikes an effective balance.
\begin{figure}[H]
\vspace{.3in}
\centerline{\includegraphics[width=0.99\textwidth]{figures/cifar.pdf}}
\vspace{.3in}
\caption{Performance of $\mmsync$ with varying $\firstk$ in CIFAR-10 task. The chosen point is shown with a red star.}
\end{figure}

\begin{figure}[H]
\vspace{.3in}
\centerline{\includegraphics[width=0.99\textwidth]{figures/fmnist.pdf}}
\vspace{.3in}
\caption{Performance of $\mmsync$ with varying $\firstk$ in Fashion-MNIST task. The chosen point is shown with a red star.}
\end{figure}

\begin{figure}[H]
\vspace{.3in}
\centerline{\includegraphics[width=0.99\textwidth]{figures/mnist.pdf}}
\vspace{.3in}
\caption{Performance of $\mmsync$ with varying $\firstk$ in MNIST task. The chosen point is shown with a red star.}
\end{figure}


% \subsection{Synchronous Simultaneous Performance without Straggler Mitigation}
% We repeat the same experiment in Figure\ref{fig:comp1_3} in the main text, where we train \(3\) identical CIFAR-10 models simultaneously. We observe that the synchronous method requires more than \(5\times\) time to reach a similar accuracy level compared to the one with straggler mitigation we used for competitors in the presented results.
% \begin{figure}[H]
%     \centering
%     % \vspace{.3in}
% \centerline{\includegraphics[width=0.35\textwidth]{figures/sync_no_strag_mitig.pdf}}
%     % \vspace{.3in}
%     \caption{ The average test accuracy of \(3\) simultaneous CIFAR-10 task trained by $\mmsync$ without straggler mitigation technique we use for our competitors. \(68\%\) accuracy level is shown with a red star. Compared to Figure~\ref{fig:comp1_3}, $\mmsync$ takes more than \(5\times\) time to get this accuracy due to the exacerbated straggler effect.}\label{fig:sync_no_strag_mit}
% \end{figure}


\subsection{Test Loss Plots of Figures \ref{fig:buff_hom_acc} and \ref{fig:buff_het_acc} in the Main Text}
We illustrate test loss plots of the experiments in Figures \ref{fig:buff_hom_acc} and \ref{fig:buff_het_acc} in the main text. 

\begin{figure}[H]
    \centering
        % \vspace{.3in}
        \centerline{\includegraphics[width=0.5\textwidth]{figures/hom_both.pdf}}
        % \vspace{.3in}

    \caption{The mean final loss values of {\color{blue}$\nameofthealgorithm$ (blue)}, {\color{olive}$\nobuffer$ (olive green)} and {\color{violet}centralized training (violet)}  with varying active client ratio, when training $3$ identical models. The left figure is for CIFAR-10 dataset, while the right figure is for Fashion-MNIST dataset. With a higher number of active clients, thanks to the buffer, $\nameofthealgorithm$ maintains its performance while $\nobuffer$ gets worse.
    % Increasing the ratio of active clients significantly worsens the final loss of $\nobuffer$.
    }
    \label{fig:buff_hom}
\end{figure}

\begin{figure}[H]
    \centering
    % \vspace{.3in}
    \centerline{\includegraphics[width=0.58\textwidth]{figures/het_both.pdf}}
    % \vspace{.3in}
    \caption{The mean test loss values of $\nameofthealgorithm$ and $\nobuffer$, when simultaneously training one model for CIFAR-10 and one for Fashion-MNIST. $\nameofthealgorithm$ achieves lower and more stable loss levels. 
    %\ps{Modify the legend of centralized training to clarify that this is just the eventual loss achieved, it's not the training curve. Or remove the legend altogether and mention this in the caption.}
    }
    \label{fig:buff_het}
\end{figure}

\subsection{Training Curves of Homogeneous Experiments}
 In Figure~\ref{fig:hom_all_curves}, we provide the average training curves of the homogeneous-task experiment in Figure~\ref{fig:hom_all}.
\begin{figure}[H]
    \centering
    % \vspace{.3in}
    \centerline{\includegraphics[width=\textwidth]{figures/all_larger_3.pdf}}
    % \vspace{.3in}
    \caption{Training curves of $\nameofthealgorithm$ and $\mmsync$ on $2$/$4$/$6$ tasks with CIFAR-10, Fashion-MNIST, MNIST, and Shakespeare datasets. Time gains of $\nameofthealgorithm$ over $\mmsync$ to attain target accuracy are shown on the colored horizontal lines. Horizontal black lines indicate target accuracy levels, same as the ones stated in \Cref{table:exp_summary}. 
    % The time gain increases with the number of simultaneous tasks due to increasing stragglers of $\mmsync$. 
    %\ps{Label y-axis.}
    }
    \label{fig:hom_all_curves}
\end{figure}

\subsection{Performance across different target accuracies} \label{app_sec:diff_acc}
To see how $\nameofthealgorithm$ and the competitor $\mmsync$ work with different target accuracies, we conduct the experiment in Figure~\ref{fig:het_bar} with target accuracy levels that are \(3\%\) higher and \(10\%\) lower than those in Table~\ref{table:exp_summary}, as listed in Table~\ref{table:diff_target_acc_updated}. We observe that the proposed $\nameofthealgorithm$ reduces the overall training time by \(55.9\%\) and \(16.3\%\) for the higher and lower target accuracy levels, respectively. We observe that the advantage of $\nameofthealgorithm$ over $\mmsync$ increases with the difficulty of the task (i.e., reaching higher accuracy).

\begin{table}[H]
\centering
\caption{Different target accuracy levels used in experiments to validate the proposed methods, with lower and higher accuracy targets.}
\label{table:diff_target_acc_updated}
\begin{tabular}{|c|c|c|c|}
\hline
\textbf{Dataset} & \textbf{Lower Target Accuracy} & \textbf{Target Accuracy in the Main Text} & \textbf{Higher Target Accuracy} \\ \hline
MNIST & $83\%$ & $93\%$ & $96\%$ \\ \hline
Fashion-MNIST & $72\%$ & $82\%$ & $85\%$ \\ \hline
CIFAR-10 & $53\%$ & $63\%$ & $66\%$ \\ \hline
Shakespeare & $32\%$ & $42\%$ & $45\%$ \\ \hline
\end{tabular}
\end{table}

\begin{figure}[H]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/het_higher_fig.pdf} 
        \caption{Training curves of a single Monte Carlo run of the \textit{heterogeneous experiment with higher target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy. The setting is the same as the experiment in Figure~\ref{fig:het_bar}.}
        \label{fig:het_higher_curves}
    \end{minipage}
    \hfill
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering    \centerline{\includegraphics[width=\textwidth]{figures/het_higher_bar.pdf}}
    \caption{Mean time required to reach target accuracy and time gain of $\nameofthealgorithm$ over $\mmsync$ in \textit{the heterogeneous experiment with higher target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. The setting is the same as the experiment in Figure~\ref{fig:het_bar}.}
    \label{fig:het_higher_bar}
    \end{minipage}
\end{figure}

\begin{figure}[H]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/het_lower_fig.pdf} 
        \caption{Training curves of a single Monte Carlo run of the \textit{heterogeneous experiment with lower target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy. The setting is the same as the experiment in Figure~\ref{fig:het_bar}.}
        \label{fig:het_lower_curves}
    \end{minipage}
    \hfill
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering    \centerline{\includegraphics[width=\textwidth]{figures/het_lower_bar.pdf}}
    \caption{Mean time required to reach target accuracy and time gain of $\nameofthealgorithm$ over $\mmsync$ in \textit{the heterogeneous experiment with lower target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. The setting is the same as the experiment in Figure~\ref{fig:het_bar}.}
    \label{fig:het_lower_bar}
    \end{minipage}
\end{figure}


\subsection{Performance of $\nameofthealgorithm$ with constant uniform allocation } \label{app_sect:var_comp}
We conduct heterogeneous-task experiments to validate the performance gain of the dynamic client allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) over the static \textit{option} with uniform allocation across tasks ($\uniform$) in heterogeneous settings. To show the consistency of our results, we run experiments at all target accuracy levels in Table~\ref{table:diff_target_acc_updated}: the higher levels (Figure~\ref{fig:var_unif_comp_high_bar}), the levels used in the main text (Figure~\ref{fig:var_unif_comp_mid_bar}), and the lower levels (Figure~\ref{fig:var_unif_comp_low_bar}). We conclude that our dynamic client allocation based on the variance estimates of the updates reduces the total training time compared to the uniform static client allocation.




\begin{figure}[!h]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/var_vs_uniform_higher.pdf} 
        \caption{Training curves of a single Monte Carlo run in the experiment with dynamic client allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) and static \textit{option} with uniform client allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with higher target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_high}
    \end{minipage}
    \hfill 
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering
    \centerline{\includegraphics[width=\textwidth]{figures/var_vs_uniform_higher_bar.pdf}}
    \caption{Mean training times required to reach target accuracy and time gain of dynamic client allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) over static \textit{option} with uniform client allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with higher target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_high_bar}
    \end{minipage}
\end{figure}

\begin{figure}[!h]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/var_vs_uniform_mid.pdf} 
        \caption{Training curves of a single Monte Carlo run in the experiment with dynamic client allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) and static \textit{option} with uniform client allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with the target accuracy levels used in the main text} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_mid}
    \end{minipage}
    \hfill 
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering
    \centerline{\includegraphics[width=\textwidth]{figures/var_vs_uniform_mid_bar.pdf}}
    \caption{Mean training times required to reach target accuracy and time gain of dynamic client allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) over static \textit{option} with uniform client allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with the target accuracy levels used in the main text} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_mid_bar}
    \end{minipage}
\end{figure}


\begin{figure}[!h]
    \centering
    \begin{minipage}[c]{0.56\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/var_vs_uniform_lower.pdf} 
        \caption{Training curves of a single Monte Carlo run in the experiment with dynamic client allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) and static \textit{option} with uniform client allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with lower target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_low}
    \end{minipage}
    \hfill 
    \begin{minipage}[c]{0.43\textwidth}
        \centering
    \centering
    \centerline{\includegraphics[width=\textwidth]{figures/var_vs_uniform_lower_bar.pdf}}
    \caption{Mean training times required to reach target accuracy and time gain of dynamic client allocation \textit{option} ($\nameofthealgorithm$\texttt{(D)}) over static \textit{option} with uniform client allocation ($\uniform$). The setting is the \textit{heterogeneous experiment with lower target accuracy levels} in Table~\ref{table:diff_target_acc_updated}. Dashed vertical lines show times when tasks reach their target accuracy.}
    \label{fig:var_unif_comp_low_bar}
    \end{minipage}
\end{figure}




\section{PROOFS OF THE CONVERGENCE ANALYSIS OF $\nameofthealgorithm$ WITH STATIC \textit{OPTION} ($S$)}  \label{app_sect:main_proof}
In this section, we present the proofs of the mathematical claims made in the paper. First, we define and explain the notations used in this section. After that, we introduce intermediate lemmas used in the main proof (Section \ref{sect:intermed_lemmas}). Then, we present the proofs of Theorem~\ref{thm:main} and Corollary~\ref{cor:conv_rate} (Section \ref{sect:proofmain}). Finally, we prove intermediate lemmas (Section \ref{sect:proofs_intermediate_lemmas}).

\subsection{Notations and Definitions}
$\nameofthealgorithm$ enables us to focus on the convergence analysis of a single task within a simultaneous multi-model setting. For brevity, we provide the proofs for a single task of multiple models trained simultaneously. Therefore, we drop all model indices in our analysis. We also drop time indices from the number of active requests ($\aw$) and buffer size ($\bs$) terms as they remain the same during the training with static option of $\nameofthealgorithm$. Table~\ref{tab:notation_summary} summarizes all the notation. Please note that these analyses hold for every model $m\in[M]$ within $\nameofthealgorithm$ framework.

\subsubsection{The Update Rules of $\nameofthealgorithm$}
We first revisit the local training and global update rules of $\nameofthealgorithm$. The notation may vary slightly from that in the main paper due to dropping model indices, but it still accurately depicts the same algorithmic procedures, i.e., Algorithms~\ref{alg:cap} and \ref{alg:main} in the main text.

\paragraph{Local update rule.} During local training, clients perform $\locit$ consecutive local stochastic gradient steps and return the output to the server. When a client receives the $t$\textsuperscript{th} version of the global model, $\xt$, it takes $\locit$ mini-batch stochastic gradient descent steps (for $k=1,\dots,\locit$) with the following rule:
\begin{align}
    \xitk\gets\x_i^{(t,k-1)}-\lrc\tG\fii{\x_i^{(t,k-1)}},
\end{align}
where $\x_i^{(t,0)}\triangleq\xt$ and $\widetilde{\G}$ denotes stochastic gradients. We define the average of local stochastic gradients as $\displaystyle\delit\triangleq\frac{1}{\locit}\sum_{k=0}^{\locit-1}\tG\fii{\x_i^{(t,k)}}$. Then, the client returns $\displaystyle\frac{\xt-\x_i^{(t,\tau)}}{\locit} = \frac{\lrc}{\locit}\sum_{k=0}^{\locit-1}\tG\fii{\x_i^{(t,k)}}=\lrc\delit$ to the server. The server stores the updates in a buffer.

\paragraph{Staleness.} The server receives the updates of local training requests asynchronously. It means that the received updates may come in a different order than local training requests sent to clients. Therefore, an aggregated update may have been calculated with an older version of the model, and this is called \textit{staleness}. We quantify the staleness of an update in terms of the number of global rounds passed between the times when the server sends the local training request and receives the update. The staleness is random for each update, depending on all clients' computation and communication speeds. We denote the staleness of the update of client $i$ at the $t$\textsuperscript{th} round as $\rdit$. Recall that Assumption~\ref{assump:maxstale} (Bounded Staleness) bounds this random value from above by $\rdm$.

\paragraph{Global update rule.} On each global round $t$, when the buffer at the server, $\setB$, is full ($|\setB|=\bs$, where $\bs$ is the buffer size), the server aggregates the updates to proceed to the next global round. Here, $\setB$ is the set of clients whose updates are received after $(t-1)$\textsuperscript{th} and before $t$\textsuperscript{th} aggregation. The aggregation rule is as follows:
\begin{align} \label{line:globalupdate}
    \x^{(t+1)}\gets\xt-\locit\lrs\lrc\frac{1}{\bs}\sum_{i\in\setB}\delito&=\xt-\lrs\frac{1}{\bs}\sum_{i\in\setB}\lp\xtoi-\x_i^{(t-\rdit,\tau)}\rp\\
    &=\xt-\lrs\lrc\frac{1}{\bs}\sum_{i\in\setB}\sum_{k=0}^{\locit-1}\tG\fii{\x_i^{(t-\rdit,k)}}.\nn
\end{align}



\subsubsection{Virtual Sequence and Set Definitions\label{sect:virtual_seq_and_set_defns}}
We utilize the perturbed iterate idea from \citep{sharper,perturbedIterate}. 

First, let us introduce some helpful sets and notations. Consider $\setA$, which represents the set of clients chosen by the server to receive the $t$\textsuperscript{th} version of the model. Recall that the server in $\nameofthealgorithm$ selects the clients uniformly at random with replacement from all clients. The size of this set, $|\setA|$, is always equal to the buffer size, $\bs$, (except initialization, $t=0$) because $\bs$ new local training requests are made on each round. For instance, if $\bs$ is set to $3$, and the server selects the 2\textsuperscript{nd}, 16\textsuperscript{th}, and 31\textsuperscript{st} clients during the 4\textsuperscript{th} aggregation round, then $\setAt{4}$ would be $\{2, 16, 31\}$. The server sends $\x^{(4)}$ and requests local training with this model. In practical terms, $\setA$ is a multiset, allowing multiple occurrences of the same client if a client is selected more than once. Throughout the proof, we consider each occurrence of the same client in multiset as a distinct update calculated on that particular client. While we acknowledge a slight abuse of notation, this does not lead to any mathematical flaw, and we believe that this significantly enhances the clarity and comprehensibility of the proof.


%For example, $\setAt{4}$ would be $\{2,16,31\}$ if the buffer size, $\bs$, is $3$, and the server selects  $2$\textsuperscript{nd}, $16$\textsuperscript{th}, and $31$\textsuperscript{st} clients at the $4$\textsuperscript{th} aggregation round. Then the server sends $\x^{(4)}$ to these clients for their local trainings. Note that this selection is uniformly at random over all online clients with replacement. Actually, it is a multiset that allows the multiple occurrences of the same client if a client is selected more than once. Multiple occurrences will represent different updates calculated on that client. Here, we abuse the notation a bit but we strongly believe that this   makes the the proof easily  understandable. 

Now, let us define $\setC$ as the set of clients that have incomplete local training requests at the time of the $t$\textsuperscript{th} aggregation because of the asynchronous nature of $\nameofthealgorithm$. The size of this set, $|\setC|$, is always equal to the number of active local training requests, $\aw$, (except at initialization, $t=0$) because the server sends a new local training request for every update it receives. For instance, if $\aw$ is $4$, and the server has sent local training requests to the 12\textsuperscript{th}, 27\textsuperscript{th}, 41\textsuperscript{st}, and 55\textsuperscript{th} clients prior to the 5\textsuperscript{th} aggregation, yet these clients are still processing their updates, then $\setCt{5}$ would be $\{12, 27, 41, 55\}$. Note that $\setCt{0}$ is an empty set, as there are no active local training requests before the algorithm starts. It's worth noting that $\setC$ is a multiset, allowing multiple occurrences of the same client if a client has more than one active local training request (recall that requests are queued at the client side). Each occurrence of a client within this multiset represents a different local training calculated on that client. We again acknowledge a slight abuse of notation, but this does not lead to any mathematical flaw, and we believe that this makes the proof considerably easier to follow.

% Now, let $\setC$ be the set of clients who have uncompleted local training requests at the time of $t$\textsuperscript{th} aggregation due to asynchronous nature of our algorithm. The size of this set, $|\setC|$ is always equal to the number of active local training requests, $\aw$ (except the end of the training process). For example, $\setCt{5}$ would be be $\{12,27,41,55\}$ if $\aw$ is $4$, and the server sent local training requests to $12$\textsuperscript{nd}, $27$\textsuperscript{th}, $41$\textsuperscript{st}, and $55$\textsuperscript{th} clients before the $5$\textsuperscript{th} aggregation happens but those clients still work on their updates. Actually, it is a multiset that allows the multiple occurrences of the same client if a client has more than one active local training request. Recall that the multiple local training requests are queued in clients buffer. Multiple occurrences will represent different updates calculated on that client. Here, we abuse the notation a bit but we strongly believe that this  makes the proof easily  understandable. 

Next, we define the virtual sequence $\vst$ for $t=0,1,\dots, T$ as the model that receives local training updates of the global model, $\xt$ for $t=0,1,\dots, T$, in the correct order. Namely, unlike $\xt$, $\vst$ receives the local training updates in the order in which the server sends those requests. However, it is crucial to note that the local training updates are still calculated with the global model, $\xt$. The update rule of the virtual sequence is:
\begin{align}\label{line:vsupdate}
    \vstp\gets\vst-\locit\lrs\lrc\frac{1}{\bs}\sumA\delit=\vst-\locit\lrs\lrc\frac{1}{\bs}\sumA\frac{1}{\locit}\sumLock\tG\fii{\xitk}, 
\end{align}
for $t=0,1,\dots,T-1$ where $\vs^{(0)}\triangleq\x^{(0)}.$

\begin{remark} \label{obs:diff_z_x}Now, we state an observation using the definitions of $\setC$, the virtual sequence, and the global model. When the $t$\textsuperscript{th} aggregation happens at the server, the virtual sequence, $\vst$, has received all the updates from all previous local training requests on rounds $0,1,\dots,t-1$. At the same time, the global model, $\xt$, has received the same updates except for the updates of clients in $\setC$. By the update rules in (\ref{line:globalupdate}) and (\ref{line:vsupdate}), note that each received update at the server contributes to the global model and virtual sequence equally. Therefore, we can express their difference as:
\begin{align}
    \vst-\xt=-\locit\lrs\lrc\frac{1}{\bs}\sumC\delito.
\end{align}
\end{remark}



\begin{remark} \label{obs:counting_max_updates}
    If we count the number of occurrences of any round index $y$ among all the round indices at which the clients in $\setC$ are requested to perform local training ($t-\rdit$ for $i\in\setC$) over all rounds $t=0,\dots,T-1$, we can bound it as:
    \begin{align} \label{line:obs2}
    \sumtel\sumC\mathbf{1}\{t-\rdit=y\}\leq\bs\rdm,\quad\forall y=0,\dots,T-1,
    \end{align}
    where $\mathbf{1}$ is an indicator function that returns $1$ if the statement is true, and returns $0$ otherwise. The reasoning for this observation is as follows. On each round, the server selects $\bs$ clients (since the server selects a new client for each received and buffered update where the buffer size is $\bs$) and sends them the up-to-date global model. We also know that all local training requests must be returned to the server within $\rdm$ rounds by Assumption~\ref{assump:maxstale} (Bounded Staleness). Therefore, over the rounds $t=0,\dots,T-1$, any round index can appear at most $\bs\rdm$ times in the summation on the left-hand side of the inequality in (\ref{line:obs2}). We will use this remark later in the proof.
\end{remark}

% \begin{remark}
%     When the norms of the gradients of global models that the clients in $\setC$ use are averaged over the rounds $t=0,\dots,T-1$, the obtained expression can be bounded as: 
%     % The sum of the norms of the global model's gradients calculated with the stale version that clients in $\setC$ receive over the rounds $t= 0,\dots,T-1$ can be bounded as:
%     \begin{align} \label{line:obs2}
%     \avgtelm\sumC\normbs{\G\f{\xtoi}}\leq\bs\rdm\avgtelm\normbs{\G\f{\xt}}.
%     \end{align}
    
%     The reasoning for this observation is as follows. On each round, the server selects $\bs$ clients (except for $t=0$, but $\mathcal{C}^{(0)}$ is already an empty set) and sends them the up-to-date global model. We also know that all local training requests must be returned to the server within $\rdm$ rounds by Assumption 4 (Bounded Staleness). Therefore, over the rounds $t=0,\dots,T-1$, the gradient's norm of any global model version $t\in\{0,1,\dots,T-1\}$ can appear at most $\bs\times\rdm$ times in the summation in the left-hand side of the inequality in (\ref{line:obs2}). We will use this remark later in the proof.
% \end{remark}

\subsubsection{Notation}
We define some useful variables used in the proof and recall the notation used in $\nameofthealgorithm$ in Table~\ref{tab:notation_summary}. Also, we again remind the reader that we dropped all model indices in the proof, as the theoretical results we present here hold for any of the multiple tasks trained simultaneously, satisfying Assumptions~\ref{assump:smoothness}--\ref{assump:maxstale}.


% Please add the following required packages to your document preamble:
% \usepackage{graphicx}
\renewcommand{\arraystretch}{1.7} % Change 1.5 to whatever factor you want
% Please add the following required packages to your document preamble:
% \usepackage{graphicx}
\begin{table}[H]
\centering
\caption{Summary of notations used in the mathematical analysis of $\nameofthealgorithm$.}
\label{tab:notation_summary}
\resizebox{\columnwidth}{!}{%
\begin{tabular}{|l|l|}
\hline
$\fii{\cdot}$: The loss function at client $i$ &
  $\Li$: Smoothness constant in Assumption \ref{assump:smoothness} \\ \hline
$\f{\cdot}$: The global loss function &
  $\locit$: Number of local SGD steps \\ \hline
$\xt$: The global model at the $t^{\text{th}}$ round &
  $\lrc$: Client-side learning rate \\ \hline
\begin{tabular}[c]{@{}l@{}}$\xitk$: The local model of client $i$ at the $k^{\text{th}}$ local step of the\\ $t^{\text{th}}$ round\end{tabular} &
  $\lrs$: Server-side learning rate \\ \hline
$\vst$: The virtual sequence at the $t^{\text{th}}$ round (Section~\ref{sect:virtual_seq_and_set_defns})&
  $\bs$: Buffer size \\ \hline
$\nabla, \Tilde{\nabla}$: Gradient and stochastic gradient operators &
  $\lhets$: Maximum local variance in Assumption~\ref{assump:lochet} \\ \hline
\begin{tabular}[c]{@{}l@{}}$\sgitk=\tG\fii{\xitk}$: Local stochastic gradient of client $i$\\  at round $t$ and local step $k$\end{tabular} &
  $\ghets$: Maximum global variance in Assumption~\ref{assump:globhet} \\ \hline
$\displaystyle\delit=\frac{1}{\locit}\sumLock\sgitk$: The update of client $i$ at round $t$ &
  $\aw$: Total number of active local training requests at any time \\ \hline
$\edelit=\expb{\delit}$: The expected update of client $i$ at round $t$ &
  $\rdm$: Maximum staleness in Assumption~\ref{assump:maxstale} \\ \hline
\begin{tabular}[c]{@{}l@{}}$\lrst=\lrs\locit$: Server learning rate multiplied by the number \\ of local training steps\end{tabular} &
  $\rdit$: The staleness of client $i$'s update at round $t$ \\ \hline
\begin{tabular}[c]{@{}l@{}}$\setA$: The set of clients to which the server sends\\the $t^{\text{th}}$ version of the model (Section~\ref{sect:virtual_seq_and_set_defns})\end{tabular} &
  \begin{tabular}[c]{@{}l@{}}$\setC$: The set of clients which are requested local training, but\\ have not returned their updates to the server yet (Section~\ref{sect:virtual_seq_and_set_defns})\end{tabular} \\ \hline
\end{tabular}%
}
\end{table}
\renewcommand{\arraystretch}{1} % Change 1.5 to whatever factor you want



% \begin{table}[]
% \centering
% \caption{Summary of notations used in the mathematical analysis of $\nameofthealgorithm$.}
% \label{tab:notation_summary}
% \resizebox{\columnwidth}{!}{%
% \begin{tabular}{l|l}
% $\fii{\cdot}$: The loss function at client $i$ &
%   $\Li$: Smoothness constant in Assumption \ref{assump:smoothness} \\
% $\f{\cdot}$: The global loss function &
%   $\locit$: Number of local SGD steps \\
% $\xt$: The global model at $t^{\text{th}}$ round &
%   $\lrc$: Client-side learning rate \\
% \begin{tabular}[c]{@{}l@{}}$\xitk$: The local model of client $i$ at \\ $t^{\text{th}}$ round and $k^{\text{th}}$ local step\end{tabular} &
%   $\lrs$: Server-side learning rate \\
% $\vst$: The virtual model at $t^{\text{th}}$ round &
%   $\bs$: Buffer size \\
% $\nabla, \Tilde{\nabla}$: Gradient and stochastic gradient operators &
%   $\lhets$: Maximum local variance in Assumption~\ref{assump:lochet} \\
% \begin{tabular}[c]{@{}l@{}}$\sgitk=\tG\fii{\xitk}$: Local stochastic gradient of client $i$\\  at round t on local step $k$\end{tabular} &
%   $\ghets$: Maximum global variance in Assumption~\ref{assump:globhet} \\
% $\delit=\frac{1}{\locit}\sumLock\sgitk$: The update of client $i$ at round t &
%   $\aw$: Number of total active local training requests anytime \\
% $\edelit=\expb{\delit}$: The expected update of client $i$ at round t &
%   $\rdm$: Maximum staleness in Assumption 4 \\
% \begin{tabular}[c]{@{}l@{}}$\lrst=\lrs\locit$: Server learning rate multiplied\\ by the number of local training steps\end{tabular} &
%   $\rdit$: The staleness of client $i$'s update at round $t$ \\
% \begin{tabular}[c]{@{}l@{}}$\setA$: The set of clients to which the server \\ sends $t^{\text{th}}$ version of the model\end{tabular} &
%   \begin{tabular}[c]{@{}l@{}}$\setC$: The set of clients which are requested local training, \\ but not return their updates to the server\end{tabular}
% \end{tabular}%
% }
% \end{table}


% {\allowdisplaybreaks 
% \begin{align}
% &\fii{\cdot}\text{: The loss function at client $i$}&&\Li\text{: Smoothness constant in Assumption \ref{assump:smoothness}}\nn\\
% &\f{\cdot}\text{: The global loss function}&&\locit\text{: Number of local SGD steps}\nn\\
% &\xt\text{: The global model at $t$\textsuperscript{th} round}&&\lrc\text{: Client-side learning rate}\nn\\
% &\xitk\text{: The local model of client $i$ at $t$\textsuperscript{th} round and $k$\textsuperscript{th} local step}&&\lrs\text{: Server-side learning rate}\nn\\
% &\vst\text{: The virtual model at $t$\textsuperscript{th} round}&&\bs\text{: Buffer size}\nn\\
% &\nabla, \Tilde{\nabla}\text{: Gradient and stochastic gradient operators}&&\lhets\text{: Maximum local variance in Assumption~\ref{assump:lochet}}\nn\\
% &\sgitk=\tG\fii{\xitk}\text{: Local stochastic gradient}&&\ghets\text{: Maximum global variance in Assumption~\ref{assump:globhet}}\nn\\
% &\delit=\frac{1}{\locit}\sumLock\sgitk\text{: The update of client $i$ at round t}&&\aw\text{: Number of total active local training requests anytime}\nn\\
% &\edelit=\expb{\delit}\text{: The expected update of client $i$ at round t}&&\rdm\text{: Maximum staleness in Assumption 4}\nn\\
% &\lrst=\lrs\locit\text{: Server learning rate multiplied with number of local steps}&&\rdit\text{: The staleness of client $i$'s update at round $t$}\nn\\
% &\setA\text{: The set of clients to which}&&\setC\text{: The set of clients which are requested local }\nn\\
% &\text{the server sends $t$\textsuperscript{th} version of the model}&&\text{trainings, but not returned to the server back}\nn
% \end{align}
% }


\subsection{Intermediate Lemmas}\label{sect:intermed_lemmas}

We present the intermediate lemmas used throughout the proof.

{\allowdisplaybreaks \begin{lemma} Suppose that $\fii{\cdot}$ satisfies Assumption \ref{assump:smoothness} (Smoothness) and Assumption~\ref{assump:lochet} (Bounded Variance) for all $i\in[N]$, then the iterates of $\nameofthealgorithm$ satisfy,
\begin{align}
    \expns{\G\fii{\xt}-\edelit}\leq\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\fii{\xt}},\:\:\forall i\in[N],\nn
\end{align}
where $\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$.

Further, suppose Assumption~\ref{assump:globhet} (Bounded Heterogeneity) holds. Then, the iterates of $\nameofthealgorithm$ satisfy,
{\allowdisplaybreaks \begin{align}
    \frac{1}{\N}\sumAll\expns{\edelit-\G\fii{\xt}}\leq\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\ghets.\nn
\end{align}}
\label{lemma:L1_1}
\end{lemma}

\begin{remark}
     The true gradient at any client using the global model is close to the local update of that client.
\end{remark}}


{\allowdisplaybreaks \begin{lemma} The iterates of $\nameofthealgorithm$ and defined virtual sequence satisfy,
%The negative inner product of the global loss function's gradient at virtual sequence and the population mean of all expected updates is bounded.
\begin{equation}
    \T{1}\triangleq-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAll\edelit}\leq-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}}+\frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}.\nn
\end{equation}
\label{lemma:T1}
\end{lemma}}


{\allowdisplaybreaks \begin{lemma} Suppose that $\fii{\cdot}$ satisfies Assumption~\ref{assump:lochet} (Bounded Variance and Unbiased Stochastic Gradients) for all $i\in[N]$, then the iterates of $\nameofthealgorithm$ satisfy, 
{\allowdisplaybreaks \begin{align}
    \T{2}\triangleq\expns{\frac{1}{\bs}\sumA\delit}\leq\expns{\frac{1}{\bs}\sumA\edelit}+\frac{\lhets}{\locit\bs}.\nn
\end{align}}
\label{lemma:T2}
\end{lemma}}


\begin{remark}
The noisy global update due to stochastic gradients is close to the expected update calculated with full gradients. The buffer and multiple local steps are useful to reduce the variance due to local SGD steps. 
\end{remark}

{\allowdisplaybreaks \begin{lemma} The iterates of $\nameofthealgorithm$ satisfy, 
%\ps{This description should come after the lemma statement as an explanation. It does not belong in the lemma statement.}
\begin{align}
    \expns{\frac{1}{\bs}\sumAP{\G\fii{\xt}-\G\f{\xt}}}=\frac{1}{\bs\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}}.\nn
\end{align}
Further, suppose Assumption~\ref{assump:globhet} (Bounded Heterogeneity) holds. Then, the iterates of $\nameofthealgorithm$ also satisfy,
{\allowdisplaybreaks \begin{align}
    \T{3}\triangleq\expns{\frac{1}{\bs}\sumA\edelit}\leq\frac{3}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{3\ghets}{\bs}+3\expns{\G\f{\xt}}.\nn
\end{align}}
\label{lemma:clientselection_and_T3}
\end{lemma}}

\begin{remark}
 $\nameofthealgorithm$ benefits from a reduction of the global variance thanks to the buffer.
\end{remark}


{\allowdisplaybreaks \begin{lemma} The virtual sequence and the iterates of $\nameofthealgorithm$ satisfy,
    {\allowdisplaybreaks \begin{align}
        \avgtelm\expns{\G\f{\vst}-\G\f{\xt}}&\leq\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets\nn\\&+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}.\nn
    \end{align}}
    \label{lemma:sequence_diff}
\end{lemma}}
\begin{remark}
As discussed in Remark~\ref{obs:diff_z_x}, although the virtual sequence and global model get updates in a different order, they receive the same updates. Therefore, we can bound their difference.
\end{remark}




\subsection{Proofs of Main Statements}\label{sect:proofmain}
We present and prove Theorem 1 and Corollary 1 here.
\subsubsection{Theorem 1 (Convergence bound)}
First we restate the theorem:

\textbf{Theorem 1. \textit{(Convergence bound):}}
\textit{Suppose Assumptions \ref{assump:smoothness} - \ref{assump:maxstale} hold, there are $\aw$ active local training requests, and the server and client learning rates, $\lrs, \lrc$ respectively, satisfy $\lrs\leq\sqrt{\locit\bs}$ and $\lrc\leq\min\lcb\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, where $\bs$ is the buffer size, and $\locit$ is the number of local training steps. Then, the iterations of Algorithm~\ref{alg:main} ($\nameofthealgorithm$) satisfy:}
\begin{align*}
    \avgtelm & \expns{\G\f{\xt}} \leq \bOP{\frac{\f{\x^{(0)}} - \min_\x \f{\x}}{T\lr\locit}}+\bOP{\lp\frac{\Li\lr}{\bs}+\Li^2\lrc^2\locit+\frac{\Li^2\lrsq\locit\aw}{\bs^2}\rp\lhets}\nn\\&+\bOP{\lp\frac{\Li\lr\locit}{\bs}+\Li^2\lrc^2\locit\lp\locit-1\rp+\frac{\Li^2\lrsq\locit^2\aw^2}{\bs^2}\rp\ghets}\nn.
\end{align*}


\textit{Proof.}
Using the update rule of the virtual sequence (\ref{line:vsupdate}) and Assumption ($\asmpt$) \ref{assump:smoothness} (Smoothness), and taking the conditional expectation with respect to $\vst$, we have,
{\allowdisplaybreaks \begin{align}
    \expb{\f{\vstp}}&\leq\f{\vst}+\inp{\G\f{\vst}}{\expb{\vstp-\vst}}+\frac{\Li}{2}\expns{\vstp-\vst}\nn
    \\
    &= \f{\vst}+\inp{\G\f{\vst}}{\expb{-\lrst\lrc\frac{1}{\bs}\sumA\delit}}+\frac{\Li}{2}\expns{\lrst\lrc\frac{1}{\bs}\sumA\delit}\nn\\
     & \overset{\substack{\asmpt~\ref{assump:lochet}}}{=} \f{\vst}-\lrst\lrc\frac{1}{\bs}\inp{\G\f{\vst}}{\expb{\sumA\edelit}}+\frac{\Li}{2}\lrst^2\lrc^2\expns{\frac{1}{\bs}\sumA\delit}\nn\\
     % &= \f{\vst}-\lrst\lrc\expb{\inp{\G\f{\vst}}{\frac{1}{\N}\sumAll\edelit}}+\frac{\Li}{2}\lrst^2\lrc^2\expns{\frac{1}{\bs}\sumA\delit}\nn \tag{} \\
     &\overset{\substack{ \textit{Uniform}\\\textit{Client Selection}}}{=} \f{\vst}+\lrst\lrc \expb{ \underbrace{-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAll\edelit}}_{\triangleq\T{1}}}+\frac{\Li}{2}\lrst^2\lrc^2\underbrace{\expns{\frac{1}{\bs}\sumA\delit}}_{\triangleq\T{2}}\nn.
\end{align}}
Substituting the bounds on $\T{1}$ and $\T{2}$ from Lemmas~\ref{lemma:T1} and~\ref{lemma:T2}, applying Jensen's inequality to the last term of the bound in Lemma~\ref{lemma:T1}, and dividing both sides by $\lrst\lrc$, we obtain:
{\allowdisplaybreaks \begin{align}
    % &\frac{\expb{\f{\vstp}}-\f{\vst}}{\lrt} \nn \\
    % & \leq-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}} + \frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}} \nn\\
    % & \quad +\frac{\Li\lrt}{2}\lp\expns{\frac{1}{\bs}\sumA\edelit}+\frac{\lhets}{\locit\bs}\rp \nn  \\
    &\frac{\expb{\f{\vstp}}-\f{\vst}}{\lrt}\leq-\frac{1}{2}\expns{\G\f{\xt}}+\frac{1}{2}\expns{\G\f{\vst}-\G\f{\xt}}  \nn \\
    & \quad+ \frac{1}{2\N}\sumAllP{\expns{\edelit-\G\fii{\xt}}} +\frac{\Li\lrt}{2}\underbrace{\expns{\frac{1}{\bs}\sumA\edelit}}_{\triangleq\T{3}} + \frac{\Li\lrt}{2} \frac{\lhets}{\locit\bs}. \nn% \tag{\ps{expectations missing in this eq.}}
\end{align}}
Using Lemma~\ref{lemma:clientselection_and_T3}, we can put $\T{3}$ back into the proof:
{\allowdisplaybreaks \begin{align}
    &\frac{\expb{\f{\vstp}}-\f{\vst}}{\lrt} \nn \\
    & \overset{\textit{Lemma}~\ref{lemma:clientselection_and_T3}}{\leq} -\frac{1}{2}\expns{\G\f{\xt}}+\frac{1}{2} \expns{\G\f{\vst}-\G\f{\xt}}+\frac{1}{2\N}\sumAllP{\expns{\edelit-\G\fii{\xt}}}\nn\\
    & \quad +\Li\lrt\lp \frac{3}{2\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{3\ghets}{2\bs}+\frac{3}{2}\expns{\G\f{\xt}} +\frac{\lhets}{2\locit\bs}\rp\nn\\
    &=\lp-\frac{1}{2}+\frac{3\Li\lrt}{2}\rp\expns{\G\f{\xt}}+\frac{1}{2} \mbe \normbs{\G\f{\vst}-\G\f{\xt}}+\Li\lrt\lp\frac{3\ghets}{2\bs}+\frac{\lhets}{2\locit\bs}\rp\nn\\
    & \quad +\lp\frac{3\Li\lrt}{2}+\frac{1}{2}\rp\frac{1}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}\nn\\
    &\overset{\textit{Lemma}~\ref{lemma:L1_1}}{\leq}\lp-\frac{1}{2}+\frac{3\Li\lrt}{2}\rp\expns{\G\f{\xt}}+\frac{1}{2} \mbe \normbs{\G\f{\vst}-\G\f{\xt}}+\Li\lrt\lp\frac{3\ghets}{2\bs}+\frac{\lhets}{2\locit\bs}\rp\nn\\
    & \quad +\lp\frac{3\Li\lrt}{2}+\frac{1}{2}\rp\lp\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\ghets\rp\tag{$\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$}\\
    &=\lp-\frac{1}{2}+\frac{3\Li\lrt}{2}+\frac{\dd}{2\lp1-\dd\rp}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}\rp\expns{\G\f{\xt}}+\frac{1}{2} \mbe \normbs{\G\f{\vst}-\G\f{\xt}}\nn\\
    & \quad +\lp\frac{\Li\lrt}{2\locit\bs}+\frac{3\Li^3\lrc^3\lrst\locit}{4\lp1-\dd\rp}+\frac{\Li^2\lrc^2\locit}{4\lp1-\dd\rp}\rp\lhets+\lp\frac{3\Li\lrt}{2\bs}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}+\frac{\dd}{2\lp1-\dd\rp}\rp\ghets\nn,
\end{align}}
where $\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$. Telescoping the inequality over the round indices $t=0,1,\dots, T-1$, and using Lemma \ref{lemma:sequence_diff}, we get,
{\allowdisplaybreaks \begin{align}
    &\avgtelPm{\frac{1}{2}-\frac{3\Li\lrt}{2}-\frac{\dd}{2\lp1-\dd\rp}-\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}}\expns{\G\f{\xt}}\leq
    \frac{1}{2T}\sumtel\expns{\G\f{\vst}-\G\f{\xt}}\nn\\&+\frac{\f{\vs^{(0)}}-\expb{\f{\vs^{\lp T\rp}}}}{T\lrt}+\lp\frac{\Li\lrt}{2\locit\bs}+\frac{3\Li^3\lrc^3\lrst\locit}{4\lp1-\dd\rp}+\frac{\Li^2\lrc^2\locit}{4\lp1-\dd\rp}\rp\lhets+\lp\frac{3\Li\lrt}{2\bs}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}+\frac{\dd}{2\lp1-\dd\rp}\rp\ghets\nn\\
    &\leq\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{2\bs^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{2\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{2\bs}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{\f{\vs^{(0)}}-\expb{\f{\vs^{\lp T\rp}}}}{T\lrt}+\lp\frac{\Li\lrt}{2\locit\bs}+\frac{3\Li^3\lrc^3\lrst\locit}{4\lp1-\dd\rp}+\frac{\Li^2\lrc^2\locit}{4\lp1-\dd\rp}\rp\lhets+\lp\frac{3\Li\lrt}{2\bs}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}+\frac{\dd}{2\lp1-\dd\rp}\rp\ghets\nn.
\end{align}}

Suppose the learning rates satisfy $\lrs\leq\sqrt{\locit\bs}$ (which also makes $\lrst\leq\locit\sqrt{\locit\bs}$) and $\lrc\leq\min\lcb\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$. Then, the following inequality holds:
% https://www.wolframalpha.com/input?i=.5-3%2F12-1%2F30-1%2F60-51%2F480

{\allowdisplaybreaks \begin{align}
\frac{1}{2}-\frac{3\Li\lrt}{2}-\frac{\dd}{2\lp1-\dd\rp}-\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}-\frac{3\Li^2\lrsqt\aw\rdm}{2\bs}\frac{1+\dd}{1-\dd}\geq\agrc.
\label{proof:numeric_inequality}
\end{align}}

Also, notice that $\vs^{(0)}$ is equal to $\x^{(0)}$ by definitions (Section~\ref{sect:virtual_seq_and_set_defns}) of these sequences and $\min_\x \f{\x}\leq \f{\vs^{\lp T\rp}}$.
{\allowdisplaybreaks 
\begin{align}
    & \avgtelm \expns{\G \f{\xt}} \leq \agrci \frac{\f{\x^{(0)}} - \min_\x \f{\x}}{T\lrt}\tag{Using \eqref{proof:numeric_inequality}}\\
    &+\agrci\lp\frac{\Li\lrt}{2\locit\bs}+\frac{3\Li^3\lrc^3\lrst\locit}{4\lp1-\dd\rp}+\frac{\Li^2\lrc^2\locit}{4\lp1-\dd\rp}+\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{2\bs^2\locit}\rp\lhets\nn\\
    &+\agrci\lp \frac{3\Li\lrt}{2\bs}+\frac{3\Li\lrt\dd}{2\lp1-\dd\rp}+\frac{\dd}{2\lp1-\dd\rp} + \frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{2\bs^2} \rp\ghets\nn.
\end{align}}

Define $\gapTerm \triangleq \f{\x^{(0)}} - \min_\x \f{\x}$. After reducing high-order terms using the assumptions, $\lrs\leq\sqrt{\locit\bs}$ (which also makes $\lrst\leq\locit\sqrt{\locit\bs}$) and $\lrc\leq\min\lcb\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, and incorporating the constants into the $\mco(\cdot)$ notation, we have:
{\allowdisplaybreaks \begin{align}
    \avgtelm & \expns{\G\f{\xt}} \leq \bOP{\frac{\gapTerm}{T\lr\locit}}+\bOP{\lp\frac{\Li\lr}{\bs}+\Li^2\lrc^2\locit+\frac{\Li^2\lrsq\locit\aw}{\bs^2}\rp\lhets}\nn\\&+\bOP{\lp\frac{\Li\lr\locit}{\bs}+\Li^2\lrc^2\locit\lp\locit-1\rp+\frac{\Li^2\lrsq\locit^2\aw^2}{\bs^2}\rp\ghets}\nn.
\end{align}}
This concludes the proof.
%\ps{This looks correct.}

\subsubsection{Proof of Corollary 1 (Convergence Rate)}
% Remind that the only assumptions we used throughout the proof are $\lrs\leq\sqrt{\bs}$ and $\lrc\leq\min\lcb\frac{1}{4\Li\locit},\frac{1}{8\Li\sqrt{\bs}},\frac{1}{4\Li\sqrt{\aw\rdm}}\rcb$.
First, notice that the learning rates $\lrs=\sqrt{\locit\bs}$ and $\lrc=\min\lcb\frac{1}{\locit\Li \sqrt{T}},\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$ satisfy the assumptions ($\lrs\leq\sqrt{\locit\bs}$ and $\lrc\leq\min\lcb\frac{1}{6\Li\locit\sqrt{\locit\bs}},\frac{1}{4\Li\locit\sqrt{\locit\aw\rdm}}\rcb$) used throughout the proof.


When $T\geq\max\lcb36\bs\locit,16\locit\aw\rdm\rcb$, the first term in the minimum above is the smallest, so the learning rates become $\lrs=\sqrt{\locit\bs}$ and $\lrc=\frac{1}{\locit\Li \sqrt{T}}$. Then, the bound in Theorem~\ref{theorem:main} reduces to:
{\allowdisplaybreaks \begin{align}
    \avgtelm&\expns{\G\f{\xt}}\leq\bOP{\frac{\Li}{\sqrt{T\bs\locit}}}\gapTerm+\bOP{\frac{1}{\sqrt{T\bs\locit}}+\frac{1}{\locit T}+\frac{\aw}{T\bs}}\lhets+\bOP{\sqrt{\frac{\locit}{T\bs}}+\frac{1}{T}+\frac{\locit\aw^2}{T\bs}}\ghets\nn.
\end{align}}

\subsection{Proofs of Intermediate Lemmas}
\label{sect:proofs_intermediate_lemmas}
\textit{Proof of Lemma \ref{lemma:L1_1}.} 
We borrow the proof technique from \citet{fednova}.
{\allowdisplaybreaks \begin{align}
    &\expns{\G\fii{\xt}-\edelit}=\expns{\G\fii{\xt}-\frac{1}{\locit}\sumLock\G\fii{\xitk}}
    % =\expns{\frac{1}{\locit}\sumLockP{\G\fii{\xt}-\G\fii{\xitk}}}
    \nn\\
    &\leq\frac{1}{\locit}\sumLocklims{1}{-1}\expns{\G\fii{\xt}-\G\fii{\xitk}}\tag{Using $\norm{\sum_{i=1}^n x_i}^2 \leq n \sum_{i=1}^n \norm{x_i}^2 $}\\
    &\leq\frac{\Li^2}{\locit}\underbrace{\sumLocklims{1}{-1}\expns{\xt-\xitk}}_{\triangleq\T{recursive}}=\frac{\Li^2\lrc^2}{\locit}\sumLocklims{1}{-1}\expns{\sumvlims{0}{-1}\sgitv}\label{lemma1_rec_line1}\\
    % &\leq\frac{\Li^2\lrc^2}{\locit}\sumLocklims{1}{-1}\expns{\sumvlimsP{0}{-1}{\sgitv-\G\fii{\xitv}+\G\fii{\xitv}}}\nn\\
    &=\frac{\Li^2\lrc^2}{\locit}\sumLocklimsP{1}{-1}{\sumvlims{0}{-1}\expns{\sgitv-\G\fii{\xitv}}+\expns{\sumvlims{0}{-1}\G\fii{\xitv}}}\tag{Using Assumption~\ref{assump:lochet}}
    %\tag{\ps{refer to local unbiased assumption here}}
    \\
    &\leq\frac{\Li^2\lrc^2}{\locit}\sumLocklims{1}{-1}\sumvlimsP{0}{-1}{\expns{\sgitv-\G\fii{\xitv}} +\kk\expns{\G\fii{\xitv}}}\nn\\
    &\leq\frac{\Li^2\lrc^2}{\locit}\sumLocklimsP{1}
{-1}{\kk\lhets+\kk\sumvlims{0}{-1}\expns{\G\fii{\xitv}}}\nn\\
    &\leq\frac{\Li^2\lrc^2}{\locit}\lp\frac{\lp\locit-1\rp\locit}{2}\lhets+\frac{\lp\locit-1\rp\locit}{2}\sumLocklims{0}{-2}\expns{\G\fii\xitk}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+\sumLocklims{0}{-2}\expns{\G\fii\xitk}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+\sumLocklimsP{0}{-2}{2\expns{\G\fii\xitk-\G\fii\xt}+2\expns{\G\fii\xt}}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+\sumLocklimsP{0}{-2}{2\Li^2\expns{\xitk-\xt}+2\expns{\G\fii\xt}}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+\sumLocklimsP{1}{-1}{2\Li^2\expns{\xitk-\xt}+2\expns{\G\fii\xt}}\rp\nn\\
    &\leq\lrc^2\Li^2\frac{\locit-1}{2}\lp\lhets+2\Li^2\T{recursive} + 2 \locit \expns{\G\fii\xt}\rp\label{lemma1_rec_line2}.
\end{align}}
Using the recursive appearances of $\T{recursive}$ in \eqref{lemma1_rec_line1} and \eqref{lemma1_rec_line2}:
{\allowdisplaybreaks \begin{align}
    \frac{\T{recursive}}{\locit}&=\frac{1}{\locit}\sumLocklims{1}{-1}\expns{\xitk-\xt}\leq\lrc^2\frac{\locit-1}{2}\lhets+\lrc^2\locit\lp\locit-1\rp\expns{\G\fii{\xt}}+\lrc^2\Li^2\lp\locit-1\rp\T{recursive}\nn.
\end{align}}
Arranging the terms and defining $\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$,
{\allowdisplaybreaks \begin{align}
    \expns{\G\fii{\xt}-\edelit}&\leq\frac{\Li^2\T{recursive}}{\locit}\leq\frac{\Li^2\lrc^2\lp\locit-1\rp\lhets/2+\Li^2\lrc^2\locit\lp\locit-1\rp\expns{\G\fii\xt}}{1-\Li^2\lrc^2\locit\lp\locit-1\rp}\nn\\
    &\leq\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\fii{\xt}},\:\:\forall i\in[N].\nn
\end{align}}
This proves the first part of Lemma~\ref{lemma:L1_1}. Now, averaging it across clients:
{\allowdisplaybreaks \begin{align}
    &\frac{1}{\N}\sumAll\expns{\edelit-\G\fii{\xt}}\leq\frac{1}{\N}\sumAllP{\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\fii{\xt}}}\nn\\
    &= \frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\frac{1}{\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}+\G\f{\xt}}\nn\\
    &= \frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\frac{1}{\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}}\nn\\&+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\frac{2}{\N}\sumAll\expb{\inp{\G\fii{\xt}-\G\f{\xt}}{\G\f{\xt}}}\nn\\
    &\leq\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\ghets.\tag{Using Assumption~\ref{assump:globhet}}
\end{align}}
This concludes the proof of Lemma \ref{lemma:L1_1}. 
%\ps{Looks good.}

\textit{Proof of Lemma~\ref{lemma:T1}.}
{\allowdisplaybreaks \begin{align}
    \T{1}&\triangleq-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAll\edelit}=-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}+\G\f{\xt}}}\nn\\
    &=-\inp{\G\f{\vst}}{\G\f{\xt}}-\inp{\G\f{\vst}}{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}\nn\\
    &=-\frac{1}{2}\normbs{\G\f{\vst}}-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}}-\frac{1}{2}\normbs{\G\f{\vst}}\nn\\
    &-\frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}+\frac{1}{2}\normbs{\G\f{\vst}-\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}\nn\\
    &\leq-\normbs{\G\f{\vst}}-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}}\nn\\
    &+\frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}+\normbs{\G\f{\vst}}\nn\\
    &=-\frac{1}{2}\normbs{\G\f{\xt}}+\frac{1}{2}\normbs{\G\f{\vst}-\G\f{\xt}}+\frac{1}{2}\normbs{\frac{1}{\N}\sumAllP{\edelit-\G\fii{\xt}}}\nn.
\end{align}}
% \ps{Looks good.}

\textit{Proof of Lemma~\ref{lemma:T2}.}
{\allowdisplaybreaks \begin{align}
    \T{2}&\triangleq\expns{\frac{1}{\bs}\sumA\delit}=\expns{\frac{1}{\bs}\sumA\edelit+\frac{1}{\bs}\sumAP{\delit-\edelit}}\nn\\
    &=\expns{\frac{1}{\bs}\sumA\edelit+\frac{1}{\bs}\sumAP{\frac{1}{\locit}\sumLock\lp\sgitk-\G\fii{\xitk}\rp}}\nn\\
    &= \expns{\frac{1}{\bs}\sumA\edelit}+\expns{\frac{1}{\bs}\sumAP{\frac{1}{\locit}\sumLock\lp\sgitk-\G\fii{\xitk}\rp}}\nn \tag{Using Assumption~\ref{assump:lochet}}\\
    &= \expns{\frac{1}{\bs}\sumA\edelit}+\frac{1}{\bs\N}\sumAll{\frac{1}{\locit^2}\sumLock\expns{\lp\sgitk-\G\fii{\xitk}\rp}}\nn 
    %\tag{\ps{Shouldn't it be $\frac{1}{Nb}$ in 2nd term rather than $\frac{1}{N^2}$?}}
    \\
    &\leq\expns{\frac{1}{\bs}\sumA\edelit}+\frac{\lhets}{\locit\bs}.\nn
\end{align}}
% \ps{Final result looks good. Check the comments above.}

\textit{Proof of Lemma \ref{lemma:clientselection_and_T3}.}

\begin{align}
    &\expns{\frac{1}{\bs}\sumAP{\G\fii{\xt}-\G\f{\xt}}}\nn\\
    &=\frac{1}{\bs^2}\expb{\sumA{\normbs{\G\fii{\xt}-\G\f{\xt}}}+\sum_{\substack{i\textit{ and }r\textit{ are}\\\textit{two different}\\\textit{items in }\setA}}\inp{\G\fii{\xt}-\G\f{\xt}}{\G\frr{\xt}-\G\f{\xt}}}\nn\\
    &\overset{(a)}{=}\frac{1}{\bs\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}}+\expb{\frac{1}{\N^2}\sumAll\sumrlim{1}{\N}\inp{\G\fii{\xt}-\G\f{\xt}}{\G\frr{\xt}-\G\f{\xt}}}\nn\\
    &=\frac{1}{\bs\N}\sumAll\expns{\G\fii{\xt}-\G\f{\xt}},\label{line:lemma4_first_part}
\end{align}
where \textit{(a)} follows from the fact that the clients in $\setA$ are selected uniformly at random with replacement among all clients (see Section~\ref{sect:virtual_seq_and_set_defns}), and the cross terms vanish because $\frac{1}{\N}\sumAll\G\fii{\xt}=\G\f{\xt}$. This proves the first part of Lemma~\ref{lemma:clientselection_and_T3}.
% \ps{Check the reasoning here. The second term in 3rd line would be zero only if you have with-replacement sampling - emphasize that. Also, $\mbe$ is missing.}
\begin{align}
    \T{3}&\triangleq\expns{\frac{1}{\bs}\sumA\edelit}=\expns{\frac{1}{\bs}\sumAP{\edelit-\G\fii{\xt}+\G\fii{\xt}-\G\f{\xt}}+\G\f{\xt}}\nn\\
    &\leq3\expns{\frac{1}{\bs}\sumAP{\edelit-\G\fii{\xt}}}+3\expns{\frac{1}{\bs}\sumAP{\G\fii{\xt}-\G\f{\xt}}}+3\expns{\G\f{\xt}}\nn\\
    &\leq\frac{3}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{3}{\bs\N}\sumAll{\expns{\G\fii{\xt}-\G\f{\xt}}}+3\expns{\G\f{\xt}}\tag{Using (\ref{line:lemma4_first_part})}\\
    &\leq\frac{3}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{3\ghets}{\bs}+3\expns{\G\f{\xt}}\nn.
\end{align}

\textit{Proof of Lemma~\ref{lemma:sequence_diff}.} We start by using Assumption \ref{assump:smoothness} (Smoothness) and Remark \ref{obs:diff_z_x}.
{\allowdisplaybreaks \begin{align}
    &\expns{\G\f{\vst}-\G\f{\xt}}\leq\Li^2\expns{\vst-\xt}=\Li^2\expns{\lrt\frac{1}{\bs}\sumC\delito}\nn 
    %\tag{\ps{How did the 2nd eq. follow? We didn't state this anywhere. What is $\gamma_i^t$?}}
    \\
    &=\Li^2\expns{\frac{\lrt}{\bs}\sumCP{\delito-\edelito+\edelito}}\nn\\
    &=\Li^2\lrsqt\expns{\frac{1}{\bs}\sumCP{\delito-\edelito}}+\Li^2\lrsqt\expns{\frac{1}{\bs}\sumC{\edelito}}\nn\\
    &=\Li^2\lrsqt\expns{\frac{1}{\bs}\sumC\frac{1}{\locit}\sumLockP{\sgitok-\G\fii{\xitok}}}+\Li^2\lrsqt\expns{\frac{1}{\bs}\sumC{\edelito}}\nn \tag{Using Assumption~\ref{assump:lochet}}\\
    &\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{\Li^2\lrsqt\aw}{\bs^2}\expb{\sumC\normbs\edelito}\nn \tag{Using $\norm{\sum_{i=1}^n x_i}^2 \leq n \sum_{i=1}^n \norm{x_i}^2 $}\\
    &\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{\Li^2\lrsqt\aw}{\bs^2}\expb{\sumC\normbs{\edelito-\G\fii{\xtoi}+\G\fii{\xtoi}-\G\f{\xtoi}+\G\f{\xtoi}}}\nn\\
    &\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets\nn\\
    &+\frac{3\Li^2\lrsqt\aw}{\bs^2}\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}+\normbs{\G\f{\xtoi}-\G\fii{\xtoi}}}}\nn\\
    &\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw}{\bs^2}\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}}}.\nn
\end{align}}
Telescoping the inequality over $t=0,\dots,T-1$:
{\allowdisplaybreaks \begin{align}
    &\avgtelm\expns{\G\f{\vst}-\G\f{\xt}}\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}}}\nn\\
    & %\overset{(a)}{\leq}
    \leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\tag{Using Remark \ref{obs:counting_max_updates}}
    % \tag{\ps{This $\rdm$ in 3rd term needs some explanation}}
    \\
    & \quad +\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumC{\normbs{{\edelito-\G\fii{\xtoi}}}}} \nn 
    %\tag{\ps{Norm and sum over $\mc C^{(t)}$ missing}} 
    \\
    &\leq\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumCP{\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\normbs{\G\fii{\xtoi}}}} \tag{Using Lemma~\ref{lemma:L1_1}} \\
    % \label{line:L1_1used}\\
    &\leq\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumC\frac{\dd}{1-\dd}\normbs{\G\fii{\xtoi}}}\nn\\
    &\leq\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumCP{\frac{2\dd}{1-\dd}\normbs{\G\fii{\xtoi}-\G\f{\xtoi}}+\frac{2\dd}{1-\dd}\normbs{\G\f{\xtoi}}}}\nn\\
    &\leq\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw}{\bs^2}\avgtelm\expb{\sumC\frac{2\dd}{1-\dd}\normbs{\G\f{\xtoi}}}\nn\\
    &\leq\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\frac{2\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}\nn \tag{Using Remark \ref{obs:counting_max_updates}}
    % \tag{\ps{Again, this $\rdm$ in 3rd term needs some explanation}} 
    \\
    &=\lp1+\frac{3\aw\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\aw}{\bs^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\aw^2}{\bs^2}\ghets+\frac{3\Li^2\lrsqt\aw\rdm}{\bs}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}.\nn
\end{align}} 

\section{Convergence of $\nameofthealgorithm$ with Dynamic Client Allocation (\textit{option} $=D$)} \label{app_sec:opt1_conv}
With a similar approach to the proof for static client allocation, we can also show the convergence of $\nameofthealgorithm$ with dynamic client allocation (\textit{option} $=D$). Adopting all of the previously used notation, we also need some new definitions to analyze this version of the algorithm, as the number of active training requests and the buffer size can change dynamically during training.

\paragraph{Notation for changing buffer size and number of active training requests.}
Let us define $\bst$ and $\awt$ as the buffer size and the number of active local training requests of the model at round $t$, respectively. Further, define $\bsmin$ and $\bsmax$ as the minimum and maximum values that the buffer size can take. Similarly, define $\awmin$ and $\awmax$ as the minimum and maximum number of active training requests. Moreover, we define $\bssk\triangleq\bsmax/\bsmin$ as the measure of skewness in buffer size.

\paragraph{Global update rule and virtual sequence definition.} \label{sect:virtual_seq_and_set_defn2} Although the local update rule remains the same, the global update rule slightly changes for dynamic client allocation due to changing buffer size:
\begin{align} \label{line:globalupdate2}
    \x^{(t+1)}\gets\xt-\locit\lrs\lrc\frac{1}{\bst}\sum_{i\in\setB}\delito&=\xt-\lrs\frac{1}{\bst}\sum_{i\in\setB}\lp\xtoi-\x_i^{(t-\rdit,\locit)}\rp\\
    &=\xt-\lrs\lrc\frac{1}{\bst}\sum_{i\in\setB}\sum_{k=0}^{\locit-1}\tG\fii{\x_i^{(t-\rdit,k)}},\nn
\end{align}
where $|\setB|=\bst$. Note that \eqref{line:globalupdate2} is almost identical to \eqref{line:globalupdate}, except for the variable buffer size $\bst$.

Next, we define $\fdit$ as the index of the global round when a local training request sent to client $i$ in round $t$ returns to the server. Basically, it is the current round index $t$, added to the future value of staleness that the requested update will have. We need to define a new virtual sequence \(\yst\), which differs from the $\vst$ defined earlier.
\begin{align}
\label{line:ysupdate}
    \ystp\gets\yst-\locit\lrs\lrc\sumA\frac{1}{\bsfdit}\delit=\yst-\locit\lrs\lrc\sumA\frac{1}{\bsfdit}\frac{1}{\locit}\sumLock\tG\fii{\xitk}, 
\end{align}
for $t=0,1,\dots,T-1$ where $\ys^{(0)}\triangleq\x^{(0)}.$ Here, $\setA$ is defined similarly as it was in Section~\ref{sect:virtual_seq_and_set_defns}. Note that the probability of being in $\setA$ is equal across clients due to uniform client selection. However, this time, the size of this set does not have to be equal to the buffer size at round $t$. Due to the new client selection rule (Line~\ref{algline:newjob} in Algorithm~\ref{alg:main}), the server may assign $0$, $1$, or $2$ clients for each received update. Therefore, we know that $0<\setAs\leq2\bst$.

Here, we need a simplifying assumption for the purpose of this proof:
\begin{assump}[$\bsfdit$ Values] \label{assump:for_dynamic_convergence}
     We assume that any $\bsfdit$ value is known at the time when a local training request is sent to client $i$ at round $t$, and that these values are independent of any future information, including the received updates. We further assume that the ${\bsfdit}$ values are equal (denoted by ${\bsfdt}$) for all clients in $\setA$.
\end{assump}
\begin{remark}
    When we keep the period of dynamic client allocation long enough, we observe that most of the local training requests assigned at one round fall in the same window before the next dynamic client allocation happens (Line~\ref{algline:adjustRb} in Algorithm~\ref{alg:main}). Hence, based on our empirical observations, what the assumption implies holds for most of the local training requests. Further, \textbf{this assumption can be avoided} by taking an average of the updates during aggregation weighted inversely with the number of local training requests sent at the same global round. In other words, one could avoid this assumption by weighting an update from client $i$ with $1/|\mathcal{A}^{(t-\gamma_i^t)}|$ instead of taking the average over the buffer during aggregation at round $t$. However, we did not see any practical benefit of this type of weighting in our experiments, and this strange weighting would serve only theoretical purposes. Therefore, we keep the current version.
\end{remark}

% Here, we need two simplifying assumptions for the purpose of this proof: \label{extra_assumptions}\\
% \textit{1) We assume that the expected value of ${1}/{\bsfdit}$ is equal to ${1}/{\bsfdt}$ across clients in $\setA$:} Since we keep the period of dynamic client allocation long enough, we observe that most of the assigned local requests at one round fall in the same window between two dynamic client allocation happens (Line~\ref{alg:calcRb} in Algorithm~\ref{alg:main}). Further, this assumption can be avoided by taking an average of the updates during aggregation weighted inversely with the number of local training requests sent at the same global round (i.e., weighting with $1/\setAs$). However, we did not see any practical benefit of this type of weighting in our experiments, and this strange weighting would be just for theoretical purposes.\\
% \textit{2) The updates of any round $t$ and buffer sizes at the round when those updates are received back are independent:} Although our adaptive buffer size and resource allocation take the update statistics into account, here, we assume that the effect of the training requests at one round is minimal on the buffer size at the rounds the updates are received back.
We first state the theorem showing the convergence of \(\nameofthealgorithm\) with dynamic client allocation option.
\begin{theorem}\textbf{(Convergence of $\nameofthealgorithm$ with \textit{option} $=D$):}
Suppose Assumptions~\ref{assump:smoothness}--\ref{assump:for_dynamic_convergence} hold, and the learning rates satisfy $\lrs\leq\bssk^{-3/2}\sqrt{\locit\bs}$ and $\lrc\leq\min\lcb\frac{\bssk^{-3/2}}{24\Li\locit\sqrt{\locit\bs}},\frac{\bssk^{-3/2}}{16\Li\locit\sqrt{\locit\aw\rdm}}\rcb$. Then, the iterations of Algorithm~\ref{alg:main} ($\nameofthealgorithm$) with \textit{option} $=D$ satisfy:
\begin{align*}
        \avgtelm & \expns{\G\f{\xt}} \leq \bOP{\frac{\lp\f{\x^{(0)}} - \min_\x \f{\x}\rp\bssk}{T\lr\locit}}+\bOP{\lp\frac{\Li\lr\bssk^3}{\bsmin}+\Li^2\lrc^2\bssk^2\locit+\frac{\Li^2\lrsq\locit\awmax\bssk^2}{\bsmin^2}\rp\lhets}\nn\\&+\bOP{\lp\frac{\Li\lr\locit\bssk^3}{\bsmin}+\Li^2\lrc^2\locit\lp\locit-1\rp\bssk^2+\frac{\Li^2\lrsq\locit^2\awmax^2\bssk^2}{\bsmin^2}\rp\ghets}\nn.
\end{align*}
\label{theorem:convergence_dynamic}
\end{theorem}

\textbf{Proof.}\\
We will need one extra lemma corresponding to Lemma~\ref{lemma:sequence_diff}.

{\allowdisplaybreaks \begin{lemma} The new virtual sequence $\lp\yst\rp$ and the iterates of $\nameofthealgorithm$ satisfy,
    {\allowdisplaybreaks \begin{align}
        \avgtelm\expns{\G\f{\yst}-\G\f{\xt}}&\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets\nn\\&+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}.\nn
    \end{align}}
    \label{lemma:sequence_diff_2}
\end{lemma}}


Now, using the update rule of the virtual sequence \eqref{line:ysupdate} and Assumption~\ref{assump:smoothness} (Smoothness), and taking the conditional expectation with respect to $\yst$, we have,
{\allowdisplaybreaks\begin{align}
&\expb{\f{\ystp}}\leq\f{\yst}+\inp{\G\f{\yst}}{\expb{\ystp-\yst}}+\frac{\Li}{2}\expns{\ystp-\yst}\nn\\
&= \f{\yst}+\inp{\G\f{\yst}}{\expb{-\lrst\lrc\sumA\frac{1}{\bsfdit}\delit}}+\frac{\Li}{2}\expns{\lrst\lrc\sumA\frac{1}{\bsfdit}\delit}\nn\\
 & \leq \f{\yst}-\lrst\lrc{\frac{\setAs}{\bsfdt}}\expb{\inp{\G\f{\yst}}{\frac{1}{\setAs}\sumA\edelit}}\tag{Using extra assumption}\\&
 +\frac{\Li\setAs^2}{2(\bsfdt)^2}\expns{\lrst\lrc\frac{1}{\setAs}\sumA\delit} \tag{$\setAs$ is not random with conditional expectation}\\
 & \leq \f{\yst}+\lrst\lrc{\frac{\setAs}{\bsfdt}}\expb{\underbrace{-\inp{\G\f{\yst}}{\frac{1}{\N}\sumAll\edelit}}_{\triangleq\T{1}}}+2\bssk^2{\Li}\lrst^2\lrc^2\expns{\frac{1}{\setAs}\sumA\delit}. \tag{$\frac{\setAs}{\bsfdt}\leq2\bssk$}
\end{align}}
Next, using Lemma~\ref{lemma:T1} (with $\yst$ sequence) and Lemma~\ref{lemma:T2} (with $\setAs$), using $1/\bssk\leq\setAs/\bsfdt\leq2\bssk$, and dividing both sides by $\lrst\lrc$ we obtain,
{\allowdisplaybreaks\begin{align}
&\frac{\expb{\f{\ystp}}-\f{\yst}}{\lrt}\leq-\frac{1}{2\bssk}\expns{\G\f{\xt}}+\bssk\expns{\G\f{\yst}-\G\f{\xt}}  \nn \\
& \quad+ \frac{\bssk}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}} +{2\bssk^2\Li\lrt}{\expns{\frac{1}{\setAs}\sumA\edelit}} + {2\bssk^2\Li\lrt} \frac{\lhets}{\locit\bsmin}. \nn
\end{align}}
Using Lemma~\ref{lemma:clientselection_and_T3}, we get,
% \ps{Do we need the assumption that buffer-sizes are same within a window so far?}
{\allowdisplaybreaks \begin{align}
    &\frac{\expb{\f{\ystp}}-\f{\yst}}{\lrt} \nn \\
    & \leq -\frac{1}{2\bssk}\expns{\G\f{\xt}}+\bssk\expns{\G\f{\yst}-\G\f{\xt}}+\frac{\bssk}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}\nn\\
    & \quad +\bssk^2\Li\lrt\lp \frac{6}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}+\frac{6\ghets}{\bsmin}+{6}\expns{\G\f{\xt}} +\frac{2\lhets}{\locit\bsmin}\rp\nn\\
    &=\lp-\frac{1}{2\bssk}+{6\bssk^2\Li\lrt}\rp\expns{\G\f{\xt}}+ \bssk\mbe\normbs{\G\f{\yst}-\G\f{\xt}}+\bssk^2\Li\lrt\lp\frac{6\ghets}{\bsmin}+\frac{2\lhets}{\locit\bsmin}\rp\nn\\
    & \quad +\lp{6\bssk^2\Li\lrt}+\bssk\rp\frac{1}{\N}\sumAll{\expns{\edelit-\G\fii{\xt}}}\nn\\
    &\leq\lp-\frac{1}{2\bssk}+{6\bssk^2\Li\lrt}\rp\expns{\G\f{\xt}}+ \bssk\mbe\normbs{\G\f{\yst}-\G\f{\xt}}+\bssk^2\Li\lrt\lp\frac{6\ghets}{\bsmin}+\frac{2\lhets}{\locit\bsmin}\rp\nn\\
    & \quad +\lp{6\bssk^2\Li\lrt}+\bssk\rp\lp\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\expns{\G\f{\xt}}+\frac{\dd}{1-\dd}\ghets\rp\tag{Using Lemma~\ref{lemma:L1_1}}\\
    &=\lp-\frac{1}{2\bssk}+{6\bssk^2\Li\lrt}+\frac{\bssk\dd}{\lp1-\dd\rp}+\frac{6\bssk^2 \Li\lrt\dd}{\lp1-\dd\rp}\rp\expns{\G\f{\xt}}+\bssk \mbe \normbs{\G\f{\yst}-\G\f{\xt}}\nn\\
    & \quad +\lp\frac{2\bssk^2\Li\lrt}{\locit\bsmin}+\frac{3\bssk^2\Li^3\lrc^3\lrst\locit}{\lp1-\dd\rp}+\frac{\bssk\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\rp\lhets+\lp\frac{6\bssk^2\Li\lrt}{\bsmin}+\frac{6\bssk^2\Li\lrt\dd}{\lp1-\dd\rp}+\frac{\bssk\dd}{\lp1-\dd\rp}\rp\ghets\nn,
\end{align}}
where $\dd\triangleq\Li^2\lrc^2\locit\lp\locit-1\rp$. Telescoping the inequality over the round indices $t=0,1,\dots, T-1$, and using Lemma \ref{lemma:sequence_diff_2}, we get,
{\allowdisplaybreaks \begin{align}
    &\avgtelPm{\frac{1}{2\bssk}-{6\bssk^2\Li\lrt}-\frac{\bssk\dd}{\lp1-\dd\rp}-\frac{6\bssk^2\Li\lrt\dd}{\lp1-\dd\rp}}\expns{\G\f{\xt}}\leq
    \frac{\bssk}{T}\sumtel\expns{\G\f{\yst}-\G\f{\xt}}\nn\\&+\frac{\f{\ys^{(0)}}-\expb{\f{\ys^{\lp T\rp}}}}{T\lrt}+\lp\frac{2\bssk^2\Li\lrt}{\locit\bsmin}+\frac{3\bssk^2\Li^3\lrc^3\lrst\locit}{\lp1-\dd\rp}+\frac{\bssk\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\rp\lhets+\lp\frac{6\bssk^2\Li\lrt}{\bsmin}+\frac{6\bssk^2\Li\lrt\dd}{\lp1-\dd\rp}+\frac{\bssk\dd}{\lp1-\dd\rp}\rp\ghets\nn\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax\bssk}{\bsmin^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2\bssk}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk^2\rdm}{\bsmin}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{\f{\ys^{(0)}}-\expb{\f{\ys^{\lp T\rp}}}}{T\lrt}+\lp\frac{2\bssk^2\Li\lrt}{\locit\bsmin}+\frac{3\bssk^2\Li^3\lrc^3\lrst\locit}{\lp1-\dd\rp}+\frac{\bssk\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\rp\lhets+\lp\frac{6\bssk^2\Li\lrt}{\bsmin}+\frac{6\bssk^2\Li\lrt\dd}{\lp1-\dd\rp}+\frac{\bssk\dd}{\lp1-\dd\rp}\rp\ghets\nn.
\end{align}}

If the learning rates satisfy $\lrs\leq\bssk^{-3/2}\sqrt{\locit\bs}$ (which also makes $\lrst\leq\bssk^{-3/2}\locit\sqrt{\locit\bs}$) and $\lrc\leq\min\lcb\frac{\bssk^{-3/2}}{24\Li\locit\sqrt{\locit\bs}},\frac{\bssk^{-3/2}}{16\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, then the following inequality holds:
% https://www.wolframalpha.com/input?i=.5-3%2F12-1%2F30-1%2F60-51%2F480

{\allowdisplaybreaks \begin{align}
\frac{1}{2}-{6\bssk^3\Li\lrt}-\frac{\bssk^2\dd}{\lp1-\dd\rp}-\frac{6\bssk^3\Li\lrt\dd}{\lp1-\dd\rp}-\frac{6\Li^2\lrsqt\awmax\bssk^3\rdm}{\bsmin}\frac{1+\dd}{1-\dd}\geq\agrcD.
\label{proof:numeric_inequality2}
\end{align}}
Also, notice that $\ys^{(0)}$ is equal to $\x^{(0)}$ by definitions (Section~\ref{sect:virtual_seq_and_set_defn2}) of these sequences and $\min_\x \f{\x}\leq \f{\ys^{\lp T\rp}}$.

{\allowdisplaybreaks 
\begin{align}
    & \avgtelm \expns{\G \f{\xt}} \leq \agrci \frac{\f{\x^{(0)}} - \min_\x \f{\x}}{T\lrt}\bssk\tag{Using \eqref{proof:numeric_inequality2}}\\
    &+\agrci\lp\frac{2\bssk^3\Li\lrt}{\locit\bsmin}+\frac{3\bssk^3\Li^3\lrc^3\lrst\locit}{\lp1-\dd\rp}+\frac{\bssk^2\Li^2\lrc^2\locit}{2\lp1-\dd\rp}+\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax\bssk^2}{\bsmin^2\locit}\rp\lhets\nn\\
    &+\agrci\lp \frac{6\bssk^3\Li\lrt}{\bsmin}+\frac{6\bssk^3\Li\lrt\dd}{\lp1-\dd\rp}+\frac{\bssk^2\dd}{\lp1-\dd\rp} +\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2\bssk^2}{\bsmin^2}  \rp\ghets\nn.
\end{align}}

Define $\gapTerm \triangleq \f{\x^{(0)}} - \min_\x \f{\x}$. After reducing high-order terms using the assumptions, $\lrs\leq\bssk^{-3/2}\sqrt{\locit\bs}$ (which also makes $\lrst\leq\bssk^{-3/2}\locit\sqrt{\locit\bs}$) and $\lrc\leq\min\lcb\frac{\bssk^{-3/2}}{24\Li\locit\sqrt{\locit\bs}},\frac{\bssk^{-3/2}}{16\Li\locit\sqrt{\locit\aw\rdm}}\rcb$, and incorporating the constants into the $\mco(\cdot)$ notation, we have:
{\allowdisplaybreaks \begin{align}
    \avgtelm & \expns{\G\f{\xt}} \leq \bOP{\frac{\gapTerm\bssk}{T\lr\locit}}+\bOP{\lp\frac{\Li\lr\bssk^3}{\bsmin}+\Li^2\lrc^2\bssk^2\locit+\frac{\Li^2\lrsq\locit\awmax\bssk^2}{\bsmin^2}\rp\lhets}\nn\\&+\bOP{\lp\frac{\Li\lr\locit\bssk^3}{\bsmin}+\Li^2\lrc^2\locit\lp\locit-1\rp\bssk^2+\frac{\Li^2\lrsq\locit^2\awmax^2\bssk^2}{\bsmin^2}\rp\ghets}\nn.
\end{align}}
This concludes the proof.



\textbf{Proof of Lemma~\ref{lemma:sequence_diff_2}:} We start by using Assumption~\ref{assump:smoothness} (Smoothness) and observing that Remark~\ref{obs:diff_z_x} still holds with $\yst$ for the dynamic client allocation option.
% \ps{Why?}
{\allowdisplaybreaks \begin{align}
    &\expns{\G\f{\yst}-\G\f{\xt}}\leq\Li^2\expns{\yst-\xt}=\Li^2\expns{\lrt\sumC\frac{1}{\bsfdito}\delito}\nn 
    %\tag{\ps{How did the 2nd eq. follow? We didn't state this anywhere. What is $\gamma_i^t$?}}
    \\
    &=\Li^2\expns{{\lrt}\sumC\frac{1}{\bsfdito}\lp{\delito-\edelito+\edelito}\rp}\nn\\
    &=\Li^2\lrsqt\expns{\sumC\frac{1}{\bsfdito}\lp{\delito-\edelito}\rp}+\Li^2\lrsqt\expns{\sumC\frac{1}{\bsfdito}{\edelito}}\nn\\
    &=\Li^2\lrsqt\expns{\sumC\frac{1}{\bsfdito}\frac{1}{\locit}\sumLockP{\sgitok-\G\fii{\xitok}}}+\Li^2\lrsqt\expns{\sumC{\frac{1}{\bsfdito}\edelito}}\nn \tag{Using Assumption~\ref{assump:lochet}}\\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{\Li^2\lrsqt\awmax}{\bsmin^2}\expb{\sumC\normbs\edelito}\nn \tag{Using $\norm{\sum_{i=1}^n x_i}^2 \leq n \sum_{i=1}^n \norm{x_i}^2 $ and $|\setC|\leq \awmax$}\\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{\Li^2\lrsqt\awmax}{\bsmin^2}\expb{\sumC\normbs{\edelito-\G\fii{\xtoi}+\G\fii{\xtoi}-\G\f{\xtoi}+\G\f{\xtoi}}}\nn\\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets\nn\\
    &+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}+\normbs{\G\f{\xtoi}-\G\fii{\xtoi}}}}\nn\\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}}}.\nn
\end{align}}
Telescoping the inequality over $t=0,\dots,T-1$:
{\allowdisplaybreaks \begin{align}
    &\avgtelm\expns{\G\f{\yst}-\G\f{\xt}}\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumCP{\normbs{\G\f{\xtoi}}+\normbs{\edelito-\G\fii{\xtoi}}}}\nn\\
    & %\overset{(a)}{\leq}
    \leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\tag{Using Remark \ref{obs:counting_max_updates}, however, this}
    % \tag{\ps{This $\rdm$ in 3rd term needs some explanation}}
    \\
    & \quad +\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumC{\normbs{{\edelito-\G\fii{\xtoi}}}}} \tag{time, the maximum appearance can be $2\rdm\bsmax$}
    %\tag{\ps{Norm and sum over $\mc C^{(t)}$ missing}} 
    \\
    &\leq\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumCP{\frac{\Li^2\lrc^2\locit}{2\lp1-\dd\rp}\lhets+\frac{\dd}{1-\dd}\normbs{\G\fii{\xtoi}}}} \tag{Using Lemma~\ref{lemma:L1_1}} \\
    % \label{line:L1_1used}\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumC\frac{\dd}{1-\dd}\normbs{\G\fii{\xtoi}}}\nn\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumCP{\frac{2\dd}{1-\dd}\normbs{\G\fii{\xtoi}-\G\f{\xtoi}}+\frac{2\dd}{1-\dd}\normbs{\G\f{\xtoi}}}}\nn\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{3\Li^2\lrsqt\awmax}{\bsmin^2}\avgtelm\expb{\sumC\frac{2\dd}{1-\dd}\normbs{\G\f{\xtoi}}}\nn\\
    &\leq\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\avgtelm\expns{\G\f{\xt}}\nn\\&+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\frac{2\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}\nn \tag{Using Remark \ref{obs:counting_max_updates}}\\
    &=\lp1+\frac{3\awmax\Li^2\lrc^2\locit^2}{2\lp1-\dd\rp}\rp\frac{\Li^2\lrsqt\awmax}{\bsmin^2\locit}\lhets+\frac{1+\dd}{1-\dd}\frac{3\Li^2\lrsqt\awmax^2}{\bsmin^2}\ghets+\frac{6\Li^2\lrsqt\awmax\bssk\rdm}{\bsmin}\frac{1+\dd}{1-\dd}\avgtelm\expns{\G\f{\xt}}.\nn
\end{align}} 

% \bibliography{References}
\end{document}
