\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
% ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[sep]{a}{b} typesets "b sep a"; the optional separator defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example




\usepackage{xcolor}
\usepackage{comment}
% Author-annotation macros for draft review comments (colors come from xcolor).
% \kk and \ia take no parameters themselves: each expands to \textcolor{<color>},
% which then consumes the braced group that follows, e.g. \kk{text}.
\newcommand*{\kk}{\textcolor{red}}
\newcommand*{\ia}{\textcolor{cyan}}
% \TS{text}: orange note typeset in square brackets and prefixed with "TS:".
\newcommand{\TS}[1]{{\color{orange}[TS: #1]}}

\usepackage{caption}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{amsmath,bm}
\usepackage{makecell}
\usepackage{multirow}
\usepackage{enumitem}
\usepackage{stfloats}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{eucal}
\usepackage[T1]{fontenc}
% \argmin: upright "arg min" operator; the starred declaration lets subscripts
% be set as limits (below the operator) in display style.
\DeclareMathOperator*{\argmin}{arg\,min}

% Theorem-like environments (amsthm). Every environment declared below is
% numbered on the shared `theorem' counter.
% "plain" style: theorem, proposition, lemma, corollary.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% "definition" style: definition, assumption.
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% "remark" style: remark.
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


% Switch/case pseudo-code helpers built on the algorithmic package's \STATE.
% \SWITCH{expr}: emits a line reading "switch (expr)".
\newcommand{\SWITCH}[1]{\STATE \textbf{switch} (#1)}
% \ENDSWITCH: emits a line reading "end switch".
\newcommand{\ENDSWITCH}{\STATE \textbf{end switch}}
% \CASE{label}: emits "case label:" and opens an ALC@g environment, which must
% be closed with \ENDCASE.
% NOTE(review): ALC@g appears to be an internal (indentation) environment of the
% algorithmic package -- confirm against the installed package version.
\newcommand{\CASE}[1]{\STATE \textbf{case} #1\textbf{:} \begin{ALC@g}}
% \ENDCASE: closes the ALC@g environment opened by \CASE.
\newcommand{\ENDCASE}{\end{ALC@g}}
% \CASELINE{label}: emits "case label:" without opening an ALC@g environment.
\newcommand{\CASELINE}[1]{\STATE \textbf{case} #1\textbf{:} }



\title{FLASH: Automating Federated Learning using CASH}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<alamm2@rpi.edu>?Subject=Your UAI 2023 paper}{Md Ibrahim Ibne Alam}{}}
\author[1]{Koushik Kar}
\author[2]{Theodoros Salonidis}
\author[2]{Horst Samulowitz}
% Add affiliations after the authors
\affil[1]{%
    Department of ECSE\\
  Rensselaer Polytechnic Institute \\
  Troy, NY, USA - 12180
}
\affil[2]{%
    IBM T.J. Watson Research Center \\
   Yorktown Heights, NY, USA - 10598
}

  
  \begin{document}
\maketitle




        %%%%%%%%%%

\begin{abstract}
In this paper, we present FLASH, a framework which addresses for the first time the central AutoML problem of Combined Algorithm Selection and HyperParameter (HP) Optimization (CASH) in the context of Federated Learning (FL). 

To limit training cost, FLASH incrementally adapts the set of algorithms to train based on their projected loss rates, while supporting decentralized (federated) implementation of the embedded hyper-parameter optimization (HPO), model selection and loss calculation problems. We provide a theoretical analysis of the training and validation loss under FLASH, and their tradeoff with the training cost measured as the data wasted in training sub-optimal algorithms. The bounds depend on the degree of dissimilarity between the datasets of the clients, a result of the FL restriction that client datasets remain private.
Through extensive experimental investigation on several datasets, we evaluate three variants of FLASH, and show that FLASH performs close to centralized CASH methods.


\end{abstract}









        %%%%%%%%%%








        
\section{Introduction}
\label{Intro}

\paragraph{Motivation.} 
Federated learning (FL) is a distributed learning framework that enables training a model from decentralized data located at client sites, without the data ever leaving the clients. Compared to a centralized model, in which training requires all the data to be transmitted to and stored in a central location (e.g., a data center), FL has the benefits of preserving data privacy while avoiding transmission of large volumes of raw data from the client sites. 

FL has two key challenges. First,
the data across clients can be highly heterogeneous. Second, the communication overhead can be prohibitive during training as model parameters are exchanged in multiple global rounds between the clients and an aggregation server. Therefore, there has been significant research effort on FL techniques that reduce communication overhead during model training. Such techniques typically assume that the clients agree on a common algorithm and hyperparameters (HPs) before training occurs, i.e., they do not provide AutoML capabilities.

Recently the problem of HyperParameter Optimization (HPO) in FL  has been addressed in several works.  HPO in FL is an important problem as the choice of HPs can dramatically affect FL system performance. The FL setting poses unique challenges in addressing HPO, due to non-iid data, limited processing power at clients, and function evaluations for an HP set being much more communication and computation intensive than the centralized setting because they require FL training. Solving the algorithm selection along with HPO (popularly known as CASH) in an FL setting inherits the aforementioned challenges of HPO in FL and adds the additional layer of complexity of algorithm selection, where different algorithms have different performance as well as different HP sets. 
%. First, in centralized settings HPO algorithms have access to the entire dataset, while in FL setting the data is partitioned at the parties and cannot be shared. Second, the data in the parties is heterogeneous and non-iid. Third, in certain FL scenarios where clients are mobile devices or edge servers, the processing capabilities of the parties are limited. Fourth, the function evaluations are much more communication and computation expensive than the centralized setting as they require training and evaluating an FL model.
In this paper, we propose (for the first time) a way to solve the CASH problem for an FL setting without performing any FL in the solution process (i.e., only using FL after solving CASH). In prior literature, the CASH problem has only been addressed in the centralized setting, and most approaches treat it as a more complex HPO problem that merges the HPs of all algorithms and adds the algorithm type as a new HP. Extending the HPO algorithms to use this approach would not be adequate due to the explosion in HP dimensionality and computational complexity; in addition, it is not evident how to aggregate these new CASH HPs into a single optimal HP set.



{\bf FLASH overview.} 
In this paper, we propose and evaluate FLASH, a framework which solves the CASH problem in an FL setting by viewing it as a bi-level optimization problem: the algorithm selection problem being solved at the outer level requires solving the embedded HPO problem at the inner level. FLASH solves the algorithm selection problem using a multi-fidelity approach, where, for each algorithm, the inner level HPO method (which we term FL-HPO) runs on increasing subsets of the clients' data, providing data increments to a subset of best performing algorithms according to a projected loss curve and subject to a tolerance threshold.
% and  incrementally refining loss projections of candidate algorithms through evaluations on additional data, based on which the set of candidate algorithms are further adjusted. 
This avoids wasting training resources on poorly performing algorithms. 
We analyze and evaluate the FLASH framework under three FL-HPO methods: 
% The FLASH framework allows a wide range of approaches to be used for solving the inner level FL-HPO problem. We propose three FL-HPO approaches: 
\textit{Local Best Model (LBM)}, \textit{Local K-Best Model (LKBM)}, and \textit{Regression based Model (RM)}. These FL-HPO approaches allow the clients to run HPO separately on their private data, but differ in how the results from the individual clients are aggregated at the central server and further re-validated at the clients, before the final HP choice is determined for the algorithm choice made at the outer level algorithm selection problem. Instead of expensive FL training, each HP configuration is evaluated using an approximation metric modeled as a linear combination of the clients' local loss functions. This in turn enables FLASH to reduce communication and computation overhead by first performing CASH search for the best algorithm-hyperparameter (Alg-HP) configuration in only a few rounds of communication between the clients and the central server, and then performing a single FL training to reach the final model for this configuration.


We provide a theoretical analysis of the worst-case loss performance and the wasted training cost measured as the data allocated for training sub-optimal algorithms. The performance bounds are expressed in terms of the dissimilarity between the client dataset distributions and other key parameters. Our extensive experimental study investigates these trade-offs and shows that FLASH can achieve a performance that is close to that of centralized CASH.



{\bf Summary of contributions.} To summarize, the key novel contributions of this work are as follows.
% \kk{The first bullet below has a lot of repetition with the text in teh Flash Overview para on Page 1. Should we remove or shorten?}
\begin{itemize}[noitemsep,topsep=0pt]
\item We present FLASH, a framework that solves for the first time the CASH problem in an FL setting by decomposing it into algorithm selection (outer level) and FL-HPO (inner level) problems.
% The algorithm selection problem is solved by incrementally assigning training data to best performing algorithms determined based on their projected loss rates, subject to a tolerance threshold. FLASH allows a wide range of FL-HPO approaches to be used; we specifically describe three such models where clients do not need to share private data or run communication-intensive FL model training to solve the HPO problem. 
FLASH minimizes communication and computation overhead during CASH search using a multi-fidelity incremental data approach at the algorithm selection level and by avoiding expensive FL training-based evaluations at the FL-HPO level. Only a single FL training is needed for the Alg-HP configuration found during CASH search.

%according to a predictive evaluation process for the candidate algorithms with incrementally assigned training data, whereas four different ways of solving the inner HPO problem is considered, none of which require the clients to share their private data.

\item We provide a theoretical analysis of the convergence and worst-case loss performance of all three FLASH variants, and the wasted training cost measured as the data allocated for training sub-optimal algorithms. These performance bounds are expressed in terms of the dissimilarity between the client dataset distributions and other key parameters.
%the tradeoff between the inner and outer level optimizations of FLASH, and the effect of keeping the client data set private on these tradeoffs.

\item We provide numerical evaluation of FLASH on eight large data sets with seven algorithm choices, for all three FL-HPO variants and several baseline approaches. We compare the accuracy and training cost for these variants, and the performance effects of some of the parameter choices and options that FLASH provides. 
\end{itemize}








            %%%%%%%%









\section{Related Work}
\label{sec:related}

%
{\bf FL Training.} A large number of optimization techniques have been devised to address the communication and computation overhead during FL training~\cite{li2020federated}. These techniques assume that the algorithm and HPs are known.
%We will therefore focus on the AutoML  literature on CASH and FL-HPO. Besides, 
Since FLASH decouples CASH search from FL training, these techniques can be viewed as complementary and could be applied during the FL training after CASH search. 

{\bf Centralized CASH approaches.} A popular approach is to view the CASH problem as an extended HPO problem by merging the
HPs of all algorithms and introducing the algorithm type as a new HP~\cite{komer2014hyperopt, autoweka2}.
%This results in 
% a hierarchy configuration space, 
% where the top-level HP decides which algorithm to select and all other HPs depend on this one.
Then, iterative Bayesian Optimization methods (BO) for HPO are used to solve this HPO problem~\cite{shahriari2015taking}. BO avoids expensive model training/validation evaluations by estimating the shape of the loss landscape with a surrogate model and suggesting the HP configuration to be evaluated in the next iteration. 
A major challenge is that the explosion of the HP space introduced by CASH limits the efficiency of BO. Existing solutions include using different surrogate models (random forests~\cite{JMLR:v23:21-0888}, trees~\cite{olson2016tpot}), combining BO with Hyperband~\cite{li2017hyperband} (a bandit strategy that dynamically allocates resources to a set of random configurations and uses successive halving~\cite{jamieson2016non} to stop poorly performing configurations)~\cite{falkner2018bohb}, and multi-fidelity optimization, which uses subsets of the data to perform and project on faster training-based evaluations~\cite{klein2017fast}. Apart from treating CASH as an HPO problem, some recent approaches have used reinforcement learning~\cite{efimova2017fast}, adaptive allocation of HPO iterations to algorithms~\cite{li2020efficient}, and the alternating direction method of multipliers~\cite{liu2020admm}. Directly applying these approaches in an FL setting can be computation and communication intensive.



{\bf HPO approaches for FL.} 
% Recent work for vesting FL with AUtoML capabilities has addressed the FL-HPO problem, i.e. given an algorithm, find optimal hyperparameters. 
Most HPO approaches for FL focus on finding local client HPs such as learning rates~\citep{Koskela19Learning, Mostafa19Robust, Reddi20Adaptive}, number of local SGD iterations~\citep{wang2019adaptive} or global HPs common to all clients such as network architectures~\citep{He20toward, Garg20direct, Xu20federated}, for SGD training algorithms and deep neural networks (DNNs). Fedex~\citep{khodak2020weight, khodak2021federated} uses the NAS technique of weight sharing combined with successive halving to tune local HPs at clients to build personalized models.
FLoRA~\cite{flora} extends the above works beyond SGD/DNNs to any training algorithm and provides a framework for tuning global HPs. HPO algorithms for FL cannot apply the approach of viewing CASH as an HPO problem: the exploded HP search space would be too vast for each client (which is more processing limited than the server) to execute CASH. Furthermore, each client would produce a different Alg-HP configuration and it is not evident how to aggregate this information into a single Alg-HP configuration.  In contrast, FLASH addresses this problem using an outer algorithm selection layer and an inner HPO layer, which are solved in a decentralized manner. %and can therefore leverage any FL-HPO algorithm in its inner layer.














                %%%%%%%%%%%





\section{CASH Formulation in FL}
\label{Prob_form}
%\kk{The notation in this section needs to be made consistent with later.}
We first define the CASH problem in an FL setting. 
%The CASH problem involves determining 
%The former problem is intertwined with the latter since the rankings of algorithms depend on whether their hyperparameters are tuned properly. Fortunately, the two problems can efficiently be tackled as a single, structured, joint optimization problem:
Similar to the standard CASH problem considered in a centralized setting \cite{Thornton2013, Zoller2021}, we are given a set of algorithms $\mathcal{A} = (A^{(1)}, \cdots, A^{(J)})$, where each algorithm $A^{(j)}$ is associated with  hyperparameters (HPs) that belong to domain $\Lambda^{(j)}$. Each algorithm choice $A^{(j)}$ and HP setting $\bm{\lambda} \in \Lambda^{(j)}$, compactly written as $A^{(j)}_{\bm{\lambda}}$, is associated with a model class $\mathcal{W}^{(j)}_{\bm{\lambda}}$, from which a model (parameter vector) $\bm{w} \in \mathcal{W}^{(j)}_{\bm{\lambda}}$ must be chosen so as to minimize a predictive loss function $\mathcal{L}(\bm{w}, \mathcal{D}')$ over a validation dataset $\mathcal{D}'$.

In an FL setting \cite{pmlr-v54-mcmahan17a,Yang2019federated}, the training dataset $\mathcal{D}$ is partitioned into several subsets $\mathcal{D}_i, i \in \mathcal{C}$ that are owned individually by
%training dataset $\mathcal{D}$ as well as the validation dataset $\mathcal{D}'$ are partitioned into 
a set of $N = |\mathcal{C}|$ clients. Thus $\mathcal{D} = \cup_{i \in \mathcal{C}} \mathcal{D}_i$. 
%Let $\mathcal{D}_i$ and $\mathcal{D}'_i$ respectively denote the training and validation datasets of client $i \in \mathcal{C}$.
%$\mathcal{D}_i \cap \mathcal{D}_{i'} = \phi$ for $i \neq i'$, $\mathcal{D}'_i \cap \mathcal{D}'_{i'} = \phi$ for $i \neq i'$, and that the 
We assume that $\mathcal{D}_i$ is private to client $i$, and cannot be shared or aggregated due to privacy or complexity reasons. 
Given an algorithm and HP choice, $A^{(j)}_{\bm{\lambda}}$, an FL algorithm $\mathcal{F}$ aims to determine a model $\bm{w}$ using the training dataset,
% \begin{equation}
$\mathcal{F}(A^{(j)}_{\bm{\lambda}}, \cup_i \mathcal{D}_i) \longrightarrow \bm{w} \in \mathcal{W}^{(j)}_{\bm{\lambda}}$, % \nonumber
% \end{equation}
where the training dataset $\mathcal{D}$ is written as $\cup_i \mathcal{D}_i$ to emphasize its distributed (partitioned) nature.
Usually, $\bm{w}$ is chosen to minimize the training error, modeled with the given loss function $\mathcal{L}$ but computed over the training dataset $\mathcal{D}$. That is, the FL algorithm $\mathcal{F}$ typically aims to minimize $\mathcal{L}(\bm{w}, \cup_i \mathcal{D}_i)$ over $\bm{w} \in \mathcal{W}^{(j)}_{\bm{\lambda}}$,
%Note that in general, it may not be possible to minimize this loss function without sharing of the private datasets $\mathcal{D}_i$. %Under certain assumptions on the loss function and training parameters, 
%FL model training algorithms (see for example, \cite{wang2019adaptive}) can optimize this global loss function 
using iterative methods that involve local model training at the individual clients (using their private datasets) and sharing information on models and their accuracies (but not data) with a central aggregator.

Although not necessary for the validity of our analysis or results, for ease of exposition we assume that the validation dataset $\mathcal{D}'$ is partitioned across the clients as well. Thus $\mathcal{D}' = \cup_i \mathcal{D}'_i$, where $\mathcal{D}'_i$ is the validation dataset of client $i$.
Then given the underlying FL algorithm $\mathcal{F}$ for finding the model (for any Alg-HP setting), the CASH problem for FL involves finding $A^\star_{\bm{\lambda}^\star}$ that minimizes a global loss function, computed as the aggregation of loss functions at the clients (over their validation datasets)
\begin{equation}
    A^\star_{\bm{\lambda}^\star} = \argmin_{A^{(j)} \in \mathcal{A}, \bm{\lambda} \in \Lambda^{(j)}} \sum_i \alpha_i\,\mathcal{L}(\mathcal{F}(A^{(j)}_{\bm{\lambda}}, \cup_i \mathcal{D}_i), \mathcal{D}'_i), 
    \label{eq:fl-cash-agg}
\end{equation}
where $\alpha_i$ are appropriately defined client weights, such as $\alpha_i = \frac{1}{N}$ or $\alpha_i = \frac{|\mathcal{D}_i|}{|\mathcal{D}|}$, and the FL function ($\mathcal{F}$) also uses these weights for computing the loss in the training process.
%where $\mbox{Agg}_i$ represents, say, a weighted sum or the minimum of all client loss functions.
%\kk{Add the expression for the special case where the global loss function is the sum of the per-client loss functions ...}
While solving this CASH problem, we seek to: (1) adhere to the core FL requirement that the datasets $\mathcal{D}_i$ remain private; (2) minimize the number of communication rounds between the server and the clients, including the rounds needed by the federated learning (model training) algorithm $\mathcal{F}$. The development of the FLASH framework, described next, is guided by these practical requirements.









                %%%%%%%%%%%






\section{FLASH Framework} \label{sec:flash}



Even in a centralized setting, solving the CASH problem of finding the best Alg-HP pair  $A^\star_{\bm{\lambda}^\star}$ is computationally expensive due to the large number (set) of Alg-HP combinations over which the loss function must be minimized. This is more complex (i.e., communication intensive) in an FL setting, as the loss evaluation for any specific  $A^{(j)}_{\bm{\lambda}}$ requires solving the underlying FL (model training) problem that may take multiple (possibly many) rounds of communication between the clients and the central server. To address this complexity issue, FLASH adopts three broad principles or approximations, as listed below. These approximations introduce a degree of sub-optimality in solving the CASH problem in FL, which is quantified through our theoretical analysis in the next section.




Firstly, in FLASH the global loss for any $A^{(j)}_{\bm{\lambda}}$ setting is computed by aggregating the losses computed at the clients on their individual datasets. In other words, in comparing the Alg-HP settings in FLASH, the loss function in FLASH (compare with (\ref{eq:fl-cash-agg})) is calculated as
\begin{equation}
\fontsize{9.0pt}{10.0pt} \selectfont
  \sum_i \alpha_i\,\mathcal{L}(\mathcal{F}(A^{(j)}_{\bm{\lambda}}, \mathcal{D}_i), \mathcal{D}'_i).
    \label{eq:flash-agg}
\end{equation}
We note that $\cup_i \mathcal{D}_i$ within the model training function $\mathcal{F}$ in (\ref{eq:fl-cash-agg}) is replaced by $\mathcal{D}_i$ in (\ref{eq:flash-agg}). This implies that in FLASH, model training (and therefore loss computation) happens locally in each client, avoiding the communication-intensive procedure of computing the global model through FL. This allows FLASH to compute the global loss function (albeit approximately), for a given $A^{(j)}_{\bm{\lambda}}$ and training dataset, in a single round of communication.


Secondly, FLASH divides up the CASH problem in FL into two levels (see Algorithm \ref{algo:flash}): the outer level (`for' loop in Step 2) which requires finding the optimal algorithm $A^{(j)} \in \mathcal{A}$ (for its best HP setting), and the inner level problem that requires finding the best HP $\bm{\lambda} \in \Lambda^{(j)}$ for any given $A^{(j)}$. However, finding the best (global) HP $\bm{\lambda}$ for a given $A^{(j)}$, even for the separable loss function in (\ref{eq:flash-agg}), can be computation and communication intensive in an FL setting. For this reason, FLASH approximates it using decentralized FL-HPO approaches (Step 9 of Algorithm \ref{algo:flash}, described later in this section) that work by aggregating the HPs (and their corresponding loss values) computed/validated separately by the clients on their individual datasets.



Finally, since training all algorithms on entire client datasets could be wasteful 
(particularly when the client datasets are large, or there are a large number of algorithm choices), FLASH allocates training data to the algorithms incrementally, focusing only on the best performing algorithms at any time. More specifically, FLASH works in rounds, i.e., $0, 1, 2, \ldots, M$ (Step 2 in Algorithm \ref{algo:flash}), and keeps a running set of best performing algorithms $\tilde{\mathcal{A}} \subseteq \mathcal{A}$ that it updates after each round. In round $m$, FLASH evaluates the training loss on fraction $a_m$ of the data (randomly chosen) at each client (by calling Algorithm \ref{alg:FL-HPO} in Step 9), where $0 < a_0 < a_1 < \ldots < a_M = 1$, and projects the loss curve to the entire data set, in a manner similar to that described in some of the prior work on centralized algorithm selection \cite{sabharwal2016selecting, li2020efficient}. More precisely, denoting $\ell(A^{(j)},a_m)$ as the loss rate for algorithm $A^{(j)}$ calculated in round $m$ by FLASH, the loss projection ($LP$) for $A^{(j)}$ is computed by linearly extrapolating $\ell(A^{(j)},a_m)$ from $a_m$ to $a_M=1$, i.e., $LP(A^{(j)},a_m) = \ell(A^{(j)},a_m)+(1-a_m)\cdot \ell'(A^{(j)}, a_m)$ (Step 11). Here, $\ell'(A^{(j)}, a_m)$ is an estimate of the derivative of the loss rate curve based on the loss rates calculated by FLASH so far, $\ell(A^{(j)},a_{m'}), m' \leq m$. Also, $LP^{*}(a_m)=\min_{j}LP(A^{(j)},a_m)$ is the minimum projected loss in round $m$ over all $A^{(j)}$ (computed in Step 13 and used in Step 6).
Then for a chosen \textit{tolerance factor} $\Delta$ (an input parameter), in any round FLASH selects all algorithms (for training in the next round) whose projected loss is within $\Delta$ of the best projected loss in that round (Step 6). Note that to calculate the loss projection ($LP$) for each algorithm at least two values $(m = 2)$ are needed. However, due to the small (fractional) datasets used in the initial steps, $m = 2$ may lead to a very noisy loss projection. Hence, we chose to go up to 3 iterations $(m = 3)$ to estimate
the initial LP of all algorithms (Steps 3 to 4) and start choosing algorithms (to allocate training data to) from $m
=4$. %for $m<4$ all algorithms are chosen for training to generate the initial loss projections 

Computing the loss function on fraction $a_m$ of the training dataset for any algorithm $A^{(j)}$ requires finding the optimum HP $\bm{\lambda} \in \Lambda^{(j)}$ for that dataset. Since this dataset is spread across the clients, this optimization is done in a federated manner, by aggregating (at the central server) the HPs and corresponding losses by running per-client HPOs. This is referred to as FL-HPO in Algorithm~\ref{algo:flash} (Step 9), and is described below.





\begin{algorithm}[t]
\caption{FLASH}
\begin{algorithmic}[1] 
\STATE \textbf{Input:} Set of all algorithms $\mathcal{A}$, %$\Lambda^{(j)} \ \forall j$, 
tolerance parameter $\Delta$.
%\bm{\lambda}, \bm{w}, \mathcal{D}_i.$ 
% \STATE Check the average dataset size of the parties. 
% \IF {Avg. dataset size > Threshold}
% \STATE Initialize $r>1 $.
%\STATE Initial dataset size $= a_0$ fraction of dataset.
% \ELSE
% \STATE Use initial dataset size,  $a = a_K = 1$ (full data).
% \ENDIF
\FOR{$m=0, 1, \ldots, M$}
\IF {$m < 4$}
\STATE $\tilde{\mathcal{A}} = \mathcal{A}$.
\ELSE
\STATE $\tilde{\mathcal{A}}=$ [Algorithms for which $LP(A^{(j)},a_{m-1}) - LP^{*}(a_{m-1}) \leq \Delta$].
\ENDIF
\FOR {each algorithm $A^{(j)} \in \tilde{\mathcal{A}}$}
% \STATE Set $A^{(j)}$ as the current Algorithm.
\STATE Call \textit{FL-HPO}($A^{(j)}, a_m$) to get best HP $\lambda(A^{(j)},a_m)$ and its loss $\ell(A^{(j)},a_m)$.
\STATE Store the best (HP, loss) pair.
\STATE Update $LP$ of the algorithm: $LP(A^{(j)},a_m) = \ell(A^{(j)},a_m)+(1-a_m)\cdot \frac{\ell(A^{(j)},a_m)-\ell(A^{(j)},a_{m-1})}{a_m - a_{m-1}}$.
% \STATE Update $a= a \times r$.
\ENDFOR
\STATE $LP^{*}(a_m)=\min_{j}LP(A^{(j)},a_m)$
\ENDFOR
\STATE Set $A^\dag = \argmin_{A^{(j)}} \ell(A^{(j)},a_M)$, the algorithm with minimum loss in the last iteration, and $\lambda^\dag = \lambda(A^\dag,a_M)$, its best HP.
%\STATE Choose the HP combination found at the last iteration.
\STATE \textbf{Output:} $A^\dag$, $\lambda^\dag$.
\end{algorithmic}
\label{algo:flash}
\end{algorithm}






        %%% %%% %%







\begin{algorithm}[t]
\caption{FL-HPO ($A^{(j)}, a$)}
\label{alg:FL-HPO}
\begin{algorithmic}[1] 
\STATE \textbf{Run Local HPO:} At each client $i$, run HPO on $a$ fraction of its dataset, and send the best HP(s) and corresponding loss(es) ($L_{iter}$) to the aggregator. 
\STATE \textbf{Aggregate Results:}  Aggregate the results using one of the following methods: 
\STATE \hspace{\algorithmicindent}
$\begin{aligned}
    & \textit{LBM:} & \text{\parbox[t]{5.8 cm}{Calculate HP by max-voting or averaging the best HP setting of each client.}} \\
    & \textit{LKBM:} & \text{\parbox[t]{5.8 cm}{Take the union of the top $K$ HP settings of each client for re-evaluation.} }\\
    & \textit{RM:} & \text{\parbox[t]{5.8 cm}{Perform regression using $\kappa$ HPs (and their losses) collected per client to generate top $K$ HP settings for re-evaluation.}}
\end{aligned} $


\STATE \textbf{Re-Evaluation}  (\textit{LKBM} and \textit{RM} only): Send the HP setting candidates back to clients for re-evaluation.

\STATE \textbf{Final Aggregation} (\textit{LKBM} and \textit{RM} only): Average the re-validated HP settings and corresponding losses.
\STATE \textbf{Output:} HP $\bm{\lambda} \in \Lambda^{(j)}$ and corresponding loss value.

\end{algorithmic}
\end{algorithm}

\subsection*{FLASH FL-HPO algorithm}



Our FL-HPO method is summarized by Algorithm \ref{alg:FL-HPO}. It provides a way to compute the best HP for a given algorithm $A^{(j)}$ and data size fraction $a$ in a decentralized manner and can be implemented easily in any FL platform. There are three variants of our FL-HPO method called $LBM, LKBM, RM$; the main difference among them is how they aggregate local HPs computed by the clients to find a globally optimal HP.

Initially, when the FL-HPO method (Algorithm \ref{alg:FL-HPO}) is called, each client $i$ creates a subset of its dataset by sampling a fraction $a$ of its rows  (Step 1). Then it runs locally an HPO algorithm (e.g. HyperOpt) on this subset for a given number of iterations ($HPO_{iter}$). Each iteration evaluates an HP on the subset using $k$-fold cross-validation and yields a loss value $L_{iter}$. A set of HPs and their loss values explored by HPO is then communicated to the server by the client as (HP, loss) pairs. Then the server aggregates the HPs and yields a set of candidate global HPs using one of the three variants described as follows (Step 2):


\paragraph{\textit{Local Best Model (LBM):}} In LBM, each client sends its best (HP, loss) pair to the server.
The aggregator computes the global HP set by performing max-voting to categorical HP coordinates (ties broken randomly) and averaging the numerical HP coordinates across the clients' HP sets. 


\paragraph{\textit{Local K-Best Model (LKBM):}} 
%This model is quite similar to LBM, but instead of choosing the best HP settings from each clients, we take the 
In $LKBM$, each client sends its $K$-best (HP, loss) pairs explored by its HPO to the server.  Then the server sends these $K \times N$ HP pairs as candidate global HPs to all clients for evaluation (each client will evaluate the $K$-best HPs of the others).
% Each client evaluates them and sends back the (HP,loss) pairs. The aggregator selects the HP setting with minimum average loss across the clients.

\paragraph{\textit{Regression based Model (RM):}} 
In $RM$, each client sends all (HP,loss) pairs explored by its HPO to the server. Then the server uses all these pairs to train a regressor model (we used Random Forest).
After training, the regressor model is used to compute the predicted losses for a large number of HP settings (generated randomly), and the top-10 performing ones are kept. These HPs form the global HP set and are sent to the clients for re-evaluation. RM is similar to FLoRA, the FL-HPO approach presented in \cite{flora}, but with the additional re-evaluation step.




After the candidate set of global HP sets is determined (from Step 2), these sets are sent to the clients for re-evaluation. The clients evaluate them using k-fold cross-validation on their data subsets and send back to the server the global HP sets and their corresponding loss values (Step 4). Finally, the server averages the losses of each global HP set sent by the clients and selects the global HP set with the minimum average loss (Step 5). It is to be noted that Steps 4 and 5 are only executed for the $LKBM$ and $RM$ variants, whereas the global best HP is found at Step 3 for the $LBM$ variant. In other words, in $LBM$ the HPs are computed by just combining the best HPs provided by the clients, i.e., the server does not send any HP(s) back to the clients for re-validation. This makes $LBM$ simpler, but as we will see later in Section \ref{emp}, it results in a slightly worse performance than the other two variants.











                %%%%%%%%%%%







\section{Theoretical Analysis}
\label{theory_v1}
%This section will mainly focus on how FACE works
In this section, we provide a theoretical analysis of the loss optimality and training cost of FLASH; proofs of the results are included in the Supplementary Material (\cite{uai_2023_657_suppliment}). 

\paragraph{Preliminaries.} 
Let $\mathcal{D}^a$ represent a dataset comprising of $a$ fraction of the training data, and $\mathcal{D}_i^a$ the corresponding per-client datasets; from Algorithm~\ref{algo:flash}, recall that $a$ varies as $a_0, a_1, \cdots, a_M$. For a given algorithm $A^{(j)}$, let $\underline{\ell}(A^{(j)}, a)$ represent the \textit{true training loss} for algorithm $A^{(j)}$ when using $a$ fraction of the data. Since this training loss depends on how the dataset $\mathcal{D}^a$ is chosen, the true training loss can be estimated by averaging over the losses computed over all possible datasets $\mathcal{D}^a$, denoted by $\bm{\mathcal{D}}^a=\{ \mathcal{D}^a \subseteq \mathcal{D}, |\mathcal{D}^a| = a |\mathcal{D}| \}$. Therefore, from (\ref{eq:fl-cash-agg}), $\underline{\ell}(A^{(j)}, a)$ can be expressed as,
\begin{equation}
\fontsize{9.0pt}{10.0pt} \selectfont
 \underline{\ell}(A^{(j)}, a) =  \mathbb{E}_{\mathcal{D}^a \in \bm{\mathcal{D}}^a} \min_{\bm{\lambda} \in \Lambda^{(j)}} \sum_i \alpha_i\,\mathcal{L}(\mathcal{F}(A^{(j)}_{\bm{\lambda}}, \cup_i \mathcal{D}^a_i), \mathcal{D}^a_i). \nonumber
    \label{eq:true-loss-def}
\end{equation}
From (\ref{eq:fl-cash-agg}), note that $\mathcal{D}'_i$ is replaced by $\mathcal{D}^a_i$, since $\underline{\ell}(\cdot, a)$ denotes the training cost on $a$ fraction of the dataset.



Assuming that all dataset sizes $|\mathcal{D}_i^a|$ are sufficiently large, and cross-validation is considered, it is reasonable to assume that the true loss function is smooth and convex in $a$ (since loss functions are usually convex with respect to training data).
%(as illustrated in Figure~\ref{fig:Loss_projection_example}). 
We further assume that $\underline{\ell}$ has bounded second derivatives, i.e., $\underline{\ell}''(A^{(j)},a) \leq B, \forall a, \forall A^{(j)}$. 

Let $\ell(A^{(j)}, a)$ represent the training loss computed for algorithm $A^{(j)}$ under FLASH, when using $a$ fraction of the data. Note that $\ell$ will in general differ from the true training loss $\underline{\ell}$ for several reasons: (i) The FL-HPO algorithm may calculate the HP sub-optimally; (ii)  $\ell(A^{(j)}, a)$ may be calculated over one (or a few) datasets $\mathcal{D}^a \in \bm{\mathcal{D}}^a$, instead of averaging over all possible datasets in $\bm{\mathcal{D}}^a$. Let $\sigma$ represent the maximum difference between $\ell(A^{(j)}, a)$ and $\underline{\ell}(A^{(j)}, a)$, i.e., $|\ell(A^{(j)}, a) - \underline{\ell}(A^{(j)}, a)| \leq \sigma, \forall a, \forall A^{(j)}$. 
The value of $\sigma$ depends on which of the three FL-HPO variants is used, and is provided later in this section.
%in terms of the dissimilarity between the client datasets. 
Finally, let $\delta$ be the minimum difference between the $a_m$, i.e., $\delta = \min\{a_0, \min_{m \in \{1, \cdots, M\}} (a_m - a_{m-1})\}$. 
%representing the minimum granularity of the training dataset sizes over which FLASH evaluates the algorithms. 
\paragraph{Loss Optimality Analysis}
Let $\ell^*$ be the minimum training loss achievable, i.e., the minimum value of $\underline{\ell}(A^{(j)},1)$ across all algorithms $A^{(j)} \in \mathcal{A}$. In the following lemma, 
%With $B, \sigma$ and $\delta$ thus defined, we claim the following lemma, 
which is key to bounding loss performance and training cost of FLASH, the optimum algorithm refers to the one that attains the minimum training loss $\ell^*$.
\begin{lemma}
\label{th:lemma_1}
If $\Delta > B + 2\sigma + \frac{4 \sigma}{\delta}$, FLASH ensures the training of the optimum algorithm (that attains $\ell^*$) in every iteration $m \in \{0, \cdots, M\}$.
\label{thm:flash-opt}
\end{lemma}


Lemma \ref{th:lemma_1} quantifies how large the tolerance parameter $\Delta$ needs to be so that the optimum algorithm is allocated data in every round. This leads to the following result, which shows in terms of \textit{training loss}, FLASH can be sub-optimal by at most $2 \sigma$.
\begin{theorem}
\label{th:theorem2}
If $\Delta > B + 2\sigma + \frac{4 \sigma}{\delta}$, then the Alg-HP pair chosen by FLASH attains a training loss that is within $\sigma$ (for $LKBM$ and $RM$) and $2\sigma$ (for $LBM$) of the minimum training loss, $\ell^*$. 
\label{thm:training-suboptimality}
\end{theorem}



\paragraph{Training Cost Analysis}
Next we try to bound the degree of \textit{wasteful training}, measured by the amount of training data allocated to sub-optimal algorithms. Let $\epsilon_j$ denote how sub-optimal algorithm $A^{(j)}$ is, in terms of the training loss, i.e., $\epsilon_j= \underline{\ell}(A^{(j)},1) - \ell^*$.
%, where $\ell^* = \min_{A^{(j)} \in \mathcal{A}} \underline{\ell}(A^{(j)},1)$. 

\begin{theorem}
\label{Thrm_training_cost}
\label{th:thrm_trainig_cost}
If $\epsilon_j \leq \frac{B}{2} + 2\sigma + \frac{4\sigma}{\delta} + \Delta$, then algorithm $A^{(j)}$ receives the full dataset for training under FLASH; otherwise, the fraction of the training data allocated in rounds $m \geq 4$ by FLASH to any algorithm $A^{(j)}$ is no more than $\max \left\{0,1-\sqrt{\frac{\epsilon_j-B/2-2\sigma -4\sigma/\delta - \Delta}{B/2}}\right\}$.
%The training data allocation for any algorithm (say $x$) in PACE is bounded by $\left(1-\sqrt{\frac{\Delta_A-\epsilon}{B/2}}\right)$ times of the full training data.
\end{theorem}
Theorem \ref{Thrm_training_cost} implies that 
the algorithms whose true training loss is within $(\frac{B}{2} + 2\sigma + \frac{4\sigma}{\delta} + \Delta)$ of $\ell^*$ receive full training; algorithms whose true training loss is beyond $(B + 2\sigma + \frac{4\sigma}{\delta} + \Delta)$ of $\ell^*$ do not receive any training data at all (except in the initial 3 rounds when all algorithms are trained). If an algorithm's true training loss lies between these two limits, then that sub-optimal algorithm incurs a training cost that decreases monotonically with $\epsilon_j$.
%, the sub-optimality  i.e., the more sub-optimal (in terms of true training loss) the algorithm is, the less training it receives.




\paragraph{Bounding the Loss Calculation Error}
We now proceed to bound $\sigma$, as defined earlier, by computing an upper bound on $|\ell(A^{(j)}, a) - \underline{\ell}(A^{(j)}, a)|$ over all $A^{(j)}, a$ values. 
%Towards bounding the sub-optimality of FLASH in terms of the \textit{validation loss}, let us 
To capture how the training loss rate for a given algorithm $A^{(j)}$ varies with the distribution of the training dataset $\hat{\mathcal{D}}$, we define the loss function $\hat{\ell}(A^{(j)}, \hat{\mathcal{D}})$ as
\begin{equation}
 \hat{\ell}(A^{(j)}, \hat{\mathcal{D}}) = \min_{\bm{\lambda} \in \Lambda^{(j)}} \sum_i \alpha_i\,\mathcal{L}(\mathcal{F}(A^{(j)}_{\bm{\lambda}}, \cup_i \hat{\mathcal{D}}_i), \hat{\mathcal{D}}_i). \nonumber
    %\label{eq:true-loss-def}
\end{equation}
For any algorithm $A^{(j)}$, and any two training datasets $\hat{\mathcal{D}}_1,\hat{\mathcal{D}}_2$, we assume that $\hat{\ell}$ satisfies $| \hat{\ell}(A^{(j)}, \hat{\mathcal{D}}_1) - \hat{\ell}(A^{(j)}, \hat{\mathcal{D}}_2)| \leq \beta \cdot \nu(\hat{\mathcal{D}}_1,\hat{\mathcal{D}}_2)$, for some scalar constant $\beta$, with $\nu$ being the 1-Wasserstein distance between the distributions of the two training datasets $\hat{\mathcal{D}}_1, \hat{\mathcal{D}}_2$. Further, let $\underline{\mathcal{D}}^a$ denote the expectation of the distributions of all the datasets in $\bm{\mathcal{D}}^a$. Let $d_j(\cdot,\cdot)$ denote the 1-norm distance metric in $\Lambda^{(j)}$, the hyperparameter space of $A^{(j)}$. Further, let $\bm{\lambda}^i_k, \, k \in [\kappa] = \{1, \cdots, \kappa\}$ denote the $\kappa$ HP choices of client $i$ in RM. Define $D_j = \sum_i \alpha_i \max_{\bm{\lambda} \in \Lambda^{(j)}} [\min_{k \in [\kappa]} d_j(\bm{\lambda}, \bm{\lambda}^i_k)]$, where $\max_{\bm{\lambda} \in \Lambda^{(j)}} [\min_{k \in [\kappa]} d_j(\bm{\lambda}, \bm{\lambda}^i_k)]$ determines the worst case distance of any HP in the space $\Lambda^{(j)}$ from the closest initial HP chosen by client $i$. Let $\bar{D}$ be an upper bound on $D_j$, $\forall j$.

The upper bound on $\sigma$ depends on which FLASH variant is being used, and can be stated as follows.
\begin{theorem}
\label{thm:sigma-bound}
For the training dataset $\mathcal{D}^a$, the loss calculation error for any algorithm $A^{(j)}$ is upper-bounded by $\sigma(a)$, given as $\sigma(a) = \beta_0 \,\nu(\mathcal{D}^a, \underline{\mathcal{D}}^a) + \hat{\sigma}(a)$, where
$$\hat{\sigma}(a) =
\begin{cases}
\beta_1 \sum_i \alpha_i \, \nu(\mathcal{D}_i^a, \mathcal{D}^a) & (LBM)\\
\beta_2 \max_{i,i'} \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a) & (LKBM)\\
\beta_3 \sum_i \alpha_i \, \nu(\mathcal{D}_i^a, \mathcal{D}^a) + \gamma \bar{D} & (RM)
\end{cases}
$$
for appropriately defined scalar constants $\beta_0$, $\beta_1$, $\beta_2$, $\beta_3$, and $\gamma$. Then $\sigma$ is given by
$\sigma = \max_{a \in \{a_0, \cdots, a_M\}} \sigma(a)$.
\end{theorem}
%Note that for LBM and LKBM, $\sigma_j(a)$ is independent of $j$, therefore $\sigma$ can be calculated as $\max_{a \in [a_0, \cdots, a_m]} \sigma_j(a)$.
In the above results, LKBM has been analyzed for the conservative case of $K=1$. Note that the bound for RM depends on $\kappa$ (through $\bar{D}$), the number of initial HPs chosen by each client, as it determines the accuracy of the HPO. The bounds are not directly comparable between the three FL-HPO models as the constants $\beta_1$, $\beta_2$, $\beta_3$ can be different. However, the key takeaway from the bounds is that the loss calculation errors (and therefore the overall loss performance bounds as computed by Theorem~\ref{thm:training-suboptimality}) depend on the dissimilarity between the client datasets. This results from the fact that in these FL-HPO approaches, the HPs (losses) are optimized (calculated) on the individual client datasets and then aggregated, instead of being computed globally. 
%This is intuitively expected, and unavoidable, due to the decentralized (federated) nature of the FLASH. 








                %%%%%%%%%%%









\section{Empirical Evaluation} 
\label{emp}



\begin{table}[t]
\fontsize{9.0pt}{10.0pt} \selectfont
\begin{center}
\caption{Comparison of {\sl CASH-D}, {\sl CASH-O} and Auto-SKL}
\begin{tabular}{l|r|r|r}
DataSet &  {\sl CASH-D} &  {\sl CASH-O} &  Auto-SKL \\
\hline
\hline
EEG - Eye & 93.10 & 94.05 &  97.42 \\
Electricity & 91.04 & 93.51 & 93.24 \\ 
Eye Movement & 69.87 & 73.73 &  75.58 \\ 
Diabetic Data & 53.36 & 56.79 &  51.82 \\ 
Connect - 4  & 72.97 & 75.54 & 76.01 \\ 
Higgs &  72.18 & 72.56 & 72.83 \\ 
Magic Telescope &  86.13 & 86.66  & 85.47\\ 
Default of Credit & 73.91 & 75.51  & 70.61 \\
\end{tabular}

\label{table:centralize_perf}
\end{center}
\end{table}



\paragraph{Dataset Selection:} We initially selected 35 datasets (from OpenML \cite{OpenML2013}) with more than 10000 data-samples (the dataset sizes ranged from about 11k to about 100k). We split the datasets in a training part and validation part. We trained models using the training part and used accuracy on the validation part as performance metric.
%When determining the performance of these datasets we used accuracy instead of loss (the analysis of loss can be directly used by taking the negative value of the accuracy and use it as loss). 
We considered seven well performing algorithms: \textit{Random Forest, Decision Tree, Extra Tree, Logistic Regression, XGB, LGBM} and \textit{MLP} with well defined HP space for simulation. We define {\sl CASH-D} and {\sl CASH-O} as the validation accuracies found from the best performing algorithm (in terms of training accuracy) using default HP settings of scikit-learn \cite{scikit-learn} and with the optimized HP setting found through HPO, respectively.


As expected, {\sl CASH-O} attained better accuracy than {\sl CASH-D}. However, 23 out of the 35 datasets showed very minor improvement indicating that HPO does not yield much gain over the default HPs. However, we observed that the algorithm choice did have a significant impact on the performance. From the remaining 12 datasets we selected 8 datasets (name and accuracies in Table \ref{table:centralize_perf}) where CASH-O demonstrated higher gains and are diverse in terms of number of examples and features.
In Table \ref{table:centralize_perf} we also provide the accuracy values attained by centrally solving the CASH problem with auto-sklearn (Auto-SKL). The higher accuracies attained by Auto-SKL compared to {\sl CASH-O} in some cases is largely due to the fact that Auto-SKL spans a much larger algorithm set and HP space. %Also, we included Higgs dataset which has been used in many ML problem setting and is extremely consistent in results, to check if that consistency is moved forward by FLASH as well.   




\paragraph{Baselines and Evaluation Metric:} {\sl CASH-D} and {\sl CASH-O} are used as the performance evaluation baselines for FLASH for the 8 datasets that we selected. Let us define $P_D$ and $P_*$ as the performance (accuracy) of {\sl CASH-D} and {\sl CASH-O} respectively for some dataset. For that same dataset if FLASH attains accuracy $P$, we define relative improvement with respect to {\sl CASH-D} ({\sl CASH-O})  as $RI_D$ ($RI_*$, respectively), calculated as $\frac{P - P_D}{P_D} \times 100 \%$ ( $\frac{P - P_*}{P_*} \times 100 \%$, respectively). 
The RI reflects how much \% improvement is achieved by FLASH compared to the centralized baselines. A negative $RI$ value means that FLASH is performing worse than the baseline.




\paragraph{Implementation:} 
FLASH is implemented with two major loops (Algorithm \ref{algo:flash}), where the outer loop performs the algorithm selection and the inner loop performs FL-HPO (Algorithm \ref{alg:FL-HPO}). For each of the 8 datasets we selected, the generation of the client training and validation datasets was done as follows: first the dataset was randomly divided into $N$ subsets, one for each client. Then, each client's dataset was divided into training and validation parts with a ratio of 70-30 using stratified sampling on target class. Performance was measured as the average validation accuracy across clients. For training and evaluations, we used Hyperopt~\cite{komer2014hyperopt}, a mainstream and easily configurable HPO algorithm, using 10-fold cross-validation. We also used different data seeds (which change the client data distributions) and different HPO seeds (which change the initial HP used by HyperOpt). Unless otherwise specified, each experiment was run 25 times with different data and HPO seeds. Moreover, the value of $a_m$ in our empirical evaluation followed a geometric progression (not necessary for the theoretical analysis), implying $a_m = a_0 \cdot r^{m}$. For the results in the paper, we used $a_0 = 3.75\%$ and a progression rate of $r=1.5$. While there can be other ways of choosing $a_0, a_1, \ldots$, geometric progression was used because we expect the change of the slope to slow down for larger values of $m$.





\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{FLASH_to_default.png}
    \caption{$RI$ of FLASH compared to {\sl CASH-D}}
    \label{fig:FLASH_to_deflt}
\end{figure}


\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{FLASH_to_centralized.png}
    \caption{$RI$ of FLASH compared to {\sl CASH-O}}
    \label{fig:FLASH_to_centr}
\end{figure}



\begin{table}[t]
\fontsize{9.0pt}{10.0pt} \selectfont
    \centering
    \caption{Robustness of FLASH (Random HP setting)}
\begin{tabular}{c|c|c}
Value of $\Delta$ & Avg. Error (\%) &  Training Cost (norm.)  \\
\hline
\hline
0 & 0.441	& 1.00 \\ 
0.2 & 0.427 & 1.0095   \\ 
0.4 & 0.358	& 1.0995   \\ 
0.6 & 0.365 & 1.2096 \\  
0.8 & 0.343 & 1.2867  \\
1.0 & 0.336 & 1.4394  \\  
\end{tabular}
\label{table:FLASH-Delta}
\end{table}

\paragraph{Comparison of different versions of FLASH:} 
We first perform experiments on all 3 FL-HPO approaches ($LBM, LKBM, RM$) with $\Delta = 0$, $N=3$ clients, and $HPO_{iter}=50$ HPO iterations. 
We compare the performance of all three variants of FLASH (RM, LKBM, LBM) using their $RI$  with respect to the baselines (i.e., {\sl CASH-D}, {\sl CASH-O}). %As discussed in FLASH Framework, all the variants are quite intuitive and which variation to use depends mainly on the environment (i.e., resources available for communication and computation). 
Fig. \ref{fig:FLASH_to_deflt} shows the value of $RI$ when FLASH is compared to {\sl CASH-D}. The improvement in performance with FLASH is quite prominent and for some datasets RI is as high as $5\%$. For `Higgs' dataset the performance was decreased for some versions of FLASH, but that is less than $0.1\%$. We observe that all three variants of FLASH perform consistently while $RM$ usually yields the best performance. This is an important finding because it shows that FLASH which performs CASH on \emph{distributed} client datasets performs better than an approach using optimal algorithm selection \emph{over default HPs} and assuming all data is available at a central location. 

Fig. \ref{fig:FLASH_to_centr} depicts the $RI$ of FLASH compared to the {\sl CASH-O}. In this case it is expected for FLASH to not yield improvement as {\sl CASH-O} uses optimal algorithm selection and HPO on centralized data. The small negative $RI$ values (up to -1.5\%) indicate that FLASH is performing worse but very close to the centralized {\sl CASH-O} solution. 
A few cases yield very small positive $RI$ values (less than 0.5\%), which indicate that FLASH slightly outperforms CASH-O, which sounds counter-intuitive. These cases arise due to over-fitting in the CASH-O model training, which causes the depicted marginal decrease in \emph{validation accuracy}. Of course, CASH-O always performs better than FLASH in terms of training accuracy. 


%\paragraph{Run-time and Communication Overhead:} 
Run times for all 3 variants of FL-HPO, normalized with respect to the average runtime of the slowest variant ($RM$), are provided in Table~\ref{table:runtime}. Although FLASH $RM$ is usually the best-performing FL-HPO technique, it consistently takes the longest to run due to the additional regression analysis. Also, the communication overhead for FLASH $RM$ and $LKBM$ (not experimentally evaluated in this study) is twice that of $LBM$ due to the re-evaluation. Moreover, 
the runtime for $RM$ and $LKBM$ tend to  increase more than $LBM$ with the increase of clients, possibly because of the re-evaluation done on the former two.
%$CBM$ on the other hand has almost similar runtime as $LBM$ but will in general have higher communication overhead because of iterative communication between clients and the aggregator (one round of communication for each HPO iteration). 



\begin{table}[t]
\fontsize{9.0pt}{10.0pt} \selectfont
    \centering
    \caption{Run-time comparison}
\begin{tabular}{c|r|r|r}
\# of clients &  RM &  LKBM &  LBM  \\
\hline
\hline
3 & 1.00 & 0.981 & 0.793  \\ 
5 & 1.00 & 0.985 & 0.778  \\ 
10 & 1.00 & 0.988 & 0.76  \\ 
20 & 1.00 & 0.992 & 0.751 \\ 
\end{tabular}
\label{table:runtime}
\end{table}





\begin{figure}[t]
    \centering
\includegraphics[width=0.98\linewidth]{Diff_N.png}
\caption{Effect of \# of clients ($N$)}
\label{fig:diff_N}
\end{figure}

\begin{figure}[t]
    \centering
\includegraphics[width=\linewidth]{Diff_HPO.png}
\caption{Effect of HPO iterations}
\label{fig:diff_HPO}
\end{figure}



We now perform an ablation study to evaluate FLASH performance when $\Delta$, $N$ and $HPO_{iter}$ are varied.

\paragraph{Impact of Tolerance Parameter ($\Delta$):} We ran FLASH for $\Delta$ ranging between 0 and 1. For each $\Delta$, we performed 100 runs that included different data seed (client data distributions), HP seed (different initial Hyperopt HPs) and HPO iterations ($HPO_{iter}$).
Table~\ref{table:FLASH-Delta} quantifies FLASH performance in terms of (\%) Avg. Error and training cost. For each $\Delta$, the Avg. Error is computed by averaging the difference between the  accuracy of each run and the best accuracy over the 100 runs. 
The training cost is the ratio of the average training time of a $\Delta$ over the average training time of $\Delta=0$ (training time increases with increased $\Delta$), where average is taken over the 100 runs. 
We see that $\Delta =0$ yields Avg. Error of $0.44\%$, which is very low and slightly higher than that of the higher $\Delta$s. Also training cost increases abruptly for $\Delta>0.6$. Thus, $\Delta < 0.6$ seems best for our method and $\Delta=0$ is a good value for the datasets we considered. It should be noted that increasing $\Delta$ ensures a larger set of algorithms (having higher accuracy projection) to be trained at each round ($m$) and therefore a lower chance of picking a sub-optimal algorithm. Hence with a higher $\Delta$ value, the average error decreases while the average training cost increases.





\paragraph{Impact of Number of Clients:} Fig. \ref{fig:diff_N} depicts the $RI$ in performance for FLASH $RM$ compared to {\sl CASH-D} for different values of $N$. Notably, 6 of the 8 datasets showed very consistent results, with the $RI$ values varying over a narrow range with variation in $N$. The performance decreases monotonically with increasing $N$ for \textit{Eye Movements} and shows a sharp drop at $N=20$ for \textit{Diabetic data}. However, upon close inspection of Fig. \ref{fig:diff_N}, we do see some non-monotonic behavior in the performance of FLASH $RM$. The overall performance of FLASH $RM$ is impacted by factors such as: i) the amount of data per client and ii) the number of HP settings reported back to the central server to perform FL-HPO. In our empirical evaluation, we divided the whole dataset with equal data-samples per client, hence with larger $N$ each client had fewer data-samples. Usually, less data per client leads to worse loss estimates, whereas with more HP settings reported (due to a larger number of clients), FLASH–RM is able to come up with better HP settings. Thus, there are two opposing forces in play here as we increase $N$, and it is difficult to identify under what conditions one factor would dominate the other.

\paragraph{Impact of Number of HPO iterations:} The $HPO_{iter}$ was varied from 1 to 50 on each client's dataset, and for FLASH $RM$ the accuracy values are plotted against number of iteration in Figure \ref{fig:diff_HPO}. %The vertical axis values are the accuracy values, while the horizontal axis is showing the number of HPO iterations used. 
We observe that accuracy in general increases when $HPO_{iter}$ goes from 1 to 10 for FLASH $RM$ (it takes a few more iterations for $LBM$ and $LKBM$), after which it becomes almost flat (or increases slowly).
Reaching the optimal solution in a few iterations justifies the use of the three proposed FL-HPO algorithms, especially in the case where HPO iterations at the clients are compute-intensive.



\paragraph{Accuracy projections with training data: }To get more insight on how FLASH works, the projection of accuracy (analogous to the loss projection) for a specific dataset (Electricity) is plotted in Fig. \ref{fig:Acc_upper_bound}. It is worth noting that during the initial stages of FLASH, when the value of $m$ is small (i.e., smaller training data), the Extra Tree and MLP algorithms exhibit remarkably high accuracy projections. However, as the training progresses, their accuracy projections decline, while algorithms like XGBoost and LGBM emerge as the frontrunners. Consequently, if FLASH had not reintroduced these algorithms in subsequent rounds and discarded them solely based on their initial performance, the overall accuracy would have been lower by 10\%. These results use $\Delta = 100\%$, so that the projection value of all the algorithms can be observed in each round.


\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{Ub_values.png}
    \caption{Accuracy projections with training data.}
    \label{fig:Acc_upper_bound}
\end{figure}



\begin{figure}[t]
    \centering
    \includegraphics[width=\linewidth]{diff_gamma.png}

    \caption{Impact of heterogeneity in client data distributions, depicting $RI$ of FLASH-RM with different Dirichlet constant $\gamma$ over FLASH with random distribution.}
    \label{fig:diff_gamma}

\end{figure}

\paragraph{Controlled heterogeneity.} We now evaluate the impact of the degree of heterogeneity on FLASH performance. In all of the aforementioned results, the label distributions of the clients were created by randomly dividing the entire dataset. We refer to FLASH with this random data distribution as FLASH-RANDOM.
As in~\cite{hsu2019measuring}, we create several client data distributions by controlling the heterogeneity of labels of the data points distributed to the clients using Dirichlet constant $\gamma$ (smaller $\gamma$ yields more heterogeneous non-iid distribution). Fig.~\ref{fig:diff_gamma} depicts the $RI$ values of FLASH-RM for two values of $\gamma$ ($10^2$ and $10^4$) over FLASH-RANDOM.  RI has a small range ($-0.4\%$ to $0.35\%$) for both values of $\gamma$ across all datasets, demonstrating that FLASH performs consistently across heterogeneous non-iid distributions.












                %%%%%%%%





\section{Conclusion}
\label{sec:concl}

We presented and evaluated FLASH, which solves the CASH problem in an FL setting by combining  outer-level algorithm selection with inner-level FL-HPO methods, and requires the global FL model training problem to be solved only once, i.e., after the Alg-HP configuration has been selected. 
FLASH reduces training cost by allocating training data incrementally to only a subset of all algorithms based on their loss performances. %FLASH allows a wide range on FL-HPO approaches to be used for the inner-level problem. 
Specifically, we theoretically analyzed and evaluated FLASH with three FL-HPO methods which are simple to implement, and compute the global HP and loss function by aggregating those computed individually by the clients on their private data sets. 
FLASH is able to identify a near-optimal Alg-HP configuration with a few rounds of communication between the clients and the central server, and is easy to implement in an FL environment.  Extensive simulations show consistent and competitive performance of FLASH upon comparing it with centralized benchmarks. 



                %%%%%%%%%%%%%%





                







\begin{acknowledgements} 
The work was supported by the Rensselaer-IBM AI Research Collaboration (\url{http://airc.rpi.edu}), part of the IBM AI Horizons Network (\url{http://ibm.biz/AIHorizons}).
\end{acknowledgements}

% References
\bibliography{reference}



\end{document}
