\documentclass[accepted]{uai2023} % for initial submission
% \documentclass[accepted]{uai2023} % after acceptance, for a revised
% version; also before submission to
% see how the non-anonymous paper
% would look like

%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2023} % ptmx math instead of Computer
% Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2023} % newtx fonts (improves upon
 % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% for cross referencing the main text
% PLEASE ONLY USE xr IN THE SUPPLEMENTARY MATERIAL. 
% In the main paper, hard code any cross-reference to the supplementary material. 
\usepackage{xr} 


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example



\usepackage{xcolor}
\usepackage{comment}
% Review-annotation helpers (both rely on xcolor).
% \kk expands to \textcolor{red}, so it colors the brace group that follows it: \kk{text}.
\newcommand*{\kk}{\textcolor{red}}
% \TS{note} prints "[TS: note]" in orange; the inner braces keep the color change local.
\newcommand{\TS}[1]{{\color{orange}[TS: #1]}}

\usepackage{caption}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{amsmath,bm}
\usepackage{makecell}
\usepackage{multirow}
\usepackage{enumitem}
\usepackage{stfloats}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{eucal}
\usepackage[T1]{fontenc}
\DeclareMathOperator*{\argmin}{arg\,min}

% Theorem-like environments (amsthm). Every environment below passes [theorem] as
% its optional argument, so all of them share the single `theorem` counter.
% Each \theoremstyle applies to the \newtheorem declarations that follow it.
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}


% Switch/case keywords for use inside an `algorithmic` block.
% \SWITCH{expr} and \ENDSWITCH each emit one \STATE line with the bold keyword.
\newcommand{\SWITCH}[1]{\STATE \textbf{switch} (#1)}
\newcommand{\ENDSWITCH}{\STATE \textbf{end switch}}
% \CASE{label} prints the case label and opens an ALC@g environment that
% \ENDCASE closes, so the two must always be used as a pair.
% NOTE(review): ALC@g is presumably algorithmic's internal indentation environment -- confirm.
\newcommand{\CASE}[1]{\STATE \textbf{case} #1\textbf{:} \begin{ALC@g}}
\newcommand{\ENDCASE}{\end{ALC@g}}
% \CASELINE{label} is the one-line form: it prints the label without opening a group.
\newcommand{\CASELINE}[1]{\STATE \textbf{case} #1\textbf{:} }



\usepackage{xr}
% \addFileDependency{<file name with extension>}
%   Writes the file name to the log/terminal, appends it to LaTeX's file list
%   via \@addtofilelist, and emits a "No file ..." message when the file is missing.
%   Every line inside the body ends with % so that the macro contributes no
%   spurious space tokens if it is ever expanded in horizontal mode.
\makeatletter
\newcommand*{\addFileDependency}[1]{%
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<base name without extension>}
%   Calls xr's \externaldocument on the given base name, then passes both
%   <name>.tex and <name>.aux to \addFileDependency.
%   Line ends inside the body are protected with % to avoid spurious spaces.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
%%% END HELPER CODE

% put all the external documents here!


\title{FLASH: Automating Federated Learning using CASH \\(Supplementary Material)}

% The standard author block has changed for UAI 2023 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<alamm2@rpi.edu>?Subject=Your UAI 2023 paper}{Md Ibrahim Ibne Alam}{}}
\author[1]{Koushik Kar}
\author[2]{Theodoros Salonidis}
\author[2]{Horst Samulowitz}
% Add affiliations after the authors
\affil[1]{%
    Department of ECSE\\
  Rensselaer Polytechnic Institute \\
  Troy, NY, USA - 12180
}
\affil[2]{%
    IBM T.J. Watson Research Center \\
   Yorktown Heights, NY, USA - 10598
}
  
  \begin{document}
  
\onecolumn %% Turn this off if single column is desired for the supplement
\maketitle


% \section{Appendix}
% \label{sec:suplmnt}
\setcounter{equation}{2}
In our analysis, for simplicity we assume that the dataset (or equivalently, its distribution) $\mathcal{D}$ has finite discrete support. Our results generalize when this assumption is relaxed, although the analysis in that case becomes more complex.
%\subsection{Bounding Loss Projection}
Also, for ease of exposition, we are going to use $A \in \{A^{(1)}, \dots, A^{(J)}\}$ to denote a generic algorithm, and $*$ to denote the optimum algorithm $A^*$.

\paragraph{Proof of Lemma 1: }  We define the true loss projection for an algorithm $A$ as $\underline{LP}(A,a_n)=\underline{\ell}(A,a_n)+(1-a_n)\cdot \underline{\ell}'(A, a_n)$, where $\underline{\ell}$ represents the true training loss function (assuming cross-validation), and $\underline{\ell}'$ its derivative. Similarly, $LP(A,a_n)$ is defined as the loss projection calculated from $\ell(A,a_n)$ and computed as $\ell(A,a_n)+(1-a_n)\cdot \ell'(A, a_n)$, where $\ell$ is the loss function computed by FLASH and $\ell'$ its discrete derivative (defined later). We can use a Taylor series expansion of the true loss function $\underline{\ell}$ with $0< a_n,a_m \leq 1$ to get the following equations:
\begin{align}
\label{eq_loss1}
& \qquad \; \; \underline{\ell}(*,1) =\underline{\ell}(*,a_m)+(1-a_m)\cdot \underline{\ell}'(*,a_m)\nonumber \\
& \qquad \qquad \qquad \qquad \qquad +\frac{1}{2}(1-a_m)^2 \cdot \underline{\ell}''(*,\bar{a}_m),  \nonumber\\
& \text{or, } \underline{\ell}(*,1) =  \underline{LP}(*,a_m) + \frac{1}{2}(1-a_m)^2 \cdot \underline{\ell}''(*,\bar{a}_m) \\
    \label{eq_loss2}
& \text{and,   \;\;} \underline{\ell}(A,1) =\underline{\ell}(A,a_n)+(1-a_n)\cdot \underline{\ell}'(A,a_n)\nonumber \\
& \qquad \qquad \qquad \qquad \qquad +\frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n), \nonumber \\
& \text{or, } \underline{\ell}(A,1) =  \underline{LP}(A,a_n) + \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n).
\end{align}
Here $a_m \leq \bar{a}_m \leq 1$, $a_n \leq \bar{a}_n \leq 1$, $*$ is the optimum algorithm, i.e., the one that minimizes (1), and $A \in \mathcal{A}$ is any other algorithm. From the definition of $\ell^*$ we know that $\ell^* = \underline{\ell}(*,1) \leq \underline{\ell}(A,1)$. Hence we can write the following using (\ref{eq_loss1}) and (\ref{eq_loss2}):
\begin{align}
\label{eq:true_LP_relation}
    & \underline{LP}(*,a_m) + \frac{1}{2}(1-a_m)^2 \cdot \underline{\ell}''(*,\bar{a}_m) \nonumber \\
     \leq & \;  \underline{LP}(A,a_n) + \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n) \nonumber\\
     \text{or, } & \underline{LP}(*,a_m) \leq \underline{LP}(A,a_n)  + \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n) \nonumber \\
     \text{or, } & \underline{LP}(*,a_m) - \underline{LP}(A,a_n) \leq B/2.
\end{align}

The last line in (\ref{eq:true_LP_relation}) comes from the fact that $\underline{\ell}''(A,\bar{a}_n)<B$. We recall that $\sigma$ is defined in the following way,
\begin{equation}
\label{eq:sigma}
    |\ell(A,a)-\underline{\ell}(A,a)| \leq \sigma.
\end{equation}

Since $\ell$ is calculated at discrete points (i.e., $\dots, a_{m-1}, a_m, \dots$), the discrete derivative of $\ell$, denoted $\ell'$, is defined as $\ell'(A,a_m) = \frac{\ell(A,a_m)-\ell(A,a_{m-1})}{\delta_m}$, where $\delta_m = a_m - a_{m-1}$. Hence we have the following using (\ref{eq:sigma}):
\begin{align}
    \ell'(A,a_m) = & \frac{\ell(A,a_m)-\ell(A,a_{m-1})}{\delta_m} \nonumber \\
    \leq & \frac{\underline{\ell}(A,a_m) + \sigma -\underline{\ell}(A,a_{m-1})+ \sigma}{\delta_m} \nonumber \\
    \label{eq:ell'_upper_bound}
    = & \frac{\underline{\ell}(A,a_m) -\underline{\ell}(A,a_{m-1})+ 2\sigma}{\delta_m}.
\end{align}
Using Taylor series expansion we get,
\begin{align}
    & \underline{\ell}(A,a_{m-1}) =\underline{\ell}(A,a_m)- \delta_m  \underline{\ell}'(A,a_m) + \frac{\delta_m^2}{2} \underline{\ell}''(A,\bar{a}) \nonumber \\
   \text{or, } & \underline{\ell}(A,a_m) - \underline{\ell}(A,a_{m-1}) = \delta_m  \underline{\ell}'(A,a_m) - \frac{\delta_m^2}{2} \underline{\ell}''(A,\bar{a}), \nonumber
\end{align}
for some  $\bar{a} \in [a_{m-1},a_m]$. From which we get the following two inequalities;
\begin{align}
\label{eq:ell_bound_lower}
    & \frac{\underline{\ell}(A,a_m) - \underline{\ell}(A,a_{m-1})}{\delta_m} \leq \underline{\ell}'(A,a_m), \\
    \text{and, } &
\label{eq:ell_bound_upper}
    \frac{\underline{\ell}(A,a_m) - \underline{\ell}(A,a_{m-1})}{\delta_m} \geq \underline{\ell}'(A,a_m) - \frac{\delta_m}{2} \cdot B.
\end{align}
Comparing (\ref{eq:ell'_upper_bound}) with (\ref{eq:ell_bound_lower}) we get the following:
\begin{align}
\label{eq:l'_to_l'bar}
    \ell'(A,a_m) = \frac{\ell(A,a_m) - \ell(A,a_{m-1})}{\delta_m} \leq \underline{\ell}'(A,a_m) + \frac{2\sigma}{\delta_m}.
\end{align}
Hence, we can write,
\begin{align}
    LP(*,a_m) & = \ell(*,a_m) + (1-a_m) \cdot \ell'(*,a_m) \nonumber \\
     \leq & \underline{\ell}(*,a_m) + \sigma + (1-a_m)(\underline{\ell}'(*,a_m)+\frac{2\sigma}{\delta_m}) \nonumber \\
\label{eq:LP_to_LPbar_upper}
    \text{or, } LP(*,a_m) & \leq \underline{LP}(*,a_m) + \sigma + \frac{2\sigma}{\delta_m}.
\end{align}
On the other hand, from the definition of $LP$, (\ref{eq:sigma}) and (\ref{eq:ell_bound_upper}) we have,
\begin{align}
\label{eq:LP_to_LPbar_lower}
    &LP(A,a_m) = \ell(A,a_m) + (1-a_m)\ell'(A,a_m) \nonumber \\
    & \geq \underline{\ell}(A,a_m) - \sigma + (1-a_m) \cdot \left(\frac{\ell(A,a_m)-\ell(A,a_{m-1})}{\delta_m}\right) \nonumber \\
    & \geq \underline{\ell}(A,a_m) - \sigma \nonumber \\
    & \qquad \qquad + (1-a_m) \left(\frac{\underline{\ell}(A,a_m)-\underline{\ell}(A,a_{m-1})-2\sigma}{\delta_m}\right) \nonumber \\
    & \geq \underline{\ell}(A,a_m) - \sigma + (1-a_m)\left(\underline{\ell}'(A,a_m)-\frac{\delta_m}{2}  B - \frac{2\sigma}{\delta_m}\right) \nonumber \\
    & = \underline{\ell}(A,a_m) + (1-a_m)\underline{\ell}'(A,a_m) \nonumber \\
    & \qquad \qquad - \sigma -(1-a_m)\left(\frac{\delta_m}{2}  B + \frac{2\sigma}{\delta_m} \right) \nonumber \\
    & \geq \underline{LP}(A,a_m) - \left(\sigma +  \frac{\delta_m}{2}  B + \frac{2\sigma}{\delta_m} \right).
\end{align}
Now, we are ready to bound the value of $LP(*,a_m)-LP(A,a_m)$ using  (\ref{eq:LP_to_LPbar_upper}) and (\ref{eq:LP_to_LPbar_lower}).
\begin{align}
    & LP(*,a_m)-LP(A,a_m) \nonumber \\
   \leq & \; \underline{LP}(*,a_m) + \sigma + \frac{2\sigma}{\delta_m} - \nonumber \\
  & \qquad  \qquad \underline{LP}(A,a_m) + \sigma + \frac{\delta_m B}{2} + \frac{2\sigma}{\delta_m} \nonumber \\
  = & \underline{LP}(*,a_m) - \underline{LP}(A,a_m) + 2\sigma + \frac{4\sigma}{\delta_m} + \frac{\delta_m}{2}  B \nonumber \\
  \leq & B/2 + 2\sigma + \frac{4\sigma}{\delta_m} + \frac{\delta_m}{2}  B   \;\;\text{[using  (\ref{eq:true_LP_relation})]} \nonumber \\
  \leq & B + 2\sigma + \frac{4\sigma}{\delta}.
\end{align}
This implies that for any algorithm $A$, $LP(A,a_m)$ cannot be less than $LP(*,a_m)$ by a value greater than $B + 2\sigma + \frac{4\sigma}{\delta}$. Thus, if $B + 2\sigma + \frac{4\sigma}{\delta} \leq \Delta$, the optimum algorithm ($*$) is guaranteed to be trained, which proves Lemma 1. \qed








\paragraph{Proof of Theorem 2: } Let $A^\dag$ be the algorithm chosen by FLASH, and let $*$ be the optimum algorithm. We know from Lemma 1 that if $B + 2\sigma + \frac{4\sigma}{\delta} \leq \Delta$, then $*$ will be in the final choice of algorithms alongside $A^\dag$ (when $a=1$). Since $A^\dag$ was chosen by FLASH instead of $*$,
\begin{align}
\label{eq:Adag1_to_A*1}
    & \ell(A^\dag,1)\leq \ell(*,1).
\end{align}
Since, $\underline{\ell}(A^\dag,1) \geq \underline{\ell}(*,1)$, hence we need to bound the value of $\underline{\ell}(A^\dag,1) - \underline{\ell}(*,1)$ to prove Theorem 2.
\begin{align}
\label{eq:theoem2_bound}
    &\underline{\ell}(A^\dag,1) - \underline{\ell}(*,1) \nonumber \\
    = & \underline{\ell}(A^\dag,1) - \ell(A^\dag,1) + \ell(A^\dag,1) - \underline{\ell}(*,1) \nonumber\\
    \leq & \sigma + \ell(A^\dag,1) - \underline{\ell}(*,1) \nonumber \\
    \leq & \sigma + \ell(*,1) - \underline{\ell}(*,1)  \;\; \text{[using (\ref{eq:Adag1_to_A*1})]} \nonumber\\
    \leq & \sigma +\sigma = 2\sigma.
\end{align}
In the calculation above, at the third line from the top, we have bounded $ \underline{\ell}(A^\dag,1) - \ell(A^\dag,1)$ by $\sigma$ which is true for all cases. However, for the case of $RM$ and $LKBM$, since $A^\dag$ is chosen over $*$ in the revalidation step, $\ell(A^\dag,1) \leq \ell(*,1) = \underline{\ell}(*,1)$. Hence, the bound in (\ref{eq:theoem2_bound}) reduces to $\sigma$ from $2\sigma$ for those two FL-HPO methods. \qed







\paragraph{Proof of Theorem 3: } Consider any round $n$ in which an algorithm $A$ is allocated additional data for training. Since $\Delta>B + 2\sigma + \frac{4\sigma}{\delta}$, the optimum algorithm $*$ is included for training in that round as well. We use (\ref{eq:LP_to_LPbar_upper}) and (\ref{eq:LP_to_LPbar_lower}) to get the following upper bound for $\underline{LP}(A,a_n) - \underline{LP}(*,a_m)$,
\begin{align}
\label{eq:LPbar_diff_to_LP_diff}
    & \underline{LP}(A,a_n) - \underline{LP}(*,a_m) \nonumber \\
    \leq & LP(A,a_n) + \sigma + \frac{\delta_n B}{2} + \frac{2\sigma}{\delta_n} - LP(*,a_m) + \sigma + \frac{2\sigma}{\delta_m} \nonumber \\
    = & LP(A,a_n) - LP(*,a_m) + 2\sigma + \frac{2\sigma}{\delta_n} + \frac{2\sigma}{\delta_m}  + \frac{\delta_n B}{2} \nonumber \\
    \leq & LP(A,a_n) - LP(*,a_m) + 2\sigma + \frac{4\sigma}{\delta}  + \frac{ B}{2}.
\end{align}



Let $O$ be the algorithm with the best $LP$ in that round ($n$). Let $n,m$ and $p$ be the last rounds in which the $LP$ values of $A,*$ and $O$, respectively, have been updated ($n,m,p \leq M$). Then we have,
\begin{align}
& 0 \leq LP(*,a_m) - LP(O,a_p) \leq \Delta \nonumber \\
  \text{and, } & 0 \leq LP(A,a_n) - LP(O,a_p) \leq \Delta. \nonumber
\end{align}
From above two inequalities we have, 
\begin{align}
\label{eq:LP_A_an_to_LP_*_am}
& LP(A,a_n) - LP(*,a_m) \leq \Delta.
\end{align}
Then using (\ref{eq_loss1}) and (\ref{eq_loss2}) we get,
\begin{align}
\label{eq:LPbar_Aan_to_LPbar_*am_bound}
    & \underline{\ell}(A,1) - \underline{\ell}(*,1) =  \epsilon_A = \underline{LP}(A,a_n) - \underline{LP}(*,a_m) \nonumber \\
& \;\;\; + \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n) - \frac{1}{2}(1-a_m)^2 \cdot \underline{\ell}''(*,\bar{a}_m) \nonumber \\
\text{or,\; }  & \underline{LP}(A,a_n) - \underline{LP}(*,a_m) \nonumber \\
& \qquad \qquad \geq  \epsilon_A - \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n).
\end{align}
Now using (\ref{eq:LPbar_diff_to_LP_diff}) and (\ref{eq:LPbar_Aan_to_LPbar_*am_bound}), we get,
\begin{align}
\label{eq:LP_Aan_to_LP_*am_diff_bound}
    & LP(A,a_n) - LP(*,a_m) + 2\sigma + \frac{4\sigma}{\delta}  + \frac{ B}{2} \nonumber \\
    & \qquad \qquad \geq  \epsilon_A - \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n) \nonumber \\
    \text{or, } & LP(A,a_n) - LP(*,a_m)  \geq \epsilon_A -  2\sigma - \frac{4\sigma}{\delta}  \nonumber\\
    & \qquad\qquad\qquad - \frac{ B}{2} - \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n).
\end{align}
To finalize the proof, we use  (\ref{eq:LP_A_an_to_LP_*_am}) and (\ref{eq:LP_Aan_to_LP_*am_diff_bound}) to get,
\begin{align}
\label{eq:training_cost_bound}
    & \Delta \geq \epsilon_A -  2\sigma - \frac{ B}{2} - \frac{4\sigma}{\delta}  \nonumber\\
    & \qquad\qquad  - \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n) \nonumber \\
    \text{or, }& \frac{1}{2}(1-a_n)^2 \cdot \underline{\ell}''(A,\bar{a}_n) \nonumber \\
    & \qquad \geq \epsilon_A  - 2\sigma - \frac{ B}{2} -  \frac{4\sigma}{\delta} -  \Delta \nonumber \\
     \text{or, } & (1-a_n)^2 \geq \frac{\epsilon_A  - 2\sigma - \frac{ B}{2} -  \frac{4\sigma}{\delta} -  \Delta }{B/2} \nonumber \\
      \text{or, } & a_n \leq 1 - \sqrt{\frac{\epsilon_A  - 2\sigma - \frac{ B}{2} -  \frac{4\sigma}{\delta} -  \Delta }{B/2}}.
\end{align}
Therefore, algorithm $A$ is not allocated a training-data fraction greater than $1 - \sqrt{\frac{\epsilon_A  - 2\sigma - \frac{ B}{2} -  \frac{4\sigma}{\delta} -  \Delta }{B/2}}$ in round $n$. The result in Theorem 3 easily follows from this. \qed









\paragraph{Preliminaries for Proof of Theorem 4: } To prove Theorem 4, we define a new loss function ($\tilde{\ell}(A_{\bm{\lambda}}^{(j)},\hat{\mathcal{D}})$) as,
\begin{align}
    & \tilde{\ell}(A_{\bm{\lambda}}^{(j)},\hat{\mathcal{D}}) =  \sum_i \alpha_i\,\mathcal{L}(\mathcal{F}(A^{(j)}_{\bm{\lambda}}, \cup_i \hat{\mathcal{D}}_i), \hat{\mathcal{D}}_i). \nonumber \\
    \text{Hence,\;} &  \hat{\ell}(A^{(j)},\hat{\mathcal{D}}) = \min_{\bm{\lambda} \in \bm{\Lambda}^{j}} \tilde{\ell}(A_{\bm{\lambda}}^{(j)},\hat{\mathcal{D}}) \nonumber,
\end{align}
\noindent where $\hat{\ell}(A_{\bm{\lambda}}^{(j)},\hat{\mathcal{D}})$ is as defined in the Theoretical analysis section. We define the loss rate computed by FLASH for algorithm $A^{(j)}$ and HP ${\bm{\lambda}}$ on dataset $\mathcal{D}^a$ as $\ell(A_{\bm{\lambda}}^{(j)},\mathcal{D}^a)$. Finally, for some Algorithm $A$ and HP ${\bm{\lambda}}$ we make the following assumptions to prove Theorem 4,
\begin{align}
\label{eq:assmp1}
   & |\tilde{\ell}(A_{\bm{\lambda}},\hat{\mathcal{D}}_1) - \tilde{\ell}(A_{\bm{\lambda}},\hat{\mathcal{D}}_2)| \leq \beta'\nu (\hat{\mathcal{D}}_1,\hat{\mathcal{D}}_2). \\
   \label{eq:assmp2}
 & |{\bm{\lambda}}(\hat{\mathcal{D}}_1)-{\bm{\lambda}}(\hat{\mathcal{D}}_2)| \leq \beta''\nu (\hat{\mathcal{D}}_1,\hat{\mathcal{D}}_2). \\
 \label{eq:assmp3}
 & |\tilde{\ell}(A_{{\bm{\lambda}}_1},\hat{\mathcal{D}}) - \tilde{\ell}(A_{{\bm{\lambda}}_2},\hat{\mathcal{D}})| \leq \beta'''|{\bm{\lambda}}_1 - {\bm{\lambda}}_2|.
\end{align}
In (\ref{eq:assmp3}), ${\bm{\lambda}}_1$ and ${\bm{\lambda}}_2$ are two HP settings of Algorithm $A$. For two different datasets $\hat{\mathcal{D}}_1$ and $\hat{\mathcal{D}}_2$ we use (\ref{eq:assmp1}), (\ref{eq:assmp2}) and (\ref{eq:assmp3}) to derive another inequality which will be helpful in the proof of Theorem 4.
\begin{align}
\label{eq:data_dissimilarity_bound}
    & |\hat{\ell}(A,\hat{\mathcal{D}}_1) - \hat{\ell}(A,\hat{\mathcal{D}}_2)| \nonumber \\
    = & |\tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}_1)},\hat{\mathcal{D}}_1) -  \tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}_2)},\hat{\mathcal{D}}_2)| \nonumber \\
     \leq & |\tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}_1)},\hat{\mathcal{D}}_1)- \tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}_1)},\hat{\mathcal{D}}_2)| \nonumber \\
    & \;\; \qquad + |\tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}_1)},\hat{\mathcal{D}}_2) - \tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}_2)},\hat{\mathcal{D}}_2)| \nonumber \\
    \leq  &\beta' \nu(\hat{\mathcal{D}}_1,\hat{\mathcal{D}}_2) + \beta'''|{\bm{\lambda}}^*(\hat{\mathcal{D}}_1)-{\bm{\lambda}}^*(\hat{\mathcal{D}}_2)| \nonumber \\
    \leq  & \beta' \nu(\hat{\mathcal{D}}_1,\hat{\mathcal{D}}_2) + \beta'''\beta''\nu(\hat{\mathcal{D}}_1,\hat{\mathcal{D}}_2) \nonumber \\
     =  & \beta \nu(\hat{\mathcal{D}}_1,\hat{\mathcal{D}}_2),
\end{align}
where $\beta = \beta' + \beta''\beta'''$.





\paragraph{Proof of Theorem 4:} For any algorithm $A$ and dataset $\mathcal{D}^a$, let ${\bm{\lambda}}^\dag$ be the HP setting chosen by FLASH and ${\bm{\lambda}}^*$ be the optimum HP setting that minimizes $\tilde{\ell}$. Then from the definition of $\sigma$ we have,
\begin{align}
    &\sigma \geq |\ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) - \underline{\ell}(A,a)| \nonumber \\
    & = |\ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) - \mathbb{E}_{\hat{\mathcal{D}}^a \in \bm{\hat{\mathcal{D}}}^a} \min_{{\bm{\lambda}}} \hat{\ell}(A_{\bm{\lambda}},\hat{\mathcal{D}}^a)| \nonumber \\
    & =  |\ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) - \mathbb{E}_{\hat{\mathcal{D}}^a \in \bm{\hat{\mathcal{D}}}^a} \;\tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}^a)},\hat{\mathcal{D}}^a)| \nonumber \\
    & \leq |\ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) -\tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) | \nonumber \\
    & \qquad + |\tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) - \mathbb{E}_{\hat{\mathcal{D}}^a \in \bm{\hat{\mathcal{D}}}^a} \; \tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}^a)},\hat{\mathcal{D}}^a)| \nonumber \\
    & = |X| + |Y|, \nonumber  
\end{align}
where we denote the first term inside $|\boldsymbol{\cdot}|$ by $X$ and the second by $Y$. We bound $X$ and $Y$ individually, since $X$ depends on the FL-HPO variant used (as we will observe), whereas $Y$ is independent of that. Since $Y$ is common to all HPO-aggregation variants, we start by bounding the value of $Y$.

\noindent \underline{Bounding $|Y|$:}
For any specific $\hat{\mathcal{D}}^a$ we define,
\begin{align}
        \hat{y}(\hat{\mathcal{D}}^a) & = \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) - \tilde{\ell}(A_{{\bm{\lambda}}^*(\hat{\mathcal{D}}^a)},\hat{\mathcal{D}}^a) \nonumber \\
    & = \hat{\ell}(A,\mathcal{D}^a) - \hat{\ell}(A,\hat{\mathcal{D}}^a). \nonumber
\end{align}
From (\ref{eq:data_dissimilarity_bound}), $| \hat{\ell}(A,\mathcal{D}^a) - \hat{\ell}(A,\hat{\mathcal{D}}^a)| \leq \beta \nu (\mathcal{D}^a,\hat{\mathcal{D}}^a)$, we get,
\begin{align}
    & - \beta \nu (\mathcal{D}^a,\hat{\mathcal{D}}^a) \leq \hat{y}(\hat{\mathcal{D}}^a) \leq \beta \nu (\mathcal{D}^a,\hat{\mathcal{D}}^a). \nonumber
\end{align}
Taking expectation with respect to $\hat{\mathcal{D}}^a$ on all sides,
\begin{align}
    & - \beta \mathbb{E}_{\hat{\mathcal{D}}^a} \nu (\mathcal{D}^a,\hat{\mathcal{D}}^a) \leq \mathbb{E}_{\hat{\mathcal{D}}^a} \hat{y}(\hat{\mathcal{D}}^a) \leq \beta \mathbb{E}_{\hat{\mathcal{D}}^a} \nu (\mathcal{D}^a,\hat{\mathcal{D}}^a) \nonumber \\
    \therefore \;\;&  |Y| \leq  \beta |\mathbb{E}_{\hat{\mathcal{D}}^a} \nu (\mathcal{D}^a,\hat{\mathcal{D}}^a)| .
\end{align}

Note that the distance function $\nu (\mathcal{D}^a,\hat{\mathcal{D}}^a) = f(\hat{\mathcal{D}}^a)$ is convex in $\hat{\mathcal{D}}^a$, with bounded convexity in the support of $\mathcal{D}^a$ ($|\nabla^2 f| \leq \hat{\beta}$, for some constant $\hat{\beta}$). Also, we denote the variance of $\hat{\mathcal{D}}^a$ as $V(\hat{\mathcal{D}}^a)$ with an upper bound of $\sigma^2$. Then we have,
\begin{align}
    & \mathbb{E}_{\hat{\mathcal{D}}^a} \nu (\mathcal{D}^a,\hat{\mathcal{D}}^a) = \mathbb{E}_{\hat{\mathcal{D}}^a} f(\hat{\mathcal{D}}^a) \nonumber \\
    & \leq f(\mathbb{E}(\hat{\mathcal{D}}^a)) + \hat{\beta}V(\hat{\mathcal{D}}^a)
    \leq \nu(\mathcal{D}^a,\underline{\mathcal{D}}^a) + \hat{\beta}\sigma^2 \nonumber
\end{align}
Hence, 
\begin{align}
\label{eq:bound_of_Y}
    |Y| \leq  \beta|\nu(\mathcal{D}^a,\underline{\mathcal{D}}^a)+\hat{\beta}\sigma^2| = \beta\nu(\mathcal{D}^a,\underline{\mathcal{D}}^a) + \mu,
\end{align} 
where $\mu = \beta \hat{\beta} \sigma^2$. Equation (\ref{eq:bound_of_Y}) gives us the bound on $|Y|$. We now proceed to bound $|X|$; since this bound depends on the FL-HPO variant used, we consider each variant separately.





\noindent \underline{Bounding $|X|$ for $LBM$:}
For $LBM$ we know that ${\bm{\lambda}}^\dag=\sum_i \alpha_i {\bm{\lambda}}_i^\dag$, where $\sum_i\alpha_i=1 $ and ${\bm{\lambda}}_i^\dag$ is the best HP setting found by HPO on client $i$'s data. Also, $\tilde{\ell}(A_{{\bm{\lambda}}_i^\dag},\mathcal{D}_i^a) = \tilde{\ell}(A_{{\bm{\lambda}}_i^*(\mathcal{D}_i^a)},\mathcal{D}_i^a)$, where ${\bm{\lambda}}_i^*(\mathcal{D}_i^a)$ is the HP setting optimized on $\mathcal{D}_i^a$ data of client $i$. Then we can write,
\begin{align}
\label{eq:ell_ldag_to_tilde_ell_l*}
    &\ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) =  \sum_i \alpha_i \tilde{\ell}(A_{{\bm{\lambda}}_i^\dag},\mathcal{D}_i^a) \nonumber \\
     = & \sum_i \alpha_i \tilde{\ell}(A_{{\bm{\lambda}}_i^*(\mathcal{D}_i^a)},\mathcal{D}_i^a) \leq  \sum_i \alpha_i \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}_i^a)  \nonumber \\
     = &  \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a).
\end{align}

%From the definition of $\lambda^*(\mathcal{D}^a)$, is not it always true that $\ell(A_{\lambda^\dag},\mathcal{D}^a) > \tilde{\ell}(A_{\lambda^*(\mathcal{D}^a)},\mathcal{D}^a)$
Hence, we have proved that $\tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) \geq \ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a)$. Now, under the  reasonable assumption that  $\tilde{\ell}(A_{{\bm{\lambda}}},\mathcal{D}^a)$ is convex in ${\bm{\lambda}}$, we can write,
\begin{align}
\label{eq:ell_l_dag_to_ell_tilde_l*}
    \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) \leq \tilde{\ell}(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) \leq \sum_i \alpha_i \tilde{\ell}(A_{{\bm{\lambda}}^\dag_i},\mathcal{D}^a).
\end{align}
Also, since $ \ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) = \sum_i \alpha_i \tilde{\ell}(A_{{\bm{\lambda}}_i^\dag},\mathcal{D}_i^a)$, using (\ref{eq:ell_l_dag_to_ell_tilde_l*}), we can write,
\begin{align}
\label{eq:LBM_X_bound}
    & \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) - \ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a)  \nonumber \\
    & \leq \sum_i \alpha_i \left(\tilde{\ell}(A_{{\bm{\lambda}}^\dag_i},\mathcal{D}^a)- \tilde{\ell}(A_{{\bm{\lambda}}^\dag_i},\mathcal{D}^a_i) \right) \nonumber \\
    & \leq \beta' \sum_i \alpha_i \nu(\mathcal{D}_i^a,\mathcal{D}^a).
\end{align}
Equation (\ref{eq:LBM_X_bound}) upper bounds the value of $\tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) - \ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a)$, which is also shown to have lower bound of $0$ (from (\ref{eq:ell_ldag_to_tilde_ell_l*})). Hence, $|X| \leq \beta' \sum_i \alpha_i \nu(\mathcal{D}_i^a,\mathcal{D}^a)$, and combining this with (\ref{eq:bound_of_Y}), we get the bound for the $LBM$ case in Theorem 4.







\noindent \underline{Bounding $|X|$ for $LKBM$:}
For $LKBM$ we know the HP setting chosen by FLASH is ${\bm{\lambda}}^\dag = {\bm{\lambda}}_{i'}^*(\mathcal{D}_{i'}^a)$, where ${\bm{\lambda}}_{i'}^*(\mathcal{D}_{i'}^a)$ is the HP setting, found by performing HPO at client $i'$, that gives the lowest average loss over all clients. Since there is a re-validation process in $LKBM$, we can directly say that $X = \ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) -\tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) \geq 0$, which means that at some iteration the loss calculated by $LKBM$ cannot be less than the optimized loss calculation overall (which could have happened in $LBM$).


Now for any client $i \ne i'$ we have,
\begin{align}
    &\tilde{\ell}(A_{{\bm{\lambda}}^\dag},\mathcal{D}_i^a) \nonumber\\
    \leq & \tilde{\ell}(A_{{\bm{\lambda}}^\dag},\mathcal{D}_{i'}^a) + \beta' \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a) \;\; \text{[using (\ref{eq:assmp1})]} \nonumber \\
    \leq & \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}_{i'}^a) + \beta' \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a) \nonumber \\
    \leq & \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}_{i'}^a) + \beta' \max_{i \ne i'} \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a). \nonumber
\end{align}
Hence,
\begin{align}
\label{eq:LKBM_X_1}
    & \ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) = \sum_i \alpha_i \tilde{\ell}(A_{{\bm{\lambda}}^\dag},\mathcal{D}_i^a) \nonumber \\
    & \qquad \leq  \sum_i \alpha_i \left( \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}_{i'}^a) + \beta' \max_{i \ne i'} \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a) \right) \nonumber \\
    & \qquad = \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}_{i'}^a) + \beta' \max_{i \ne i'} \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a).
\end{align}
The last line comes from the fact that $\sum_i \alpha_i =1$. On the other hand, using (\ref{eq:assmp1}) again we have,
\begin{align}
\label{eq:LKBM_X_2}
    & \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) \nonumber \\
    \geq & \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a_{i'}) - \beta' \nu (\mathcal{D}^a, \mathcal{D}_{i'}^a) \nonumber \\
    \geq & \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a_{i'}) - \beta' \max_{i \ne i'} \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a) .
\end{align}
Combining (\ref{eq:LKBM_X_1}) and (\ref{eq:LKBM_X_2}) we get,
\begin{align}
    \ell(A_{{\bm{\lambda}}^\dag},\mathcal{D}^a) - \tilde{\ell}(A_{{\bm{\lambda}}^*(\mathcal{D}^a)},\mathcal{D}^a) \nonumber  \\
    = X \leq 2\beta' \max_{i \ne i'} \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a). \nonumber
\end{align}
Combining this with the fact that $X \geq 0$ (which we already argued), we have $|X| \leq 2\beta' \max_{i \ne i'} \nu(\mathcal{D}_i^a, \mathcal{D}_{i'}^a)$. Then adding this bound of $|X|$ with (\ref{eq:bound_of_Y}), we get the bound for the $LKBM$ case in Theorem 4.




\noindent \underline{Bounding $|X|$ for $RM$:} $RM$ is very similar to FLoRA which is analyzed in \cite{flora}. The same analysis as in Theorem 4.5 of \cite{flora} applied to our case gives,
\begin{align}
    |X| \leq \beta_3 \sum_i \alpha_i \left[\nu(\mathcal{D}^a_i,\mathcal{D}^a) + \gamma \min_{k \in [K]} d_j ({\bm{\lambda}}, {\bm{\lambda}}_k)\right],
\end{align}
for algorithm $A^{(j)}$. Combining this with (\ref{eq:bound_of_Y}), we get the bound for the $RM$ case in Theorem 4.
\qed




Note that in Theorem 4, $\sigma$ is taken to be the max of the upper bounds $\sigma(a)$ over different values of the fraction $a$ used by FLASH, namely $a \in \{a_0, \dots, a_m\}$. When the dataset $\mathcal{D}$ is sufficiently large, then for any $a \in \{a_0, \dots, a_m\}$, the datasets $\mathcal{D}^a$ can be assumed to have a distribution that is similar (very close) to that of $\mathcal{D}$. In that case, $\mathcal{D}^a$ and $\underline{\mathcal{D}}^a$ in the bound can both be replaced (closely approximated) by $\mathcal{D}$. This implies $\nu(\mathcal{D}^a, \underline{\mathcal{D}}^a) \approx 0$.
Therefore, for large datasets $\mathcal{D}$, the loss calculation error bound $\sigma$ is well approximated as 
\[
\hat{\sigma} = \mu +
\begin{cases}
\beta_1 \sum_i \alpha_i \, \nu(\mathcal{D}_i, \mathcal{D}) & (LBM)\\
\beta_2 \max_{i,i'} \nu(\mathcal{D}_i, \mathcal{D}_{i'}) & (LKBM)\\
\beta_3 \sum_i \alpha_i \, \nu(\mathcal{D}_i, \mathcal{D}) + \gamma \bar{D} & (RM)
\end{cases}
\]
which is only in terms of the full training dataset $\mathcal{D}$.








\bibliography{reference}


\end{document}
