% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams



\usepackage{xr-hyper}

\makeatletter
% \addFileDependency{<file>}: register <file> (name including its extension)
% as a dependency of this document.
%   * \typeout{(<file>)} writes the name, wrapped in parentheses, to the
%     terminal and log (presumably so that log-scanning build tools such as
%     latexmk pick it up as an input file -- confirm against the build setup).
%   * \@addtofilelist hands the name to the LaTeX kernel's file-list
%     bookkeeping (the list reported by \listfiles).
%   * If the file cannot be found, a "No file <file>." notice is written.
% Every line of the body ends in a comment character so that no space tokens
% from line ends enter the replacement text; the macro is therefore also safe
% to call in horizontal mode (e.g. inside the document body).
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}: wrapper around \externaldocument that also
% registers the external document's source and auxiliary files as dependencies
% of this document via \addFileDependency.
% The argument is the file name WITHOUT extension; ".tex" and ".aux" are
% appended here.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}% hand the external document to \externaldocument
    \addFileDependency{#1.tex}% register the external source file
    \addFileDependency{#1.aux}% register the external auxiliary file
}


\myexternaldocument{dai_226-supp}



% if you need to pass options to natbib, use, e.g.:
%     \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2021

% ready for submission
% \usepackage{neurips_2021}

% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
%     \usepackage[preprint]{neurips_2021}

% to compile a camera-ready version, add the [final] option, e.g.:
%     \usepackage[final]{neurips_2021}

% to avoid loading the natbib package, add option nonatbib:
%    \usepackage[nonatbib]{neurips_2021}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
% \usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors



% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
%\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

\usepackage{commath}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bbm}
% \usepackage[english]{babel}
% \usepackage{graphicx}
 \usepackage{subcaption}
% \usepackage{subfig}
%\usepackage{graphicx}

\usepackage[toc,page]{appendix}

\usepackage{algorithm}
\usepackage{algorithmic}

\allowdisplaybreaks

%\newtheorem{theorem}{Theorem}[section]
% Theorem-like environments. None of the active declarations below passes a
% shared-counter or parent-counter optional argument, so every environment is
% numbered independently and document-wide (Theorem 1, Lemma 1, ...).
% The commented-out lines are the alternative shared / per-section /
% per-theorem numbering variants.
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
%\newtheorem{corollary}{Corollary}[theorem]
%\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{lemma}{Lemma}
%\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}


%\usepackage{hyperref}

%\newcommand{\theHalgorithm}{\arabic{algorithm}}





%\title{Robust Meta-Bayesian Optimization with Online Regret Minimization}
%\title{No-Regret Meta-Bayesian Optimization with Online Regret Minimization}
%\title{Provably Robust Meta-Bayesian Optimization}
\title{On Provably Robust Meta-Bayesian Optimization}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:dzx@nus.edu.sg?Subject=Your UAI 2022 paper}{Zhongxiang Dai}{}}
\author[1]{Yizhou Chen}
\author[2]{Haibin Yu}
\author[1]{Bryan Kian Hsiang Low}
\author[3]{Patrick Jaillet}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
Department of Computer Science, National University of Singapore, Republic of Singapore
}
\affil[2]{%
Department of Data Platform, Tencent
}
\affil[3]{%
Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, USA
}
  
  \begin{document}
\maketitle


\begin{abstract}
\emph{Bayesian optimization} (BO) has become popular for sequential optimization of black-box functions. When BO is used to optimize a target function, we often have access to previous evaluations of potentially related functions. This begs the question as to whether we can leverage these previous experiences to accelerate the current BO task through \emph{meta-learning} (meta-BO), while ensuring \emph{robustness} against potentially harmful dissimilar tasks that could sabotage the convergence of BO. This paper introduces two scalable and provably robust meta-BO algorithms: \emph{robust meta-Gaussian process-upper confidence bound} (RM-GP-UCB) and \emph{RM-GP-Thompson sampling} (RM-GP-TS). We prove that both algorithms are asymptotically no-regret even when some or all previous tasks are dissimilar to the current task, and show that RM-GP-UCB enjoys a better theoretical robustness than RM-GP-TS. We also exploit the theoretical guarantees to optimize the weights assigned to individual previous tasks through regret minimization via online learning, which diminishes the impact of dissimilar tasks and hence further enhances the robustness. Empirical evaluations show that (a) RM-GP-UCB performs effectively and consistently across various applications, and (b) RM-GP-TS, despite being less robust than RM-GP-UCB both in theory and in practice, performs competitively in some scenarios with fewer dissimilar tasks and is more computationally efficient. 
%The theoretical and empirical comparisons between RM-GP-UCB and RM-GP-TS may provide useful insights for meta-BO in general and potentially other meta-learning algorithms.
\end{abstract}

%\begin{abstract}
%\emph{Bayesian optimization} (BO) has become popular for sequential optimization of black-box functions. When BO is used to optimize a target function, we often have access to some previous evaluations of potentially related functions. This begs the question as to whether we can leverage these previous experiences to accelerate the current BO task through \emph{meta-learning} (meta-BO), while ensuring \emph{robustness} against potentially harmful dissimilar tasks that could sabotage the convergence of BO. This paper introduces a scalable, principled and robust meta-BO algorithm called \emph{robust meta-Gaussian process-upper confidence bound} (RM-GP-UCB). RM-GP-UCB utilizes a weighted combination of separate acquisition functions from individual tasks for query selection, hence achieving scalability in the number of previous tasks and observations for each previous task. We derive a robust theoretical convergence guarantee for RM-GP-UCB and show that it is asymptotically no-regret even when some or all previous tasks are dissimilar to the current task. Moreover, the theoretical guarantee allows RM-GP-UCB to optimize the weights assigned to the individual previous tasks, hence diminishing the impact of dissimilar tasks, in a principled way through regret minimization via online learning. Empirical evaluation shows that RM-GP-UCB performs effectively and consistently across various applications.
%\end{abstract}


\section{Introduction}
\label{sec:intro}
%\vspace{-1mm}
%Nowadays, the design choice of the parameters of complex systems has become an increasingly prevalent problem. 
%For example, the choice of the hyperparameters for modern \emph{machine learning} (ML) models has been a frequently encountered issue by ML practitioners~\citep{snoek2012practical}.
\emph{Bayesian optimization} (BO) has recently gained immense popularity as an efficient method to optimize black-box functions~\citep{shahriari2016taking},
%such as hyperparameters tuning of \emph{machine learning} (ML) models~\citep{shahriari2016taking}.
%Due to its impressive sample efficiency, 
and it has found success in a variety of applications such as 
automated \emph{machine learning} (ML)~\citep{snoek2012practical}, \emph{reinforcement learning} (RL)~\citep{wilson2014using}, among others.
BO uses a \emph{Gaussian process} (GP)~\citep{rasmussen2004gaussian} as a surrogate to represent the belief about the objective function and, in each iteration, 
queries the input parameters that maximize an \emph{acquisition function}.
In particular, the BO algorithms based on the \emph{GP-upper confidence bound} (GP-UCB)~\citep{srinivas2009gaussian} and \emph{GP-Thompson sampling} (GP-TS)~\citep{chowdhury2017kernelized} acquisition functions have been shown to be asymptotically \emph{no-regret} and perform competitively in practice.
When using BO to optimize a \emph{target function}, we sometimes have access to a set of evaluations of some potentially related functions.
For example, when using BO for hyperparameter optimization of an ML model trained on a target dataset, 
we often have access to some previously completed BO tasks using other potentially related datasets~\citep{golovin2017google}.
These previous tasks, if similar to the current task, may be exploited to accelerate the current BO task.
However, if some (or even all) previous tasks are in fact dissimilar to the current task, their use may turn out to 
incorporate harmful information and 
% hence 
sabotage the convergence of BO~\citep{feurer2018scalable}.
%sabotage the convergence of BO~\citep{feurer2018scalable}.
This begs the question as to whether we can leverage 
% these 
previous tasks to improve the efficiency of the current BO task, while
ensuring \emph{robustness} against 
% these 
harmful dissimilar tasks such that they do not affect the trademark \emph{no-regret} convergence of BO. 
% and (b) we can identify them and thus diminish their impact in a principled manner.
%This begs the question as to whether we can leverage these previous tasks to improve the current BO task, while ensuring that these adverse dissimilar tasks 
%(a) would not affect the no-regret convergence of BO and (b) can be identified and thus diminished.

Exploiting previous learning experiences to improve the efficiency of the current task is the goal of \emph{meta-learning}~\citep{vanschoren2018meta}.
Meta-learning is a broad field with various applications in supervised learning~\citep{finn2017model}, 
RL~\citep{xu2018meta}, active learning~\citep{pang2018meta}, among others. 
The major challenges in meta-learning include (a) the transfer of information from previous tasks to the current task, and 
(b) characterization of task similarity which is crucial for identifying harmful dissimilar tasks~\citep{vanschoren2018meta}.
%\footnote{Refer to~\citep{vanschoren2018meta} for an in-depth discussion on meta-learning.}.
The application of meta-learning to BO (or \emph{meta-BO}) has been explored by previous studies which differ in how these two challenges are addressed.
Some works, such as multitask BO~\citep{swersky2013multi}, transfer the information from previous tasks by building a joint GP surrogate using the observations from all previous and current tasks,
with the task similarity either represented by meta-features~\citep{bardenet2013collaborative,yogatama2014efficient} or learned from observations~\citep{swersky2013multi,wang2018regret}.
%They either represent task similarity through \emph{meta-features} (i.e., for characterizing properties of different tasks)~\citep{bardenet2013collaborative,yogatama2014efficient} or 
%learn task similarity from the observations (e.g., multitask BO)~\citep{golovin2017google,swersky2013multi}.
These works, however, are limited by the scalability of GP due to including all previous and current observations in a single GP~\citep{feurer2018scalable}.\footnote{Some works such as~\citet{perrone2018scalable} and~\citet{volpp2020meta} replace GP by other surrogate models such as neural networks for scalability; however, they lack the principled uncertainty estimate and theoretical guarantee offered by GP.}
To this end, other recent works transfer information from previous tasks using a more scalable approach:
They build a separate GP surrogate for each individual task and use a weighted combination of either the individual surrogate functions 
or acquisition functions for query selection~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}. 
%However, these works on scalable meta-BO attempt to identify dissimilar previous tasks using heuristics,
%and do not provide a theoretical performance guarantee to ensure robust performances in the presence of harmful dissimilar tasks.
A more detailed review of related works is presented in Sec.~\ref{sec:related_works}.
However, none of the previous works has provided a theoretical performance guarantee to ensure robust performances in the presence of harmful dissimilar tasks.
A robust theoretical guarantee is important for guaranteeing the consistent performances of meta-BO algorithms in various real-world applications, which is crucial for their practical deployment.
%However, these works on scalable meta-BO do not provide a theoretical performance guarantee to ensure robust performances in the presence of harmful dissimilar tasks,
%and use heuristics to attempt to identify these dissimilar tasks.
%%In these methods, the task similarity (i.e., the weight assigned to a previous task) is calculated based on the heuristic of
%%how accurately the surrogate function of the previous task ranks each pair of observations of the target function.
%%However, although equipped with the ability to identify dissimilar tasks, these works on scalable meta-BO do not provide a theoretical performance guarantee
%%to ensure competitive performances in the presence of dissimilar tasks.
%%A theoretical guarantee that is robust against dissimilar previous tasks 
%%is crucial for the widespread use of meta-BO since in many real-world scenarios, some (or even all) previous tasks may be severely dissimilar to the target task.
%If not handled in a principled manner, these dissimilar 
%%previous 
%tasks might mislead the current BO algorithm by providing poisonous information, 
%consequently resulting in a suboptimal convergence.
%%performance compared with standard BO.


To this end, this paper introduces two scalable and provably robust meta-BO algorithms: \emph{robust meta-GP-upper confidence bound} (RM-GP-UCB) and \emph{robust meta-GP-Thompson sampling} (RM-GP-TS).
Both algorithms compute the acquisition function (GP-UCB or GP-TS) for each individual task 
% (including the previous and current tasks) 
and select the next query via either a weighted combination (RM-GP-UCB) or in a probabilistic way (RM-GP-TS) (Sec.~\ref{sec:om_gp_ucb}).
% employ a weighted combination of these acquisition functions to select the next query (Sec.~\ref{sec:om_gp_ucb}).
% Both algorithms compute the acquisition function (GP-UCB or GP-TS) for each individual task (including the previous and current tasks) and employ a weighted combination of these acquisition functions to select the next query (Sec.~\ref{sec:om_gp_ucb}).
As a result, like the works of~\citet{feurer2018scalable,wistuba2016two,wistuba2018scalable}, a separate GP surrogate is built for each previous task, making our algorithms scale well in the number of meta-tasks and observations in each meta-task.
Our major contributions include:
\textbf{Firstly}, we prove robust theoretical convergence guarantees for both RM-GP-UCB and RM-GP-TS (Sec.~\ref{sec:theoretical_analysis}). In particular, both algorithms are asymptotically \emph{no-regret} for \emph{any} given set of previous tasks, i.e., even if some or all previous tasks are dissimilar to the target task. Moreover, we show that RM-GP-UCB enjoys a superior robustness guarantee compared with RM-GP-TS (Sec.~\ref{subsec:theory:ts}).
\textbf{Secondly}, to further enhance our robustness against dissimilar tasks, we exploit the theoretical guarantees to learn the task similarity (and hence identify dissimilar tasks) in a principled way, 
by minimizing the regret upper bounds via a computationally cheap online learning algorithm known as \emph{Follow-The-Regularized-Leader} (Sec.~\ref{sec:online_regret_minimization}).
\textbf{Lastly}, we use extensive empirical evaluations to show that: RM-GP-UCB performs effectively and consistently across a wide range of tasks; RM-GP-TS, despite under-performing in adverse scenarios (i.e., when a large number of previous tasks are dissimilar), performs competitively in some favorable cases with fewer dissimilar tasks and is much more computationally efficient.
Of note, our theoretical and empirical comparisons between RM-GP-UCB and RM-GP-TS may provide useful insights for other meta-BO algorithms in general (and potentially for other related algorithms such as meta-RL) in terms of the relative strengths and weaknesses of UCB- and TS-based meta-learning algorithms.
% \end{itemize}

% This paper presents a scalable, principled and robust meta-BO algorithm: \emph{robust meta-GP-upper confidence bound} (RM-GP-UCB).
% % (Section~\ref{sec:om_gp_ucb}).

% Like the works of~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}, we transfer information from previous tasks by combining individual acquisition functions.
% In particular, we compute the GP-UCB acquisition function for each individual task (including the previous and current tasks)  
% and employ a weighted combination of these acquisition functions to select the next query. 
% As a result, a separate GP surrogate is built for each previous task, making RM-GP-UCB scale well in the number of meta-tasks and observations in each meta-task.

% In stark contrast to the works of~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}, 
% RM-GP-UCB achieves \emph{principled robustness} against harmful dissimilar tasks as a result of our two major contributions:
% RM-GP-UCB (a) is endowed with a theoretical convergence guarantee that is robust against dissimilar previous tasks, % (Section~\ref{sec:theoretical_analysis}), 
% and (b) learns the task similarity to identify dissimilar tasks in a principled way through online learning. % (Section~\ref{sec:online_regret_minimization}).
% %On the other hand, OM-GP-UCB is superior to the works of~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable} in two major aspects:
% %%Unlike the works of~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}, 
% %OM-GP-UCB (a) is endowed with a theoretical performance guarantee that is robust against dissimilar previous tasks, % (Section~\ref{sec:theoretical_analysis}), 
% %and (b) learns the task similarity \emph{in a principled way} through online learning. % (Section~\ref{sec:online_regret_minimization}).

% Firstly, we derive an upper bound on the regret of RM-GP-UCB and show that RM-GP-UCB is asymptotically no-regret for \emph{any} given set of previous tasks, i.e., without requiring assumptions on the similarity between the previous and current tasks (Section~\ref{sec:theoretical_analysis}).
% %This allows us to guarantee consistently competent performance of RM-GP-UCB in a wide range of applications, 
% %even when some or all previous tasks are significantly dissimilar to the target task.
% %Of note, to achieve this robust convergence guarantee, we do not impose assumptions on the similarity between the previous and current tasks,
% %and hence do not intend to obtain a better regret upper bound than GP-UCB.
% %%show a faster convergence than GP-UCB.
% %However, we also use our theoretical analysis to show that when the meta-tasks are similar to the target task, they can help RM-GP-UCB converge faster than GP-UCB by accelerating the exploration process.
% This allows us to guarantee robust performances of RM-GP-UCB in a wide range of applications, 
% even when some or all previous tasks are significantly dissimilar to the target task.
% %show a faster convergence than GP-UCB.

% Moreover, we also use our theoretical analysis to show that when the meta-tasks are similar to the target task, they can help RM-GP-UCB converge faster than GP-UCB at the initial stage by accelerating the exploration process.
% %Of note, instead of demonstrating faster convergence of the regret OM-GP-UCB over GP-UCB
% %%demonstrating the superiority of the theoretical guarantee of OM-GP-UCB over that of GP-UCB 
% %by imposing additional assumptions on the similarity between the previous and current tasks,
% %we show that OM-GP-UCB is no-regret for \emph{any} given set of previous tasks.

% Secondly, the theoretical guarantee allows us to learn the task similarity (and hence identify dissimilar tasks) in a principled way, 
% by minimizing the upper bound on the regret of RM-GP-UCB via a computationally cheap online learning algorithm known as \emph{Follow-The-Regularized-Leader} (Section~\ref{sec:online_regret_minimization}).
% %Moreover, interestingly, the theoretical guarantee allows us to minimize the upper bound on the regret of OM-GP-UCB via a computationally cheap online learning algorithm known as \emph{Follow-The-Regularized-Leader} 
% %to derive the optimal weight assigned to each previous task in every BO iteration (Section~\ref{sec:online_regret_minimization}).

% We demonstrate in Section~\ref{sec:experiment} that, as a result of its robust performance guarantee and principled learning of task similarity, 
% RM-GP-UCB performs effectively and consistently across a number of interesting applications.

% We also introduce another algorithm, \emph{Robust Meta-GP-TS} (RM-GP-TS), which is less robust against dissimilar tasks, but can work better empirically when the meta-tasks are similar.
% We theoretically show and empirically verify that the worst-case regret upper bound of RM-GP-TS is worse than RM-GP-UCB, i.e., RM-GP-UCB is more robust against dissimilar meta-tasks.
% However, RM-GP-TS enjoys smaller computational cost and work better in less adverse scenarios, i.e., when some meta-tasks are similar.



%\vspace{-1mm}
\section{Background and Problem Formulation}
\label{sec:background}
%\vspace{-1mm}
\textbf{Bayesian Optimization.}
This work tackles the problem of sequentially maximizing an unknown function $f:\mathcal{D}\rightarrow\mathbb{R}$. 
In each iteration $t=1,\ldots,T$, an input $\mathbf{x}_t \in \mathcal{D}$ (a $D\geq 1$-dimensional vector) is 
% selected and 
queried to yield
% yield a noisy output 
% $y_t$.
$y_t\triangleq f(\mathbf{x}_t) + \epsilon$ where $\epsilon \sim \mathcal{N}(0,\sigma^2)$ is a Gaussian noise with variance $\sigma^2$.
The performance of BO is typically measured by \emph{cumulative regret}: $R_T\triangleq\sum_{t=1,\ldots,T}[f(\mathbf{x}^*) - f(\mathbf{x}_t)]$ 
where $\mathbf{x}^* \in \arg\max_{\mathbf{x}\in \mathcal{D}}f(\mathbf{x})$ is a global maximizer of $f$. 
It is desirable for a BO algorithm to achieve \emph{no regret} by making its $R_T$ grow sublinearly 
such that its \emph{simple regret} $S_T\triangleq\min_{t=1,\ldots,T}[f(\mathbf{x}^*) - f(\mathbf{x}_t)] \leq R_T/T$ goes to $0$ asymptotically. % (since $S_T \leq R_T/T$).
During BO, we model the belief about $f$ using a \emph{Gaussian process} (GP) $\{f(\mathbf{x})\}_{\mathbf{x}\in {\mathcal{D}}}$.
That is, any finite subset of $\{f(\mathbf{x})\}_{\mathbf{x}\in {\mathcal{D}}}$ follows a multivariate Gaussian distribution~\citep{rasmussen2004gaussian}.
A GP is fully specified by its prior mean $\mu(\mathbf{x})$ and kernel function $k(\mathbf{x}, \mathbf{x}')$, and we assume w.l.o.g.\ that $\mu(\mathbf{x})=0$ and $k(\mathbf{x}, \mathbf{x}') \leq 1$ $\forall \mathbf{x}, \mathbf{x}' \in \mathcal{D}$.
We focus on the widely used Squared Exponential (SE) kernel.
Given 
% a set of 
$T$ noisy observations $\mathbf{y}_{T}\triangleq [y_t]^{\top}_{t=1,\ldots,T}$ at inputs $\mathbf{x}_1,\ldots,\mathbf{x}_T$, the posterior GP belief of $f$ at 
% any 
input $\mathbf{x} \in \mathcal{D}$ is Gaussian with the following posterior mean and variance:
\begin{equation}
\begin{split}
    \mu_T(\mathbf{x}) &\triangleq \mathbf{k}_T(\mathbf{x})^\top(\mathbf{K}_T+\lambda I)^{-1}\mathbf{y}_{T}, \\
    \sigma_T^2(\mathbf{x}) &\triangleq k(\mathbf{x},\mathbf{x})-\mathbf{k}_T(\mathbf{x})^\top(\mathbf{K}_T+\lambda I)^{-1}\mathbf{k}_T(\mathbf{x}),
\end{split}
\label{gp_posterior}
\end{equation}
%\begin{equation}
%\begin{split}
%    \mu_T(\mathbf{x}) &\triangleq\displaystyle\mathbf{k}_T(\mathbf{x})^\top(\mathbf{K}_T+\sigma^2I)^{-1}\mathbf{y}_{T}\ ,\\
%    \sigma_T^2(\mathbf{x}) &\triangleq\displaystyle k(\mathbf{x},\mathbf{x})-\mathbf{k}_T(\mathbf{x})^\top(\mathbf{K}_T+\sigma^2I)^{-1}\mathbf{k}_T(\mathbf{x})
%\end{split}
%\label{gp_posterior}
%\end{equation}
where $\mathbf{K}_T\triangleq [k(\mathbf{x}_t,\mathbf{x}_{t'})]_{t,t'=1,\ldots,T}$, $\mathbf{k}_T(\mathbf{x})\triangleq [k(\mathbf{x}_t,\mathbf{x})]^\top_{t=1,\ldots,T}$, and $\lambda$ is a regularization parameter.
% , and $\lambda$ will be specified in Sec.~\ref{sec:theoretical_analysis}.
%In iteration $t$ of the GP-UCB algorithm~\citep{srinivas2009gaussian}, $\mathbf{x}_{t}$ is selected by maximizing the GP-UCB acquisition function  
%$\mu_{t-1}(\mathbf{x})+\sqrt{\beta_t}\sigma_{t-1}(\mathbf{z})$, in which $\beta_t$ 
%is carefully chosen to balance \emph{exploration} of the input domain and \emph{exploitation} of the current GP belief.

\textbf{Meta-Bayesian Optimization.} 
We refer to the function $f$ being maximized as the \emph{target function} and the functions $f_i$ for $i=1,\ldots, M$ of the $M$ previous tasks as \emph{meta-functions}.
We use \textit{target task/observations} and \textit{meta-tasks/observations} in a similar manner. 
All functions are defined on the same domain $\mathcal{D}$ which is assumed to be discrete for simplicity, % with cardinality $|\mathcal{D}| < \infty$ for simplicity.
but the theoretical results can be easily generalized to continuous domains following the analysis of previous works~\citep{chowdhury2017kernelized,srinivas2009gaussian}.
We assume that $f$ and all $f_i$'s lie in the \emph{reproducing kernel Hilbert space} (RKHS) associated with the kernel $k$ such that their norm induced by the RKHS is bounded: $\norm{f}_{k} \leq B$, $\norm{f_i}_{k}\leq B,\forall i=1,\ldots,M$.
This assumption intuitively suggests that the target and meta-functions have the same degree of smoothness.
% We assume $f$ and all $f_i$'s are sampled from the same GP with kernel $k$. This assumption is justified in the sense that the target function and meta-functions are assumed to be generated from \emph{the same underlying phenomenon} (e.g., performance of ML models trained using the activity recognition data of different individuals), and thus have the same degree of smoothness characterized by the kernel $k$.
As in the work of~\citet{wang2018regret}, which has also performed theoretical analysis of a meta-learning algorithm for BO, 
we assume that all meta- and target observations are corrupted by a Gaussian noise $\epsilon \sim \mathcal{N}(0,\sigma^2)$ with variance $\sigma^2$.
% These two assumptions are in line with the work of~\citep{wang2018regret} which has also performed theoretical analysis of a meta-learning algorithm for BO (Sec.~\ref{sec:related_works}).
%\footnote{Refer to the Section~\ref{sec:related_works} for more details about this work.}.
The number of observations from meta-task $i$ is a constant denoted as $N_i$, and  $N\triangleq\max_{i=1,\ldots,M}N_i$.
$\mathbf{x}_{i,j}$ and $y_{i,j}$ represent the $j$-th input and noisy output of meta-task $i$ respectively.
We define the \emph{function gap} $d_i\triangleq \max_{j=1,\ldots,N_i}\left|f(\mathbf{x}_{i,j})-f_i(\mathbf{x}_{i,j})\right| < \infty$  
which represents the maximum difference between the function values of $f$ and $f_i$ at any corresponding input $\mathbf{x}_{i,j}$ of meta-task $i$.
Note that for a given set of meta-observations for meta-task $i$, the function gap $d_i$ is an unknown constant characterizing the similarity between meta-task $i$ and the target task: 
a smaller function gap implies a stronger similarity.
% Our algorithm is designed to be robust such that it performs effectively even when some or all function gaps are large, 
% therefore, we do not place assumptions on the magnitude of the functions gaps except that they are finite.

%We define the \emph{function gap} $d_i\triangleq \max_{\mathbf{x} \in \mathcal{D}}\left|f(\mathbf{x})-f_i(\mathbf{x})\right| < \infty$,
%%which represents the maximum difference between the function values of $f$ and $f_i$ at any corresponding input $\mathbf{x}_{i,j}$ of meta-task $i$.
%%Note that for a given set of meta-observations for meta-task $i$, the function gap $d_i$ 
%which is an unknown constant characterizing the similarity between meta-task $i$ and the target task: 
%a smaller function gap implies a stronger similarity.
%Our algorithm is designed to be robust such that it performs effectively even when some or all function gaps are large, 
%therefore, we do not place assumptions on the magnitude of the functions gaps except that they are finite.


%\vspace{-1mm}
\section{Robust Meta-Bayesian Optimization}
%\section{Robust Meta-Gaussian Process-Upper Confidence Bound (RM-GP-UCB)}
%\vspace{-1mm}
\label{sec:om_gp_ucb}
The acquisition function~\eqref{acq_func} adopted by RM-GP-UCB in iteration $t$ is a weighted combination of $M+1$ individual GP-UCB acquisition functions~\citep{srinivas2009gaussian} for the target task and the $M$ meta-tasks, each of which is calculated using the observations from a particular task:
\begin{equation}
\begin{split}
\overline{\zeta}^{\text{UCB}}_t(\mathbf{x})\triangleq &\nu_t\Big[{\sum}^M_{i=1}\omega_i \left[\overline{\mu}_{i}(\mathbf{x}) + \tau\overline{\sigma}_{i}(\mathbf{x})\right]\Big] + \\
&\left(1-\nu_t\right)\left[\mu_{t-1}(\mathbf{x})+\beta_t\sigma_{t-1}(\mathbf{x})\right].
\end{split}
\label{acq_func}
\end{equation}
%\vskip -0.1in
In~\eqref{acq_func}, $\mu_{t-1}(\mathbf{x})$ and $\sigma_{t-1}(\mathbf{x})$ represent, respectively, the GP posterior mean and standard deviation~\eqref{gp_posterior} at $\mathbf{x}$ 
calculated using the target observations from iterations 1 to $t-1$. 
$\overline{\mu}_{i}(\mathbf{x})$ and $\overline{\sigma}_{i}(\mathbf{x})$
are computed using all meta-observations from meta-task $i$. 
$\beta_t>0$ and $\tau>0$ will be defined in Sec.~\ref{sec:theoretical_analysis}.
$\nu_t \in [0, 1]$ can be interpreted as the overall weight given to all meta-tasks in iteration $t$ and should be chosen to be non-increasing in $t$, 
which enforces the impact of meta-tasks in~\eqref{acq_func} to be non-increasing.
The \emph{meta-weights} $\omega_i$'s 
%form a probability simplex (all $\omega_i \geq 0$ and $\sum^M_{i=1}\omega_i=1$) and 
can be understood as the weights assigned to individual meta-tasks.
%In~\eqref{acq_func}, $\beta_t>0$ and $\tau>0$ are selected with consideration of the \emph{exploration-exploitation trade-off} and will be defined in Section~\ref{sec:theoretical_analysis}.
Note that since the dataset used to calculate $\overline{\mu}_{i}(\mathbf{x})$ and $\overline{\sigma}_{i}(\mathbf{x})$ 
is fixed with size $N_i$, the matrix inversion in~\eqref{gp_posterior} (i.e., the computational bottleneck for GP) can be pre-computed.
So, after $T$ iterations, RM-GP-UCB incurs $\mathcal{O}(T^3)$ time for covariance matrix inversion
(since only the target covariance matrix of size $T\times T$ needs to be inverted) 
and $\mathcal{O}(MN^2+T^2)$ time during predictive inference,  
which are less than the respective $\mathcal{O}((MN+T)^3)$ and $\mathcal{O}((MN+T)^2)$ time 
% incurred 
when all observations are included in a single GP.
In practice, the total number of BO iterations ($T$) is usually small, therefore, the differences between these corresponding computational costs can be large, especially when $M$ and $N$ are large.
Hence, RM-GP-UCB is scalable in the number of meta-tasks ($M$) and observations in each meta-task ($N$).

% The acquisition function of RM-GP-TS is defined in a probabilistic way:
The acquisition function of RM-GP-TS is defined as:
\begin{equation}
\overline{\zeta}^{\text{TS}}_t(\mathbf{x})\triangleq
\begin{cases}
f^t(\mathbf{x}) & \text{with probability } 1-\nu_t \ ,\\
{\sum}^M_{i=1}\omega_i \overline{f}^t_{i}(\mathbf{x}) & \text{with probability } \nu_t,
\end{cases}
\label{eq:acq_func_ts}
\end{equation}
in which $f^t$ is a function sampled from the GP posterior of the target task: $f^t \sim \mathcal{GP}\left(\mu_{t-1}(\cdot), \beta_t^2 \sigma_{t-1}^2(\cdot)\right)$, and $\overline{f}^t_i$ is sampled from the GP posterior of meta-task $i$: $\overline{f}^t_i \sim \mathcal{GP}\left(\overline{\mu}_{i}(\cdot), \tau^2 \overline{\sigma}_{i}^2(\cdot)\right)$.
% Similar to RM-GP-UCB, RM-GP-TS is also scalable since the functions can be efficiently sampled using approximation techniques such as random Fourier features (RFF) approximation.
Using approximation techniques such as random Fourier features (RFF) approximation~\citep{rahimi2008random} (which we use in all our experiments), the functions $f^t$ and $\overline{f}^t_i$'s can be sampled efficiently, hence making RM-GP-TS computationally efficient (as we will demonstrate in Sec.~\ref{sec:experiment}).
Moreover, since the meta-observations of every meta-task are fixed, the use of approximation techniques such as RFF allows the functions $\overline{f}^t_i$'s to be sampled before the algorithm starts.
Refer to Appendix~\ref{app:ts:details} for more details on RM-GP-TS.

In iteration $t$ of either RM-GP-UCB or RM-GP-TS (Algorithm~\ref{OM_GP_UCB}), we first optimize the meta-weights and update $\nu_t$ (Sec.~\ref{sec:online_weight_estimation}), which corresponds to line $2$ of Algorithm~\ref{OM_GP_UCB}.
Next, the input $\mathbf{x}_t$ is selected by maximizing the acquisition function~\eqref{acq_func} (RM-GP-UCB) or~\eqref{eq:acq_func_ts} (RM-GP-TS), after which we query $\mathbf{x}_t$ and 
use the newly collected 
% input-output pair 
$(\mathbf{x}_t, y_t)$ to update the GP posterior belief~\eqref{gp_posterior}.



%\vspace{-2mm}
\begin{algorithm}
\begin{algorithmic}[1]
	\FOR{$t=1,2,\ldots, T$}
        \STATE Update $\omega_i$ for $i=1,\ldots,M$ via online meta-weight optimization and update $\nu_t$ (Sec.~\ref{sec:online_weight_estimation})
%        \STATE $\mathbf{x}_t \leftarrow {\arg\max}_{\mathbf{x} \in \mathcal{D}} \overline{\zeta}_t(\mathbf{x})$, with $\overline{\zeta}_t(\mathbf{x})$ given in~\eqref{acq_func} (for RM-GP-UCB) or~\eqref{eq:acq_func_ts} (for RM-GP-TS)
        \STATE $\mathbf{x}_t \leftarrow {\arg\max}_{\mathbf{x} \in \mathcal{D}} \overline{\zeta}^{\text{UCB}}_t(\mathbf{x})$ (for RM-GP-UCB)~\eqref{acq_func}, or
        $\mathbf{x}_t \leftarrow {\arg\max}_{\mathbf{x} \in \mathcal{D}} \overline{\zeta}^{\text{TS}}_t(\mathbf{x})$ (for RM-GP-TS)~\eqref{eq:acq_func_ts}
        \STATE Query $\mathbf{x}_t$ to observe $y_t$, and update GP posterior belief~\eqref{gp_posterior} using $(\mathbf{x}_t, y_t)$
	\ENDFOR
\end{algorithmic}
\caption{RM-GP-UCB/RM-GP-TS}
\label{OM_GP_UCB}
\end{algorithm}
%\vspace{-2mm}

% \vspace{-2mm}
\section{Theoretical Analysis}
% \vspace{-1mm}
\label{sec:theoretical_analysis}
\subsection{RM-GP-UCB}
\label{subsec:theory:rm_gp_ucb}
%Theorem~\ref{regret_bound} presents the main theoretical result of this work and its proof is given in Appendix~\ref{app:first_section}:
Theorem~\ref{regret_bound} presents an upper bound on the cumulative regret of RM-GP-UCB (proof in Appendix~\ref{app:first_section}).
%the main theoretical result of this work and its proof is given in Appendix~\ref{app:first_section}:
\begin{theorem}[RM-GP-UCB]
\label{regret_bound}
Let $\delta \in (0,1)$.
Denote by $\gamma_t$ the maximum information gain about $f$ from observing any set of $t$ observations.
If RM-GP-UCB is run with:
% the following parameters: 
$\lambda = 1+2/T$,
$\beta_t=B + \sigma \sqrt{2(\gamma_{t-1} + 1 + \log(4/\delta))}$,
$\tau=B + \sigma \sqrt{2(\gamma_{N} + 1 + \log(4M/\delta))}$,
$\nu_t\in [0, 1]$ and $\nu_{t+1} \leq \nu_{t}$, 
$\omega_i\geq 0$ and $\sum^M_{i=1}\omega_i=1$,
% and $\lambda=1+2/T$.
then, with probability of $\geq 1 - 3\delta / 4$,
\begin{align}
    R_T \leq 2(\alpha+\tau) \sum^T_{t=1} \nu_t + \beta_T\sqrt{C_1 T \gamma_T} \nonumber \\
    = \widetilde{\mathcal{O}}\big( \big(\sum^M_{i=1}d_i\big) \sum^T_{t=1} \nu_t + \gamma_T\sqrt{T} \big),
\label{eq:regret:ucb:within:theorem}
\end{align}
where 
%$C_1\triangleq 8/\log(1+\sigma^{-2})$, 
$C_1 \triangleq \frac{8}{1+\sigma^{-2}}$,
% $\gamma_T$ is the maximum information gain about $f$ from observing any set of $T$ observations, 
and $\alpha \triangleq \sum^M_{i=1}\omega_i \frac{N_i}{\sigma^2}(2\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+d_i).$
%and $\alpha_i\triangleq \frac{N_i}{\sigma^2}(2\sqrt{N_i}\sqrt{2\sigma^2\log(6N_i / \delta)}+d_i\sqrt{N_i}).$
%\[    \alpha_i\triangleq \frac{N_i}{\sigma^2}\left(2\sqrt{N_i}\sqrt{2\sigma^2\log\frac{6N_i}{\delta}}+d_i\sqrt{N_i}\right).
%\]
\end{theorem}

The second term $\gamma_T \sqrt{T}$ in the regret upper bound~\eqref{eq:regret:ucb:within:theorem} grows sub-linearly for the SE kernel for which $\gamma_T=\mathcal{O}((\log T)^{D+1})$.
% The second term in the regret upper bound has been shown to grow sub-linearly for the Squared Exponential (SE) and M\'atern kernels~\citep{srinivas2009gaussian}\footnote{For the SE kernel, $\gamma_T=\mathcal{O}((\log T)^{D+1})$; for the Mat\'ern kernel with parameter $\nu>1$, $\gamma_T=\mathcal{O}(T^{D(D+1)/(2\nu + D(D+1))}\log T)$. Therefore, for both kernels, the second term in Theorem~\ref{regret_bound} grows sublinearly.}.
Therefore, if $\nu_t$ is designed such that $\nu_t \rightarrow 0$ as $t \rightarrow \infty$, the first term also grows sub-linearly and hence RM-GP-UCB is asymptotically no-regret.

%If $\nu_t$ is designed according to Corollary~\ref{cor:eta_t} below, both terms in the regret upper bound grow sublinearly, which
%suggests that RM-GP-UCB is asymptotically no-regret for some commonly used kernels~\citep{srinivas2009gaussian}.
%\begin{corollary}
%\label{cor:eta_t}
%If $\nu_t \rightarrow 0$ as $t \rightarrow \infty$,
%then RM-GP-UCB achieves no regret asymptotically for the Squared Exponential (SE) and M\'atern kernels.\footnote{For the SE kernel, 
%$\gamma_T=\mathcal{O}((\log T)^{D+1})$; for the Mat\'ern kernel with parameter $\nu>1$, $\gamma_T=\mathcal{O}(T^{D(D+1)/(2\nu + D(D+1))}\log T)$.
%Therefore, for both kernels, the second term in Theorem~\ref{regret_bound} grows sublinearly.}
%\end{corollary}

Theorem~\ref{regret_bound} holds for a given set of meta-tasks with fixed yet unknown $d_i$'s.
Note that we do not impose assumptions on the values of $d_i$'s, i.e., the similarities between the meta- and target tasks.
Therefore, Theorem~\ref{regret_bound} gives a robust regret upper bound which holds for \emph{any} given set of meta-tasks.
%(i.e., an assurance about the performance) 
%In other words, even in adverse scenarios where some or all meta-tasks are extremely dissimilar to the target task (i.e., when some or all $d_i$'s are very large),
%RM-GP-UCB is still asymptotically no-regret, which gives an assurance about the \emph{worst-case behavior} in \emph{any} 
%given scenario.\footnote{This is in line with the notion of robustness from \emph{robust optimization} (RO)~\citep{beyer2007robust} which also attempts to optimize the performance in the worst-case scenario.
%The difference is that RO optimizes an explicit objective, while we aim at preserving the no-regret property in the worst case.}
%%Note that the robustness offered by our theoretical results is in line with the notion of robustness in robust optimization (RO) since we 
%Our proof involves two major steps. Firstly, we upper bound the errors induced by the use of a given set of meta-observations (instead of the target observations at the same corresponding inputs) in calculating the acquisition function~\eqref{acq_func}.
%%(i.e., $\overline{\mu}_i(\mathbf{x})$ and $\overline{\sigma}_i(\mathbf{x})$).
%Secondly, 
%%assuming the meta-observations in the calculation of~\eqref{acq_func} is replaced by the corresponding target observations, 
%we modify the proof of GP-UCB to take into account the impact of (a) the additional term in the acquisition function (the first term in~\eqref{acq_func}), and (b) the errors from the first step.
%Regarding the first step, $\alpha_i$ in Theorem~\ref{regret_bound} can be interpreted as an upper bound on the cumulative error incurred by using the meta-observations of $f_i$ in calculating the acquisition function~\eqref{acq_func} (Lemma~\ref{ucb_diff} in Appendix~\ref{app:proof_lemma_1}).
%Similarly, $\alpha$ can be viewed an upper bound on the overall error induced by the use of all meta-observations when calculating~\eqref{acq_func}.
%These interpretations also explain the dependence of $\alpha$, hence the regret upper bound, on $d_i$ and $N_i$: 
%Larger function gaps increase the error resulting from the use of the meta-observations, and 
%a larger number of meta-observations also inflates the worst-case upper bound by accumulating the individual errors.
In other words, even in adverse scenarios where some or all meta-tasks are extremely dissimilar to the target task (i.e., when some or all $d_i$'s are very large),
RM-GP-UCB is still asymptotically no-regret, which indicates the robustness and generality of our algorithm.
This provides an assurance about the \emph{worst-case behavior} in any given scenario.\footnote{This notion of robustness is in line with that of \emph{robust optimization} (RO)~\citep{beyer2007robust} which also attempts to optimize the performance in the worst-case scenario. The difference is that RO optimizes an explicit objective, while we aim at preserving the no-regret property in the worst case.}
%Note that the robustness offered by our theoretical results is in line with the notion of robustness in robust optimization (RO) since we 
In our proof, the key step (Lemma~\ref{ucb_diff} in Appendix~\ref{app:first_section}) is to upper bound (by $\alpha$ in Theorem~\ref{regret_bound})
%Our robust theoretical guarantee is achieved by upper bounding 
the overall error induced by the use of any given set of meta-observations, instead of the target observations at the same corresponding input locations, when calculating the acquisition function~\eqref{acq_func}.
% Concretely, $\alpha$ can be viewed as an upper bound on the overall error induced by using the meta-observations from all meta-tasks (weighted by $\omega_i$'s) in calculating the acquisition function~\eqref{acq_func}.
% Concretely, $\alpha_i$ in Theorem~\ref{regret_bound} can be interpreted as an upper bound on the cumulative error incurred by using the meta-observations of $f_i$ 
% %(instead of observations of $f$ at the same corresponding inputs) 
% in calculating the acquisition function~\eqref{acq_func}.
% Similarly, $\alpha$ can be viewed an upper bound on the overall error induced by the use of all meta-observations.
These interpretations also explain the dependence of $\alpha$, hence the regret bound, on $d_i$ and $N_i$: 
Larger function gaps increase the error resulting from the use of the meta-observations, and 
a larger number of meta-observations also inflates the worst-case upper bound by accumulating the individual errors.
%\footnote{Note that a larger $N_i$ is also likely to increase the value of $d_i$, as a result of the definition of $d_i$ (Section~\ref{sec:background}).}.
Of note, a limitation of our regret upper bound (Theorem~\ref{regret_bound}) is that it does not reflect the benefit of the use of the meta-tasks when they are indeed similar to the target task. Next, we use our theoretical analysis to give some insights on how the meta-tasks, if similar to the target task, help improve the convergence of our algorithm.

\textbf{Meta-tasks Can Improve the Convergence by Accelerating Exploration.}
In addition to characterizing the worst-case behavior, we also use our theoretical analysis to illustrate how meta-tasks can help RM-GP-UCB converge faster than standard GP-UCB.
As we have proved in Appendix~\ref{app:improved_bound}, at the early stage of the algorithm, the meta-tasks (if similar to the target task) can help RM-GP-UCB obtain a smaller regret upper bound than GP-UCB by \emph{reducing the uncertainty at the selected input}. 
%Intuitively, this allows RM-GP-UCB to \emph{reduce the degree of exploration at the early stage} by exploiting the additional information offered by the meta-tasks.
Equivalently, the additional information from the meta-tasks allows RM-GP-UCB to \emph{reduce the degree of exploration at the early stage}.
%This therefore leads to smaller regrets since the exploration phase usually incurs large regret.
%, hence achieving a faster convergence. 
Since initial exploration of BO usually incurs large regrets,
%(and bandit algorithms in general), 
less exploration results in smaller regrets.
At later stages when $\nu_t$ becomes close to $0$, RM-GP-UCB converges to no regret at a similar rate to GP-UCB (i.e., the second term $\gamma_T \sqrt{T}$ in the regret upper bound~\eqref{eq:regret:ucb:within:theorem} dominates).

% \vspace{-1mm}
\subsection{RM-GP-TS}
% \vspace{-1mm}
\label{subsec:theory:ts}
Theorem~\ref{regret_bound_ts} gives an upper bound on the cumulative regret of RM-GP-TS (proof in Appendix~\ref{app:proof:theorem:ts}).
% \footnote{The proof of RM-GP-TS needs an additional assumption that the target and meta-functions are bounded: $|f_i(\mathbf{x})|,|f(\mathbf{x})|\leq B,\forall \mathbf{x}\in\mathcal{D},i=1,\ldots,M$ for some $B$, which is satisfied by most real-world functions.}
\begin{theorem}[RM-GP-TS]
\label{regret_bound_ts}
%The parameters $\delta,\nu_t,\tau,\beta_t,\omega_i$ are defined in the same way as Theorem~\ref{regret_bound},
Define $d_i' \triangleq \max_{\mathbf{x}\in\mathcal{D}}| f(\mathbf{x}) - f_i(\mathbf{x}) |$.
With the same parameters as those defined in Theorem~\ref{regret_bound},
we have that with probability of at least $1 - 3\delta / 4$,
\[
R_T = \widetilde{\mathcal{O}}\big(\big(\sum^M_{i=1}\omega_i d_i' \big) \sum^T_{t=1} \nu_t + 
\sum^T_{t=1} \nu_t \sqrt{\gamma_t} + 
\gamma_T \sqrt{T} \big).
\]
% \[
% R_T = \mathcal{O}\big(\big(\sum^M_{i=1}\omega_i d_i' \big) \sum^T_{t=1} \nu_t + 
% \sum^T_{t=1} \nu_t \sqrt{\gamma_t}\log t + 
% \gamma_T\log T \sqrt{T} \big).
% \]
% \[
% R_T = \mathcal{O}\Big(\Big( \sqrt{\gamma_T}\log T + \sum^M_{i=1}\omega_i d_i' \Big) \sum^T_{t=1} \nu_t +\gamma_T\log T \sqrt{T} \Big).
% \]
\end{theorem}
% Note that by definition, $d_i'$ is lower-bounded by the function gap $d_i$: $d_i' \geq d_i, \forall i$.
Note that by definition, we have that $d_i' \geq d_i, \forall i$.
Similar to RM-GP-UCB, as long as $\nu_t$ is chosen such that $\nu_t \rightarrow 0$ as $t \rightarrow \infty$ and that
% $(\sqrt{\gamma_T}\log T)\sum^T_{t=1} \nu_t = o(T)$,
% $\nu_t = o(1/(\sqrt{\gamma_t}\log t))$,
$\nu_t = o(1/\sqrt{\gamma_t})$,
%$\nu_t = o(1/\log t)$, 
all three terms in Theorem~\ref{regret_bound_ts} are sub-linear (for the SE kernel).
That is, RM-GP-TS is also asymptotically no-regret 
for any set of meta-tasks, even when some or all meta-tasks are dissimilar to the target task.
Moreover, comparing the extra terms in the regret upper bounds resulting from the use of the meta-tasks for both RM-GP-UCB (i.e., the first term of equation~\eqref{eq:regret:ucb:within:theorem} in Theorem~\ref{regret_bound})
% within $\widetilde{\mathcal{O}}(\cdot)$) 
and RM-GP-TS (i.e., the first two terms of Theorem~\ref{regret_bound_ts}) reveals that compared with RM-GP-UCB, RM-GP-TS suffers from a worse extra dependence on $T$ due to the meta-tasks.
Specifically, while the first terms of Theorems~\ref{regret_bound} and~\ref{regret_bound_ts} have the same dependence on $T$, the second term of Theorem~\ref{regret_bound_ts} introduces an extra dependence on $T$ which dominates the first term.
This suggests that in adverse scenarios with a large number of dissimilar tasks, RM-GP-TS may suffer from a worse convergence than RM-GP-UCB.
In other words, RM-GP-UCB enjoys a better theoretically guaranteed robustness against dissimilar tasks.

% Of note, although RM-GP-TS is also no-regret for any given set of meta-tasks, the use of the meta-tasks induces a worse dependence on $T$ for RM-GP-TS (i.e., $(\log T)\sum^T_{t=1} \nu_t$) compared with RM-GP-UCB (i.e., $\sum^T_{t=1} \nu_t$).

% For example, if $\nu_t=\mathcal{O}(1/t)$, RM-GP-TS and RM-GP-UBC suffer from an additional dependence of $(\log T)^2$ and $\log T$, respectively. This implies that compared with RM-GP-TS, RM-GP-UCB enjoys a better theoretically guaranteed robustness against dissimilar tasks,

% \vspace{-1mm}
\subsection{Practical Implications}
% \vspace{-1mm}
Besides the theoretical insights, Theorems~\ref{regret_bound} and~\ref{regret_bound_ts} also provide two natural hints to the practical algorithmic design.
Firstly, note that both theorems hold for all choices of meta-weights $\omega_i$'s.
% as long as they form a probability simplex. 
% So, 
Therefore,
we have the flexibility to choose the optimal $\omega_i$'s (i.e., learn the task similarity) by minimizing the regret upper bounds in Theorems~\ref{regret_bound} and~\ref{regret_bound_ts}.
Secondly, the first term in Theorem~\ref{regret_bound} suggests that we can lower the regret by making $\nu_t$ (i.e., the influence of the meta-tasks) decay faster if $\alpha$ in Theorem~\ref{regret_bound} (i.e., an upper bound on the error produced by using the meta-tasks) is larger.
The same reasoning applies to Theorem~\ref{regret_bound_ts}, i.e., we can decay $\nu_t$ faster if $\sum^M_{i=1}\omega_i d_i'$ in Theorem~\ref{regret_bound_ts} is larger.
% Secondly, the first term in Theorem~\ref{regret_bound} (Theorem~\ref{regret_bound_ts}) suggests that we can lower the regret by making $\nu_t$ (i.e., the influence of the meta-tasks) decay faster if $\alpha$ in Theorem~\ref{regret_bound} ($\sum^M_{i=1}\omega_i d_i'$ in Theorem~\ref{regret_bound_ts}), which is an upper bound on the error produced by using the meta-tasks, is larger.
% Secondly, the interpretations of $\alpha$ (Theorem~\ref{regret_bound}) and $\nu_t$ suggest that we can make the influence of the meta-tasks (quantified by $\nu_t$) decay faster if the error produced by using the meta-tasks (measured by its upper bound $\alpha$) is larger.
Both design choices can further strengthen the robustness of our algorithms against dissimilar meta-tasks by lessening their impact.
Unfortunately, they both require the values of the function gaps $d_i$'s which are unavailable.\footnote{$d_i$ can be used as an estimate of $d_i'$ since $d_i' \geq d_i$ (Sec.~\ref{subsec:theory:ts}).}
%due to unavailability of the function gaps $d_i$'s.
To this end, we devise a principled technique to estimate upper bounds on the function gaps, which is presented
in the next section.
% in Sec.~\ref{sec:estimate_d}.

%Secondly, the interpretations of $\alpha$ and $\nu_t$ suggest that we can make the influence of the meta-tasks (quantified by $\nu_t$) decay faster if the error produced by using the meta-tasks 
%(measured by its upper bound $\alpha$) is larger.
%Both design choices can strengthen the robustness of RM-GP-UCB against dissimilar meta-tasks by lessening their impact.
%Unfortunately, they both require the value of $\alpha$, which we lack access to since the function gaps $d_i$'s are unavailable.
%%due to unavailability of the function gaps $d_i$'s.
%To this end, we devise a principled technique to estimate upper bounds on the function gaps, which is presented in Section~\ref{sec:estimate_d}.

%\vspace{-1mm}
\section{Online Meta-Weight Optimization}
%\vspace{-1mm}
\label{sec:online_regret_minimization}
In this section, we first introduce a principled technique for estimating high-probability upper bounds on the function gaps (Sec.~\ref{sec:estimate_d}) that, 
when combined with Theorems~\ref{regret_bound} and~\ref{regret_bound_ts}, naturally yields a principled method for optimizing the meta-weights 
%for both RM-GP-UCB and RM-GP-TS 
through regret minimization via online learning (Sec.~\ref{sec:online_weight_estimation}).
% We focus on RM-GP-UCB here since the analysis for RM-GP-TS is similar.
% (Section~\ref{sec:online_weight_estimation}).

%\vspace{-1mm}
\subsection{Online Estimation of Function Gaps}
%\vspace{-1mm}
\label{sec:estimate_d}
Inspired by the confidence region constructed by GP-UCB~\citep{srinivas2009gaussian,chowdhury2017kernelized} that contains the target function with high probability, 
after $t\geq 1$ target observations have been collected, define
\begin{equation}
\begin{split}
    &U_{t,i,j}\triangleq\mu_{t}(\mathbf{x}_{i,j}) + \beta_{t+1}\sigma_{t}(\mathbf{x}_{i,j})\, , \\
    &L_{t,i,j}\triangleq\mu_{t}(\mathbf{x}_{i,j}) - \beta_{t+1}\sigma_{t}(\mathbf{x}_{i,j}),
\end{split}
\label{UL}
\end{equation}
where $\mathbf{x}_{i,j}$ is the $j$-th input of meta-task $i$, $\beta_{t+1}$ is previously defined in Theorem~\ref{regret_bound}, and 
$U_{t,i,j}$ and $L_{t,i,j}$ can be interpreted, respectively, as the upper and lower confidence bounds of $f$ at $\mathbf{x}_{i,j}$ after $t$ iterations.
Lemma~\ref{gaussian_bound} (Appendix~\ref{app:first_section}) implies that
with probability of at least $1 - \delta/4$ 
($\delta$ is defined in Theorem~\ref{regret_bound}):
$L_{t,i,j} \leq f(\mathbf{x}_{i,j}) \leq U_{t,i,j}, \forall t, i, j$.
%\begin{equation}
%L_{t,i,j} \leq f(\mathbf{x}_{i,j}) \leq U_{t,i,j}% \qquad \forall t, i, j\ .
%\label{eq:L_f_U}
%\end{equation}
%for $t=1,\ldots, T$, $i=1,\ldots,M$, and $j=1,\ldots, N_i$. 
Consequently, the following result gives high-probability upper bounds on the function gaps (proof in Appendix~\ref{app:upper_bound_func_gap}):
\begin{lemma}
\label{estimate_di}
%Let $\delta, \delta' \in \left(0, 1\right)$. Then, with probability of at least $1 - \delta - \delta'$,
With probability of at least 
$1 - \delta$,
% $1 - \delta/2$,
\begin{equation*}
\begin{split}
    d_i \leq &\sqrt{2\sigma^2\log \Big[\big(8{\sum}^M_{i'=1}N_{i'}\big)/\delta\Big] } + \\
    &\max_{j=1,\ldots,N_i}\left[\max \{|y_{i,j} - U_{t,i,j}|, |y_{i,j} - L_{t,i,j}|\}\right] \triangleq \overline{d}_{i,t},
\end{split}
\end{equation*}
%\begin{equation*}
%\begin{split}
%    d_i \leq \sqrt{2\sigma^2\log\frac{2\sum^M_{i=1}N_i}{\delta'}} +  \max_{j=1,...,N_i}\left[\max \{|y_{i,j} - U_{t,i,j}|, |y_{i,j} - L_{t,i,j}|\}\right] \triangleq \overline{d}_{i,t} 
%\end{split}
%\end{equation*}
for $t=1,\ldots, T$ and $i=1,\ldots,M$.
\end{lemma}
%\vspace{-0.1in}
%Note that 
Unlike $d_i$, $\overline{d}_{i,t}$ can be efficiently calculated as its incurred time is linear in both $M$ and $N$.

%\vspace{-1mm}
\subsection{Online Meta-Weight Optimization through Regret Minimization}
%\vspace{-1mm}
\label{sec:online_weight_estimation}
%Based on Lemma~\ref{estimate_di}, 
%the following result (see Appendix~\ref{app:prop_1_proof} for its proof) can be obtained by modifying Theorem~\ref{regret_bound}:
%Here we leverage Lemma~\ref{estimate_di} and online learning to introduce a principled method to optimize the meta-weights for RM-GP-UCB, and similar analyses and methods can be applied to RM-GP-TS (Appendix~\ref{app:meta:weight:optimization:ts}).
In this section, we focus on RM-GP-UCB since the analysis for RM-GP-TS (deferred to Appendix~\ref{app:meta:weight:optimization:ts}) is similar and leads to the same update rules for $\omega_i$'s and $\nu_t$.
Combining Lemma~\ref{estimate_di} and Theorem~\ref{regret_bound} allows us to derive the following result for RM-GP-UCB (proof in Appendix~\ref{app:prop_1_proof}):
\begin{proposition}[RM-GP-UCB]
\label{regret_bound_2}
%Let $\delta,\delta' \in (0,1)$. 
With probability of $\geq 1 - \delta$,
\begin{equation*}
\begin{split}
    R_T \leq &\frac{2}{\sigma^2} \Big[{\sum}^T_{t=1}\boldsymbol{\omega}^{\top} \boldsymbol{l}_t\Big] \Big[{\sum}^T_{t=1}\nu_t\Big] +\\
    &\quad 2\tau {\sum}^T_{t=1} \nu_t + \beta_T\sqrt{C_1 T \gamma_T},
\end{split}
\end{equation*}
%\begin{equation*}
%\begin{split}
%    R_T \leq \frac{2}{\sigma^2} \Big[\sum^T_{t=1}\boldsymbol{\omega}^{\top} \boldsymbol{l}_t\Big] \Big[\sum^T_{t=1}\nu_t\Big] + 2\sqrt{\tau} \sum^T_{t=1} \nu_t + \sqrt{C_1 T \left(1-\nu_T\right)^2\beta_T\gamma_T}
%\end{split}
%\end{equation*}
where $\boldsymbol{\omega}\triangleq [\omega_i]_{i=1,\ldots,M}$, $\boldsymbol{l}_t\triangleq [l_{i,t}]_{i=1,\ldots,M}$, and $l_{i,t}\triangleq N_i (2\sqrt{2\sigma^2\log({8N_i/\delta})}+\overline{d}_{i,t})$.
\end{proposition}
Note that $\boldsymbol{l}_t$ can be efficiently computed after the $t$-th observation is collected.
The regret upper bound in Proposition~\ref{regret_bound_2} depends on  $\omega_i$'s only through the term $\sum^T_{t=1}\boldsymbol{\omega}^{\top} \boldsymbol{l}_t$  
which can be minimized to derive the optimal meta-weights.
This constitutes an \emph{online learning} problem with linear loss function and its solution $\boldsymbol{\omega}$ constrained to a probability simplex. 
An additional entropic regularization term is usually preferred so as to encourage a solution with a large entropy to stabilize it~\citep{bubeck2011introduction}.
This corresponds to encouraging the meta-weights to spread across a large number of meta-tasks, in order to discover as many similar meta-tasks as possible. 
As a result, by using $1/\eta$ ($\eta>0$) as the regularization parameter, the optimal $\boldsymbol{\omega}$ in iteration $t>1$ is obtained by solving the following optimization problem:
%\vspace{-0.2mm}
\begin{equation}
\label{online_learning_objective}
    \boldsymbol{\omega} \triangleq \mathop{\arg\min}_{\boldsymbol{\omega}'} {\sum}^{t-1}_{s=1}\boldsymbol{\omega}'^{\top} \boldsymbol{l}_s + 
    \eta^{-1}{\sum}^M_{i=1}\omega_i'\log \omega_i', %\vspace{-0.2mm}
\end{equation}
%\begin{equation}
%\label{online_learning_objective}
%    \boldsymbol{\omega} \triangleq \mathop{\arg\min}_{\boldsymbol{\omega}'} \sum^{t-1}_{s=1}\boldsymbol{\omega}'^{\top} \boldsymbol{l}_s + 
%    \frac{1}{\eta}\sum^M_{i=1}\omega_i'\log \omega_i'\vspace{-0.2mm}
%\end{equation}
subject to the constraints: $\omega_i'\geq 0,\forall i$ and $\sum^M_{i=1}\omega_i'=1$.
When $t=1$, the optimal $\boldsymbol{\omega}$ follows from optimizing only the entropic regularization term, thus naturally entailing the uniform distribution $\omega_{i}=1/M,\forall i$.
Consequently,~\eqref{online_learning_objective} corresponds exactly to the online learning algorithm called \emph{Follow-The-Regularized-Leader} with an entropic regularizer~\citep{bubeck2011introduction} 
where $\eta$ represents the learning rate. Its optimal solution in iteration $t$ can be derived via the method of Lagrange multipliers (Appendix~\ref{lagran}) as
%\begin{equation}
%\begin{split}
%    \omega_{i} = \frac{\exp(-\eta \sum^{t-1}_{s=1}l_{i,s})}{\sum^M_{j=1}\exp(-\eta \sum^{t-1}_{s=1}l_{j,s})} \stackrel{(a)}{\approx}\frac{\exp(-\eta N^{3/2} \sum^{t-1}_{s=1}\overline{d}_{i,s})}{\sum^M_{j=1}\exp(-\eta N^{3/2} \sum^{t-1}_{s=1}\overline{d}_{j,s})}  
%\end{split}
%\label{estimate_wi}
%\end{equation}
\begin{equation}
\begin{split}
    \omega_{i} = \frac{e^{-\eta \sum^{t-1}_{s=1}l_{i,s}}}{\sum^M_{j=1}e^{-\eta \sum^{t-1}_{s=1}l_{j,s}}} \stackrel{(a)}{\approx}\frac{e^{-\eta N \sum^{t-1}_{s=1}\overline{d}_{i,s}}}{\sum^M_{j=1}e^{-\eta N \sum^{t-1}_{s=1}\overline{d}_{j,s}}},
\end{split}
\label{estimate_wi}
\end{equation}
for $i=1,\ldots,M$ where (a) follows from assuming that all $N_i$'s are close to $N$ for simplicity.
%Note that (a) holds with strict equality if $N_i=N$ for $i=1,\ldots,M$.
%Moreover, w
With this simplification, the first (noise-correction) term in the expression of $\overline{d}_{i,t}$ from Lemma~\ref{estimate_di} also cancels out, thus 
leading to a neat and elegant update rule for $\omega_i$ %\footnote{This simplification also avoids erroneously diminishing the weights of similar tasks with large number of meta-observations.} 
which we use in all our experiments.
As is evident from~\eqref{estimate_wi}, the update of $\omega_i$'s in each iteration only involves computing $\overline{d}_{i,t}$'s (incurring $\mathcal{O}(MN)$ time), 
adding one term to the summation on the exponent ($\mathcal{O}(M)$ time), and a normalization step ($\mathcal{O}(M)$ time), all of which are computationally cheap.
% ; this is another factor contributing to the scalability of RM-GP-UCB.
Intuitively,~\eqref{estimate_wi} assigns small weights to meta-tasks with a large cumulative estimated function gap which implies a less similar meta-task.
%Therefore, our online meta-weight optimization technique allows OM-GP-UCB to identify and lesson the impact of dissimilar meta-tasks in a principled way,
%thus preventing them from undermining the convergence of BO.
%In this way
% So, RM-GP-UCB handles dissimilar meta-tasks in a principled way by reducing their impact.

In addition, $\overline{d}_{i,t}$ from Lemma~\ref{estimate_di} also allows for the estimation of an upper bound on $\alpha$ (Theorem~\ref{regret_bound}) in each iteration 
(i.e., by simply replacing $d_i$ with $\overline{d}_{i,t}$) and thus facilitates an adaptive selection of $\nu_t$, as mentioned in Sec.~\ref{sec:theoretical_analysis}. 
Specifically, we set $\nu_1= 1$ and $\nu_t = \nu_{t-1}\times \min(r, (\sum^M_{i=1}\omega_i\overline{d}_{i,t})^{-\epsilon})$ for $t>1$,
in which we have dropped the constants independent of $\overline{d}_{i,t}$. 
$r \in (0,1)$ represents the minimum decaying rate to ensure the monotonic decay of $\nu_t$ such that RM-GP-UCB is no-regret (Sec.~\ref{subsec:theory:rm_gp_ucb}).
%required by Corollary~\ref{cor:eta_t}, and
$\epsilon>0$ controls the aggressiveness of the adaptive decay such that a larger $\epsilon$ results in a faster decay.
With this scheme, when the overall estimated function gaps are larger (the meta-tasks are dissimilar), 
$\nu_t$ decays faster and thus the impact of the meta-tasks vanishes more quickly.
%As a result, in the worst case where all meta-tasks are dissimilar to the target task, the impact of these adverse meta-tasks can be rapidly diminished.

% Using the function gap $d_i$ to approximate $d_i'$ in Theorem~\ref{regret_bound_ts}, the same update rules for $\omega_i$'s~\eqref{estimate_wi} and $\nu_t$ can be derived for RM-GP-TS. The details are presented in Appendix~\ref{app:meta:weight:optimization:ts}.
%Similar analyses and derivations can be applied to RM-GP-TS and Theorem~\ref{regret_bound_ts}, which lead to the same methods for updating the meta-weights~\eqref{estimate_wi} and $\nu_t$.

Importantly, when optimizing the values of $\omega_i$'s and $\nu_t$ as described above, we have taken into account the limitation of our regret upper bounds (i.e., they do not reflect the benefit of the use of the meta-tasks, Sec.~\ref{subsec:theory:rm_gp_ucb}) and hence incorporated additional practical considerations. Specifically, we have optimized the $\omega_i$'s with an additional entropic regularization term to encourage the $\omega_i$'s to spread across a large number of meta-tasks, and optimized $\nu_t$ such that it decreases faster if $\alpha$ (i.e., an upper bound on the error induced by the use of the meta-tasks) is larger.


% \vspace{-1mm}
\section{Experiments and Discussion}
% \vspace{-1mm}
\label{sec:experiment}
We use extensive real-world experiments to compare our RM-GP-UCB and RM-GP-TS with \emph{(1)} standard GP-UCB, two other GP-based scalable meta-BO algorithms: 
\emph{(2)} \emph{ranking-weighted Gaussian process ensemble} (RGPE)~\citep{feurer2018scalable} and \emph{(3)} \emph{transfer acquisition function} (TAF)~\citep{wistuba2018scalable},
% (Appendix~\ref{sec:related_works:app}),
\emph{(4)} multitask BO (MTBO)~\citep{swersky2013multi}, and \emph{(5)} the method from~\citet{wang2018regret} named \emph{point estimate meta-BO} (PEM-BO).
%We compare the performance of RM-GP-UCB with GP-UCB and two other recently introduced scalable meta-BO algorithms: 
%\emph{ranking-weighted Gaussian process ensemble} (RGPE)~\citep{feurer2018scalable} and \emph{transfer acquisition function} (TAF)~\citep{wistuba2018scalable} (Section~\ref{sec:related_works})
%%\footnote{Refer to Section~\ref{sec:related_works} for more details about these two algorithms.}
%in a number of real-world experiments.
%We also evaluate the performance of multitask BO (MTBO)~\citep{swersky2013multi} in some experiments. 
%However, we find that MTBO is considerably more time-consuming than other methods 
%%which agrees with our discussion in Section~\ref{sec:intro}, 
%and is thus not scalable
%\footnote{The poor scalability of MTBO has made it infeasible for some experiments with large number of meta-tasks and meta-observations such as HAR (Section~\ref{subsec:automl}) and RL (Section~\ref{sec:rl}).}.
Since MTBO scales relatively poorly (Sec.~\ref{sec:intro}), we only apply it to those experiments with a relatively small number of meta-tasks and observations for which MTBO is still computationally feasible.
% However, MTBO is found to be significantly more time-consuming than the other methods in these experiments (Sec.~\ref{sec:experiment_discussion}).
We compare with PEM-BO~\citep{wang2018regret} in the experiment that is most favorable for this algorithm, i.e., with the largest number of meta-observations and a discrete domain (refer to Sec.~\ref{subsec:automl} for more details).
%We also compare with the method from~\citep{wang2018regret} named \emph{point estimate meta-BO} (PEM-BO) in the experiment which is most favorable for their algorithm, i.e., with a large number of meta-observations and a discrete domain (refer to Section~\ref{subsubsec:svm} for more details).
%\footnote{All their real-world experiments have used a discrete domain, and it is highly nontrivial to extend their algorithm to continuous domain.}.
We set $\eta= 1/N$, $\epsilon= 0.7$ and $r= 0.7$ in all real-world experiments to demonstrate the robustness of our algorithm against the choice of these parameters.
In practice, the upper bound on the function gap, $\overline{d}_{i,t}$, from Lemma~\ref{estimate_di} may be too conservative; 
so, we replace the outer $\max$ operator over $j=1,\ldots,N_i$ with the empirical mean in our experiments.\footnote{We explore the difference between 
them
% these two choices 
in Appendix~\ref{app:subsec_max_mean}.}
Some details and results are deferred to Appendix~\ref{app:experiments} due to lack of space.
All error bars represent standard errors.
Our code is available at \url{https://github.com/daizhongxiang/meta-BO}.

%\vspace{-1mm}
\subsection{Synthetic Experiments}
%\vspace{-1mm}
\label{exp:synth}
We first explore the effectiveness of our online meta-weight optimization (Sec.~\ref{sec:online_regret_minimization}) and the impact of different algorithmic parameters by optimizing synthetic functions drawn from GPs.
% We firstly optimize synthetic functions drawn from GPs to gain some insights on the behavior of our methods and the impact of different parameters.
For each objective function, we construct $M=4$ meta-tasks with $N=N_i=20$ meta-observations each. 
The function gaps are chosen as $d_1=d_2=0.05$ and $d_3=d_4=4.0$ such that 
the last $2$ meta-tasks are dissimilar to the target task.
Fig.~\ref{fig:synth_func_results}a plots the simple regrets averaged over $20$ randomly drawn synthetic functions, with $\eta N =1.0$, $\epsilon=0.7$, and $r=0.7$.
The figure shows that RM-GP-UCB with online meta-weight optimization 
% (red curve) 
significantly outperforms RM-GP-UCB with fixed meta-weights ($\omega_i=1/4$ for all $i$).
% The figure shows that RM-GP-UCB with fixed uniform meta-weights ($\omega_i=1/4$ for all $i$) outperforms GP-UCB, and RM-GP-UCB with online meta-weight optimization (red) performs even better.
Fig.~\ref{fig:synth_func_results}b plots the meta-weights optimized by RM-GP-UCB for the red curve in Fig.~\ref{fig:synth_func_results}a, showing that the weights given to the last two meta-tasks which are dissimilar to the target task are rapidly reduced.
% showing that online meta-weight optimization (Sec.~\ref{sec:online_regret_minimization}) rapidly reduces the weights given to the last two meta-tasks which are dissimilar to the target task.
These results verify the effectiveness of online meta-weight optimization in reducing the impact of dissimilar meta-tasks.
% (Fig.~\ref{fig:synth_func_results}b), and verifies that it leads to performance improvement (Fig.~\ref{fig:synth_func_results}a).

\begin{figure}
%\vspace{-2mm}
	\centering
	\begin{tabular}{cc}
		\hspace{-4mm} \includegraphics[width=0.47\linewidth]{figures/synth_func_new_err} & \hspace{-6.2mm}
		\includegraphics[width=0.47\linewidth]{figures/meta_weights_new_err}\\
		{(a)} & {(b)}\\
		\hspace{-4mm} \includegraphics[width=0.47\linewidth]{figures/synth_func_lr_err}& \hspace{-6.2mm}
		\includegraphics[width=0.47\linewidth]{figures/synth_func_eps_err}\\
		{(c)} & {(d)}\\
		\includegraphics[width=0.47\linewidth]{figures/mnist_with_mtbo_new_with_ts} & \hspace{-5.3mm}
		\includegraphics[width=0.47\linewidth]{figures/cifar_10_with_mtbo_new_with_ts}\\
		{(e)} & {(f)}\\
	\end{tabular}% \vspace{-1.55mm}
	\caption{(a) The simple regret and (b) meta-weights optimized by RM-GP-UCB.
	The impact of (c) $\eta$ and (d) $\epsilon$. 
	Best validation error of CNN for (e) MNIST and (f) CIFAR-$10$.}
%	($10$ random initializations)
%	Best validation error of CNN for (a) MNIST and (b) CIFAR-$10$ ($10$ random initializations).
%	(c) Average rank of best validation errors for the HAR experiment. (d) Best validation error for diabetes diagnosis (10 random initializations).
%	(e) Best cumulative rewards and (f) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
	\label{fig:synth_func_results}
% 	\vspace{-4mm}
\end{figure}

%\begin{figure}
%%\vspace{-2mm}
%	\centering
%	\begin{tabular}{ccc}
%		\hspace{-4mm} \includegraphics[width=0.33\linewidth]{figures_aaai/synth_func_new_err} & \hspace{-6.2mm}
%		\includegraphics[width=0.33\linewidth]{figures_aaai/meta_weights_new_err}& \hspace{-6.2mm}
%		\includegraphics[width=0.33\linewidth]{figures_aaai/synth_func_lr_err}\\
%		{(a)} & {(b)} & {(c)}\\
%		\hspace{-4mm} \includegraphics[width=0.33\linewidth]{figures_aaai/synth_func_eps_err}& \hspace{-6.2mm}
%		\includegraphics[width=0.335\linewidth]{figures_aaai/mnist_with_mtbo_new_with_ts} & \hspace{-5.3mm}
%		\includegraphics[width=0.33\linewidth]{figures_aaai/cifar_10_with_mtbo_new_with_ts}\\
%		{(d)} & {(e)} & {(f)}\\
%	\end{tabular}% \vspace{-1.55mm}
%	\caption{(a) The simple regret and (b) meta-weights optimized by RM-GP-UCB.
%	The impact of (c) $\eta$ and (d) $\epsilon$. 
%	Best validation error of CNN for (e) MNIST and (f) CIFAR-$10$.}
%%	($10$ random initializations)
%%	Best validation error of CNN for (a) MNIST and (b) CIFAR-$10$ ($10$ random initializations).
%%	(c) Average rank of best validation errors for the HAR experiment. (d) Best validation error for diabetes diagnosis (10 random initializations).
%%	(e) Best cumulative rewards and (f) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
%	\label{fig:synth_func_results}
%% 	\vspace{-4mm}
%\end{figure}


% \begin{figure}
% %\vspace{-2mm}
% 	\centering
% 	\begin{tabular}{cccc}
% 		\hspace{-4mm} \includegraphics[width=0.27\linewidth]{figures_aaai/synth_func_new_err} & \hspace{-6.2mm}
% 		\includegraphics[width=0.27\linewidth]{figures_aaai/meta_weights_new_err}& \hspace{-6.2mm}
% 		\includegraphics[width=0.27\linewidth]{figures_aaai/synth_func_lr_err}& \hspace{-6.2mm}
% 		\includegraphics[width=0.27\linewidth]{figures_aaai/synth_func_eps_err}\\
% 		{(a)} & {(b)} & {(c)} & {(d)}
% %		\includegraphics[width=0.33\linewidth]{figures_aaai/mnist_with_mtbo_new_with_ts} & \hspace{-5.3mm}
% %		\includegraphics[width=0.33\linewidth]{figures_aaai/cifar_10_with_mtbo_new_with_ts}\\
% %		{(d)} & {(e)} & {(f)}\\
% 	\end{tabular}%\vspace{-1.55mm}
% 	\caption{(a) The simple regret and (b) meta-weights optimized by RM-GP-UCB.
% 	The impact of (c) $\eta$ and (d) $\epsilon$. 
% %	Best validation error of CNN for (e) MNIST and (f) CIFAR-$10$.
% 	}
% %	($10$ random initializations)
% %	Best validation error of CNN for (a) MNIST and (b) CIFAR-$10$ ($10$ random initializations).
% %	(c) Average rank of best validation errors for the HAR experiment. (d) Best validation error for diabetes diagnosis (10 random initializations).
% %	(e) Best cumulative rewards and (f) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
% 	\label{fig:synth_func_results}
% 	\vspace{-4mm}
% \end{figure}

We also investigate the impact of $\eta$ and $\epsilon$.
Fig.~\ref{fig:synth_func_results}c shows the performance for different values of $\eta$, with fixed $\epsilon=0.7$ and $r=0.7$.
The figure demonstrates that an excessively small $\eta$ (purple curve) negatively impacts the performance, since RM-GP-UCB is unable to quickly reduce the weights of dissimilar meta-tasks (Fig.~\ref{fig:meta_weights_curves}a in Appendix~\ref{app:synth}).
Moreover, an overly large $\eta$ is also slightly detrimental (green curve) since it rapidly assigns a large weight to one of the two useful meta-tasks 
(Fig.~\ref{fig:meta_weights_curves}c in Appendix~\ref{app:synth}), thus failing to utilize the other useful meta-task.
%The figure demonstrates that if the learning rate $\eta$ is too small (purple curve), OM-GP-UCB is unable to quickly reduce the weights assigned to the meta-tasks with large function gaps 
%(refer to Fig.~\ref{fig:meta_weights_curves}a in Appendix~\ref{app:synth}), which exerts some negative impacts on the performance.
%On the other hand, overly large $\eta$ is also slightly detrimental to the performance (green curve) since it rapidly assigns a large weight to one of the two useful meta-tasks 
%(refer to Fig.~\ref{fig:meta_weights_curves}c in Appendix~\ref{app:synth}), thus failing to make use of the information from the other useful meta-task.
Fig.~\ref{fig:synth_func_results}d illustrates the impact of $\epsilon$ when all function gaps are large: $d_i=8.0$ for all $i$.\footnote{We use $\eta=1/N$ and fix $r$ at a large value ($0.99$) so that the decaying rate of $\nu_t$ is purely decided by $\epsilon$.}
The figure shows that even when all meta-tasks are dissimilar, our adaptive selection of $\nu_t$
is able to diminish their negative impact and allow RM-GP-UCB to perform comparably to GP-UCB.
Furthermore, in this adverse scenario, a faster decline of the impact of the meta-tasks (i.e., faster decay of $\nu_t$ via larger $\epsilon$) leads to slightly better performance.
%The results from Fig.~\ref{fig:synth_func_results}d corroborate our discussion in Section~\ref{sec:theoretical_analysis} stating that our theoretical results
%guarantee the robustness of RM-GP-UCB against dissimilar meta-tasks.
% Fig.~\ref{fig:synth_func_results}d verifies our robustness against dissimilar meta-tasks, which can be attributed to our robust convergence guarantee and online meta-weight optimization.


%\vspace{-1mm}
\subsection{Real-world Experiments}
%\subsection{Automated Machine Learning (ML)}
%\vspace{-1.1mm}
\label{subsec:automl}
\textbf{Hyperparameter Tuning for Convolutional Neural Networks (CNNs).} 
%\subsubsection{Hyperparameter Tuning for Convolutional Neural Networks (CNNs)} 
%Hyperparameter optimization of ML models is routinely performed with each task likely using a different yet potentially related dataset~\citep{golovin2017google}, 
%which suggests the potential of using the previous tasks to accelerate the current hyperparameter search.
%We simulate this scenario by tuning the hyperparameters of CNNs using $4$ widely used image datasets: MNIST, SVHN, CIFAR-$10$ and CIFAR-$100$.
%Meta-BO can be applied in hyperparameter tuning of ML models with the previous tasks using other datasets as the meta-tasks.
We apply meta-BO to hyperparameter tuning of ML models with the previous tasks using other datasets as the meta-tasks.
We tune $3$ hyperparameters of CNNs using $4$ widely used image datasets: MNIST, SVHN, CIFAR-$10$ and CIFAR-$100$.
Specifically, in each experiment, one of the four datasets is selected to produce the target function $f$  
which maps a hyperparameter setting to a validation accuracy obtained using this dataset.
The meta-observations are generated from $3$ independent BO tasks (each with $50$ iterations) using the other $3$ datasets, 
i.e., $M=3$ and $N_i=50$ for $i=1,2,3$ in all $4$ experiments. 
The results for MNIST and CIFAR-$10$ are plotted in Figs.~\ref{fig:synth_func_results}e and~\ref{fig:synth_func_results}f while
the remaining results are shown in Appendix~\ref{app:auto_ml} (Fig.~\ref{fig:cnn_2}).
%The performance of RM-GP-UCB is the best for MNIST (Fig.~\ref{fig:synth_func_results}e) and comparable to RGPE in the other tasks (i.e., outperforming GP-UCB, TAF and MTBO).
The results show that RM-GP-UCB is the only method that consistently performs well in all tasks, and that RM-GP-TS performs much better than RM-GP-UCB (and other methods) for MNIST, yet worse in the other tasks.
We have also adopted the 
% commonly used 
Omniglot dataset~\citep{lake2015human} commonly used in meta-learning, for which RM-GP-UCB performs the best (Fig.~\ref{fig:omniglot}, Appendix~\ref{app:auto_ml}).
%We also adopt the Omniglot dataset~\citep{lake2015human}, which is widely used for meta-learning.
%%We tune $3$ hyperparameters of a Siamese neural network for one-shot classification, using $10$ alphabets from the background set as $10$ meta-tasks and an alphabet from the evaluation set as the target task.
%Each task uses one alphabet, and involves tuning $3$ hyperparameters of a Siamese network for one-shot classification, trained and validated using $75\%$ and $25\%$ of the alphabet respectively.
%We use $10$ alphabets from background set as $10$ meta-tasks and an alphabet from evaluation set as the target task.
%The 2-way validation errors (Fig.~\ref{fig:cnn}a) show that RM-GP-UCB outperforms all other algorithms under comparison.

% \begin{figure}
% %\vspace{-2mm}
% 	\centering
% 	\begin{tabular}{cccc}
% 		\hspace{-4mm} \includegraphics[width=0.27\linewidth]{figures_aaai/mnist_with_mtbo_new_with_ts} & \hspace{-5.3mm}
% 		\includegraphics[width=0.27\linewidth]{figures_aaai/cifar_10_with_mtbo_new_with_ts}& \hspace{-6mm}
% 		\includegraphics[width=0.27\linewidth]{figures_aaai/clinical_with_mtbo_new_with_ts}& \hspace{-6mm}
% 		\includegraphics[width=0.26\linewidth]{figures_rebuttal/runtime_rebuttal_clinical_0_with_ts.pdf}\\	
% 		{(a)} & {(b)} & {(c)} & {(d)}
% 	\end{tabular}%\vspace{-1.55mm}
% 	\caption{
% %	(a) The simple regret and (b) meta-weights optimized by RM-GP-UCB.
% %	The impact of (c) $\eta$ and (d) $\epsilon$. 
% 	Best validation error of CNN for (a) MNIST and (b) CIFAR-$10$,
% 	(c) Best validation error of LR for diabetes diagnosis.
% 	(d) Runtime of different algorithms in non-stationary BO experiment.
% 	}
% %	($10$ random initializations)
% %	Best validation error of CNN for (a) MNIST and (b) CIFAR-$10$ ($10$ random initializations).
% %	(c) Average rank of best validation errors for the HAR experiment. (d) Best validation error for diabetes diagnosis (10 random initializations).
% %	(e) Best cumulative rewards and (f) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
% 	\label{fig:cnn}
% 	\vspace{-4mm}
% \end{figure}

\textbf{Non-stationary Bayesian Optimization.}
Meta-BO can be naturally applied to non-stationary BO problems in which the unknown objective function evolves over time  
since the previous (outdated) observations can be treated as the meta-observations. 
We consider here automated ML for clinical diagnosis.
%Nowadays, ML models have been widely used for automatic clinical diagnosis~\citep{miotto2017deep}, in which hyperparameter optimization naturally arises as an important issue.
%Unfortunately, as the data from new patients becomes available regularly, the clinicians often need to 
As the data from new patients becomes available regularly, clinicians often need to 
%add the new data to the existing dataset and 
periodically 
update the dataset and
re-run hyperparameter optimization 
%of the clinical diagnosis ML model periodically.
for the ML model used for clinical diagnosis.
%re-run hyperparameter optimization periodically.
This raises the question of whether the previous hyperparameter tuning tasks using the outdated patient data can help accelerate the current task.
We consider the problem of diabetes prediction~\citep{smith1988using} with \emph{logistic regression} (LR) and tune $3$ LR hyperparameters.
We create $5$ progressively growing datasets (including the full dataset), 
treating (the hyperparameter tuning task using) the full dataset as the target task and the $4$ smaller datasets as the meta-tasks.
%Fig.~\ref{fig:har_clinical}a plots the results and shows that OM-GP-UCB and MTBO are the best-performing algorithms.
%The results (Fig.~\ref{fig:cnn}d) show that RM-GP-UCB yields similar performance to MTBO, while TAF performs the best in this case.
Specifically, the entire dataset consists of 768 data instances, among which 77 instances are set aside to measure the validation accuracy. 
The sizes of the 5 progressively growing training datasets (i.e., corresponding to the 4 meta-tasks and the target task, respectively) are 138, 276, 414, 552, and 691.
The results (Fig.~\ref{fig:cnn}a) show that RM-GP-TS outperforms all other methods in this task.
%MTBO and TAF perform the best, and RM-GP-UCB follows closely, outperforming both RGPE and GP-UCB.
%The results (Fig.~\ref{fig:cnn}c) show that MTBO and TAF perform the best, and RM-GP-UCB follows closely, outperforming both RGPE and GP-UCB.
%RM-GP-UCB yields similar performance to MTBO, while TAF performs the best in this case.
Moreover, we compare the runtime of different methods in Fig.~\ref{fig:cnn}b: RM-GP-TS is significantly more efficient than all other methods, and the methods building separate GP surrogates for different tasks (i.e., RM-GP-UCB, RGPE and TAF) are more efficient than MTBO, which includes all observations in a single GP (Sec.~\ref{sec:intro}).


\textbf{Hyperparameter Tuning for Support Vector Machines (SVMs).} 
%\subsubsection{Hyperparameter Tuning for Support Vector Machines (SVMs)} 
%\label{subsubsec:svm}
We also tune the hyperparameters of SVMs using a tabular benchmark dataset~\citep{wistuba2015learning} which has also been adopted by RGPE~\citep{feurer2018scalable}.
The benchmark was constructed by evaluating a fixed grid of $288$ SVM hyperparameter configurations using $50$ \emph{diverse} datasets (i.e., containing many dissimilar tasks).
We follow the setting used by RGPE~\citep{feurer2018scalable}: In every trial, we fix one of the tasks as the target task, and the remaining $M=49$ tasks as the meta-tasks; for every meta-task $i$, we randomly select $N_i=50$ hyperparameter configurations 
% on the grid 
as the meta-observations.
%$288$ configurations of $6$ SVM hyperparameters using $50$ diverse datasets, thus producing $50$ tasks, 
The results in Fig.~\ref{fig:cnn}c 
% (averaged over $25$ trials, each further averaged over $5$ runs with random initializations) 
show that our RM-GP-UCB performs comparably to RGPE, 
%outperforming GP-UCB, TAF and PEM-BO~\citep{wang2018regret}.
outperforming the other methods; 
RM-GP-TS performs unsatisfactorily in this experiment with diverse tasks.
Of note, this experiment has the most favorable setting for PEM-BO~\citep{wang2018regret} because (a) PEM-BO has been shown to require a massive set of meta-observations ($\geq5000$)
% in their experiments) 
to perform well~\citep{wang2018regret}, and this experiment has the largest number ($49\times50=2450$) of meta-observations among all experiments;
(b) the domain here is discrete, which is much easier for the application of PEM-BO.

%To demonstrate the consistency of our algorithm w.r.t. the choice of benchmark functions, we also test on the benchmark function (for hyperparameter tuning of SVM) used by the RGPE algorithm~\citep{feurer2018scalable}. The results (Fig.~\ref{fig:synth_func_results}e) show that RM-GP-UCB performs comparably to RGPE on this benchmark, outperforming both GP-UCB and TAF.


% \begin{figure}
% %\vspace{-2mm}
% 	\centering
% 	\begin{tabular}{cccc}
% 		\hspace{-4mm} \includegraphics[width=0.27\linewidth]{figures_rebuttal/svm_25_tasks_new_with_ts}& \hspace{-5.6mm}
% 		\includegraphics[width=0.28\linewidth]{figures_aaai/HAR_new}  & \hspace{-6mm}
% 		\includegraphics[width=0.27\linewidth]{figures_aaai/rl_cartpole_new_with_ts} & \hspace{-6mm}
% 		\includegraphics[width=0.27\linewidth]{figures_aaai/meta_weights_rl_all.pdf}\\
% 		{(a)} & {(b)} & {(c)} & {(d)}
% 	\end{tabular}%\vspace{-1.55mm}
% 	\caption{(a) Simple regret on SVM benchmark.
% 	(b) Best validation errors for HAR.
% 	(c) Best cumulative rewards and (d) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
% 	\label{fig:others}
% 	\vspace{-4mm}
% \end{figure}

\begin{figure}
%\vspace{-2mm}
	\centering
	\begin{tabular}{cc}
		\hspace{-4mm} \includegraphics[width=0.47\linewidth]{figures/clinical_with_mtbo_new_with_ts} & \hspace{-4mm}
		\includegraphics[width=0.47\linewidth]{figures/runtime_rebuttal_clinical_0_with_ts.pdf}\\
		{(a)} & {(b)}\\
		\hspace{-4mm} \includegraphics[width=0.47\linewidth]{figures/svm_25_tasks_new_with_ts}& \hspace{-4mm}
		\includegraphics[width=0.47\linewidth]{figures/HAR_new}\\
		{(c)} & {(d)}\\
		\hspace{-4mm} \includegraphics[width=0.47\linewidth]{figures/rl_cartpole_new_with_ts} & \hspace{-4mm}
		\includegraphics[width=0.47\linewidth]{figures/meta_weights_rl_all.pdf}\\
		{(e)} & {(f)}
	\end{tabular} 
%	\vspace{-1.55mm}
	\caption{
	(a) Best validation error of LR for diabetes diagnosis.
	(b) Runtime 
% 	of different algorithms 
	in non-stationary BO experiment.
	(c) Simple regret on SVM benchmark.
	(d) Best validation errors for HAR.
	(e) Best cumulative rewards and (f) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
	\label{fig:cnn}%\vspace{-4mm}
\end{figure}

%\begin{figure}
%%\vspace{-2mm}
%	\centering
%	\begin{tabular}{ccc}
%		\hspace{-4mm} \includegraphics[width=0.335\linewidth]{figures_aaai/clinical_with_mtbo_new_with_ts} & \hspace{-5.6mm}
%		\includegraphics[width=0.33\linewidth]{figures_rebuttal/runtime_rebuttal_clinical_0_with_ts.pdf}& \hspace{-5.6mm}
%		\includegraphics[width=0.33\linewidth]{figures_rebuttal/svm_25_tasks_new_with_ts} \\
%		{(a)} & {(b)} & {(c)}\\
%		\hspace{-4mm} \includegraphics[width=0.335\linewidth]{figures_aaai/HAR_new}& \hspace{-6mm}
%		\includegraphics[width=0.33\linewidth]{figures_aaai/rl_cartpole_new_with_ts} & \hspace{-6mm}
%		\includegraphics[width=0.33\linewidth]{figures_aaai/meta_weights_rl_all.pdf}\\
%		{(d)} & {(e)} & {(f)}
%	\end{tabular} \vspace{-1.55mm}
%	\caption{
%	(a) Best validation error of LR for diabetes diagnosis.
%	(b) Runtime 
%% 	of different algorithms 
%	in non-stationary BO experiment.
%	(c) Simple regret on SVM benchmark.
%	(d) Best validation errors for HAR.
%	(e) Best cumulative rewards and (f) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
%	\label{fig:cnn}%\vspace{-4mm}
%\end{figure}


% \begin{figure}
% %\vspace{-2mm}
% 	\centering
% 	\begin{tabular}{ccc}
% 		\hspace{-4mm} \includegraphics[width=0.3\linewidth]{figures_rebuttal/Omniglot_new} & \hspace{-5.6mm}
% 		\includegraphics[width=0.3\linewidth]{figures_rebuttal/svm_25_tasks_new_with_ts}& \hspace{-5.6mm}
% 		\includegraphics[width=0.3\linewidth]{figures_aaai/HAR_rank_new_with_ts} \\
% 		{(a)} & {(b)} & {(c)}\\
% 		\hspace{-4mm} \includegraphics[width=0.3\linewidth]{figures_aaai/clinical_with_mtbo_new_with_ts}& \hspace{-6mm}
% 		\includegraphics[width=0.3\linewidth]{figures_aaai/rl_cartpole_new_with_ts} & \hspace{-6mm}
% 		\includegraphics[width=0.3\linewidth]{figures_aaai/meta_weights_rl_all.pdf}\\
% 		{(d)} & {(e)} & {(f)}
% 	\end{tabular}%\vspace{-1.55mm}
% 	\caption{(a) Best validation error of CNN using Omniglot. (b) Simple regret on SVM benchmark.
% 	(c) Average rank of best validation errors for the HAR experiment. (d) Best validation error for diabetes diagnosis.
% 	(e) Best cumulative rewards and (f) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
% 	\label{fig:cnn}%\vspace{-4mm}
% \end{figure}


\textbf{Human Activity Recognition (HAR).}  
%\subsubsection{Human Activity Recognition (HAR)}  
HAR using mobile devices has promising applications in various domains such as healthcare~\citep{reyes2013human}.
When optimizing the configurations (hyperparameters) of the activity prediction model (ML model) for a subject, the previous optimization tasks for other subjects might be helpful.
However, cross-subject transfer in HAR is challenging due to high \emph{individual variability}~\citep{soleimani2019cross},
which makes HAR suitable for evaluating the robustness of a meta-BO algorithm against dissimilar meta-tasks.
We use the data collected through mobile phones from $30$ subjects performing $6$ activities and 
use \emph{support vector machines} (SVM) for activity prediction. 
Every task corresponds to tuning $2$ SVM hyperparameters for a subject.
We run a separate BO task ($30$ iterations) for each of $21$ subjects to generate the meta-observations ($M=21$, $N_i=30$ for $i=1,\ldots,21$)  
and use the other $9$ subjects for validation.
The results are shown in Fig.~\ref{fig:cnn}d (averaged over the $9$ subjects, each further averaged over $5$ random initializations),
in which RM-GP-UCB delivers the best performance, followed by RGPE; RM-GP-TS again fails to perform effectively, suggesting that it is less robust against the individual variability in HAR.
%The validation error for each subject is averaged over $10$ random initializations (shown in Fig.~\ref{fig:har_all}, Appendix~\ref{app:auto_ml}).
%Then, for each tested algorithm and in every iteration, the rank (among the $4$ algorithms) of their corresponding validation error is averaged over the $9$ subjects (Fig.~\ref{fig:cnn}c),
%which shows that
%RM-GP-UCB outperforms all other algorithms.\footnote{The average ranks are shown since the subjects have different scales of validation errors.}
%Of note, as a result of adverse dissimilar meta-tasks,
%both RGPE and TAF fail to outperform standard GP-UCB for $5$ of the $9$ subjects (Appendix~\ref{app:auto_ml}), while OM-GP-UCB fails to outperform GP-UCB for only $1$ subject.
%This indicates the practical robustness of OM-GP-UCB against dissimilar meta-tasks (i.e., subjects with highly different activity profiles) that are detrimental to the convergence of BO.

%\begin{figure}
%	\centering
%	\begin{tabular}{cc}
%		\hspace{-3mm} \includegraphics[width=0.48\linewidth]{figures/HAR_err_rank} & \hspace{-3mm}
%		\includegraphics[width=0.51\linewidth]{figures/clinical_err}\\
%		{(a)} & {(b)}
%	\end{tabular}\vspace{-2mm}
%	\caption{(a) Average rank of the best validation errors of SVM for human activity recognition.
%	(b) Best validation error of LR for diabetes diagnosis (20 random initializations).}
%	\label{fig:har_clinical}\vspace{-5mm}
%\end{figure}

%\begin{figure}[tb]
%	\centering
%	\begin{tabular}{cc}
%		\hspace{-3mm} \includegraphics[width=0.48\linewidth]{figures/rl_cartpole_err.pdf} & \hspace{-3mm}
%		\includegraphics[width=0.48\linewidth]{figures/rl_mountain_car_err.pdf}\\
%		{(a)} & {(b)}
%	\end{tabular}\vspace{-2mm}
%	\caption{Best (normalized) cumulative rewards in an episode for (a) Cart-Pole and (b) Mountain-Car (10 random initializations).}
%	\label{fig:rl_results}\vspace{-5mm}
%\end{figure}


%\vspace{-3mm}
%\subsection{Non-stationary Bayesian Optimization}
%\vspace{-1mm}
%\label{sec:non_stationary_bo}
%\subsubsection{Non-stationary Bayesian Optimization.}

%\begin{figure}
%	\centering
%	\begin{tabular}{ccc}
%		\hspace{-3mm} \includegraphics[width=0.33\linewidth]{figures_updated/clinical_with_mtbo_new} & \hspace{-4mm}
%		\includegraphics[width=0.355\linewidth]{figures_updated/rl_cartpole_new} & \hspace{-6mm}
%		\includegraphics[width=0.33\linewidth]{figures_updated/meta_weights_rl_all.pdf}\\
%		{(a)} & {(b)} & {(c)}
%	\end{tabular}%\vspace{-1.55mm}
%	\caption{(a) Best validation error of LR for diabetes diagnosis (10 random initializations).
%	(b) Best cumulative rewards and (c) learned meta-weights for the $3$ similar meta-tasks for the RL experiment.}
%%	Best (normalized) cumulative rewards for (b) Cart-Pole and (c) Mountain-Car (10 random initializations).}
%	\label{fig:har_clinical}%\vspace{-4mm}
%\end{figure}

%\begin{figure}
%	\centering
%	\begin{tabular}{ccc}
%		\hspace{-3mm} \includegraphics[width=0.3\linewidth]{figures/clinical_err} & \hspace{-4mm}
%		\includegraphics[width=0.3\linewidth]{figures/rl_cartpole_err} & \hspace{-4mm}
%		\includegraphics[width=0.3\linewidth]{figures/rl_mountain_car_err.pdf}\\
%		{(a)} & {(b)} & {(c)}
%	\end{tabular}\vspace{-2mm}
%	\caption{(a) Best validation error of LR for diabetes diagnosis (20 random initializations).
%	Best (normalized) cumulative rewards for (a) Cart-Pole and (b) Mountain-Car (10 random initializations).}
%	\label{fig:har_clinical}\vspace{-6mm}
%\end{figure}

%\vspace{-1mm}
%\subsection{Policy Search for Reinforcement Learning (RL)}
%\vspace{-2mm}
%\label{sec:rl}
\textbf{Policy Search for Reinforcement Learning (RL).}
%\subsubsection{Policy Search for Reinforcement Learning (RL)}
When optimizing the RL policy of an agent in an environment, the agent's experience in other related environments may help to make learning more efficient~\citep{duan2016rl, wang1611learning}.
We apply meta-BO to policy search in RL to maximize the cumulative rewards in an episode, using the Cart-Pole environment from OpenAI Gym~\citep{brockman2016openai} with $8$ policy parameters.
We simulate different environments by setting the agent to different initial states. 
In particular, we choose $M=10$ different initial states, among which the majority (i.e., $7$) are randomly generated (i.e., dissimilar meta-tasks) and 
the other $3$ are designed to be close to the initial state of the target task so that they are similar to the target task.
An independent BO task with $50$ iterations is run for every initial state, i.e., $N_i=50$ for $i=1,\ldots,10$.
Figs.~\ref{fig:cnn}e and~\ref{fig:cnn}f plot the (normalized) cumulative rewards of different algorithms and their learned meta-weights for the $3$ similar meta-tasks.
The results show that RM-GP-UCB achieves the best performance (Fig.~\ref{fig:cnn}e), 
%because it
and it is more effective than RGPE and TAF at identifying the $3$ similar meta-tasks (Fig.~\ref{fig:cnn}f).
% and thus diminishing the impact of the remaining dissimilar meta-tasks (Fig.~\ref{fig:others}d).
%The inability of RGPE and TAF to correctly identify similar meta-tasks reveals a disadvantage of their ranking-based method to identify task similarity (Section~\ref{sec:related_works}, second paragraph) when many target observations have the same value (more details in Appendix~\ref{app:auto_ml}).
RGPE and TAF fail to correctly identify similar meta-tasks because they learn the meta-weights based on how accurately each GP surrogate predicts the \emph{pairwise ranking of the target observations} (more details in Sec.~\ref{sec:related_works}). However, in the Cart-Pole environment, many target observations have equal values, which confuses the pairwise ranking and makes the learned meta-weights unreliable.
RM-GP-TS again only performs comparably with standard GP-UCB (Fig.~\ref{fig:cnn}e).

%When optimizing the RL policy of an agent in an environment, the agent's experience in other related environments may help to make learning more efficient~\citep{duan2016rl, wang1611learning}.
%We apply meta-BO algorithms to policy search in RL to maximize the cumulative rewards in an episode using the Cart-Pole and Mountain-Car environments from OpenAI Gym \citep{brockman2016openai}.
%We simulate different environments by setting the agent to different initial states. 
%In particular, we choose $M=10$ different initial states, among which $7$ are randomly generated (i.e., dissimilar meta-tasks) and the other $3$ are designed to be close to the initial state of the target function.
%An independent BO task with $50$ iterations is run for every initial state, i.e., $N_i=50$ for $i=1,\ldots,10$.
%Figs.~\ref{fig:har_clinical}b and~\ref{fig:har_clinical}c 
%show that OM-GP-UCB, 
%by assigning small weights to dissimilar meta-tasks (Fig.~\ref{fig:rl_ws} in Appendix~\ref{app:rl}), performs the best 
%for Cart-Pole and comparably to RGPE for Mountain-Car, for which TAF works unfavorably.

%\begin{figure}[tb]
%	\centering
%	\begin{tabular}{cc}
%		\hspace{-3mm} \includegraphics[width=0.48\linewidth]{figures/rl_cartpole_err.pdf} & \hspace{-3mm}
%		\includegraphics[width=0.48\linewidth]{figures/rl_mountain_car_err.pdf}\\
%		{(a)} & {(b)}
%	\end{tabular}\vspace{-2mm}
%	\caption{Best (normalized) cumulative rewards in an episode for (a) Cart-Pole and (b) Mountain-Car (10 random initializations).}
%	\label{fig:rl_results}\vspace{-5mm}
%\end{figure}

% \vspace{-1mm}
\subsection{Experimental Discussion}
\label{sec:experiment_discussion}
% \vspace{-1mm}
In most experimental results (Figs.~\ref{fig:synth_func_results} and~\ref{fig:cnn}), 
%RM-GP-UCB starts to outperform GP-UCB from the beginning. 
the performance advantage of RM-GP-UCB is most evident at the initial stage.
This likely corroborates our theoretical insight that 
%RM-GP-UCB can exploit the meta-tasks to reduce the need for exploration at the initial stage.
the meta-tasks can help improve the convergence of RM-GP-UCB at the initial stage by reducing the degree of exploration (Sec.~\ref{subsec:theory:rm_gp_ucb}).
% We also empirically demonstrate the scalability of our methods (Appendix~\ref{app:scalability}) by plotting the runtime in the non-stationary BO experiment and in a more large-scale version of the RL experiment (with $60$ meta-tasks, each containing $130$ meta-observations). These results show that RM-GP-UCB incurs considerably less runtime than MTBO and that RM-GP-TS is significantly more scalable than the other meta-BO methods.
% A commonly raised issue in meta-BO is that the meta-functions may have different scales from the target function, i.e., the meta-functions may be shifted or scaled versions of the target function~\citep{feurer2018scalable}.
A potential limitation of our online meta-weight optimization (Sec.~\ref{sec:online_regret_minimization}) is that it does not account for the scenario where the meta-functions are shifted or scaled versions of the target function.
% Nonetheless, this problem is not explicitly addressed here for the following reasons: 
% Firstly, 
However, note that in some scenarios, the scale of the meta-functions is informative about task similarity and thus should not be removed. For example, in our clinical diagnosis (i.e., non-stationary BO) experiment, the more recently completed meta-tasks (with larger training sets, smaller validation errors, and thus smaller function gaps) are expected to be more similar to the target task.
% Secondly, 
Furthermore, as demonstrated by the green curve in Fig.~\ref{fig:synth_func_results}a, in some cases, even though the meta-weights are not optimized, RM-GP-UCB still performs favorably. This implies its robustness against mis-specification of the meta-weights.

RM-GP-UCB is the only method that consistently outperforms standard GP-UCB in \emph{all} experiments (Figs.~\ref{fig:synth_func_results} and~\ref{fig:cnn}), whereas other methods perform either comparably with or worse than GP-UCB in some experiments (e.g., RGPE in Figs.~\ref{fig:synth_func_results}e and~\ref{fig:cnn}a, TAF in Figs.~\ref{fig:synth_func_results}e,~\ref{fig:synth_func_results}f and~\ref{fig:cnn}d).
This might be attributed to RM-GP-UCB's theoretically guaranteed robustness against dissimilar meta-tasks (Sec.~\ref{sec:theoretical_analysis}) and its ability to diminish their impact in a principled way (Sec.~\ref{sec:online_regret_minimization}).
In particular, RM-GP-UCB performs significantly better than RM-GP-TS in those experiments with a large number of dissimilar meta-tasks (Figs.~\ref{fig:cnn}c--e), which may be explained by the stronger theoretically guaranteed robustness of RM-GP-UCB against dissimilar meta-tasks compared with that of RM-GP-TS (Sec.~\ref{subsec:theory:ts}).
However, Figs.~\ref{fig:synth_func_results}e--f and~\ref{fig:cnn}a show that RM-GP-TS performs competitively in some experiments with more favorable settings (i.e., fewer dissimilar meta-tasks), which might result from the repeatedly observed empirical effectiveness of TS-based algorithms~\citep{chapelle2011empirical,russo2017tutorial}.
% However, RM-GP-TS performs competitively in some experiments with less dissimilar meta-tasks (e.g., Figs.~\ref{fig:cnn}a-c), which might result from the repeatedly observed empirical effectiveness of TS-based algorithms~\citep{chapelle2011empirical,russo2017tutorial}.
Moreover, the computational efficiency of RM-GP-TS is markedly superior to that of the other methods (Fig.~\ref{fig:cnn}b).
These theoretical and empirical comparisons between RM-GP-UCB and RM-GP-TS may provide useful insights for other meta-BO algorithms and potentially for a broader range of problems (e.g., meta-learning for multi-armed bandits and RL) in terms of the relative strengths and weaknesses of UCB- and TS-based algorithms.

%Moreover, RM-GP-UCB performs favorably throughout all experiments.
%Specifically, RM-GP-UCB outperforms standard GP-UCB in \emph{all} real-world experiments,
%while TAF (RGPE) fails to outperform standard GP-UCB in Figs.~\ref{fig:synth_func_results}e and~\ref{fig:synth_func_results}f (Figs.~\ref{fig:synth_func_results}e and~\ref{fig:cnn}c),
%which is likely to result from the negative impact of harmful dissimilar meta-tasks.
%This might suggest RM-GP-UCB's superior ability to prevent the convergence of BO from being affected by dissimilar meta-tasks,
%which is believed to be largely due to its theoretically guaranteed convergence even when faced with dissimilar meta-tasks (Section~\ref{sec:theoretical_analysis}) and 
%its ability to identify dissimilar meta-tasks in a principled way (Section~\ref{sec:online_regret_minimization}).

\section{Related Works}
%\vspace{-1.0mm}
% \label{sec:related_works}
\label{sec:related_works}
% Some previous works on meta-BO build a joint GP surrogate using all previous and current observations, 
% and represent task similarity through meta-features~\citep{bardenet2013collaborative,schilling2016scalable,yogatama2014efficient}.
% However, these algorithms suffer from the requirement of handcrafted meta-features, which is avoided in other works that learn task similarity from the observations~\citep{swersky2013multi,shilton2017regret}. For example, multitask BO~\citep{swersky2013multi} uses a multitask GP as a surrogate and models each task as an output of the GP. 
% %The work of~\citep{shilton2017regret} has focused on the setting with all previous observations belonging to a single previous task and 
% %modeled the difference between the source and target functions either as a noise process or as a GP. 
% These works include all previous and current observations in a single GP surrogate and are thus limited by GP's scalability.
% %In addition, handling a large number of tasks may be problematic for~\citep{swersky2013multi} due to the need to invert the task correlation matrix.
% %The work of~\citep{perrone2018scalable} has replaced the GP surrogate with Bayesian linear regression for scalability and 
% %transferred information from previous tasks by learning a shared representation across tasks using a deep neural network.
% %Two other works~\citep{golovin2017google,poloczek2016warm} have specifically tackled the scenario of sequentially arriving tasks 
% %by using the GP surrogate to model the residual of the current task relative to the posterior mean of the previous tasks.
% %Besides,~\citep{feurer2015initializing} and~\citep{wistuba2015sequential} have 
% %learned a set of good initializations from the previous tasks.
% There have also been other empirical works which replace GP with Bayesian linear regression for scalability~\citep{perrone2018scalable},
% tackle sequentially arriving tasks~\citep{golovin2017google,poloczek2016warm}, learn a set of good initializations~\citep{feurer2015initializing,wistuba2015sequential}, learn a reduced search space for BO from previous tasks~\citep{perrone2019learning}, handle the issue of different function scales using Gaussian Copulas~\citep{salinas2020quantile}, 
% learn the task similarities through the distance between the distributions of the optima from different tasks~\citep{ramachandran2018information},
% or use the meta-observations to learn the entire acquisition function through RL~\citep{volpp2020meta}.
%or use the meta-observations to learn the entire acquisition function through an expensive training process using RL~\citep{volpp2020meta}.
%or study information transfer between GP models for supervised learning~\citep{cao2010adaptive}.

%Some previous works on meta-BO, such as multitask BO~\citep{swersky2013multi}, build a joint GP surrogate using all previous and current observations.
%% , and represent the task similarity using either handcrafted meta-features or learned task correlations. 
%% For example, multitask BO~\citep{swersky2013multi} uses a multitask GP as a surrogate and models each task as an output of the GP. 
%We defer more detailed discussions of these works, as well as other recent works on meta-BO, to Appendix~\ref{sec:related_works:app}.

%BO has been one of the most popular methods for black-box optimization~\citep{frazier2018tutorial,shahriari2016taking} and has been extended to handle different important settings such as privacy-preserving optimization~\citep{kharkovskii2020private} and risk-averse optimization~\citep{nguyen2021optimizing,nguyen2021value}.
Some previous works on meta-BO build a joint GP surrogate using all previous and current observations, 
and represent task similarity through meta-features~\citep{bardenet2013collaborative,schilling2016scalable,yogatama2014efficient}.
However, these algorithms suffer from the requirement of handcrafted meta-features, which is avoided in other works that learn task similarity from the observations~\citep{swersky2013multi,shilton2017regret}. For example, multitask BO~\citep{swersky2013multi} uses a multitask GP as a surrogate and models each task as an output of the GP. 
These works include all previous and current observations in a single GP surrogate and are thus limited by the scalability of GPs.
There have also been other empirical works which replace GPs with Bayesian linear regression for scalability~\citep{perrone2018scalable},
tackle sequentially arriving tasks~\citep{golovin2017google,poloczek2016warm}, learn a set of good initializations~\citep{feurer2015initializing,wistuba2015sequential}, learn a reduced search space for BO from previous tasks~\citep{perrone2019learning}, handle the issue of different function scales using Gaussian Copulas~\citep{salinas2020quantile}, 
learn the task similarities through the distance between the distributions of the optima from different tasks~\citep{ramachandran2018information},
or use the meta-observations to learn the entire acquisition function through RL~\citep{volpp2020meta}.
\citet{wang2018regret} have learned the GP prior from previous tasks and given theoretical guarantees. % regret bounds showing the benefit of increasing the number of meta-observations;
However, they have shown in both theory and practice that a large training set of meta-observations ($\geq 5000$) is required for their method to work well, while we focus on the more practical setting of meta-BO where the number of available meta-observations may be small.
We have also verified that our algorithm outperforms the method from~\citet{wang2018regret} in the experiment that is most favorable for their method among all our experiments (more details in the third paragraph of Sec.~\ref{subsec:automl}).
%however, they rely on potentially restrictive assumptions such as the availability of basis functions %(for compact domain)
%and require a large set of meta-observations to learn the GP prior.
%In contrast, our algorithm is free from these requirements.
%\citep{volpp2020meta} use meta-observations to train a neural network to represent the acquisition function;
%however, they require an expensive training process using RL and do not have theoretical guarantee.
Meta-BO is also related to the works on multi-fidelity BO~\citep{dai2019bayesian,kandasamy2016gaussian,poloczek2017multi,wu2020practical,zhang2020bayesian,zhang2017information}, since the previous tasks can be viewed as low-fidelity functions which can approximate the target function and are cheap to query. However, multi-fidelity BO allows querying the low-fidelity functions during the BO process, whereas meta-BO algorithms can only query the target function, i.e., the highest-fidelity function.
Moreover, meta-BO is related to previous works on BO which involve multiple agents (analogous to the multiple tasks in meta-BO), such as federated BO~\citep{dai2020federated,dai2021differentially,sim2021collaborative} or BO methods based on game-theoretic approaches~\citep{dai2020r2,sessa2019no}.

Some works have aimed to improve the scalability of GP-based meta-BO algorithms by building a separate GP surrogate for each task~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}.
%and leveraging a weighted combination of either the individual surrogate functions or acquisition functions~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}.
\citet{wistuba2016two} use a weighted combination of the posterior means of the individual GP surrogates as the joint posterior mean, 
while the posterior variance is derived using only the target observations.
RGPE~\citep{feurer2018scalable} has extended the work of~\citet{wistuba2016two} by estimating the joint objective function as a weighted combination of individual objective functions,
%(corresponding to the previous and current tasks)
such that the resulting joint surrogate remains a GP (unlike~\citet{wistuba2016two}) and can thus be plugged into standard BO algorithms.
Note that RGPE differs from our RM-GP-UCB algorithm in that RGPE uses a weighted combination of individual GP surrogates to derive a joint GP surrogate, whereas our RM-GP-UCB leverages a weighted combination of individual acquisition functions.
\citet{wistuba2018scalable} have proposed TAF, which also uses a weighted combination of the acquisition functions (i.e., expected improvement) from the individual tasks for query selection.
In these works, the weight of a previous task is heuristically chosen to be proportional to the accuracy of the \emph{pairwise ranking of the target observations} produced by 
either (a) the posterior mean of the GP surrogate of the previous task (TAF)~\citep{wistuba2018scalable} or 
(b) functions sampled from the posterior GP surrogate (RGPE)~\citep{feurer2018scalable}.
%In our experiments, we have compared our OM-GP-UCB algorithm with RGPE and TAF, which represent the best-performing scalable meta-BO algorithm making use of the weighted combination 
%of (a) the GP posterior predictions (RGPE) and (b) the acquisition functions (TAF) respectively,
%and have demonstrated that OM-GP-UCB outperforms both algorithms.
% We have shown in our experiments that our RM-GP-UCB algorithm outperforms both RGPE and TAF.
% Our analysis of RM-GP-TS is similar to that of~\citep{dai2020federated}, but we don't need to consider the issue of privacy, so we don't have to use RFF.
% Our theoretical analysis of RM-GP-TS shares similarity with the work of~\citep{dai2020federated} but has important differences, e.g., unlike the work of~\citep{dai2020federated}, RM-GP-TS does not suffer from the error introduced by random Fourier features approximation since we do not need to consider the issues of retaining (hence not transmitting) the raw data and communication efficiency.

% \vspace{-1.0mm}
\section{Conclusion}
\label{sec:conclusion}
% \vspace{-1.0mm}
% We introduce two provably robust meta-BO algorithms, 
We have introduced
RM-GP-UCB and RM-GP-TS, both of which are asymptotically no-regret even if all meta-tasks are dissimilar to the target task.
We leverage the theoretical results to learn the task similarities in a principled way via online learning.
Theoretical and empirical comparisons show that RM-GP-UCB is more robust against dissimilar tasks, whereas RM-GP-TS performs effectively in more favorable cases and is more computationally efficient.
% Since our methods improve the efficiency of the black-box optimization method of BO, a potential negative societal impact is its misuse for malicious purposes.
% For example, they may be used to for more efficient black-box adversarial attacks. A potential way to mitigate such risks is to leverage our methods to design more effective adversarial defense mechanisms.

% This paper introduces RM-GP-UCB, a scalable, principled and robust meta-BO algorithm that is asymptotically no-regret even when all meta-tasks are dissimilar to the target task.
% The regret upper bound of RM-GP-UCB is minimized via online learning to learn task similarity and identify harmful dissimilar tasks.
% %RM-GP-UCB has been empirically shown to deliver robust and effective performances in a variety of real-world applications.
% Note that for our acquisition function~\eqref{acq_func}, we can also use a weighted combination of other acquisition functions instead of GP-UCB (although the theoretical results presented here would not hold anymore), and it is interesting to explore this in future work. Moreover, we will also investigate whether our algorithm also performs well in problems with high-dimensional input, which is a major challenge for current BO algorithms.


% %\vspace{-1.0mm}
% \section{Related Works}
% %\vspace{-1.0mm}
% \label{sec:related_works}
% Some previous meta-BO works build a joint GP surrogate using all previous and current observations, 
% and represent task similarity through meta-features~\citep{bardenet2013collaborative,schilling2016scalable,yogatama2014efficient}.
% However, these algorithms suffer from the requirement of handcrafted meta-features, which is 
% avoided in other works that learn task similarity from the observations~\citep{swersky2013multi,shilton2017regret}.
% For example, multitask BO~\citep{swersky2013multi} uses a multitask GP as a surrogate and models each task as an output of the GP. 
% %The work of~\citep{shilton2017regret} has focused on the setting with all previous observations belonging to a single previous task and 
% %modeled the difference between the source and target functions either as a noise process or as a GP. 
% These works include all previous and current observations in a single GP surrogate and are thus limited by GP's scalability.
% %In addition, handling a large number of tasks may be problematic for~\citep{swersky2013multi} due to the need to invert the task correlation matrix.
% %The work of~\citep{perrone2018scalable} has replaced the GP surrogate with Bayesian linear regression for scalability and 
% %transferred information from previous tasks by learning a shared representation across tasks using a deep neural network.
% %Two other works~\citep{golovin2017google,poloczek2016warm} have specifically tackled the scenario of sequentially arriving tasks 
% %by using the GP surrogate to model the residual of the current task relative to the posterior mean of the previous tasks.
% %Besides,~\citep{feurer2015initializing} and~\citep{wistuba2015sequential} have 
% %learned a set of good initializations from the previous tasks.
% There have also been other empirical works which replace GP with Bayesian linear regression for scalability~\citep{perrone2018scalable},
% tackle sequentially arriving tasks~\citep{golovin2017google,poloczek2016warm}, learn a set of good initializations~\citep{feurer2015initializing,wistuba2015sequential}, learn a reduced search space for BO from previous tasks~\citep{perrone2019learning}, handle the issue of different function scales using Gaussian Copulas~\citep{salinas2020quantile}, 
% or use the meta-observations to learn the entire acquisition function through RL~\citep{volpp2020meta}.
% %or use the meta-observations to learn the entire acquisition function through an expensive training process using RL~\citep{volpp2020meta}.
% %or study information transfer between GP models for supervised learning~\citep{cao2010adaptive}.
% \citep{wang2018regret} have learned the GP prior from previous tasks and also given theoretical guarantee. % regret bounds showing the benefit of increasing the number of meta-observations;
% However, they have shown in both theory and practice that a large training set of meta-observations ($\geq 5000$) is required for their method to work well, while we focus on the more practical setting of meta-BO where the number of available meta-observations may be small.
% We have also verified that our algorithm outperforms the method from~\citep{wang2018regret} in the experiment that is most favorable for their method among all our experiments (Sec.~\ref{subsec:automl}).
% %however, they rely on potentially restrictive assumptions such as the availability of basis functions %(for compact domain)
% %and require a large set of meta-observations to learn the GP prior.
% %In contrast, our algorithm is free from these requirements.
% %\citep{volpp2020meta} use meta-observations to train a neural network to represent the acquisition function;
% %however, they require an expensive training process using RL and do not have theoretical guarantee.

% Some recent works aim to improve the scalability of GP-based meta-BO algorithms by building a separate GP surrogate for each task~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}.
% %and leveraging a weighted combination of either the individual surrogate functions or acquisition functions~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}.
% \citep{wistuba2016two} use a weighted combination of the posterior mean of each individual GP surrogate as the joint posterior mean 
% while the posterior variance is derived using only the target observations.
% RGPE~\citep{feurer2018scalable} has extended the work of~\citep{wistuba2016two} by estimating the joint objective function as a weighted combination of individual objective functions,
% %(corresponding to the previous and current tasks)
% such that the resulting joint surrogate remains a GP (unlike~\citep{wistuba2016two}) and can thus be plugged into standard BO algorithms.
% Note that RGPE differs from our RM-GP-UCB algorithm in that RGPE uses a weighted combination of individual GP surrogates to derive a joint GP surrogate, whereas we leverage a weighted combination of individual acquisition functions.
% \citep{wistuba2018scalable} have proposed TAF, which also uses a weighted combination of the acquisition functions (i.e., expected improvement) from the individual tasks for query selection.
% In these works, the weight of a previous task is heuristically chosen to be proportional to the accuracy of the \emph{pairwise ranking of the target observations} produced by 
% either (a) the posterior mean of the GP surrogate of the previous task (TAF)~\citep{wistuba2018scalable} or 
% (b) functions sampled from the posterior GP surrogate (RGPE)~\citep{feurer2018scalable}.
% %In our experiments, we have compared our OM-GP-UCB algorithm with RGPE and TAF, which represent the best-performing scalable meta-BO algorithm making use of the weighted combination 
% %of (a) the GP posterior predictions (RGPE) and (b) the acquisition functions (TAF) respectively,
% %and have demonstrated that OM-GP-UCB outperforms both algorithms.
% We have shown in our experiments that our RM-GP-UCB algorithm outperforms both RGPE and TAF.


\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
This research/project is supported by A*STAR under its RIE2020 Advanced Manufacturing and Engineering (AME) Industry Alignment Fund -- Pre Positioning (IAF-PP) (Award A19E4a0101) and by the Singapore Ministry of Education Academic Research Fund Tier~1. This research is part of the programme DesCartes and is supported by the National Research Foundation, Prime Minister's Office, Singapore under its Campus for Research Excellence and Technological Enterprise (CREATE) programme.

\end{acknowledgements}

\newpage

%\nocite{rahimi2008random}

\bibliography{dai_226}
% \bibliographystyle{abbrv}

\end{document}