% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams



\usepackage{xr-hyper}

% \addFileDependency{<file name with extension>}
% Announces an external file to the build: writes "(<file>)" to the
% terminal/log with \typeout, hands the name to \@addtofilelist, and prints
% "No file <file>." when \IfFileExists cannot find it.
% NOTE(review): presumably the parenthesised \typeout is what lets tools that
% scan the log (e.g. latexmk) pick the file up as a dependency -- confirm.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}% trailing % stops the line end from becoming a space token
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
% Thin wrapper around \externaldocument: after calling it for <basename>,
% registers both <basename>.tex and <basename>.aux via \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
  \externaldocument{#1}%
  \addFileDependency{#1.tex}\addFileDependency{#1.aux}%
}

\myexternaldocument{dai_226}


% if you need to pass options to natbib, use, e.g.:
%     \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2021

% ready for submission
% \usepackage{neurips_2021}

% to compile a preprint version, e.g., for submission to arXiv, add the
% [preprint] option:
%     \usepackage[preprint]{neurips_2021}

% to compile a camera-ready version, add the [final] option, e.g.:
%     \usepackage[final]{neurips_2021}

% to avoid loading the natbib package, add option nonatbib:
%    \usepackage[nonatbib]{neurips_2021}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
% \usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors



% Recommended, but optional, packages for figures and better typesetting:
\usepackage{microtype}
\usepackage{graphicx}
%\usepackage{subfigure}
\usepackage{booktabs} % for professional tables

\usepackage{commath}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bbm}
% \usepackage[english]{babel}
% \usepackage{graphicx}
 \usepackage{subcaption}
% \usepackage{subfig}
%\usepackage{graphicx}

\usepackage[toc,page]{appendix}

\usepackage{algorithm}
\usepackage{algorithmic}

\allowdisplaybreaks

% Theorem-like environments. Each environment is declared without a shared
% or parent counter, so theorems, corollaries, propositions, lemmas,
% assumptions and definitions are all numbered independently of one another.
% The commented-out declarations are alternative numbering schemes
% (per-section numbering, or sharing the theorem counter).
%\newtheorem{theorem}{Theorem}[section]
\newtheorem{theorem}{Theorem}
\newtheorem{corollary}{Corollary}
\newtheorem{proposition}{Proposition}
%\newtheorem{corollary}{Corollary}[theorem]
%\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{lemma}{Lemma}
%\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}


%\usepackage{hyperref}

%\newcommand{\theHalgorithm}{\arabic{algorithm}}


% Number sections with capital letters (A, B, ...) and subsections as
% <section letter>.<subsection number>.
\renewcommand\thesection{\Alph{section}}
\renewcommand\thesubsection{\thesection.\arabic{subsection}}

% Figures are printed with plain arabic numbers; the counter is preset to 2,
% so the first figure in this document is numbered 3.
% NOTE(review): presumably this continues the figure numbering of the main
% paper -- confirm.
\renewcommand\thefigure{\arabic{figure}}
\setcounter{figure}{2}


%\title{Robust Meta-Bayesian Optimization with Online Regret Minimization}
%\title{No-Regret Meta-Bayesian Optimization with Online Regret Minimization}
%\title{Provably Robust Meta-Bayesian Optimization}
\title{On Provably Robust Meta-Bayesian Optimization (Supplementary material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<dzx@nus.edu.sg>?Subject=Your UAI 2022 paper}{Zhongxiang Dai}{}}
\author[1]{Yizhou Chen}
\author[2]{Haibin Yu}
\author[1]{Bryan Kian Hsiang Low}
\author[3]{Patrick Jaillet}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
Department of Computer Science, National University of Singapore, Republic of Singapore
}
\affil[2]{%
Department of Data Platform, Tencent
}
\affil[3]{%
Department of Electrical Engineering and Computer Science, Massachusetts Institute of Technology, USA
}
  
\begin{document}
\onecolumn

\maketitle


\setcounter{equation}{7}
\setcounter{lemma}{1}


%\bibliography{dai_226}
% \bibliographystyle{abbrv}


%\newpage
%\appendix

% \section{}
%\vspace{-1.0mm}
%\section{More Related Works}
%%\vspace{-1.0mm}
%% \label{sec:related_works}
%\label{sec:related_works:app}
%Some previous works on meta-BO build a joint GP surrogate using all previous and current observations, 
%and represent task similarity through meta-features~\citep{bardenet2013collaborative,schilling2016scalable,yogatama2014efficient}.
%However, these algorithms suffer from the requirement of handcrafted meta-features, which is avoided in other works that learn task similarity from the observations~\citep{swersky2013multi,shilton2017regret}. For example, multitask BO~\citep{swersky2013multi} uses a multitask GP as a surrogate and models each task as an output of the GP. 
%%The work of~\citep{shilton2017regret} has focused on the setting with all previous observations belonging to a single previous task and 
%%modeled the difference between the source and target functions either as a noise process or as a GP. 
%These works include all previous and current observations in a single GP surrogate and are thus limited by GP's scalability.
%%In addition, handling a large number of tasks may be problematic for~\citep{swersky2013multi} due to the need to invert the task correlation matrix.
%%The work of~\citep{perrone2018scalable} has replaced the GP surrogate with Bayesian linear regression for scalability and 
%%transferred information from previous tasks by learning a shared representation across tasks using a deep neural network.
%%Two other works~\citep{golovin2017google,poloczek2016warm} have specifically tackled the scenario of sequentially arriving tasks 
%%by using the GP surrogate to model the residual of the current task relative to the posterior mean of the previous tasks.
%%Besides,~\citep{feurer2015initializing} and~\citep{wistuba2015sequential} have 
%%learned a set of good initializations from the previous tasks.
%There have also been other empirical works which replace GP with Bayesian linear regression for scalability~\citep{perrone2018scalable},
%tackle sequentially arriving tasks~\citep{golovin2017google,poloczek2016warm}, learn a set of good initializations~\citep{feurer2015initializing,wistuba2015sequential}, learn a reduced search space for BO from previous tasks~\citep{perrone2019learning}, handle the issue of different function scales using Gaussian Copulas~\citep{salinas2020quantile}, 
%learn the task similarities through the distance between the distributions of the optima from different tasks~\citep{ramachandran2018information},
%or use the meta-observations to learn the entire acquisition function through RL~\citep{volpp2020meta}.
%%or use the meta-observations to learn the entire acquisition function through an expensive training process using RL~\citep{volpp2020meta}.
%%or study information transfer between GP models for supervised learning~\citep{cao2010adaptive}.
%% The work of~\citep{wang2018regret} has learned the GP prior from previous tasks and also given theoretical guarantee. % regret bounds showing the benefit of increasing the number of meta-observations;
%% However, they have shown in both theory and practice that a large training set of meta-observations ($\geq 5000$) is required for their method to work well, while we focus on the more practical setting of meta-BO where the number of available meta-observations may be small.
%% We have also verified that our algorithm outperforms the method from~\citep{wang2018regret} in the experiment that is most favorable for their method among all our experiments (more details in Sec.~\ref{subsec:automl}, Hyperparameter Tuning for Support Vector Machines).
%%however, they rely on potentially restrictive assumptions such as the availability of basis functions %(for compact domain)
%%and require a large set of meta-observations to learn the GP prior.
%%In contrast, our algorithm is free from these requirements.
%%\citep{volpp2020meta} use meta-observations to train a neural network to represent the acquisition function;
%%however, they require an expensive training process using RL and do not have theoretical guarantee.
%
%% Some recent works aim to improve the scalability of GP-based meta-BO algorithms by building a separate GP surrogate for each task~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}.
%% %and leveraging a weighted combination of either the individual surrogate functions or acquisition functions~\citep{feurer2018scalable,wistuba2016two,wistuba2018scalable}.
%% \citep{wistuba2016two} use a weighted combination of the posterior mean of each individual GP surrogate as the joint posterior mean 
%% while the posterior variance is derived using only the target observations.
%% RGPE~\citep{feurer2018scalable} has extended the work of~\citep{wistuba2016two} by estimating the joint objective function as a weighted combination of individual objective functions,
%% %(corresponding to the previous and current tasks)
%% such that the resulting joint surrogate remains a GP (unlike~\citep{wistuba2016two}) and can thus be plugged into standard BO algorithms.
%% Note that RGPE differs from our RM-GP-UCB algorithm in that RGPE uses a weighted combination of individual GP surrogates to derive a joint GP surrogate, whereas our RM-GP-UCB leverage a weighted combination of individual acquisition functions.
%% \citep{wistuba2018scalable} have proposed TAF, which also uses a weighted combination of the acquisition functions (i.e., expected improvement) from the individual tasks for query selection.
%% In these works, the weight of a previous task is heuristically chosen to be proportional to the accuracy of the \emph{pairwise ranking of the target observations} produced by 
%% either (a) the posterior mean of the GP surrogate of the previous task (TAF)~\citep{wistuba2018scalable} or 
%% (b) functions sampled from the posterior GP surrogate (RGPE)~\citep{feurer2018scalable}.
%% %In our experiments, we have compared our OM-GP-UCB algorithm with RGPE and TAF, which represent the best-performing scalable meta-BO algorithm making use of the weighted combination 
%% %of (a) the GP posterior predictions (RGPE) and (b) the acquisition functions (TAF) respectively,
%% %and have demonstrated that OM-GP-UCB outperforms both algorithms.
%% % We have shown in our experiments that our RM-GP-UCB algorithm outperforms both RGPE and TAF.
%% % Our analysis of RM-GP-TS is similar to that of~\citep{dai2020federated}, but we don't need to consider the issue of privacy, so we don't have to use RFF.
%% Our theoretical analysis of RM-GP-TS shares similarity with the work of~\citep{dai2020federated} but has important differences, e.g., unlike the work of~\citep{dai2020federated}, RM-GP-TS does not suffer from the error introduced by random Fourier features approximation since we do not need to consider the issues of retaining (hence not transmitting) the raw data and communication efficiency.

\section{Proof of Theorem~\ref{regret_bound}}
\label{app:first_section}
To begin with, we need the following lemma to give a high-probability confidence bound on the target function, which will be used in the theoretical analysis of both Theorems~\ref{regret_bound} and~\ref{regret_bound_ts}.
\begin{lemma}
\label{gaussian_bound}
Let $\delta \in (0,1)$ and $\beta_t=B + \sigma \sqrt{2(\gamma_{t-1} + 1 + \log(4/\delta))}$, then
\begin{equation*}
|f(\mathbf{x})-\mu_{t-1}(\mathbf{x})| \leq \beta_t\sigma_{t-1}(\mathbf{x}) \qquad \forall \mathbf{x}\in \mathcal{D},\, t\geq1
\end{equation*}
which holds with probability of $\geq1-\delta/4$.
\end{lemma}
% \begin{lemma}
% \label{gaussian_bound}
% Let $\delta \in (0,1)$ and $\beta_t=2\log(\frac{2|\mathcal{D}|t^2\pi^2}{3\delta})$, then
% \begin{equation*}
% |f(\mathbf{x})-\mu_{t-1}(\mathbf{x})| \leq \sqrt{\beta_t}\sigma_{t-1}(\mathbf{x}) \qquad \forall \mathbf{x}\in \mathcal{D},\, t\geq1
% \end{equation*}
% which holds with probability $\geq1-\delta/4$.
% \end{lemma}
Lemma~\ref{gaussian_bound} follows directly from Theorem 2 of~\citep{chowdhury2017kernelized}.


To facilitate the theoretical analysis of RM-GP-UCB, we introduce the following auxiliary term:
\begin{equation}
\widetilde{\zeta}_t(\mathbf{x})=\nu_t\left[\sum^M_{i=1}\omega_i\left[\widetilde{\mu}_{i}(\mathbf{x}) + \tau\widetilde{\sigma}_{i}(\mathbf{x})\right]\right] 
+ (1-\nu_t)\left[\mu_{t-1}(\mathbf{x})+\beta_t\sigma_{t-1}(\mathbf{x})\right]
\label{acq_fake}
\end{equation}
in which $\widetilde{\mu}_{i}(\mathbf{x})$ and $\widetilde{\sigma}_{i}(\mathbf{x})$ are obtained by replacing each noisy output of the meta-observations $y_{i,j}$ in the calculation of $\overline{\mu}_{i}(\mathbf{x})$ and $\overline{\sigma}_{i}(\mathbf{x})$~\eqref{acq_func}
by the (hypothetically available) noisy target function output observation at the corresponding input $\mathbf{x}_{i,j}$.
Eq.~\eqref{acq_fake} will serve as the bridge to connect the acquisition function of RM-GP-UCB~\eqref{acq_func} with the target function $f$ in the subsequent theoretical analysis, which will be demonstrated in Appendix~\ref{app:proof_theorem_1}.
To simplify exposition, we omit the superscript in our notation to represent the acquisition function~\eqref{acq_func}, i.e., we use $\overline{\zeta}_t$ to denote the acquisition function of RM-GP-UCB instead of $\overline{\zeta}^{\text{UCB}}_t$.
The next lemma shows that the difference between $\overline{\zeta}_t(\mathbf{x})$~\eqref{acq_func} and $\widetilde{\zeta}_t(\mathbf{x})$~\eqref{acq_fake} is bounded $\forall \mathbf{x} \in \mathcal{D}$, whose proof is given in Appendix~\ref{app:proof_lemma_1}.
\begin{lemma}
\label{ucb_diff}
Let $\delta \in (0, 1)$. Suppose the RM-GP-UCB algorithm is run with parameters $\nu_t\in [0,1]$ $\forall t\geq 1$, and $\omega_i\geq 0$ for $i=1,\ldots,M$ and $\sum_{i=1,\ldots,M}\omega_i=1$. 
Then with probability of $\geq 1 - \delta / 4$,
\[    \left|\overline{\zeta}_t(\mathbf{x})-\widetilde{\zeta}_t(\mathbf{x})\right| \leq \nu_t\alpha \qquad \forall \mathbf{x} \in \mathcal{D}
\]
in which 
\[
\alpha \triangleq \sum^M_{i=1}\omega_i \frac{N_i}{\sigma^2}(2\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+d_i).
\]
%\[    \alpha \triangleq \sum^M_{i=1}\omega_i \alpha_i,
%\]
%\[    \alpha_i \triangleq  \frac{N_i}{\sigma^2}\left(2\sqrt{N_i}\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+d_i\sqrt{N_i}\right).
%\]
\end{lemma}

% Next, we need the following lemma that shows upper and lower bounds on the target function values, which follows from Gaussian concentration inequality and will be used extensively in the subsequent proofs.
Next, because $\widetilde{\mu}_{i}(\mathbf{x})$ and $\widetilde{\sigma}_{i}(\mathbf{x})$ are calculated using the (hypothetically available) noisy observations of the target function (i.e., same as $\mu_{t-1}(\mathbf{x})$ and $\sigma_{t-1}(\mathbf{x})$), we can also get the following lemma on the concentration of the target function $f$
% Next, we also need to the following lemma on the concentration of the target function $f$ 
which, similar to Lemma~\ref{gaussian_bound} above, also follows directly from Theorem 2 of~\citep{chowdhury2017kernelized}.
\begin{lemma}
\label{lemma:confidence:bound:tau}
% Furthermore, 
Let 
% $\tau=2\log (4|\mathcal{D}|M/\delta)$, 
$\tau=B + \sigma \sqrt{2(\gamma_{N} + 1 + \log(4M/\delta))}$,
we have that
\begin{equation*}
|f(\mathbf{x})-\widetilde{\mu}_{i}(\mathbf{x})| \leq \tau\widetilde{\sigma}_{i}(\mathbf{x}) \qquad \forall \mathbf{x}\in \mathcal{D},\, i=1,\ldots,M,
\end{equation*}
which also holds with probability $\geq1-\delta/4$.
\end{lemma}
% Note that since $\widetilde{\mu}_{i}(\mathbf{x})$ and $\widetilde{\sigma}_{i}(\mathbf{x})$ (defined in~\eqref{acq_fake}) are calculated using the (hypothetically available) noisy observations of the target function, we can also get the following lemma on the concentration of the target function $f$
% and Lemma~\ref{gaussian_bound} can be proven with a slight modification to Lemma $5.1$ of~\citep{srinivas2009gaussian}.

\subsection{Proof of Lemma~\ref{ucb_diff}}
\label{app:proof_lemma_1}
Let $\mathbf{K}_i=[k(\mathbf{x}_{i,j}, \mathbf{x}_{i,j'})]_{j,j'=1,\ldots,N_i}$ represent the Gram matrix corresponding to the inputs of the meta-observations from meta-task $i$, and $\mathbf{k}_i(\mathbf{x})=[k(\mathbf{x}_{i,j}, \mathbf{x})]^{\top}_{j=1,\ldots,N_i}$.
Denote by $\lambda_j[\mathbf{A}]$ the $j$-th eigenvalue of matrix $\mathbf{A}$.
%Firstly, we need the following lemma proving an upper bound on Frobenius norm:
%\begin{lemma}
%\label{frob_norm}
%\[
%\norm{\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}}_F \leq \frac{\sqrt{N_i}}{\sigma^2}.
%\]
%\end{lemma}
%\begin{proof}
%\begin{equation*}
%\begin{split}
%    \norm{\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}}_F &\stackrel{\text{(a)}}{=} \sqrt{\textrm{Tr}\left(\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\left(\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\right)^T\right)}\\
%    &\stackrel{\text{(b)}}{=} \sqrt{\textrm{Tr}\left(\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\right)}\\
%    &\stackrel{\text{(c)}}{=} \sqrt{\sum^{N_i}_{j=1} \lambda_j \left[\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\right]}\\
%    &= \sqrt{\sum^{N_i}_{j=1} \left(\lambda_j \left[\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\right]\right)^2}\\
%    &= \sqrt{\sum^{N_i}_{j=1} \left(\frac{1} {\lambda_j \left[\mathbf{K}_{i}+\sigma^2I\right]}\right)^2}\\
%    &= \sqrt{\sum^{N_i}_{j=1} \left(\frac{1} {\lambda_j \left[\mathbf{K}_{i}\right]+\sigma^2}\right)^2}\\
%    &\stackrel{\text{(d)}}{\leq} \sqrt{\sum^{N_i}_{j=1}\frac{1}{(\sigma^2)^2}} = \frac{\sqrt{N_i}}{\sigma^2}
%\end{split}
%\end{equation*}
%in which (a) results from the definition of matrix Frobenius norm, (b) follows since $\mathbf{K}_{i}+\sigma^2I$ (hence its inverse) is symmetric, 
%(c) holds since the trace of a matrix is equal to the sum of its eigenvalues, the ensuing equalities make use of several identities of matrix eigenvalues.
%(d) follows because all eigenvalues of $\mathbf{K}_i$ are non-negative since $\mathbf{K}_i$ is positive semi-definite (because the kernel $k$ is positive semi-definite).
%\end{proof}
Firstly, we need the following lemma proving an upper bound on matrix $L_2$ norm:
\begin{lemma}
\label{frob_norm}
For all $i=1,\ldots,M$, we have that
\[
\norm{\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}}_2 \leq \frac{1}{\sigma^2}.
\]
\end{lemma}
\begin{proof}
\begin{equation*}
\begin{split}
    \norm{\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}}_2 &= \sqrt{\max_{j=1,\ldots,N_i} \lambda_j\left[\left(\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\right)^{\top} \left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\right]}\\
    &=\sqrt{\max_{j=1,\ldots,N_i} \lambda_j\left[\left(\mathbf{K}_{i}+\sigma^2I\right)^{-1}\right]^2}\\
    &\leq \frac{1}{\sigma^2}
\end{split}
\end{equation*}
%in which (a) results from the definition of matrix Frobenius norm, (b) follows since $\mathbf{K}_{i}+\sigma^2I$ (hence its inverse) is symmetric, 
%(c) holds since the trace of a matrix is equal to the sum of its eigenvalues, the ensuing equalities make use of several identities of matrix eigenvalues.
%(d) follows because all eigenvalues of $\mathbf{K}_i$ are non-negative since $\mathbf{K}_i$ is positive semi-definite (because the kernel $k$ is positive semi-definite).
\end{proof}

Next, define $\overline{\mathbf{f}}_{i}=[f_i(\mathbf{x}_{i,j})]_{j=1,\ldots,N_i}$ (in which $f_i(\mathbf{x}_{i,j})$ represents the value of meta-function $i$ at input $\mathbf{x}_{i,j}$), 
and $\widetilde{\mathbf{f}}_{i}=[f(\mathbf{x}_{i,j})]_{j=1,\ldots,N_i}$ (in which $f(\mathbf{x}_{i,j})$ represents the value of target function at input $\mathbf{x}_{i,j}$).
Similarly, define $\overline{\mathbf{y}}_{i}=[y_{i,j}]_{j=1,\ldots,N_i}$ (in which $y_{i,j}$ represents the noisy output observation of meta-task $i$ at input $\mathbf{x}_{i,j}$), 
and $\widetilde{\mathbf{y}}_{i}=[y(\mathbf{x}_{i,j})]_{j=1,\ldots,N_i}$ (in which $y(\mathbf{x}_{i,j})$ represents the hypothetically observed noisy output observation of the target function at input $\mathbf{x}_{i,j}$).
With these definitions, the next lemma shows upper bounds on the distance between $\overline{\mathbf{y}}_{i}$ and $\overline{\mathbf{f}}_{i}$, as well as the distance between $\widetilde{\mathbf{y}}_{i}$ and $\widetilde{\mathbf{f}}_{i}$.
\begin{lemma}
\label{bound_with_obs_noise}
With probability $\geq 1 - \delta/4$,
\begin{equation*}
\begin{split}
\norm{\overline{\mathbf{y}}_{i} - \overline{\mathbf{f}}_{i}}_2 \leq \sqrt{N_i} \sqrt{2\sigma^2\log\frac{8N_i}{\delta}},\\
\norm{\widetilde{\mathbf{y}}_{i} - \widetilde{\mathbf{f}}_{i}}_2 \leq \sqrt{N_i} \sqrt{2\sigma^2\log\frac{8N_i}{\delta}}.
\end{split}
\end{equation*}
\end{lemma}
\begin{proof}
Following the same analysis as Lemma 5.1 of \citep{srinivas2009gaussian}, we have that for the standard Gaussian random variable $z \sim \mathcal{N}(0, 1)$,
\begin{equation}
\mathbb{P}(\left|z\right| > c) \leq e^{-\frac{c^2}{2}}.
\label{standard_gaussian}
\end{equation}
For each $j=1,\ldots,N_i$, we have that $y_{i,j} - f_i(\mathbf{x}_{i,j}) \sim \mathcal{N}(0,\sigma^2)$ and that $y(\mathbf{x}_{i,j}) - f(\mathbf{x}_{i,j}) \sim \mathcal{N}(0,\sigma^2)$, which leads to the following:
\begin{equation*}
\begin{split}
\mathbb{P}\left(\left|\frac{y_{i,j} - f_i(\mathbf{x}_{i,j})}{\sigma}\right| > \sqrt{2\log\frac{8N_i}{\delta}}\right) = \mathbb{P}\left(\left|y_{i,j} - f_i(\mathbf{x}_{i,j}) \right| > \sqrt{2\sigma^2\log\frac{8N_i}{\delta}}\right) \leq \frac{\delta}{8N_i},\\
\mathbb{P}\left(\left|\frac{y(\mathbf{x}_{i,j}) - f(\mathbf{x}_{i,j})}{\sigma}\right| > \sqrt{2\log\frac{8N_i}{\delta}}\right) = \mathbb{P}\left(\left|y(\mathbf{x}_{i,j}) - f(\mathbf{x}_{i,j}) \right| > \sqrt{2\sigma^2\log\frac{8N_i}{\delta}}\right) \leq \frac{\delta}{8N_i}.
\end{split}
\end{equation*}
Taking a union bound over $j=1,\ldots,N_i$ for each of the two equations above, we have that for \emph{all} $j=1,\ldots,N_i$,
\begin{equation*}
\begin{split}
\left|y_{i,j} - f_i(\mathbf{x}_{i,j}) \right| \leq \sqrt{2\sigma^2\log\frac{8N_i}{\delta}}, \\
\left|y(\mathbf{x}_{i,j}) - f(\mathbf{x}_{i,j}) \right| \leq \sqrt{2\sigma^2\log\frac{8N_i}{\delta}},
\end{split}
\end{equation*}
both of which hold with probability $\geq 1 - \delta/8$.
Therefore, with probability $\geq 1 - \delta/8$,
\begin{equation}
\norm{\overline{\mathbf{y}}_{i} - \overline{\mathbf{f}}_{i}}_2 = \sqrt{\sum^{N_i}_{j=1} \left|y_{i,j} - f_i(\mathbf{x}_{i,j})\right|^2 } \leq \sqrt{\sum^{N_i}_{j=1} 2\sigma^2\log\frac{8N_i}{\delta}} \leq \sqrt{N_i}\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}.
\label{eq:tmp_1}
\end{equation}
Repeating the procedure above leads to
\begin{equation}
\norm{\widetilde{\mathbf{y}}_{i} - \widetilde{\mathbf{f}}_{i}}_2 \leq \sqrt{N_i}\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}
\label{eq:tmp_2}
\end{equation}
which also holds with probability $\geq 1 - \delta/8$.
Taking a union bound over equations~\eqref{eq:tmp_1} and~\eqref{eq:tmp_2} completes the proof.
\end{proof}

With these supporting lemmas, Lemma~\ref{ucb_diff} can be proved as follows:
\begin{align}
    \left|\overline{\zeta}_t(\mathbf{x})-\widetilde{\zeta}_t(\mathbf{x})\right| &= \left|\nu_t\left[\sum^M_{i=1}\omega_i[\overline{\mu}_{i}(\mathbf{x}) + \tau\overline{\sigma}_{i}(\mathbf{x})]\right] - \nu_t\left[\sum^M_{i=1}\omega_i[\widetilde{\mu}_{i}(\mathbf{x}) + \tau\widetilde{\sigma}_{i}(\mathbf{x})]\right]\right| \nonumber\\
    &\stackrel{\text{(a)}}{=}\left|\nu_t\sum^M_{i=1}\omega_i[\overline{\mu}_{i}(\mathbf{x})-\widetilde{\mu}_{i}(\mathbf{x})]\right| \nonumber\\
    &\leq \nu_t\sum^M_{i=1}\omega_i\left|\overline{\mu}_{i}(\mathbf{x})-\widetilde{\mu}_{i}(\mathbf{x})\right|\nonumber\\
    &\leq \nu_t\sum^M_{i=1}\omega_i\left|\mathbf{k}_{i}(\mathbf{x})^{\top} (\mathbf{K}_{i}+\sigma^2I)^{-1}(\overline{\mathbf{y}}_{i}-\widetilde{\mathbf{y}}_{i})\right| \nonumber\\
    &\stackrel{\text{(b)}}{\leq} \nu_t\sum^M_{i=1}\omega_i \norm{\mathbf{k}_{i}(\mathbf{x})}_2 \norm{(\mathbf{K}_{i}+\sigma^2I)^{-1}}_2 \norm{\overline{\mathbf{y}}_{i}-\widetilde{\mathbf{y}}_{i}}_2 \nonumber\\
    &\stackrel{\text{(c)}}{\leq} \nu_t\sum^M_{i=1}\omega_i \norm{\mathbf{k}_{i}(\mathbf{x})}_2 \frac{1}{\sigma^2} \norm{\overline{\mathbf{y}}_{i}-\widetilde{\mathbf{y}}_{i}}_2\nonumber\\
    &\stackrel{\text{(d)}}{\leq} \nu_t\sum^M_{i=1}\omega_i \sqrt{N_i} \frac{1}{\sigma^2} \norm{\overline{\mathbf{y}}_{i}-\widetilde{\mathbf{y}}_{i}}_2\nonumber\\
    &\leq \nu_t\sum^M_{i=1}\omega_i \frac{\sqrt{N_i}}{\sigma^2}\norm{\overline{\mathbf{y}}_{i} - \overline{\mathbf{f}}_{i} + \overline{\mathbf{f}}_{i} - \widetilde{\mathbf{f}}_{i} + \widetilde{\mathbf{f}}_{i} - \widetilde{\mathbf{y}}_{i}}_2\nonumber\\
    &\leq \nu_t\sum^M_{i=1}\omega_i \frac{\sqrt{N_i}}{\sigma^2}\left[\norm{\overline{\mathbf{y}}_{i} - \overline{\mathbf{f}}_{i}}_2+\norm{\overline{\mathbf{f}}_{i} - \widetilde{\mathbf{f}}_{i}}_2+\norm{\widetilde{\mathbf{f}}_{i} - \widetilde{\mathbf{y}}_{i}}_2\right]\nonumber\\
    &\stackrel{\text{(e)}}{\leq} \nu_t\sum^M_{i=1}\omega_i \frac{\sqrt{N_i}}{\sigma^2}\left(2\sqrt{N_i}\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+\norm{\overline{\mathbf{f}}_{i} - \widetilde{\mathbf{f}}_{i}}_2\right) \nonumber\\
    &= \nu_t\sum^M_{i=1}\omega_i \frac{\sqrt{N_i}}{\sigma^2}\left(2\sqrt{N_i}\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+\sqrt{\sum^{N_i}_{j=1}\left(f_i(\mathbf{x}_{i,j}) - f(\mathbf{x}_{i,j})\right)^2}\right) \nonumber\\
    &\stackrel{\text{(f)}}{\leq} \nu_t\sum^M_{i=1}\omega_i \frac{\sqrt{N_i}}{\sigma^2}\left(2\sqrt{N_i}\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+d_i\sqrt{N_i}\right) \nonumber\\
    &= \nu_t\sum^M_{i=1}\omega_i \frac{N_i}{\sigma^2}\left(2\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+d_i\right) \nonumber\\
    &\triangleq \nu_t \alpha
\label{ucb_diff_proof}
\end{align}
which holds with probability $\geq 1-\delta/4$. (a) holds because $\overline{\sigma}_i(\mathbf{x})=\widetilde{\sigma}_i(\mathbf{x})$ for all $\mathbf{x} \in \mathcal{D}$, 
because 
% we have assumed that $f$ and all $f_i$'s are sampled from the same GP with kernel $k$ and 
the posterior standard deviation only depends on the input locations and is independent of the corresponding output responses; 
(b) follows from Cauchy-Schwarz inequality,
(c) follows from Lemma~\ref{frob_norm}, 
(d) results from the assumption w.l.o.g.~that $k\left(\mathbf{x}, \mathbf{x}'\right) \leq 1$ for all $\mathbf{x}, \mathbf{x}' \in \mathcal{D}$,
(e) follows from Lemma~\ref{bound_with_obs_noise}, 
(f) is obtained from the definition of the function gap: $d_i\triangleq \max_{j=1,\ldots,N_i}\left|f(\mathbf{x}_{i,j})-f_i(\mathbf{x}_{i,j})\right|$ for $i=1,\ldots,M$.
This completes the proof of Lemma~\ref{ucb_diff}.

\subsection{Proof of Theorem~\ref{regret_bound}} 
\label{app:proof_theorem_1}
To begin with, we need the following lemma showing a high-probability upper bound on the global maximum of the target function.
\begin{lemma}
\label{bound_opt_func_val}
Given $\delta \in (0,1)$.
Let $\mathbf{x}^*$ denote a global maximizer of the target function $f$, and $\alpha$ be as defined in Lemma~\ref{ucb_diff}. 
Suppose the RM-GP-UCB algorithm is run with the parameter $\nu_t\in [0, 1]$ for all $t\geq 1$.
Then, with probability $\geq 1-3\delta/4$,
\[
f(\mathbf{x}^*) \leq \overline{\zeta}_t(\mathbf{x}_t)+\nu_t\alpha \qquad \forall t\geq 1.
\]
\end{lemma}
\begin{proof} 
Firstly, as a result of Lemma~\ref{gaussian_bound} and Lemma~\ref{lemma:confidence:bound:tau} (both hold with probability of $\geq 1 - \delta/4$), at any iteration $t\geq 1$ and for all $\mathbf{x} \in \mathcal{D}$, we have that with probability $\geq 1 - \delta/4 - \delta/4$, $\widetilde{\zeta}_t(\mathbf{x})$ is an upper bound on $f(\mathbf{x})$:
\begin{equation}
\begin{split}
    \widetilde{\zeta}_t&(\mathbf{x})-f(\mathbf{x})=\widetilde{\zeta}_t(\mathbf{x})-\left[\nu_t\sum^M_{i=1}\omega_i f(\mathbf{x}) + (1-\nu_t)f(\mathbf{x})\right]\\
    &=\nu_t\sum^M_{i=1}\omega_i\left[\widetilde{\mu}_{i}(\mathbf{x}) + \tau\widetilde{\sigma}_{i}(\mathbf{x})-f(\mathbf{x})\right]+ (1-\nu_t)\left[\mu_{t-1}(\mathbf{x})+\beta_t\sigma_{t-1}(\mathbf{x})-f(\mathbf{x})\right] \geq 0.
\end{split}
\label{upper_bound}
\end{equation}

Therefore, with probability $\geq 1-\delta/4 - \delta/4 - \delta/4$,
\begin{equation}
\begin{split}
    f(\mathbf{x}^*) \stackrel{\text{(a)}}{\leq} \widetilde{\zeta}_t(\mathbf{x}^*) \stackrel{\text{(b)}}{\leq} \overline{\zeta}_t(\mathbf{x}^*)+\nu_t\alpha \stackrel{\text{(c)}}{\leq} \overline{\zeta}_t(\mathbf{x}_t)+\nu_t\alpha
\end{split}
\label{eq:tmp}
\end{equation}
in which (a) results from~\eqref{upper_bound}, (b) is obtained via Lemma~\ref{ucb_diff} which holds with probability of $\geq 1-\delta/4$, and (c) follows from the policy for selecting $\mathbf{x}_t$, i.e., by maximizing~\eqref{acq_func}.
This completes the proof.
\end{proof}
Subsequently, we can show a high-probability upper bound on the instantaneous regret with the following lemma.
\begin{lemma}
\label{inst_regret_analysis}
Given $\delta \in (0,1)$. Let $\alpha$ be as defined in Lemma~\ref{ucb_diff}. Suppose the RM-GP-UCB algorithm is run with the parameters $\beta_t$, $\tau$ and $\nu_t$.
Then, with probability $\geq 1-3\delta/4$, $\forall t\geq 1$,
\[
r_t \leq 2\nu_t(\alpha+\tau)+2(1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t).
\]
\end{lemma}
\begin{proof}
The instantaneous regret can be upper-bounded by
\begin{equation}
\begin{split}
    r_t &= f(\mathbf{x}^*)-f(\mathbf{x}_t) \stackrel{\text{(a)}}{\leq} \overline{\zeta}_t(\mathbf{x}_t)+\nu_t\alpha- f(\mathbf{x}_t)\\ 
    &\leq \underline{\overline{\zeta}_t(\mathbf{x}_t) - \widetilde{\zeta}_t(\mathbf{x}_t)} + \widetilde{\zeta}_t(\mathbf{x}_t)-f(\mathbf{x}_t)+\nu_t\alpha \\
    &\stackrel{\text{(b)}}{\leq} \nu_t\alpha + \nu_t\sum^M_{i=1}\omega_i \left[\widetilde{\mu}_{i}(\mathbf{x}_t) +\tau\widetilde{\sigma}_{i}(\mathbf{x}_t)\right] +(1-\nu_t)\left[\mu_{t-1}(\mathbf{x}_t)+\beta_t\sigma_{t-1}(\mathbf{x}_t)\right] \\
    &\qquad - f(\mathbf{x}_t) + \nu_t\alpha\\
    &= \nu_t\alpha + \nu_t\sum^M_{i=1}\omega_i \left[\widetilde{\mu}_{i}(\mathbf{x}_t) +\tau\widetilde{\sigma}_{i}(\mathbf{x}_t)\right] +(1-\nu_t)\left[\mu_{t-1}(\mathbf{x}_t)+\beta_t\sigma_{t-1}(\mathbf{x}_t)\right] \\
    &\qquad - \left[\nu_t\sum^M_{i=1}\omega_i f(\mathbf{x}_t) + (1-\nu_t)f(\mathbf{x}_t) \right] + \nu_t\alpha\\
    &\leq \nu_t\alpha + \nu_t\sum^M_{i=1}\omega_i \left[\widetilde{\mu}_{i}(\mathbf{x}_t) -f(\mathbf{x}_t)\right]+
    \nu_t\sum^M_{i=1}\omega_i\tau\widetilde{\sigma}_{i}(\mathbf{x}_t) \\
    &\qquad +(1-\nu_t)\left[\mu_{t-1}(\mathbf{x}_t)-f(\mathbf{x}_t)\right] + (1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t) + \nu_t\alpha\\
    &\stackrel{\text{(c)}}{\leq} 2\nu_t\alpha + 2\nu_t\sum^M_{i=1}\omega_i\tau\widetilde{\sigma}_{i}(\mathbf{x}_t)+2(1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t)\\
    &\stackrel{\text{(d)}}{\leq} 2\nu_t\alpha + 2\nu_t\tau+2(1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t)\\
    &\leq 2\nu_t(\alpha+\tau)+2(1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t)
\end{split}
\label{eq:analyze_insta_regret}
\end{equation}
which holds with probability $\geq 1-3\delta/4$. (a) follows from Lemma~\ref{bound_opt_func_val} which holds with probability of $\geq 1-3\delta/4$, 
(b) results from Lemma~\ref{ucb_diff} as well as the definition of $\widetilde{\zeta}_t(\mathbf{x}_t)$~\eqref{acq_fake}, 
(c) is a result of Lemma~\ref{gaussian_bound} and Lemma~\ref{lemma:confidence:bound:tau}, 
and (d) follows because $\widetilde{\sigma}_{i}(\mathbf{x}_t) \leq 1$ for all $\mathbf{x}_t \in \mathcal{D}$, which can be easily verified using the formula of the GP posterior variance~\eqref{gp_posterior} and the assumption that $k(\mathbf{x},\mathbf{x}')\leq1$ for all $\mathbf{x},\mathbf{x}'\in \mathcal{D}$.
The error probabilities $3\delta/4=\delta/4+\delta/4+\delta/4$ result from Lemmas~\ref{gaussian_bound},~\ref{ucb_diff} and~\ref{lemma:confidence:bound:tau}.
\end{proof}

Next, we need to connect the second term from Lemma~\ref{inst_regret_analysis} with the information gain. The following lemma, which is Lemma 5.3 of~\citep{srinivas2009gaussian}, defines the information gain on the target function from any set of observations.
\begin{lemma}
\label{info_gain}
Let $\mathbf{f}_T$ and $\mathbf{y}_T$ denote the set of function values and noisy observations of the target function respectively after $T$ iterations. Then, the information gain about $f$ from the first $T$ observations can be expressed as
\[
I(\mathbf{y}_T;\mathbf{f}_T)=\frac{1}{2}\sum^T_{t=1}\log \left[1+\sigma^{-2}\sigma^2_{t-1}(\mathbf{x}_t)\right].
\]
\end{lemma}
Subsequently, we can upper bound the second term from Lemma~\ref{inst_regret_analysis} (summed from iterations 1 to $T$) by the maximum information gain via the following lemma.
\begin{lemma}
\label{upper_bound_by_info_gain}
Suppose the RM-GP-UCB algorithm is run with the parameters $\beta_t$ $\forall t\geq 1$ and a non-increasing sequence $\nu_t \in [0,1]$ $\forall t \geq 1$. 
Define the maximum information gain as $\gamma_T=\max_{A\subset \mathcal{D}, |A|=T}I(\mathbf{y}_A;\mathbf{f}_A)$ 
in which $\mathbf{f}_A$ and $\mathbf{y}_A$ represent the function values and noisy observations from a set $A$ of inputs of size $T$. Then,
\[
\sum^T_{t=1}\left[2(1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t)\right]^2 \leq (1-\nu_T)^2 C_1 \beta_T^2 \gamma_T
\]
in which $C_1\triangleq \frac{8}{\log(1+\sigma^{-2})}$.
\end{lemma}
\begin{proof}
Each term inside the summation can be upper-bounded by
\begin{equation}
\label{bound_each_regret_term}
\begin{split}
4(1-\nu_t)^2\beta_t^2\sigma^2_{t-1}(\mathbf{x}_t) &\stackrel{\text{(a)}}{\leq} 4(1-\nu_T)^2\beta_T^2 \sigma^2 \left(\sigma^{-2} \sigma^2_{t-1}(\mathbf{x}_t) \right) \\
&\stackrel{\text{(b)}}{\leq} 4(1-\nu_T)^2\beta_T^2 \sigma^2 \left(\frac{\sigma^{-2}}{\log(1+\sigma^{-2})} \log\left(1+\sigma^{-2}\sigma^2_{t-1}(\mathbf{x}_t)\right)\right)\\
&= (1-\nu_T)^2\beta_T^2 \frac{8}{\log(1+\sigma^{-2})} \left[\frac{1}{2} \log\left(1+\sigma^{-2}\sigma^2_{t-1}(\mathbf{x}_t)\right)\right]
\end{split}
\end{equation}
in which (a) follows since $\beta_t$ is non-decreasing in $t$ and $\nu_t$ is non-increasing in $t$, 
(b) follows since $\sigma^{-2} x \leq \frac{\sigma^{-2}}{\log(1+\sigma^{-2})} \log(1+\sigma^{-2}x)$ for all $x\in (0,1]$ and $\sigma^2_{t-1}(\mathbf{x}_t)\in (0, 1]$.

As a result, the summation can be decomposed as
\begin{equation*}
\begin{split}
\sum^T_{t=1}\left[2(1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t)\right]^2 &\stackrel{\text{(a)}}{\leq} (1-\nu_T)^2\beta_T^2 \frac{8}{\log(1+\sigma^{-2})} \sum^T_{t=1} \left[\frac{1}{2} \log\left(1+\sigma^{-2}\sigma^2_{t-1}(\mathbf{x}_t)\right)\right]\\
&\stackrel{\text{(b)}}{=}(1-\nu_T)^2\beta_T^2 \frac{8}{\log(1+\sigma^{-2})} I(\mathbf{y}_T;\mathbf{f}_T)\\
&\stackrel{\text{(c)}}{\leq} (1-\nu_T)^2 C_1 \beta_T^2 \gamma_T
\end{split}
\end{equation*}
in which (a) results from~\eqref{bound_each_regret_term}, (b) follows from Lemma~\ref{info_gain}, 
and (c) is obtained by making use of the definition of $C_1$ and $\gamma_T$.
\end{proof}
Finally, an upper bound on the cumulative regret follows from combining these supporting lemmas:
\begin{equation}
\begin{split}
    R_T&=\sum^T_{t=1}r_t \stackrel{\text{(a)}}{\leq} \sum^T_{t=1}\left[ 2\nu_t(\alpha+\tau)+2\left(1-\nu_t\right)\beta_t\sigma_{t-1}(\mathbf{x}_t)\right]\\
    &= 2(\alpha+\tau) \sum^T_{t=1} \nu_t + \sum^T_{t=1}2 (1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t)\\
    &\stackrel{\text{(b)}}{\leq} 2(\alpha+\tau) \sum^T_{t=1} \nu_t + \sqrt{T} \sqrt{\sum^T_{t=1}\left[2 (1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t)\right]^2}\\
    &\stackrel{\text{(c)}}{\leq} 2(\alpha+\tau) \sum^T_{t=1} \nu_t + \sqrt{C_1 T (1-\nu_T)^2\beta_T^2\gamma_T}\\
    &\stackrel{\text{(d)}}{\leq} 2(\alpha+\tau) \sum^T_{t=1} \nu_t + \beta_T\sqrt{C_1 T \gamma_T}
\end{split}
\label{cum_reg}
\end{equation}
which holds with probability $\geq 1 - 3\delta/4$. (a) is a result of Lemma~\ref{inst_regret_analysis}, (b) follows from the Cauchy--Schwarz inequality, (c) is obtained using Lemma~\ref{upper_bound_by_info_gain}, and (d) follows since $1-\nu_T \leq 1$. This completes the proof.

% Lastly, an asymptotic expression for the upper bound on $R_T$ given above can be derived:
% \[
% R_T = \mathcal{O}\Big( \Big(\sum^M_{i=1}\omega_id_i\Big) \sum^T_{t=1} \nu_t + \gamma_T\sqrt{T} \Big).
% \]
% Moreover, i
If the meta-weights $\omega_i$'s are allowed to change with $t$ (i.e., when our online meta-weight optimization is used), then the proof here only needs to be modified to let $\alpha$ depend on $t$: 
$R_T \leq 2\tau \sum^T_{t=1} \nu_t + 2\sum^T_{t=1} \nu_t \alpha_t + \beta_T\sqrt{C_1 T \gamma_T}$.
In this case, the no-regret convergence guarantee of RM-GP-UCB (Sec.~\ref{subsec:theory:rm_gp_ucb}) is still preserved since in this case, we can simply upper-bound every $\omega_{i,t}$ by $1$. That is
$R_T \leq 2(\alpha'+\tau) \sum^T_{t=1} \nu_t + \beta_T\sqrt{C_1 T \gamma_T}$,
with $\alpha' \triangleq \sum^M_{i=1} \frac{N_i}{\sigma^2}(2\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+d_i)$.
% \[
% R_T = \mathcal{O}\Big( \sum^T_{t=1} \nu_t \Big(\sum^M_{i=1}\omega_{i,t}d_i\Big) + \gamma_T\sqrt{T} \Big) = \mathcal{O}\Big( \Big(\sum^M_{i=1}d_i\Big) \sum^T_{t=1} \nu_t + \gamma_T\sqrt{T} \Big).
% \]

\subsection{Meta-tasks Can Improve the Convergence by Accelerating Exploration}
\label{app:improved_bound}
Here, we utilize the analysis in Appendix~\ref{app:proof_theorem_1} to illustrate how the meta-tasks (if similar to the target task) can help RM-GP-UCB obtain a better regret bound than standard GP-UCB in the early stage of the algorithm. For simplicity, we focus on the most favorable scenario where all meta-functions have equal values to the target function at their corresponding input locations, i.e., all function gaps are $0$: 
% $\max_{\mathbf{x} \in \mathcal{D}}\left|f(\mathbf{x})-f_i(\mathbf{x})\right| = 0, \forall i=1,\ldots,M$.
$d_i = \max_{j=1,\ldots,N_i}\left|f(\mathbf{x}_{i,j})-f_i(\mathbf{x}_{i,j})\right|=0, \forall i=1,\ldots,M$.
Although not realistic, this scenario is useful for illustrating how the meta-tasks help our RM-GP-UCB algorithm achieve a better convergence at the initial stage.

In this case, according to the definition of $\widetilde{\zeta}_t$~\eqref{acq_fake} and $\overline{\zeta}_t$~\eqref{acq_func}, we have that $\widetilde{\zeta}_t(\mathbf{x})=\overline{\zeta}_t(\mathbf{x}), \forall \mathbf{x}\in\mathcal{D}, t\geq1$. As a result, the analysis of~\eqref{eq:tmp} in the proof of Lemma~\ref{bound_opt_func_val} can be similarly applied, yielding:
\begin{equation}
f(\mathbf{x}^*) \leq \widetilde{\zeta}_t(\mathbf{x}^*)=\overline{\zeta}_t(\mathbf{x}^*) \leq \overline{\zeta}_t(\mathbf{x}_t).
\end{equation}
%Recall we have proved in~\eqref{eq:tmp} that $f(\mathbf{x}^*) \leq \widetilde{\zeta}_t(\mathbf{x}^*)=\overline{\zeta}_t(\mathbf{x}^*)$. 
Next, we can re-analyze the instantaneous regret following similar steps to~\eqref{eq:analyze_insta_regret}:
\begin{equation}
\begin{split}
r_t &= f(\mathbf{x}^*) - f(\mathbf{x}_t) \leq \overline{\zeta}_t(\mathbf{x}_t) - f(\mathbf{x}_t)\\
&\leq 2\nu_t\sum^M_{i=1}\omega_i\tau\overline{\sigma}_{i}(\mathbf{x}_t)+2(1-\nu_t)\beta_t\sigma_{t-1}(\mathbf{x}_t)\\
&=\underbrace{2\nu_t\left( \sum^M_{i=1}\omega_i\tau\overline{\sigma}_{i}(\mathbf{x}_t) - \beta_t\sigma_{t-1}(\mathbf{x}_t) \right)}_{A_1} + \underbrace{2\beta_t\sigma_{t-1}(\mathbf{x}_t)}_{A_2},
\end{split}
\label{eq:better_bound}
\end{equation}
in which some intermediate steps that are identical to those used in~\eqref{eq:analyze_insta_regret} have been omitted for simplicity.
Note that term $A_2$ in~\eqref{eq:better_bound} is identical to the upper bound on the instantaneous regret for the standard GP-UCB algorithm~\citep{srinivas2009gaussian}. Therefore, the meta-tasks affect the upper bound on the instantaneous regret through the term $A_1$.
%, such that the meta-tasks improve the regret upper bound if $A_1 < 0$.

Recall Theorem~\ref{regret_bound} has told us that we should choose $\nu_t \rightarrow 0$ as $t \rightarrow \infty$. In the initial stage of the algorithm when $\nu_t$ is large, the impact of $A_1$ on the regret of the algorithm is large. In this case, the meta-tasks improve the upper bound on the instantaneous regret (compared with standard GP-UCB) if $A_1 < 0$, that is:
\begin{equation}
\sum^M_{i=1}\omega_i \overline{\sigma}_{i}(\mathbf{x}_t) < \frac{\beta_t}{\tau}\sigma_{t-1}(\mathbf{x}_t).
\label{eq:better_bound_2}
\end{equation}
In other words, RM-GP-UCB converges faster than standard GP-UCB in the initial stage if the (weighted combination of) meta-tasks have smaller uncertainty (i.e., posterior standard deviation) at $\mathbf{x}_t$ compared with the target task (scaled by $\beta_t/\tau$). 
Fortunately, in the early stage of the algorithm, this condition is highly likely to be satisfied: 
%because the number of observations collected by the target task is small, hence leading to large uncertainty of the target task, i.e., large target posterior standard deviation $\sigma_{t-1}(\mathbf{x}_t)$. 
When the number of observations of the target task is small, the posterior standard deviation of the target GP posterior (i.e., RHS of Equation~\eqref{eq:better_bound_2}) is usually large; therefore, Equation~\eqref{eq:better_bound_2} is highly likely to be satisfied.
%Therefore, this implies that at the initial stage, the meta-tasks help improve the convergence of standard GP-UCB by \emph{reducing the uncertainty and hence accelerating the exploration of the target task}.
This insight turns out to have an intuitive and elegant interpretation as well.
In the initial stage of the standard GP-UCB algorithm, due to the lack of observations, the algorithm \emph{has large uncertainty} regarding the objective function and hence tends to \emph{explore}; however, the meta-tasks (assuming that they are similar to the target task) provide additional information for the algorithm, which \emph{reduces the uncertainty} about the objective function and hence \emph{decreases the requirement for initial exploration}. To summarize, in the initial stage, the meta-tasks, if similar to the target task, help RM-GP-UCB achieve a smaller regret upper bound (hence converge faster) than GP-UCB by reducing the degree of exploration.
In less favorable scenarios where the function gaps are nonzero (i.e., the meta-functions are not exactly equal to the target function), some amount of error will be introduced to the upper bound on the instantaneous regret~\eqref{eq:better_bound}. As a result, a positive error term will be added to the LHS of~\eqref{eq:better_bound_2}, making the theoretical condition for a faster convergence~\eqref{eq:better_bound_2} harder to satisfy.
At later stages where $\nu_t$ is already small and close to $0$, the impact of the term $A_1$ is significantly diminished, thus allowing our RM-GP-UCB algorithm to converge to no regret at a similar rate to standard GP-UCB.

\section{Proof of Theorem~\ref{regret_bound_ts}}
\label{app:proof:theorem:ts}
Our theoretical analysis of RM-GP-TS shares similarity with the works of~\citep{dai2020federated,dai2021differentially} but has important differences, e.g., unlike the works of~\citep{dai2020federated,dai2021differentially}, RM-GP-TS does not suffer from the error introduced by random Fourier features approximation since we do not need to consider the issues of communication efficiency and retaining (hence not transmitting) the raw data.

%\begin{equation}
%\overline{\psi}_t(\mathbf{x})\triangleq
%\begin{cases}
%{\sum}^M_{i=1}\omega_i \left[\overline{f}^t_{i}(\mathbf{x})\right] & \text{with probability } \nu_t \ ,\\
%f^t(\mathbf{x}) & \text{with probability } 1-\nu_t.
%\end{cases}
%\end{equation}
Based on the acquisition function $\overline{\zeta}_t$ for RM-GP-TS~\eqref{eq:acq_func_ts} (we have again removed the superscript for simplicity), define $\mathcal{E}^1_t$ as the event that $\overline{\zeta}_t(\mathbf{x}) = f^t(\mathbf{x})$ which happens with probability $1-\nu_t$, and define $\mathcal{E}^2_t$ as the event that $\overline{\zeta}_t(\mathbf{x}) = {\sum}^M_{i=1}\omega_i \left[\overline{f}^t_{i}(\mathbf{x})\right]$ which happens with probability $\nu_t$.
Define $\mathcal{F}_{t-1}$ as the filtration containing the history of input-output pairs of the target task up to and including iteration $t-1$.

\begin{lemma}
\label{confidence_bound:ts}
%Let $\delta \in (0,1)$ and $\beta_t'=2\log(\frac{|\mathcal{D}|t^2 5\pi^2}{3\delta})$, then
%\begin{equation*}
%|f(\mathbf{x})-\mu_{t-1}(\mathbf{x})| \leq \sqrt{\beta_t'} \sigma_{t-1}(\mathbf{x}) \qquad \forall \mathbf{x}\in \mathcal{D},\, t\geq1
%\end{equation*}
%which holds with probability $\geq1-\delta/10$. 
%Also, 
%Let $\tau'=2\log (8|\mathcal{D}|M/\delta)$, we get
With $\tau$ defined in Lemma~\ref{lemma:confidence:bound:tau}, we have that
\begin{equation*}
|f_i(\mathbf{x})-\overline{\mu}_{i}(\mathbf{x})| \leq \tau\overline{\sigma}_{i}(\mathbf{x}) \qquad \forall \mathbf{x}\in \mathcal{D},\, i=1,...,M
\end{equation*}
which holds with probability $\geq1-\delta/4$.
\end{lemma}
Similar to Lemma~\ref{gaussian_bound} and Lemma~\ref{lemma:confidence:bound:tau}, Lemma~\ref{confidence_bound:ts} also follows from Theorem 2 of~\citep{chowdhury2017kernelized}.
Next, we also need the following lemma showing the concentration of functions sampled from the GP posterior around the posterior mean, for both the target function and the meta-functions.
\begin{lemma}
\label{lemma:concentration:sampled:function:around:mean}
With $\beta_t$ defined in Lemma~\ref{gaussian_bound} and $\tau$ defined in Lemma~\ref{lemma:confidence:bound:tau}, we have that
\[
|f^t(\mathbf{x}) - \mu_{t-1}(\mathbf{x})| \leq \beta_t \sqrt{2\log(\frac{|\mathcal{D}|t^2 2\pi^2}{\delta})} \sigma_{t-1}(\mathbf{x}), \qquad \forall \mathbf{x}\in \mathcal{D},\, t\geq1,
\]
which holds with probability $\geq 1 - \delta/12$, and that
\[
|f_i^t(\mathbf{x}) - \overline{\mu}_{i}(\mathbf{x})| \leq \tau \sqrt{2\log( \frac{M |\mathcal{D}|t^2 2\pi^2}{\delta})} \overline{\sigma}_{i}(\mathbf{x}), \qquad \forall \mathbf{x}\in \mathcal{D},\, t\geq 1,\, i=1,...,M
\]
which holds with probability $\geq 1 - \delta/12$.
\end{lemma}
The proof of Lemma~\ref{lemma:concentration:sampled:function:around:mean} follows straightforwardly from Lemma 5 of~\citep{chowdhury2017kernelized}, together with a union bound over all $\mathbf{x}\in\mathcal{D}$ and over all $t\geq 1$, as well as an additional union bound over all $M$ meta-tasks for the second inequality.


\begin{lemma}
\label{lemma:bound:ft:weighted:fti}
Define $d_i' \triangleq \max_{\mathbf{x}\in\mathcal{D}}| f(\mathbf{x}) - f_i(\mathbf{x}) |$.
Define $c_t \triangleq \beta_t \left(1 + \sqrt{2\log(\frac{|\mathcal{D}|t^2 2\pi^2}{\delta})}\right)$, and $c_t' \triangleq \tau \left(1 + \sqrt{2\log( \frac{M |\mathcal{D}|t^2 2\pi^2}{\delta})}\right)$.
With probability $\geq 1 - \delta/4 - \delta/4 - \delta/12 - \delta/12 = 1 - 2\delta/3$, we have that
\[
|f^t(\mathbf{x}) - \sum^M_{i=1} \omega_i f_i^t(\mathbf{x})| \leq c_t  + c_t' + \sum^M_{i=1}\omega_i d_i', \qquad\forall \mathbf{x}\in\mathcal{D}, t\geq 1.
\]
\end{lemma}
\begin{proof}
Firstly, we can bound the difference between the target function and a sampled function from its GP posterior.
\begin{equation}
\begin{split}
|f^t(\mathbf{x}) - f(\mathbf{x})| &\leq |f^t(\mathbf{x}) - \mu_{t-1}(\mathbf{x})| + |\mu_{t-1}(\mathbf{x}) - f(\mathbf{x})|\\
&\stackrel{(a)}{\leq} \beta_t \sqrt{2\log(\frac{|\mathcal{D}|t^2 2\pi^2}{\delta})} \sigma_{t-1}(\mathbf{x}) + \beta_t \sigma_{t-1}(\mathbf{x})\\
&=c_t \sigma_{t-1}(\mathbf{x}),
\end{split}
\label{eq:bound:f:ft}
\end{equation}
where $(a)$ results from Lemma~\ref{lemma:concentration:sampled:function:around:mean} and Lemma~\ref{gaussian_bound}, and hence holds with probability of $\geq 1 - \delta/12 - \delta/4$. Next, we do the same for all meta-functions $i=1,\ldots,M$.
\begin{equation}
\begin{split}
|f^t_i(\mathbf{x}) - f_i(\mathbf{x})| &\leq |f^t_i(\mathbf{x}) - \overline{\mu}_{i}(\mathbf{x})| + |\overline{\mu}_{i}(\mathbf{x}) - f_i(\mathbf{x})|\\
&\stackrel{(a)}{\leq} \tau\overline{\sigma}_{i}(\mathbf{x}) + \tau \sqrt{2\log( \frac{M |\mathcal{D}|t^2 2\pi^2}{\delta})} \overline{\sigma}_{i}(\mathbf{x})\\
&= c_t' \overline{\sigma}_{i}(\mathbf{x}),
\end{split}
\end{equation}
where $(a)$ results from Lemma~\ref{lemma:concentration:sampled:function:around:mean} and Lemma~\ref{confidence_bound:ts}, and hence also holds with probability of $\geq 1 - \delta/12 - \delta/4$.
Therefore, combining the above two inequalities gives us:
\begin{equation}
\begin{split}
|f^t(\mathbf{x}) - f_i^t(\mathbf{x})| &\leq |f^t(\mathbf{x}) - f(\mathbf{x})| + |f(\mathbf{x}) - f_i(\mathbf{x})| + |f_i(\mathbf{x}) - f^t_i(\mathbf{x})|\\
&\leq c_t \sigma_{t-1}(\mathbf{x}) + c_t' \overline{\sigma}_{i}(\mathbf{x}) + d_i'\\
&\leq c_t \sigma_{t-1}(\mathbf{x}) + c_t' + d_i',
\end{split}
\end{equation}
in which the last inequality follows since $\overline{\sigma}_{i}(\mathbf{x})\leq 1$.
Finally, the lemma can be proved as:
\begin{equation}
\begin{split}
|f^t(\mathbf{x}) - \sum^M_{i=1} \omega_i f_i^t(\mathbf{x})| &\leq \sum^M_{i=1} \omega_i |f^t(\mathbf{x}) - f_i^t(\mathbf{x})|\\
&\leq \sum^M_{i=1}\omega_i \left( c_t \sigma_{t-1}(\mathbf{x}) + c_t' + d_i' \right)\\
&\leq c_t + c_t' + \sum^M_{i=1}\omega_i d_i'.
\end{split}
\end{equation}
\end{proof}

Next, we define the set of ``saturated points'' in an iteration $t$, which are those inputs which incur large regrets in iteration $t$.
\begin{definition}
\label{def:saturated:point}
At iteration $t$, define the set of saturated points as
\[
S_t \triangleq \{ \mathbf{x}\in\mathcal{D} | \Delta(\mathbf{x}) > c_t \sigma_{t-1}(\mathbf{x}) \},
\]
where $\Delta(\mathbf{x}) \triangleq f(\mathbf{x}^*) - f(\mathbf{x})$.
\end{definition}

The next lemma will be useful in proving that the input we query in iteration $t$ is unsaturated (i.e., in proving Lemma~\ref{lemma:x_t:unsaturated}), and its proof makes use of the Gaussian anti-concentration inequality.
\begin{lemma}
\label{lemma:use:gaussian:anti}
With probability of $\geq 1 - \delta/4$,
\[
\mathbb{P}\left( f^t(\mathbf{x}) > f(\mathbf{x}) | \mathcal{F}_{t-1}, \mathcal{E}^1_t\right) \geq p,\qquad \forall t\geq 1.
\]
where $p\triangleq \frac{e^{-1}}{4\sqrt{\pi}}$.
\end{lemma}
\begin{proof}
Define $\theta_t \triangleq \frac{|f(\mathbf{x}) - \mu_{t-1}(\mathbf{x}) |}{\beta_t \sigma_{t-1}(\mathbf{x})}$. 
% Note that $\theta_t \leq 1$ with probability $\geq 1 - \delta/4$.
\begin{equation}
\begin{split}
\mathbb{P}\left( f^t(\mathbf{x}) > f(\mathbf{x}) | \mathcal{F}_{t-1}, \mathcal{E}^1_t\right) &= \mathbb{P}\left( \frac{f^t(\mathbf{x}) - \mu_{t-1}(\mathbf{x})}{\beta_t \sigma_{t-1}(\mathbf{x})} > \frac{f(\mathbf{x}) - \mu_{t-1}(\mathbf{x})}{\beta_t \sigma_{t-1}(\mathbf{x})} | \mathcal{F}_{t-1}, \mathcal{E}^1_t\right)\\
&\geq \mathbb{P}\left( \frac{f^t(\mathbf{x}) - \mu_{t-1}(\mathbf{x})}{\beta_t \sigma_{t-1}(\mathbf{x})} > \frac{|f(\mathbf{x}) - \mu_{t-1}(\mathbf{x}) |}{\beta_t \sigma_{t-1}(\mathbf{x})} | \mathcal{F}_{t-1}, \mathcal{E}^1_t\right)\\
&= \mathbb{P}\left( \frac{f^t(\mathbf{x}) - \mu_{t-1}(\mathbf{x})}{\beta_t \sigma_{t-1}(\mathbf{x})} > \theta_t | \mathcal{F}_{t-1}, \mathcal{E}^1_t\right)\\
&\stackrel{(a)}{\geq} \frac{e^{-\theta_t^2}}{4\sqrt{\pi} \theta_t} \stackrel{(b)}{\geq} \frac{e^{-1}}{4\sqrt{\pi}}.
\end{split}
\end{equation}
Note that due to the way in which the function $f^t$ is sampled from the GP posterior, i.e., $f^t \sim \mathcal{GP}\left(\mu_{t-1}(\cdot), \beta_t^2 \sigma_{t-1}^2(\cdot)\right)$ (Sec.~\ref{sec:om_gp_ucb}), we have that $\frac{f^t(\mathbf{x}) - \mu_{t-1}(\mathbf{x})}{\beta_t \sigma_{t-1}(\mathbf{x})}$ follows a standard Gaussian distribution.
Therefore, step $(a)$ above results from the Gaussian anti-concentration inequality: denote by $Z$ the standard Gaussian distribution $\mathcal{N}(0, 1)$, then $\mathbb{P}(Z > \theta_t) \geq \frac{e^{-\theta_t^2}}{4\sqrt{\pi}\theta_t}$.
Step $(b)$ follows from Lemma~\ref{gaussian_bound} (i.e., $\theta_t \leq 1$) and hence holds with probability of $\geq 1 - \delta/4$.
\end{proof}

The next lemma shows that in every iteration, the probability that we choose an unsaturated input is lower-bounded.
\begin{lemma}
\label{lemma:x_t:unsaturated}
With probability of $\geq 1 - \delta/4 - \delta/12=1-\delta/3$,
\[
\mathbb{P}\left( \mathbf{x}_t \in \mathcal{D}\setminus S_t | \mathcal{F}_{t-1} \right) \geq (1-\nu_t)p, \qquad \forall t\geq 1.
\]
\end{lemma}
\begin{proof}
Firstly, we have that
\begin{equation}
\begin{split}
\mathbb{P}\left( \mathbf{x}_t \in \mathcal{D} \setminus S_t | \mathcal{F}_{t-1} \right) \geq \mathbb{P}\left( \mathbf{x}_t \in \mathcal{D} \setminus S_t | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right) \mathbb{P}\left(\mathcal{E}^1_t\right) = \mathbb{P}\left( \mathbf{x}_t \in \mathcal{D} \setminus S_t | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right) (1-\nu_t).
\end{split}
\end{equation}
Next, we attempt to lower-bound the term $\mathbb{P}\left( \mathbf{x}_t \in \mathcal{D} \setminus S_t | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right)$.
\begin{equation}
\begin{split}
\mathbb{P}\left( \mathbf{x}_t \in \mathcal{D} \setminus S_t | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right) \geq \mathbb{P}\left( f^t(\mathbf{x}^*) > f^t(\mathbf{x}),\forall \mathbf{x}\in S_t | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right).
\end{split}
\end{equation}
The above inequality follows because $\mathbf{x}^*$ is always unsaturated: $\Delta(\mathbf{x}^*) = f(\mathbf{x}^*)-f(\mathbf{x}^*)=0\leq c_t\sigma_{t-1}(\mathbf{x}^*)$.
As a result, if the event on the RHS of the above inequality holds (i.e., an unsaturated input has larger value of $f^t$ than all saturated inputs), then the event on the LHS (i.e., $\mathbf{x}_t$ is unsaturated) also holds.
Next, we also have that $\forall \mathbf{x} \in S_t$,
\begin{equation}
f^t(\mathbf{x}) \leq f(\mathbf{x}) + c_t \sigma_{t-1}(\mathbf{x}) \leq f(\mathbf{x}) + \Delta(\mathbf{x}) = f(\mathbf{x}) + f(\mathbf{x}^*) - f(\mathbf{x}) = f(\mathbf{x}^*),
\end{equation}
in which the first inequality follows from~\eqref{eq:bound:f:ft} and hence holds with probability $\geq 1-\delta/12-\delta/4$, the second inequality is a result of the definition of saturated inputs (Definition~\ref{def:saturated:point}).
The above inequality implies that
\begin{equation}
\mathbb{P}\left( f^t(\mathbf{x}^*) > f^t(\mathbf{x}),\forall \mathbf{x}\in S_t | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right) \geq \mathbb{P}\left( f^t(\mathbf{x}^*) > f(\mathbf{x}^*) | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right).
\end{equation}

Lastly, combining the above inequalities gives us
\begin{equation}
\begin{split}
\mathbb{P}\left( \mathbf{x}_t \in \mathcal{D} \setminus S_t | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right) \geq \mathbb{P}\left( f^t(\mathbf{x}^*) > f(\mathbf{x}^*) | \mathcal{F}_{t-1}, \mathcal{E}^1_t \right) \geq p,
\end{split}
\end{equation}
where the last inequality follows from Lemma~\ref{lemma:use:gaussian:anti}.
This completes the proof.
Note that the error probabilities for this lemma come from Lemma~\ref{lemma:concentration:sampled:function:around:mean} ($\delta/12$) and Lemma~\ref{gaussian_bound} ($\delta/4$).
% The error probabilities come from the event of bounding $|f(\mathbf{x}) - \mu_{t-1}(\mathbf{x})|$ ($\delta/4$) and $|\mu_{t-1}(\mathbf{x}) - f^t(\mathbf{x})|$ ($\delta/12$).
\end{proof}

Next, we prove an upper bound on the expected instantaneous regret $r_t=f(\mathbf{x}^*)-f(\mathbf{x}_t)$.
\begin{lemma}
\label{lemma:upper:bound:expected:inst:regret}
With probability of $\geq 1 - \delta/4 - \delta/4 - \delta/12 - \delta/12 = 1-2\delta/3$,
\[
\mathbb{E}[r_t | \mathcal{F}_{t-1}] \leq c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) \mathbb{E} [\sigma_{t-1}(\mathbf{x}_t) | \mathcal{F}_{t-1}] + \psi_t,
\]
where $\psi_t \triangleq 2\nu_t \left( c_t + c_t' + \sum^M_{i=1}\omega_i d_i' \right)$.
\end{lemma}
\begin{proof}
To begin with, define the unsaturated input with the smallest posterior standard deviation as
\begin{equation}
\overline{\mathbf{x}}_t \triangleq {\arg\min}_{\mathbf{x}\in \mathcal{D} \setminus S_t} \sigma_{t-1}(\mathbf{x}).
\end{equation}
This allows us to obtain the following:
\begin{equation}
\begin{split}
\mathbb{E}[\sigma_{t-1}(\mathbf{x}_t) | \mathcal{F}_{t-1}] \geq \mathbb{E}[\sigma_{t-1}(\mathbf{x}_t) | \mathcal{F}_{t-1}, \mathbf{x}_t\in\mathcal{D}\setminus S_t] \mathbb{P}\left(\mathbf{x}_t\in\mathcal{D}\setminus S_t\right) \stackrel{(a)}{\geq} \sigma_{t-1}(\overline{\mathbf{x}}_t)(1-\nu_t)p,
\end{split}
\label{eq:ues:xt:bar}
\end{equation}
where $(a)$ results from Lemma~\ref{lemma:x_t:unsaturated} and hence holds with probability $\geq 1-\delta/12-\delta/4$ (the error probabilities come from Lemma~\ref{lemma:concentration:sampled:function:around:mean} and Lemma~\ref{gaussian_bound}).
Subsequently, the instantaneous regret can be upper-bounded as
\begin{equation}
\begin{split}
r_t &= \Delta(\mathbf{x}_t) = f(\mathbf{x}^*) - f(\overline{\mathbf{x}}_t) + f(\overline{\mathbf{x}}_t) - f(\mathbf{x}_t)\\
&\stackrel{(a)}{\leq} \Delta(\overline{\mathbf{x}}_t) + f^t(\overline{\mathbf{x}}_t) + c_t \sigma_{t-1}(\overline{\mathbf{x}}_t) - f^t(\mathbf{x}_t) + c_t \sigma_{t-1}(\mathbf{x}_t)\\
&\stackrel{(b)}{\leq} c_t \sigma_{t-1}(\overline{\mathbf{x}}_t) + c_t \sigma_{t-1}(\overline{\mathbf{x}}_t) + c_t \sigma_{t-1}(\mathbf{x}_t) + f^t(\overline{\mathbf{x}}_t) - f^t(\mathbf{x}_t)\\
&= c_t (2\sigma_{t-1}(\overline{\mathbf{x}}_t) + \sigma_{t-1}(\mathbf{x}_t)) + f^t(\overline{\mathbf{x}}_t) - f^t(\mathbf{x}_t),
\end{split}
\end{equation}
in which $(a)$ follows from~\eqref{eq:bound:f:ft}, and $(b)$ results from the definition of saturated input (Definition~\ref{def:saturated:point}) and that $\overline{\mathbf{x}}_t$ is unsaturated.
Next, we attempt to upper-bound the expected value of the term $f^t(\overline{\mathbf{x}}_t) - f^t(\mathbf{x}_t)$ from the equation above:
\begin{equation}
\begin{split}
\mathbb{E}&\left[f^t(\overline{\mathbf{x}}_t) - f^t(\mathbf{x}_t) | \mathcal{F}_{t-1}\right] \\
&=\mathbb{P}(\mathcal{E}^1_t) \mathbb{E}\left[f^t(\overline{\mathbf{x}}_t) - f^t(\mathbf{x}_t) | \mathcal{F}_{t-1}, \mathcal{E}^1_t\right] + \mathbb{P}(\mathcal{E}^2_t) \mathbb{E}\left[f^t(\overline{\mathbf{x}}_t) - f^t(\mathbf{x}_t) | \mathcal{F}_{t-1}, \mathcal{E}^2_t\right]\\
&\stackrel{(a)}{\leq} \nu_t \mathbb{E}\left[f^t(\overline{\mathbf{x}}_t) - f^t(\mathbf{x}_t) | \mathcal{F}_{t-1}, \mathcal{E}^2_t\right]\\
&\stackrel{(b)}{\leq} \nu_t \mathbb{E}\Big[\sum^M_{i=1}\omega_i f^t_i(\overline{\mathbf{x}}_t) + c_t + c_t' + \sum^M_{i=1}\omega_i d_i' +  c_t + c_t' + \sum^M_{i=1}\omega_i d_i' - \sum^M_{i=1}\omega_i f^t_i(\mathbf{x}_t) | \mathcal{F}_{t-1}, \mathcal{E}^2_t\Big]\\
&\stackrel{(c)}{\leq} 2\nu_t \left( c_t + c_t' + \sum^M_{i=1}\omega_i d_i' \right) \triangleq \psi_t.
\end{split}
\end{equation}
Step $(a)$ follows since conditioned on the event $\mathcal{E}^1_t$ ($\overline{\zeta}_t(\mathbf{x}) = f^t(\mathbf{x})$), we have that $f^t(\mathbf{x}) \leq f^t(\mathbf{x}_t),\forall \mathbf{x}\in\mathcal{D}$; step $(b)$ results from Lemma~\ref{lemma:bound:ft:weighted:fti}; step $(c)$ follows since conditioned on the event $\mathcal{E}^2_t$ (i.e., $\overline{\zeta}_t(\mathbf{x}) = {\sum}^M_{i=1}\omega_i \left[\overline{f}^t_{i}(\mathbf{x})\right]$), we have that ${\sum}^M_{i=1}\omega_i \left[\overline{f}^t_{i}(\mathbf{x})\right] \leq {\sum}^M_{i=1}\omega_i \left[\overline{f}^t_{i}(\mathbf{x}_t)\right],\forall\mathbf{x}\in\mathcal{D}$.
Lastly,
\begin{equation}
\begin{split}
\mathbb{E}[r_t | \mathcal{F}_{t-1}] &\leq \mathbb{E}[ c_t (2\sigma_{t-1}(\overline{\mathbf{x}}_t) + \sigma_{t-1}(\mathbf{x}_t)) + \psi_t | \mathcal{F}_{t-1}]\\
&\leq \mathbb{E}\left[ c_t \left(\frac{2}{(1-\nu_t)p} \sigma_{t-1}(\mathbf{x}_t) + \sigma_{t-1}(\mathbf{x}_t) \right) + \psi_t |\mathcal{F}_{t-1} \right]\\
&\leq c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) \mathbb{E} [\sigma_{t-1}(\mathbf{x}_t) | \mathcal{F}_{t-1}] + \psi_t,
\end{split}
\end{equation}
in which the second inequality results from~\eqref{eq:ues:xt:bar}.
Note that the error probabilities for this Lemma follow from Lemma~\ref{lemma:bound:ft:weighted:fti}.
\end{proof}

Subsequently, we make use of martingale concentration inequalities to bound the cumulative regret.
\begin{definition}
Define $Y_0 = 0$, and for $t\geq 1$,
\[
X_t = r_t - c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) \sigma_{t-1}(\mathbf{x}_t) - \psi_t,
\]
\[
Y_t = \sum^t_{s=1} X_s.
\]
\end{definition}
The next lemma shows that $\{Y_t\}_{t\geq 1}$ is a super-martingale.
\begin{lemma}
%Conditioned on event ?, 
With probability $\geq 1 - \delta/4 - \delta/4 - \delta/12 - \delta/12=1-2\delta/3$,
$\{Y_t\}_{t\geq 1}$ is a super-martingale with respect to the filtration $\mathcal{F}_{t-1}$.
\end{lemma}
\begin{proof}
\begin{equation}
\begin{split}
\mathbb{E}[Y_t - Y_{t-1} | \mathcal{F}_{t-1}] &= \mathbb{E}[X_t | \mathcal{F}_{t-1}]\\
&=\mathbb{E}[ r_t - c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) \sigma_{t-1}(\mathbf{x}_t) - \psi_t | \mathcal{F}_{t-1}]\\
&= \mathbb{E}[r_t | \mathcal{F}_{t-1}] - \left[ c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) \mathbb{E}[\sigma_{t-1}(\mathbf{x}_t) | \mathcal{F}_{t-1}] + \psi_t \right] \leq 0,
\end{split}
\end{equation}
where the last inequality follows from Lemma~\ref{lemma:upper:bound:expected:inst:regret}.
\end{proof}

% \begin{lemma}[Azuma-Hoeffding Inequality]
% \end{lemma}
Finally, we are ready to use martingale concentration inequalities to bound the cumulative regret.
\begin{lemma}
\label{lemma:ts:final:upper:bound:RT}
With probability of $\geq 1 - \delta/4 - \delta/4 - \delta/12 - \delta/12 - \delta/12=1-3\delta/4$,
\[
R_T \leq \left(2B + c_T \left(1 + \frac{2}{(1-\nu_1)p}\right) + \psi_1\right) \sqrt{T (C_1' \gamma_T + 2\log(12/\delta))} + 2\sum^T_{t=1} \nu_t (c_t+c_t'+\sum^M_{i=1}\omega_i d_i')
\]
where $C_1'\triangleq 2/\log(1+\sigma^{-2})$.
\end{lemma}
\begin{proof}
To begin with, we have that
\begin{equation}
\begin{split}
|Y_t - Y_{t-1}| &= |X_t| \leq |r_t| + |c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) \sigma_{t-1}(\mathbf{x}_t)| + |\psi_t|\\
&\leq 2B + c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) + \psi_t,
\end{split}
\end{equation}
where the last inequality follows since $|r_t|=|f(\mathbf{x}^*)-f(\mathbf{x}_t)|\leq 2B$ (because $\norm{f}_k\leq B$ as we have assumed in Sec.~\ref{sec:background}, which immediately implies that $|f(\mathbf{x})|\leq B,\forall \mathbf{x}\in\mathcal{D}$), and $\sigma_{t-1}(\mathbf{x})\leq 1,\forall \mathbf{x}\in\mathcal{D}$.

Next, we apply the Azuma-Hoeffding Inequality with an error probability of $\delta/12$ (first inequality):
\begin{equation}
\begin{split}
\sum^T_{t=1} r_t &\leq \sum^T_{t=1} c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) \sigma_{t-1}(\mathbf{x}_t) + \sum^T_{t=1} \psi_t + \\
&\qquad \sqrt{2 \log\frac{12}{\delta} \sum^T_{t=1} \left(2B + c_t \left(1 + \frac{2}{(1-\nu_1)p}\right) + \psi_t\right)^2 }\\
&\leq c_T \left(1 + \frac{2}{(1-\nu_1)p}\right) \sum^T_{t=1} \sigma_{t-1}(\mathbf{x}_t) + \sum^T_{t=1} \psi_t + \\
&\qquad \left(2B + c_T \left(1 + \frac{2}{(1-\nu_1)p}\right) + \psi_1\right)\sqrt{2 T \log\frac{12}{\delta}}\\
&\leq c_T \left(1 + \frac{2}{(1-\nu_1)p}\right) \sqrt{C_1' \gamma_T T} + \sum^T_{t=1} \psi_t + \\
&\qquad \left(2B + c_T \left(1 + \frac{2}{(1-\nu_1)p}\right) + \psi_1\right)\sqrt{2 T \log\frac{12}{\delta}}\\
&\leq \left(2B + c_T \left(1 + \frac{2}{(1-\nu_1)p}\right) + \psi_1\right) \sqrt{T (C_1' \gamma_T + 2\log(12/\delta))} + \\
&\qquad 2\sum^T_{t=1} \nu_t (c_t+c_t'+\sum^M_{i=1}\omega_i d_i').
\end{split}
\end{equation}
The second last inequality makes use of Lemma~\ref{upper_bound_by_info_gain} from the proof of RM-GP-UCB (excluding the factor of $(1-\nu_t)\beta_t$) with $C_1'\triangleq2/\log(1+\sigma^{-2})$.
\end{proof}

Recall that $c_t=\mathcal{O}(\sqrt{\gamma_t} \log t)$, $c_t'=\mathcal{O}(\log t)$.
Therefore, Lemma~\ref{lemma:ts:final:upper:bound:RT} can be further analyzed as:
\begin{equation}
\begin{split}
R_T &= \mathcal{O}\left(c_T \sqrt{T\gamma_T} + \sum^T_{t=1} \nu_t (c_t+c_t'+\sum^M_{i=1}\omega_i d_i')\right)\\
&= \mathcal{O}\Big(\Big(\sum^M_{i=1}\omega_i d_i' \Big) \sum^T_{t=1} \nu_t + 
\sum^T_{t=1} \nu_t \sqrt{\gamma_t}\log t + 
\gamma_T\log T \sqrt{T} \Big).
\end{split}
\end{equation}

Lastly, similar to our analysis of RM-GP-UCB for the case where the $\omega_i$'s change with $t$ (i.e., at the end of Appendix~\ref{app:proof_theorem_1}), when our online meta-weight optimization is used, we simply need to slightly modify the definition of $\psi_t$:
$\psi_t \triangleq 2\nu_t \left( c_t + c_t' + \sum^M_{i=1}\omega_{i,t} d_i' \right)$ by allowing $\omega_{i,t}$ to change with $t$, and the subsequent analysis still holds by simply replacing $\omega_i$ by $\omega_{i,t}$.
As a result, the no-regret guarantee of RM-GP-TS (Theorem~\ref{regret_bound_ts}) still holds (since we can simply upper-bound every $\omega_{i,t}$ by $1$):
\begin{equation*}
\begin{split}
R_T &= \mathcal{O}\Big( \sum^T_{t=1} \nu_t \Big(\sum^M_{i=1}\omega_{i,t} d_i' \Big) + 
\sum^T_{t=1} \nu_t \sqrt{\gamma_t}\log t + 
\gamma_T\log T \sqrt{T} \Big)\\
&=\mathcal{O}\Big(\Big(\sum^M_{i=1} d_i' \Big) \sum^T_{t=1} \nu_t + 
\sum^T_{t=1} \nu_t \sqrt{\gamma_t}\log t + 
\gamma_T\log T \sqrt{T} \Big)\\
&=\widetilde{\mathcal{O}}\Big(\Big(\sum^M_{i=1} d_i' \Big) \sum^T_{t=1} \nu_t + 
\sum^T_{t=1} \nu_t \sqrt{\gamma_t} + 
\gamma_T \sqrt{T} \Big).
\end{split}
\end{equation*}

% Note that $c_T = \mathcal{O}(\log T)$, $c_T' = \mathcal{O}(\sqrt{\log T})$.

% \begin{equation}
% \begin{split}
% R_T = \mathcal{O}\left(\left( \log T + \sum^M_{i=1}\omega_i d_i' \right) \sum^T_{t=1} \nu_t +\log T \sqrt{T\gamma_T} \right).
% \end{split}
% \end{equation}

%\begin{equation}
%\begin{split}
%R_T = \mathcal{O}\left(\left( \log T + \sum^M_{i=1}\omega_i d_i \right) \sum^T_{t=1} \nu_t + \log T \sqrt{T\gamma_T} \right).
%\end{split}
%\end{equation}


\section{Analysis of Online Meta-Weight Optimization}
\label{app:analysis:online:weight}
\subsection{Proof of Lemma~\ref{estimate_di}} 
\label{app:upper_bound_func_gap}
%Eq.~\eqref{eq:L_f_U} implies that
From the definitions of $U_{t,i,j}$ and $L_{t,i,j}$~\eqref{UL}, and the fact that $L_{t,i,j} \leq f(\mathbf{x}_{i,j}) \leq U_{t,i,j}, \forall t, i, j\ $ with probability $\geq 1-\delta/4$ 
(Section~\ref{sec:estimate_d}), we have that
\begin{equation}
\begin{split}
    d_i&=\max_{j=1,...,N_i} \left|f_i(\mathbf{x}_{i,j}) - f(\mathbf{x}_{i,j})\right| \\
    &\leq \max_{j=1,...,N_i}\left[\max \{\left|f_i(\mathbf{x}_{i,j}) - U_{t,i,j}\right|, \left|f_i(\mathbf{x}_{i,j}) - L_{t,i,j}\right|\}\right] \qquad \forall \, i=1,\ldots,M,\forall t\geq 1
\end{split}
\label{func_gap_proof_aux_1}
\end{equation}
which holds with probability $\geq 1 - \delta/4$. Next, we derive upper bounds on $\left|f_i(\mathbf{x}_{i,j}) - U_{t,i,j}\right|$ and $\left|f_i(\mathbf{x}_{i,j}) - L_{t,i,j}\right|$ that only consist of known or computable 
terms, such that the upper bounds on $d_i$ can be efficiently calculated in practice.
\begin{lemma}
\label{func_gap_proof_aux_2}
%Let $\delta' \in (0, 1)$. 
With probability $\geq 1 - \delta/4$, $\forall \, t\geq 1$, $\forall i, j$, 
\begin{equation*}
\begin{split}
\left|f_i(\mathbf{x}_{i,j}) - U_{t,i,j}\right| \leq \sqrt{2\sigma^2\log\frac{8\sum^M_{i=1}N_i}{\delta}} + \left|y_{i,j} - U_{t,i,j}\right|,\\
\left|f_i(\mathbf{x}_{i,j}) - L_{t,i,j}\right| \leq \sqrt{2\sigma^2\log\frac{8\sum^M_{i=1}N_i}{\delta}} + \left|y_{i,j} - L_{t,i,j}\right|.
\end{split}
\end{equation*}
\end{lemma}
\begin{proof}
To begin with, note that $f_i(\mathbf{x}_{i,j}) - y_{i,j} \sim \mathcal{N}(0, \sigma^2)$. Therefore,~\eqref{standard_gaussian} suggests that
\begin{equation}
\mathbb{P}\left(\left|f_i(\mathbf{x}_{i,j}) - y_{i,j}\right| > \sigma\sqrt{2\log\frac{8\sum^M_{i=1}N_i}{\delta}}\right) \leq \frac{\delta}{8\sum^M_{i=1}N_i}
\end{equation}
which naturally leads to a high-probability upper bound on $\left|f_i(\mathbf{x}_{i,j}) - U_{t,i,j}\right|$:
\begin{equation}
\begin{split}
    \left|f_i(\mathbf{x}_{i,j}) - U_{t,i,j}\right| &= |f_i(\mathbf{x}_{i,j}) - y_{i,j} + y_{i,j} - U_{t,i,j}| \\
    &\leq \left|f_i(\mathbf{x}_{i,j}) - y_{i,j}\right| + \left|y_{i,j} - U_{t,i,j}\right|\\
    &\leq \sqrt{2\sigma^2\log\frac{8\sum^M_{i=1}N_i}{\delta}} + \left|y_{i,j} - U_{t,i,j}\right|
\end{split}
\end{equation}
which holds with probability $\geq 1 - \frac{\delta}{8\sum^M_{i=1}N_i}$. Applying the same reasoning to $\left|f_i(\mathbf{x}_{i,j}) - L_{t,i,j}\right|$ results in a similar high-probability upper bound:
\begin{equation}
    \left|f_i(\mathbf{x}_{i,j}) - L_{t,i,j}\right| \leq \sqrt{2\sigma^2\log\frac{8\sum^M_{i=1}N_i}{\delta}} + \left|y_{i,j} - L_{t,i,j}\right|.
\end{equation}
Next, the proof is completed by taking a union bound over both $U_{t,i,j}$ and $L_{t,i,j}$, as well as all $\sum^M_{i=1}N_i$ observations of the meta-tasks.
\end{proof}

Finally, Lemma~\ref{estimate_di} follows by combining~\eqref{func_gap_proof_aux_1} and Lemma~\ref{func_gap_proof_aux_2}.

\subsection{Proof of Proposition \ref{regret_bound_2}}
\label{app:prop_1_proof}
In iteration $t$, define $\overline{\alpha}_t$ by replacing $d_i$ in $\alpha$ with $\overline{d}_{i,t}$:
\begin{equation}
\overline{\alpha}_t= \sum^M_{i=1}\omega_i \frac{N_i}{\sigma^2}(2\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+\overline{d}_{i,t}).
\label{alpha_t_bar}
\end{equation}
Since according to Lemma~\ref{estimate_di}, $d_i \leq \overline{d}_{i,t} \, \forall i=1,\ldots,M, t\geq 1$ with probability $\geq 1 - \delta/2$, 
we have that $\alpha \leq \overline{\alpha}_t \, \forall t\geq 1$, which also holds with probability $\geq 1 - \delta/2$.

Therefore, Theorem~\ref{regret_bound} implies that, with probability $\geq 1 - \delta$,
\begin{equation}
\begin{split}
    R_T \leq \underline{2\sum^T_{t=1} \overline{\alpha}_t\nu_t} + 2\tau \sum^T_{t=1} \nu_t + \beta_T\sqrt{C_1 T \gamma_T}.
\end{split}
\label{R_T_new}
\end{equation}

In~\eqref{R_T_new}, only the underlined term depends on the $\omega_i$'s. 
Define two column vectors $\overline{\boldsymbol{\alpha}}=[\overline{\alpha}_{t}]^{\top}_{t=1,\ldots,T}$ and $\boldsymbol{\nu}=[\nu_t]^{\top}_{t=1,\ldots,T}$.
Then, the underlined term in~\eqref{R_T_new} can be further decomposed as
\begin{equation}
\begin{split}
    2 \sum^T_{t=1} \overline{\alpha}_{t} \nu_t &\triangleq 2 \overline{\boldsymbol{\alpha}}^{\top}\boldsymbol{\nu} \stackrel{\text{(a)}}{\leq} 2\norm{\overline{\boldsymbol{\alpha}}}_2 \norm{\boldsymbol{\nu}}_2 \stackrel{\text{(b)}}{\leq} 2 \norm{\overline{\boldsymbol{\alpha}}}_1 \norm{\boldsymbol{\nu}}_1 \stackrel{\text{(c)}}{=} 2 \underline{\sum^T_{t=1} \overline{\alpha}_{t}} \sum^T_{t=1}\nu_t
\end{split}
\label{R_T_new_aux_1}
\end{equation}
in which (a) results from the Cauchy--Schwarz inequality, (b) follows because the L2 norm is upper-bounded by the L1 norm, and (c) is obtained because $\overline{\alpha}_t>0,\nu_t\geq 0, \forall t\geq 1$.

In~\eqref{R_T_new_aux_1}, the dependence on the $\omega_i$'s appears in the underlined term, which can be further decomposed as
\begin{equation}
\begin{split}
    \sum^T_{t=1} \overline{\alpha}_{t} &=  \sum^T_{t=1}\left[\sum^M_{i=1}\omega_i\frac{N_i}{\sigma^2}\left(2\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+\overline{d}_{i,t} \right)\right]\\
    &\stackrel{\triangle}{=}\frac{1}{\sigma^2} \sum^T_{t=1}\left[ \sum^M_{i=1}\omega_i l_{i,t} \right] \\
    &\stackrel{\triangle}{=} \frac{1}{\sigma^2}\sum^T_{t=1}\boldsymbol{\omega}^{\top} \boldsymbol{l}_t
\end{split}
\label{R_T_new_aux_2}
\end{equation}
in which we have defined $\boldsymbol{\omega} \triangleq [\omega_i]_{i=1,\ldots,M}$, $\boldsymbol{l}_t\triangleq[l_{i,t}]_{i=1,\ldots,M}$, with
\begin{equation}
    l_{i,t} \triangleq N_i \left(2\sqrt{2\sigma^2\log\frac{8N_i}{\delta}}+\overline{d}_{i,t}\right).
\end{equation}

Plugging~\eqref{R_T_new_aux_1} and~\eqref{R_T_new_aux_2} into~\eqref{R_T_new} completes the proof.

\subsection{Derivation of Equation~\ref{estimate_wi}}
\label{lagran}
Recall that our objective is to minimize 
\[
    \sum^{t-1}_{s=1}\boldsymbol{\omega}'^{\top} \boldsymbol{l}_s + \frac{1}{\eta}\sum^M_{i=1}\omega_i'\log \omega_i'
\]
subject to the constraint that $\boldsymbol{\omega}'$ lies in the probability simplex, i.e., $\sum^M_{i=1}\omega_i' = 1$
and $\omega_i' \geq 0$ for all $i=1,\ldots,M$.
Define the Lagrangian as 
\begin{equation}
L(\boldsymbol{\omega}', \lambda) = \sum^{t-1}_{s=1}\boldsymbol{\omega}'^{\top} \boldsymbol{l}_s + \frac{1}{\eta}\sum^M_{i=1}\omega_i'\log \omega_i' + 
\lambda \left(1 - \sum^M_{i=1}\omega_i'\right).
\end{equation}
Taking the derivative of $L(\boldsymbol{\omega}', \lambda)$ with respect to $\omega_i'$, we get
\begin{equation}
\label{L_derive}
\frac{\partial L(\boldsymbol{\omega}', \lambda)}{\partial \omega_i'} = \sum^{t-1}_{s=1} l_{i,s} + \frac{1}{\eta} \left( \log \omega_i' + 1 \right) - \lambda.
\end{equation}
Setting~\eqref{L_derive} to 0 gives us
\begin{equation}
\omega_i' = e^{\eta \lambda - 1} e^{-\eta \sum^{t-1}_{s=1} l_{i,s} } \propto e^{-\eta \sum^{t-1}_{s=1} l_{i,s} }.
\end{equation}
Normalizing the $\omega_i'$'s for all $i=1,\ldots,M$ so that they sum to $1$ leads to~\eqref{estimate_wi}.

\subsection{Analysis for RM-GP-TS}
\label{app:meta:weight:optimization:ts}
Here we use the function gap $d_i$ to approximate $d_i'$ (defined in Theorem~\ref{regret_bound_ts}), 
i.e., $d_i'\approx d_i,\forall i=1,\ldots,M$.
% i.e., we assume $d_i'=d_i,\forall i=1,\ldots,M$.
Combining Lemma~\ref{estimate_di} and Theorem~\ref{regret_bound_ts}, we have for RM-GP-TS that with probability of $\geq 1 - \delta$,
\begin{equation}
\begin{split}
R_T &= \mathcal{O}\Big( \sum^T_{t=1} \nu_t \left(\sum^M_{i=1}\omega_i \overline{d}_{i,t} \right) + 
\sum^T_{t=1} \nu_t \sqrt{\gamma_t}\log t + 
\gamma_T\log T \sqrt{T} \Big)\\
&\leq \mathcal{O}\Big(  \underline{\left(\sum^T_{t=1}\sum^M_{i=1}\omega_i \overline{d}_{i,t}\right)}  \left(\sum^T_{t=1} \nu_t\right) + 
\sum^T_{t=1} \nu_t \sqrt{\gamma_t}\log t + 
\gamma_T\log T \sqrt{T} \Big),
\end{split}
\label{eq:ts:online:regret:minimize:eq:1}
\end{equation}
in which the inequality can be proved in a similar way as equation~\eqref{R_T_new_aux_1}.
Next, define $\boldsymbol{\omega}\triangleq [\omega_i]_{i=1,\ldots,M}$, $\boldsymbol{d}_t \triangleq [\overline{d}_{i,t}]_{i=1,\ldots,M}$, then the underlined term above can be denoted as:
\begin{equation}
\sum^T_{t=1} \sum^M_{i=1}\omega_i \overline{d}_{i,t} = \sum^T_{t=1} \boldsymbol{\omega}^{\top} \boldsymbol{d}_t.
\end{equation}
Therefore, equation~\eqref{eq:ts:online:regret:minimize:eq:1} can be further upper-bounded as:
\begin{equation}
\begin{split}
R_T = \mathcal{O}\Big(  \underline{\left( \sum^T_{t=1} \boldsymbol{\omega}^{\top} \boldsymbol{d}_t \right)}  \left(\sum^T_{t=1} \nu_t\right) + 
\sum^T_{t=1} \nu_t \sqrt{\gamma_t}\log t + 
\gamma_T\log T \sqrt{T} \Big).
\end{split}
\end{equation}
Next, applying similar derivations as Appendix~\ref{lagran} (treating the underlined term above as the loss to be minimized) leads to the same update rule for the meta-weights as equation~\eqref{estimate_wi}.
Approximating $d_i'$ using $d_i$ also allows us to derive the same update rule for $\nu_t$ (Sec.~\ref{sec:online_weight_estimation}).

% \begin{equation}
% \begin{split}
% R_T &= \mathcal{O}\Big( \sum^T_{t=1} \nu_t \Big( \sum^M_{i=1}\omega_i \overline{d}_{i,t} \Big) + (\log T) \sum^T_{t=1}\nu_t +\log T \sqrt{T\gamma_T} \Big).\\
% &\leq \mathcal{O}\Big( \underline{\Big(\sum^T_{t=1} \sum^M_{i=1}\omega_i \overline{d}_{i,t} \Big)} \sum^T_{t=1}\nu_t + (\log T) \sum^T_{t=1}\nu_t +\log T \sqrt{T\gamma_T} \Big)
% \end{split}
% \end{equation}

% \begin{equation}
% \sum^T_{t=1} \sum^M_{i=1}\omega_i \overline{d}_{i,t} = \sum^T_{t=1} \boldsymbol{\omega}^{\top} \boldsymbol{d}_t,
% \end{equation}
% where we have defined $\boldsymbol{\omega}\triangleq [\omega_i]_{i=1,\ldots,M}$, $\boldsymbol{d}_t \triangleq [\overline{d}_{i,t}]_{i=1,\ldots,M}$.

% \begin{equation}
% R_T = \mathcal{O}\Big( \left[\sum^T_{t=1} \boldsymbol{\omega}^{\top} \boldsymbol{d}_t\right] \left[\sum^T_{t=1} \nu_t\right] + (\log T) \sum^T_{t=1}\nu_t +\log T \sqrt{T\gamma_T} \Big).
% \end{equation}
% %\begin{proposition}[RM-GP-TS]
% %\[
% %R_T = \mathcal{O}\Big(\Big( \log T + \boldsymbol{\omega}^{\top} \boldsymbol{d}_t \Big) \sum^T_{t=1} \nu_t +\log T \sqrt{T\gamma_T} \Big),
% %\]
% %\end{proposition}

\section{More Experimental Details and Results}
\label{app:experiments}
In every experiment, the same set of random initializations is used for all methods
%: GP-UCB, RGPE, TAF, MTBO and RM-GP-UCB, 
to ensure fair comparisons.
The kernel bandwidth parameter $\rho$ in TAF is set to $\rho=0.5$ in all experiments, but we have observed that other values of $\rho$ (such as $0.1$ and $0.9$)
lead to similar performances. $S=500$ posterior samples are used to compute the ensemble weights in RGPE.
All experiments are run on a server with 16 cores of Intel Xeon processor, 256G of RAM and 5 NVIDIA GTX1080 Ti GPUs.

\subsection{Optimization of Synthetic Functions}
\label{app:synth}
\subsubsection{Synthetic Functions Sampled from GPs}
The objective functions are drawn from GPs with the Squared Exponential kernel (with a length scale of $0.05$) defined on the domain $\mathcal{D}=[0,1]$.
Fig.~\ref{fig:synth_func} shows an example of such synthetic functions.
\begin{figure}[tb]
\centering
\includegraphics[width=0.45\columnwidth]{figures/synth_func_example.pdf}
\caption{An example synthetic function sampled from a GP.}
\label{fig:synth_func}
\end{figure}
The meta-functions and meta-tasks are generated in the following way. 
To begin with, we fix the number of meta-tasks $M=4$, the number of observations (input-output pairs) for each meta-task $N=N_i=20$ for $i=1,\ldots,M$, and the function gaps: $d_1=d_2=0.05$, $d_3=d_4=4.0$. 
For the $i$-th meta-task, firstly, $N_i$ inputs are randomly drawn from the entire domain $\mathcal{D}=[0,1]$. 
Then for each of the $N_i$ inputs $\mathbf{x}_{i,j}$, a number is randomly drawn from $[-d_i,d_i]$, 
which is added to the value of the target function $f(\mathbf{x}_{i,j})$ to produce the corresponding function value of the meta-function $f_i(\mathbf{x}_{i,j})$. 
Subsequently, a zero-mean Gaussian noise (with a noise variance of $0.01$) is added to $f_i(\mathbf{x}_{i,j})$, resulting in the corresponding output of the meta-observation $y_i(\mathbf{x}_{i,j})$. 
The above-mentioned procedure is repeated for each of the $M=4$ meta-tasks.
Note that according to the specified function gaps, meta-tasks 1 and 2 are relatively more similar to the target task, 
whereas meta-tasks 3 and 4 are dissimilar to the target task due to the larger function gaps. 

Fig.~\ref{fig:meta_weights_curves} plots the evolution of the meta-weights for each of the $4$ meta-tasks in the experiments exploring the impact of $\eta$,
i.e., corresponding to Fig.~\ref{fig:synth_func_results}c in Section~\ref{exp:synth}. These figures are used to demonstrate the observations that overly large and excessively small
values of $\eta$ can both degrade the performance of RM-GP-UCB.
\begin{figure}[tb]
	\centering
	\begin{subfigure}[b]{0.325\linewidth}
		\includegraphics[width=\linewidth]{figures/meta_weights_curves_lr_0_01.pdf}
		\caption{$\eta N=0.01$.}
	\end{subfigure}
	\hfill
	\begin{subfigure}[b]{0.325\linewidth}
		\includegraphics[width=\linewidth]{figures/meta_weights_curves_lr_1.pdf}
		\caption{$\eta N=1.0$.}
	\end{subfigure}
	\hfill
	\begin{subfigure}[b]{0.325\linewidth}
		\includegraphics[width=\linewidth]{figures/meta_weights_curves_lr_5.pdf}
		\caption{$\eta N=5.0$.}
	\end{subfigure}
	\caption{Evolution of the meta-weights with different learning rates $\eta$ for online meta-weight optimization in the synthetic experiments.
	In each figure, the red and blue curves represent the meta-weights of the two meta-tasks that are more similar to the target task (i.e., the first two meta-tasks),
	whereas the green and yellow curves correspond to the meta-weights of the other two dissimilar meta-tasks.
	Every color has $10$ curves in each figure, which correspond to $10$ independent runs of the algorithm with different random initializations.}
	\label{fig:meta_weights_curves}
\end{figure}


Moreover, we have added another experiment where the $N_i$'s (i.e., the number of observations from the meta-tasks) are different.
Specifically, we use the same experimental setting involving $M=4$ meta-tasks as described above, and let $N_1=15,N_2=25,N_3=10,N_4=30$, where $d_1=d_2=0.05$, $d_3=d_4=4.0$.
The results (Fig.~\ref{fig:synth_func:with:diff:Ni}) show that when the $N_i$'s are different, our RM-GP-UCB algorithm, despite performing worse than the setting where all $N_i$'s are equal, is still able to significantly outperform standard GP-UCB.
\begin{figure}
\centering
\includegraphics[width=0.45\columnwidth]{figures/synth_func_new_err_diff_Ni.pdf}
\caption{The performance of RM-GP-UCB when the $N_i$'s are different.}
\label{fig:synth_func:with:diff:Ni}
\end{figure}



%\subsubsection{Benchmark Function for Hyperparameter Tuning of SVM Used by RGPE}
%%\label{app:benchmark_rgpe}
%%\begin{figure}[tb]
%%\centering
%%\includegraphics[width=0.4\columnwidth]{figures_rebuttal/svm_5_tasks.pdf}
%%\caption{Performance of different algorithms on the benchmark function on SVM hyperparameter tuning adopted by~\citep{feurer2018scalable}, i.e., the paper which introduced the RGPE algorithm.}
%%\label{fig:synth_func_svm}
%%\end{figure}
%This benchmark dataset, originally introduced by~\citep{wistuba2015learning}, is created by performing hyperparameter tuning of SVM using $50$ diverse datasets.
%$6$ hyperparameters are tuned: $3$ binary parameters indicating whether a linear, polynomial or radial basis function (RBF) kernel is used, the penalty parameter, the degree of the polynomial kernel, and the bandwidth parameter for the RBF kernel. A fixed grid of hyperparameters of size $288$ is created. For each dataset, each hyperparameter configuration on the grid is evaluated and the corresponding validation accuracy is recorded.
%In our experiments, each dataset corresponds to a task. We treat one of the $50$ tasks as the target task, and the remaining tasks as $49$ meta-tasks. For each meta-task, the meta-observations are produced by randomly sampling $50$ points from the grid. The results reported in the main paper (Fig.~\ref{fig:synth_func_results}e) are averaged over $25$ runs, each run treating a different task as the target task; for each run/target task, we again average the results over $5$ random initializations.

\subsection{Real-world Experiments}
\label{app:auto_ml}
%All datasets and software used in our real-world experiments are publicly available, and none of them contains personally identifiable information or offensive content.
%All datasets used in our real-world experiments are publicly available.
%More details on these datasets and software are included below.

\textbf{Hyperparameter Tuning for Convolutional Neural Networks (CNNs).}
The MNIST, CIFAR-10 and CIFAR-100 datasets can all be directly downloaded using the Keras Python package\footnote{\url{https://keras.io/}},
and the SVHN dataset can be downloaded from \url{http://ufldl.stanford.edu/housenumbers/}.
The MNIST dataset is under the GNU General Public License, CIFAR-10 and CIFAR-100 are under the MIT License, and SVHN is under the Custom (non-commercial) License.
The image pixel values are all normalized into the range $[0, 1]$.
The CNN hyperparameters being optimized in this set of experiments are the learning rate, learning rate decay, and the L2 regularization parameter, 
all of which have the search space from $10^{-7}$ to $10^{-2}$.
Other than these hyperparameters, a common CNN architecture is used for all datasets, i.e., a CNN containing two convolutional layers (both with 32 filters and each filter has a size of $3\times 3$)
each of which is followed by a Max pooling layer (with a pooling size of $3\times 3$), 
followed by two fully connected layers (both with $64$ hidden units); all non-linear activations are ReLU.
The sizes of the training and validation sets for the four datasets are: 60,000/10,000 for MNIST, 73,257/26,032 for SVHN, 50,000/10,000 for both CIFAR-10 and CIFAR-100.
For the evaluation of a set of selected hyperparameters, the CNN model is trained using the RMSprop algorithm for $20$ epochs, and the 
final validation error is used as the corresponding output observation.
Fig.~\ref{fig:cnn_2} presents the results when the SVHN and CIFAR-100 datasets are used to produce the target functions.

Comparing Figs.~\ref{fig:synth_func_results}e,~\ref{fig:synth_func_results}f and Fig.~\ref{fig:cnn_2} shows that our RM-GP-UCB performs similarly to RGPE for the CIFAR-10, CIFAR-100 and SVHN datasets, and outperforms RGPE for MNIST.
After inspection, we found that this is because for the first three datasets (Fig.~\ref{fig:synth_func_results}f and Fig.~\ref{fig:cnn_2}), both RM-GP-UCB and RGPE assign most meta-weights to the same meta-task. 
On the other hand, for MNIST (Fig.~\ref{fig:synth_func_results}e), RM-GP-UCB (and RM-GP-TS) is able to assign most weights to SVHN which is indeed more similar to MNIST since they both contain images of digits.
In contrast, RGPE mistakenly assigns more meta-weights to CIFAR-10. The reason is that RGPE chooses the weights based on how accurately each meta-task's GP surrogate predicts the pairwise ranking of the target observations (more details in Sec.~\ref{sec:related_works}, second paragraph). However, for MNIST, most target observations have very similar values since the overall accuracy is very high due to the simplicity of the MNIST dataset. Therefore, the predicted pairwise rankings become unreliable, thus rendering the weights learned by RGPE inaccurate and deteriorating the performance.

\begin{figure}[tb]
	\centering
	\begin{subfigure}[b]{0.45\linewidth}
		\includegraphics[width=\linewidth]{figures/svhn_with_mtbo_with_ts.pdf}
		\caption{SVHN.}
	\end{subfigure}
	\hfill
	\begin{subfigure}[b]{0.45\linewidth}
		\includegraphics[width=\linewidth]{figures/cifar_100_with_mtbo_with_ts.pdf}
		\caption{CIFAR-100.}
	\end{subfigure}
	\caption{Best validation error of CNN (both averaged over 10 random initializations).}
	\label{fig:cnn_2}
\end{figure}

\textbf{Hyperparameter Tuning for CNNs Using the Omniglot Dataset.}
The Omniglot dataset can be downloaded from \url{https://github.com/brendenlake/omniglot}, and it is under the MIT License.
The dataset consists of $50$ alphabets, $30$ from the background set and $20$ from the evaluation set. Each alphabet includes a number of characters, and the alphabets contain $1623$ characters in total. Every character has only $20$ example images, each drawn by a different person.
To perform one-shot classification, we use a Siamese neural network,
%~\citep{koch2015siamese}, 
which takes two images as inputs and outputs a score indicating whether the pair of input images are predicted to be the same character.
The evaluation metric we use in the experiment is 2-way validation error. That is, we compare a test image in the validation set with two other images, only one of which is the same character as the test image, and evaluate whether the Siamese network is able to output a higher predictive score for the correct image which is the same character; we do this using every test image, and use the percentage of errors as the 2-way validation error.
In our setting, each task represents tuning $3$ hyperparameters of the Siamese network (the same hyperparameters and ranges as the CNN experiments above) using one alphabet. For each task, we use $75\%$ of the characters in the alphabet to produce the training set, and the remaining $25\%$ to generate the validation set.
We use $10$ alphabets from the background set as $10$ meta-tasks. For each meta-task, we generate $30$ meta-observations by running BO (using GP-UCB) for $30$ iterations. This in total produces $10 \times 30=300$ meta-observations. We use one of the alphabets from the evaluation set as the target task.

\begin{figure}[tb]
\centering
\includegraphics[width=0.4\columnwidth]{figures/Omniglot_new_with_ts.pdf}
\caption{2-way validation error on the Omniglot dataset.}
\label{fig:omniglot}
\end{figure}

\textbf{Hyperparameter Tuning for Support Vector Machines (SVMs).}
This benchmark dataset, which was originally introduced by~\citet{wistuba2015learning} and can be downloaded from~\url{https://github.com/wistuba/TST}, is created by performing hyperparameter tuning of SVM using $50$ diverse datasets.
$6$ hyperparameters are tuned: $3$ binary parameters indicating whether a linear, polynomial or radial basis function (RBF) kernel is used, the penalty parameter, the degree of the polynomial kernel, and the bandwidth parameter for the RBF kernel. A fixed grid of hyperparameters of size $288$ is created. For each dataset, every hyperparameter configuration on the grid is evaluated and the corresponding validation accuracy is recorded as the observed output of the objective function.
In our experiments, each dataset corresponds to a task. We treat one of the $50$ tasks as the target task, and the remaining tasks as $49$ meta-tasks. For each meta-task, the meta-observations are produced by randomly sampling $50$ points (hyperparameter configurations) from the grid. The results reported in the main paper (Fig.~\ref{fig:cnn}c) are averaged over $25$ trials, each trial treating a different task as the target task; for each trial/target task, we again average the results over $5$ random initializations.


\textbf{Human Activity Recognition (HAR).}
The dataset used in this experiment can be downloaded from \url{https://archive.ics.uci.edu/ml/datasets/Human+Activity+Recognition+Using+Smartphones}.

In this experiment of human activity prediction, each data instance (input-output pair) is characterized by a feature vector of length 561 and a label corresponding to one of the $6$ activities.
The SVM hyperparameters being optimized are the penalty parameter $C$ (from 0.01 to 10) and the radial basis function (RBF) kernel coefficient $\gamma$ (from 0.01 to 1).
There are in total 7,352 data instances for the 21 subjects that are used to generate the meta-tasks, and 2,947 instances for the 9 subjects used for performance validation.
For each subject, half of the instances are used as the training set, with the other half being used for validation.

% Fig.~\ref{fig:har_all} plots the performances of each of the 9 subjects used for performance validation, in which RM-GP-UCB performs most consistently among all
% algorithms under comparison (summarized in Fig.~\ref{fig:cnn}c in the main text). 
% Specifically, 
% %as mentioned in the main text (Section~\ref{subsec:automl}), 
% RGPE fails to outperform standard GP-UCB in Figs.~\ref{fig:har_all}c, d, e, g and h,
% and TAF fails to perform better than standard GP-UCB in Figs.~\ref{fig:har_all}e and h, whereas RM-GP-UCB fails to outperform GP-UCB only in Fig.~\ref{fig:har_all}h.

% \begin{figure}[tb]
% 	\centering
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_1_new.pdf}
% 		\caption{}
% 		\label{fig:har_1}
% 	\end{subfigure}
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_2_new.pdf}
% 		\caption{}
% 		\label{fig:har_2}
% 	\end{subfigure}
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_3_new.pdf}
% 		\caption{}
% 		\label{fig:har_3}
% 	\end{subfigure}
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_4_new.pdf}
% 		\caption{}
% 		\label{fig:har_4}
% 	\end{subfigure}
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_5_new.pdf}
% 		\caption{}
% 		\label{fig:har_5}
% 	\end{subfigure}
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_6_new.pdf}
% 		\caption{}
% 		\label{fig:har_6}
% 	\end{subfigure}
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_7_new.pdf}
% 		\caption{}
% 		\label{fig:har_7}
% 	\end{subfigure}
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_8_new.pdf}
% 		\caption{}
% 		\label{fig:har_8}
% 	\end{subfigure}
% 	\begin{subfigure}[b]{0.325\linewidth}
% 		\includegraphics[width=\linewidth]{figures_updated/HAR_sub_9_new.pdf}
% 		\caption{}
% 		\label{fig:har_9}
% 	\end{subfigure}
% 	\caption{Best validation error of SVM for human activity recognition for the 9 individual subjects 
% 	(each averaged over 10 random initializations).}
% 	\label{fig:har_all}
% \end{figure}

\textbf{Non-stationary Bayesian Optimization.}
The clinical diagnosis dataset used in this experiment can be found at \url{https://www.kaggle.com/uciml/pima-indians-diabetes-database}, and it is associated with the CC0 License.
The hyperparameters of the logistic regression (LR) model being optimized are the batch size (20 to 60), 
the L2 regularization parameter ($10^{-6}$ to 0.01) and the learning rate (0.01 to 0.1).
The dataset represents a binary classification problem (whether a patient has diabetes or not), with each input instance consisting of 8 diagnostic features:
number of pregnancies, plasma glucose concentration, blood pressure, skin thickness, insulin, BMI, diabetes pedigree function, and age.
%The entire dataset consists of 768 data instances, among which 77 instances are set aside to measure the validation accuracy. 
%The sizes of the 5 progressively growing training datasets (i.e., corresponding to the 4 meta-tasks and the target task respectively) are 138, 276, 414, 552, and 691.


\textbf{Policy Search for Reinforcement Learning.} 
%\label{app:rl}
In this experiment, we use the Cart-Pole environment from OpenAI Gym (\url{https://github.com/openai/gym}), which is under the MIT License.
We adopt the linear softmax policy 
% for the RL experiment using the Cart-Pole environment, 
which linearly maps a state vector of length 4 
to an action vector of length 2, followed by a softmax operator.
As a result, for a particular state, the action with the largest softmax value is taken.
With this setting, $4\times 2=8$ parameters are tuned in this experiment.
The performance metric used in the experiment is the cumulative rewards (normalized to the range $[0,1]$) in an episode (averaged over $10$ independent episodes), and 
the maximum length of each episode is set to 200.

%Here, we adopt the linear softmax policy for both RL experiments, which linearly maps a state vector (of length 4 for Cart-Pole and 2 for Mountain-Car) 
%to an action vector (of length 2 for Cart-Pole and 3 for Mountain-Car), followed by a softmax operator.
%As a result, for a particular state, the action with the largest softmax value is taken.
%With this setting, $4\times 2=8$ parameters are tuned for the Cart-Pole experiment and $2 \times 3=6$ are optimized for the Mountain-Car experiment.
%The performance metric used in both experiments is the cumulative rewards (normalized to the range $[0,1]$) in an episode, and 
%the maximum length of each episode is set to 200 in both experiments.
%
%Fig.~\ref{fig:rl_ws} plots the weights assigned by OM-GP-UCB to the 10 meta-tasks in the Cart-Pole task, 
%which shows that OM-GP-UCB is able to learn to automatically assign large weights to the first three meta-tasks, which are constructed to be more similar to the target task as mentioned in the main text.
%\begin{figure}[htb]
%\centering
%\centerline{\includegraphics[width=0.5\columnwidth]{figures/meta_weights_rl.pdf}}
%\caption{The meta-weights assigned to different meta-tasks by OM-GP-UCB in the Cart-Pole experiment for policy search in RL.}
%\label{fig:rl_ws}
%\end{figure}

\subsection{Impacts of Max vs Mean in Function Gap Estimation}
\label{app:subsec_max_mean}
Here we explore the impact of the choice between using $\max$ (the outer $\max$ operator over $j=1,\ldots,N_i$) or the empirical mean in the estimated upper bound on the function gap (Lemma~\ref{estimate_di}),
as mentioned in the first paragraph of Section~\ref{sec:experiment}.
Fig.~\ref{fig:inspect_use_max} plots the different performances using these two choices in the MNIST, CIFAR-$10$ and clinical diagnosis (non-stationary BO) experiments.
The results show that the performance deficit resulting from the use of the $\max$ operator is marginal in some experiments (Fig.~\ref{fig:inspect_use_max}a and b),
whereas the difference can be larger in some other experiments (Fig.~\ref{fig:inspect_use_max}c).
Therefore, it is recommended to use the empirical mean when estimating the upper bound on the function gap in practice.
\begin{figure}
	\centering
	\begin{tabular}{ccc}
		\hspace{-3mm} \includegraphics[width=0.328\linewidth]{figures/mnist_use_max} & \hspace{-4mm}
		\includegraphics[width=0.328\linewidth]{figures/cifar_10_use_max} & \hspace{-4mm}
		\includegraphics[width=0.328\linewidth]{figures/clinical_use_max.pdf}\\
		{(a)} & {(b)} & {(c)}
	\end{tabular}%\vspace{-2mm}
	\caption{Impacts of using max vs empirical mean in estimating the upper bound on the function gaps, using the (a) MNIST, (b) CIFAR-10 and (c) non-stationary BO (clinical diagnosis) experiments.}
	\label{fig:inspect_use_max}
\end{figure}

\subsection{Scalability of Our Algorithms}
\label{app:scalability}
Here we further demonstrate the scalability of our RM-GP-UCB and RM-GP-TS algorithms
% Firstly, we plot the runtime of different algorithms in the non-stationary BO (diabetes diagnosis) experiment. We have chosen to use this experiment since its scale is not excessively large such that it is still computationally feasible for the MTBO algorithm. As shown in Fig.~\ref{fig:scalability_1}, our RM-GP-UCB algorithm, as well as RGPE and TAF, runs much faster than the MTBO algorithm. 
by showing that our algorithms can be applied to experiments with a very large scale and still perform competitively.
Specifically, we construct a much larger version of the experiment on policy search for RL, with $60$ meta-tasks each containing $130$ meta-observations. Fig.~\ref{fig:scalability_2}a and b plot the performance and runtime in this large-scale experiment. 
Consistent with Fig.~\ref{fig:cnn}e in the main text, our RM-GP-UCB algorithm still performs the best among all algorithms (Fig.~\ref{fig:scalability_2}a).
RM-GP-TS has a better performance here than in Fig.~\ref{fig:cnn}e, performing comparably with RGPE (Fig.~\ref{fig:scalability_2}a).
Moreover, RM-GP-TS is again significantly more scalable than RM-GP-UCB, RGPE and TAF, and its computational cost is comparable with that of standard GP-UCB (Fig.~\ref{fig:scalability_2}b).


%\begin{figure}[tb]
%\centering
%\includegraphics[width=0.4\columnwidth]{figures/runtime_rebuttal_clinical_0_with_ts.pdf}
%\caption{Runtime of different algorithms in the non-stationary BO (clinical diagnosis) experiment.}
%\label{fig:scalability_1}
%\end{figure}

\begin{figure}[tb]
	\centering
	\begin{subfigure}[b]{0.4\linewidth}
		\includegraphics[width=\linewidth]{figures/rl_cartpole_new_large_scale_new_new_new_with_ts.pdf}
		\caption{Cumulative rewards.}
	\end{subfigure}
	\hfill
	\begin{subfigure}[b]{0.4\linewidth}
		\includegraphics[width=\linewidth]{figures/runtime_rebuttal_RL_new_with_ts.pdf}
		\caption{Runtime.}
	\end{subfigure}
	\caption{Results demonstrating that our algorithms can be applied to experiments with a very large scale, using a larger version of the RL experiment (with $60\times 130=7800$ meta-observations).}
	\label{fig:scalability_2}
\end{figure}


\subsection{More Details on RM-GP-TS}
\label{app:ts:details}
In this section, we present more details on the practical implementation of our RM-GP-TS algorithm.
In all experiments, when sampling a function from the GP posterior, we use the random Fourier features (RFF) approximation~\citep{dai2020federated,rahimi2008random} with $m=120$ features.
First, we need to construct a set of random features. For an SE kernel with hyperparameters $l$ and $\sigma_k$ (i.e., $k(\mathbf{z})=\sigma_k^2e^{-\frac{\norm{\mathbf{z}}^2_2}{2l^2}}$, with $\mathbf{z}=\mathbf{x}_1-\mathbf{x}_2,\forall \mathbf{x}_1,\mathbf{x}_2\in\mathcal{D}$), we sample $m$ vectors $\{\mathbf{s}_i\}_{i=1,\ldots,m}$ from the $D$-dimensional Gaussian distribution $\mathcal{N}(\mathbf{0}, \frac{1}{l^2}\boldsymbol{I})$, and sample $m$ scalar values $\{b_i\}_{i=1,\ldots,m}$ from the uniform distribution over the interval $[0, 2\pi]$.
Next, for any input $\mathbf{x}\in\mathcal{D}$, its corresponding $m$-dimensional random features can be constructed as $\boldsymbol{\phi}(\mathbf{x})=[\sqrt{2/m}\cos(\mathbf{s}_i^{\top}\mathbf{x} + b_i)]^{\top}_{i=1,\ldots,m}$. Every $\boldsymbol{\phi}(\mathbf{x})$ is then normalized such that $\norm{\boldsymbol{\phi}(\mathbf{x})}^2_2=\sigma_k^2,\forall \mathbf{x}\in\mathcal{D}$.
Based on these, in order to (approximately) sample a function from the GP posterior, we first sample a vector $\boldsymbol{\omega}$ from the Gaussian distribution $\boldsymbol{\omega}\sim\mathcal{N}(\boldsymbol{\nu}_t,\sigma^2\boldsymbol{\Sigma}_t)$, with $\boldsymbol{\Sigma}_t=(\boldsymbol{\Phi}_t^{\top}\boldsymbol{\Phi}_t+\sigma^2 \boldsymbol{I})^{-1}$,  $\boldsymbol{\nu}_t=\boldsymbol{\Sigma}_t\boldsymbol{\Phi}_t^{\top}\mathbf{y}_t$, and $\boldsymbol{\Phi}_t=[\boldsymbol{\phi}(\mathbf{x}_1),\ldots,\boldsymbol{\phi}(\mathbf{x}_t)]^{\top}$.
Finally, we can use the sampled $\boldsymbol{\omega}$ to construct the sampled function such that $f^t(\mathbf{x})=\boldsymbol{\phi}(\mathbf{x})^{\top}\boldsymbol{\omega},\forall \mathbf{x}\in\mathcal{D}$.
As a result, as mentioned in Sec.~\ref{sec:om_gp_ucb}, for a meta-task $i$, in order to sample multiple functions from the GP posterior of the meta-function $f_i$ before the algorithm starts, we simply need to draw multiple samples of the vector $\boldsymbol{\omega}$ from the corresponding multivariate Gaussian distribution using the observations from meta-task $i$.
For both the target function and every meta-function, the kernel hyperparameters ($l$ and $\sigma_k$) used in the posterior sampling steps above are learned by maximizing the marginal likelihood (using full GP without RFF approximation), which is a common practice in BO.

\end{document}