\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
% if you need to pass options to natbib, use, e.g.:
%     \PassOptionsToPackage{numbers, compress}{natbib}
% before loading neurips_2023


% ready for submission
\usepackage{wrapfig}
\usepackage{algorithm}
\usepackage{algcompatible}

\usepackage{tikz}
\def\checkmark{\tikz\fill[scale=0.4](0,.35) -- (.25,0) -- (1,.7) -- (.25,.15) -- cycle;}

% to compile a preprint version, e.g., for submission to arXiv, add add the
% [preprint] option:
%     \usepackage[preprint]{neurips_2023}


% to compile a camera-ready version, add the [final] option, e.g.:
%     \usepackage[final]{neurips_2023}


% to avoid loading the natbib package, add option nonatbib:
%    \usepackage[nonatbib]{neurips_2023}


\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{subcaption}
\usepackage{color,soul}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{mathtools}
\usepackage{amsthm}
\usepackage{amsfonts}
\usepackage{thmtools}
\usepackage{thm-restate}
\usepackage[capitalize,noabbrev]{cleveref}
\usepackage{natbib}
% \theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
% \theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
% \theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
% \newtheorem{assumption}{Assumption}[section]
% \usepackage{neurips_2023}
% \DeclareMathOperator*{\argmax}{arg\,max}
% \DeclareMathOperator*{\argmin}{arg\,min}

\author[1]{\href{mailto:<mzfan@tamu.edu>?Subject=Multi-fidelity Bayesian Optimization with Multiple Information Sources of Input-dependent Fidelity}{Mingzhou Fan}}
\author[1,2]{Byung-Jun Yoon}
\author[1]{Edward Dougherty}
\author[2]{Nathan Urban}
\author[3]{\\Francis Alexander}
\author[4,5]{Raymundo Arr\'{o}yave}
\author[1,2,6]{Xiaoning Qian}
% Add affiliations after the authors

%  \address{$^{\star}$ 
% %       $^{\dagger}$ Computational Science Initiative, Brookhaven National Laboratory, Upton, NY}

\affil[1]{%
    Department of Electrical \& Computer Engineering\\ Texas A\&M University\\ College Station\\ Texas\\ USA\\
}
\affil[2]{%
    Computational Science Initiative\\ Brookhaven National Laboratory\\ Upton\\New York\\USA\\
}
\affil[3]{%
    Argonne National Laboratory\\Lemont\\Illinois\\USA\\
  }
\affil[4]{%
    Department of Materials Science \& Engineering\\Texas A\&M University\\ College Station\\ Texas\\ USA
  }


\affil[5]{%
    Department of Mechanical Engineering\\Texas A\&M University\\ College Station\\ Texas\\ USA
  }

\affil[6]{%
    Department of Computer Science \& Engineering\\Texas A\&M University\\ College Station\\ Texas\\ USA
  }

\title{Multi-fidelity Bayesian Optimization with Multiple Information Sources of Input-dependent Fidelity}

% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to break the
% lines. Using \AND forces a line break at that point. So, if LaTeX puts 3 of 4
% authors names on the first line, and the last on the second line, try using
% \AND instead of \And before the third author name.
\begin{document}
\maketitle
\begin{abstract}
  By querying approximate surrogate models of different fidelity as available information sources, Multi-Fidelity Bayesian Optimization (MFBO) aims at optimizing unknown functions that are costly if not infeasible to evaluate. Existing MFBO methods often assume that approximate surrogates have consistently high/low fidelity across the input domain. However, approximate evaluations from the same surrogate can have different fidelity at different input regions due to data availability and model constraints, especially when considering machine learning surrogates. In this work, we investigate MFBO when multi-fidelity approximations have input-dependent fidelity. By explicitly capturing input dependency for multi-fidelity queries in Gaussian Process (GP), 
  our new input-dependent MFBO~(iMFBO) with learnable noise models better captures the fidelity of each information source in an intuitive way.
  % To the best of our knowledge, this is 
  We further design a new acquisition function for iMFBO and prove that the queries selected by iMFBO have higher quality than those by naive MFBO methods, with the derived sub-linear regret bound. Experiments on both synthetic and real-world data demonstrate its superior empirical performance. %Our iMFBO with learnable noise models explains the fidelity of each information source in an intuitive way.
  % To the best of our knowledge, this is the first work that studies MFBO with learnable input-dependent fidelity.
\end{abstract}

\section{Introduction}
Bayesian Optimization (BO)~\citep{frazier2018tutorial} has been a powerful tool to optimize black-box functions. The term `black-box' here indicates that we do not have access to the analytic form of either the objective function or its derivatives. We can only gain information about them by querying selected inputs to evaluate, where each evaluation can be time-consuming with prohibitive costs. Usually, BO first learns a probabilistic model, Gaussian Process~(GP)~\citep{rasmussen2003gaussian} for example, from available evaluations as a surrogate of the black-box objective and then iteratively selects new input(s) to query guided by some acquisition function. The acquisition function is designed to be easier to optimize compared to the original objective and achieve the desired exploration and exploitation trade-off for efficient identification of global optimum. % of the current surrogate model.

In Multi-fidelity Bayesian Optimization (MFBO)~\citep{forrester2007multi}, % is an extension of BO, where 
instead of directly evaluating expensive objective functions, we can query their less resource-demanding approximation models. 
% For example for hyper-parameter tuning in AutoML, it is time-consuming to wait till convergence to observe the model performances of different hyper-parameter settings~\citep{7900023} while we can %optimize hyper-parameters by the intermediate models trained with fewer training epochs and 
% use the validation losses of intermediate models trained with fewer training epochs to approximate the actual model performances. Usually different fidelity of approximation models are associated with different costs, in hyper-parameter tuning, longer training time with more epochs indicates higher fidelity but requiring more time for training. \hl{do we still need hyperparameter tuning example here? }
% %At present, m
Most of the existing works on MFBO consider fixed fidelity for each approximation model and optimize the underlying function within the predefined budget of cost~\citep{NIPS2016_605ff764}. However, the fidelity of different approximation models may not be always fixed but is dependent on the input. This may arise in many adaptive reduced-order models and especially data-driven approximation models by recent machine learning (ML) methods. Typically, these approximation models tend to be more accurate in the `data-rich' regions and less accurate in the other regions with less data.
%, which have been successful in many real world applications. In some other applications, however, the fidelity of the approximation model may mot fixed but dependent on the input. This kind of situation may happen especially when data-driven machine learning models are chosen as approximation models, which tend to be more accurate in the `data rich' regions and less accurate in regions with less data.

In this work, we focus on the cases where the multi-fidelity approximations have varying fidelity for different approximation models and over the input space. We try to capture the varying fidelity by learning the input-dependent additive noise, usually ignored and considered as a hyper-parameter in many BO and MFBO methods.% We design a new acquisition function adopted from Upper Confidence Bound~(UCB)~\citep{srinivas2009gaussian} and theoretically prove that it will result in higher information gain than MFBO using the naive UCB acquisition function and extensions. We further empirically compare our input-dependent MFBO~(iMFBO) with existing baselines on both synthetic and real-world examples. 
% %\hl{we may want to have a good name for this?, maybe iMFBO? } the naive procedure and 
%

Our contribution in this work is three-fold:
% \vspace{-3mm}
\begin{enumerate}
    \item We adapted the heteroscedastic Gaussian Process to the multi-fidelity setup, in which the multi-fidelity approximations have varying fidelity over the input space as well as across different information sources, proposed the input-dependent MFBO~(iMFBO) framework, and extended it to cost-aware and bias-aware setups.
    \item Based on the surrogate modeling, we proposed a new acquisition function, Noise-Variant Upper Confidence Bound~(NVUCB),
    % by modifying Upper Confidence Bound~(UCB)~\citep{srinivas2009gaussian}
    and theoretically derived a sub-linear regret bound.
    \item We further empirically compare our iMFBO with existing baselines on both synthetic and real-world examples and demonstrate its superiority.
\end{enumerate}

% 



\section{Background}
% %\vspace{-2mm}
\subsection{Single Fidelity Bayesian Optimization} %(SFBO)}
% %\vspace{-1mm}

%In Bayesian Optimization, we try to iteratively optimize a ground-truth black-box objective function $f: \mathcal{X} \rightarrow \mathbb{R}$.do the prediction by distribution of the 

For single fidelity BO (SFBO), we iteratively get noisy observations $y$ of the ground-truth objective function $f: \mathcal{X} \rightarrow \mathbb{R}$ by querying selected inputs $x$, where $y=f(x) + \delta S$, $\delta$ is the noise scale, and $S \sim \mathcal{N}(0, 1)$ is the standard normal white noise. BO alternates between updating the surrogate model and selecting the next input to query, aiming to find the global optimum of $f(x)$ with the minimum number of queries.

In iteration $i$, we query one input $x_i$ in the input space $\mathcal{X}$, and gradually build a dataset $\mathcal{D}_t = \{(x_i, y_i)\}_{i\in \{1, 2, \dots, t\}}$, denoting $X = [x_1, x_2, \dots, x_t]$ and $Y = [y_1, y_2, \dots, y_t]$. 
GPs are well-studied probabilistic surrogate models and are commonly chosen in BO. Given $\mathcal{D}_t$, we can derive the predictive posterior assuming the GP prior for the ground-truth objective values: % follows a multi-variate Gaussian distribution:
\begin{equation}
[f(x_1), f(x_2), \dots, f(x_t)] \sim \mathcal{N}(m, K), \forall x_i \in \mathcal{X},
\end{equation}
where $m$ is the mean vector (usually chosen to be 0) and $K$ is the covariance matrix with entries $K_{i, j} = k(x_i, x_j)$, where $k(\cdot, \cdot)$ is a pre-defined kernel function. The prediction at $x$ is then %the Bayesian posterior
\begin{equation}
f(x)|_{\mathcal{D}_t} \sim \mathcal{N}(\mu_t(x), \sigma^2_t(x)),
\end{equation}
where $\mu_t(x) = K'K_t^{-1} Y$ is the posterior mean, $\sigma_t^2(x) = k(x, x) - K'K_t^{-1}K'^T$ is the posterior variance, $K_t = K+\delta^2 I$ is the covariance matrix of the observation with the observation noise $\delta$, and $K' = [k(x, x_1), k(x, x_2), \dots ,k(x, x_t)]$~\citep{rasmussen2003gaussian}.
Although the observation noise $\delta$ is usually considered stationary, there have also been works on GPs considering heteroscedastic noise setups~\citep{goldberg1997regression, kersting2007most, liu2020large}, which inspired us to extend the input-dependent noise to the BO setup. % showing the necessity of considering input-dependent noise

There are many commonly used acquisition functions in BO, such as Expected Improvement (EI)~\citep{jones1998efficient} and Probability of Improvement (PI)~\citep{kushner1964new}. Another widely studied acquisition function is the Upper Confidence Bound (UCB):
\begin{equation}
    \alpha_t(x) = \mu_t(x) + \beta_t^{\frac{1}{2}}\sigma_t(x).
    \label{eq:UCB}
\end{equation}
There are also many other complicated acquisition functions, especially the entropy-based ones including the Predictive Entropy Search (PES)~\citep{hernandez2014predictive} and Maximum-value Entropy Search (MES)~\citep{wang2017max}. These usually do not have analytic forms and require approximation or sampling methods to compute. 

% %\vspace{-2mm}
% \hl{please make sure to change the notations, making covariances K and approx. function index K difference...}
% \hl{make sure about references...}
\vspace{-2mm}
\subsection{Multi-Fidelity Bayesian Optimization}%~(MFBO)}
\vspace{-2mm}
For multi-fidelity BO~(MFBO), the ground-truth objective function $f$ is usually not able to be directly queried or evaluated without observation noise. Instead, we can query its different black-box approximation models, namely $f^j: \mathcal{X} \rightarrow \mathbb{R}$ at different cost $c_j$, where $j\in \mathcal{J} = \{1, 2, \dots, J\}$ indexes the approximation functions to query. Such situations are ubiquitous in many real-world applications. Many complex systems in reality, such as the climate system, are difficult to evaluate at will. Alternative to directly measuring the actual status, we can query different surrogate models such as physics-based simulation models or data-driven ML surrogates.

Unlike in classical BO, each iteration of MFBO necessitates the selection of both an input and an approximation model for querying.  Let us denote the observed data by $\mathcal{D}^{MF}_t = \{(x_i, y_i^{j_i})\}_{i\in \{1, 2, \dots, t\}}$, where $x_i$ represents the $i$th queried input and $y_i^{j_i}$ the corresponding evaluation result from the approximation model $j_i \in \mathcal{J}$. We refer to $\mathcal{D}^{MF}_t$ as $\mathcal{D}_t$ in the following sections for simplicity. Note that we do not necessarily have results from all the approximation models for a given input.
\vspace{-2mm}
\subsection{Related Work}
\vspace{-2mm}
The design of MFBO also considers two critical components as in BO: the surrogate modeling and the acquisition function derivation.

Many different probabilistic models have been proposed as surrogates in MFBO, including independent GPs for different approximation models~\citep{lam2015multifidelity}, Convolved Multi-Output Gaussian Process~\citep{zhang2017information}, hierarchical (co-)kriging~\citep{shu2021multi, poloczek2017multi}, recent Bayesian Neural Networks (BNNs)~\citep{li2020multi,NEURIPS2021_d5e2c0ad}, and the Semiparametric Latent Factor Model~(SLFM)~\citep{teh2005semiparametric}, which is a Gaussian process-based multiple response model~\citep{pmlr-v119-takeno20a}. % adopted.
%Heteroscedastic GPs are also considered as BO surrogates for both SFBO~\citep{kirschner2018information} and MFBO~\citep{bogunovic2016truncated}. 
All the surrogate models previously mentioned for MFBO necessitate a pre-determined fixed fidelity. This can be achieved either by establishing correlations between approximation models as hyper-parameters, as illustrated in the works of~\cite{lam2015multifidelity},~\cite{zhang2017information}, and~\cite{teh2005semiparametric}, or by utilizing the low-fidelity approximation results as inputs to the high-fidelity surrogates. The latter approach is exemplified in the research by~\cite{shu2021multi}, \cite{li2020multi}, and in the later publication~\citep{NEURIPS2021_d5e2c0ad}.

Several MFBO acquisition functions have been derived from their classical counterparts in BO, each tailored to different models. 
While there are analytical acquisition functions, such as MF-GP-UCB~\citep{NIPS2016_605ff764}, the majority tend to be entropy-based. This trend stems from the inherent nature of MFBO, where the queried evaluations originate from multiple sources. For instance, the MF-MES~\citep{pmlr-v119-takeno20a, li2020multi, NEURIPS2021_d5e2c0ad} is derived from the MES, and the MF-PES~\citep{zhang2017information} is adapted from the PES used in BO.

There are also works considering different MFBO setups. % different from the general ones. 
For example, \cite{song2019general} proposed MF-MI-Greedy aiming at 
%\hl{maximizing regret?reduction} 
minimizing regret when querying high-fidelity evaluations is mandatory after spending a specified budget on lower-fidelity models.
% (meaning?)}
\cite{pmlr-v70-kandasamy17a} considered approximation models with fixed but continuous fidelity.

Treating the observations as having input-dependent noise and modeling them with a heteroscedastic GP has previously been studied for SFBO~\citep{makarova2021risk, tautvaivsas2023heteroscedastic, griffiths2021achieving, kirschner2018information}, sometimes termed Risk-Averse BO, which aims at optimizing the target function while restricting the risk. In this work, we extend the use of the heteroscedastic GP as the surrogate model of MFBO and use the input-dependent noise to model the fidelity over different approximation models as well as over the input space.

Similar BO setups that allow querying different information sources have been given different names such as MFBO~\citep{NIPS2016_605ff764} or Multi-Information-Source BO~\citep{poloczek2017multi}. We use MFBO for simplicity in this paper. %term this type of problem as Multi-fidelity Bayesian Optimization with Multiple Information Source, and MFBO for short. 
To contextualize our contributions within the existing literature, we present the first-ever input-dependent MFBO~(iMFBO) methodology that takes into account learnable input-dependent fidelity for queried evaluations facilitated by heteroscedastic learnable noise models.  

In this work, we are considering BO with different evaluation or information sources that have different ``fidelity'' compared to the ground truth, and the goal is to efficiently utilize the information from different sources to optimize the target function similar to the majority of MFBO work. While the term ``MFBO'' in the literature usually refers to BO problems with evaluation models with different costs and fixed fidelity, we want to note that this work considers a more flexible setup that is not restricted to fixed high-low fidelity but considers input-dependent fidelity to incorporate more complicated real-world scenarios.


\section{Method}
\subsection{Surrogate Modeling}
Here we %introduce our Bayesian Optimization~(BO) with evaluations from approximation models that have input-dependent noise. We will 
first present the surrogate modeling for BO with evaluations from different information sources with approximation models that may have input-dependent noise. We then derive input-dependent BO methods with both single and multiple approximation models, which we respectively denote as iBO and iMFBO. %, respectively.  %\hl{with both single and multiple approximation models, which we term as iBO and iMFBO respectively. }

%%\vspace{-3mm}
Similar %as  
to existing BO methods, we model the objective function $f$ to optimize by a GP. 
%We would like to emphasize that in this work, we focus on the situations where only evaluations from approximation models are available and different evaluations from the corresponding approximation model may have input-dependent fidelity, hence the deviation from the underlying objective function $f$ can change based on the input $x$. 
We would like to underscore that this study primarily focuses on scenarios where only evaluations from information sources with approximation models are accessible. In these cases, different evaluations from the corresponding approximation model may exhibit input-dependent fidelity. Consequently, the deviation from the underlying objective function $f$ can vary in accordance with the input $x$. We aim to capture input-dependent fidelity of different evaluations for more efficient BO. 
Most of the existing BO methods ignore the evaluation noise and model evaluations by GPs with independent additive Gaussian noise of the fixed variance $\delta^2$. Here, we explicitly model the input-dependent fidelity by additive Gaussian noise with an input-dependent variance  as a random variable $\delta^2(x)$.
% In this work, we model the evaluations by each of the approximation model by Gaussian Processes: $f_k(x)$, \st{ to be Gaussian distributed} with mean $f(x)$ and input-dependent variance $\delta_k^2(x)$, i.e. $$f_i(x) \sim \mathcal{N}(F(x), \delta_i^2(x)) = f(x) + \delta_i(x)  S,$$ 
% and we use a zero mean GP to model $f$, $f(x) \sim \mathcal{GP}(0, k(\cdot, \cdot))$, where $k$ is the kernel function. To capture the input dependent noise, we model each of the noise function $\delta_{j}$ with parameter $\theta_k$, denoted as  $\delta_{\theta_k}$.

The predictive posterior $p(f(x)|\mathcal{D}_t)$ with GP can be derived by marginalizing out the noise variable $\delta$: 
\begin{equation}
\label{eq:posterior}
% p(\hat{f}(x)|\mathcal{D}_t) = \int_\theta p(\hat{f}(x)|\theta, \mathcal{D}_t) p(\theta|\mathcal{D}_t) d\theta.
p(f(x)|\mathcal{D}_t) = \int_\delta p(f(x)|\delta, \mathcal{D}_t) p(\delta|\mathcal{D}_t) d\delta.
\end{equation}
% \hl{I felt things should be better described: ground-truth objective, approximation models, and GP-surrogates for MFGO, should be different...}
In this setting, the GP surrogate modeling of any input-dependent approximation model of the ground-truth objective function is
%$\hat{f}_{a}(x) \sim \mathcal{N}(\hat{f}(x),
% $f^{j}(x)|_{\delta_j(x)} \sim \mathcal{N}(f(x),\delta_j^2(x))$, 
$f^{j}(x) = f(x) + \delta_j(x) S$, 
where $j \in \mathcal{J}$ is the index of information sources, and $\delta_j(x)$ is now an input-dependent random variable and $\delta = [\delta_1, \dots, \delta_{J}]$ and $S\sim\mathcal{N}(0, 1)$.%\hl{$\delta$ is not constant}


As stated before, to infer the posterior distribution of $f$, $f(x)|_{\delta, \mathcal{D}_t}$, we need to define the prior on the queried noisy evaluation(s), for which the covariance takes the following form: % in our case: 
\begin{equation}
    \text{cov}[f^{j}(x), f^{j'}(x')] = k(x, x') + \mathbb{I}(j, j')\mathbb{I}(x, x')\delta_j^2(x),
\end{equation}
where $\mathbb{I}(a, a')$ is the indicator function, and $\mathbb{I}(a, a')=1$ when $a = a'$ and 0 otherwise.

At each iteration $t$, our prediction for the ground-truth objective $f(x)$ on input $x$ can be written as: 
% $$f^{(t)}(x)|_\sigma := f(x)|_{\mathcal{D}_t, \sigma} \sim \mathcal{N}(\mu_t(x), \sigma_t^2(x)),$$
\begin{equation}
    f_t(x)|_\delta := f(x)|_{\delta, \mathcal{D}_t} \sim \mathcal{N}(\mu_t(x), \sigma_t^2(x)),
\end{equation}
where the posterior mean is $\mu_t(x) = K'\hat{K}_t^{-1} Y_t$, and $Y_t = [y_1^{j_1}, y_2^{j_2}, \dots, y_t^{j_t}]$ denotes the previous evaluation results.
% \hl{need to redefine Y?} 
The updated posterior variance becomes $\sigma_t^2(x) = k(x, x) - K'\hat{K}_t^{-1}K'^T$, and the covariance matrix of the observations  becomes $\hat{K}_t = K+ \Lambda(\delta^2_{j_1}(x_1), \dots, \delta^2_{{j_t}}(x_t))$, where $\Lambda(\delta^2_{j_1}(x_1), \dots, \delta^2_{j_t}(x_t))$ denotes a diagonal matrix with diagonal entries being $\delta^2_{j_1}(x_1), \dots, \delta^2_{{j_t}}(x_t)$. 

Here we have the form of $p(f(x)|\delta, \mathcal{D}_t)$. 
The other important component of the posterior~\eqref{eq:posterior} is $p(\delta|\mathcal{D}_t)$. Naturally, we believe that the input-dependent fidelity should have a certain level of continuity and, in turn, can be modeled either parametrically or non-parametrically. An example illustrating the performance of these surrogate models can be found in Appendix~\ref{sec:surrogate_diff}.

\subsubsection{Parametric Noise Model}\label{sec:pnm} 

One way to capture the input-dependent noise scale $\delta(x)$ is by learnable parametric models, such as linear models or MLPs, denoted by $\delta_\theta(x)$. The posterior of the objective function is then transformed to:
\begin{equation}
\label{eq:posterior_para}
p(f(x)|\mathcal{D}_t) = \int_\theta p(f(x)|\theta, \mathcal{D}_t) p(\theta|\mathcal{D}_t) d\theta.
\end{equation}
In this setup, we aim to learn the posterior distribution of the parameters and use the model structure to preserve the continuity of $\delta(x)$.
%%\vspace{-3mm}
\paragraph{Model parameter posterior $p(\theta|\mathcal{D}_t)$:}
The posterior of the learnable parameters $\theta$ that model input-dependent fidelity, $p(\theta|\mathcal{D}_t)$, is another important component in~\eqref{eq:posterior_para}.
By Bayes' rule, 
\begin{equation}
p(\theta|\mathcal{D}_t) \propto p(\theta)p(\mathcal{D}_t|\theta),
\label{eq:theta_posterior}
\end{equation}
where $p(\theta)$ is the prior distribution of the model parameters and $p(\mathcal{D}_t|\theta)$ is the likelihood.
While the prior distribution is usually selected beforehand, the likelihood takes the form: 
\begin{equation}
p(\mathcal{D}_t|\theta) = (2\pi)^{-\frac{t}{2}}|\hat{K}_{t}|^{-\frac{1}{2}}\exp\left(-\frac{1}{2}Y_t^T(\hat{K}_{t})^{-1}Y_t\right).
\end{equation}
%%\vspace{-6mm}
\paragraph{Sampling:} 
Although we have the analytic forms of the prior and likelihood in our settings, Bayesian inversion to update the posterior of $\theta$ given queried evaluations usually does not have an analytic closed-form solution because of the integral in~\eqref{eq:posterior}. % is usually intractable. 
One of the strategies to deal with the unnormalized distribution in~\eqref{eq:theta_posterior} is the No-U-Turn Sampler~(NUTS)~\citep{hoffman2014no}, a variant of Hamiltonian Monte-Carlo (HMC)~\citep{betancourt2017conceptual}, which enables efficient sampling from unnormalized distributions.

Given the samples $\Theta = \{\theta^1, \theta^2, \dots, \theta^M\}$ of the posterior $\theta|_{\mathcal{D}_t}$, the posterior distribution in~\eqref{eq:posterior} can be estimated by: 
\begin{equation}
\label{eq:posterior_sam}
p(f(x)|\mathcal{D}_t) \approx \frac{1}{M}\sum_{\theta^m \in \Theta} p(f(x)|\theta^m, \mathcal{D}_t),
\end{equation}
where $m \in \{1, 2, \dots, M\}$ and $M$ is the number of samples.

\subsubsection{Non-parametric Noise Model}
%The noise variance can also be modeled non-parametrically.  for the prior of $\delta(x)$

We also apply a non-parametric GP noise model. Assuming that the values $\delta(x_i)$ at the available data points are jointly Gaussian distributed, the posterior can be written as
\begin{equation}
\label{eq:posterior_nonpara}
p(f(x)|\mathcal{D}_t) = \int_{\delta_t} p(f(x)|\delta_{t}, \mathcal{D}_t) p(\delta_{t}|\mathcal{D}_t) d\delta_{t},
\end{equation}
where $p(\delta_{t}|\mathcal{D}_t) \propto p(\delta_{t})p(\mathcal{D}_t|\delta_{t})$, and $\delta_t$ denotes the random vector $[\delta_{j_1}(x_1), \dots, \delta_{j_t}(x_t)]$. 

In this non-parametric setting, we update the posterior distribution of the input-dependent variance for each input sample, and the continuity is captured by the GP prior. % distribution $p(\delta_{t})$.

GP-modeled $\delta(x)$ will face the same intractability problem as in the parametric setup. Furthermore, $\delta(x)|_{\mathcal{D}_t}$, which plays an important role in suggesting new samples, is also intractable because of the non-Gaussian likelihood $p(\mathcal{D}_t|\delta_t)$.

Sampling methods can be time-consuming in this case because when having a batch of $M$ samples $\hat{\delta}(x)$, computing $\delta(x)|_{\hat{\delta}_t}$ has the complexity of $\mathcal{O}(M t^3)$ in each iteration. %\hl{in future, this might be able to solved by Prof. Tuo's kernel packets?}

In practice, we propose to apply the \emph{Maximum a Posteriori}~(MAP;~\citealp{murphy2012machine}) point estimate $\bar{\delta}_t$ by maximizing the posterior $p(\mathcal{D}_t|\bar{\delta}_t)p(\bar{\delta}_t)$ and then use $\delta(x)|_{\bar{\delta}_t}$ to approximate $\delta(x)|_{\mathcal{D}_t}$. The corresponding estimated posterior becomes
\begin{equation}
\label{eq:posterior_nonp}
p(f(x)|\mathcal{D}_t) \approx  p(f(x)|\bar{\delta}_{t}, \mathcal{D}_t),
\end{equation}
and $\delta(x)|_{\bar{\delta}_t}$ can be acquired by ordinary GP updates.

Comparing the posterior distributions from our setting and traditional BO settings, the main difference of our updated posterior covariance matrix $\hat{K}_t$ from the covariance $K_t$ in previous settings is that we have replaced the constant noise variance $\delta^2$ with an input-dependent noise variance $\delta^2(x)$. By doing this, different approximation models are dependent by modeling the covariance $k(x, x')$, instead of being mutually independent as in~\cite{pmlr-v70-kandasamy17a}. We also capture the input-dependent fidelity by observing that the correlation of the evaluations from the approximation model(s) and the ground-truth objective function is $\text{corr}(f^j(x), f(x)) = \frac{k(x, x)}{\sqrt{k(x, x)}\sqrt{k(x, x) + \delta_{j}^2(x)}} = \frac{1}{\sqrt{1+\frac{\delta_{j}^2(x)}{k(x, x)}}}$, which is again dependent on input $x$.
% \hl{may need to discuss the indication of the above form... dependency captured in k, while noise covariance is input-dependent...}

\subsection{Noise-Variant UCB~(NVUCB)}
With the previously described surrogate model updates, we now investigate the acquisition function for iBO.
% \hl{With the previously described surrogate model updates, we now investigate the acquisition function for input-dependent BO. }
To achieve better sample efficiency, we first propose a new acquisition function---\emph{Noise-Variant UCB}~(NVUCB)---for single fidelity BO when the observation noise $\delta(x)$ is dependent on $x$: 
\begin{equation}\label{eq:NVUCB}
    \alpha^{NV}_{t}(x) = \mu_t(x) + \beta^{\frac{1}{2}} \frac{\sigma_t(x)}{\sqrt{\sigma^2_t(x) + \delta^2(x)}} \sigma_t(x).
\end{equation}
Recall that the original UCB for BO takes the form of~\eqref{eq:UCB}; our proposed NVUCB is basically the UCB with the standard deviation scaled by the factor $\gamma_t(x) = \frac{\sigma_t(x)}{\sqrt{\sigma^2_t(x) + \delta^2(x)}}$.

Our NVUCB acquisition function can easily be extended to multi-fidelity BO by considering multiple approximation models, named as \emph{Multi-Fidelity NVUCB}~(MFNVUCB):  
\begin{equation}\label{eq:MFNVUCB}
\resizebox{0.89\hsize}{!}{
    $\alpha^{MFNV}_{t}(x, j) = \mu_t(x) + \beta^{\frac{1}{2}} \frac{\sigma_t(x)}{\sqrt{\sigma^2_t(x) + \delta_j^2(x)}} \sigma_t(x). $}
\end{equation}
This proposed iMFBO follows the usual BO framework with our input-dependent surrogate model and MFNVUCB acquisition function, which is summarized in Algorithm~\ref{alg:iMFBO} in Appendix~\ref{sec:alg}.


To explain the reason for the factorization of the deviation in~\eqref{eq:NVUCB}, we go back to the idea of the original UCB~\eqref{eq:UCB}. The first term $\mu(x)$ is designed for exploiting the surrogate estimation of potential optimal solutions and the second term encourages exploration into unknown regions in the design space.

While it is natural to consider the inferred variance $\sigma^2(x)$ as quantified model uncertainty to guide exploration, we can also be more explicit and directly consider the potential reduction of the variance after querying $x$ with noisy observations. 

Consider a potential candidate $x$, in input-dependent fidelity settings with $\mathcal{N}(0, \delta^2(x))$ observation noise, our surrogate of the ground-truth objective function at iteration $t$ is $f_t(x) \sim \mathcal{N}(\mu(x), \sigma^2(x))$, and the potential observation $f^a(x) \sim \mathcal{N}(f_t(x), \delta^2(x))$. The posterior when the observation is $y$ is then $f_t|_{y} \sim \mathcal{N}(\frac{\delta^2(x) \mu(x) + \sigma^2(x) y}{\delta^2(x) + \sigma^2(x)}, \frac{\delta^2(x)}{\sigma^2(x) + \delta^2(x)}\sigma^2(x))$. The variance reduction is $\text{Var}(f_t) - \text{Var}(f_t|_{y}) = \gamma^2(x) \sigma^2(x)$, which is exactly reflected in our factored variance term in NVUCB and MFNVUCB. 
In a noiseless setup, i.e. $\delta(x) = 0$, $\gamma(x) = \frac{\sigma(x)}{\sqrt{\sigma^2(x) + 0}} = 1$ so NVUCB would become original UCB. Compared to the penalty terms applied in~\citet{griffiths2021achieving, makarova2021risk}, ours has similar penalty power for noisier points, but ours is derived from the information gain formulation and leads to our theoretical results. An illustration of different acquisition functions can be found in Appendix~\ref{sec:illustrate_accu}.


Though the discussion in this section is under the unbiased evaluations assumption and equal-cost setup, we want to note that this iMFBO is capable of being extended to a bias-aware and cost-aware version and we discuss such extensions in Appendix~\ref{app:bias} and~\ref{sec:cost}, respectively.
\section{Theoretical Results}
As another way to illustrate the importance of the factor $\gamma(x)$,  we show that it also shows up in the information gain of the ground-truth after observing noisy evaluations.

% \begin{restatable}{proposition}{pone}
\begin{proposition}\label{prop:IG}
Given a set of input samples $[x_1, x_2, \dots, x_n]$, the information gain of the ground-truth function $f(x)$ after querying approximation model $f^a$, with observation noise variance $\delta^2(x)$, getting $F^a_n = [f^a(x_1), f^a(x_2), \dots, f^a(x_n)]$ is
\begin{equation}
\label{eq:IG}
I(f;F^a_n)  = -\frac{1}{2}\sum_{i=1}^n \log(1-\gamma_i^2(x_i)),
\end{equation}
\end{proposition}
% \end{restatable}
%%\vspace{-3mm}
where $\gamma_i(x_i) = \frac{\sigma_i(x_i)}{\sqrt{\sigma^2_i(x_i) + \delta^2(x_i)}}$, $\sigma_i^2(x_i)$ is the predictive variance after observing $F^a_i = [f^a(x_1), f^a(x_2), \dots, f^a(x_i)]$.


Since $-\frac{1}{2} \log(1-\gamma^2)$ is monotonically increasing with respect to $\gamma$, iBO guided by NVUCB, which encourages querying samples with larger $\gamma_t$ values at iteration $t$, is more likely to result in more informative queries when exploring the input space. Formally, we have the following main theorem:%\hl{following main theorem}: 

% the variance reduction can be derived to be $\gamma^2(x) \sigma^2(x)$,
% This form can be acquired by Bayesian inference.

\begin{assumption}\label{assumption:f_GP}
The ground-truth target function $f$ is sampled from a Gaussian Process with a kernel $k(x, x')$.
\end{assumption}

\begin{theorem}\label{thm:theorem1}
Suppose the latent ground-truth $f$ satisfies Assumption~\ref{assumption:f_GP}. Let $x_v$ denote the candidate selected by the proposed NVUCB acquisition function~\eqref{eq:NVUCB} and $x_u$ the candidate selected by the original UCB~\eqref{eq:UCB}. 
Then at least one of the following statements holds true:
%%\vspace{-3mm}
\begin{itemize}
    \item \textbf{S1}: The information gain of ground-truth $f$ after querying approximation model $f^a$, with observation noise variance $\delta^2(x)$ at $x_v$ can be lower bounded 
    % (please make sure this is correct and reflects the inequality below)} 
    by that at $x_u$,
$I(f;f^a(x_v)) \geq I(f;f^a(x_u))$;
%\vspace{-3mm}
\item \textbf{S2}: The predictive mean of the selected sample $\mu(x_v) > \mu(x_u)$.
\end{itemize}
\end{theorem}

%The theorem conveys the message that 
Compared to the original UCB acquisition function, the NVUCB acquisition function would either get more informative queries (\textbf{S1}), tend to exploit the current model (\textbf{S2}), or achieve both in SFBO setup.

\begin{figure*}[t]
    \begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{figure/quad/target.png}
    \caption{Approximation models}
    \label{subfig:toy_target}
    \end{subfigure}
    \begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{figure/quad/100random_samples.png}
    \caption{100 samples}
    \label{subfig:toy_100samples}
    \end{subfigure}
    \begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{figure/quad/predictive_mean.png}
    \caption{Predictive mean}
    \label{subfig:100mean}
    \end{subfigure}\vspace{-2mm}
    \caption{(a) The approximation models and the ground truth. Although the approximation models are deterministic, we show here that the model prediction can also benefit from treating the bias as input-dependent noise.
    % \hl{(this may be difficult to understand unless explained as in Fig.2 caption)}
    (b) 100 randomly drawn samples, 50 from each of the approximation models. (c) The predictive mean of the surrogate models. The curve labeled ``GP with learnt noise'' is the proposed GP with learnable noise, while ``GP(1)'' and ``GP(2)'' are GPs fitted to approximation models $f^1$ and $f^2$, respectively. We also plot the average of ``GP(1)'' and ``GP(2)'' as ``GP(average)''.}
    %Each of the lines in ~\ref{subfig:20}~\ref{subfig:500} is generated by a sample of the parameter posterior sampled by No-U-Turn Sampler (NUTS)~\cite{hoffman2014no}. \ref{subfig:500non} illustrates the MAP estimation of the input-dependent noise with 500 data samples.}
    \label{fig:model_illu}\vspace{-2mm}
\end{figure*}

A sublinear regret bound can also be derived for MFNVUCB-guided iMFBO in the Multi-Fidelity setup to be $\mathcal{O}(\sqrt{\beta_T I^{max}_T T})$ under mild assumptions, following~\cite{srinivas2009gaussian}. Formally, we prove the following theorem. 
\begin{assumption}\label{assumption:D}
The target function $f$ is defined on a domain $D \subset [0, r]^d$ that is compact and convex, with $d \in N$ and $r > 0$. 
\end{assumption}
\begin{assumption}\label{assumption:k}
The kernel $k(x, x')$ defined in Assumption~\ref{assumption:f_GP} satisfies the following high probability bound on the derivatives of GP sample paths $f$: There exist constants $a, b > 0$,
    \begin{equation}\nonumber
    Pr\{\sup_{x \in D}|\partial{f}/\partial{x_k}| > L\} \leq a e^{-(L/b)^2}, k = 1,\dots, d.
    \end{equation}
\end{assumption}
\begin{assumption}\label{assumption:delta}
The observation noise $\delta_j(x)$ for any information source $j$ satisfies $\delta_{min}\leq \delta_j(x) \leq \delta_{max}$.
\end{assumption}

\begin{theorem}\label{thm:regret}
    For a constant $\epsilon \in (0, 1)$, and $\beta_t = 2\log(t^2\pi^2/(3\epsilon)) + 2d\log(t^2dbr\sqrt{\log{(4da/\epsilon)}})$, 
    performing MFNVUCB for a target $f$ satisfying Assumptions~\ref{assumption:f_GP}, \ref{assumption:D}, and~\ref{assumption:k} with observation noise satisfying Assumption~\ref{assumption:delta}, we have
    \begin{equation}
    Pr\{R_T \leq (\sqrt{C_{\delta_{min}}} + 1)\sqrt{2\delta_{max}^2 \beta_T  I^{max}_T T} + \frac{\pi^2}{6} \} \geq 1-\epsilon, 
    \end{equation}
    where $R_T = \sum_{t=1}^T [f(x^*) - f(x_t)]$, $I^{max}_T$ is the maximum information gain at iteration $T$, and the constant $C_{\delta_{min}} > 1$ is related to $\delta_{min}$.
\end{theorem}
The proofs for Proposition~\ref{prop:IG}, Theorem~\ref{thm:theorem1}
% Lemma~\ref{thm:lemma1}, Lemma~\ref{thm:lemma2},
and Theorem~\ref{thm:regret} can be found in Appendix~\ref{sec:proofs}.

\begin{figure*}
\begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{acquisition_function_illustration/our.png}
    \caption{NVUCB}
    \label{subfig:NVUCB}
    \end{subfigure}
    \begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{acquisition_function_illustration/our_nfc.png}
    \caption{NUCB}
    \label{subfig:NUCB}
    \end{subfigure}
    \begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{acquisition_function_illustration/naive.png}
    \caption{Separated GP}
    \label{subfig:SGP}
    \end{subfigure}\vspace{-3mm}
    \caption{The corresponding acquisition functions are illustrated as dashed lines based on the learned surrogate models of the ground-truth for (a) NVUCB, (b) NUCB, and (c) Separated GP, respectively. The shaded region is 1-$\sigma$ confidence region. The number in parentheses indexes the corresponding approximation model, e.g. NVUCB(1) in (a) is by~\eqref{eq:MFNVUCB} with $j=1$. The solid red line illustrates the latent ground truth $f(x)$.}% 
    \label{fig:acqui_illu}
    % \vspace{-3mm}
\end{figure*}



% \hl{there is no $j$ on the right hand...}
% \begin{wrapfigure}{r}{.32\textwidth}%\vspace{-0mm}
%     \begin{minipage}{\linewidth}
%     \centering\captionsetup[subfigure]{justification=centering}
%     \includegraphics[width=\linewidth]{toy_example/2_fids.png}
%     \subcaption{MFBO: \emph{sin} wave}
%     \label{subfig:2fidsin}\par\vfill
%     \includegraphics[width=\linewidth]{figure/quad/ten_BO.png}
%     \subcaption{MFBO: toy}
%     \label{subfig:toy_MFBO}
%     \includegraphics[width=\linewidth]{toy_example/fid_choice.png}
%     \subcaption{Queried samples}
%     \label{subfig:qs}
% \end{minipage}
% \caption{(a)(b) Identified maximal ground-truth values by iMFBO over 10 independent runs with random initialization. The error bar represents the $1-\sigma$ confidence interval. (c) The queried evaluations by iMFBO with different acquisition functions: the queried evaluations from $f^1$ are plotted red and those from $f^2$ are blue.}\label{fig:5}%\vspace{-25mm}
% \end{wrapfigure}





We note that the learnt input-dependent noise $\delta(x)$ is treated as a random variable, and the acquisition functions used in the experiments are computed by taking the expectation over the distribution of the corresponding noise $\delta(x)$.

\section{Numerical Results}
% \hl{a couple of sentences summarize the experiments (purpose, etc.)}
We first show, using a toy example, that our surrogate modeling and acquisition function can capture input-dependent fidelity and approximate the ground-truth more efficiently. We then illustrate the performance of our proposed iMFBO methods with both the toy example and well-known benchmarking optimization targets. The performance of iBO is discussed in Appendix~\ref{sec:sfbo}.  Finally, we apply NVUCB to a real-world materials discovery dataset~\citep{zhuo2018predicting}, for which we aim to maximize the band-gap of nonmetal materials. 

% \hl{cite}  %\hl{the band-gap of materials.}
% \vspace{-5mm}
\subsection{Surrogate Model Performance}\label{sec:surrogate}

We illustrate the effectiveness of capturing input-dependent fidelity by our surrogate modeling strategy with a toy example in Figure~\ref{fig:model_illu}, where the ground truth is $f(x) = -(x^2 - 1)\cos(3\pi x), x \in [-1, 1]$, which has two local maxima (left and right) and one global maximum (center). Two different deterministic approximation models are considered, $f^1(x) = f(x) + 0.5(x + 1) \sin(32\pi x)$ and $f^2(x) = f(x) - 0.5(x - 1) \sin(32\pi x)$, each of which has an observation error compared to the target. We show the predictive mean of our proposed heteroscedastic noise GP (``GP with learnt noise'') and of regular GPs with constant observation noise (``GP(1)'' and ``GP(2)''). It can be observed that although ``GP(1)'' and ``GP(2)'' are able to correctly identify three peaks, they fail to identify the global maximum, whereas our model can, as shown in Figure~\ref{subfig:100mean}.

\begin{figure*}[t]
\begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{toy_example/2_fids_log.png}
    \caption{MFBO: \emph{sin} wave}
    \label{subfig:2fidsin}
\end{subfigure}
\begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{figure/quad/ten_BO_log.png}
    \caption{MFBO: toy}
    \label{subfig:toy_MFBO}
\end{subfigure}
\begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{toy_example/fid_choice_new.png}
    \caption{Queried samples}
    \label{subfig:qs}
\end{subfigure} %\vspace{-5mm}
\caption{(a)(b) Identified maximal ground-truth values by iMFBO over 10 independent runs with random initialization. The y-axis illustrates the log-regret $\log(1-y^*)$, where $y^*$ is the largest value queried and $1$ is the optimum for both target functions, and the shaded area represents 1-$\sigma$ confidence. (c) The queried evaluations by iMFBO with different acquisition functions: the queried evaluations from $f^1$ are plotted in red and those from $f^2$ are blue.}\label{fig:5}
\vspace{-3mm}
\end{figure*}
\vspace{-4mm}
\subsection{Acquisition Function Illustration}\label{sec:acqui}
\vspace{-2mm}

% \begin{wrapfigure}{r}{0.32\linewidth}
%     \includegraphics[width=\linewidth]{toy_example/2_fids.png}
%     \caption{iMFBO performance}
%     \label{subfig:2fidsin}
%     \end{wrapfigure}


The performance of our acquisition function is first tested with a toy example, where the ground truth is a \texttt{sin} wave over one period, $f(x) = \sin(2\pi x), x\in [0, 1]$. We consider two approximation models with the corresponding linear additive noise:
$f^j(x) = f(x) + (a_j x + b_j) S$, where $a_1 = 0.5$, $b_1 = 0$, $a_2 = -0.5$, $b_2 = 0.5$, and $S$ denotes the standard normal distributed noise, i.e., $f^1$ has higher fidelity when $x$ is small while $f^2$ is more precise when $x$ is large.

For comparison with the MFBO proposed in~\citet{NIPS2016_605ff764}, which directly applies UCB on the approximation models, we refer to the acquisition function focusing only on the approximation models here as \emph{Noise UCB}~(NUCB):
%\vspace{-3mm}
\begin{equation}\label{eq:NUCB}
    \mbox{NUCB:}\quad \alpha^{N}_{t}(x) = \mu_t(x) + \beta^{\frac{1}{2}} \sqrt{\sigma^2_t(x) + \delta^2(x)}.
\end{equation}
The acquisition function values with $20$ randomly sampled data points, trained with the parametric linear noise model, are illustrated in Figure~\ref{fig:acqui_illu}. We compare the performance of NVUCB (Figure~\ref{subfig:NVUCB}) and NUCB (Figure~\ref{subfig:NUCB}). We also fit ``Separated GPs'' for the two approximation models and compute the UCB acquisition function values for each of them, as illustrated in Figure~\ref{subfig:SGP}. The cost of evaluating either of the two approximation models is set to $1$ equally. %, referred as Separated GP,

As the approximation model $f^1$ has higher fidelity when $x$ is small and $f^2$ is more accurate when $x$ is large, intuitively one tends to query the evaluation from the approximation model with higher fidelity. Our NVUCB acquisition, as in Figure~\ref{subfig:NVUCB}, conforms to this intuition by showing NVUCB(1) is larger when $x$ is closer to $0$ and smaller when $x$ is closer to $1$ compared to NVUCB(2).
% and \hl{??? indicates the NVUCB corresponds to $f^1$ is larger when $x$ is closer to $0$ and smaller when $x$ is closer to $1$ compared to the NVUCB corresponds to $f^2$.} 
However, NUCB has the opposite trend and does not make use of the learnt fidelity information as in Figure~\ref{subfig:NUCB}. Separated GPs in Figure~\ref{subfig:SGP}, as expected, do not properly handle the input-dependent noise and result in overestimating the model uncertainty. 

We then test iMFBO with the two approximation models, with the cost of querying either of the models set to 1. We compare iMFBO with NVUCB (MFNVUCB), NUCB~(MFNUCB), separated GPs (SepGP), and MF-MES~\citep{takeno2020multi}. SepGP indicates maintaining a Gaussian Process for each information source and performing BO over them. We also compare a two-step method, referred to as UCBTS, in which we learn input-dependent surrogates, choose the input by UCB, and then query the approximation model with the lowest noise at that input for the corresponding evaluation.  

Figure~\ref{subfig:2fidsin} illustrates the performance assessment results of these different iMFBO methods. % with two potential approximation model to query. The performance of 
It is clear that our MFNVUCB achieves the best MFBO performance and is more stable than the others. Although UCBTS shows better performance in the first few iterations, it fails to identify the global optimum with the increasing number of iterations. 
We have also tested the performance of the MFBO methods with the toy example in Section~\ref{sec:surrogate}, which is more complicated than the \emph{sin} wave, as illustrated in Figure~\ref{subfig:toy_MFBO}. We believe that our proposed surrogate model and acquisition function can better approximate the underlying function in this case and query better samples.
    
Figure~\ref{subfig:qs} plots the queried evaluations from the corresponding approximation models. We can see that during iMFBO iterations, our input-dependent fidelity GPs can capture the observation noise of the two approximation models %based on the fact that 
while SepGP
% \hl{UCBTS}
does not show a clear preference on which approximation model to query. MFNVUCB tends to query $f^1$ near the global optimum 0.25, which has higher fidelity and is more informative. In contrast, MFNUCB does the opposite, validating that our proposed surrogate modeling and NVUCB can help capture input-dependent fidelity and efficiently guide the selection of inputs and approximation models to query.

\vspace{-3mm}
\subsection{Benchmark Optimization Functions}\label{sec:syn}
% \vspace{-3mm}
\begin{table}
%%\vspace{-3mm}
    \centering
    \caption{Minimal ground-truth values among queried samples averaged over 10 independent runs.}\vspace{-2mm}
    \scalebox{0.83}{
    \begin{tabular}{c c c c }
    \hline 
         & Hartmann 6D & Branin & Levy\\
         \hline
    \textbf{NVUCB}&\textbf{-1.92}$ \pm $0.48 & \textbf{1.26} $ \pm $1.32& 2.009 $\pm$ 2.28\\
    NUCB&-1.76 $\pm$ 0.41 &7.02 $\pm$ 4.52 &6.87 $ \pm$ 7.49\\
    SepGP&-1.79 $\pm$ 0.33 &3.73 $\pm$ 4.18 &2.03 $ \pm$ 1.36\\
    MF-GP-UCB&-1.80 $\pm$ 0.18 &2.66 $\pm$ 1.50 &2.006 $ \pm$ 2.33\\
    MF-MES&-1.87 $\pm$ 0.26 &2.19 $\pm$ 1.27 &\textbf{0.98} $\pm$ 0.80\\
    \hline
    \end{tabular}}%%\vspace{-3mm}
    \label{tab:syn_BO}
    \vspace{-3mm}
\end{table}
In Table~\ref{tab:syn_BO}, we present the performance of our iMFBO along with MF-GP-UCB~\citep{NIPS2016_605ff764} and MF-MES~\citep{takeno2020multi} on minimizing three benchmark objective functions: Hartmann 6D, Branin, and Levy as mean and standard deviation over 10 independent runs. In all three cases, we set two approximation models with linear additive noise, with the cost of querying either approximation model again set to $1$. The detailed experimental settings can be found in Appendix~\ref{sec:exp}. We initialize the experiment with two randomly queried samples from each of the approximation models, i.e. four samples in total, and run the experiment for 50 iterations. For Hartmann and Branin functions, our iMFBO with the proposed surrogate modeling and NVUCB acquisition function performs the best while NUCB performs the worst, demonstrating that our surrogate model can capture the input-dependent fidelity and NVUCB utilizes it efficiently. Our iMFBO with NVUCB also performs comparably well to MF-GP-UCB when optimizing the Levy benchmark function. 
\vspace{-3mm}
\subsection{Materials Discovery Problem}\label{sec:real}
\vspace{-3mm}
\begin{figure}
% \vspace{8mm}
    \centering
    \includegraphics[width=\linewidth]{figure/real_no_cost_addr_three.png}%\vspace{-2mm}
    \caption{Maximal band-gap (eV) for queried compositions by different MFBO methods.}% \hl{using the same acronyms as in fig.4...} }
    \label{fig:real}
    \vspace{-3mm}
\end{figure}
We now test our iMFBO on a real-world materials dataset, for which we must find the material with the largest band-gap.
\vspace{-4mm}
\subsubsection{Data Collection}
\vspace{-2mm}
We consider the experimental dataset used and reported by~\cite{zhuo2018predicting} as the ground truth $f$. This dataset consists of 3,896 experimentally-characterized band-gap measurements from 2,458 unique inorganic compounds. These experimental band-gaps were obtained using a number of experimental techniques, including diffuse reflectance, resistivity measurements, surface photovoltage, photoconduction, and UV--vis measurements. 

Density Functional Theory (DFT) calculations are often used to predict a variety of material properties, including the band-gap~\citep{jain2013commentary}. These predictions can vary based on the structural and other configurations of the material. %Despite their utility, DFT computations serve only as approximations to real experimental results, especially when calculating band-gaps~\citep{ward2018matminer}. This limitation stems from the fact that DFT, strictly speaking, is a theory of the ground state of a material system, while the band-gap is essentially a property of the excited state. Consequently, for a single composition, the DFT-computed band-gaps can exhibit variation due to differences in material structural or other configurations. %A case in point is the material CuBr, for which six different results are reported in the open-access Material Project (MP) dataset~\cite{jain2013commentary}. This highlights the challenges associated with accurately predicting band-gaps using DFT. 
%In practical applications, 
We take the band-gap %for the desired composition, 
characterized by the smallest energy per atom as reported in the open-access Materials Project (MP) dataset~\citep{jain2013commentary}, which is considered a suitable approximation~\citep{ward2018matminer} %. This assumption is 
based on the principle that the compound with the lowest energy per atom corresponds to the predicted ground state for that particular chemical system. 
% Therefore, its band-gap is deemed the most accurate approximation to the experimental band-gap. 
By querying the MP with the compositions present in the ground-truth dataset, we successfully identified DFT %Density Functional Theory (DFT) 
band-gap calculations for 1,439 out of the 2,458 compositions. DFT band-gap values were not available in the MP for the remaining 1,019 compositions, which we use to train another data-driven surrogate based on the experimental measurements.

\vspace{-3mm}
\subsubsection{Preprocessing}%\hl{band-gap to f(x)}
\vspace{-2mm}
In dealing with duplicated compositions from the dataset provided by~\cite{zhuo2018predicting}, we opted to retain only the first reported entry, due to 
the absence of corresponding experimental conditions and energy per atom values within the dataset.


We consider querying three approximation models: (1) the DFT-calculated band-gap $f^1$ in MP, (2) the band-gap predicted by a pre-trained MLP $f^2$, and (3) the band-gap predicted by a pre-trained linear model $f^3$. The MLP and linear models are trained based on the 1,019 compositions without DFT band-gap values recorded in MP. The objective of iMFBO is to identify the material with the largest band-gap among the other 1,439 compositions. 
The experiments are initiated with one random sample from each approximation model. 
We evaluate performance over 20 independent runs with a budget ($B$) of 30, as illustrated in Figure~\ref{fig:real}. It is important to note that all queries in this experiment can only yield approximations (from DFT, MLP, or linear-model evaluations), and performance is assessed based on the experimental band-gaps reported in~\cite{zhuo2018predicting}. The details about the pre-trained MLP are provided in Appendix~\ref{sec:E}. 
We use the input of the MLP's final prediction layer as a feature extractor, distilling the original features down to two. All MFBO methods are performed in this extracted 2-dimensional space.

As shown in Figure~\ref{fig:real}, our MFNVUCB outperforms other models as anticipated. Interestingly, our UCBTS consistently surpasses SepGP, and we observe that MFNUCB yields superior final performance to SepGP within the budget. We also report the performance of the random selection policy (RS), which demonstrates the effectiveness of the BO methods. The advantage of our methods over SepGP likely stems from our input-dependent fidelity surrogate modeling, which utilizes queried evaluations more efficiently in conjunction with the corresponding acquisition functions.

\section{Conclusion}
In this study, we have introduced iMFBO, an innovative Multi-Fidelity Bayesian Optimization method. This approach models the input-dependent fidelity of each approximation model, formulates a novel acquisition function, NVUCB, and is capable of being extended to cost-aware and bias-aware setups. Our framework integrates the learned input-dependent fidelity to more effectively guide the adaptive query evaluation of corresponding approximation models in each iteration. Our method is particularly suited to many scientific problems, such as materials discovery, where multiple information sources are available, each providing insights into the ground truth at varying levels of fidelity. We evaluated our proposed iMFBO on both synthetic and real-world datasets, demonstrating its proficiency in capturing the input-dependent fidelity of multiple approximation models and its efficiency in optimizing the underlying ground-truth objective function based on approximation evaluations. Therefore, this work underscores the potential of iMFBO in effectively addressing multi-fidelity optimization problems, particularly in complex scientific fields where diverse sources of information must be synthesized.

\section*{Acknowledgements}
This work was supported in part by the U.S. National Science Foundation~(NSF) grants CCF-1553281, DMREF-2119103, SHF-2215573, and IIS-2212419; and by the U.S. Department of Energy~(DOE) Office of Science, Advanced Scientific Computing Research (ASCR) M2DT Mathematical Multifaceted Integrated Capability Center~(MMICC) under Award B\&R\# KJ0401010/FWP\# CC130, program manager W. Spotz. Portions of this research were conducted with the advanced computing resources provided by Texas A\&M High Performance Research Computing.

\newpage
\bibliography{citation}

\newpage
\appendix
\onecolumn
\title{Appendix\\(Supplementary Material)}
\maketitle
%\vspace{-50mm}
\section{iMFBO algorithm}\label{sec:alg}
We present our proposed iMFBO pseudo-code as follows:
\begin{algorithm}[h]
        \caption{iMFBO}
   \label{alg:iMFBO}
\begin{algorithmic}
   \STATE {\bfseries Initialize} Initial dataset $\mathcal{D}_t = \mathcal{D}_0$, budget $B$, time step $t = 1$;
   \REPEAT 
   % \STATE Fit the \hl{noise-dependent Gaussian Process and noise model} to the current dataset $\mathcal{D}_t$;
   \STATE Fit the surrogate model of the latent ground truth and input-dependent fidelity to the current dataset $\mathcal{D}_t$;
   \STATE Find the pair $(x_t, j_t)$ that maximizes~\eqref{eq:MFNVUCB};
   \STATE Query the $j_t$-th approximation model on sample $x_t$ to obtain $y_t^{j_t}$;
   \STATE Update the dataset $\mathcal{D}_{t+1} = \mathcal{D}_{t} \cup \{(x_t, y_t^{j_t})\}$;
   \STATE Update budget $B = B-1$;
   \STATE Update time step $t = t + 1$;
   \UNTIL{$B \leq 0$}
\end{algorithmic}
      \end{algorithm}




\section{Performance of the Noise Surrogate}\label{sec:surrogate_diff}
We illustrate the effectiveness of capturing input-dependent fidelity by our surrogate modeling strategy with a toy example, where the ground truth is a \texttt{sin} wave over one period, 
$f(x) = \sin(2\pi x), x\in [0, 1]$.
We consider two approximation surrogate models with the corresponding linear additive noise:
$f^j(x) = f(x) + (a_j x + b_j) S$, where $a_1 = 0.5$, $b_1 = 0$, $a_2 = -0.5$, $b_2 = 0.5$, and $S$ is the standard normal distributed noise, i.e., $f^1$ has higher fidelity when $x$ is small while $f^2$ is more precise when $x$ is large. %\hl{S? standard normal?}

In parametric surrogate modeling (Sec.~\ref{sec:pnm}), we can take a linear noise model. The priors of the bias and weights are set to be standard normal distributions for both approximation models. The posteriors of $\delta(x)$ trained with $20$ and $500$ random evaluation samples are illustrated in Figures~\ref{subfig:20} and~\ref{subfig:500}. In the non-parametric setting, as illustrated in Figure~\ref{subfig:500non}, the prior of the noise model is set to be a Gaussian Process with zero mean and Radial Basis Function~(RBF) covariance kernel~\citep{seeger2004gaussian}. 

When acquiring more evaluations from approximation models, the uncertainty of the noise model reduces from Figure~\ref{subfig:20} to Figure~\ref{subfig:500}, indicating that our model can reliably capture the model uncertainty. When we have a large queried evaluation set, the learnt noise fits the ground truth well in Figure~\ref{subfig:500}. More importantly, even with a relatively limited number of queried approximation evaluations, the input-dependent noise trend can still be reliably learned as in Figure~\ref{subfig:20}. The trend of noise is also learnt reasonably well with the non-parametric surrogate modeling as shown in Figure~\ref{subfig:500non}. %Even not learnt perfectly well, 

\begin{figure}
    \begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{figure/sample_20.png}
    \caption{20 samples with linear model}
    \label{subfig:20}
    \end{subfigure}
    \begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{figure/sample_500.png}
    \caption{500 samples with linear model}
    \label{subfig:500}
    \end{subfigure}
    \begin{subfigure}{0.33\linewidth}
    \includegraphics[width=\linewidth]{figure/sample500_nonp.png}
    \caption{500 samples with GP prior}
    \label{subfig:500non}
    \end{subfigure}
    \caption{(a) The learnt noise with 20 random samples with the linear model. Each line is a sample generated by the updated parameter posterior. 
    % \hl{(this may be difficult to understand unless explained as in Fig.2 caption)}
    (b) The learnt linear noise model with 500 random samples. (c) The MAP estimation of the input-dependent noise over 500 random  samples with the Gaussian Process prior. The number in parentheses indexes the approximation model, e.g., ``Learnt noise(1)'' in (a) is the learnt observation noise of the approximation model $f^1$. }
    \label{fig:model_illu_noise}
\end{figure}











\section{An example to illustrate the effectiveness of NVUCB}\label{sec:illustrate_accu}

We here use an example to illustrate the effectiveness of our proposed acquisition function, NVUCB, in Figure~\ref{fig:acqui}. The controlling parameter $\beta$ is set to 1 for all of the three acquisition functions: UCB, NUCB, and NVUCB. 
Suppose we have three candidate samples $\{x_1, x_2, x_3\}$ to query.
Comparing $x_2$ and $x_3$, with the same prediction mean $0$ and prediction variance $12.5$ by the approximation model, NUCB will face a tie. Both UCB and NVUCB break the tie by selecting the more informative sample $x_3$. However, $x_1$, with predictive mean $0.5$ and smaller observation noise variance $0.1^2$, would have the same UCB value as the one at $x_3$.
NVUCB would select candidate $x_1$ which not only has higher prediction mean but also is more informative with $\gamma(x_1) = \frac{2}{\sqrt{2^2+ 0.1^2}} > \frac{2.5}{\sqrt{12.5}} = \gamma(x_3)$.

\begin{figure}[h]
    \centering
    \includegraphics[width=0.5\linewidth]{figure/acqui_probelm_v2.png}
    \caption{An illustration of UCB, NUCB and NVUCB acquisition functions. The predictive mean is plotted in solid black. The red shaded region represents the $1-\sigma$ confidence of the ground-truth model and the green shaded region represents the $1-\sigma$ confidence of the approximation model. NVUCB guides towards the best selection.}
    \label{fig:acqui}
\end{figure}



\section{Acquisition Function Performance on SFBO}\label{sec:sfbo}
We further test the performance of our proposed iBO using the corresponding single approximation model over 10 independent runs, as illustrated in Figures~\ref{subfig:f1} and~\ref{subfig:f2}. In each run, we randomly select four samples in the design space of interval $x \in [0, 1]$
for the initial dataset with two queried from $f^1$ and $f^2$ respectively. We compare iBO with NVUCB and NUCB to BO with UCB based on ordinary GP surrogates with constant observation noise. For an ablation study, we further test the performance of BO with UCB based on our input-dependent GP surrogates, referred to as iUCB. 

By comparing Figures~\ref{subfig:f1} and~\ref{subfig:f2}, we can observe that all methods perform better using $f^1$ evaluations than using $f^2$. This is because $f^1$ has higher fidelity in the relatively well-performing region (near the optimum $x=0.25$) and can be more informative for BO to find the global maximum. 
The better performance of iUCB compared to UCB demonstrates that our input-dependent fidelity GP models the ground-truth objective better than constant noise GP. With only $f^2$ evaluations, iBO with NVUCB performs slightly worse than iUCB 
since NVUCB would encourage to query the inputs with smaller observation noise of $f^2$ evaluations, where the ground-truth objective values are smaller, hence hindering the maximization task.

\begin{figure}[h]

\begin{subfigure}{0.49\linewidth}
    \includegraphics[width=\linewidth]{toy_example/0_fid_log.png}
    \caption{With only $f^1$ queries}
    \label{subfig:f1}
    \end{subfigure}
    \begin{subfigure}{0.49\linewidth}
    \includegraphics[width=\linewidth]{toy_example/1_fid_log.png}
    \caption{With only $f^2$ queries}
    \label{subfig:f2}
    \end{subfigure}
    \caption{Identified maximal ground-truth values by iBO with different surrogates and acquisition functions over 10 independent runs with random initialization. The error bar represents the $1-\sigma$ confidence interval.}
    \label{fig:iBO}
    \end{figure}
\section{Extended experimental results for the materials discovery problem}
\begin{figure}
% \vspace{8mm}
    \centering
    \includegraphics[width=0.5\linewidth]{figure/real_no_cost_addr.png}%\vspace{-2mm}
    \caption{Maximal band-gap (eV) for queried compositions by different MFBO methods.}% \hl{using the same acronyms as in fig.4...} }
    \label{fig:real_2}
    \vspace{-3mm}
\end{figure}
To better demonstrate our method, we extend the previous experiments in Section~\ref{sec:real} by only considering querying the first two approximation models: (1) the DFT-calculated band-gap $f_1$ in MP, and (2) the band-gap predicted by a pre-trained MLP $f_2$. Similar to the previous experiment, the MLP is also trained based on the 1,019 compositions without DFT band-gap values recorded in MP. The objective of iMFBO is to identify the material with the largest band-gap among the other 1,439 compositions. 
The experiments are initiated with one random sample from each approximation model. 
We evaluate performance over 20 independent runs with a budget ($B$) of 20, as illustrated in Figure~\ref{fig:real_2}. 

As shown in Figure~\ref{fig:real_2}, similar to the previous results, our MFNVUCB outperforms other models as anticipated. The other methods also perform similarly to what was discussed before.
\section{iMFBO considering evaluation cost}\label{sec:cost}
To incorporate the fact that the information resources of different approximate evaluation models usually have different evaluation costs, we can further modify the acquisition function to  
% \hl{The choice on handing cost appears to be quite adhoc... }
\begin{equation}\label{eq:MFNVUCBC}
    \alpha^{MFNVC}_{t}(x, j) = \mu_t(x) + \frac{1}{c_j} \beta^{\frac{1}{2}} \frac{\sigma_t(x)}{\sqrt{\sigma^2_t(x) + \delta_j^2(x)}} \sigma_t(x),
\end{equation} 
where $c_j$ is the cost to evaluate the $j$-th approximation model. The reason that we put the cost on the standard deviation term is that our surrogates of the approximation models differ only in the input-dependent noise $\delta_j(x)$, which only appears in the variance reduction term in the acquisition function.

We present our proposed iMFBO considering such costs as follows:
\begin{algorithm}[h]
        \caption{iMFBO with cost}
   \label{alg:iMFBOC}
\begin{algorithmic}
   \STATE {\bfseries Initialize} Initial dataset $\mathcal{D}_t = \mathcal{D}_0$, budget $B$, time step $t = 1$;
   \REPEAT 
   % \STATE Fit the \hl{noise-dependent Gaussian Process and noise model} to the current dataset $\mathcal{D}_t$;
   \STATE Fit the surrogate model of the latent ground-truth and input-dependent fidelity to the current dataset $\mathcal{D}_t$;
   \STATE Find the $(x_t, j_t)$ pair that maximizes Equation~\eqref{eq:MFNVUCBC};
   \STATE Query the $j_t$-th approximation model on sample $x_t$;
   \STATE Update the dataset $\mathcal{D}_{t+1} = \mathcal{D}_{t} \cup (x_t, y_t^{j_t})$;
   \STATE Update budget $B = B-c_{j_t}$;
   \STATE Update time step $t = t + 1$;
   \UNTIL{$B \leq 0$}
\end{algorithmic}
      \end{algorithm}

To numerically test the performance of the iMFBO implementation considering evaluation cost, we test it with the benchmark in the materials discovery problem~(Section \ref{sec:real}).

To reflect the reality that DFT computations are typically more resource-intensive than querying machine learning models, we set the cost for querying DFT computations and MLP to be 5 and 1, respectively.

As shown in Figure~\ref{fig:real_cost}, the performance trends of different competing models are similar as reported in Section \ref{sec:real} when the querying costs are taken into consideration, with our MFNVUCB-based iMFBO outperforming other models as anticipated.

\begin{figure}[t]
    \centering
    \includegraphics[width=0.5\linewidth]{figure/real_cost.png}
    \caption{Maximal band-gap (eV) for queried compositions by different MFBO methods.}\label{fig:real_cost}
    %\vspace{-5mm}
\end{figure}


\section{iMFBO Considering Evaluation Bias}\label{app:bias}
Though we consider unbiased evaluations in the main text, the framework can be extended to multi-fidelity evaluations with bias. To do so, we consider the surrogate model for each evaluation model or information source $f_i(x)$ as the sum of the ground truth $f(x)$ and a bias $g_i(x)$ modeled by a separate model, i.e., $f_i(x) = f(x) + g_i(x) + \delta_i(x) S$. The posterior of the bias $g_i$ can also be inferred based on Bayes' rule, similar to the inference of $\delta_i$ in the main text.

Here we use an illustrative experiment similar to the one in Section~\ref{sec:acqui} to demonstrate the performance of iMFBO considering bias. The target function is set to be a sine wave $f(x) = \sin(2\pi x)$, and the evaluation models as multiple information sources are biased such that $f_1(x) = f(x) + 0.5 + 0.5 x S$, $f_2(x) = f(x) - 0.5 + (-0.5 x+0.5)S$. Similar to the previous experiments, we use a GP to estimate the ground truth, the input-dependent noise for each information source $\delta_i(x)$ is estimated by a linear model, and the bias is modeled as a constant in this case, and estimated by MAP.

We illustrate the performance in Figure~\ref{fig:bias}. It can be observed that our method~(MFNVUCB) consistently outperforms other methods, indicating that our framework can be extended to bias-aware versions.

\begin{figure}
    \centering
    \includegraphics[width=0.5\linewidth]{figure/2_fids_log_bias.png}
    \caption{Log regret for biased evaluations.
    % \hl{consistent color/format?}
    }\label{fig:bias}
    %\vspace{-5mm}
\end{figure}


\section{Proofs}\label{sec:proofs}

\begin{proposition}
Given a set of input samples $[x_1, x_2, \dots, x_n]$, the information gain of the ground-truth function $f(x)$ after querying approximation model $f^a$, with observation noise variance $\delta^2(x)$, getting $F^a_n = [f^a(x_1), f^a(x_2), \dots, f^a(x_n)]$ is
\begin{equation}
\label{eq:IG}
I(f;F^a_n)  = -\frac{1}{2}\sum_{i=1}^n \log(1-\gamma_i^2(x_i)),
\end{equation}
\end{proposition}
% *\pone
\begin{proof}
By the definition of entropy, we can derive: 
\begin{equation}
I(f;F^a_n) = H(f) - H(f|F^a_n) = H(F^a_n) - H(F^a_n|f).
\end{equation}
By the independent observation noise assumption, 
\begin{equation}
H(F^a_n|f) = H(F^a_n|F_n),
\end{equation}
where $F_n = [f(x_1), f(x_2), \dots, f(x_n)]$; and the conditional entropy term $H(F^a_n|F_n) = \frac{1}{2} \sum_{i=1}^{n} \log(2\pi e \delta^2(x_i))$, again by the independent noise assumption. 

The entropy term $H(F^a_n)$ can be recursively calculated as 
\begin{align}\nonumber
    H(F^a_n)\nonumber
    =&H(F^a_{n-1}) + H(f^a(x_n)|F^a_{n-1})\\\nonumber
    =&H(F^a_{n-1}) + \frac{1}{2}\log[2\pi e (\sigma_n^2(x_n) + \delta^2(x_n))]\\\nonumber
    =&\frac{1}{2}\sum_{i=1}^n \log[2\pi e (\sigma_i^2(x_i) + \delta^2(x_i))].\nonumber
\end{align}

Combining these two terms, we get the information gain with an analytic form as in~\eqref{eq:IG}.
\end{proof}


\begin{lemma}\label{thm:lemma1}
With the same setup as Theorem~\ref{thm:theorem1}, % \hl{make sure about the theorem index} {\tt label}. 
the information gain can be lower bounded by
\begin{equation}\footnotesize
I(f;f^a(x_v)) \geq I(f;f^a(x_u)) + \frac{\gamma(x_u)[\sigma(x_v) - \sigma(x_u)]}{[1+\gamma(x_u)]\sigma(x_v)},
\end{equation}
where  $\gamma(x) = \frac{\sigma(x)}{\sqrt{\sigma^2(x) + \delta^2(x)}}$.
\end{lemma}

\begin{proof}
By the assumptions that $x_v$ maximizes~\eqref{eq:NVUCB},
\begin{equation}
\mu(x_v) + \beta^{1/2} [\gamma(x_v)\sigma(x_v) - \gamma(x_u)\sigma(x_u)]
    \geq \mu(x_u), \label{eq:derive-3}
\end{equation}
and $x_u$ maximizes~\eqref{eq:UCB},
\begin{equation}
\mu(x_u)\label{eq:derive-4}
    \geq \mu(x_v) + \beta^{1/2} \sigma(x_v) - \beta^{1/2} \sigma(x_u). 
\end{equation}
% \begin{align}
% \begin{split}
%     &\mu(x_v) + \beta^{1/2} \gamma(x_v)\sigma(x_v)\\
%     \geq& \mu(x_u) + \beta^{1/2} \gamma(x_u)\sigma(x_u)\\
%     \geq& \mu(x_1) + \beta^{1/2} (\sigma(x_1) - \sigma(x_2) + \gamma(x_2)\sigma(x_2)).
% \end{split}
% \end{align}
Combining~\eqref{eq:derive-3} and~\eqref{eq:derive-4} gives us:  
\begin{equation}
    \gamma(x_v)\sigma(x_v) - \gamma(x_u)\sigma(x_u) \geq \sigma(x_v) - \sigma(x_u),
\end{equation}
which can be rewritten as 
\begin{equation}
    \gamma(x_v) \geq \gamma(x_u) + \frac{\sigma(x_v) - \sigma(x_u)}{\sigma(x_v)}[1-\gamma(x_u)]. 
\end{equation}
With Proposition~\ref{prop:IG}, for any $x$, 
\begin{equation}
I(f; f^a(x)) = H(f) - H(f|f^a(x)) = -\frac{1}{2} \log[1-\gamma^2(x)].
\end{equation}
%for any $x$.

By Jensen's inequality and the fact that the function $-\frac{1}{2} \log(1-\gamma^2)$ is convex with respect to $\gamma$,
\begin{equation}
    I(f;f^a(x_v)) \geq I(f; f^a(x_u)) + G(x_u, x_v),
\end{equation}
where $G(x_u, x_v) = \frac{\gamma(x_u)}{1-\gamma^2(x_u)}\frac{\sigma(x_v) - \sigma(x_u)}{\sigma(x_v)}[1-\gamma(x_u)] = \frac{\gamma(x_u)[\sigma(x_v) - \sigma(x_u)]}{[1+\gamma(x_u)]\sigma(x_v)}$.
% Considering $\sigma_t(x_m)$ and $\delta(x_m)$ as two catheti of a triangle, we can denote $\gamma_t(x_m) = \frac{\sigma_t(x_m)}{\sqrt{\sigma_t(x_m)^2 + \delta(x_i)^2}} = \sin(\theta_m)$, $m \in \{1, 2\}$.
% We can rewrite~\eqref{eq:derive-1} as
% \begin{equation}
%     \sin(\theta_1) \geq 1-\frac{\sigma_t(x_2)}{\sigma_t(x_1)}(1-\sin(\theta_2)). 
% \end{equation}
% If $\sigma(x_1) \geq \frac{\delta(x_1)}{\delta(x_2)}\sigma(x_2)$, we have $\tan(\theta_1) \geq \tan(\theta_2)$, which leads to $\sin(\theta_1) \geq \sin(\theta_2)$. 
% When $\sigma_t(x_1) \geq \sigma_t{(x_2)}$, $1-\frac{\sigma_t(x_2)}{\sigma_t(x_1)}(1-\sin(\theta_2)) \geq \sin(\theta_2)$, we have $\sin(\theta_1) \geq \sin(\theta_2)$, i.e. $\gamma(x_1) \geq \gamma(x_2)$. Expanding~\eqref{eq:IG_x} and plugging in the corresponding terms, we prove that the inequality~\eqref{eq:VSUCB} holds. \hl{do read all the proofs to make sure that the logic is clear enough to avoid similar complaints as in the group testing reviews...}\st{and will lead to Equation~\eqref{eq:VSUCB}.}
\end{proof}

\begin{lemma}\label{thm:lemma2}
With the same setup as in Theorem~\ref{thm:theorem1}, when $\sigma(x_v) \leq \sigma(x_u)$, the information gain can also be lower bounded by:
\begin{equation}
I(f;f^a(x_v)) \geq I(f;f^a(x_u)) + \frac{\gamma(x_u)[\mu(x_u) - \mu(x_v)]}{[1-\gamma^2(x_u)]\beta^\frac{1}{2} \sigma(x_v)}.
\end{equation}
\end{lemma}
\begin{proof}
By the assumptions that $x_v$ maximizes~\eqref{eq:NVUCB},
\begin{align}
\begin{split}
%&
\mu(x_v)%\\
    \geq &\mu(x_u) + \beta^{1/2} [\gamma(x_u)\sigma(x_u) - \gamma(x_v)\sigma(x_v)]\\
    =&\mu(x_u) + \beta^{1/2} [\gamma(x_u)\sigma(x_u) - \gamma(x_u)\sigma(x_v)]\\
    &\quad + \beta^{1/2} [\gamma(x_u)\sigma(x_v) - \gamma(x_v)\sigma(x_v)]\\
    \geq &\mu(x_u) + \beta^{1/2} [\gamma(x_u)\sigma(x_v) - \gamma(x_v)\sigma(x_v)]\\
    = &\mu(x_u) + \beta^{1/2} [\gamma(x_u)- \gamma(x_v)]\sigma(x_v).
    \end{split}
\end{align}
The last step can be derived by the assumption that $\sigma(x_v) \leq \sigma(x_u)$. Therefore we now have: 
\begin{equation}
    \gamma(x_v) \geq \gamma(x_u) + \frac{\mu(x_u) - \mu(x_v)}{\beta^\frac{1}{2} \sigma(x_v)}.
\end{equation}
Similar as Lemma~\ref{thm:lemma1}, by Jensen's inequality,
\begin{equation}
    I(f;f^a(x_v)) \geq I(f;f^a(x_u)) + \frac{\gamma(x_u)}{1-\gamma^2(x_u)}\frac{\mu(x_u) - \mu(x_v)}{\beta^\frac{1}{2} \sigma(x_v)}.
\end{equation}
\end{proof}

\begin{theorem}
If the latent ground-truth $f$ satisfies Assumption~\ref{assumption:f_GP}, denote $x_v$ as the selected candidate by the proposed NVUCB acquisition function~\eqref{eq:NVUCB}, $x_u$ as the selected candidate by the original UCB~\eqref{eq:UCB}. 
At least one of the following statements holds true:
%\vspace{-3mm}
\begin{itemize}
    \item \textbf{S1}: The information gain of ground-truth $f$ after querying approximation model $f^a$, with observation noise variance $\delta^2(x)$ at $x_v$ can be lower bounded 
    % (please make sure this is correct and reflects the inequality below)} 
    by that at $x_u$,
$I(f;f^a(x_v)) \geq I(f;f^a(x_u))$;
%\vspace{-3mm}
\item \textbf{S2}: The predictive mean of the selected sample $\mu(x_v) > \mu(x_u)$.
\end{itemize}
\end{theorem}

%The theorem conveys the message that 
Compared to the original UCB acquisition function, the NVUCB acquisition function would either get more informative queries (\textbf{S1}), tend to exploit the current model (\textbf{S2}), or achieve both.

\begin{proof}

We only need to prove %\st{the theorem by proving }
that \textbf{S1} holds when \textbf{S2} does not.

When \textbf{S2} does not hold, i.e. $\mu(x_v) \leq \mu(x_u)$:
\begin{enumerate}
\item If $\sigma(x_v) \geq \sigma(x_u)$, $I(f;f^a(x_v)) \geq I(f;f^a(x_u))$ by Lemma~\ref{thm:lemma1};
\item If $\sigma(x_v) < \sigma(x_u)$, $I(f;f^a(x_v)) \geq I(f;f^a(x_u))$ by Lemma~\ref{thm:lemma2}.
\end{enumerate}
Therefore, we can conclude that $I(f;f^a(x_v)) \geq I(f;f^a(x_u))$ if $\mu(x_v) \leq \mu(x_u)$, which proves that at least one of \textbf{S1} and \textbf{S2} is true. 
\end{proof}

We would also like to first theoretically compare the performance of NUCB with our NVUCB in the single fidelity scenario. 
\begin{theorem}\label{thm:nucb}
With the same setup as Theorem~\ref{thm:theorem1}, denote $x_v$ as the selected candidate by our proposed NVUCB acquisition function~\eqref{eq:NVUCB}, $x_n$ as the selected candidate by NUCB~\eqref{eq:NUCB}. 
At least one of the following statements holds true:
\begin{itemize}
    \item \textbf{T1}: The information gain $I(f;f^a(x_v)) \geq I(f;f^a(x_n))$;

\item \textbf{T2}: The predictive mean of the selected sample $\mu(x_v) > \mu(x_n)$.
\end{itemize}
\end{theorem}

\begin{proof}
%By the assumptions that
As $x_v$ maximizes~\eqref{eq:NVUCB},
\begin{equation}\label{eq:vsnucb}
\mu(x_v) + \beta^{1/2} [\gamma(x_v)\sigma(x_v) - \gamma(x_n)\sigma(x_n)]
    \geq \mu(x_n), 
\end{equation}
and $x_n$ maximizes~\eqref{eq:NUCB},
\begin{align}\label{eq:derive-7}
\begin{split}
\mu(x_n)
    \geq &\mu(x_v) + \beta^{1/2} \sqrt{\sigma^2(x_v) + \delta^2(x_v)}\\
    &- \beta^{1/2}\sqrt{\sigma^2(x_n) + \delta^2(x_n)}.
    \end{split}
\end{align}
The standard deviation term $\sqrt{\sigma^2(x) + \delta^2(x)}$ can also be written as $\frac{\sigma(x)}{\gamma(x)}$,
combining~\eqref{eq:vsnucb} and~\eqref{eq:derive-7}, we have 
\begin{align}
\begin{split}\label{eq:derive8}
&\gamma(x_v)\sigma(x_v) - \gamma(x_n)\sigma(x_n)\\
\geq& \sqrt{\sigma^2(x_v) + \delta^2(x_v)}- \sqrt{\sigma^2(x_n) + \delta^2(x_n)}\\
=&\frac{\sigma(x_v)}{\gamma(x_v)}- \frac{\sigma(x_n)}{\gamma(x_n)}. 
\end{split}
\end{align}

Rewriting~\eqref{eq:derive8}, we have
\begin{equation}\label{eq:derive-9}
[\gamma(x_v) - \frac{1}{\gamma(x_v)}]\sigma(x_v) \geq [\gamma(x_n) - \frac{1}{\gamma(x_n)}]\sigma(x_n).
\end{equation}

Similar as in the proof of Theorem~\ref{thm:theorem1}, we only need to prove that \textbf{T1} holds when \textbf{T2} does not. %  in this theorem.

When \textbf{T2} does not hold, i.e. $\mu(x_v) \leq \mu(x_n)$:
\begin{enumerate}
\item 
If $\sigma(x_v) \geq \sigma(x_n)$, 
recall that $0<\gamma(x)<1$, so $\gamma(x_v) - \frac{1}{\gamma(x_v)} < 0$, we have $\gamma(x_v) - \frac{1}{\gamma(x_v)} \geq \gamma(x_n) - \frac{1}{\gamma(x_n)}$ from~\eqref{eq:derive-9}. We can further have $\gamma(x_v) \geq \gamma(x_n)$ because of the monotonicity of function $\gamma - \frac{1}{\gamma}$.
\item
If $\sigma(x_v) < \sigma(x_n)$,
\begin{align}
\begin{split}
&\gamma(x_v) - \gamma(x_n)\\
= &\sigma^{-1}(x_n)[\gamma(x_v)\sigma(x_n) - \gamma(x_n)\sigma(x_n)]\\
> &\sigma^{-1}(x_n)[\gamma(x_v)\sigma(x_v) - \gamma(x_n)\sigma(x_n)]\\
\geq &\sigma^{-1}(x_n) \beta^{-\frac{1}{2}}[\mu(x_n) - \mu(x_v)]\\
\geq& 0. 
\end{split}
\end{align}
\end{enumerate}
We can then conclude that when $\mu(x_v) \leq \mu(x_n)$, we have $\gamma(x_v) \geq \gamma(x_n)$, and can further get $I(f;f^a(x_v)) \geq I(f;f^a(x_n))$ because of the monotonicity of~\eqref{eq:IG}.

Therefore we have proven the theorem. %d that \textbf{S1} holds when \textbf{S2} does not.
\end{proof}


Note that Theorem~\ref{thm:theorem1} can also be proven with a similar strategy as in the proof of Theorem~\ref{thm:nucb} without using Lemma~\ref{thm:lemma1} and Lemma~\ref{thm:lemma2}. Those two lemmas, however, allow us to better understand the acquisition functions from the information-theoretic point of view.

\begin{theorem}
    For a constant $\epsilon \in (0, 1)$, and $\beta_t = 2\log(t^2\pi^2/(3\epsilon)) + 2d\log(t^2dbr\sqrt{\log{(4da/\epsilon)}})$, 
    performing MFNVUCB for a target $f$ satisfying Assumptions~\ref{assumption:f_GP}, \ref{assumption:D}, and~\ref{assumption:k} with observation noise satisfying Assumption~\ref{assumption:delta}, we have
    \begin{equation}
    Pr\{R_T \leq (\sqrt{C_{\delta_{min}}} + 1)\sqrt{2\delta_{max}^2 \beta_T  I^{max}_T T} + \frac{\pi^2}{6} \} \geq 1-\epsilon, 
    \end{equation}
    where $R_T = \sum_{t=1}^T [f(x^*) - f(x_t)]$, $I^{max}_T$ is the maximum information gain at iteration $T$, and the constant $C_{\delta_{min}} > 1$ is related to $\delta_{min}$.
\end{theorem}

\begin{proof}
Based on Lemma 5.7 in~\cite{srinivas2009gaussian}, let $[x^*]_t$ be the closest point in $D_t$ to $x^*$, where $D_t$ is the discretized subset of $D$, 
\begin{equation}\label{eq:prob}
|f(x^*) - \mu_{t}([x^*]_t)| \leq \beta^{\frac{1}{2}}_t \sigma_{t}([x^*]_t) + \frac{1}{t^2}
\end{equation}
holds with probability greater than or equal to $1-\epsilon$. Note that our notation of time index is different from~\cite{srinivas2009gaussian}.

By definition of $x_t$ and $j_t$, 
\begin{equation}\label{eq:trans}
\mu_t(x_t) + \beta^{\frac{1}{2}}\gamma_t^{j_t}(x_t)\sigma_t(x_t) \geq \mu_t([x^*]_t) + \beta^{\frac{1}{2}}\gamma_t^{j_t^*}([x^*]_t)\sigma_t([x^*]_t),
\end{equation}
The index on the RHS $j_t^*$ can be chosen arbitrarily so we omit it in the following derivation.
So regret in one iteration:
\begin{align}
\begin{split}
\label{eq:regret_one_step}
r_t &= f(x^*) - f(x_t)\\
&\leq \mu_t([x^*]_t) + \beta_t^{\frac{1}{2}} \sigma_t([x^*]_t) + 1/t^2 - f(x_t) \quad \text{(by Equation~\ref{eq:prob})}\\
&= \mu_t([x^*]_t) + \beta_t^{\frac{1}{2}} \gamma_t([x^*]_t) \sigma_t([x^*]_t) + \beta_t^{\frac{1}{2}} (1-\gamma_t([x^*]_t)) \sigma_t([x^*]_t) + 1/t^2 - f(x_t)\\
&\leq \mu_t(x_t) + \beta_t^{\frac{1}{2}} \gamma_t(x_t) \sigma_t(x_t) + \beta_t^{\frac{1}{2}} (1-\gamma_t([x^*]_t)) \sigma_t([x^*]_t) + 1/t^2 - f(x_t) \quad\text{(by Equation~\ref{eq:trans})}\\
&\leq \beta_t^{\frac{1}{2}} \sigma_t(x_t) + \beta_t^{\frac{1}{2}} \gamma_t(x_t) \sigma_t(x_t) + \beta_t^{\frac{1}{2}} (1-\gamma_t([x^*]_t)) \sigma_t([x^*]_t) + 1/t^2 \quad\text{(by Lemma 5.1 in~\cite{srinivas2009gaussian})}\\
&=\beta_t^{\frac{1}{2}} (1+\gamma_t(x_t)) \delta_{max} \frac{\sigma_t(x_t)}{\delta_{max}} + \beta_t^{\frac{1}{2}} (1-\gamma_t([x^*]_t)) \delta_{max} \frac{\sigma_t([x^*]_t)}{\delta_{max}} + 1/t^2\\
&\leq \beta_t^{\frac{1}{2}} (1+\gamma_t(x_t)) \delta_{max} \frac{\sigma_t(x_t)}{\delta(x_t)} + \beta_t^{\frac{1}{2}} (1-\gamma_t([x^*]_t)) \delta_{max} \frac{\sigma_t([x^*]_t)}{\delta_{t}([x^*]_t)} + 1/t^2\\
&= \delta_{max} \beta_t^{\frac{1}{2}} (1+\gamma_t(x_t)) \frac{\gamma_t(x_t)}{\sqrt{1-\gamma_t^2(x_t)}} + \delta_{max} \beta_t^{\frac{1}{2}} (1-\gamma_t([x^*]_t)) \frac{\gamma_t([x^*]_t)}{\sqrt{1-\gamma_t^2([x^*]_t)}} + 1/t^2\\
&= \delta_{max} \beta_t^{\frac{1}{2}} \gamma_t(x_t) \sqrt{\frac{1+\gamma_t(x_t)}{1-\gamma_t(x_t)}} + \delta_{max} \beta_t^{\frac{1}{2}} \gamma_t([x^*]_t) \sqrt{\frac{1-\gamma_t([x^*]_t)}{1+\gamma_t([x^*]_t)}} + 1/t^2.
\end{split}
\end{align}
The last two steps are based on the definition of $\gamma_t$. 

It can be shown that 
\begin{equation}\label{eq:38}
\gamma_t^2(x_t) \frac{1+\gamma_t(x_t)}{1-\gamma_t(x_t)}\leq -C_{\delta_{min}} \log(1-\gamma_t^2(x_t))
\end{equation}
holds for some constant $C_{\delta_{min}} > 1$ related to $\delta_{min}$ when $0\leq\gamma_t(x) \leq \frac{\sigma_t(x)}{\sqrt{\sigma_t^2(x)+\delta_{min}^2}} < 1, \forall x$, and 
\begin{equation}\label{eq:39}
\gamma_t^2([x^*]_t) \frac{1-\gamma_t([x^*]_t)}{1+\gamma_t([x^*]_t)}\leq -\log(1-\gamma_t^2([x^*]_t)). 
\end{equation}

Therefore,  
\begin{align}
\sum_{t=1}^T \delta_{max}^2 \beta_t\gamma_t(x_t)^2 \frac{1+\gamma_t(x_t)}{1-\gamma_t(x_t)}\leq& -C_{\delta_{min}}\delta_{max}^2 \beta_T \sum_{t=1}^T \log(1-\gamma_t^2(x_t))\\
= & -2C_{\delta_{min}}\delta_{max}^2 \beta_T \sum_{t=1}^T \frac{1}{2}\log(1-\gamma_t^2(x_t))\\
= & 2C_{\delta_{min}}\delta_{max}^2 \beta_T  I(f;F^a_T)\\
\leq & 2C_{\delta_{min}}\delta_{max}^2 \beta_T  I^{max}_T.
\end{align}

Similarly,
\begin{align}
\sum_{t=1}^T \delta_{max}^2 \beta_t\gamma_t^2([x^*]_t) \frac{1-\gamma_t([x^*]_t)}{1+\gamma_t([x^*]_t)}\leq& -\delta_{max}^2 \beta_T \sum_{t=1}^T \log(1-\gamma_t^2([x^*]_t))\\
= & -2\delta_{max}^2 \beta_T \sum_{t=1}^T \frac{1}{2}\log(1-\gamma_t^2([x^*]_t))\\
= & 2\delta_{max}^2 \beta_T  I(f;[F_T^a]^*)\\
\leq & 2\delta_{max}^2 \beta_T  I^{max}_T,
\end{align}
where $[F_T^a]^* = [f^a([x^*]_1), \dots, f^a([x^*]_T)]$.

By Cauchy-Schwarz inequality:
\begin{equation}
\sum_{t=1}^T \delta_{max} \beta_t^{\frac{1}{2}}\gamma_t(x_t) \sqrt{\frac{1+\gamma_t(x_t)}{1-\gamma_t(x_t)}} \leq \sqrt{2C_{\delta_{min}}\delta_{max}^2 \beta_T  I^{max}_T T},
\end{equation}
and
\begin{equation}
\sum_{t=1}^T \delta_{max} \beta_t^{\frac{1}{2}}\gamma_t([x^*]_t) \sqrt{\frac{1-\gamma_t([x^*]_t)}{1+\gamma_t([x^*]_t)}} \leq \sqrt{2\delta_{max}^2 \beta_T  I^{max}_T T}. 
\end{equation}

Now we have:  
\begin{equation}
R_T = \sum r_t \leq \sqrt{2C_{\delta_{min}}\delta_{max}^2 \beta_T  I^{max}_T T} + \sqrt{2\delta_{max}^2 \beta_T  I^{max}_T T} + \frac{\pi^2}{6},
\end{equation}
because $\sum^{\infty}_{t=1} 1/t^2 = \frac{\pi^2}{6}$.
\end{proof}

\paragraph{Notes on Equations~\eqref{eq:38} and~\eqref{eq:39}:}

Consider $g(x) = x^2 \frac{1+x}{1-x} + c \log(1-x^2)$: $\frac{d g(x)}{dx} = -\frac{2x}{(1-x)^2(1+x)}(x^3 -(2+c)x - 1 + c)$. 

Given any $x_R \in (0,\,1)$, there exists a constant $c_{x_R} > 1$ related to the choice of $x_R$ such that $\frac{d g(x)}{dx} \leq 0, \forall x \in (0,\, x_R)$, by the nature of the cubic function $x^3 -(2+c_{x_R})x - 1 + c_{x_R}$. Therefore, $g(x) \leq g(0) = 0, \,\forall x \in (0,\, x_R)$ when $c = c_{x_R}$. 

Similarly, consider $h(x) = x^2 \frac{1-x}{1+x} + \log(1-x^2)$, $\frac{d h(x)}{dx} = \frac{2x^2}{(1-x)(1+x)^2}(x^2-3)$. We have $\frac{d h(x)}{dx} \leq 0, \forall x \in (0,\,1)$. So $h(x) \leq h(0) = 0, \forall x \in (0,\,1)$.

It is now clear that Equations~\eqref{eq:38} and~\eqref{eq:39} hold, by replacing $x$ with $\gamma_t$.

\section{Experimental Details}\label{sec:exp}

All the experiments are performed on Intel Xeon 8352Y processor with 256 GB memory. The benchmark functions are defined to be:

\subsection{Hartmann}
\subsubsection{Ground-truth objective function}
It is a 6-dimensional function
\begin{equation}
f(x) = -\sum_{i=1}^4\alpha_i \exp(-\sum_{j=1}^6 A_{ij}(x_j-P_{ij})^2),
\end{equation}
where $x = [x_1, \dots, x_6]^T$, $\alpha = (1.0, 1.2, 3.0, 3.2)^T$, 
$A = \begin{pmatrix}
10&3&17&3.5&1.7&8\\
0.05& 10&17&0.1&8&14\\
3 & 3.5 & 1.7&10&17&8\\
17&8&0.05&10&0.1&14
\end{pmatrix}$,
$P = 10^{-4} \begin{pmatrix}
1312 & 1696 & 5569& 124& 8283& 5886\\
2329 & 4135 & 8307& 3736& 1004& 9991\\
2348 & 1451 & 3522& 2883& 3047& 6650\\
4047 & 8828 & 8732& 5743& 1091& 381
\end{pmatrix}$.
\subsubsection{Approximation Models}
We consider two approximation models with the same evaluation cost of $1$:

\begin{equation}
f^1(x) = f(x) + ([0.5,  0.5 , 0.5,  0,  0,  0]x)S;
\end{equation}

\begin{equation}
f^2(x) = f(x) + ([-0.5 , -0.5,  -0.5, 0 , 0,  0 ]x + 1)S.
\end{equation}

\subsubsection{Surrogate modeling}
The ground-truth objective function is modeled with a GP with the RBF kernel
\begin{equation}\label{eq:RBF}
k(x_1, x_2) = \sigma \exp(-\frac{||x_1 - x_2||_2^2}{l^2}). 
\end{equation}
The prior of the parameters are set to $\sigma\sim \tt Unif(1, 2)$, $l\sim \tt Unif(0.01, 0.5)$, where $\tt Unif(a, b)$ represents a uniform distribution from $a$ to $b$.
The input-dependent noise is modeled linearly with the prior of weights set to $\tt Unif(0, 1)$ and the prior of biases set to $\tt Unif(-1, 1)$. The number of samples to approximate the posterior is 64.

The implementation of MF-GP-UCB is set with the default configurations.
\subsection{Branin}
\subsubsection{Ground-truth objective function}
It is a 2-dimensional function
\begin{equation}
f(x) = a(x_2-bx_1^2+cx_1 -r)^2 + s(1-t)\cos(x_1) + s,
\end{equation}
where $x = [x_1, x_2]^T$,  $a = 1$, $b=5.1/(4\pi^2)$, $c=5/\pi$, $r=6$, $s=10$, and $t = 1/ (8\pi)$.
\subsubsection{Approximation Models}
We consider two approximation models with the same cost of $1$:
\begin{equation}f^1(x) = f(x) + ([3.33,  3.33]x + 16.67)S;\end{equation}

\begin{equation}f^2(x) = f(x) + ([-3.33,  -3.33 ]x + 83.33)S.\end{equation}

\subsubsection{Surrogate modeling}
The ground-truth objective function is modeled with a GP with the RBF kernel~\eqref{eq:RBF}. 
The prior of the parameters are set to $\sigma\sim \tt Unif(100, 200)$, $l\sim \tt Unif(0.15, 0.75)$.
The input-dependent noise is modeled linearly with the prior of weights set to $\tt Unif(0, 100)$ and the prior of biases set to $\tt Unif(-100, 100)$. The number of samples to approximate the posterior is 64.

%Configurations of MF-GP-UCB is set as default.
The implementation of MF-GP-UCB is set with the default configurations.

\subsection{Levy}
\subsubsection{Ground-truth objective function}
It is a 3-dimensional function
\begin{equation}
    f(x) = \sin^2(\pi \omega_1) + \sum_{i=1}^{d-1}(\omega_i -1)^2[1+10\sin^2(\pi\omega_i + 1)] + (\omega_d - 1)^2[1 + \sin^2(2\pi\omega_d)],
\end{equation}
where $d=3$, $x = [x_1, \dots, x_3]^T$,  $\omega_i = 1+\frac{x_i-1}{4}$, for all $i=1, \dots, d$.
\subsubsection{Approximation Models}
We consider two approximation models with the same cost of $1$:
\begin{equation}f^1(x) = f(x) + ([1, 1, 0]x + 20)S;\end{equation}

\begin{equation}f^2(x) = f(x) + ([-1,  -1, 0]x + 20)S.\end{equation}
\subsubsection{Surrogate Model}
The ground-truth objective function is modeled with a GP with the RBF kernel~\eqref{eq:RBF}.
The prior of the parameters are set to $\sigma\sim \tt Unif(40, 80)$, $l\sim \tt Unif(0.2, 1)$.
The input-dependent noise is modeled linearly with the prior of weights set to $\tt Unif(0, 40)$ and the prior of biases set to $\tt Unif(-40, 40)$. The number of samples to approximate the posterior is 64.

%Configurations of MF-GP-UCB is set as default.
The implementation of MF-GP-UCB is set with the default configurations.


\subsection{Materials Discovery}
The approximation model and ground-truth objective have been described in the main text.

The ground-truth band-gap objective is modeled with a GP with a RBF kernel~\eqref{eq:RBF}, in which
$\sigma=4$, $l=0.5$.
The input-dependent noise is modeled with a GP with mean $m=1$ and a RBF kernel parameterized by $\sigma=0.5$, $l=0.5$.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Additional details on the Materials Discovery Problem (Sec~\ref{sec:real})}\label{sec:E}

\subsection{Discussion on Density Functional Theory (DFT) Calculation}
Density Functional Theory (DFT) calculations are often used to predict a variety of material properties, including the band-gap~\citep{jain2013commentary}. These predictions can vary based on the structural and other configurations of the material. Despite their utility, DFT computations serve only as approximations to real experimental results, especially when calculating band-gaps. This limitation stems from the fact that DFT, strictly speaking, is a theory of the ground state of a material system, while the band-gap is essentially a property of the excited state. Consequently, for a single composition, the DFT-computed band-gaps can exhibit variation due to differences in material structural or other configurations. A case in point is the material CuBr, for which six different results are reported in the open-access Material Project (MP) dataset~\citep{jain2013commentary}. This highlights the challenges associated with accurately predicting band-gaps using DFT.

\subsection{Details on the Pre-trained MLP }
The material compositions within the dataset incorporate a total of 80 distinct elements, with each material comprising two to four of these elements. Rather than using composition percentages directly as input for surrogate modeling, we initially generate 138 property-related features based on the material compositions. This is accomplished using the open-source Python package, matminer~\citep{ward2018matminer}. However, the data samples available in this generated 138-dimensional feature space are relatively sparse.
To address this, we train a 3-layer Multilayer Perceptron (MLP) on the 1,019 samples that lack reported DFT-calculated band-gaps. The MLP accepts the 138-dimensional feature vector generated by matminer as input, with the two hidden layers comprising 8 and 2 neurons, respectively. The MLP is trained to minimize the mean square error between the MLP-predicted and ground-truth (experimental) band-gap values.

We then use the input of the MLP's final prediction layer as a feature extractor, distilling the original 138 features down to two. All MFBO methods are subsequently performed in this extracted 2-dimensional space. This approach effectively leverages the MLP as a tool for feature reduction, enhancing the explainability, scalability, and manageability of MFBO.

\iffalse
\section{Limitation}
We fully trust on the inferred noise and take the expectation over the noise predictions in NVUCB. In future works, we will try to utilize the uncertainty of the noise model to guide the selection of new samples to improve the method performance further.
\fi

\end{document}