%\documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

\usepackage[american]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{amsfonts}
\usepackage{amsthm}
\usepackage{dsfont}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{tikz}
\usepackage{pifont}
\usepackage{footnote}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage[printonlyused,nolist]{acronym}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
\newcommand{\R}{{\rm I\!R}}
\newcommand{\N}{{\rm I\!N}}
\newcommand{\Scalar}[1]{#1}
\newcommand{\Vector}[1]{\mathbf{#1}}
\newcommand{\Vectors}[1]{\boldsymbol{#1}}
\newcommand{\Matrix}[1]{\mathbf{#1}}
\newcommand{\Matrixs}[1]{\boldsymbol{#1}}
\newcommand{\Set}[1]{\mathcal{#1}}
\newcommand{\VSpace}[1]{\mathbb{#1}}
\newcommand{\Rv}[1]{#1}
\newcommand{\RV}[1]{\boldsymbol{#1}}
\newcommand{\RVs}[1]{\mathit{#1}}
\newcommand{\E}[2][]{\mathbb{E}_{#1}\left[#2\right]}
\newcommand{\Var}[2][]{\mathbb{V}_{#1}\left[#2\right]}
\newcommand{\Cov}[1]{\mathrm{Cov}\left[#1\right]}
\newcommand{\Entpy}[1]{\mathrm{H}\left[#1\right]}
\newcommand{\sub}[2][]{_{\mathrm{#2}#1}}
\newcommand{\pdf}[2]{p_{\Rv{#1}}(\Scalar{#2})}
\newcommand{\Pdf}[2]{p_{\RV{#1}}(\Vector{#2})}

\newtheorem{theo}{Theorem}
\newtheorem{prop}{Proposition}
\newtheorem{defi}{Definition}

\definecolor{bblue}{HTML}{4878D0}
\definecolor{lightbrown}{HTML}{D5BB67}
\definecolor{bgreen}{HTML}{6ACC64}
\definecolor{bred}{HTML}{D65F5F}

\newcommand{\cmark}{\ding{51}}%
\newcommand{\xmark}{\ding{55}}%

\newcommand{\Circle}[2][1]{\tikz[baseline=-0.55ex]\draw[#2, fill=#2, opacity=#1,radius=.7ex] (0,0) circle ;}
\newcommand{\Rect}[2][1]{\tikz[]\draw[#2, fill=#2, opacity=#1] (0,0) rectangle ++(0.2,0.2) ;}
\newcommand{\SRect}[2][1]{\tikz[]\draw[#2, fill=#2, opacity=#1] (0,0) rectangle ++(0.16,0.16) ;}
\newcommand{\SDiamond}[2][1]{\tikz[baseline=0.1ex]\draw[#2, fill=#2, opacity=#1, rounded corners=0.04ex] (0,0.01) -- (0.06,0.11) -- (0,0.21) -- (-0.06,0.11) -- cycle;}

%\newcommand{\change}[1]{\textcolor{red}{#1}}
\newcommand{\change}[1]{#1}

% aliases for standard citation
\defcitealias{ISO26262}{ISO26262 [2016]}
\defcitealias{IEC61508}{IEC61508 [2010]}

\title{Quantifying Local Model Validity using Active Learning}

% Add authors
\author[1, 2]{\href{mailto:<sven.laemmle@zf.com>?Subject=Your UAI 2024 paper}{Sven~L{\"a}mmle}{}}
\author[3]{Can~Bogoclu}
\author[4]{Robert~Vo{\ss}hall}
\author[5]{Anselm~Haselhoff}
\author[2]{Dirk~Roos}
% Add affiliations after the authors
\affil[1]{%
    Center of Expertise\\
    ZF Friedrichshafen AG\\
    Friedrichshafen\\ 
    Germany
}
\affil[2]{%
    Institute of Modelling and High-Performance Computing\\
    Niederrhein University of Applied Sciences\\
    Krefeld\\
    Germany
}
\affil[3]{%
    Zalando SE\\
    Berlin\\
    Germany
  }
\affil[4]{%
    auxmoney GmbH\\
    D{\"u}sseldorf\\
    Germany
  }
\affil[5]{%
    Ruhr West University of Applied Sciences\\
    Bottrop\\
    Germany
  }
\begin{document}
  
  \begin{acronym}
	\acro{ml}[ML]{machine learning}
	\acro{mv}[MV]{model validation}
	\acro{doe}[DoE]{\emph{design of experiments}}
	\acro{lhs}[LHS]{Latin hypercube sampling}
	\acro{gp}[GP]{Gaussian process regression}
	\acro{vgp}[SVGP]{sparse variational Gaussian process regression}
	\acro{bo}[BO]{Bayesian optimization}
	\acro{aqf}[AF]{acquisition function}
	\acro{pdf}[pdf]{probability density function}
	\acro{cdf}[cdf]{cumulative distribution function}
	\acro{mvp}[MVP]{\emph{model validation problem}}
	\acro{ra}[RA]{reliability analysis}
	\acro{qoi}[QOI]{quantity of interest}
	\acro{map}[MAP]{\emph{maximum a posteriori}}
	\acro{rr}[RR]{Ridge regression}
	\acro{svr}[SVR]{support vector regression}
	\acro{rf}[RF]{random forest regression}
	\acro{xgb}[XGB]{XGBoost}
	\acro{gt}[GT]{ground truth}
	\acro{lvd}[LVD]{locally valid discriminative prediction intervals}
	\acro{mad}[MAD]{mean absolute deviation - normalized split conformal}
\end{acronym}
  
\maketitle

\begin{abstract}
Real-world applications of machine learning models are often subject to legal or policy-based regulations. Some of these regulations require ensuring the validity of the model, i.e., the approximation error being smaller than a threshold. A global metric is generally too insensitive to determine the validity of a specific prediction, whereas evaluating local validity is costly since it requires gathering additional data. We propose learning the model error to acquire a local validity estimate while reducing the amount of required data through active learning. Using model validation benchmarks, we provide empirical evidence that the proposed method can lead to an error model with sufficient discriminative properties using a relatively small amount of data. Furthermore, an increased sensitivity to local changes of the validity bounds compared to alternative approaches is demonstrated.
\end{abstract}

\section{Introduction}\label{sec:intro}
Ensuring the validity of deployed \ac{ml} models is often a core concern in safety-critical domains such as medical, vehicle, and industrial applications, with a high risk of harming humans and the environment. These use cases are often subject to legal or regulatory requirements, such as \citetalias{ISO26262} and \citetalias{IEC61508}. \change{Most \ac{ml} models are built using only past observations or examples and may lack further domain-specific inductive biases, such as underlying physics (although some models are capable of incorporating such, e.g., see \citep{karniadakis2021}). Consequently, the behavior of these models in unseen scenarios is difficult to predict without further analysis.}

A strict assessment of a model's capabilities is needed to determine its validity \change{across the input space}. Specifically, we want to identify valid subdomains of the input space where the absolute model error is smaller than some predefined tolerance level. In this context, evaluating global accuracy metrics such as mean squared error is not a useful approach. Even if a model achieves a small average error globally, it can exhibit high inaccuracy in specific input domains. Similarly, even models with high average error may be useful in certain subdomains of the input space.

The validity of a prediction can be assessed by comparing it to real-world observations or, in some cases, to simulations with very high accuracy. The obtained results can be used to approximate the error level \citep{oberkampf2010}, sometimes referred to as \emph{error learning} \citep{riedmaier2020}. In contrast to boosting, we are not interested in improving the model predictions by the addition of the learned error. Instead, we want to have an estimate of the error level to decide the validity of a prediction. A popular model for this task is \ac{gp} \citep{rasmussen2006, kennedy2001}. Besides being a powerful model, its capability to represent the epistemic uncertainty is useful to derive confidence bounds for the estimated local error.

If a large dataset is available for training, an additional split can be afforded to build an error model to be used for \ac{mv}. However, gathering additional data is costly but necessary in many cases to build a sufficiently accurate validation model. Therefore, the \ac{doe}, i.e., the planning of tests or data queries, is often crucial as it determines the overall validation cost and the quality of the validation statement. A good strategy should achieve a highly accurate validity estimate while being data-efficient, using as little additional data as possible.
\begin{figure*}[t]
	\centering
	\includegraphics[width=1.0\textwidth]{./local_validity}
	\caption{Illustration of a locally valid model: the trained model $f\sub{M}$ is marginally valid for tolerance level $\xi$ with $80\%$ probability (\textbf{b)}), but only locally valid $\Set{V}$ (\SRect[0.6]{bgreen}) in some regions of the input space $\VSpace{X}$ (\textbf{a)}). 
	\textbf{b)} Marginal distribution of the true absolute error $\vert\delta\vert$, where the $80\%$ quantile corresponds to the tolerance level, i.e., $\xi=q_{80}$. \textbf{c)} Our learned error model $\vert\change{\tilde{f}\sub{D}}\vert$ and 90\% confidence interval (\SRect[0.4]{bblue}) from the folded Gaussian (Section~\ref{sec:gp}), together with the predicted local valid set $\tilde{\Set{V}}_{0.1}$ (\SRect[0.3]{bgreen}) (Section~\ref{sec:prediction}). Samples (\SDiamond{black}) are sequentially placed based on $\psi\sub{mis}$ (\textbf{d)}) to reduce the misclassification probability (Section~\ref{sec:aq}), i.e., most samples are close to the limit state $x_{\Set{S}_i}\in\Set{S}$.
	}\label{fig:local_valid}
\end{figure*}

Active learning \citep{settles2010} has shown to be an efficient strategy for reducing the number of samples, i.e., the choice of samples used for training a model. Therefore, samples or batches of samples are selected sequentially, leveraging knowledge from previous iterations to guide the sampling process. This approach has been used across various tasks, from optimization to querying new samples \citep{kumar2020} for improving model quality. 

As an active learning method, \Ac{bo} \citep{snoek2012} is known to exploit the probabilistic estimates to find the optimum of a black-box objective function with high data efficiency. However, our goal is to find valid domains, i.e., the set $\Set{V}$ of points $\Vector{x}$ where the absolute model error $\vert\delta(\Vector{x})\vert\in\R_{+}$ is smaller than some tolerance $\xi\in\R_{>0}$. 
In contrast, global adaptive sampling strategies \citep{lammle2023} aim to improve the model quality over the entire input space. We are not interested in having accurate predictions where the error $\vert\delta(\Vector{x})\vert$ is much larger or smaller than $\xi$. Instead, we are primarily interested in learning the neighborhood where $\vert\delta(\Vector{x})\vert \approx \xi$. In classical engineering, \ac{ra} requires solving a similar problem \citep{rebba2008}.

Given a function $g(\cdot)$, \ac{ra} defines failure domains using the condition $g(\Vector{x}) \leq 0$. The aim of \ac{ra} is to compute the failure probability $P\sub{\mathcal{F}}=\E[\RV{X}\sim p(\Vector{x})]{\mathds{1}_{g(\RV{X})\leq0}(\RV{X})}$, where $\RV{X}$ are the input variables. Therefore, an accurate representation of the \emph{limit state} condition $g(\Vector{x}) = 0$ is necessary. \change{Equivalently, the problem of \ac{mv} can be framed as learning two \emph{level sets} \citep{gotovos2013} at once: $\Set{V}=\{\Vector{x}\colon\delta(\Vector{x})\leq\xi\}\cap\{\Vector{x}\colon\delta(\Vector{x})\geq-\xi\}$.} Despite the similarity of the problems and the popularity of active learning in \ac{ra} \change{and level set estimation}, to the best of our knowledge, a similar approach has not been proposed for \ac{mv}.

In this paper, we first formulate the problem of \ac{mv} as learning two limit state conditions ($\delta(\Vector{x})=\xi$ and $\delta(\Vector{x})=-\xi$), thus showing the connection to \ac{ra} problems. Based on this formulation, we propose an active learning method for \ac{mv} inspired by its \ac{ra} counterpart \cite{bichon2008}. We test our method on a variety of benchmark problems \change{within a small sample setting} and show that it can be reliably used for the validation of a multitude of \ac{ml} models.

\paragraph{Contributions.} Our main contributions are summarized as follows:
\textbf{1)} We introduce a new formulation of \acf{mv} inspired by \acf{ra}\change{, extending the setting to two symmetrical limit state conditions and noisy observations.} \textbf{2)} We propose a novel acquisition function based on the misclassification probability of the limit state (Section~\ref{sec:aq}). \textbf{3)} We derive frequentist error bounds for the proposed methodology (Section~\ref{sec:theory}). \textbf{4)} We evaluate the proposed method on a variety of different benchmarks (Section~\ref{sec:experiments}, Appendix~\ref{appendix:additional_results}), and provide a comparison with conformal prediction (Appendix~\ref{appendix:conformal_prediction}).

\section{Related Work}
\paragraph{\change{Reliability Analysis.}}
The goal of \ac{ra} is to calculate the failure probability $P\sub{\mathcal{F}}$. Since the distribution of $\RV{X}$ can be arbitrary, \ac{ra} often requires a large number of samples. 
To increase sample efficiency, previous research has utilized surrogate-aided methods based on \ac{ml} models such as neural networks \citep{papadrakakis1996}, ordinary \citep{bucher1990} and moving least squares models \citep{most2006}, \acp{gp} \citep{kaymaz2005}, and support vector machines \citep{rocco2002}. Although some active learning approaches have been already proposed \citep{macke2000, most2006}, \cite{bichon2008} was the first to introduce active learning based on \ac{gp} models to the field of \ac{ra}.

Following, several \acp{aqf} similar to Bayesian optimization methods have been proposed \change{to learn the limit state}. The most popular ones being the \emph{expected feasibility function} \citep{bichon2008} and the \emph{U-function} \citep{echard2011, echard2013}. For an extensive review, see \citep{teixeira2021}. \change{In contrast, active learning in the \ac{ml} context aims to achieve high accuracy \emph{globally} by using as few samples (labels) as possible. The approach is more frequently used in the context of classification tasks compared to regression. See \citep{settles2010} for an overview.}

\paragraph{\change{Level Sets.}}
\change{Similar to \ac{ra}, the task of level set estimation \citep{bryan2005, gotovos2013} aims to identify regions where the value of some target function is below or above a given threshold. Therefore, samples are placed according to an \ac{aqf} based on a \ac{gp} model to reduce the uncertainty about the level set. In this context, \ac{mv} can be seen as learning two joint level sets simultaneously. A more general approach, not restricted to level set estimation, was proposed by \cite{neiswanger2021}. The entropy-based method can be applied to arbitrary algorithmic outputs. However, the computational cost for evaluating the \ac{aqf} may be high, as a closed-form is unavailable.}

\paragraph{\change{Bayesian Calibration.}}
Another line of work initiated by \cite{kennedy2001} considers learning the model error to correct the output of a computational model similar to boosting techniques in \ac{ml}, while inferring the free parameters of the computational model. Therefore, the posterior distribution of the parameters and the model error are inferred jointly. Subsequent research incorporates physical knowledge by using a physics-informed prior \citep{spitieris2023} and extends the work to multiple outputs \citep{higdon2008a, white2023a}. \change{Nevertheless, one mayor criticism of this work is the identifiability issue, i.e., the effects of calibration parameters and model discrepancy can be confounded due to the over-parameterization of the model \citep{arendt2012, marmin2022}.}

\paragraph{\change{Conformal Prediction.}}
For distribution-free predictive inference, conformal prediction based on the work of \cite{vovk2005} is used to derive prediction intervals with frequentist coverage guarantees. However, most work focuses on marginal coverage \citep{vovk2005, papadopoulos2002, lei2018}, i.e., the model prediction is only marginally valid over the training data and the test points with a specified probability. In this context, our work can be seen as related to the setting of split-conformal prediction, where the data is partitioned into training and calibration sets. The latter is used to derive the prediction intervals for the trained model. Here, we aim to learn the model error, especially around the limit state, to assess the local validity of the model.

A more rigorous conditional coverage is known to be impossible to achieve without further assumptions about the underlying distribution \citep{barber2021}. Approximate conditional coverage has been considered in the work of \cite{lin2021}. 
Although they provide a method for approximate conditional coverage, it often requires more calibration samples to form meaningful prediction intervals (see Appendix~\ref{appendix:conformal_prediction}).

\section{Background}\label{sec:background}
Let $\Vector{x}\in\VSpace{X}\subset\R^d$ be a vector of observable \change{and controllable} inputs, with validation domain $\VSpace{X}$ of dimension $d$. Furthermore, for a model represented by a map $f\sub{M}\colon \VSpace{X} \rightarrow \VSpace{Y}$ between the inputs $\VSpace{X}$ and the \ac{qoi} $\VSpace{Y}\subset\R$, we want to assess the validity of $f\sub{M}$, i.e., estimate model misfit.

\paragraph{Observations.} An observation consists of the tuple $(\Vector{x}, y_x)$ where $y_x$ is assumed to be a sample from the random variable $Y_{\Vector{x}}=f\sub{E}(\Vector{x}) + \epsilon$. In this context, $f\sub{E}\colon \VSpace{X} \rightarrow \VSpace{Y}$ is the true data generating function and $\epsilon$ is the additive label noise. Here, we do not assume to have any knowledge about $f\sub{E}$, therefore we treat it as a black-box function. Moreover, $\epsilon$ is assumed to be homoscedastic Gaussian white noise $\epsilon\sim\mathcal{N}(0, \sigma\sub{e}^2)$, \change{where we use $\sim$ to denote ``distributed as'' in this work.}
We assume that the input can be measured precisely, such that any uncertainty about the true value of $\Vector{x}$ can be neglected. 

In general, a set of observations is used to train and possibly calibrate the model $f\sub{M}$. Another set of observations is used to estimate the generalization error and to validate $f\sub{M}$. Generating new observations is often costly, and only a finite number of observations are available in real-world applications. Therefore, data efficiency of both training and validation is crucial.

\paragraph{Validation Metric.}
\Ac{mv} can be performed by evaluating the model error within $\VSpace{X}$, given as
$\change{f\sub{D}}(\Vector{x}):=f\sub{M}(\Vector{x})-\Rv{Y}_{\Vector{x}}$, with $\change{f\sub{D}}(\Vector{x})\sim\mathcal{N}(f\sub{M}(\Vector{x})-f\sub{E}(\Vector{x}), \sigma\sub{e}^2)$.
For a given tolerance level $\xi\in\R_{>0}$, we quantify the probability of the model error being within the desired tolerance as
\begin{equation}
P\left(-\xi< \change{f\sub{D}}(\Vector{x})< \xi\right).\label{eq:rvm}
\end{equation}
Equation~\eqref{eq:rvm} is termed the \emph{reliability validation metric} and was proposed for \ac{mv} by \citet{rebba2008}, \citet{sankararaman2013}. Equivalently, we will represent Equation~\eqref{eq:rvm} as $P\left(g(\Vector{x})> 0 \right)=1-P\left(g(\Vector{x})\leq 0 \right)$,
where we have $g(\Vector{x}):= \xi - \vert \change{f\sub{D}}(\Vector{x})\vert$, which is in reliability theory often referred to as \emph{limit state function}. 
In the following, we will define the noiseless residual as $\delta(\Vector{x})=\E{\change{f\sub{D}}(\Vector{x})}=f\sub{M}(\Vector{x})-f\sub{E}(\Vector{x})$, which is unknown in practice, i.e., only $\xi$ and $\change{f\sub{D}}(\Vector{x})$ are available for validation iff. $\Vector{x}$ is already observed input.

\paragraph{\change{Differences to \ac{ra}.}}
The formulation of \ac{mv} leads to key differences to \ac{ra} that we have to consider: \textbf{1)} The formulation of Equation~\ref{eq:rvm} is analogous to having two limit state conditions in the setting of \ac{ra}. \textbf{2)} The distribution of the samples in the limit state is a folded Gaussian \citep{leone1961}, since $g(\cdot)$ is formulated in terms of the absolute value of $\change{f\sub{D}}(\cdot)$, which is Gaussian by assumption. \textbf{3)} The \ac{mv} limit state is corrupted by noise, while it is commonly considered noise-free in \ac{ra}\footnote{\change{With a suitable \ac{gp} model, we could extend \ac{ra} to the noisy setting if required (e.g., \citep{chun2024}).}}, \change{since $g(\cdot)$ is generally a simulation model with negligibly small numerical errors \citep{bucher1990}.} \textbf{4)} $\Vector{x}$ is subject to uncertainty in \ac{ra} problems, whereas we assumed no uncertainty in the context of \ac{mv}. Nevertheless, the \ac{mv} setting could be interpreted as a \ac{ra} problem in this respect, where $p(\Vector{x})$ is a uniform distribution defined over the entire $\VSpace{X}$. This represents our interest in assessing the validity of the model everywhere in $\VSpace{X}$ with equal importance.

\section{Method}\label{sec:method}
In this section, we derive our formulation for learning and representing the limit state $\Set{S}$. We start by defining local validity and the limit state for \ac{mv}. Next, we introduce the notation and active learning approach. In Section \ref{sec:gp}, we show how to represent the limit state function $g(\cdot)$ as a transformed \ac{gp} model, which is then used in Section \ref{sec:aq} to derive the \ac{aqf}. A stopping criterion is proposed in Section \ref{sec:stop} based on the probability of misclassifying the validity of a set of points. Additionally, in Section~\ref{sec:prediction} it is shown how to obtain the prediction of the local valid set. Finally, theoretical considerations are presented in Section~\ref{sec:theory}.

\subsection{Definition of Local Validity and Limit State}
Before describing the proposed method, proper definitions of local validity and the limit state for the \emph{noiseless} case (i.e., representing the ground truth) are provided.
\begin{defi}[Local Validity]\label{def:locval} A model $f\sub{M}$ is locally valid at $\Vector{x}\in\VSpace{X}$, given a tolerance level $\xi$, if $\xi - \vert \delta(\Vector{x})\vert\change{\geq}0$. Then, the valid region of $f\sub{M}$ is \begin{equation*}
    \mathcal{V}=\{\Vector{x}\in\VSpace{X}\colon \xi - \vert \delta(\Vector{x})\vert\geq0\}.
\end{equation*}
\end{defi}
Based on the definition of local validity, \emph{global validity} can be asserted if Definition~\ref{def:locval} holds for all $\Vector{x}\in\VSpace{X}$. Note the difference between global validity and valid on average, e.g., according to a prediction accuracy metric.
\begin{defi}[Limit State]\label{def:limit} Given a model $f\sub{M}$ and a tolerance level $\xi$, the limit state of $f\sub{M}$ is \begin{equation*}
    \mathcal{S}=\{\Vector{x}\in\VSpace{X}\colon \xi - \vert \delta(\Vector{x})\vert=0\}.
\end{equation*}
\end{defi}
\change{We can interpret Definition~\ref{def:limit} as the set of points on the boundary between the valid and invalid domains (see Figure~\ref{fig:local_valid}). Moreover, $\mathcal{S}$ is unavailable in practice, and our objective is to construct a strategy aimed at placing samples in the vicinity of $\mathcal{S}$, thereby efficiently learning to differentiate between valid and invalid domains for $f\sub{M}$.}

\subsection{Overview}
\paragraph{Notation.}
We consider learning the limit state $\Set{S}$ over the \emph{normalized} input space $\tilde{\VSpace{X}}\subseteq[0,1]^d$ from validation data $\Set{D}=\{(\Vector{x}_i, y_i)\}_{i=1}^n$, with input $\Vector{x}_i\in\tilde{\VSpace{X}}$ and label $y_i=\change{f\sub{D}}(\Vector{x}_i)$.
Equivalently, we represent training examples as $n\times d$ matrix $\Matrix{X}$, where the $i$-th row is the $i$-th training example $\Vector{x}_i$, with corresponding labels $\Vector{y}$. Furthermore, we employ a surrogate model $\hat{g}(\cdot)$ that provides, for some input $\Vector{x}$, a (conditional) probability distribution over the output $\hat{g}(\Vector{x})=\hat{\Rv{G}}_\Vector{x}\sim p(g\vert\Vector{x}, \mathcal{D})$. We can further use, e.g., the predictive mean $\mu_{g\vert\mathcal{D}}(\Vector{x})=\mathbb{E}[\hat{\Rv{G}}_\Vector{x}]$ or variance $\sigma^2_{g\vert\mathcal{D}}(\Vector{x})=\mathbb{V}[\hat{\Rv{G}}_\Vector{x}]$ of the model.

\paragraph{Active Learning.} Active learning strategies aim to reduce the evaluations of an expensive black-box function $f\sub{E}$, while still achieving a satisfactory result; in this context, learning the limit state with high accuracy for \ac{mv}. Therefore, an \emph{acquisition function} (\ac{aqf}) $\psi\colon\tilde{\VSpace{X}}\rightarrow\R$ is used to rate promising new sample points, often referred to as candidates. Learning is encouraged by maximizing $\psi$ over the candidate set $\Set{C}=\left\{\Vector{c}_i\right\}^{n\sub{c}}_{i=1}$, where $\Vector{c}_i$ is drawn uniformly from $[0,1]^d$ in our case. A new query $\Vector{x}^\ast$ is obtained as
\begin{equation*}
    \Vector{x}^\ast=\argmax_{\Vector{x}\in\Set{C}}\psi(\Vector{x}; \hat{g}, \Set{D}),
\end{equation*}
where $\hat{g}$ is the learned surrogate based on $\Set{D}$. The initial dataset can be generated from a space-filling design (e.g., Sobol \citep{joe2008} or \ac{lhs} \citep{mckay1979}). Algorithm~\ref{ag:almv} shows the active learning procedure.
From here on, we drop writing the explicit dependence of $\psi$ on $\Set{D}$ and $\hat{g}$.
\begin{algorithm}[t]
	\begin{algorithmic}
		\Require Initial data $\Set{D}$, candidate set $\Set{C}$, tolerance $\xi$, and acquisition function $\psi$ 
		\Repeat
		\State Train surrogate $\hat{g}$ with $\Set{D}$
		\State $\Vector{x}^\ast \gets \argmax_{\Vector{x}\in\Set{C}}\psi(\Vector{x}; \hat{g}, \Set{D})$
		\State $y^\ast \gets \change{f\sub{D}}(\Vector{x}^\ast)$
		\State $\Set{D} \gets \Set{D}\cup \{(\Vector{x}^\ast, y^\ast)\}$
		\State Generate new $\Set{C}$\Comment{optional}
		\Until{Termination condition}\Comment{e.g., maximum iterations}
		\State Train surrogate $\hat{g}$ with $\Set{D}$\\
	    \Return $\Set{D}$, $\hat{g}$
	\end{algorithmic}
	\caption{Active Learning Model Validation}\label{ag:almv}
\end{algorithm} 

\subsection{Gaussian Process}\label{sec:gp}
\acf{gp} is a popular probabilistic \ac{ml} method, which can be used to represent the belief over the objective function. Therefore, it is a central component to different learning schemes, e.g., in \ac{bo} \citep{snoek2012} or global improvement of surrogate models \citep{lammle2023}.

In the following, we use a transformed \ac{gp} model to represent our belief of the limit state
\begin{gather*}
    \hat{g}=\lambda\circ\change{\tilde{f}\sub{D}}\\
    \change{\tilde{f}\sub{D}}\sim\mathcal{GP}\left(\mu, k\right),
\end{gather*}
where \change{$\circ$ is the function composition,} $\mu\colon \tilde{\VSpace{X}} \rightarrow \R$ and $k\colon \tilde{\VSpace{X}} \times \tilde{\VSpace{X}} \rightarrow \R_+$ denote the \emph{mean} and the \emph{covariance} (kernel) functions, respectively. $\lambda(\cdot)$ represents the non-invertible mapping $\lambda(y): = \xi - \vert y\vert$.

\paragraph{Exact Prediction.}
Since the \ac{gp} is defined as a joint Gaussian distribution, the prediction at a point $\Vector{x}^\star$ can be analytically obtained as conditional Gaussian distribution $p(y\vert\Vector{x}^\star, \mathcal{D})\sim\mathcal{N}(\mu_{y\vert\Set{D}}(\Vector{x}^\star), \sigma^2_{y\vert\Set{D}}(\Vector{x}^\star))$, with mean and variance as
\begin{gather}
	\mu_{y\vert\Set{D}}(\Vector{x}^\star) = \mu(\Vector{x}^\star) +  \Vector{k}^{T} (\Matrix{K}^{-1}+\hat{\sigma}\sub{e}^2\Matrix{I}) \Vector{y}, \label{eq:pred_mu}\\
	\sigma^2_{y\vert\Set{D}}(\Vector{x}^\star) =  
	k(\Vector{x}^\star, \Vector{x}^\star) - \Vector{k}^T (\Matrix{K}^{-1}+\hat{\sigma}\sub{e}^2\Matrix{I}) \Vector{k}, \label{eq:pred_var}
\end{gather}
where $\Matrix{K}$ contains all pairs of kernel entries (i.e., $\Matrix{K}_{ij}=k(\Vector{x}_i, \Vector{x}_j)$), and $\Vector{k}$ denotes the vector of correlations between $\Vector{x}^\star$ and training points, $\Vector{k}_i =k(\Vector{x}_i, \Vector{x}^\star)$. $\hat{\sigma}\sub{e}^2$ is the estimated noise variance.

\paragraph{Learning Hyperparameters.}
Predictions made with a \ac{gp} depend on the \emph{hyperparameters} $\hat{\sigma}\sub{e}^2$ and $\Vectors{\theta}$, e.g., \change{noise variance, kernel lengthscale, or possibly the parameters of the mean function $\mu(\Vector{x}^\star)$.}
These \change{hyperparameters} could be obtained by maximizing the log \emph{marginal likelihood}
\begin{equation*}
	\log p(\Vector{y} \vert \Matrix{X}, \Vectors{\theta}, \hat{\sigma}\sub{e}^2) \propto  -\dfrac{1}{2} \log \vert \Matrix{K} \vert 
	- \dfrac{1}{2}\Vector{y}^T (\Matrix{K}^{-1}+\hat{\sigma}\sub{e}^2\Matrix{I}) \Vector{y}.
\end{equation*}
If prior knowledge is available, it can be beneficial to use instead the \ac{map} estimate as
\begin{equation*}
    \hat{\Vectors{\theta}}\sub{MAP} = \argmax_{\Vectors{\theta}, \hat{\sigma}\sub{e}^2}
	\log p(\Vector{y} \vert \Matrix{X}, \Vectors{\theta}, \hat{\sigma}\sub{e}^2) + \log p(\Vectors{\theta}, \hat{\sigma}\sub{e}^2),
	 \label{eq:map}
\end{equation*}
where $p(\Vectors{\theta}, \hat{\sigma}\sub{e}^2)$ are the specified priors over the hyperparameters.

\paragraph{Limit State Prediction.} 
The prediction of the limit state is obtained by the mapping $\lambda(\cdot)$. 
We can give closed form solutions for $\mu_{g\vert\mathcal{D}}(\cdot)$ and $\sigma^2_{g\vert\mathcal{D}}(\cdot)$, since $\hat{\Rv{G}}_{\Vector{x}}$ is a folded Gaussian distribution flipped and shifted by $\xi$ (see Figure~\ref{fig:folded_normal}). Therefore, we have
\begin{gather*}
    \mu_{g\vert\Set{D}}(\Vector{x}^\star) = \xi -  \left(\sigma_\star\sqrt{\frac{2}{\pi}}\zeta + \mathrm{erf}\left(\frac{\mu_\star}{\sqrt{2\sigma_\star^2}}\right)\mu_\star\right),
    \\
    \sigma^2_{g\vert\Set{D}}(\Vector{x}^\star) = \mu_\star^2 + \sigma_\star^2 - \mu_{g\vert\Set{D}}^2(\Vector{x}^\star),
\end{gather*}
where we denoted $\mu_\star=\mu_{y\vert\mathcal{D}}(\Vector{x}^{\star})$, $\sigma^2_\star=\sigma^2_{y\vert\mathcal{D}}(\Vector{x}^{\star})$, and $\zeta=\exp\left(\frac{-\mu_\star^2}{2\sigma_\star^2}\right)$. $\mathrm{erf}(\cdot)$ is the Gaussian error function.

\subsection{Acquisition Function}\label{sec:aq}
The \ac{aqf} is used to guide the sampling strategy in regions of interest. Especially for \ac{ra} and validation, we are interested to sample in the vicinity of the limit state $\mathcal{S}$. For this purpose, several \acp{aqf} were proposed in \ac{ra}, e.g., see \citet{bichon2008,echard2011}.
Among them, the so called "U-function" \citep{echard2011} is widely used in \ac{ra} \citep{dubourg2013, teixeira2021}.

\paragraph{U-Function.} The \ac{aqf} focuses on the design subspace near the limit state boundary considering an exploration-exploitation trade-off. It is given by
\begin{equation}
    \psi\sub{U}(\Vector{x})=-\frac{\vert \mu_{g\vert\Set{D}}(\Vector{x}) \vert}{\sigma_{g\vert\Set{D}}(\Vector{x})},\label{eq:ufun}
\end{equation}
where $\mu_{g\vert\Set{D}}(\Vector{x})$ and $\sigma_{g\vert\Set{D}}(\Vector{x})$ are predictive mean and standard deviation of the probabilistic model, respectively. \change{$\psi\sub{U}$ selects samples which have large variance and are close to the limit state according to the GP. $\mu_{g\vert\Set{D}}(\Vector{x})$ is small only if $\Vector{x}$ is close to the limit state.} Note that Equation~\ref{eq:ufun} is different from the original formulation \citep{echard2011} in our setting, since the \ac{gp} predictive mean and standard deviation are transformed by $\lambda(\cdot)$. 
\paragraph{MC-Prob.} Maximizing $\psi\sub{U}$ was originally derived to be equivalent to maximizing the probability of misclassifying the limit state condition under the Gaussian assumption \citep{echard2013}. 
However, the equivalence does not hold for a folded Gaussian posterior. Instead, we propose using the misclassification probability directly as
\begin{equation*}
    \psi\sub{mis}(\Vector{x}; \omega) = \begin{cases}P\left(\hat{\Rv{G}}_{\Vector{x}}\leq-\omega\right), & \text{for } \vert\mu_{y\vert\mathcal{D}}(\Vector{x})\vert\leq\xi\\ 1 - P\left(\hat{\Rv{G}}_{\Vector{x}}\leq\omega\right), & \text{for } \vert\mu_{y\vert\mathcal{D}}(\Vector{x})\vert>\xi,\end{cases}
\end{equation*}
where $\omega\in\R_+$ determines the exploration-exploitation trade-off, i.e., it gives the misclassification probability around the limit state with a small slack variable $\omega < \xi$. Larger values of $\omega$ encourage more exploration.
The closed-form expression for the \ac{cdf} of $\hat{\Rv{G}}_{\Vector{x}}$ is given by
\begin{align}
\begin{split}\label{eq:cdf_ffolded}
    P\left(\hat{\Rv{G}}_{\Vector{x}}\leq\omega\right) =&\; 2 - \Phi\left(\frac{\xi-\omega + \mu_{y\vert\mathcal{D}}(\Vector{x})}{\sigma_{y\vert\mathcal{D}}(\Vector{x})}\right) \\&- \Phi\left(\frac{\xi-\omega - \mu_{y\vert\mathcal{D}}(\Vector{x})}{\sigma_{y\vert\mathcal{D}}(\Vector{x})}\right),
\end{split}
\end{align}
where $\Phi(\cdot)$ is the standard normal distribution. See Appendix~\ref{appendix:deriv_fncdf} for a derivation of Equation~\ref{eq:cdf_ffolded} as well as Appendix~\ref{appendix:misclass} for some additional insights on the misclassification probability.

\subsection{Stopping Criterion}\label{sec:stop}
The criterion for stopping the sequential strategy is a crucial part of the method. One straightforward approach is to use the probability of misclassifying a candidate point $\Vector{x}$ under the transformed \ac{gp} posterior, as given by $\psi\sub{mis}$. Then, we stop by reaching a predefined tolerance $\alpha\geq \tilde{P}\sub{mis}=\E[\RV{X}\sim p(\Vector{x})]{\psi\sub{mis}(\RV{X}; \omega=0)}$. 
In practice, we estimate this expectation based on the candidate set $\Set{C}$. The criterion can be made more robust by ensuring the above condition in $k$ consecutive iterations. Additionally, we introduce a sampling budget, i.e., we limit the maximum number of samples obtained. Therefore, we stop if the budget is exhausted or a suitable $\tilde{P}\sub{mis}$ is achieved in time.

\subsection{Prediction}\label{sec:prediction}
The learned \ac{gp} model represents our final belief of the limit state, which we can use to decide if $f\sub{M}$ is locally valid at an arbitrary $\Vector{x}\in\tilde{\VSpace{X}}$. Hence, we predict the local valid set, analogous to Definition~\ref{def:locval}, as 
\begin{equation}\label{eq:pred_valid}
    \tilde{\mathcal{V}}=\left\{\Vector{x}\in\tilde{\VSpace{X}}\colon \xi - \vert \mu_{y\vert\Set{D}}(\Vector{x})\vert\change{\geq}0\right\}.
\end{equation}
In safety-critical applications, preventing false positives can become more important than incorrectly classifying a sample to be invalid, as also noted by \cite{reeb2023}. Therefore, depending on the application, we may be more risk averse than using $\tilde{\Set{V}}$ directly. 
Instead, we can derive the predicted local valid set with $1-\alpha$ confidence, for $\alpha\in(0,1)$, as 
\begin{equation}\label{eq:pred_valid_conf}
    \tilde{\mathcal{V}}_\alpha=\left\{\Vector{x}\in\tilde{\VSpace{X}}\colon q_{\alpha}(\Vector{x}) \change{\geq} 0\right\},
\end{equation}
with quantile $q_{\alpha}(\Vector{x}):=\inf\{\tilde{g}\in\R: P(\hat{\Rv{G}}_{\Vector{x}}\leq \tilde{g})\geq \alpha\}$.

It can be seen from Equation~\eqref{eq:pred_valid} that $\xi$ can be changed post-hoc. However, if $\vert\xi\sub{old} - \xi\sub{new}\vert$ is large, the model may not be sufficiently accurate, as the samples are usually placed near $\xi\sub{old}$, and the new limit state resulting from $\xi\sub{new}$ could be far away from the old one.

\begin{figure}[t]
  \centering
  \includegraphics[width=0.8\linewidth]{./tricky_FCProb_20.jpg}
  \caption{Prediction $\tilde{\Set{V}}$ (Equation~\ref{eq:pred_valid}) for the modified Rastrigin function after 20 initial and 70 adaptive observations, with $\psi\sub{mis}$ and $\omega=0.2\xi$. The true limit state is represented by the black line.}\label{fig:tricky}
\end{figure}

\subsection{Theoretical Considerations}\label{sec:theory}
By \cite[Thm 9]{lederer2021} (\cite[Thm 3.1]{lederer2019}, resp.), the regression error of the GP model is under certain conditions bounded in terms of the posterior variance:
\begin{theo} \label{thm:errorbound}
 Assume that $\delta$ is a Lipschitz continuous sample from the zero mean Gaussian process with covariance kernel $k$ with Lipschitz constant $L_k$ on the compact set $\tilde{\VSpace{X}}$. Denote the Lipschitz constant of $\delta$ by $L_\delta$. Then, $\mu_{y\vert\Set{D}}(\cdot)$ and $\sigma_{y\vert\Set{D}}^2(\cdot)$ are continuous with Lipschitz constants $L_\mu$ and $L_{\sigma^2}$ on $\tilde{\VSpace{X}}$ such that
\begin{align}
L_{\mu}&\leq L_k \sqrt{n}\Vert\left(\Matrix{K}+\sigma\sub{e}^2\Matrix{I}\right)^{-1}\Vector{y}\Vert \label{ineq:L_mu} \\
L_{\sigma^2}&\leq 2 L_k \Big(1+n\Vert \left(\Matrix{K}+\sigma\sub{e}^2\Matrix{I}\right)^{-1}\Vert~ k^* \Big), \label{ineq:L_sigma2}
\end{align}
where $k^* := \max_{\Vector{x}, \Vector{x}'\in\mathbb{X}}k(\Vector{x}, \Vector{x}')$. Moreover, pick $\alpha\in(0,1)$, $\tau\in\R_+$ and set
\begin{equation*}
    \beta(\tau)=2\log\left(\frac{M(\tau, \tilde{\mathbb{X}})}{\alpha}\right)
\end{equation*}
\begin{equation*}
    \gamma(\tau)=(L_{\mu} + L_\delta)\tau+\sqrt{\beta(\tau) L_{\sigma^2} \tau},
\end{equation*}
where $M(\tau, \tilde{\VSpace{X}})$ is the $\tau$-covering number of $\tilde{\VSpace{X}}$.
Then, it holds that
\begin{align}
P\left(\vert\delta(\Vector{x})-\mu_{y\vert\Set{D}}(\Vector{x})\vert\leq \eta(\Vector{x}), \forall\Vector{x}\in\tilde{\VSpace{X}}\right)\geq 1-\alpha \label{ineq:errorbound}
\end{align}
where $\eta(\Vector{x})=\sqrt{\beta(\tau)}\sigma_{y\vert\Set{D}}(\Vector{x})+\gamma(\tau)$.
\end{theo}
For $\tilde{\VSpace{X}}\subseteq[0,1]^d$, it holds
\begin{equation*}
    M(\tau, \tilde{\VSpace{X}}) \leq M(\tau, [0,1]^d) = \left(\frac{\sqrt{d}}{2\tau}\right)^d.
\end{equation*}
In contrast to the mere consideration of $\sigma_{y\vert\Set{D}}$, Theorem \ref{thm:errorbound} implies convergence of $\mu_{y\vert\Set{D}}$ to $\delta$ if $\eta$ converges to zero sufficiently fast as $n \rightarrow \infty$. According to the posterior variance bounds in \cite[Section 3]{lederer2021}, this is particularly the case if a sufficient number of samples are close to $\Vector{x}$. Since we are mainly interested in a small error near the limit state, it is important to choose the adaptive sampling method accordingly. The proposed \ac{aqf} is designed exactly for this purpose; it intuitively prefers samples which are presumably close to the limit state and have high posterior variance. $\eta(\Vector{x})$ can be computed explicitly and yields an uniform error bound on $\tilde{\VSpace{X}}$. Since
\begin{equation*}
    \vert\delta(\Vector{x})\vert \leq \vert\delta(\Vector{x})-\mu_{y\vert\Set{D}}(\Vector{x})\vert + \vert \mu_{y\vert\Set{D}}(\Vector{x})\vert,
\end{equation*}
these bounds can be considered as an alternative to the use of confidence intervals of the \ac{gp}. \change{However, Theorem~\ref{thm:errorbound} requires additional knowledge on the Lipschitz continuity of the covariance kernel as well as $\delta$ and is therefore not generally applicable. Furthermore, an exemplary computation shows that the obtained results are very conservative, in particular, if the bounds in (\ref{ineq:L_mu}) and (\ref{ineq:L_sigma2}) are used (see Appendix~\ref{appendix:errorbounds}). Moreover, we found that the \ac{gp} confidence intervals provide reliable bounds in most cases.}

\section{Experiments}\label{sec:experiments}
Experiments are conducted to demonstrate the sample efficiency of the presented method on a variety of  problems, ranging from analytical benchmark functions to trained \ac{ml} models on analytical problems, and some real-world tabular datasets from \texttt{OpenML} \citep{OpenML2013}.\footnote{https://github.com/SvenL13/LocalValidity}
Results for the latter are shown in Appendix~\ref{appendix:tabular_bench}, while the influence of label noise is studied in Appendix~\ref{appendix:noise}, and results for the stopping criterion are presented in Appendix~\ref{appendix:stop}.

\paragraph{\change{Target.}}
The learning target for our benchmarks is to correctly predict valid regions $\Set{V}$ of the model $f\sub{M}$ as shown in Figure~\ref{fig:tricky}. Thus, we can frame the task as a binary classification problem (positive class for valid, and negative for invalid) and report precision and recall, where the $F_1$-score is used as a summary within plots.
\change{For the \ac{gt} labels, we evaluate $\xi - \vert\delta(\Vector{x})\vert$, where $\delta(\Vector{x})=f\sub{M}(\Vector{x})-f\sub{E}(\Vector{x})$ is the noiseless residual. 
Further, we split into valid (positive) GT label if the value is not negative and invalid label otherwise (Definition \ref{def:locval}).}

\paragraph{Setup.}
The initial, candidate, and test datasets are drawn quasi-uniform via \ac{lhs}, with $10d$ initial observations as proposed by \cite{loeppky2009}, where $d$ is the dimension of the validation domain. We restrict the number of adaptive samples to $50d$ across all experiments. \change{For the next sample point, we draw $\min(5000d, 50000)$ candidates in each iteration and choose the one that achieves the largest \ac{aqf} value.} 
The \ac{gp} is trained in each iteration until 100 samples are observed. Thereafter, training is conducted every 4-th iteration to reduce computational effort. Even if the model is not retrained, we update the \ac{gp} with the observed data \change{and keep the \ac{gp} hyperparameters fixed}. For the benchmarks, we conduct 30 restarts with different initializations, test sets, and seeds, if not stated otherwise. In every 5-th iteration, we compare the prediction $\tilde{\Set{V}}$ (Equation~\ref{eq:pred_valid}) with the \ac{gt} target based on $\min(25000d, 250000)$ test samples. Additionally, the results for $\tilde{\Set{V}}_{0.1}$ are shown in Appendix~\ref{appendix:risk_averse}, and further implementation details of our method can be found in Appendix~\ref{appendix:impl}.

\subsection{Analytical Benchmark Functions}\label{sec:bench_analytical}
\paragraph{\change{Methodology.}} \change{For the first benchmark, we consider various analytical functions $f(\Vector{x})$ to assess method performance under well-controlled conditions, where $f(\Vector{x})$ represents the error surface ($\delta(\Vector{x}):=f(\Vector{x})$). The tested methods have only access to noisy evaluations $\change{f\sub{D}}(\Vector{x})=f(\Vector{x})-\epsilon$, where $\epsilon\sim\mathcal{N}(0, \sigma\sub{e}^2)$. We use the Styblinsky-Tang \citep{styblinski1990} \change{and Michalewicz function \citep{michalewicz1992}} with varying dimensions, as well as two 2-d benchmark functions popular in \ac{ra}, namely a modified Rastrigin \citep{torn1989} and a series system function \citep{waarts2000} as benchmark functions.} 
Definitions of these analytical functions are given in Appendix~\ref{appendix:bench}.

We compare $\psi\sub{mis}$ with $\omega=0.2\xi$ and $\omega=0.0$  \change{against several baselines. The first one is given by $\psi\sub{U}$ \citep{echard2011} adapted to our setting (Equation~\eqref{eq:ufun}). Secondly, a random sampling baseline is used, where we select a sample from the candidates with equal probability instead of maximizing an \ac{aqf}.
Further, we consider running \ac{ra} individually for lower and upper tolerance bound with the original U-function \citep{echard2011}, denoted $\psi\sub{U2}$. Therefore, we use the sum of two \acp{aqf} with two \acp{gp}, each for $\xi-f\sub{D}(\Vector{x})$ and $\xi+f\sub{D}(\Vector{x})$. Both models observe all samples, even if only one of the \acp{gp} is used to query a new sample, i.e., we train and evaluate them on the same inputs. Finally, we consider the smallest margin method \citep{scheffer2001} with a \ac{xgb} classifier \citep{chen2016}, as this seems to be a reasonable baseline \citep{cawley2011, yang2018} to represent that direction of research.} Further settings for the benchmark are given in Table~\ref{tab:setup_analytical}.
\begin{table}[ht]
    \centering
    \caption{Experimental settings for the analytical benchmark functions.}
    \begin{tabular}{lcccc}
        \toprule
        \bfseries Benchmark function & $\xi$ & $\sigma\sub{e}$ & $n\sub{init}$ & $n\sub{adapt}$\\
        \midrule
        Styblinski-Tang & $30$ & $5$ & 10d & 50d\\
        \change{Michalewicz} & $0.07$ & $0.01$ & 10d & 50d\\
        Mod. Rastrigin & $20$ & $0.1$ & 10d & 50d\\
        Series System & $3$ & $0.5$ & 10d & 50d\\
        \bottomrule
    \end{tabular}
    \label{tab:setup_analytical}
\end{table}

 \begin{figure*}[t]
  \centering
  \includegraphics[width=\textwidth]{analytical_f1-score}
  \caption{Median and $95$\% confidence intervals of $F1$-score on the analytical problem functions across 30 runs. Top: Styblinsky-Tang for 2 to 8 dimensions. Bottom: Modified Rastrigin (2-d), series system function (2-d), \change{and Michalewicz function (4-d, 6-d).}}\label{fig:analytical}
\end{figure*}

\paragraph{Results.} \change{The results are given in Figure~\ref{fig:analytical}, where we report median and $95$\% confidence bounds. The smallest margin baseline is partially not shown to improve visibility, as it underperforms other methods, with mean $F_1$-score of 39.35\% across samples and final score of 73.99\% at the last sample. This may be due to the smaller number of available samples compared to applications in previous research.}
It can be seen that the additional exploration ($\omega=0.2\xi$) in $\psi\sub{mis}$ increases performance slightly upon its counterpart $\psi\sub{mis}$ with $\omega=0$. \change{Further, we found that $\psi\sub{U2}$ can be prone to model misspecification, as can be observed for the Rastrigin function, since we have to learn two \acp{gp}.} 

\change{Overall, $\psi\sub{mis}$ with $\omega=0.2\xi$ shows significant improvement over the baselines, achieving an average $F_1$-score of 76.5\% and a final score of 92\%. 
In contrast, $\psi\sub{U2}$ shows the weakest performance among \ac{gp}-based strategies, with an average $F_1$-score of 75.21\%, slightly improving over the random baseline with 74.55\%.}

\subsection{Benchmark Model Validation}\label{sec:ml_bench}
\paragraph{\change{Methodology.}}\change{
We evaluate $\psi\sub{mis}$ ($\omega=0.2\xi$), $\psi\sub{U}$, and the random baseline for a more realistic problem setting, where the model under validation $f\sub{M}$ is given by a trained \ac{ml} model. Since $\psi\sub{mis}(\omega=0.2\xi)$ outperformed $\psi\sub{mis}(\omega=0)$ on the analytical benchmark functions, we do not expect $\omega=0$ to perform significantly different here.} 

\change{Training data for $f\sub{M}$ was obtained from an analytical benchmark function $f\sub{E}$ via \ac{lhs} and acquiring noisy labels. In particular, only noisy evaluations of $f\sub{E}$ were available both for training and validation, whereas noise-free samples were used for testing the validation models. In all test cases, $f\sub{M}$ was ensured to be invalid in some but not all regions (w.r.t input domain). This scenario is common in practice, where a model can perform a task fairly well on average, but remains invalid in certain regions.}
\begin{table}[!t]
    \centering
    \caption{Experimental settings used in \ac{ml} benchmark.}
    \begin{tabular}{lccccc}
        \toprule
        \bfseries Bench. & \bfseries Dim. & \bfseries $\xi$ & \bfseries $\sigma\sub{e}$ & $n\sub{init}$ & $n\sub{adapt}$\\
        \midrule
        \multirow{3}{*}{Micha.} & 2 & $0.3$ & $0.03$ & 20 & 100\\
         & 4 & $0.6$ & $0.03$ & 40 & 200\\
         & 8 & $0.9$ & $0.03$ & 80 & 400\\ \hline
        \multirow{3}{*}{Rosen.} & 2 & $250$ & $5$ & 20 & 100\\
         & 4 & $500$ & $5$ & 40 & 200\\
         & 8 & $1000$ & $5$ & 80 & 400\\
        \bottomrule
    \end{tabular}
    \label{tab:setup_ml_bench}
\end{table}
\begin{table*}[ht!]
    \centering
    \caption{Mean and standard error of precision and recall across \ac{rr}, \ac{svr}, \ac{rf}, and \ac{xgb} models. Scores for mean and final result across samples and 30 runs are reported. Bold numbers represent the best result.}\label{tab:ml_bench}
\begin{tabular}{lc|ccc|ccc}
\toprule
\textbf{Benchmark} & \textbf{Dimension} & \multicolumn{3}{c}{\textbf{Mean Precision [\%]}} & \multicolumn{3}{|c}{\textbf{Final Precision [\%]}} \\
 &  & $\psi\sub{mis, 0.2}$ & $\psi\sub{U}$ & Random & $\psi\sub{mis, 0.2}$ & $\psi\sub{U}$ & Random \\
\midrule
\multirow[c]{3}{*}{Michalewicz} & 2 & $\mathbf{97.3}\scriptstyle\textcolor{gray}{\pm0.1}$ & $96.9\scriptstyle\textcolor{gray}{\pm0.1}$ & $96.5\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{98.5}\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.6\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.4\scriptstyle\textcolor{gray}{\pm0.1}$ \\
 & 4 & $\mathbf{95.1}\scriptstyle\textcolor{gray}{\pm0.1}$ & $94.9\scriptstyle\textcolor{gray}{\pm0.1}$ & $94.5\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{96.8}\scriptstyle\textcolor{gray}{\pm0.2}$ & $96.7\scriptstyle\textcolor{gray}{\pm0.1}$ & $96.2\scriptstyle\textcolor{gray}{\pm0.2}$ \\
 & 8 & $\mathbf{88.5}\scriptstyle\textcolor{gray}{\pm0.0}$ & $88.4\scriptstyle\textcolor{gray}{\pm0.0}$ & $88.1\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{89.7}\scriptstyle\textcolor{gray}{\pm0.1}$ & $89.6\scriptstyle\textcolor{gray}{\pm0.1}$ & $89.2\scriptstyle\textcolor{gray}{\pm0.1}$ \\
\hline
\multirow[c]{3}{*}{Rosenbrock} & 2 & $\mathbf{97.0}\scriptstyle\textcolor{gray}{\pm0.1}$ & $95.9\scriptstyle\textcolor{gray}{\pm0.2}$ & $95.5\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{98.2}\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.0\scriptstyle\textcolor{gray}{\pm0.3}$ & $96.9\scriptstyle\textcolor{gray}{\pm0.2}$ \\
 & 4 & $\mathbf{94.7}\scriptstyle\textcolor{gray}{\pm0.1}$ & $93.0\scriptstyle\textcolor{gray}{\pm0.1}$ & $92.1\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{96.1}\scriptstyle\textcolor{gray}{\pm0.1}$ & $94.9\scriptstyle\textcolor{gray}{\pm0.2}$ & $94.5\scriptstyle\textcolor{gray}{\pm0.2}$ \\
 & 8 & $\mathbf{92.6}\scriptstyle\textcolor{gray}{\pm0.1}$ & $90.6\scriptstyle\textcolor{gray}{\pm0.1}$ & $89.9\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{94.5}\scriptstyle\textcolor{gray}{\pm0.1}$ & $92.7\scriptstyle\textcolor{gray}{\pm0.2}$ & $92.2\scriptstyle\textcolor{gray}{\pm0.1}$ \\
\hline
  &  & \multicolumn{3}{c}{\textbf{Mean Recall [\%]}} & \multicolumn{3}{|c}{\textbf{Final Recall [\%]}} \\\hline
 \multirow[c]{3}{*}{Michalewicz} & 2 & $99.4\scriptstyle\textcolor{gray}{\pm0.0}$ & $\mathbf{99.6}\scriptstyle\textcolor{gray}{\pm0.0}$ & $99.1\scriptstyle\textcolor{gray}{\pm0.1}$ & $99.4\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{99.6}\scriptstyle\textcolor{gray}{\pm0.0}$ & $99.3\scriptstyle\textcolor{gray}{\pm0.1}$ \\
 & 4 & $98.7\scriptstyle\textcolor{gray}{\pm0.0}$ & $98.8\scriptstyle\textcolor{gray}{\pm0.0}$ & $\mathbf{99.0}\scriptstyle\textcolor{gray}{\pm0.1}$ & $98.7\scriptstyle\textcolor{gray}{\pm0.2}$ & $98.7\scriptstyle\textcolor{gray}{\pm0.2}$ & $\mathbf{98.9}\scriptstyle\textcolor{gray}{\pm0.1}$ \\
 & 8 & $98.2\scriptstyle\textcolor{gray}{\pm0.1}$ & $98.3\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{98.9}\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.1\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.3\scriptstyle\textcolor{gray}{\pm0.2}$ & $\mathbf{98.0}\scriptstyle\textcolor{gray}{\pm0.1}$ \\
\hline
\multirow[c]{3}{*}{Rosenbrock} & 2 & $98.8\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{99.2}\scriptstyle\textcolor{gray}{\pm0.1}$ & $98.5\scriptstyle\textcolor{gray}{\pm0.1}$ & $99.2\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{99.4}\scriptstyle\textcolor{gray}{\pm0.1}$ & $98.6\scriptstyle\textcolor{gray}{\pm0.1}$ \\
 & 4 & $93.0\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{96.4}\scriptstyle\textcolor{gray}{\pm0.1}$ & $96.2\scriptstyle\textcolor{gray}{\pm0.2}$ & $95.8\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{97.2}\scriptstyle\textcolor{gray}{\pm0.2}$ & $96.8\scriptstyle\textcolor{gray}{\pm0.2}$ \\
 & 8 & $91.5\scriptstyle\textcolor{gray}{\pm0.2}$ & $97.0\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{97.2}\scriptstyle\textcolor{gray}{\pm0.1}$ & $92.1\scriptstyle\textcolor{gray}{\pm0.2}$ & $96.6\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{96.8}\scriptstyle\textcolor{gray}{\pm0.1}$ \\
\bottomrule
\end{tabular}
\end{table*}

\change{Four classes of \ac{ml} models are considered for $f\sub{M}$: \ac{rr}, \ac{svr}, \ac{rf}, and \ac{xgb} regression, which exhibit different error surfaces with varying difficulty. $f\sub{E}$ is given by the Michalewicz or the Rosenbrock function \citep{rosenbrock1960} with dimensions ranging from 2 to 8. The tolerance $\xi$ is chosen such that we obtain partially valid models, with valid ratio ranging from 0.75 to 0.99. For Rosenbrock we kept hyperparameters of the \ac{ml} models fixed, while we found the need to tune them via \ac{bo} for Michalewicz to obtain models that are at least partially valid. Furthermore, we fix the trained models across the 30 repetitions. The experimental settings are given in Table~\ref{tab:setup_ml_bench}, and further details regarding the \ac{ml} models hyperparameters and analytical functions can be found in Appendix~\ref{sec:details_ml_bench}.}

\paragraph{Results.} 
The results of our experiments are shown in Table~\ref{tab:ml_bench}, where we report the mean and the last value across samples for precision and recall. We want to emphasize the importance of preventing false positives, i.e., falsely judging a model to be valid, as captured by the precision score \change{from the perspective of safety-critical applications}. In contrast, the consequences of false negatives, as reflected by the recall score, are not as severe. Furthermore, we expect the mean across samples to be informative about the overall performance and sample efficiency, while the final value is relevant to the end performance of the method.

It can be seen that $\psi\sub{mis}$ outperforms both the baseline and $\psi\sub{U}$ in terms of average and final precision across almost all tested cases with regard to precision. While all strategies achieved high recall scores $(>90\%)$, $\psi\sub{U}$ and the random sampling baseline slightly outperform $\psi\sub{mis}$ in this regard.

\section{Conclusion}
Assessing the validity of a model across the range of inputs can be challenging due to the expense of gathering additional validation data. To address this issue, we developed a novel formulation of the local validity problem for \ac{ml} models inspired by active learning commonly used in \ac{ra}. Based on this foundation, we proposed a new acquisition function (MC-Prob) that uses the misclassification probability, which can be evaluated in closed-form.

MC-Prob intuitively places samples near the limit state and can learn the boundary between valid and invalid regions of the model. Empirically, MC-Prob improves upon its counterpart, U-Fun, across several benchmarks and two real-world examples, reducing the probability of incorrectly classifying the model as valid, as desired in most safety applications. In contrast to existing conformal prediction methods, our approach can significantly reduce the required amount of data while maintaining accurate predictions. The scalability to higher dimensions remains open, possibly by replacing the \ac{gp} model \citep{hensman2013} or by exploiting the lower intrinsic dimensionality expected in most real-world data \citep{wang2016}.

\begin{acknowledgements}
    This work was supported by ZF Friedrichshafen AG.
\end{acknowledgements}

% References
\bibliography{Library}

\newpage

\onecolumn

\title{Quantifying Local Model Validity via Active Learning\\(Supplementary Material)}
\maketitle

\appendix

\section{Derivation of Equation~\ref{eq:cdf_ffolded}}\label{appendix:deriv_fncdf}
Let $\Rv{Y} = \vert\Rv{X}\vert$, where $\Rv{X}\sim\mathcal{N}(\mu, \sigma^2)$ with mean $\mu$ and variance $\sigma^2$. Then, $Y$ follows a folded Gaussian \citep{leone1961}, 
with parameters $\mu$ and $\sigma^2$. The \ac{cdf} is given by \citep{tsagris2014}
$$F(x)=0.5\left(erf\left(\frac{x-\mu}{\sigma\sqrt{2}}\right) + erf\left(\frac{x+\mu}{\sigma\sqrt{2}}\right)\right),$$
where $erf(x)=2/\sqrt{\pi}\int_0^x \exp(-t^2)dt$ is the error function.
Further, let $\Rv{Z}=a-\Rv{Y}=a-\vert\mu+\sigma\Rv{X}\vert$. Then, for $x\in(0, \infty)$,
\begin{align*}
    F(x)&=P(Z\leq x)=P(a-\vert\mu + \sigma\Rv{X}\vert\leq x)=1-P(\vert\mu+\sigma\Rv{X}\vert\leq \underbrace{-x+a}_{:=z})=1-P(-z\leq\mu+\sigma\Rv{X}\leq z)\\
    &=1-P\left(\frac{-z-\mu}{\sigma}\leq\Rv{X}\leq \frac{z-\mu}{\sigma}\right)\\
    &=1+\Phi\left(\frac{-z - \mu}{\sigma}\right) - \Phi\left(\frac{z - \mu}{\sigma}\right)\\
    &=2-\Phi\left(\frac{z + \mu}{\sigma}\right) - \Phi\left(\frac{z - \mu}{\sigma}\right),
\end{align*}
where $\Phi(\cdot)$ is the standard normal \ac{cdf}, and $z:=-x+a$. The last step follows since $\Phi(-x)=1-\Phi(x)$. We obtain the formulation in Equation~\ref{eq:cdf_ffolded} by using $x=\omega$ and $a=\xi$, with folded Gaussian parameters $\mu$ and $\sigma^2$ given by the \ac{gp} predictive mean and variance. See Figure~\ref{fig:folded_normal} for a illustration.
\begin{figure}[t]
  \centering
  \includegraphics[width=0.6\linewidth]{./folded_normal.pdf}
  \caption{\ac{gp} prediction of the limit state function $g$ is a folded Gaussian distribution, which is flipped and shifted by the predefined tolerance $\xi$. The filled area shows the misclassification probability $\psi\sub{mis}$.}\label{fig:folded_normal}
\end{figure}

\section{Limitations and Discussion}
We have shown that our approach can learn the limit state $\Set{S}$ and predict the valid set $\Set{V}$ with a reduced number of validation samples. However, it is important to discuss further challenges and limitations we encountered during development.

Firstly, we observed that high label noise in relation to the tolerance level $\xi$ can degrade the performance of the strategy. In such cases, the underlying limit state may not be accurately identifiable, as shown by our ablation study in Appendix~\ref{appendix:noise}. Therefore, if small $\xi$ has to be achieved, it is important to keep the noise correspondingly low.

Secondly, misspecification of the \ac{gp} model is important to consider and could occur, e.g., if many hyperparameters have to be optimized. To mitigate possible misspecification, we train the \ac{gp} model multiple times with different parameter initializations (see implementation details in Appendix~\ref{appendix:impl}). Additionally, one can provide suitable prior distributions for the model hyperparameters if available. Issues could arise with discontinuous or unsmooth error surfaces, as observed during validation of tree-based models, due to the learned decision structure. In such situations, the \ac{gp} model can only provide a smooth approximation.

Finally, our method was developed in the setting of additive homoscedastic Gaussian noise with variance $\sigma\sub{e}^2$. However, real-world applications can be influenced by heteroscedastic noise, where $\sigma\sub{e}^2$ may change with $\Vector{x}$. Thus, the estimated noise would be under- or overestimated in certain input regions if assumed homoscedastic. \change{Nevertheless, homoscedasticity can be a reasonable assumption even for real-world data. This is demonstrated in Appendix~\ref{appendix:tabular_bench} by applying our method with homoscedasticity assumption to real-world examples. To further improve} the performance with heteroscedastic noise, a suitable transformation on the labels could be applied, such as the Box-Cox \citep{box1964} or Yeo-Jonhson transformation \citep{yeo2000}, which have been used in the context of \ac{bo} and \ac{gp} models \citep{cowen-rivers2022}. Alternatively, the noise can be learned directly by a second \ac{gp} model \citep{kersting2007, binois2018}. Testing such an approach is left for future work.

\section{Additional theoretical considerations}\label{appendix:theo}

\subsection{Error bounds} \label{appendix:errorbounds}
\change{In the following, we discuss the error bound using the 90\% confidence interval of the \ac{gp} model and the error bounds based on Theorem \ref{thm:errorbound}}. Therefore, a 1-d test function
$$\delta(x) = \frac{1}{2}~ \exp(x)~\sin(8x - 2)$$
and additive noise $(\sigma\sub{e}^2=0.05^2)$ is used with Algorithm~\ref{ag:almv}. Tolerance is set to $\xi = 1$, and we use 10 initial samples and 500 iterations. The two limit states are at $x_1 \approx 0.786$ and $x_2 \approx 0.92$. For this case, it is possible to compute $\eta$ given in Theorem~\ref{thm:errorbound} with exact Lipschitz constants. 

\change{The results are illustrated in Figure~\ref{fig:eta_bounds}. It can be seen, that the bound using the $90\%$ confidence interval of the GP model may underestimate the true error, but yields in most cases a good bound. In contrast, the error bound given by $\eta$ is in any case a very conservative approximation of the true error. Further, it can be seen that our adaptive sampling strategy results in a rapid decrease of the error bound $\eta$ near the limit state even for small sample sizes, whereas the model improves globally with more samples available.}
\begin{figure*}[t]
	\centering
	\includegraphics[width=\textwidth]{./eta_bounds}
	\caption{Error bound $\eta$ for $\vert\delta-\mu_{y\vert\Set{D}}\vert$ as well as true error and $90\%$ confidence interval of the GP model for 10 initial samples, 20 (top left), 50 (top right), 100 (lower left) and 500 (lower right) adaptive samples. The vertical lines show the limit states. Initial and adaptive samples are blue and orange, resp.
	}\label{fig:eta_bounds}
\end{figure*}

Figure~\ref{fig:eta_bounds2} shows the comparison of $\eta$ with exact Lipschitz constants and $\eta$ calculated using the bounds from Equation~\ref{ineq:L_mu} and Equation~\ref{ineq:L_sigma2}. The setting is the same as in the top left of Figure~\ref{fig:eta_bounds}. Apparently, the upper bounds for the Lipschitz constant yield too pessimistic results for small sample sizes.
\begin{figure*}[ht]
	\centering
	\includegraphics[width=0.5\textwidth]{./eta_bounds2}
	\caption{Error bound $\eta$ for $\vert\delta-\mu_{y\vert\Set{D}}\vert$ with exact Lipschitz constants and using the bounds given in (Equation~\ref{ineq:L_mu}) and (Equation~\ref{ineq:L_sigma2}).
	}\label{fig:eta_bounds2}
\end{figure*}

\change{Theorem~\ref{thm:errorbound} states that the true error is less than the error bound $\eta$ for all points in the input space with a probability of 90\%. Therefore, $\eta$ gives a uniform error bound that is not directly dependent on the accuracy of the predicted uncertainty of the \ac{gp}, and is therefore much stronger. However, the theorem requires additional knowledge on the Lipschitz continuity of the covariance kernel as well as $\delta$ and is therefore not generally applicable. Furthermore, Figures~\ref{fig:eta_bounds}, \ref{fig:eta_bounds2} show that the bound may be too conservative, especially when using estimated Lipschitz constants.}

\subsection{Probability of misclassification} \label{appendix:misclass} 
The probability of misclassification at $\Vector{x}$ is given by Equation~\ref{eq:cdf_ffolded} and derived in Appendix~\ref{appendix:deriv_fncdf}. For the case $\mu < -\xi$ (with $\mu = -4$ and $\xi = 2$), this probability is illustrated as a yellow area in Figure~\ref{fig:p_mis12}. This setting corresponds to the case of an invalid posterior mean. The misclassification probability is the probability that the state is actually valid. Since the valid domain is bounded, this probability can only be maximized up to some limited extent and it is bounded by 0.5 in any case. In contrast, the misclassification probability of a state that is predicted to be valid can be arbitrarily close to one and is strictly increasing with the variance for fixed $\mu$ (see right-hand side of Figure~\ref{fig:p_mis12}). For fixed $\mu$ it is even possible to compute the $\sigma$ which maximizes the probability of misclassification. This can be achieved by taking the derivative of Equation~\ref{eq:cdf_ffolded} with respect to $\sigma$ and by computing its root. The optimal standard deviation $\sigma_{\text{opt}}$ is given by 
$$\sigma_{\text{opt}}^2 = - 2~\xi~\mu~\ln\left(\frac{\mu - \xi}{\mu + \xi}\right)^{-1} \quad \text{for fixed } \mu > \vert \xi \vert.$$
Figure~\ref{fig:p_mis3} shows the dependence of the misclassification probability on both $\mu$ and $\sigma$. The optimal standard deviation increases with the distance of $\mu$ to $\xi$ and very small variance results in small values for $P_{\mathrm{mis}}$. However, as described before, $P_{\mathrm{mis}}$ decreases again if $\sigma$ becomes too large.

\begin{figure}[ht]
    \centering
    \includegraphics[width=\textwidth]{./Pmis_comb}
    \caption{Left: Yellow area equals the misclassification probability and is maximized for $\vert\mu\vert > \xi$ if green and blue area are equal (lower figure). Right: Misclassification probability for fixed $\mu$ if $|\mu| > \xi$ (upper figure) and $|\mu| \leq \xi$ (lower figure).
    }%
    \label{fig:p_mis12}%
\end{figure}

\begin{figure*}[ht]
	\centering
	\includegraphics[width=0.6\textwidth]{./Pmis3}
	\caption{Dependence of the misclassification probability on $\mu$ and $\sigma$ for $\mu < -\xi$ with $\xi = 2$.}\label{fig:p_mis3}
\end{figure*}

\section{Comparison with Conformal Prediction}\label{appendix:conformal_prediction}
While our primary objective of this work is designing an adaptive sampling strategy for learning the limit state $\Set{S}$ (Section~\ref{sec:method}), an important question arises how our approach compares with existing conformal prediction methods. 
Hence, we provide a comparison with the most popular conformal prediction strategies, which we can utilize to predict the valid set $\tilde{\Set{V}}_\alpha$ for the model under validation $f\sub{M}$.

\paragraph{Prediction Interval and Valid Set.}
Conformal prediction methods are used to derive prediction intervals $\hat{C}_\alpha(\Vector{x})$ that contain an unseen observation $\Rv{Y}^\star$ at test point $\Rv{X}^\star$ with confidence $1-\alpha$ (often referred as coverage), where no further assumption is made about the data generating distribution $p(\Vector{x}, y)$. The framework provides guarantees, with the most common one being \emph{marginal coverage}, which aims to satisfy
$$P(\Rv{Y}^\star\in \hat{C}_\alpha(\Rv{X}^\star))\geq 1-\alpha.$$
Note, in order to form a prediction interval, most strategies (e.g., split conformal prediction) use additional calibration data besides the training data for $f\sub{M}$ \citep{lei2018, bellotti2020}. In our setting, we use the validation data for this purpose.

We can obtain a valid set similar to $\Set{V}_{\alpha}$ (Equation~\ref{eq:pred_valid_conf}), by using lower $\rho\sub{lo}$ and upper bound $\rho\sub{lo}$ of the prediction interval, where $\hat{C}_\alpha(\Vector{x})=[\rho\sub{lo}(\Vector{x}), \rho\sub{up}(\Vector{x})]$.
Further, we can obtain the valid set from $\hat{C}_\alpha(\Vector{x})$ as
$$\tilde{\Set{V}}_\alpha = \{\Vector{x}\in\VSpace{X}: (f\sub{M}(\Vector{x}) - \rho\sub{lo}(\Vector{x}) \leq\xi) \wedge (\rho\sub{up}(\Vector{x}) - f\sub{M}(\Vector{x}) \leq\xi)\},$$
where $\xi\in\R_{+}$ is the tolerance level. 
In other words, we classify $f\sub{M}$ to be valid at $\Vector{x}$ if we predict with $1-\alpha$ confidence that the difference between mean prediction ($f\sub{M}$) and upper/lower bound is below the prescribed tolerance $\xi$. Thus, we are able to compare conformal prediction methods to our method based on $\tilde{\Set{V}}_\alpha$.

\subsection{Comparison}
We compare our approach (Section~\ref{sec:method}) with the following methods from conformal prediction: 
\begin{itemize}
  \item Split-conformal prediction \citep{papadopoulos2002}, with the commonly-used residual score $s(\Vector{x}, y)=\vert y-f\sub{M}(\Vector{x}) \vert$.
  \item \Ac{mad} \citep{lei2018, bellotti2020}, with the residual score $s(\Vector{x}, y)=\vert y-f\sub{M}(\Vector{x})\vert/u(\Vector{x}) $, where $u(x)$ is the residual predictor for $\vert y-f\sub{M}(\Vector{x})\vert$.
  \item \Ac{lvd} \citep{lin2021}, with squared exponential kernel.
\end{itemize}
A qualitative comparison between methods is given in Table~\ref{tab:comp_conformal}.
\begin{table}[!t]
    \centering
    \caption{Qualitative comparison between strategies to perform validation. Split conformal and \ac{mad} have only marginally coverage, while \ac{lvd} provides approximately conditional coverage. Similarly, our method can be seen to have marginal coverage if the error bound from Theorem~\ref{thm:errorbound} is used. In our experiments, conformal methods have shown the need for larger datasets in order to provide meaningful prediction intervals for validation, in comparison to the proposed approach (Section~\ref{sec:method}).}
    \begin{tabular}{l|cccc}
        \toprule
        \bfseries  & \bfseries Split Conformal & \bfseries MAD & \bfseries LVD & \bfseries Ours\\
        \midrule
        Coverage & marginal & marginal & apprx. conditional & marginal\footnotemark\\
        Discriminative & \xmark & \checkmark & \checkmark & \checkmark\\
        Applicable $f\sub{M}$ & regression, class. & regression & regression & regression\\
        Samples needed & medium & high & high & small\\
        \bottomrule
        \multicolumn{5}{l}{\footnotesize{$^1$based on Theorem~\ref{thm:errorbound}}}
    \end{tabular}
    \label{tab:comp_conformal}
\end{table}

\paragraph{Simple Example.} 
To illustrate differences in the obtained valid sets, we train a \ac{gp} model $f\sub{M}$ to be validated on $10$ samples from $f(x)=x\sin(x) + \epsilon$, with $\epsilon\sim \mathcal{N}(0, 0.5^2)$ (similar to Figure~\ref{fig:local_valid}). $x$ is drawn uniformly between $0$ and $10$, and tolerance $\xi$ is set to $3$.
To obtain the valid set for the conformal methods, a total of $100$ samples are drawn uniformly. For our method, we use $10$ initial samples and $4$ samples drawn with $\psi(\omega=0.1\xi)$. A confidence level of $1-\alpha=0.9$ is used for all methods, while predictions are given by $\tilde{\Set{V}}_{0.1}$.
Results are shown in Figure~\ref{fig:conf_1d}. We see that the prediction interval with split conformal has a fixed width for all $\Vector{x}$, since the method is not discriminative. Therefore, we see the method is not capable of identifying valid regions in this setting. 
\ac{lvd} is discriminative and therefore able to correctly identify some parts of the valid region. However, with few data, the prediction intervals of \ac{lvd} can become infinitely wide, as noted by the authors \citep[Section~3.2]{lin2021}. 
For \ac{mad}, only a small region is correctly classified as valid. In contrast, our method is able to identify valid regions with high accuracy. Further, we found that all tested conformal strategies need considerably more samples than our method in order to provide meaningful prediction intervals.
\begin{figure*}[t]
  \centering
  \includegraphics[width=\textwidth]{./compare_conformal.pdf}
  \caption{Comparison of different validation strategies, where we used $100$ validation samples for the conformal strategies (Split conformal, \ac{lvd} and \ac{mad}), and $14$ samples for our method. The target is to obtain an accurate estimate of the valid regions, i.e., $f\sub{M}$ is inside the tolerance $\xi$. The prediction of the local valid set $\tilde{\Set{V}}_{0.1}$ (\SRect[0.3]{bgreen}) is shown for each method.}\label{fig:conf_1d}
\end{figure*}
\paragraph{8-dimensional Comparison.} We compare methods based on the 8-dimensional benchmark from Section~\ref{sec:ml_bench}, where we used \ac{rr}, \ac{svr}, \ac{rf}, and \ac{xgb} as $f\sub{M}$ to be validated. For comparison, we use the final result obtained from our method, with $\psi\sub{mis}$ and $\omega=0.2\xi$. Further, we draw uniformly the same number of samples as used by our adaptive approach (480 samples for 8-d) to calibrate the prediction intervals.
We show the average across 10 runs for the conformal prediction strategies and 30 runs with our method (results taken from Table~\ref{tab:ml_bench_c90}) for $F_1$-score, precision, and recall. In Table~\ref{tab:comp_conformal_8d}, it can be seen that our method outperforms the conformal strategies by a large margin across all metrics. Across the conformal methods, \ac{mad} provided the best performance. We found that \ac{lvd} had issues with the small dataset, leading to infinitely wide prediction intervals in most cases, as described previously.
\begin{table}[!t]
    \centering
    \caption{Comparison of different validation strategies for the 8-dimensional \ac{ml} benchmark (Section~\ref{sec:experiments}) with \ac{rr}, \ac{svr}, \ac{rf}, and \ac{xgb} models. Mean and standard error are shown across 10 runs for the conformal prediction strategies (Split Conf., \ac{mad} and \ac{lvd}) and 30 runs for our strategy (Section~\ref{sec:method}). Predictions are made with $\tilde{\Set{V}}_{0.1}$. Scores for the final sample are reported for our method. Bold numbers represent the best result.}
    \begin{tabular}{ll|cccc}
    \toprule
     \textbf{Benchmark} & \textbf{Metric} & \multicolumn{4}{|c}{\textbf{Method}} \\
      & & \textbf{Split Conf.} & \textbf{MAD} & \textbf{LVD} & \textbf{Ours} \\
    \midrule
    \multirow[c]{3}{*}{Michalewicz (8-d)} & $F_1$-score [\%] & $24.9\scriptstyle\textcolor{gray}{\pm6.9}$ & \change{$44.4\scriptstyle\textcolor{gray}{\pm4.1}$} & $0.0\scriptstyle\textcolor{gray}{\pm0.0}$ & $\mathbf{74.5}\scriptstyle\textcolor{gray}{\pm1.8}$ \\
 & Precision [\%] & $24.9\scriptstyle\textcolor{gray}{\pm6.9}$ & $90.2\scriptstyle\textcolor{gray}{\pm0.9}$ & $0.0\scriptstyle\textcolor{gray}{\pm0.0}$ & $\mathbf{94.0}\scriptstyle\textcolor{gray}{\pm0.4}$ \\
 & Recall [\%] & $25.0\scriptstyle\textcolor{gray}{\pm6.9}$ & \change{$33.4\scriptstyle\textcolor{gray}{\pm4.3}$} & $0.0\scriptstyle\textcolor{gray}{\pm0.0}$ & $\mathbf{64.8}\scriptstyle\textcolor{gray}{\pm2.1}$ \\
 \hline
 \multirow[c]{3}{*}{Rosenbrock (8-d)} & $F_1$-score [\%] & $24.8\scriptstyle\textcolor{gray}{\pm6.9}$ & \change{$52.4\scriptstyle\textcolor{gray}{\pm4.8}$} & $0.0\scriptstyle\textcolor{gray}{\pm0.0}$ & $\mathbf{64.9}\scriptstyle\textcolor{gray}{\pm2.4}$ \\
 & Precision [\%] & $24.6\scriptstyle\textcolor{gray}{\pm6.8}$ & \change{$93.0\scriptstyle\textcolor{gray}{\pm0.6}$} & $0.0\scriptstyle\textcolor{gray}{\pm0.0}$ & $\mathbf{98.0}\scriptstyle\textcolor{gray}{\pm0.2}$ \\
 & Recall [\%] & $25.0\scriptstyle\textcolor{gray}{\pm6.9}$ & \change{$43.2\scriptstyle\textcolor{gray}{\pm5.5}$} & $0.0\scriptstyle\textcolor{gray}{\pm0.0}$ & $\mathbf{53.3}\scriptstyle\textcolor{gray}{\pm2.5}$ \\
    \bottomrule
    \end{tabular}
    \label{tab:comp_conformal_8d}
\end{table}
\subsection{Conclusion} 
Methods for conformal prediction were derived to provide prediction intervals without further assumption of the underlying distribution. In our setting, we found that the resulting prediction intervals were overly conservative when used for validation, especially with limited data.
Furthermore, \ac{lvd} provided infinitely wide prediction intervals if not enough samples are available, which makes the method difficult to use if data is scarce.
In contrast, we have seen that our proposed method can provide accurate valid sets without the need for excessive amounts of samples. The usefulness of conformal prediction strategies is their wide applicability, where they can be used in the more general setting with heteroscedastic noise, i.e., $\sigma\sub{e}^2$ may change with $\Vector{x}$.
Future research may extend our proposed approach, since \ac{gp} models are well capable of handling heteroscedastic noise, as shown with other active learning strategies \citep{binois2018, binois2019}.

\section{Additional Results}\label{appendix:additional_results}
Here, we present additional experimental results complementing the benchmarks shown in Section~\ref{sec:experiments}.

\subsection{Tabular Dataset}\label{appendix:tabular_bench}
\change{We show the practical application of our method by extending experiments from Section~\ref{sec:ml_bench} with two real-world tabular datasets (6-d: ID-4835, 9-d: ID-361083) from \texttt{OpenML} \citep{OpenML2013}, containing numerical features.} In contrast to previous experiments, the available data is restricted, i.e., we cannot place adaptive samples at arbitrary $\Vector{x}\in\tilde{\VSpace{X}}$ (previously we could query $f\sub{E}$ which should be observable in practice). Hence, the available data is randomly separated into train $\Set{D}\sub{train}$ (6-d: $621$ samples, 9-d: $349100$ samples), validation $\Set{D}\sub{val}$ (6-d: $1234$ samples, 9-d: $93093$ samples), and test sets $\Set{D}\sub{test}$ (6-d: $1234$ samples, 9-d: $139641$ samples). Furthermore, if the active learning strategy proposes to query at a position $\Vector{x}^\star$ we pick the closest ($L^2$ distance) available sample $\Vector{x}'\in\Set{D}\sub{val}$ in the validation set and return the corresponding observation $y'\in\Set{D}\sub{val}$. We compare $\psi\sub{mis, 0.2}$, $\psi\sub{U}$, and the random sampling baseline.

\paragraph{Results 6-dimensional dataset.} \change{Two models have to be validated: a \ac{gp} model (Matérn5/2 kernel) and a \ac{svr} model with default hyperparameters. The models were trained with $\Set{D}\sub{train}$, which results in a valid ratio of 0.64 for the \ac{gp} and 0.65 for the \ac{svr} model, with $\xi=0.1$. Figure~\ref{fig:tabular_bench_4835} shows median and 95\% confidence interval for the $F1$-score across $20$ runs. The dashed lines represent a reference \ac{gp} model trained with the complete validation data $\Set{D}\sub{val}$.}
\begin{figure*}[t]
  \centering
  \includegraphics[width=\textwidth]{./real_4835_f1_score.pdf}
  \caption{Median and $95$\% confidence intervals of $F1$-score on the 6-dimensional tabular dataset for \ac{gp} and \ac{svr} model across 20 runs. Dashed line represents reference model with all available samples from the tabular dataset.}\label{fig:tabular_bench_4835}
\end{figure*}

\paragraph{Results 9-dimensional dataset.} The model to be validated is a \ac{xgb} model trained on $\Set{D}\sub{train}$ with default hyperparameters, with test $R^2\approx0.4$ and tolerance $\xi=0.4$, \change{which gives a valid ratio of 0.78. Due to the computational burden, a \ac{vgp} model \citep{hensman2013, titsias2009} is used instead of the \ac{gp} (Section~\ref{sec:gp}). See Appendix~\ref{appendix:impl} for the \ac{vgp} implementation details.}

Figure~\ref{fig:tabular_bench} shows median and 95\% confidence interval for the $F1$-score across $30$ runs. Further, we show a reference \ac{vgp} model (dashed lines) with the complete validation data $\Set{D}\sub{val}$ (11636 samples). It can be seen that although random sampling improves the score faster in the beginning, only $\psi\sub{mis}$ achieves a score close to the reference solution.

\begin{figure*}[t]
  \centering
  \includegraphics[width=\textwidth]{./real_361083_f1_score.pdf}
  \caption{Median and $95$\% confidence intervals of $F1$-score on the 9-dimensional tabular dataset for \ac{xgb} model and across 30 runs. Dashed line represents reference model with all available samples from the tabular dataset.}\label{fig:tabular_bench}
\end{figure*}

\subsection{Influence of Noise on Model Quality}\label{appendix:noise}
Noise $\sigma^2\sub{e}$ can have a non-neglectable impact on the achievable model accuracy and can lead, if high enough, to identifiability issues of the underlying limit state. Figure~\ref{fig:noise} shows the influence of varying noise on the $F_1$-score and the predicted misclassification probability $\tilde{P}\sub{mis}$ for the 2-dimensional Styblinsky-Tang function. In this experiment, we varied the signal-to-noise ratio $\text{N/S}$ between $0.1\%$ and $50\%$. The ratio is calculated as $\text{N/S}=\sigma^2_{\mathrm{e}}/\E[\RV{X}\sim p(\Vector{x})]{f(\mathbf{X})}^2$, where $f(\cdot)$ is the Styblinsky-Tang function, and $p(\Vector{x})$ is taken to be uniform. The expectation is numerically evaluated. Furthermore, we provide the signal to tolerance ratio ($\text{N/T}=\sigma_{\mathrm{e}}/\xi$).
It can be observed, that in the most extreme case ($\text{N/S}=50\%$), the final $F_1$-score has decreased on average from $0.95$ to $0.58$.
Note, that this performance decrease is captured by $\tilde{P}\sub{mis}$. 
\begin{figure*}[t]
  \centering
  \includegraphics[width=\textwidth]{./ablation_noise.pdf}
  \caption{Varying noise to signal ($\text{N/S}=\sigma^2_{\mathrm{e}}/\E[\RV{X}\sim p(\Vector{x})]{f(\mathbf{X})}^2$) or noise to tolerance ($\text{N/T}=\sigma_{\mathrm{e}}/\xi$) ratios across columns. Median and $95$\% confidence intervals for the 2-dimensional Styblinsky-Tang function (valid to invalid ratio: $0.77$) after 20 initial observations and across 25 runs are shown.}\label{fig:noise}
\end{figure*}

\change{\subsection{Influence of Tolerance Level}
The chosen tolerance level $\xi$ can influence the efficacy of the adaptive strategy and the learned error model $\hat{f}\sub{D}$. A stricter tolerance will most likely result in more invalid regions, making the problem more challenging to model, except in cases where everything is invalid, which makes the problem much easier. Note that this is a property inherent to the problem.}
\change{Figure~\ref{fig:tolerance} shows the results for the 4-d Styblinsky-Tang function with different tolerance levels, selected to obtain four valid ratios (20\%, 40\%, 60\%, and 80\%) for $f\sub{M}$. Other settings were kept the same as in Section~\ref{sec:experiments}.}
\begin{figure*}[t]
  \centering
  \includegraphics[width=\textwidth]{./ablation_tolerance.pdf}
  \caption{\change{Varying tolerance levels $\xi$ for the 4-d Styblinsky-Tang function, selected to obtain four valid ratios. Median and $95$\% confidence intervals are shown.}}\label{fig:tolerance}
\end{figure*}

\subsection{Stopping Criterion}\label{appendix:stop}
\change{We show the results for the misclassification probability $\tilde{P}\sub{mis}$, proposed in Section~\ref{sec:stop} as a stopping criterion, which we tracked during experiments in Section~\ref{sec:ml_bench}. From this,} we evaluate the difference between the misclassification rate calculated from the test data and our stopping criterion $\tilde{P}\sub{mis}$ based on the trained \ac{gp} models. Results are shown averaged across 2, 4, and 8 dimensions in Figure~\ref{fig:stop}. It can be seen that for both \acp{aqf}, $\tilde{P}\sub{mis}$ tends to be slightly conservative. Similar observations were made for the original U-function, see \citep{wang2019b}. Nevertheless, the stopping criterion provides to be useful in combination with a maximum sample budget, as it can lead to early stopping if a sufficient number of samples are obtained.
\begin{figure*}[t]
  \centering
  \includegraphics[width=\textwidth]{./ablation_stopping.pdf}
  \caption{Difference between true misclassification rate $(M)$ and $\tilde{P}\sub{mis}$ across \ac{rr}, \ac{svr}, \ac{rf}, and \ac{xgb}. Median and $95$\% confidence intervals are shown.}\label{fig:stop}
\end{figure*}

\subsection{Lower Risk Aversity}\label{appendix:risk_averse}
Complementary results for the benchmark in Section~\ref{sec:ml_bench} using $\tilde{\Set{V}}_{0.1}$ (Equation~\ref{eq:pred_valid_conf}) are given in Table~\ref{tab:ml_bench_c90}. We observe that $\psi\sub{mis}$ shows overall the highest precision score on average and final score, outperforming $\psi\sub{U}$ and the random baseline. Due to the more conservative predictions, there is a drop in the recall score for all methods. Differences between the random sampling baseline and the \acp{aqf} can be explained by the limited exploration of the \acp{aqf} in comparison.
\begin{table*}[t]
    \centering
    \caption{Mean and standard error for precision and recall across 30 runs for \ac{rr}, \ac{svr}, \ac{rf}, and \ac{xgb}. Predictions are made with $\tilde{\Set{V}}_{0.1}$. Scores for mean and maximum across samples are reported. Bold numbers represent the best result.}\label{tab:ml_bench_c90}
\begin{tabular}{lc|ccc|ccc}
\toprule
\textbf{Benchmark} & \textbf{Dimension} & \multicolumn{3}{c}{\textbf{Mean Precision}} & \multicolumn{3}{|c}{\textbf{Final Precision}} \\
 &  & $\psi\sub{mis, 0.2}$ & $\psi\sub{U}$ & Random & $\psi\sub{mis, 0.2}$ & $\psi\sub{U}$ & Random \\
\midrule
\multirow[c]{3}{*}{Michalewicz} & 2 & $\mathbf{98.5}\scriptstyle\textcolor{gray}{\pm0.1}$ & $98.0\scriptstyle\textcolor{gray}{\pm0.2}$ & $98.2\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{99.0}\scriptstyle\textcolor{gray}{\pm0.2}$ & $98.4\scriptstyle\textcolor{gray}{\pm0.2}$ & $98.8\scriptstyle\textcolor{gray}{\pm0.1}$ \\
 & 4 & $\mathbf{98.0}\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.8\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.6\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{99.1}\scriptstyle\textcolor{gray}{\pm0.1}$ & $99.0\scriptstyle\textcolor{gray}{\pm0.1}$ & $98.9\scriptstyle\textcolor{gray}{\pm0.1}$ \\
 & 8 & $\mathbf{91.9}\scriptstyle\textcolor{gray}{\pm0.1}$ & $91.7\scriptstyle\textcolor{gray}{\pm0.2}$ & $91.6\scriptstyle\textcolor{gray}{\pm0.2}$ & $\mathbf{94.0}\scriptstyle\textcolor{gray}{\pm0.2}$ & $93.9\scriptstyle\textcolor{gray}{\pm0.3}$ & $93.9\scriptstyle\textcolor{gray}{\pm0.3}$ \\
\hline
\multirow[c]{3}{*}{Rosenbrock} & 2 & $\mathbf{98.5}\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.8\scriptstyle\textcolor{gray}{\pm0.2}$ & $98.0\scriptstyle\textcolor{gray}{\pm0.2}$ & $\mathbf{98.9}\scriptstyle\textcolor{gray}{\pm0.1}$ & $98.3\scriptstyle\textcolor{gray}{\pm0.3}$ & $98.8\scriptstyle\textcolor{gray}{\pm0.1}$ \\
 & 4 & $\mathbf{98.1}\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.8\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.7\scriptstyle\textcolor{gray}{\pm0.1}$ & $98.1\scriptstyle\textcolor{gray}{\pm0.2}$ & $98.2\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{98.3}\scriptstyle\textcolor{gray}{\pm0.1}$ \\
 & 8 & $\mathbf{97.2}\scriptstyle\textcolor{gray}{\pm0.1}$ & $96.5\scriptstyle\textcolor{gray}{\pm0.1}$ & $95.7\scriptstyle\textcolor{gray}{\pm0.1}$ & $\mathbf{98.0}\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.6\scriptstyle\textcolor{gray}{\pm0.1}$ & $97.2\scriptstyle\textcolor{gray}{\pm0.1}$ \\
\hline
  &  & \multicolumn{3}{c}{\textbf{Mean Recall}} & \multicolumn{3}{|c}{\textbf{Final Recall}} \\\hline
 \multirow[c]{3}{*}{Michalewicz} & 2 & $69.3\scriptstyle\textcolor{gray}{\pm1.8}$ & $72.1\scriptstyle\textcolor{gray}{\pm2.1}$ & $\mathbf{88.8}\scriptstyle\textcolor{gray}{\pm0.9}$ & $75.4\scriptstyle\textcolor{gray}{\pm1.0}$ & $75.5\scriptstyle\textcolor{gray}{\pm2.7}$ & $\mathbf{93.1}\scriptstyle\textcolor{gray}{\pm1.1}$ \\
 & 4 & $73.2\scriptstyle\textcolor{gray}{\pm0.7}$ & $72.2\scriptstyle\textcolor{gray}{\pm0.7}$ & $\mathbf{77.7}\scriptstyle\textcolor{gray}{\pm0.6}$ & $82.6\scriptstyle\textcolor{gray}{\pm1.7}$ & $81.1\scriptstyle\textcolor{gray}{\pm2.1}$ & $\mathbf{85.4}\scriptstyle\textcolor{gray}{\pm1.1}$ \\
 & 8 & $\mathbf{51.3}\scriptstyle\textcolor{gray}{\pm1.3}$ & $42.0\scriptstyle\textcolor{gray}{\pm1.1}$ & $50.3\scriptstyle\textcolor{gray}{\pm0.9}$ & $\mathbf{64.8}\scriptstyle\textcolor{gray}{\pm2.1}$ & $52.4\scriptstyle\textcolor{gray}{\pm2.4}$ & $63.3\scriptstyle\textcolor{gray}{\pm1.5}$ \\
\hline
\multirow[c]{3}{*}{Rosenbrock} & 2 & $66.1\scriptstyle\textcolor{gray}{\pm1.4}$ & $67.2\scriptstyle\textcolor{gray}{\pm2.1}$ & $\mathbf{88.0}\scriptstyle\textcolor{gray}{\pm1.2}$ & $68.1\scriptstyle\textcolor{gray}{\pm1.6}$ & $66.1\scriptstyle\textcolor{gray}{\pm2.9}$ & $\mathbf{93.2}\scriptstyle\textcolor{gray}{\pm0.5}$ \\
 & 4 & $44.4\scriptstyle\textcolor{gray}{\pm1.0}$ & $49.7\scriptstyle\textcolor{gray}{\pm1.0}$ & $\mathbf{65.1}\scriptstyle\textcolor{gray}{\pm1.3}$ & $55.4\scriptstyle\textcolor{gray}{\pm2.7}$ & $57.1\scriptstyle\textcolor{gray}{\pm3.4}$ & $\mathbf{75.4}\scriptstyle\textcolor{gray}{\pm2.2}$ \\
 & 8 & $38.1\scriptstyle\textcolor{gray}{\pm0.7}$ & $48.8\scriptstyle\textcolor{gray}{\pm0.9}$ & $\mathbf{63.5}\scriptstyle\textcolor{gray}{\pm0.8}$ & $53.3\scriptstyle\textcolor{gray}{\pm2.3}$ & $61.5\scriptstyle\textcolor{gray}{\pm2.8}$ & $\mathbf{73.7}\scriptstyle\textcolor{gray}{\pm1.6}$ \\
\bottomrule
\end{tabular}
\end{table*}

\section{Implementation Details}\label{appendix:impl}
We now provide further implementation details for our method.

\paragraph{Sampling Strategy.} For generating initial and candidate samples, we used a quasi-Monte Carlo sampling strategy based on \ac{lhs}. The method is implemented in \citep{bogoclu2021}, where samples are drawn from $\VSpace{X}$ without correlation and by maximizing pairwise distance. As an alternative, one could use \texttt{SciPy}'s \citep{virtanen2020} quasi-Monte Carlo implementations (e.g., \ac{lhs} or Sobol sampling). We draw a new set of candidates in each iteration of the adaptive method.

\paragraph{Gaussian Process Model.}
The \ac{gp} model used throughout this work is implemented with \texttt{GPyTorch} \citep{gardner2018} and on top of \texttt{BoTorch} \citep{balandat2020}. Furthermore, we transformed the inputs of the model to the unit cube. Output of the model was normalized to be zero mean and unit variance during training, and reversed for prediction. The \ac{gp} uses a sum of five kernels: squared exponential, Matèrn 1/2, Matèrn 3/2, Matèrn 5/2, and rational quadratic, where we placed half-Cauchy priors ($\sigma=2$) on the lengthscales. The reason for using this combination is: \textbf{1)} the five kernels provide significant flexibility. \textbf{2)} The heavy tales of the half-Cauchy prior can disable unused dimensions according to the principle of \emph{automatic relevance determination} \citep{mackay1994}. \textbf{3)} Our preliminary tests showed that using this combination provides improved performance, e.g., in contrast to using the Matèrn 3/2, although being more difficult to train. Otherwise, we used defaults specified in the \texttt{BoTorch} implementation of the \texttt{SingleTaskGP} class. 
We fit the \ac{gp} model using \texttt{SciPy}'s implementation of the L-BFGS-B algorithm \citep{byrd1995} with 5 random restarts to maximize the log-marginal likelihood. \change{If the model is not trained, new observations are incorporated by building a new \ac{gp} model with updated data and the same hyperparameters as in the previous iteration (strategies for such are implemented in \texttt{GPyTorch}).}


\paragraph{Variational Gaussian Process Model.} For the \ac{vgp} model we use the implementation from \texttt{BoTorch} with 500 inducing points. 
We apply the same input and output transformations as with the \ac{gp} implementation. As kernel, we found using the same five kernels as with \ac{gp} provides to much overhead with \ac{vgp}. Therefore, we restricted the usage to the Matèrn 1/2 kernel.
The model is fitted with 3 random restarts by maximizing the evidence lower bound via Adam \citep{kingma2015} (learning rate $0.1$) combined with early stopping (patience 30) and the cosine annealing learning rate scheduler \citep{loshchilov2017}.

\section{Experimental Details}
Here, we present further details on the implementation of the conducted experiments.

\subsection{Details Benchmark Section~\ref{sec:ml_bench}}\label{sec:details_ml_bench}
For the benchmark, we trained various \ac{ml} models ($f\sub{M}$) to be validated afterwards. We used \ac{rf}, \ac{svr}, and \ac{rr} as implemented in \texttt{Scikit-learn} \citep{pedregosa2011}. \ac{xgb} is implemented based on \citep{chen2016}. Inputs and outputs were normalized to have zero sample mean and unit standard deviation for all models. Number of training samples and resulting test $R^2$ are given in Table~\ref{tab:ml_models_test}.

\paragraph{Hyperparameters.} For the Rosebrock benchmark, we used the specified default parameters to a large extend. Only exceptions are: \textbf{1)} \ac{rr} with polynomial features of max. 3th degree with L2 regularization of $0.3$ (alpha). \textbf{2)} \ac{rf} with $200$ trees (num. estimators) with maximum depth of $25$. \textbf{3)} \ac{xgb} with max. depth of $3$.
For the Michalewicz function, we found the need to tune the hyperparameters. Therefore, we used \ac{bo} implemented in \texttt{Scikit-optimize} \citep{tim2021} based on the cross-validated mean absolute error objective. The hyperparameter searchspace is given in Table~\ref{tab:bo_searchspace}.

\begin{table}[t]
    \centering
    \caption{\ac{bo} search space for hyperparameter optimization.}
    \resizebox{\columnwidth}{!}{
    \begin{tabular}{cc|cc|cc|cc}
        \toprule
        \multicolumn{2}{c|}{\textbf{\ac{rr}}}  & \multicolumn{2}{c|}{\textbf{\ac{svr}}} & \multicolumn{2}{c|}{\textbf{\ac{rf}}} & \multicolumn{2}{c}{\textbf{\ac{xgb}}}\\
        Parameter & Distribution & Parameter & Distribution & Parameter & Distribution & Parameter & Distribution\\
        \midrule
        \textbf{Poly degree} & Unif. Int. $[2, 10]$ & \textbf{C} & Unif. $[10^{-4}, 100]$ & \textbf{Num. Estimators} & Unif. Int $[10, 300]$ &	\textbf{Max. depth}	&	Log Unif. Int $[2, 10]$ \\
        \textbf{Alpha} &	Log Unif. $[10^{-6}, 100]$ &	\textbf{Epsilon} & Log Unif. $[10^{-5}, 100]$ & \textbf{Max. depth} &	Log Unif. Int $[2, 20]$&	\textbf{Gamma}&	Log Unif. $[10^{-5}, 100]$\\
		&& \textbf{Gamma}&	Log Unif. $[10^{-5}, 1000]$& 	&& \textbf{Eta}	& Log Unif. $[10^{-5}, 0.99]$ \\
		&&&&&& \textbf{Lambda}	&Log Unif. $[10^{-5}, 1]$
        \\
        \bottomrule
    \end{tabular}}
    \label{tab:bo_searchspace}
\end{table}

\begin{table}[t]
    \centering
    \caption{Number of training samples together with resulting $R^2$ scores of the trained \ac{ml} models for the benchmark in Section~\ref{sec:ml_bench}.}
    \begin{tabular}{ccc|cccc}
        \toprule
         &&& \multicolumn{4}{|c}{Test $R^2$}\\
        Benchmark & Dim. & $n\sub{train}$ & \ac{rr} & \ac{svr} & \ac{rf} & \ac{xgb} \\
        \midrule
        \multirow[c]{3}{*}{Michalewicz} & 2 & 200      & 0.69 & 0.89 & 0.8  & 0.98 \\
      & 4 & 600      & 0.21 & 0.3  & 0.63 & 0.96 \\
      & 8 & 1000     & 0.11 & 0.1  & 0.31 & 0.82 \\
    \multirow[c]{3}{*}{Rosenbrock} & 2 & 100      & 0.96 & 0.87 & 0.91 & 0.92 \\
      & 4 & 200      & 0.96 & 0.83 & 0.74 & 0.84 \\
      & 8 & 500      & 0.95 & 0.83 & 0.68 & 0.84 \\
        \bottomrule
    \end{tabular}
    \label{tab:ml_models_test}
\end{table}

\newpage
\subsection{Benchmark Functions}\label{appendix:bench}
This section gives an overview of the analytical functions used throughout this work.
\paragraph{Series System with four branches.} \citep{waarts2000}
\begin{equation*}
    f(x_1, x_2) = \min \begin{cases} 3+0.1(x_1-x_2)^2-\frac{x_1+x_2}{\sqrt{2}}\\
    3+0.1(x_1-x_2)^2+\frac{x_1+x_2}{\sqrt{2}}\\
    (x_1-x_2)+\frac{7}{\sqrt{2}}\\
    (x_2-x_1)+\frac{7}{\sqrt{2}}
    \end{cases}
\end{equation*}
where we used $x_i\in\left[-8, 8\right]$.
\paragraph{Modified Rastrigin Function.} \citep{torn1989}
\begin{equation*}
    f(x_1, x_2) = 10 + \sum^2_{i=1}\left(x_i^2-5\cos{2\pi x_i}\right)
\end{equation*}
where we used $x_i\in\left[-5, 5\right]$.
\paragraph{Styblinski-Tang Function.} \citep{styblinski1990} 
\begin{equation*}
    f(\Vector{x}) = 0.5 \sum^d_{i=1} \left(x_i^4-16x_i^2+5x_i\right)
\end{equation*}
where we used $x_i\in\left[-5, 5\right]$.
\paragraph{Michalewicz Function.} \citep{michalewicz1992}
\begin{equation*}
    f(\Vector{x}) = -\sum^d_{i=1}\sin\left(x_i\right)\sin^{20}\left(\frac{ix_i^2}{\pi}\right)
\end{equation*}
where we used $x_i\in\left[0, \pi\right]$.
\paragraph{Rosenbrock Function.} \citep{rosenbrock1960}
\begin{equation*}
    f(\Vector{x}) = \sum^{d-1}_{i=1} 100\left(x_{i+1}-x_i\right)^2+\left(x_i-1\right)^2
\end{equation*}
where we used $x_i\in\left[-2, 2\right]$.
\end{document}
