\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{comment}
\usepackage{xcolor}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{sidecap}
\usepackage[most]{tcolorbox}
\usepackage{xspace}

%% tcolorbox

\definecolor{C4}{HTML}{66a61e}

\tcbset {
  base/.style={
    arc=0mm, 
    bottomtitle=0.5mm,
    boxrule=0mm,
    colbacktitle=black!10!white, 
    coltitle=black, 
    fonttitle=\bfseries, 
    left=2.5mm,
    leftrule=1mm,
    right=3.5mm,
    title={#1},
    toptitle=0.75mm, 
  }
}

% \newtcolorbox{greenbox}[1]{
%   colframe=C4,
%   colback=C4!10!white, 
%   base={#1},
%   breakable
% }

% \newtcolorbox{greybox}[1]{
%   colframe=black!30!white,
%   base={#1},
%   breakable
% }



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% THEOREMS
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{remark}[theorem]{Remark}

\newcommand{\norm}[1]{\left\lVert#1\right\rVert}
\DeclareMathOperator*{\argmin}{arg\,min}
\DeclareMathOperator*{\argmax}{arg\,max}
\newcommand{\x}{\mathbf{x}}
\newcommand{\y}{\mathbf{y}}
\newcommand{\f}{\mathbf{f}}
\newcommand{\K}{\mathbf{K}}
\newcommand{\bv}{\mathbf{v}}
\newcommand{\vj}{\mathbf{v}^{(J)}}
\newcommand{\bV}{\mathbf{V}}
\newcommand{\bVa}{\bV_{\text{acqf}}}
\newcommand{\z}{\mathbf{z}}
\newcommand{\zj}{\mathbf{z}^{(J)}}
\newcommand{\Dj}{\mathcal{D}^{(J)}}
\newcommand{\X}{\mathcal{X}}
\newcommand{\V}{\mathcal{V}}
\newcommand{\N}{\text{N}}
\newcommand{\Z}{\mathcal{Z}}
\newcommand{\tth}{\text{th}}
\newcommand{\cost}{\text{cost}}
\newcommand{\kl}{\text{FC}}
\newcommand{\klt}{\tilde{\kl}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\D}{\mathcal{D}}
\newcommand{\De}{\mathcal{D}_{\text{eval}}}
\newcommand{\bmu}{\boldsymbol{\mu}}
\newcommand{\bsigma}{\boldsymbol{\Sigma}}
\newcommand{\method}{\texttt{SADCBO}\xspace}

\definecolor{orangepython}{HTML}{ffa500} 
\definecolor{darkbluepython}{HTML}{00008b} 
\definecolor{purplepython}{HTML}{800080} 
\definecolor{crimsonpython}{HTML}{dc143c} 
\definecolor{C1python}{HTML}{ff7f0e} 
\definecolor{magentapython}{HTML}{ff00ff} 
\definecolor{pinkpython}{HTML}{ffc0cb} 
\definecolor{greenpython}{HTML}{008000} 
\definecolor{darkcyanpython}{HTML}{008b8b} 

\renewcommand\labelenumi{(\roman{enumi})}
\renewcommand\theenumi\labelenumi

\newcommand{\ab}[1]{\textcolor{purple}{[AB: #1]}}
\newcommand{\sjs}[1]{\textcolor{blue}{[SS: #1]}}

% Todonotes is useful during development; simply uncomment the next line
%    and comment out the line below the next line to turn off comments
%\usepackage[disable,textsize=tiny]{todonotes}
\usepackage[capitalize,noabbrev]{cleveref}
\usepackage{multirow}
\usepackage{makecell}
\usepackage{titling}


\usepackage[switch]{lineno}
\linenumbers


%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Learning Relevant Contextual Variables Within Bayesian Optimization}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<julien_martinelli@hotmail.com>?Subject=Your UAI 2024 paper}{Julien~Martinelli}\thanks{Work done while at Aalto University.}}
\author[2]{Ayush~Bharti}
\author[3]{Armi~Tiihonen}
\author[2]{S.~T.~John}
\author[4]{Louis~Filstroff}
\author[5]{Sabina~J.~Sloman}
\author[3]{Patrick~Rinke}
\author[2,5]{Samuel~Kaski}

% Add affiliations after the authors
\affil[1]{%
    Inserm Bordeaux Population Health, Vaccine Research Institute\\
    Universit\'{e} de Bordeaux\\
    Inria Bordeaux Sud-ouest, France
}
\affil[2]{%
    Department of Computer Science\\
    Aalto University\\
    Helsinki, Finland
}
\affil[3]{%
Department of Applied Physics\\
    Aalto University\\
    Helsinki, Finland
}

\affil[4]{%
Univ. Lille, CNRS, Centrale Lille, UMR 9189 CRIStAL, F-59000 Lille, France
  }
\affil[5]{%
    Department of Computer Science\\
    University of Manchester\\
    Manchester, United Kingdom
  }
  
  \begin{document}
\maketitle


\begin{abstract}
Contextual Bayesian Optimization (CBO) efficiently optimizes black-box functions with respect
to design variables, while simultaneously integrating \emph{contextual} information regarding
the environment, such as experimental conditions.
However, the relevance of contextual variables is not necessarily known beforehand.
Moreover, contextual variables can sometimes be optimized themselves at an additional cost, a setting overlooked by current CBO
algorithms.
Cost-sensitive CBO would simply include optimizable contextual variables as part of the design variables based on their cost. Instead, we adaptively select a subset of contextual variables to include in the optimization, based on the trade-off between their \emph{relevance} and the additional cost incurred by optimizing them compared to leaving them to be determined by the environment.
We learn the relevance of contextual variables by sensitivity analysis of the posterior surrogate model while
minimizing the cost of optimization by leveraging recent developments on early stopping for BO\@.
We empirically evaluate our proposed Sensitivity-Analysis-Driven Contextual BO (\texttt{SADCBO}) method against alternatives on both synthetic and real-world
experiments, together with extensive ablation studies, and demonstrate a consistent improvement
across examples.
\end{abstract}

\section{Introduction}

Bayesian optimization (BO) is a sample-efficient black-box optimization method,
typically used when the objective function is too expensive to optimize directly~\citep{garnett_bayesoptbook_2023}. Given an objective function that can be evaluated pointwise
over a set of \emph{design variables},
BO combines surrogate modeling with a pre-specified policy of evaluation over the design space
(the so-called acquisition function) to efficiently locate the global optimum of the function.
BO has been especially useful in automatic discovery of materials~\citep{zhang2020bayesian},
molecules~\citep{optiamber}, and pharmaceutical compounds~\citep{bombarelli2018lbo,korovina2020chembo}---problem domains in which evaluating the performance of a candidate depends on a costly experiment.

Despite the success of BO and its recent algorithmic advancements,
open challenges remain for its practical use.
A key implicit assumption in vanilla BO is that the objective function only depends on the design variables.
This assumption is violated in many practical scenarios, wherein various \emph{uncontrolled} environmental factors and
experimental settings, referred to as
\emph{contextual variables}~\citep{krauze_11_contextbo, bogunovic_2020_drbo, Arsenyan2023},
also affect the objective function. For instance, ambient humidity was found to influence the experiments
in robot-assisted material design~\citep{nega21context}, such that the best compound differed with
humidity conditions. Moreover, in practice, the domain experts themselves might not know \textit{a priori}
which contextual variables are relevant, and would observe their confounding effect only during the course
of the optimization process.
Therefore, it is critical to identify the contextual variables that significantly affect the objective function,
not only to achieve the highest optimization
results, but also for the practitioners to reliably reproduce experimental results.

To deal with the uncertainty related to the contextual variables,
variants of BO have been developed. In particular, \citet{krauze_11_contextbo} introduced the
Contextual Bayesian optimization (CBO) framework, which uses the uncontrollable contextual information known \emph{before} the experiment, like current environmental conditions, to enhance the surrogate model.
Alternatively, several works have proposed to alter the simple optimization objective to make it robust
in some sense, such as by taking the expectation with respect to the contextual variables~\citep{toscano2018bayesian},
or considering distributionally-robust scenarios~\citep{bogunovic2018adversarially, bogunovic_2020_drbo}. However, these works consider a different setup than the original CBO framework, as contextual information is only revealed \emph{after} the design has been sent for experiment, not before.
Besides this distinction, in some applications, contextual variables \emph{can} be controlled, and therefore set to values they may be unlikely to take during passive observation. % that was otherwise unlikely to be observed.
%\sjs{i.e., the variables can be set to values they may be unlikely to take during passive observation}
Such variables are, for instance,
synthesis conditions of material samples, including sintering temperature or the used solvents.
Certain environmental conditions like room temperature or ambient
humidity are also ``principally'' controllable during the course of
an experiment~\citep{higgins21materials, nega21context}. Nevertheless, whether their inclusion as optimization variables is relevant or not may not be straightforward to predict~\citep{abolhasani23aimaterials}. Moreover, optimizing over all the potentially relevant contextual variables %,
can improve BO performance,
but this process can be costly, thus invoking a cost-versus-efficiency trade-off.

\begin{tcolorbox}[colback=violet!13!white,colframe=violet!50!white,boxrule=0mm,arc=0mm,bottomtitle=0.5mm,left=2.5mm,leftrule=1mm,right=3.5mm,toptitle=0.75mm]
\textbf{Contributions.} In this paper, we extend the CBO framework to settings in which contextual variables (i) have a relevance that is
not known beforehand, and (ii) can be optimized, but at some cost. We propose a Sensitivity-Analysis-Driven CBO
(\texttt{SADCBO}) algorithm for the simultaneous identification and optimization of relevant contextual variables.
\texttt{SADCBO} leverages recent advances in sensitivity-analysis-driven variable
selection~\citep{sebenius_2022_featurecollapse} and early stopping criteria for
BO~\citep{ishibashi23stopping}. We emphasize that \texttt{SADCBO} combines the
\emph{contextual observational} setting, where the context information is only observed,
and the \emph{contextual optimization} setting, where contextual variables can be optimized
(similar to design variables), into a sequential algorithm. In effect, \method provides a way to navigate the following trade-off: should contextual variables be taken \emph{as is} at no cost, or should they be steered outside of their observational distribution in order to provide more information about the objective, at a cost?
We evaluate the performance of \texttt{SADCBO}, comparing against methods from the CBO
and high-dimensional BO literature, on both synthetic and real-world cases.
\end{tcolorbox}

\section{Contextual Bayesian Optimization (CBO)}\label{sec:cbo}

The CBO framework \citep{krauze_11_contextbo} deals with a black-box function
$f : \X \times \Z \rightarrow \mathbb{R}$ defined on the joint space of both the
\emph{design variables} $\X \subset \R^d$ and \emph{contextual variables}
$\Z \subset \R^c$. We assume that we get noisy evaluations of $f$, that is, we observe the output
$y = f(\x, \z) + \varepsilon$ with $\varepsilon \sim \mathcal{N}(0, \sigma^2_{\text{noise}})$.
%
A Gaussian process (GP) prior \citep{Rasmussen2006} is placed on $f$; with the notation \mbox{$\bv = [\x,\z]$},
we write \mbox{$f(\bv) \sim \mathcal{GP}(0, k(\bv,\bv'))$}. A GP is a stochastic process fully characterized
by its mean function (taken here to be zero) and its kernel
$ k(\bv,\bv') = \text{cov}[f(\bv),f(\bv')]$. This implies that for any finite-dimensional collection of inputs
$[\bv_1, \dots, \bv_t]$, the function values
$\f = [f(\bv_1),\dots,f(\bv_t)]^\top \in\mathbb{R}^t$
follow a multivariate normal distribution $\f \sim \mathcal{N}(\mathbf{0}, \K)$,
where $\K = (k(\bv_i, \bv_j))_{1\le i,j \le t}$ is the kernel matrix.
Given a dataset $\D_t = \{(\x_i, \z_i,y_i)\}_{i=1}^t = \{(\bv_i,y_i)\}_{i=1}^t$,
the posterior distribution of $f(\bv)$ given $\mathcal{D}_t$ is Gaussian,
with analytical expressions for the mean $\mu_t(\bv|\D_t)$ and variance~$\sigma_t^2(\bv|\D_t)$.

In the CBO setting, we first observe the context variables, and then choose the design variables accordingly.
More precisely, at iteration $t+1$, a context vector $\z_{t+1}$ is observed,
assumed to have been drawn from an unknown distribution $p(\z)$, and the optimal design $\x_{t+1}^{\star}$
is such that
%
\begin{equation}
\x_{t+1}^{\star} = \argmax_{\x \in \X } f(\x,\z_{t+1}).
\label{BO_task}
\end{equation}
%
Given $\z_{t+1}$ and the previous $t$ observations $\D_t$, the next candidate design
$\x_{t+1}$ is selected using the Upper Confidence Bound (UCB) acquisition~function $\alpha$~\citep{srinivas_ucb}:
%
\begin{align}
    \x_{t+1} &=\underset{\x \in \X}{\argmax}~\alpha(\x,\z_{t+1}|\D_t),\nonumber \\
    \alpha(\x,\z|\D_t) &=\mu_t(\x, \z|\D_{t}) + \beta_t^{1/2}\sigma_t(\x,\z|\D_{t}),
\label{eq:cucb}
\end{align}
%
for a sequence $(\beta_t)_{t \geq 1}$. % balancing exploitation (high values of $\mu_t$) and exploitation (high values of $\sigma_t$).
This incurs a design cost~$\lambda_\x$.

\paragraph{Extending the CBO problem setup.}
We extend the problem setting of CBO in two ways. Firstly,
we assume that only a subset of the contextual variables truly affect $f$.
Let $\z = [z^{(1)}, \dots, z^{(c)}]$ be the vector of all contextual variables.
For any set $J$ belonging to the power set of $\{1, \dots, c\}$, denote by $\z^{(J)} \in \mathbb{R}^{|J|}$
the vector of reduced dimension whose variables are indexed by $J$.
For instance, if $J=\{1,3\}$, then $\z^{(J)} = [z^{(1)}, z^{(3)}]$.
We assume that there exists a set $J^\star$, where $|J^\star| \ll c$,
such that $f(\x, \z) = f(\x, \z^{(J^\star)})~\forall( \x,\z)$.
Secondly, we include the possibility of setting the value of any of the contextual variables at some cost over and above the usual design query cost $\lambda_\x$.
This means that for all $j \in \{1,\dots,c\}$, the context variable $z^{(j)}$ can be optimized at a cost
$\lambda_j$. To be able to control each contextual variable, we must also assume their independence: $p(\z) = \prod_{j=1}^c p(z^{(j)})$. With these additional assumptions,
we aim to maximize the function $f$ in a cost-efficient manner,
while identifying the optimal set $J^\star$. This provides the user with a comprehensive summary of the relevant contextual variables
found through optimization, thus ensuring reproducibility and explainability. Unlike CBO, the ability to control contextual variables allows us to judge whether or not one should optimize contextual variables to learn more about the objective (albeit at a cost), or if the current sampled context is already informative enough. %In the end, the problem considered is:
Specifically, we aim to maximize the objective
\begin{equation}
    (\x_{t+1}^\star,\z_{t+1}^\star) = \argmax_{(\x,\z^{(J^\star)}_{t+1}) \in \mathcal{X}\times\prod_{j \in J^\star}\Z_j}~f(\x,\z_{t+1})
\label{eq:objective}
\end{equation}
where, for all $j\in J^\star$, we optimize $z_{t+1}^{(j)}$ at cost $\lambda_j$, and all other elements $j'\in\{1,\dots,c\}\setminus J^\star$ of $\z_{t+1}$ remain at their values sampled from the environment ($z_{t+1}^{(j')} \sim p(z^{(j')})$).

\section{Methodology}\label{sec:method}

To solve the extended CBO problem introduced in \cref{sec:cbo}, we identify relevant contextual variables, building on a variable selection technique from the GP literature~\citep{sebenius_2022_featurecollapse}.
\Cref{sec:variable_selection} describes our adaptation of this method to the optimization setting,
by restricting the dataset to high function values. \Cref{sec:sadcbo} then presents our sequential algorithm \texttt{SADCBO},
which employs the adapted variable selection method in solving
the optimization problem. A flowchart summarizing the proposed method can be found in Figure~\ref{fig:flowchart}.

\subsection{Variable selection for CBO via sensitivity analysis}\label{sec:variable_selection}

To handle the presence of contextual variables that can be optimized,
one approach is to include them in the design space.
However, such a strategy can be infeasible when their relevance is not known
\textit{a priori} and domain experts can only provide a candidate set of \emph{potentially}
relevant contextual variables. Indeed, this leads to an exponential expansion of the search space, while at the same time increasing the cost of optimization. %g \emph{all} context variables.
In such cases, it is crucial to identify the \emph{relevant} contextual variables, i.e., to find (a good approximation to) the optimal set $J^\star$. This not only allows us to optimize the function more efficiently but also provides additional insights about the experiment to the domain~experts.

To approximate the optimal set $J^\star$, we include those contextual variables that are most relevant for identifying the optimum, which we estimate using sensitivity analysis. Specifically, we adapt the Feature Collapsing (FC)
method~\citep{sebenius_2022_featurecollapse}.
The FC method perturbs training points (namely, by setting one feature to zero),
and measures the induced shift in the posterior predictive distribution in terms of KL divergence.
Given a dataset $\mathcal{D}_t = \{(\x_i, \z_i, y_i)\}_{i=1}^t$, the relevance $r_{i,j}$ on the $i^\textup{th}$ sample of the $j^\textup{th}$ contextual variable $z_i^{(j)}$ is computed as
% 
\begin{equation}
    r_{i,j} \hspace{-.05cm}= \text{KL}\left(p(y_\star| \x_i, \z_i, \D_t)||p(y_\star|\x_i, \z_i \odot \boldsymbol \xi[j], \D_t)\right),
\label{eq:fcmeasure}
\end{equation}
% 
where $\boldsymbol \xi[j] = [\xi^{(1)}, \dots, \xi^{(c)}]$ is a vector so that $\xi^{(j)} = 0$,
and $\xi^{(j')} = 1$ for $j'\neq j$, and $\odot$ is the element-wise multiplication.
The relevance score of the $j^\textup{th}$ contextual variable is then computed as an average over $\D_t$:
\begin{equation} \label{eq:FC_relevance}
        \kl_{\D_t}(j) = \frac{1}{|\D_t|} \sum_{i=1}^{|\D_t|} \left(\frac{r_{i,j}}{\sum_{j'=1}^c r_{i,j'}} \right).
    \end{equation}
The FC scores obtained in this manner reveal the variables that are relevant for predicting the output
\emph{across} $\D_t$. As our goal is to \emph{maximize} $f$,
we are interested in identifying contextual variables that are relevant for \emph{high} function values.
Hence, we adapt \Cref{eq:FC_relevance} to the BO setting by modifying the dataset over which the scores
are averaged. We use information about high function values from two different sets: %\sjs{Do we have enough space to put these in an \texttt{\textbackslash enumerate}, or to give each set its own paragraph?
 %   It feels a bit overwhelming for them to be embedded in the same pp.
%}
(1) The subset $\D^{\gamma_t}_t$ associated with the highest output values observed so far:
% 
\begin{equation}
    \D^{\gamma_t}_t = \{(\x_i, \z_i, y_i) \in \D_{t}~ | ~y_i /y_{\text{best}} \ge \gamma_t \},
    \label{eq:dgamma}
\end{equation}
where $y_{\text{best}} = \max_{1\le i \le t}~ y_i$ is the current observed maximum.
For example, using $\gamma_t = 0.8~ \forall t$ would yield a $\D^{\gamma_t}_t$ that consists
of all observations whose output is at least $80\%$ of the best value observed so far.
(2) We select a batch of $Q$ points $\mathcal{D}^Q_t :=\{(\x^\star_q, \z_{t+1})\}_{q=1}^Q$
that are promising given the next context $\z_{t+1}$:
\begin{equation}
     \{\x^\star_q\}_{q=1}^Q =\underset{\{\x_q\}_{q=1}^Q \in \X^Q}{\argmax}~\alpha^{\text{Batch}}(\{(\x_q, \z_{t+1})\}_{q=1}^Q|\D_{t}),
\label{eq:dalpha}
\end{equation}
where $\alpha^{\text{Batch}}$ denotes a batched version of the acquisition function $\alpha$ such as $Q$-UCB for UCB~\citep{wilson2017reparameterization}.
We use the union $\mathcal{D}_t^{\text{BO}} = \mathcal{D}^{\gamma_t}_t \cup \mathcal{D}^Q_t$
as our dataset for FC. Therefore, we compute $\kl_{\D_t^{\text{BO}}}$ based on \Cref{eq:FC_relevance}.
The importance of working with $\D_t^{\text{BO}}$ instead of $\D_t$ is illustrated in \Cref{fig:summary}
on a toy example.

We successively select the indices of the contextual variables with the highest FC scores until their
cumulative FC score exceeds some chosen threshold $\eta \in [0,1]$, meaning that the selected variables
explain the fraction $\eta$ of the output sensitivity amongst all contextual variables.
Let $J_\eta $ denote the set of indices of the selected contextual variables. We train a GP surrogate based
on $\{(\x_i, \z_i^{(J_\eta)}, y_i)\}_{i=1}^t$ and can select a new design through maximization of the
acquisition~function~$\alpha$:
% 
\begin{equation}
\x_{t+1} = \underset{\x \in \X}{\argmax}~\alpha(\x, \z^{(J_\eta)}_{t+1}|\D_{t}) .
\label{eq:sensicucb}
\end{equation}
Note that other measures of variable relevance could have been used, e.g.,~the method
proposed by~\citet{Spagnol2019sensibo} based on maximum mean discrepancy \citep{Gretton2012JMLR}.
We found FC to perform better (see~\Cref{sec:results}).
\begin{figure}[t]
    \centering
    \includegraphics[width=1\linewidth]{figures/crfinalsummaryfig.pdf}
    \vspace{-6mm}
    \caption{\textbf{Sensitivity analysis on $\D_t^{\textnormal{BO}}$ characterizes variable importance
    at the optimum faster than $\D_t$.} \emph{Top left}: 2D black-box objective together with the queries
    produced along a BO trajectory. Initial samples are represented by empty dark-colored triangles,
    newly obtained samples as dots with an increasingly lighter color. \emph{Top right}: Best value found during
    the optimization trial. \emph{Bottom left}: Sensitivity indices for $z^{(1)}$ and $z^{(2)}$ averaged over
    $\D_t^{\text{BO}}$. As we converge to the optimum, $\D_t^{\text{BO}}$ mainly involves samples close to the
    optimum, leading to a different variable relevance ranking (iteration 30 to the end; $z^{(1)}$ is more
    relevant) compared to the early iterations (10 to 30; $z^{(2)}$ is more relevant). \emph{Bottom right}:
    Sensitivity indices computed on the whole dataset $\D_t$ do not converge as quickly and do not capture
    the shift in relevance close to the optimum.}%\looseness-1}
    \label{fig:summary}
    % 
\end{figure}

\subsection{Sensitivity-Analysis-Driven CBO (\texttt{SADCBO})}\label{sec:sadcbo}

Building on top of the variable selection method discussed in \Cref{sec:variable_selection},
we now present \texttt{SADCBO}, a sequential method for performing BO in the presence of irrelevant
contextual variables.
\texttt{SADCBO} proceeds in two phases.

In the first, \emph{observational} phase, we choose to only observe the values of the contextual variables
without optimizing over them. This ensures that we do not waste budget optimizing the contextual variables
when their relevance is computed based on a limited amount of data, and hence can be noisy.
We select the contextual variables based on their FC relevance and then use vanilla CBO as described in
\Cref{sec:cbo} to optimize the design variables. Thus, in this phase, we leverage the available contextual
information to guide design selection.

In the early stage of the optimization, cheap queries where contextual variables are not optimized still provide a considerable amount of information. The information gained from purely observing contextual variables will, however, saturate at some point,
leading to diminishing simple regret differences. %x\sjs{Do you mean diminishing gains in performance measures like regret?}. %Hence, controlling more dimensions of the problem for a significantly higher price becomes necessary.
At this point, it becomes necessary to pay the higher price to control more dimensions of the input space.
This motivates the introduction of a second phase,
in which contextual variables can have their values arbitrarily set, through optimization.



In the second, contextual \emph{optimization} phase, we optimize the contextual variables selected at each
iteration based on their FC relevance. As optimizing a context variable $z^{(j)}$ is associated with a cost
$\lambda_j$, 
we modify the FC relevance in \Cref{eq:FC_relevance}:
%
\begin{equation}
     \tilde{\kl}_{\D_t}(j) =  \kl_{\D_t}(j) /\lambda_j .
\label{eq:fccost}
\end{equation}
%
Our variable selection criterion can then be interpreted as the degree of sensitivity \emph{per unit cost}.
This allows \method to automatically trade off a variable's potential to greatly affect the optimum with
the associated optimization cost. As before, once the contextual variables $\z^{(J_\eta)}$ have been selected,
we train a GP surrogate using  $\{(\x_i, \z_i^{(J_\eta)},y_i)\}_{i=1}^t$ and select the next design
and contextual variables to query~as
% 
\begin{equation}
    (\x_{t+1}, \z^{(J_\eta)}_{t+1}) = \hspace{-.1cm}\underset{(\x,\z^{(J_\eta)}) \in \X
     \times \prod_{j \in J_\eta}\Z_j}{\argmax} \alpha(\x, \z^{(J_\eta)}|\D_{t})
     %+ \beta_t^{1/2}\sigma_t(\x,\z^{(J_\eta)}|\D_{t})
    .
\label{eq:controlaf}
\end{equation}
In effect, $J_\eta$ represents our approximation for $J^\star$ as introduced in~\Cref{eq:objective}.
Note that our acquisition function is not cost-weighted, as cost-weighted acquisition functions can dramatically
underperform~\citep{eriksson_2021_nonmyopiccost}, specifically for non-continuous cost models.
Including the cost at the model selection level avoids this issue.


\paragraph{Switching from observational to optimization phase.}
We employ the criterion proposed by
\citet{ishibashi23stopping} for determining the stopping time in BO\@. Using this criterion, we detect the
point at which the optimization gain based on purely observing the contextual variables diminishes,
following which the contextual optimization phase begins. We now briefly describe the details of this switching criterion.

With $\bv = [\x, \z]$, let $\bv^\star_t = \argmax_{\bv \in \D_t}f(\bv)$ be the current best candidate point
in the dataset up to time $t$. Denoting $f^\star := \max_{\bv \in \V} f(\bv)$, let
$R_t = f^\star - \mathbb{E}_{\hat{f}\sim p(f|\D_t)}[\max_{\bv \in \V}~\hat{f}(\bv)]$ be the expected minimum
simple regret.
Then, with probability $1-\delta$, $\Delta R_t = \lvert R_t - R_{t-1}\rvert$ can be upper bounded by $\Delta \tilde{R}_t$ with
% 
\begin{align}
    \Delta \tilde{R}_t &= v(\phi(g) + g\Phi(g)) + \lvert \Delta \mu_t^\star \rvert\nonumber\\
    &\quad + \kappa_{\delta, t-1} \sqrt{\frac{1}{2}\text{KL}(p(f|\D_t)||p(f|\D_{t-1}))},
\label{eq:earlystopping}
\end{align}
% 
where $\phi(\cdot)$ and $\Phi(\cdot)$ are the p.d.f.\ and c.d.f.\ of a standard Gaussian distribution,
respectively, $\Delta \mu^\star_t := \mu_{t}(\bv^\star_{t}) - \mu_{t-1}(\bv^\star_{t-1})$,
$v := \sqrt{\sigma_t^2(\bv_t^\star) - 2\Sigma_t(\bv_t^\star, \bv_{t-1}^\star) +\sigma_{t}^2(\bv_{t-1}^\star)}$,
$g := \Delta \mu_t^\star /v$, and $\kappa_{\delta, t-1}$ is a sequence indexed by $t$ and depending on $\delta$.
Then, we switch from the observational to the optimization phase in \texttt{SADCBO}
when $\Delta \tilde{R}_t \leq s_t$, where
\begin{equation}
    s_t := \frac{\big(\sigma_{t-1}(\bv^\star_t)+\kappa_{\delta, t-1}/2\big)
    \sigma_{t-1}(\bv_t)\sigma_{\text{noise}} \sqrt{-2\log\delta}}{\sigma^2_{t-1}(\bv_t)
    +\sigma_{\text{noise}}^{2}}.
\label{eq:ineg}
\end{equation}
Further details about the derivation of $s_t$ and the expression of $\kappa_{\delta,t-1}$ can be found in \Cref{sec:stopcrit}. The entire algorithm is summarized in \Cref{bo-algo}.

\begin{algorithm}[t]
    \caption{\texttt{SADCBO}}
    \label{bo-algo}
        \begin{algorithmic}[1]
            \STATE \textbf{Input}:
            initial dataset $\D_0$, hyperparameters $\eta$ and $\gamma$, batch size $Q$, budget $\Lambda$, costs $\lambda_\x, \lambda_1, \dots, \lambda_c$
            \STATE Train initial GP using $\D_0$ and all variables $[\x,\z]$. \texttt{phase} $=$ observational. $t=0$.
            \WHILE {$\Lambda \geq \lambda_\x$}
                \STATE Receive context $\z_{t+1}\sim p(\z)$
                \STATE Assemble dataset $\mathcal{D}_t^{\text{BO}}$~(\Cref{eq:dgamma,eq:dalpha})
                \STATE Compute $\kl_{\D_t^{\text{BO}}}(j)$ for all $j$~(\Cref{eq:FC_relevance} or~(\ref{eq:fccost}) if \texttt{phase} = optimization)
                \STATE In descending order, add indices to $J_\eta$ until $\sum_{j\in J_\eta} \kl_{\D_t^{\text{BO}}}(j) >\eta$
                \STATE Train lower-dimensional GP using $\{(\x_i, \z_i^{(J_\eta)}, y_i)\}_{i=1}^{t}$
                \STATE Get $\x_{t+1}$~(\Cref{eq:sensicucb}) (and $\z_{t+1}$~(\Cref{eq:controlaf}) if \texttt{phase} = optimization)
                \STATE Acquire observation $y_{t+1}$ at $[\x_{t+1}, \z_{t+1}]$
                \STATE $\D_{t+1} \gets \D_{t} \cup \{(\x_{t+1},\z_{t+1},y_{t+1})\}$
                \STATE Retrain full GP
                \IF {\texttt{phase} = observational and $\Delta \tilde{R}_t \le s_t$ [based on $p(f|\D_{t+1})$]~(\Cref{eq:ineg})}
                    \STATE \texttt{phase} = optimization
                \ENDIF
                \STATE $\Lambda \gets \Lambda - \lambda_\x - \sum_{j\in J_\eta}\lambda_j,~t\gets t+1$
            \ENDWHILE
        \end{algorithmic}
    \end{algorithm}

\section{Related work}\label{sec:rw}


\paragraph{Robust BO.}
\citet{bogunovic2018adversarially,bogunovic_2020_drbo, husain2023distributionally} and~\citet{robustsat}
perform worst-case optimization under fluctuations of the contextual variables.
In particular, Distributionally-Robust BO \citep[DRBO,][]{bogunovic_2020_drbo} tries to maximize the expected black-box function
value under the worst-case distribution of the contextual variables. This worst-case distribution belongs
to an ``uncertainty set'', a ball centered around a reference distribution that is gradually
learned~\citep{tulabandhula2014robust}. However, as in \citet{krauze_11_contextbo}, these works assume
that the relevant contextual variables are known \textit{a priori}, and can only be observed, \emph{after the designs have been selected}, and not controlled.

\paragraph{High-dimensional BO.}

Due to the curse of dimensionality,
the performance of standard BO is severely degraded when applied in high-dimensional input spaces.
To tackle this problem, most approaches either aim at carrying out BO in a lower-dimensional space
instead of the original or work with a structured GP surrogate.
A lower-dimensional subspace can be found in a data-agnostic manner,
for instance by randomly dropping dimensions of the problem~\citep{li2018high} or considering tree-like
random decompositions~\citep{ziomek2023random}. Data-driven methods based on various measures of feature
relevance have also been proposed~\citep{Spagnol2019sensibo,shen2021vsbo}.
In contrast, structured surrogate methods encode structural information about the objective,
for instance using an additive kernel, yielding an acquisition function that is additive under
the provided decomposition~\citep{rolland_2018_additivebo}. Finally,~\cite{eriksson_21_saasbo}
and~\cite{eriksson_2023_sparsebo} proposed using a sparsity-enforcing GP surrogate, equipped with a
heavy-tailed horseshoe prior on the squared inverse lengthscales.

\paragraph{Cost-aware BO.}

In most methods, the BO budget is given in iterations, implicitly assuming that each evaluation
has the same cost. In practice, cost may vary significantly across different regions of the input space~\citep{lee2020costaware},
or depend on the number of variables we optimize over.
Cost-aware BO integrates the cost-constrained nature of the problem, usually within the acquisition function. Let us also mention more involved strategies like constrained Markov decision processes when the total budget
is known beforehand~\citep{eriksson_2021_nonmyopiccost}. The recent work by \citet{bocv} carries out Robust BO while at the same time involving a notion of controlled variables at a cost. However---unlike our framework---they require the nonselected variables to be sampled from a \emph{known} distribution at each iteration.

\newcommand*{\myparagraph}[1]{\textbf{#1}}

\section{EXPERIMENTAL RESULTS}\label{experiments}

We evaluate our approach on several real-world examples and synthetic functions, described in \cref{sec:synthetic_exp}.
We compare against multiple baselines (\cref{tab:baselines}) and present results in \cref{sec:results}.
In \cref{sec:ablation} we discuss the influence of various experimental settings: % on the performance of our method:
number of noise variables present, contextual variable query cost, surrogate and method hyperparameters. We conclude by presenting several insights regarding the phase-switching criterion.

\myparagraph{Baselines.}
We benchmark our approach, coined \texttt{SADCBO}, against baselines referenced in Table~\ref{tab:baselines}.
In particular, \texttt{MMDBO} performs variable selection in BO through an MMD-based measure of sensitivity~\citep{Spagnol2019sensibo} and is detailed in \Cref{sec:mmd}, whereas \texttt{Dropout}~\citep{li2018high} randomly selects half of the contextual variables for optimization. Next, \texttt{CaBO}~\citep{lee2020costaware} performs vanilla BO over $[\x,\z]$, using a cost-weighted acquisition function. The cost model employed here is a smoothed version of our noncontinuous cost model, using a Gaussian curve. Finally, \texttt{CBO} refers to the Contextual BO framework proposed by~\cite{krauze_11_contextbo}. As a way to assess the impact of contextual variables and selection mechanisms, we also report \texttt{CUBO} and \texttt{VBO}: Context-Unaware BO over the designs $\x$ only %at cost $\lambda_\x$,
and Vanilla BO over both design and contextual variables $[\x,\z]$.


\begin{table}[t]
    \centering
    \caption{Methods used in experiments.
      }
    \scalebox{0.66}{
\begin{tabular}{p{0.06\textwidth}lc}
                \toprule
                & Name & Description\\
                \midrule
                \multirow{4}{*}{\parbox{1.5cm}{Without \\ variable selection}} & \textcolor{orangepython}{\texttt{CUBO}} & Context-Unaware BO over $\x$ only\\
                & \texttt{VBO} & Vanilla BO over $[\x, \z]$\\
                & \textcolor{darkcyanpython}{\texttt{CaBO}} & Cost-Aware BO over $[\x, \z]$ \citep{lee2020costaware}\\
                & \textcolor{darkbluepython}{\texttt{CBO}} & Contextual BO using all  contexts $\z$ \citep{krauze_11_contextbo}\\
                \midrule
                \multirow{3}{*}{\parbox{1.5cm}{With \\ variable selection}} & \textcolor{pinkpython}{\texttt{Dropout}} & Randomly drop half of context variables \citep{li2018high} \\
                & \textcolor{magentapython}{\texttt{MMDBO}} & Maximum mean discrepancy-driven BO \citep{Spagnol2019sensibo} \\
              & \textcolor{crimsonpython}{\texttt{SADCBO}} &  \textbf{Sensitivity analysis-driven CBO} (\textbf{This work})\\
                \bottomrule
\end{tabular}
}\label{tab:baselines}
\end{table}

\myparagraph{Implementation details.}
We fix the hyperparameters of \texttt{SADCBO} to $\eta=0.8,Q=10,\gamma_t=0.8~ \forall t$. 
For the GP surrogate, we use a squared-exponential kernel with independent lengthscales for each variable,
learned through marginal likelihood maximization. We use the UCB acquisition strategy, as well as $Q$-UCB
for computing $\D_t^Q$ (\Cref{eq:dalpha})~\citep{wilson2017reparameterization}.
In all experiments, we assume that every variable, whether design or contextual, has cost
$\lambda_j = 1~\forall j \in \{1,\dots,d+c\}$, except in a dedicated study in Section~\ref{sec:ablation}.
Our algorithm is implemented using the \texttt{BoTorch} framework~\citep{balandat2020botorch}. Code can be accessed at \href{https://github.com/julienmartinelli/SADCBO}{https://github.com/julienmartinelli/\texttt{SADCBO}}.

\subsection{Experiments}\label{sec:realworld}

We benchmark on 4 real-world and 4 synthetic experiments (\Cref{tab:exps}) described in
brief here and detailed in \Cref{sec:expdetails}.

\begin{table}
  \caption{Dimensionality of the experiments. For synthetic experiments, additional dimensions stand for (artificial) noise variables, put on top of the design and contextual variables.}
  \centering
\scalebox{.9}{
\begin{tabular}{llcc}
                \toprule
                Experiment & \makecell{All\\dimensions} &  \makecell{Design\\variables} & \makecell{Contextual\\ variables}\\
                \midrule
                Portfolio & 5 & $3$ & $2$\\
                Yacht & 6 & 4 & 2\\
                Robot & 14 & 6 & 8\\
                Molecule & 21 & 3 & 18\\
                \midrule
               EggHolder & 2 + 4 & 1 & 1\\
                               Hartmann4D & 4 + 3 & 2 & 2 \\
                               Hartmann6D & 6 + 6 & 3 & 3\\
                               Ackley & 5 + 8 & 2 & 3\\
                \bottomrule
\end{tabular}}
\label{tab:exps}
\end{table}

\begin{figure*}[ht!]
    \centering
    \includegraphics[width=1\linewidth]{figures/uai.pdf}
    \caption{
    Benchmark of the different methods.
   \textbf{(a)} On real-world datasets, \method (red curve with white markers) performs on par with other baselines and is the top performer for the Robot Pushing task. 
   \textbf{(b)} On synthetic functions, 
    \texttt{SADCBO} outperforms other baselines in three cases out of four.
    \textbf{(c)}
    Histograms of the phase switching criterion triggering times for \texttt{SADCBO}, computed for the Hartmann6D (\textbf{c.1}) and Hartmann4D problems (\textbf{c.2}).
    \textbf{(d)} Inclusion probability of each contextual variable for \texttt{SADCBO}  computed for the Hartmann6D (\textbf{d.1}) and Hartmann4D problems (\textbf{d.2}).
    Each panel shows the mean $\pm 2$ standard error across $N=100$ trials.}
    \label{fig:killer}
\end{figure*}

\myparagraph{Portfolio Optimization 5D.} This dataset was first introduced by~\citet{NEURIPS2020_e8f27796}.
The goal is to optimize three design variables, which stand for the hyperparameters of a trading strategy,
to maximize return under random environmental conditions. There are two contextual variables: the bid--ask spread and the borrowing cost.

\myparagraph{Yacht Hydrodynamics 6D.} This dataset comes from the UCI Machine Learning
Repository~\citep{misc_yacht_hydrodynamics_243}.
The optimization problem is to maximize the residuary
resistance per unit weight of displacement of a yacht by controlling its 5-dimensional hull
geometry coefficients. 
Design variables are the first four dimensions of the hull geometry coefficients.
The contextual variables are the last hull geometry dimension and the Froude number.

\myparagraph{Molecule structure optimization 21D.} This computational chemistry example consists of optimizing
the bond angles in an alanine molecule to determine the lowest energy conformer, i.e.,
the structure the molecule will likely take in nature. These problems are complicated by high dimensionality.
We consider alanine, a molecule with 21 angular variables: 3 key variables set as design variables based on prior domain knowledge,
and 18 other angles treated as contextual variables. Molecular energies are calculated with the AMBER
forcefield~\citep{amber} at each round of BO\@.

\myparagraph{Robot pushing task 14D.} We follow~\citet{wangrobot} and consider a control parameter tuning problem for robot pushing. This real-world function returns the distance between a designated goal location and two objects being pushed by two robot hands, whose trajectory is determined by 14 parameters specifying the location, rotation, velocity and moving direction, among others. There are 6 design variables and 8 contextual variables.

\myparagraph{Synthetic experiments.}\label{sec:synthetic_exp}
We also consider four synthetic test functions (see \cref{tab:exps,sec:synthetic} for details).
A min-max transformation is performed on the input data, scaling it to the unit cube:
$\X \times \Z = [0,1]^{d+c}$. Similarly, the output is scaled between $[0,1]$ and a noise term
$\varepsilon\sim\mathcal{N}(0, \sigma^2_{\text{noise}})$ is added with $\sigma^2_{\text{noise}}=0.001$.
The contextual variable distribution is $p(\z) = \mathcal{U}([0, 1]^c)$.

\subsection{Results}\label{sec:results}

\paragraph{Real-world experiments.}
In each plot from \Cref{fig:killer}, we report the best value found by each baseline as a function of the number of iterations.
In real-world experiments (\Cref{fig:killer}\textbf{a}), \method (in red with white markers) quickly converges to the optimum.
\method consistently outperforms
the first baselines \texttt{VBO} and \texttt{CUBO}, even though
in the Molecular Shape example, \method and \texttt{CUBO} perform on par due to the good choices of the domain experts on the design variables. Except for the Robot Pushing task, the difference between \method and \texttt{CBO}
(in blue) is marginal in the real-world experiments. The latter enhances the surrogate model with information from sampled
contexts, while our method may even optimize selected contextual variables if needed. Given that these
baselines perform similarly, combined with the observation that optimizing only design variables
(\texttt{CUBO}, in yellow) produces poor results for the Portfolio and Yacht problems,
we can conclude that contextual variables play a significant part in maximizing these two objectives.
The cost-aware BO baseline \texttt{CaBO} performs poorly in all tasks. 
\texttt{Dropout} and \texttt{MMDBO}
consistently underperform, except on the Yacht example for the latter. These baselines perform variable selection randomly for \texttt{Dropout} and using the Hilbert--Schmidt Independence Criterion for \texttt{MMDBO}~\citep{grettonnips}, two strategies that do not seem to surpass the Feature Collapse method implemented in \method.
This observation highlights the need for an informed variable selection strategy. 
In-depth findings for the Molecule experiment are presented in \Cref{sec:rwdetails} and provide additional explanations as to why \method clearly outperforms \texttt{MMDBO} and \texttt{Dropout}.

\paragraph{Synthetic experiments.}

\Cref{fig:killer}\textbf{b} displays the best value found by each baseline for synthetic functions.
\method ranks first on 3 out of 4 examples, closely followed by the cost-aware baseline \texttt{CaBO}, which performs much better on synthetic experiments than on the real-world ones. The contextual BO baseline \texttt{CBO}, which obtained the second-best results in the real-world experiments, now performs worse because, like \texttt{CUBO}, it does not optimize the context.
% Here, \texttt{CBO}, the baseline method that performed second best in the real-world settings, performs relatively poorly.
%     This is because unlike \method (but like \texttt{CUBO}, the other contextual BO baseline which also performs relatively poorly), \texttt{CBO} does not optimize over the contextual variables.
This seems to be particularly critical for Ackley5D, whereas for Hartmann6D/Hartmann4D, simply enhancing the surrogate with contextual variable observation already leads to a large performance gap between \texttt{CUBO} and \texttt{CBO}. 
Lastly, \texttt{VBO} does a poor job
as it optimizes every variable,
thus spending a large fraction of the budget every iteration.

For Hartmann6D and Hartmann4D, \Cref{fig:killer}\textbf{c} reports the time at which \method's switching
criterion (\Cref{eq:ineg}) kicks in, in proportion to the total budget% \sjs{the time step (as a proportion of the total budget) at which \method's switching criterion kicks in}
, demonstrating that both phases
are leveraged in our approach. 

Finally, \Cref{fig:killer}\textbf{d} reports the sensitivity indices computed at each iteration for each
contextual variable, averaged across whole trajectories of multiple trials. For Hartmann6D, the results
match the Sobol sensitivity analysis results (Table~\ref{tab:sobol}), even though global sensitivity
indices may differ from sensitivity indices with respect to the function optimum. Similar findings
apply to Hartmann4D (Table~\ref{tab:sobhart4}). Results for other problems can be found in
\Cref{fig:SI-suppmain}.

\begin{tcolorbox}[colback=violet!13!white,colframe=violet!50!white,boxrule=0mm,arc=0mm,bottomtitle=0.5mm,left=2mm,leftrule=1mm,right=2mm,toptitle=0.75mm]
\textbf{Main takeaways.}
Quantitatively, \texttt{SADCBO} achieves the best overall performances, ranking first in 7 out of 8 problems, although other methods obtained comparable performances on 5 out of 8 problems.\\

The second-best and third-best methods, \texttt{CBO} and \texttt{MMDBO}, both severely underperform in two examples (Ackley and Hartmann6D for \texttt{CBO}, Molecular Shape and EggHolder for \texttt{MMDBO}). While the improvements provided by \texttt{SADCBO} may seem marginal, they are consistent across the benchmark.\\

We hypothesize that this consistent behavior stems from our two-stage approach, which allows \texttt{SADCBO} to be versatile. \method can handle both cases where the impact of the contextual variables on the function is limited (hence it is not worth spending budget to control them) and cases where spending budget leads to informative queries. For instance,
\texttt{SADCBO} effectively reverted to a \texttt{CBO} algorithm in the Molecular Shape problem, due to an optimization phase mostly triggered at the end of the run.
Meanwhile, for the Ackley function, the optimization phase was triggered in the first quarter of the budget on average, leading to 
\texttt{SADCBO} outperforming \texttt{CBO}.
\end{tcolorbox}

\subsection{Sensitivity analysis}\label{sec:ablation}

We now report experiments assessing the robustness of \method's performance to several modifications,
either at the hyperparameter level or at the experiment setting level. The latter includes assessing
performance when increasing the number of noise variables, varying the contextual variable query cost,
or varying the surrogate model. Next, additional experiments illustrate the sound behavior of the proposed phase switching criterion implemented in \texttt{SADCBO}.

\paragraph{Number of irrelevant contextual variables.}

We compare the performance reached by \method when adding an increasingly larger number of noise variables and find that even for a large number of irrelevant contextual variables, \method reaches top performance on 3 out of 4 examples (\Cref{fig:SI-abladim}). The gap in performance between \method and \texttt{CaBO}, 
 \texttt{Dropout} and \texttt{MMDBO} seems to overall grow with the number of nuisance variables, in favor of \method.

\paragraph{Contextual variables optimization cost.}

We investigate four different values for the query cost of contextual variables (Figure~\ref{fig:SI-ablacost}). For extremely cheap contextual variables ($\lambda_j=0.1$ for all $j$, that is, ten times cheaper than a design variable), \texttt{VBO} performs favorably, as optimizing over all inputs $[\x,\z]$ is cheap. \method remains competitive in this configuration, even though \texttt{MMDBO} and \texttt{CaBO} perform on par. For a moderate cost $\lambda_j = 1$ (the cost model considered in Figure~\ref{fig:killer}), \method obtains the lowest average rank over all four test functions. For expensive contextual variables ($\lambda_j=3$ or $\lambda_j=10$), \texttt{CaBO} seems overall more suitable, although closely followed by \method and \texttt{CBO}.

\paragraph{Sparsity-enforcing surrogates with \method.}

As \method relies on a posterior sensitivity analysis to select the relevant contextual variables, and is hence agnostic to the choice of GP surrogate model, it can be combined with other methods that induce sparsity via the GP surrogate. One such method is by \citet{eriksson_21_saasbo}, who introduced a sparsity-enforcing GP surrogate equipped with a horseshoe prior on the square inverse lengthscales, coined \texttt{SAASBO}. In Figure~\ref{fig:kernel}, we compare \method with the combined method \texttt{SAASBO+SADCBO}, with both having the same hyperparameters. We observe that \texttt{SAASBO+SADCBO} improves over just \texttt{SAASBO} in all the synthetic examples, and is also better than \method in two out of four examples. Note that the performance of \texttt{SAASBO+SADCBO} may further improve through hyperparameter tuning.

\begin{figure}[ht!]
    \centering
    \includegraphics[width=1\linewidth]{figures/cameraready_saasbo.pdf}
    \caption{Combining \method with the sparsity-enforcing surrogate \texttt{SAASBO}.
    For any variable, the associated query cost is 1. $p(\z) = \mathcal{U}([0,1]^c)$.
    The combination is fruitful and improves the performances of \texttt{SAASBO}.}
    \label{fig:kernel}
\end{figure}

\begin{figure}[ht!]
    \centering
    \includegraphics[width=1\linewidth]{figures/crhartmannstopexperiment.pdf}
    \caption{Assessing \texttt{SADCBO}’s phase switching criterion on the Hartmann6D function. The iteration selected by the adaptive stopping criterion implemented in \texttt{SADCBO} yields one of the best BO trials. Each curve is computed as an average of 10 different random seeds.}
    \label{fig:hartstop}
\end{figure}

\paragraph{\method phase switching criterion.}

We ensure that the criterion is well-behaved: the more information about
the output is contained in the contextual variables, the later the phase switching occurs
(Figure~\ref{fig:SI-earlystopping}). Even though the stopping criterion was initially devised for vanilla BO, its application in a CBO setting is fruitful. Figure~\ref{fig:hartstop} further illustrates the soundness of the phase switching criterion. Using the Hartmann6D function under the same setting as described above, the mean switching iteration found by \texttt{SADCBO} over 100 different runs was collected. Then, new BO runs using \texttt{SADCBO} with a \textbf{fixed} phase switching time $i \in \{1,\dots,100\}$ were performed. This was done 10 times for each switching time, using different random seeds for 
the initial dataset. The switching time found by \texttt{SADCBO} yields one of the best runs, validating the use of the criterion.

\paragraph{\method hyperparameters.}

We vary the 3 hyperparameters of \texttt{SADCBO}: $\eta, \gamma, Q$. 
Unsurprisingly, the cumulative sensitivity threshold $\eta$ stands out as the most relevant parameter:
as its value decreases, fewer variables are included, at which point not all relevant ones are selected,
leading to reduced performance (\cref{sec:addres}).

\section{CONCLUSION}

In this paper, we extended Contextual BO~\citep{krauze_11_contextbo} to settings in which the contextual
variables may be not only observed but also optimized at a cost. We introduced \texttt{SADCBO},
an algorithm designed to select relevant context variables affecting the experimental outcomes by efficiently
leveraging information present in both the observational and the interventional data. \method results
in more adequate surrogate models, and ensures the reproducibility of experiments by controlling for such
relevant variables. In that respect, \method should be used for practical applications where contextual
variables can have an influence while being controllable. This includes, e.g., the development of new
high-throughput materials or drugs, where machine learning strategies are being increasingly
used~\citep{zhang2020bayesian, bombarelli2018lbo}. \method can also be combined with any GP surrogate.
Thus, if a practitioner believes that a specific contextual variable should be included,
this can be easily achieved. Conversely, the variable selection procedure could be generalized to discard
design variables as well.
Lastly, recent work~\citep{aglietti_2023_ceo} proposed to perform BO under the assumption that the input variables and the output are linked by a causal directed acyclic graph, learning the graph whilst maximizing the objective function. Despite its high computational complexity, applying this technique to our particular problem might be promising.

\paragraph{Limitations and future work.}
To achieve cost efficiency, \method integrates the query cost at the variable selection level and employs
an early stopping criterion. The latter only depends on an upper bound on the instantaneous regret difference
and is therefore not cost-aware. Adding a notion of remaining budget to this criterion would certainly benefit
our approach.
On a similar note, while our algorithm incorporates cost, more effort could be put into specifying the costs.
In our experiments, they were set to $1$ for all variables to prevent bias in the results, and we carried
out an ablation study with different costs in \Cref{sec:ablation}. Yet, it is worth mentioning that our method is compatible with the inference of black-box, input-dependent costs, similarly to \texttt{CaBO}~\citep{lee2020costaware}. One would simply need to modify~\Cref{eq:fccost}, replacing $\lambda_j$ by the learned cost.
An interesting avenue for future work would be to elicit knowledge of experimental costs
from domain experts in real-world situations.% \looseness-1

\begin{acknowledgements} JM acknowledges the support of the Research Council of Finland under the HEALED project (grant 13342077). AB, AT, STJ, and PR were supported by the Research Council of Finland Flagship programme: Finnish Center for Artificial Intelligence FCAI.
AT further acknowledges funding from the European Union's Horizon 2020 research and innovation programme under the Marie Skłodowska-Curie grant agreement No. 101059891.
SJS and SK were supported by the UKRI Turing AI World-Leading Researcher Fellowship, [EP/W002973/1].


\end{acknowledgements}


% References
\bibliography{ref}

\newpage

\onecolumn
\emptythanks
\title{Learning Relevant Contextual Variables Within Bayesian Optimization \\(Supplementary Material)}
\maketitle


%This Supplementary Material should be submitted together with the main paper.

\appendix

\setcounter{section}{0}
\setcounter{figure}{0}
\setcounter{equation}{0}
\setcounter{table}{0}
\renewcommand\thesection{\Alph{section}}
\renewcommand{\thetable}{S\arabic{table}}
\renewcommand{\thefigure}{S\arabic{figure}}
\renewcommand{\theequation}{S\arabic{equation}}
%\renewcommand{\thealgorithm}{S\arabic{algorithm}}

\section*{Appendix}

\paragraph{Outline.} The Appendix is organized as follows. In \Cref{sec:flowchart}, we provide a flowchart summarizing the proposed method \method. In \Cref{sec:stopcrit}, we provide further details about the phase switching criterion introduced in \Cref{sec:sadcbo}. In \Cref{sec:mmd},
 we provide more details about one of the baselines used in the main text,
based on maximum mean discrepancy.
\Cref{sec:addres} contains further experimental results regarding:
\begin{itemize}
    \item Phase switching time and sensitivity-based inclusion probabilities of
    contextual variables found by \method for additional test functions (\Cref{fig:SI-suppmain}).
    \item Varying the number of irrelevant contextual variables (\Cref{fig:SI-abladim}).
    \item Varying contextual variables query cost (\Cref{fig:SI-ablacost}).
    \item The distribution of phase switching times for \texttt{SADCBO} (\Cref{fig:SI-earlystopping}).
    \item Varying \method hyperparameters (\Cref{sec:hyper} and Figure~\ref{fig:SI-ablahyper}).
\end{itemize}
Finally, \cref{sec:expdetails} contains a description
of the real-world experiments performed throughout the paper, along with the analytical
expressions of the synthetic examples used.

\section{Flowchart of the algorithm}\label{sec:flowchart}

\begin{figure*}[ht!]
    \centering
    \includegraphics[width=1\linewidth]{figures/finalflowchart.pdf}
    \caption{Flowchart of the proposed method \texttt{SADCBO}.}
    \label{fig:flowchart}
\end{figure*}

\section{Phase Switching Criterion}\label{sec:stopcrit}

The phase switching criterion we employ is derived from the stopping criterion of~\citet{ishibashi23stopping}. The absolute difference of expected minimum simple regrets $\Delta R_t := \lvert R_t - R_{t-1}\rvert$ can be upper bounded with probability $1-\delta$ by $\Delta \tilde{R}_t$, a quantity defined in \cref{eq:earlystopping}. Directly quoting the work of~\cite{ishibashi23stopping}, the rationale behind this criterion reads as follows: ``By evaluating the difference between the expected minimum simple regrets, we can stop BO without knowing $f^*$,
because it indicates that the search efficiency is low and there is almost no improvement in the objective value. However, it is generally difficult to calculate $\Delta R_t$ analytically''.
Next, any stopping criterion involves the computation of some sort of threshold. \cite{ishibashi23stopping} exploit the fact that their upper bound $\Delta\tilde{R}_t$ can itself be upper bounded by a quantity (introduced in~\cite[Equation 10]{ishibashi23stopping}), whose convergence speed to zero is limited by a specific term, $s_t$ (Equation~\ref{eq:ineg}). $s_t$
 can be computed analytically and therefore yields an adaptive threshold.

 
Finally, \Cref{eq:earlystopping} involves a sequence $\kappa_{\delta,t-1}$:
\begin{equation}
    \kappa_{\delta,t-1} = \max_{\bv \in \D_{t-1}}~\text{UCB}_\delta(\bv) - \max_{\bv \in \V} ~\text{LCB}_\delta(\bv),
\end{equation}
where $\text{UCB}_\delta(\bv)= \mu_t(\bv|\D_{t}) + \beta_t^{1/2}\sigma_t(\bv|\D_{t})$ and $\text{LCB}_\delta(\bv)= \mu_t(\bv|\D_{t}) - \beta_t^{1/2}\sigma_t(\bv|\D_{t})$. $\beta_t^{1/2}$ is a trade-off parameter between exploration and exploitation that depends on $\delta$~\citep{srinivas_ucb}. $\kappa_{\delta,t-1}$ is a quantity that was first introduced by~\citet[Section 3.2]{makarova2022automatic} as an upper bound for the simple regret of the surrogate, which directly flows from the bounds provided by~\citet{srinivas_ucb} for well-calibrated surrogates.

Heuristically, one can think of our setting as applying the stopping criterion to $\x \mapsto f(\x,\z)$, a stochastic black-box function with 
$\z\sim p(\z)$. Upon satisfaction of this criterion, we switch to the optimization of 
$(\x,\z) \mapsto f(\x,\z)$ where some contextual variables 
 are optimized, and some others are still sampled from $p(z^{(j)})$.

\section{Maximum Mean Discrepancy-based variable selection}\label{sec:mmd}

\cite{Spagnol2019sensibo} introduced a BO algorithm with a variable selection procedure
based on the Hilbert--Schmidt Independence Criterion (HSIC).
This measure can be used in our setting as well. We now briefly describe how it is defined.

As introduced in the main text, let $\Z \subset \R^c$ be the space of contextual variables,
and $\mathcal{H}$ be a Hilbert space of $\R$-valued functions on $\Z$.
Assume that $k: \Z\times\Z \to \R$ is the unique positive definite kernel associated with
the Reproducing Kernel Hilbert Space $\mathcal{H}$. Let $\mu_{\mathbb{P}_Z}$ be the kernel mean
embedding of the distribution
$\mathbb{P}_Z$, $\mu_{\mathbb{P}_Z} := \mathbb{E}_Z[k(Z,\cdot)] = \int_\Z k(\z,\cdot)\mathrm{d}\mathbb{P}_Z$.
Kernel embeddings of probability measures provide a distance between distributions, defined as the distance between
their embeddings in the Hilbert space $\mathcal{H}$ and named Maximum Mean Discrepancy
\citep[MMD,][]{Gretton2012JMLR}:
\begin{equation}
\text{MMD}(\mathbb{P}_Z,\mathbb{P}_Y) = \norm{\mu_{\mathbb{P}_Z} - \mu_{\mathbb{P}_Y}}_\mathcal{H}.
\end{equation}
For two random variables $Z \sim \mathbb{P}_Z$ on $\mathcal{H}$ and $Y \sim \mathbb{P}_Y$ on
$\mathcal{G}$, the HSIC is the squared MMD 
between the joint distribution $\mathbb{P}_{ZY}$ and the product of its marginals
$\mathbb{P}_Z\mathbb{P}_Y$,
\begin{align}
    \text{HSIC}(Z,Y) &= \text{MMD}^2(\mathbb{P}_{ZY},\mathbb{P}_{Z}\mathbb{P}_{Y})\\
    &= \norm{\mu_{\mathbb{P}_{ZY}} - \mu_{\mathbb{P}_Z\mathbb{P}_Y}}^2_{\mathcal{H}\otimes\mathcal{G}}\\
    &= \mathbb{E}_{Z,Y}\mathbb{E}_{Z',Y'}[k(Z,Z')l(Y,Y')] \label{eq:mmdapprox}\\
    &~~~+ \mathbb{E}_{Z}\mathbb{E}_{Y}\mathbb{E}_{Z'}\mathbb{E}_{Y'}[k(Z,Z')l(Y,Y')] \nonumber \\
    &~~~-2 \mathbb{E}_{Z,Y}\mathbb{E}_{Z'}\mathbb{E}_{Y'}[k(Z,Z')l(Y,Y')]\nonumber.
\end{align}
To determine the relevance of a variable $Z^{(i)}$, \cite{Spagnol2019sensibo} introduce
\begin{equation}
    S^{\text{HSIC}}(Z^{(i)}) = \text{HSIC}(Z^{(i)},\mathbb{I}(Z\in \mathcal{L}_\gamma)),
\end{equation}
with $\mathcal{L}_\gamma$ a region of interest: the locations where the objective
function value is above a threshold $\gamma$. 
This measure reflects how important $Z^{(i)}$ is to reach $\mathcal{L}_\gamma$.

We implemented this measure, replacing expectations with empirical means over the dataset $\D$.
We use $\gamma=0.8$, a threshold identical to the one used for \texttt{SADCBO} in \Cref{eq:dgamma}.
The kernel $k$ is chosen to be an RBF kernel, and $l$ is a linear kernel $l(y,y') = yy'$,
a common choice for binary data.

\section{Additional experimental results}\label{sec:addres}


\begin{figure}[H]
    \centering
    \includegraphics[width=1\linewidth]{figures/uaimainsupp.pdf}
    \caption{Each row deals with a specific problem. The left panel shows the BO trial results for each baseline.
    The middle and right panels show statistics related to \method: 1) the phase switching time after which phase II
    begins and 2) the inclusion probabilities for each contextual variable.
    Statistics are computed across $N=100$ BO trials with different seeds.}
    \label{fig:SI-suppmain}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=.9\linewidth]{figures/fig_uai_nvar.pdf}
    \caption{Varying the number of irrelevant contextual variables. For any variable,
    the associated query cost is 1. $p(\z) = \mathcal{U}([0,1]^c)$. On the three test functions Ackley5D,
    Hartmann6D and Hartmann4D, our approach outperforms other baselines even in high dimensions.}
    \label{fig:SI-abladim}
\end{figure}

\begin{figure}[H]
    \centering
    \includegraphics[width=.9\linewidth]{figures/uaicost.pdf}
\caption{Ablation study on contextual variable query cost. Design variables have cost 1.
$p(\z) = \mathcal{U}([0,1]^c)$.}
\label{fig:SI-ablacost}
\end{figure}

\begin{figure}[h]
    \centering
    \includegraphics[width=.885\linewidth]{figures/soundstopcriterion.pdf}
    \caption{Distribution of phase switching criterion triggering times for \texttt{SADCBO}
    across $N=100$ different BO trials. We consider the Ackley5D function with an increasingly large
    ratio of relevant contextual variables over design variables,
    and 8 irrelevant contextual variables. $p(\z) = \mathcal{U}([0,1]^c)$. For any variable,
    the associated query cost is 1. As the impact of contextual variables on the output function grows,
    the number of iterations spent in the observational phase grows as well.}
    \label{fig:SI-earlystopping}
\end{figure}

\subsection{Additional details on hyperparameter variations.}\label{sec:hyper}

\begin{figure}[H]
    \centering
    \includegraphics[width=.71\linewidth]{figures/fig_ablahyper_newaistats.pdf}
    \caption{Varying hyperparameters for \texttt{SADCBO}.
    For any variable, the associated query cost is 1. $p(\z) = \mathcal{U}([0,1]^c)$.
    Top: varying $\eta$, the contextual variable inclusion threshold over the cumulative sum
    of sensitivity indices. Middle: varying $\gamma$, the threshold used in the creation of
    the truncated dataset $\D^\gamma$ from \Cref{eq:dgamma}. Bottom: varying $Q$, the size
    of the dataset $\D^Q$ from \Cref{eq:dalpha}. $\eta$ is the most sensitive hyperparameter here.}
    \label{fig:SI-ablahyper}
\end{figure}

We vary the 3 hyperparameters of \texttt{SADCBO}: $\eta \in [0, 1]$, the threshold on
the cumulative sum of sensitivity indices, which in turn regulates how many variables are selected
at every iteration; $\gamma \in [0,1]$, the threshold above which a value is considered high enough to have
its input added to dataset $\D^\gamma$ (\Cref{eq:dgamma}), used for sensitivity analysis;
and $Q$, the size of the dataset $\D^Q$ (\Cref{eq:dalpha}).

Figure~\ref{fig:SI-ablahyper} reports the performance.
Unsurprisingly, $\eta$ stands out as the most sensitive hyperparameter: as its value decreases,
fewer variables are included, at which point not all relevant ones are selected, leading to
reduced performance. Note that in a setting where there are no relevant contextual variables,
lower values of $\eta$ will actually lead to better performance. % This is investigated in \Cref{sec:forward}.
Then, varying $\gamma \in [0,1]$ slightly affects the results: $\gamma$
increasing means that more samples are collected for sensitivity analysis, but these are less
relevant for producing a reliable set of variables accounting for the fluctuations at the optimum. 
Finally, for the examples considered, $Q$ has only a limited effect, close to that of varying
$\gamma$. This might stem from the fact that batched acquisition functions are notoriously difficult
to optimize and may sometimes struggle to enforce diversity.

\section{Experiment details}\label{sec:expdetails}

\subsection{Real-world datasets}\label{sec:rwdetails}

\paragraph{Portfolio optimization dataset.}

This dataset was first introduced by~\citet{NEURIPS2020_e8f27796}. The goal is to tune the hyper-parameters of a trading strategy so as to maximize
return under risk-aversion to random environmental conditions. A software package is used to simulate
and optimize the evolution of a portfolio over a period of four years using open-source market data.
Each evaluation of this simulator returns the average daily return over this period of time under the
given combination of hyper-parameters and environmental conditions. Since the simulator is expensive to evaluate, we do not use it directly but perform pool-based Bayesian Optimization using a pool of 3000 points generated according to a Sobol sampling design. 

The hyper-parameters to be optimized are the risk and trade aversion parameters and the holding cost
multiplier.
These variables constitute the design variables.
The contextual variables are the bid-ask spread and the borrowing cost.

\paragraph{Yacht hydrodynamics dataset.}

This dataset comes from the UCI Machine Learning Repository~\citep{misc_yacht_hydrodynamics_243}.
The optimization problem is to maximize the residuary
resistance per unit weight of displacement of a yacht by controlling its 5-dimensional hull
geometry coefficients. Another optimization variable is the 1-dimensional Froude number. We chose as design variables the first four dimensions of the hull geometry coefficients. The contextual variables are the last hull geometry dimension and the Froude number. As with the portfolio optimization dataset, we only have access to a limited number of samples ($\approx 300$) and thus perform pool-based Bayesian optimization.

\paragraph{Molecular structure optimization.} This case is a computational chemistry challenge. Molecules can adopt different structures that preserve the topology (bonds and bonding types), but have different internal angles. Finding such conformers is a global optimization problem. Here, we are searching for the conformers of alanine --- a molecule with structure C$_3$H$_7$NO$_2$ --- whose energy is calculated at each round of BO with the AMBER force field~\citep{amberarticle,amber}. Alanine provides 33 structural variables to optimize: ten dihedral angles, eleven bond angles, and twelve bond lengths. Conformer search in the full 33-dimensional space is very challenging, but progress has been made with Bayesian optimization recently by reducing the problem to the four most important dihedral angles~\citep{optiamber}. For the example in this work, three of these four dihedral angles were chosen as the design variables (indices 3, 17, and 21 in the dataset, which denote dihedral angles d4, d11, and d13 in AMBER notation; d4 is the bond leading to the amino group, d13 the one leading to the hydroxyl group, and d11 is the bond between these two), the rest of the dihedral and bond angles (18 angles) are chosen as the contextual variables, and the bond lengths are kept fixed to facilitate faster simulations. The search space is selected by utilizing molecule domain knowledge in a conservative manner that allows 10--20 degree variations for the bond angles and is free for the dihedral angles. To outline the alanine optimization results: the structure optimization performed here as a test case is a high-dimensional problem; thus, the \texttt{VBO} method, which tries to optimize all the variables $\x$ and $\z$, converges slowly. For the same reason, the \texttt{MMDBO} and \texttt{Dropout} methods also underperform in terms of convergence for the alanine problem.
Similarly to \method, these two baselines perform variable selection, although using a different selection criterion. However, controlling the selected variables comes at a cost. In contrast, Figure~\ref{fig:SI-suppmain} (fifth row, middle panel) shows that \method virtually never switches to phase II for the Molecular Shape example. Therefore, \method does perform contextual variable selection but does not control the selected variables: it only chooses which of these variables will be included in the surrogate, hence behaving like \texttt{CBO}, but with a variable selection step. This explains why 1) \texttt{CBO} closely follows \method for this example and 2) other variable selection baselines like \texttt{MMDBO} and \texttt{Dropout} end up far from \method.
Interestingly, in this case, the simplified case of optimizing only the design variables $\x$ (\texttt{CUBO}) also performs well. This is because our domain experts made good initial choices on the relevant design variables $\x$ and the search spaces of context variables $\z$. This type of pre-analysis is time-consuming and more challenging for larger molecules. Hence, a future line of work is to test context-aware BO more comprehensively in molecule structure optimization tasks.

\paragraph{Robot pushing task.}

This task was first introduced in~\citet{wangrobot}, and consists of a control parameter tuning problem for robot pushing. This real-world function returns the distance between a designated goal location and two objects being pushed by two robot hands, whose trajectory is determined by 14 parameters specifying the location, rotation, velocity and moving direction, among others. The function is implemented with a physics engine, the Box2D simulator. There are 6 design variables and 8 contextual variables.

\subsection{Synthetic test functions}\label{sec:synthetic}

\paragraph{Hartmann-6D function:}
\begin{align*}
    f(\bv) &= - \sum_{i=1}^4 \alpha_i \exp \left(- \sum_{j=1}^6 A_{ij}\left(v^{(j)} - P_{ij}\right)^2 \right)\\
    \alpha &= (1.0,1.2,3.0,3.2)^T\\
    \mathbf{A}   &= \begin{pmatrix}
    10&3&17&3.5&1.7&8\\
    0.05&10&17&0.1&8&14\\
    3&3.5&1.7&10&17&8\\
    17&8&0.05&10&0.1&14
    \end{pmatrix}\\
    \mathbf{P} &= 10^{-4} \begin{pmatrix}
    1312&1696&5569&124&8283&5886\\
    2329&4135&8307&3736&1004&9991\\
    2348&1451&3522&2883&3047&6650\\
    4047&8828&8732&5743&1091&381
    \end{pmatrix}
\end{align*}

defined over $\V = [0,1]^6$. The second, fifth, and sixth variables were considered as design variables, while the first, third, and fourth variables were considered as contextual variables. 6 noise variables were added.
Table~\ref{tab:sobol} provides the results of a Sobol global sensitivity analysis performed using evaluations of the function collected over a grid of $N=917504$ samples~\citep{sobol2001sa}. 
Adding up the first-order indices for design and contextual variables separately leads to $S_\x \approx 0.124$ and $S_\z \approx 0.196$. This means that, with respect to first-order effects, contextual variables have more impact than design variables in this synthetic example. One should note, however, that these indices are computed across the whole search space and not specifically at the optimum.

\begin{table}[H]
    \centering
    \caption{Sobol global sensitivity analysis for the Hartmann-6D function using $N=917504$ samples.}
\begin{tabular}{lcc}
                \toprule
                Variable & First order sensitivity indices &  Total order sensitivity indices\\
                \midrule
                $z^{(1)}$ & 0.107 & 0.343 \\
                 $x^{(2)}$ & 0.006 & 0.399 \\
                 $z^{(3)}$ & 0.007 & 0.052\\
                 $z^{(4)}$ & 0.082 & 0.379\\
                 $x^{(5)}$ & 0.106 & 0.297\\
                  $x^{(6)}$ & 0.012 & 0.482\\
                \bottomrule
\end{tabular}
\label{tab:sobol}
\end{table}

\paragraph{Hartmann-4D function:}
\begin{align*}
    f(\bv) &= \frac{1}{0.839}\left(1.1 - \sum_{i=1}^4 \alpha_i \exp \left(- \sum_{j=1}^4 A_{ij}\left(v^{(j)} - P_{ij}\right)^2 \right)\right)\\
    \alpha &= (1.0,1.2,3.0,3.2)^T\\
    \mathbf{A}   &= \begin{pmatrix}
    10&3&17&3.5\\
    0.05&10&17&0.1\\
    3&3.5&1.7&10\\
    17&8&0.05&10
    \end{pmatrix}\\
    \mathbf{P} &= 10^{-4} \begin{pmatrix}
    1312&1696&5569&124\\
    2329&4135&8307&3736\\
    2348&1451&3522&2883\\
    4047&8828&8732&5743
    \end{pmatrix}
\end{align*}

defined over $\V = [0,1]^4$. The first and fourth variables were considered as design variables, while the second and third variables were considered as contextual variables. 3 noise variables were added. Table~\ref{tab:sobhart4} provides the results of a Sobol global sensitivity analysis performed using evaluations of the function collected over a grid of $N=300000$ samples.
Adding up the first-order indices for design and contextual variables separately leads to $S_\x \approx 0.579$ and $S_\z \approx 0.091$. This means that, with respect to first-order effects, design variables have much more impact on the output than contextual variables. The gap narrows slightly when considering total-order sensitivity indices. However, it is worth remembering that these indices are computed across the whole search space and not specifically at the optimum.

\begin{table}[h]
    \centering
    \caption{Sobol global sensitivity analysis for the Hartmann-4D function using $N=300000$ samples.}
\begin{tabular}{lcc}
                \toprule
                Variable & First order sensitivity indices &  Total order sensitivity indices\\
                \midrule
                $x^{(1)}$ & 0.307 & 0.477 \\
                 $z^{(2)}$ & 0.037 & 0.279 \\
                 $z^{(3)}$ & 0.054 & 0.103\\
                 $x^{(4)}$ & 0.272 & 0.526\\
                \bottomrule
\end{tabular}
\label{tab:sobhart4}
\end{table}


\paragraph{Ackley 5D function:}
\begin{equation*}
    f(\bv) = -20 \exp \left(-0.2 \sqrt{\frac{1}{5}\sum_{j=1}^5 (v^{(j)})^2}\right) - \exp\left(\frac{1}{5}\sum_{j=1}^5\cos(2\pi v^{(j)}) \right) + 20 + e^1
\end{equation*}

defined over $\V =[-5,5]^5$. 8 noise variables were added.

% \paragraph{Michalewicz 8D function:}
% \begin{equation*}
%     f(\bv) = - \sum_{j=1}^8 \sin(v^{(j)})\sin^{20}\left(\frac{jv^{(j)}}{\pi}\right)
% \end{equation*}

% defined over $\V =[0,\pi]^8$. The first four variables were considered as design variables, while the four last were considered as contextual variables. 6 noise variables were added.

\paragraph{EggHolder 2D function:}
\begin{equation*}
    f(\bv) = - (v^{(2)}+47)\sin\left(\sqrt{\left\lvert v^{(2)} + \frac{v^{(1)}}{2} +47 \right\rvert}\right) - v^{(1)}\sin\left(\sqrt{\lvert v^{(1)} - (v^{(2)} + 47)\rvert }\right)
\end{equation*}

defined over $\V =[-512,512]^2$. The first variable was considered as a design variable, and the second one as a contextual variable. 4 noise variables were added.
A Sobol global sensitivity analysis performed using evaluations of the function collected over a grid of $N = 3 000 000$ samples shows that both variables have a similar contribution to the output (Table~\ref{tab:sobegg}).

\begin{table}[h]
    \centering
    \caption{Sobol global sensitivity analysis for the EggHolder-2D function using $N=3000000$ samples.}
\begin{tabular}{lcc}
                \toprule
                Variable & First order sensitivity indices &  Total order sensitivity indices\\
                \midrule
                $x^{(1)}$ & 0.001 & 0.998 \\
                 $z^{(2)}$ & 0.0004 & 0.999 \\
                \bottomrule
\end{tabular}
\label{tab:sobegg}
\end{table}

\end{document}
