
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
% natbib: citation styles and commands (\citet, \citep, ...).
% Package options go in the optional argument *before* the package name; an
% optional argument placed after it is read as a minimum release date, so the
% original `\usepackage{natbib}[compress]` never passed `compress` to natbib.
% NOTE(review): assumes the document class does not itself load natbib with
% different options (that would cause an option clash) -- confirm.
\usepackage[compress]{natbib} % has a nice set of citation styles and commands
\bibliographystyle{abbrvnat}
\renewcommand{\bibsection}{\subsubsection*{References}\small}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors
\usepackage{graphicx}
\usepackage{xspace}
\usepackage{bm}
\usepackage{soul}
\usepackage{caption}
\usepackage{subcaption}
% \usepackage{todonotes}
\usepackage{wrapfig}
\usepackage{placeins}
\usepackage{enumitem}
\usepackage{footnote}
\usepackage{amsthm}
% \usepackage{thmtools, thm-restate}
\usepackage{xr}
% \usepackage{xcite}
% \externalcitedocument{daulton_446}
\makesavenoteenv{tabular}
\makesavenoteenv{table}
% Theorem-like environments declared in the `plain' theorem style.
% `theorem' is numbered within each section (optional argument [section]).
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]

% manualtheorem: a Theorem whose printed number is supplied by hand, e.g.
%   \begin{manualtheorem}{4.1} ... \end{manualtheorem}
% The mandatory argument replaces the printed counter of the helper
% environment `manualtheoreminner' (\themanualtheoreminner), which is then
% opened and closed via its \manualtheoreminner / \endmanualtheoreminner
% commands.
\newtheorem{manualtheoreminner}{Theorem}
\newenvironment{manualtheorem}[1]{%
  \renewcommand\themanualtheoreminner{#1}%
  \manualtheoreminner
}{\endmanualtheoreminner}
% manuallemma: a Lemma whose printed number is supplied by hand, e.g.
%   \begin{manuallemma}{4.1} ... \end{manuallemma}
% The mandatory argument replaces the printed counter of the helper
% environment `manuallemmainner' (\themanuallemmainner), which is then
% opened and closed via its \manuallemmainner / \endmanuallemmainner commands.
\newtheorem{manuallemmainner}{Lemma}
\newenvironment{manuallemma}[1]{%
  \renewcommand\themanuallemmainner{#1}%
  \manuallemmainner
}{\endmanuallemmainner}
% Remaining theorem-like environments, each numbered within its section.
% These three are declared under the theorem style that is active at this
% point (the \theoremstyle{plain} set earlier in this preamble).
\newtheorem{proposition}{Proposition}[section]
\newtheorem{lemma}{Lemma}[section]
\newtheorem{corollary}{Corollary}[section]
% Environments declared in the `definition' style.
\theoremstyle{definition}
\newtheorem{definition}{Definition}[section]
% NOTE(review): both `assumption' and `Assumption' are declared with the same
% printed title "Assumption"; presumably each gets its own counter, so mixing
% the two could yield duplicate numbers -- confirm both are needed.
\newtheorem{assumption}{Assumption}[section]
\newtheorem{Assumption}{Assumption}[section]
\newtheorem{result}{Result}[section]
% Environments declared in the `remark' style.
\theoremstyle{remark}
\newtheorem{remark}{Remark}[section]
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
% \addFileDependency{<file>}
%   Registers <file> (name including extension) as a dependency of this
%   document:
%     1. writes "(<file>)" to the terminal/log via \typeout -- presumably so
%        that build tools which scan the log for opened files pick it up
%        (TODO confirm which tool relies on this);
%     2. appends <file> to LaTeX's internal file list (\@addtofilelist);
%     3. emits "No file <file>." if the file does not exist.
%   \makeatletter/\makeatother are required because \@addtofilelist contains
%   an `@' in its name.
%   Every line of the body ends in `%' so that the macro expands to no stray
%   space tokens, wherever it is called.
\makeatletter
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}%
  \@addtofilelist{#1}%
  \IfFileExists{#1}{}{\typeout{No file #1.}}%
}
\makeatother

% \myexternaldocument{<basename>}
%   Calls \externaldocument{<basename>} (package xr, loaded above) and
%   registers both <basename>.tex and <basename>.aux through
%   \addFileDependency.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}
% Make the labels of daulton_446 (presumably the main paper -- confirm)
% available to \ref in this document.
\myexternaldocument{daulton_446}


% Upright "arg max" / "arg min" operators with a thin space between the two
% words. Declared with the starred form of \DeclareMathOperator (amsmath), so
% sub/superscripts are set as limits in display style.
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}



% Inline review comments.
% \xxcomment{<color>}{<a>}{<b>}{<text>} prints "[<a>/<b> <text>]" in <color>,
% where <a> and <b> are set in tiny small caps as a stacked super-/subscript.
\newcommand{\xxcomment}[4]{\textcolor{#1}{[$^{\tiny\textsc{#2}}_{\tiny\textsc{#3}}$ #4]}}
% Per-person shortcuts (the two letters are presumably author initials --
% confirm): \mb = cyan "MB", \de = red "DE", \sd = blue "SD".
\newcommand{\mb}[1]{\xxcomment{cyan}{M}{B}{#1}}
\newcommand{\de}[1]{\xxcomment{red}{D}{E}{#1}}
\newcommand{\sd}[1]{\xxcomment{blue}{S}{D}{#1}}
% Name of the proposed algorithm; \xspace supplies the following space, so
% "\ALG can" needs no trailing {}.
\newcommand{\ALG}{MORBO\xspace}
% Small-caps abbreviations. \HV is used for the hypervolume in the proofs
% below; \HVI and \HVC are presumably hypervolume improvement and hypervolume
% contribution -- confirm against the main paper.
\newcommand{\HV}{\textsc{HV}}
\newcommand{\HVI}{\textsc{HVI}}
\newcommand{\HVC}{\textsc{HVC}}
% Method name "qNEHVI-1"; the macro opens its own math mode, so it is meant
% for use in text mode only.
\newcommand{\TSHVI}{$q\textsc{NEHVI-1}$}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b><sep><a>; <sep> defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Multi-Objective Bayesian Optimization over High-Dimensional Search Spaces (Supplementary Material)}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[*,1,2]{\href{mailto:<sdaulton@fb.com>?Subject=Your UAI 2022 paper}{Samuel Daulton}{}}
\author[*,2]{David Eriksson}
\author[2]{Maximillian Balandat}
\author[2]{Eytan Bakshy}
% Add affiliations after the authors
\affil[*]{%
    Equal contribution
}
\affil[1]{%
    University of Oxford\\
    Oxford, UK
}
\affil[2]{%
    Meta\\
    Menlo Park, USA
}

\begin{document}
\onecolumn
\maketitle
\appendix
% \input{daulton_446-supp-input}


\section{Details on Batch Selection}
\FloatBarrier
\label{appdx:batch_selection}
\begin{figure}[ht]
    \centering
    \includegraphics[width=0.33\textwidth]{figures/hvi.pdf}
    \caption{
        A visualization of our batch selection using \HVI{} with $q=4$.
        The red points represent the current PF.
        Blue, orange, and green points show the function values for the $3$ selected points under the next posterior sample.
        To select the $4$th point, the \HVI{} of each candidate is evaluated jointly with the red, blue, orange, and green points.
        % The color indicates the incremental hypervolume.
    }
    \label{fig:hvi}
\end{figure}
As discussed in Section~\ref{sec:background}, over-exploration can be an issue in high-dimensional BO because there is typically high uncertainty on the boundary of the search space.
This is particularly problematic when using continuous optimization routines to find the maximizer of the acquisition function since the global optimum of the acquisition function will often be on the boundary, see~\cite{oh2018bock} for a discussion on the ``boundary issue'' in BO.
While the use of trust regions alleviates this issue, this boundary issue can still be problematic, especially when the trust regions are large.

To mitigate this issue of over-exploration, we use a discrete set of candidates by perturbing randomly sampled Pareto optimal points within a trust region by replacing only a small subset of the dimensions with quasi-random values from a scrambled Sobol sequence.
This is similar to the approach used by~\citet{eriksson2021scalable} which proved crucial for good performance on high-dimensional problems.
In addition, we also decrease the perturbation probability~$p_n$ as the optimization progresses, which \citet{Regis2013} found to improve optimization performance.
The perturbation probability~$p_n$ is set according to the following schedule:
\[ p_n = p_0 \biggl[1 - 0.5\frac{\log n'}{\log b}\biggr], \]
where $n_0$ is the number of initial points, $n_f$ is the total evaluation budget, $p_0 = \min\{\frac{20}{d}, 1\}$, $b = n_f - n_0$, and $n' = \min\{\max\{n - n_0, 1\}, b\}$.

Given a discrete set of candidates, \ALG{} draws samples from the joint posterior over the function values for the candidates in this set and the previously selected candidates in the current batch, and selects the candidate with maximum \HVI{} across the joint samples.
This procedure is repeated to build the entire batch.\footnote{In the case that the candidate point does not satisfy all outcome constraints under the sampled GP function, the acquisition value is set to be the negative constraint violation.}
Using standard Cholesky-based approaches, exact posterior sampling has complexity that is cubic with respect to the number of test points and therefore is only feasible for relatively small discrete sets.
\FloatBarrier
\subsection{RFFs for fast posterior sampling}
\label{appdx:rffs}
While asymptotically faster approximations than exact sampling exist (see \citet{pleiss2020fast} for a comprehensive review), these methods still limit the candidate set to be of modest size (albeit larger), which may not do an adequate job of covering the entire input space.
Among the alternatives to exact posterior sampling, we consider using Random Fourier Features (RFFs) \citep{rahimi_rff}, which provide a deterministic approximation of a GP function sample as a linear combination of Fourier basis functions.
This approach has empirically been shown to perform well with Thompson sampling for multi-objective optimization~\citep{tsemo}.
The RFF samples are cheap to evaluate, which enables using much larger discrete sets of candidates since the joint posterior over the discrete set does not need to be computed.
Furthermore, the RFF samples are differentiable with respect to the new candidate $\bm x$, and \HVI{} is differentiable with respect to $\bm x$ using cached box decompositions~\citep{daulton2021nehvi}, so we can use second-order gradient optimization methods to maximize \HVI{} under the RFF samples.

We tried to optimize these RFF samples using a gradient based optimizer, but found that many parameters ended up on the boundary, which led to over-exploration and poor BO performance.
In an attempt to address this over-exploration issue, we instead consider continuous optimization over axis-aligned subspaces which is a continuous analogue of the discrete perturbation procedure described in the previous section.
Specifically, we generate a discrete set of candidate points by perturbing random subsets of dimensions according to $p_n$, as in the exact sampling case.
Then, we take the top $5$ initial points with the maximum \HVI{} under the RFF sample.
For each of these best initial points we optimize only over the perturbed dimensions using a gradient based optimizer.

Figure~\ref{fig:rff_vs_exact_ts} shows that the RFF approximation with continuous optimization over axis-aligned subspaces works well for $d=10$ on the DTLZ2 function, but the performance degrades as the dimensionality increases.
Thus, the performance of \ALG can likely be improved on low-dimensional problems by using continuous optimization; we used exact sampling on a discrete set for all experiments in the paper for consistency.
We also see that as the dimensionality increases, using RFFs over a discrete set achieves better performance than using continuous optimization.
In high-dimensional search spaces, we find that exact posterior sampling over a discrete set achieves better performance than using RFFs, which we hypothesize is due to the quality of the RFF approximations degrading in higher dimensions.
Indeed, as shown in Figure~\ref{fig:rff_vs_exact_ts}, optimization performance using RFFs improves if we use more basis functions on higher dimensional problems ($4096$ works better than $1024$).


\begin{figure}[h]
    \includegraphics[width=\textwidth]{figures/dtlz2_log_scale.pdf}
    \caption{
        \label{fig:rff_vs_exact_ts} Optimization performance under various Thompson sampling approaches on DTLZ2 test problems with $2$ objectives and various input dimensions $d \in \{10,30,100\}$.
        Disc-Exact uses exact samples from the joint posterior over a discrete set of $4096$ points. Disc-RFF-$1024$ and Disc-RFF-$4096$ evaluate approximate sample paths (RFFs) over a discrete set of $4096$ points with $1024$ and $4096$ basis functions, respectively.
        Cont-RFF-$1024$ and Cont-RFF-$4096$ use L-BFGS-B with exact gradients to optimize RFF draws along a subset of the dimensions (see Appendix~\ref{appdx:rffs} for details) using $1024$ and $4096$ basis functions, respectively.
    }
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Additional details of constraint handling in \ALG}
\label{appdx:constraint_handling}
If there are feasible points, the center is selected as the point with maximum HVC across the feasible Pareto frontier. If there are no feasible points, the center is selected to be the point with minimum total constraint violation (the sum of the constraint violations). A TR’s success counter is incremented if the TR center was feasible and the candidates generated from this TR improved the feasible hypervolume or if the TR center was infeasible and a candidate generated from this TR has lower total constraint violation than the TR center.
% \section{The \ALG algorithm}
% The main steps of \ALG are shown in Algorithm~\ref{algo}.
% \subsection{Additional details of constraint handling in \ALG}
% If there are feasible points, the center is selected as the point with maximum HVC across the feasible Pareto frontier. If there are no feasible points, the center is selected to be the point with minimum total constraint violation (the sum of the constraint violations). A TR’s success counter is incremented if the TR center was feasible and the candidates generated from this TR improved the feasible hypervolume or if the TR center was infeasible and a candidate generated from this TR has lower total constraint violation than the TR center.

% \begin{algorithm*}[!ht]
%     \DontPrintSemicolon
%     \KwIn{Objective functions $f$, Number of trust region $n_\text{TR}$, Initial trust region length $L_\text{init}$, Maximum trust region length $L_{\max}$, Minimum trust region length $L_{\min}$.}
%     \KwOut{Approximate Pareto frontier $\mathcal P_n$}
%     Evaluate an initial set of points and initialize the trust regions $\mathcal T_1, ..., \mathcal T_{n_\text{TR}}$ using the center selection procedure described in Section~\ref{sec:center_selection}. \\
%     Mark center points as unavailable for other trust regions. \\
%     $X_0 \leftarrow \emptyset{}, Y_0 = \emptyset{}, t \leftarrow 1$\\
%     \While{budget not exhausted}{
%         Fit a local model within each trust region. \\
%         Select $q$ candidates using the sequential greedy \HVI{} procedure described in Section~\ref{sec:batch_selection}.\\
%         Evaluate candidates on the true objective functions and obtain new observations.\\
%         \For{$j=1,..., n_\text{TR}$}{
%             Update trust regions with new observations as described in Section \ref{sec:hdbpomo}.\\
%             Increment success/failure counters as described in Section \ref{sec:hdbpomo} for observations from $T_j$.\\
%             Update edgelength $L_j$ for $\mathcal T_j$.\\
%             \If{$L_j < L_\text{min}$}{
%                 Terminate $\mathcal T_j$.\\
%                 % Terminate $\mathcal T_j$ and reinitialize $\mathcal T_j$ with edgelength $L_\text{init}$ and $N_\text{init}$ new random points.
%                 Fit GP to restart points $\mathcal D_{t-1} = (X_{t-1},Y_{t-1})$: $\bm f_{t-1} \sim P(\bm f | \mathcal D_{t-1})$.\\
%                 Sample $\bm\lambda\sim S_+^{M-1}$ and $\tilde{\bm f}_{t-1} \sim P(\bm f | \mathcal D_{t-1})$, where $S_+^{M-1} = \{\bm w\in \mathbb R_+^M : ||\bm w||_2 = 1\}$.\\
%                 Select $\bm x_t = \argmax_{\bm x \in \mathcal X} s_{\bm\lambda}[\tilde{\bm f}_{t-1}(\bm x)]$, where $s_{\bm\lambda}[\bm y] = \min_m (\max(\frac{y_m}{\lambda_m}, 0))^M$ and $\cdot_i$ denotes the $i^\text{th}$ component.\\
%                 Evaluate $\bm x_t$ on the true objective functions and obtain new observation $\bm y_t$.\\
%                 Reinitialize $\mathcal T_j$ with edgelength $L_\text{init}$ centered at the $\bm x_t$.\\
%                 Set $X_t \leftarrow X_{t-1} \cup \{\bm x_t\}, Y_t \leftarrow Y_{t-1} \cup \{\bm y_t\}$, $t \leftarrow t+1$.

%             }
%             Update center to the available point with maximum \HVC{} (globally if $\mathcal T_j$ was terminated otherwise within $\mathcal T_j$).
%         }
%     }
%     \Return{Approximate PF across observed function values}.
%     \caption{Summary of \ALG}
% \label{algo}
% \end{algorithm*}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Proofs}
\label{appdx:proofs}
\begin{manuallemma}{4.1}
Let $\bm f \in [0, B]^M$, and assume that \ALG{} only considers a newly evaluated sample to be an improvement (for updating the corresponding TR's success and failure counters) if it increases the HV by at least $\delta \in \mathbb R^+$ and assume that success counter threshold $\tau_\text{succ} = \infty$.\footnote{As stated in Appendix \ref{appdx:experimentdetails}, we use $\tau_\text{succ} = \infty$ in all of our experiments.} Then each TR will only evaluate a finite number of samples.
\end{manuallemma}
\begin{proof}
First, note that the hypervolume of the true Pareto frontier $\mathcal P^*$ is bounded. Without loss of generality, let the reference point be $\bm r = \bm 0$; then $\HV(\mathcal P^*) \leq B^M$. Suppose that a trust region evaluates an infinite number of samples. Then, the trust region has not had $1+\log_2 L_\text{init}-\log_2L_\text{min}$ streaks of $\tau_\text{fail}$ consecutive failures. Hence, the trust region has increased the hypervolume of the Pareto frontier over the previously evaluated designs by at least $\delta$ infinitely many times. Hence, the hypervolume over the previously evaluated designs is infinite, which contradicts the bound $\HV(\mathcal P^*) \leq B^M$.
\end{proof}

\begin{manualtheorem}{4.1}
Let $\bm f \in [0,B]^M$ for $B>0$ and let each component $f^{(m)}$ for $m=1, ..., M$ follow a Gaussian distribution with marginal variances $\sigma \leq 1$ and independent observation noise $\epsilon_m \sim \mathcal N(0, \sigma_m^2)$ such that $\sigma_m^2 \leq \sigma^2 \leq 1$. Let $\mathcal P_t$ denote the Pareto frontier over $\bm f(X_t)$, where $X_t$ is the set of TR re-initialization points after $t$ TRs have been restarted.
%Assume that \ALG{} only considers a newly evaluated sample to be an improvement (for updating the corresponding TR's success and failure counters) if it increases the hypervolume by at least $\delta \in \mathbb R^+$ and assume that success counter threshold $\tau_\text{succ} = \infty$.
Suppose further that the conditions of Lemma~\ref{lemma:restarts} hold.
Then, the cumulative hypervolume regret $R_T$ of \ALG after $T$ restarts is bounded by:
% $$R_T \leq M^2(\sqrt{2e\pi}B/2)^Md^{\frac{1}{2}}[\gamma_T T\ln(T)]^{\frac{1}{2}}.$$
\[ R_T \leq M^2(\sqrt{2e\pi}B/2)^M \sqrt{d\gamma_T T\ln(T)}. \]
% In addition,
% $\HV(\mathcal P_t) \geq \HV{}(\mathcal P^*) - \varepsilon_T$ where $\varepsilon_T = O\big(M^2(\sqrt{2e\pi}B/2)^Md^{\frac{1}{2}}\big[\gamma_T\frac{\ln(T)}{T}\big]^{\frac{1}{2}}\big)$.
\end{manualtheorem}
\begin{proof}
From Lemma~\ref{lemma:restarts}, we have that each trust region will only evaluate a finite number of samples. Hence, as the number of evaluations goes to infinity, \ALG{} will terminate and select new initial center points for trust regions an infinite number of times. Our regret bound is in terms of the number of restart points.

Our proof follows that of \citet[Theorem 8]{zhang2020random}, but the final form of our bound holds for arbitrary $B$. Note that lines 13--19 in Algorithm~\ref{algo} correspond to \citet[Algorithm 1]{paria2020flexible} using Thompson sampling, where the only evaluations are the $t-1$ restart points. From \citet[Theorem 1]{paria2020flexible}, the scalarized Bayes regret of \citet[Algorithm 1]{paria2020flexible} using $L$-Lipschitz scalarizations is $O\big(LMd^{\frac{1}{2}}[\gamma_T T\ln(T)]^{\frac{1}{2}}\big)$. Since a hypervolume scalarization $s_{\bm\lambda}[\bm y]$ is $\mathcal O(B^MM^{1+M/2})$-Lipschitz \citep[Lemma 6]{zhang2020random}, we have that $L\leq B^MM^{1+M/2}$. From \citet[Proof of Theorem 8]{zhang2020random}, the hypervolume regret can be expressed by scaling the scalarized Bayes regret by a constant $c_M=\frac{\pi^\frac{M}{2}}{2^M \Gamma(\frac{M}{2}+1)}$ that depends on the number of objectives. Hence, we can bound the hypervolume regret as:
\begin{align*}
    R_T = \sum_{t=1}^T \bigl[\HV(\mathcal P^*) - \HV(\mathcal P_t)\bigr] &\leq c_M LMd^{\frac{1}{2}}[\gamma_T T\ln(T)]^{\frac{1}{2}}.
\end{align*}
Note that
\begin{align*}
    c_ML &\leq B^M M^{1+M/2}\frac{\pi^\frac{M}{2}}{2^M \Gamma(\frac{M}{2}+1)}.
\end{align*}
From \citet[Theorem 1]{Li2007}, $\Gamma(x) > \frac{x^{x-\gamma}}{e^{x-1}}$, where $\gamma \approx 0.577$ is the Euler-Mascheroni constant. So,
\[
\Gamma(M/2+1)
> \frac{(M/2+1)^{(M/2+1-\gamma)}}{e^{(M/2)}}
> \frac{M^{(M/2)}}{(2e)^{(M/2)}}.
\]
Hence,
\[\frac{1}{\Gamma(\frac{M}{2}+1)} < \frac{(2e)^{(M/2)}}{M^{(M/2)}}.\]
So,
\begin{align*}
    c_ML &\leq B^M M^{1+M/2}\frac{\pi^\frac{M}{2}}{2^M \Gamma(\frac{M}{2}+1)}\\
    &\leq B^M M\frac{(2e\pi)^\frac{M}{2}}{2^M }\\
    &\leq M\big(\sqrt{2e\pi}B/2\big)^M.
\end{align*}
So the cumulative regret bound is
\begin{align*}
    R_T &\leq c_M LMd^{\frac{1}{2}}[\gamma_T T\ln(T)]^{\frac{1}{2}}\\
    &\leq  M^2(\sqrt{2e\pi}B/2)^Md^{\frac{1}{2}}[\gamma_T T\ln(T)]^{\frac{1}{2}}.
\end{align*}
\end{proof}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Details on Experiments}
\label{appdx:experimentdetails}

\subsection{Algorithmic details}
For \ALG{}, we use $5$ trust regions, which we observed was a robust choice in Figure~\ref{fig:ablation_study}.
Following~\citet{eriksson2019turbo}, we set $L_{\text{init}}=0.8$, $L_{\max} = 1.6$, and use a minimum length of $L_{\min} = 0.01$.
We use $4096$ discrete points for optimizing \HVI{} for the vehicle safety and welded beam problems, $2048$ discrete points on the trajectory planning and optical design problems, and $512$ discrete points on the Mazda problem.
Note that while the number of discrete points should ideally be chosen as large as possible, it offers a way to control the computational overhead of \ALG; we used a smaller value for the Mazda problem due to the fact that we need to sample from a total of $56$ GP models in each trust region as there are $54$ black-box constraints.
We use an independent GP with a constant mean function and a Mat\'ern-$5/2$ kernel with automatic relevance determination (ARD) and fit the GP hyperparameters by maximizing the marginal log-likelihood (the same model is used for all BO baselines).

When fitting a model for \ALG, we include the data within a hypercube around the trust region center with edgelength $2L$.
In the case that there are fewer than $N_m := \min\{250, 2d\}$ points within that region, we include the $N_m$ closest points to the trust region center for model fitting.
The success streak tolerance is set to be infinity, which prevents the trust region from expanding; we find this leads to good optimization performance when data is shared across trust regions.
For $q$NEHVI and $q$ParEGO, we use $128$ quasi-MC samples and for TS-TCH, we optimize RFFs with $500$ Fourier basis functions.
All three methods are optimized using L-BFGS-B with $20$ random restarts.
For DGEMO, TSEMO, and MOEA/D-EGO, we use the default settings in the open-source implementation at \url{https://github.com/yunshengtian/DGEMO/tree/master}.
Similarly, we use the default settings for NSGA-II in the Platypus package (\url{https://github.com/Project-Platypus/Platypus}).
We encode the reference point as a black-box constraint to provide this information to NSGA-II.

\subsubsection{LaMOO in High-Dimensional Search Spaces}
\label{appdx:lamoo}
For LaMOO methods, we leverage the implementation of LaMOO available at \url{https://drive.google.com/drive/folders/1CMdg5iBdbKe3nkboIjiS998rnBEV09EB?usp=sharing}. We set the exploration parameter $C_p$ dynamically using the heuristic proposed by \citet{zhao2021multiobjective} to be 10\% of the hypervolume of the current Pareto frontier over the previously evaluated designs. We follow \citet{zhao2021multiobjective} and set the minimum leaf sample size to be $10$.

\citet{zhao2021multiobjective} propose to use $q$EHVI with LaMOO, but we opt to use $q$NEHVI instead since it is capable of scaling to the batch size of $q=50$ used in many of our experiments. We refer to this method as LaMOO-$q$NEHVI.
We note that $q$NEHVI is mathematically equivalent to $q$EHVI on noiseless problems.
The authors propose using rejection sampling to ensure samples come from the ``good'' region.
For high-dimensional search spaces, the acceptance probability is low for uniform random samples from the global design space, and therefore, rejection sampling is prohibitively slow.
Rejection sampling is used 1) to select starting points for multi-start L-BFGS-B and 2) within the L-BFGS-B routine to enforce that samples are within the ``good'' region.
We contacted the authors about computational issues with this approach, and the authors recommended to use rejection sampling for selecting starting points, and then to simply run L-BFGS-B from these ``good'' starting points across the global search space.
With this approach, the resulting candidates may not be (and often are not) within the ``good'' region, and LaMOO-$q$NEHVI is simply an initialization heuristic for optimizing $q$NEHVI, but this approach does speed up candidate generation quite a bit.
Nevertheless, even using rejection sampling to generate starting points for L-BFGS-B can be (and is on our problems) prohibitively expensive in high-dimensional search spaces. Hence, we limit the rejection sampling by only considering $120$,$000$ design points before beginning L-BFGS-B with the most promising designs (whether or not they are in the ``good'' region).
This makes LaMOO-$q$NEHVI feasible to run on our high-dimensional problems.

For LaMOO-CMA-ES, we use $q=5$ rather than $q=1$ on vehicle safety, as $q=1$ is not supported.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Synthetic problems}
\label{appdx:experimentdetails:synthetic}
The reference points for all problems are given in Table \ref{table:ref_points}. We multiply the objectives (and reference points) for all synthetic problems by $-1$ and maximize the resulting objectives.

\begin{table*}[h]
    \centering
    \begin{small}
    \begin{sc}
    \begin{tabular}{lc}
        \toprule
        Problem & Reference Point\\
        \midrule
        DTLZ2 & [$6$, $6$]\\
        DTLZ3 & $[10^3]^M$\\
        DTLZ5 & $[10]^M$\\
        DTLZ7 & $[15]^M$\\
        Vehicle Safety & [$1698.55$, $11.21$, $0.29$]\\
        Welded Beam & [$40$, $0.015$]\\
        MW7 & [$1.2$, $1.2$] \\
        \bottomrule
    \end{tabular}
    \end{sc}
    \end{small}
    \caption{
    \label{table:ref_points} The reference points for each synthetic benchmark problem.
    }
\end{table*}

\paragraph{DTLZ:} We consider the $2$-objective DTLZ2 problem with various input dimensions $d \in \{10, 30, 100\}$. We also use $2$-objective and $4$-objective variants of DTLZ3, DTLZ5, and DTLZ7 with $d=100$.
The DTLZ problems are standard test problems from the multi-objective optimization literature.
Mathematical formulas for the objectives in each problem are given in \citet{dtlz}.
\paragraph{MW7:} For a second test problem from the multi-objective optimization literature, we consider an MW7 problem with $2$ objectives, $2$ constraints, and $d=10$ parameters.
See \citet{mw_test_problems} for details.
\paragraph{Welded Beam:} The welded beam problem \citep{welded_beam} is a structural design problem with $d=4$ input parameters controlling the size of the beam where the goal is to minimize $2$ objectives (cost and end deflection) subject to $4$ constraints.
More details are given in \citet{tanabe2020}.
\paragraph{Vehicle Safety:} The vehicle safety problem is a $3$-objective problem with $d=5$ parameters controlling the widths of different components of the vehicle's frame.
The goal is to minimize mass (which is correlated with fuel economy), toe-box intrusion (vehicle damage), and acceleration in a full-frontal collision (passenger injury).
See \citet{tanabe2020} for additional details.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Trajectory planning}
For the trajectory planning problem, we consider a trajectory specified by $30$ design points that starts at the pre-specified starting location.
Given the $30$ design points, we fit a B-spline with interpolation and integrate over this B-spline to compute the final reward using the same domain as in~\citet{wang2018batched}.
Rather than directly optimizing the locations of the design points, we optimize the difference (step) between two consecutive design points, each one constrained to be in the domain $[0, 0.05] \times [0, 0.05]$.
We use a reference point of [$0$, $0.5$], which means that we want a reward larger than $0$ and a distance that is no more than $0.5$ from the target location [$0.95$, $0.95$].
Since we maximize both objectives, we multiply the distance metric and the corresponding reference point value by $-1$.

\subsection{Optical design}
In order to obtain precise estimates of the optimization performance at reasonable computational cost, we conduct our evaluation on a neural network surrogate model of the optical system rather than on the actual physics simulator.
The surrogate model was constructed from a dataset of $101$,$000$ optical designs and resulting display images to provide an accurate representation of the real problem.
The surrogate model is a neural network with a convolutional autoencoder architecture.
The model was trained using $80$,$000$ training examples and minimizing MSE (averaged over images, pixels, and RGB color channels) on a validation set of $20$,$000$ examples.
A total of $1$,$000$ examples were held-out for final evaluation.

\subsection{Mazda vehicle design problem}
We follow the suggestions by \citet{kohira2018proposal} and use the reference point $[1.1, 0]$ and optimize the normalized objectives $\tilde{f}_1 = f_1  - 2$ and $\tilde{f}_2 = f_2 / 74$ corresponding to the total mass and number of common gauge parts, respectively.
Additionally, an initial feasible point is provided with objective values $f_1 = 3.003$ and $f_2 = 35$, corresponding to an initial hypervolume of $\approx 0.046$ for the normalized objectives.
This initial solution is given to all algorithms.
We limit the number of points used for model fitting to only include the $2$,$000$ points closest to the trust region center in case there are more than $2$,$000$ in the larger hypercube with side length $2L$.
Still, for each iteration \ALG using $5$ trust regions fits a total of $56 \times 5$ GP models, a scale far out of reach for any other multi-objective BO method.

\section{Complexity Improvements from Local Modeling}
\label{appdx:complexity_local}
\FloatBarrier
The differences in model fitting time can be even more profound.
To illustrate this, consider a situation in which a total of $N$ data points have been collected by $n_\text{TR}$ trust regions. Suppose for simplicity that each TR has the same number of observations (under some abuse of nomenclature we use TR to refer to the modeling domain of a TR in this section). Let $\eta$ denote the average number of trust regions that a data point is part of. Then the number of points in each TR is $\eta N/n_\text{TR}$.
Assuming cubic time complexity for model fitting (i.e. $O(N^3)$ if we used a single global model), the total time complexity of fitting all $n_\text{TR}$ models in the individual TRs is $O\bigl(n_\text{TR} (\eta N/n_\text{TR})^3\bigr) = O\bigl(\eta^3N^3/n_\text{TR}^2\bigr)$.
This will lead to asymptotic speedups of order $O\bigl(n_\text{TR}^2 / \eta^3\bigr)$ when using local modeling.
% (this assumes all points are modeled, in the case where only part of the $N$ collected data points are used for modeling the speedup can be much larger). Typically, as the dimension of the parameter space and the number of points $N$ grows, we will use more TRs (large $n_\text{TR}$) and also see smaller overlap (small $o$), and thus the speedup relative to fitting a single global model can be multiple orders of magnitude.}
Typically, as the optimization progresses and the trust regions shrink, $\eta$ becomes quite small (e.g. $\eta < 1$)\footnote{When $\eta$ is close to the number of trust regions, the ``local'' models will fit to nearly all observations, and hence, the models will essentially be global models. The value of $\eta$ at the start of the optimization depends on the initial trust region edge length and the dimension of the search space. }. We validate this claim empirically in the lower right subplot in Figure~\ref{fig:tr_traces}, which shows that $\eta$ becomes less than $1$ on all problems considered as the optimization progresses.
In Figure~\ref{fig:tr_traces} we illustrate some additional information from the trust regions to better understand the role of data-sharing and local modeling in \ALG.
\begin{figure*}[!ht]
    \centering
    \includegraphics[width=0.8\textwidth]{figures/tr_traces_new.pdf}
    \caption{
         For the optical design, trajectory planning, and DTLZ2 problems, we show the average across replications as a solid line and traces from the first replication as transparent lines. (Upper Left) The number of points in each trust region. Trust regions usually have a few hundred points on average, which results in computationally efficient local modeling. (Upper Right) The number of points in a trust region that were collected by that trust region. This shows that a large fraction of data within a trust region was actually collected by another trust region. (Lower Left) The trust region length. As the optimization proceeds, the trust regions shrink to focus on specific parts of the search space. (Lower Right) The average number of TRs that contain a given design, $\eta \in [0,n_\text{TR}]$. This shows that as the optimization progresses and the TRs shrink, on average fewer than $1$ TR contains a given design. This is empirical validation of the claim in Appendix~\ref{appdx:complexity_local} that $\eta$ typically becomes small as the optimization progresses and, therefore, the complexity improvements are substantial.
    }
    \label{fig:tr_traces}
\end{figure*}%
Thus, the speedup relative to fitting a single global model can be multiple orders of magnitude.
\subsection{Model fitting times}
Empirically, we verify this speedup in Figure~\ref{fig:local_vs_global}.
\begin{figure}[ht!]
    \centering
    \includegraphics[width=0.4\textwidth]{figures/local_vs_global.pdf}
    \caption{
        Model fitting time for MORBO with local modeling compared to MORBO with one global model on the $146$-dimensional optical design problem.
        Fitting a global model takes almost~$20$ minutes towards the end of the optimization run compared to~$10$ seconds for \ALG.
        \label{fig:local_vs_global}
    }
\end{figure}%
This can also be seen in the results in Tables~\ref{table:fit_walltimes} and \ref{table:fit_walltimes_dtlz_m4}.
While candidate generation is fast for TSEMO, the model fitting causes a significant overhead with almost an hour being spent on model fitting after collecting {$2$,$000$} evaluations on the trajectory planning problem.
This is significantly longer than for \ALG, which requires far less time for the model fitting due to the use of local modeling.
This shows that the use of local modeling is a crucial component of \ALG that limits the computational overhead from the model fitting.
The model fitting for \ALG on the optical design problem takes less than $25$ seconds, while methods such as DGEMO and TSEMO that rely on global modeling require far more time for model fitting after only collecting {$1$,$200$} points.
Additionally, while \ALG needs to fit as many as $56 \times 5 = 280$ GP models on the Mazda problem due to the $2$ objectives, the $54$ black-box constraints, and the use of $5$ trust regions, the total time for model fitting is still less than $3$ minutes, while this problem is completely out of reach for the other BO methods that rely on global modeling.
\FloatBarrier
\begin{table*}[!ht]
    \centering
    \begin{small}
    \begin{sc}
    \begin{adjustbox}{max width=\textwidth}
    \begin{tabular}{l|ccc|ccc}
        \toprule
        Problem & DTLZ3 $(M=2)$ & DTLZ5 $(M=2)$ & DTLZ7 $(M=2)$ & DTLZ3 $(M=4)$ & DTLZ5 $(M=4)$ & DTLZ7 $(M=4)$\\
        \midrule
        \ALG{} & 11.0 (0.6) & 9.7 (0.4) & 10.6 (0.4) & 11.5 (0.9) & 10.5 (0.5) & 10.6 (0.4)\\
        NSGA-II & 0.0 (0.0) & 0.0 (0.0) & 0.0 (0.0) & 0.0 (0.0) & 0.0 (0.0) & 0.0 (0.0)\\
        $q$ParEGO & 139.5 (24.6) & 49.1 (2.2) & 26.0 (2.5) & 137.2 (15.4) & 113.2 (6.6) & 49.0 (3.5)\\
        TS-TCH & 64.5 (3.4)& 93.9 (5.8)& 89.6 (3.5) & 143.3 (5.9)& 167.6 (8.8) & 141.3 (6.1) \\
        $q$NEHVI & 133.2 (23.9) & 48.9 (4.9) & 20.8 (1.7) & 25.9 (2.3) & 19.8 (1.7) & 6.8 (0.4)\\
        DGEMO & 5425.1 (142.0) & 1438.0 (29.0) & 180.0 (35.3) & N/A & N/A & N/A \\
        TSEMO & 4246.3 (91.8) & 2481.5 (48.5) & 958.4 (49.1) & 3767.4 (91.0) & 1892.3 (801.5) & 402.0 (31.7)\\
        MOEA/D-EGO & 3474.6 (108.6) & 1824.0 (40.1) & 1130.3 (16.0) & 4206.1 (120.5) & 2526.3 (77.5) & 1048.0 (37.8)\\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \end{sc}
    \end{small}
    \caption{\label{table:fit_walltimes_dtlz_m4} Model fitting wall time in seconds. The mean and two standard errors of the mean are reported.
    All models were fit on 2x Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz. For $M=4$, $q$NEHVI exceeded GPU memory during acquisition optimization and therefore has shorter average model fitting times.}
\end{table*}
\begin{table*}[!ht]
    \centering
    \begin{small}
    \begin{sc}
    \begin{tabular}{l|ccccc}
        \toprule
         Problem & Welded Beam & Vehicle Safety  & Rover& Optical Design & Mazda\\
        \midrule
        \ALG{} & 7.81 (0.02) & 12.58 (0.26) & 9.3 (0.19) & 23.57 (0.36) & 172.53 (1.89)\\
        $q$ParEGO& 0.5 (0.1) & 0.1 (0.0) & 51.6 (16.4) & 46.7 (10.7) & N/A \\
        TS-TCH & 0.5 (0.0) & 0.2 (0.0) & 45.9 (1.8) & 40.5 (4.9) & N/A \\
        $q$NEHVI& 0.5 (0.0) & 0.1 (0.0) & 97.8 (16.3) & 46.4 (3.2) & N/A \\
        DGEMO & N/A & N/A & 809.7 (127.6) & 1109.3 (178.7) & N/A \\
        TSEMO & N/A & 1.0 (0.1) & 305.3 (38.2) & 859.4 (131.4) & N/A \\
        MOEA/D-EGO & N/A & 0.9 (0.0)&373.2 (51.7) & 736.4 (110.4) & N/A \\
        \bottomrule
    \end{tabular}
    \end{sc}
    \end{small}
    \caption{\label{table:fit_walltimes} Model fitting wall time in seconds. The mean and two standard errors of the mean are reported.
    All models were fit on 2x Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz. For DGEMO, TSEMO and MOEA/D-EGO only {$1$,$450$} evaluations were performed on Rover (Trajectory Planning) and only {$1$,$250$} evaluations were performed on Optical Design, so the fitting times are shorter than if the full {$2$,$000$} evaluations had been performed.}
\end{table*}
\FloatBarrier


\section{Additional Results}
\label{appdx:additional_results}


\subsection{Low-dimensional problems}
\label{appdx:baby_problems}
\FloatBarrier
We consider two low-dimensional problems to allow for a comparison with existing BO baselines.
The first problem we consider is a vehicle safety design problem ($d=5$) in which we tune thicknesses of various components of an automobile frame to optimize proxy metrics for maximizing fuel efficiency, minimizing passenger trauma in a full-frontal collision, and maximizing vehicle durability.
The second problem is a welded beam design problem ($d=4$), where the goal is to minimize the cost of the beam and the deflection of the beam under the applied load~\citep{deb2006reference}.
The design variables are the thickness and length of the welds and the height and width of the beam.
In addition, there are $4$ black-box constraints that must be satisfied.

Figure~\ref{fig:baby_problems} presents results for both problems.
While \ALG is not designed for such simple, low-dimensional problems, it is still competitive with other baselines such as TS-TCH and $q$ParEGO on the vehicle design problem, though it cannot quite match the performance of $q$NEHVI and TSEMO.\footnote{DGEMO is not included on this problem as it consistently crashed due to an error deep in the low-level code for the graph-cutting algorithm.}
The results on the welded beam problem illustrate the efficient constraint handling of \ALG.\footnote{DGEMO, TSEMO, MOEA/D-EGO, and TS-TCH are excluded as they do not consider black-box constraints.}
On both problems, we observe that NSGA-II struggles to keep up, performing barely better (vehicle safety) or even worse (welded beam) than quasi-random Sobol exploration.
\begin{figure}
    \centering
        \includegraphics[width=0.8\textwidth]{figures/small_problems_new_2.pdf}
        \caption{(Left) $q$NEHVI performs the best on the vehicle design problem ($d=5$) with $3$ objectives. (Right) \ALG outperforms the other methods on the welded beam problem ($d=4$) with $4$ constraints.}
        \label{fig:baby_problems}
\end{figure}


% \begin{figure*}[!ht]
%     \centering
%     \includegraphics[width=0.6\textwidth]{figures/small_problems_new_2.pdf}
%     \caption{(Left) $q$NEHVI performs the best on the  vehicle design problem ($d=5$) with $3$ objectives. (Right) \ALG outperforms the other methods on  welded beam problem ($d=4$) with $4$ constraints.}
%     \label{fig:baby_problems}
% \end{figure*}
\FloatBarrier
\subsection{Candidate Generation Wall Time}
\label{appdx:wall_time_comparisons}
\FloatBarrier
\begin{table*}[!ht]
    \centering
    \begin{small}
    \begin{sc}
    % \resizebox{\textwidth}{!}{
    \begin{tabular}{lccccc}
        \toprule
         Problem & Welded Beam & Vehicle Safety  & Rover& Optical Design & Mazda\\
         Batch Size &($q=1$) &($q=1$) & ($q=50$)& ($q=50$) &($q=50$)\\
        \midrule
        \ALG{} & 1.3 (0.0) & 9.6 (0.7) &23.4 (0.4) & 9.8 (0.1)& 188.16 (1.72)\\
        $q$ParEGO &14.5 (0.3)&1.3 (0.0)&213.4 (11.2)&241.9 (14.9)& N/A  \\
        TS-TCH & N/A &0.6 (0.0)&31.3 (1.1)&48.1 (1.2)&N/A \\
        $q$NEHVI &30.4 (0.4)&9.1 (0.1)&997.5 (62.8)&211.27 (6.66)&N/A \\
        NSGA-II &0.0 (0.0)&0.0 (0.0) & 0.0 (0.0) & 0.0 (0.0) & 0.0 (0.0)\\
        DGEMO & N/A & N/A & 697.1 (52.5)&2278.7 (199.8)& N/A \\
        TSEMO & N/A &3.4 (0.1)&3.3 (0.0)& 4.6 (0.1)& N/A \\
        MOEA/D-EGO & N/A & 44.3 (0.3)&71.1 (4.3)&97.5 (6.7)& N/A \\
        LaMOO-CMAES & N/A & 0.6 (0.0) & 2.6 (0.0) & 51.9 (0.3) & N/A \\
        LaMOO-$q$NEHVI & N/A & 24.0 (2.3) & 292.4 (25.2) & 258.8 (1.9) & N/A \\
        \bottomrule
    \end{tabular}
    % }
    \end{sc}
    \end{small}
    \caption{\label{table:walltimes} Batch selection wall time (excluding model fitting) in seconds. The mean and two standard errors of the mean are reported.
    \ALG, $q$ParEGO, TS-TCH, and $q$NEHVI were run on a Tesla V100 SXM2 GPU (16GB RAM), while DGEMO, TSEMO, MOEA/D-EGO and NSGA-II were run on 2x Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz. For Welded Beam and Vehicle Safety, we ran NSGA-II with $q=5$ in order to avoid a singleton population. For DGEMO, TSEMO and MOEA/D-EGO only {$1$,$450$} evaluations were performed on Rover (Trajectory Planning) and only {$1$,$250$} evaluations were performed on Optical Design, so the generation times are shorter than if the full {$2$,$000$} evaluations had been performed.}
\end{table*}

\begin{table*}[!ht]
    \centering
    \begin{small}
    \begin{sc}
    \begin{adjustbox}{max width=\textwidth}
    \begin{tabular}{l|ccc|ccc}
        \toprule
        Problem & DTLZ3 $(M=2)$ & DTLZ5 $(M=2)$ & DTLZ7 $(M=2)$ & DTLZ3 $(M=4)$ & DTLZ5 $(M=4)$ & DTLZ7 $(M=4)$\\
        Batch Size &($q=50$) &($q=50$) & ($q=50$) &($q=50$) &($q=50$) & ($q=50$)\\
        \midrule
        \ALG & 26.0 (1.3) & 25.1 (0.9) & 293.0 (21.9) & 976.9 (89.8) & 973.0 (91.8) & 293.0 (21.9)\\
        $q$ParEGO & 315.8 (20.2) & 299.0 (27.2) & 233.0 (21.5) & 372.9 (46.6) & 373.1 (34.6) & 232.4 (22.2)\\
        TS-TCH & 43.6 (1.4) & 49.6 (2.0) & 39.5 (1.9) & 56.5 (1.8) & 69.2 (7.5) & 51.4 (3.4)\\
        $q$NEHVI & 2877.7 (321.3) & 1879.6 (285.4) & 816.9 (49.1) & 4412.9 (600.7) & 3778.2 (266.5) & 57.6 (4.4)\\
        NSGA-II & 0.0 (0.0) & 0.0 (0.0) & 0.0 (0.0) &0.1 (0.0) & 0.0 (0.0) & 0.0 (0.0)\\
        DGEMO & N/A & N/A & N/A & N/A & N/A & N/A \\
        TSEMO & 6.3 (0.1) & 7.2 (0.1) & 6.8 (0.1) & 2878.1 (162.0) & 952.0 (298.1) & 22.2 (3.7)\\
        MOEA/D-EGO & 277.8 (1.2) & 224.9 (3.2) & 245.3 (2.9) & 308.7 (2.9) & 303.7 (3.1) & 292.2 (3.5)\\
        \bottomrule
    \end{tabular}
    \end{adjustbox}
    \end{sc}
    \end{small}
    \caption{\label{table:walltimes_dtlz_m4} Batch selection wall time (excluding model fitting) in seconds on DTLZ problems with 2 and 4 objectives with $d=100$. The mean and two standard errors of the mean are reported.}
\end{table*}



While candidate generation time is often a secondary concern in classic BO applications, where evaluating the black box function often takes orders of magnitude longer, existing methods using a single global model and standard acquisition function optimization approaches can become the bottleneck in high-throughput asynchronous evaluation settings that are common with high-dimensional problems.
Tables~\ref{table:walltimes} and \ref{table:walltimes_dtlz_m4} provide a comparison of the wall time for generating a batch of candidates for the different methods on the different benchmark problems.
We observe that the candidate generation for \ALG is two orders of magnitude faster than for other methods such as $q$ParEGO and $q$NEHVI on the trajectory planning problem where all methods ran for the full {$2$,$000$} evaluations.

\FloatBarrier



\subsection{Pareto Frontiers}
\label{appdx:pareto_frontiers}
\FloatBarrier
We show the Pareto frontiers for the welded beam, trajectory planning, optical design, and Mazda problems in Figure~\ref{fig:all_pfs}.
In each column we show the Pareto frontiers corresponding to the worst, median, and best replications according to the final hypervolume.
We exclude the vehicle design problem as it has three objectives which makes the final Pareto frontiers challenging to visualize.

Figure~\ref{fig:all_pfs} shows that even on the low-dimensional $4$D welded beam problem, \ALG is able to achieve much better coverage than the baseline methods.
\ALG also explores the trade-offs better than other methods on the trajectory planning problem, where the best run by \ALG found trajectories with high reward that ended up being close to the final target location.
In particular, other methods struggle to identify trajectories with large rewards while \ALG consistently finds trajectories with rewards close to $5$, which is the maximum possible reward.
On both the optical design and Mazda problems, the Pareto frontiers found by \ALG better explore the trade-offs between the objectives compared to NSGA-II and Sobol.
We note that \ALG generally achieves good coverage of the Pareto frontier for both problems.
For the optical design problem, we exclude the partial results found by running the other baselines for $1$k--$2$k evaluations and only show the methods that ran for the full $10$k evaluations.
For the Mazda problem we show the Pareto frontiers of the true objectives and not the normalized objectives that are described in Section~\ref{sec:real_world_problems}.
\ALG is able to significantly decrease the vehicle mass at the cost of using fewer common parts, a trade-off that NSGA-II fails to explore.
It is worth noting that the number of common parts objective is integer-valued and that exploiting this additional information may unlock even better optimization performance of \ALG.

\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.88\textwidth]{figures/all_pfs.pdf}
    \caption{\label{fig:all_pfs}
        In each column we show the Pareto frontiers for the worst, median, and best replications according to the final hypervolume.
        We indicate whether an objective is minimized/maximized by $-/+$, respectively.
        The reference point is illustrated as a black star.
        The use of multiple trust regions allows \ALG to consistently achieve good coverage of the Pareto frontier, in addition to large hypervolumes.
    }
\end{figure}
\FloatBarrier
\subsection{Additional Benchmark Problems}
\FloatBarrier
\label{appdx:additional_benchmarks}
To study the performance of \ALG{} on a broader range of problems, we evaluate \ALG{} on two-objective and four-objective versions of the DTLZ3, DTLZ5, and DTLZ7 problems with $d=100$. As shown in Figure~\ref{fig:dtlz_m4}, \ALG{} performs best on the four-objective DTLZ7 problem and achieves the best final hypervolume on the four-objective DTLZ3 problem. On the two-objective problems, \ALG{} always ranks in the top 4 methods as shown in Figure~\ref{fig:dtlz_m2}. To compare the performance in general across the DTLZ3, DTLZ5, and DTLZ7 problems with a given number of objectives, we rank the methods by the average final hypervolume across replications and compute the average rank across the three problems. As shown in Table~\ref{table:rank}, \ALG{} achieves the lowest rank across all methods (which is best) on both $M=2$ and $M=4$ problems. DGEMO is not evaluated on the four-objective problems because the open-source implementation (\url{https://github.com/yunshengtian/DGEMO/tree/master}) does not support more than two objectives. Although DGEMO, MOEA/D-EGO, and $q$NEHVI all perform competitively in the two-objective setting, all three methods are significantly slower than \ALG{}.
% \hl{TODO: add wall times}.

\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.9\textwidth]{figures/dtlz_m2.pdf}
    \caption{
        \label{fig:dtlz_m2} Optimization performance on two-objective DTLZ3, DTLZ5, and DTLZ7 problems with $d=100$ and $q=50$.
    }
\end{figure}

\begin{figure}[!ht]
    \centering
    \includegraphics[width=0.9\textwidth]{figures/dtlz_m4.pdf}
    \caption{
        \label{fig:dtlz_m4} Optimization performance on four-objective DTLZ3, DTLZ5, and DTLZ7 problems with $d=100$ and $q=50$.
    }
\end{figure}

\begin{table*}[!ht]
    \centering
    \begin{small}
    \begin{sc}
    \begin{small}
    \begin{tabular}{lcc}
        \toprule
         & Avg. Rank for $M=2$ & Avg. Rank for $M=4$ \\
        \midrule
        \ALG{} & 3.0 & 1.67\\
        $q$ParEGO & 4.0 & 3.3  \\
        $q$NEHVI & 3.0 & 3.16  \\
        TS-TCH & 7.3 & 4.3 \\
        NSGA-II & 4.3 & 3.7\\
        DGEMO & 3.0 & 8.2 \\
        TSEMO & 7.7 & 8.3 \\
        MOEA/D-EGO & 4.0 & 5.5 \\
        Sobol & 8.7 & 6.8 \\
        \bottomrule
    \end{tabular}
    \end{small}
    \end{sc}
    \end{small}
    \caption{\label{table:rank} Mean rank across DTLZ3, DTLZ5, and DTLZ7 problems based on final mean hypervolume with $d=100$ and $q=50$. A lower rank means the method achieves better final performance on average across the DTLZ3, DTLZ5, and DTLZ7 problems with $M$ objectives.}
\end{table*}

% \section{Theoretical Results}
% \label{appdx:theoretical_results}
% \subsection{Convergence to the optimal $\epsilon$-approximate Pareto set of size $k$}
% In this section, we prove that
% \ALG{} converges to a finite approximate Pareto set under limited assumptions.
% \begin{definition}[$\bm \epsilon$-dominance]
% Let $\bm \epsilon \in \mathbb R^k_+$. Let $\bm x, \bm x' \in \mathcal X$. If $ f^{(i)}(\bm x) + \epsilon^{(i)} \geq f^{(i)}(\bm x')$ for all $i\in\{1,..., M\}$, then $\bm x$ $\bm\epsilon$-dominates $\bm x'$, which we denote as $\bm x \succeq_{\bm\epsilon} \bm x'$.
% \end{definition}
% An approximate Pareto set is a set such that every true Pareto optimal point is approximately dominated by a point in the approximate Pareto set.
% \begin{definition}[$\epsilon$-Pareto set]
% $\mathcal A \subseteq \mathcal X$ is an $\bm \epsilon$-Pareto set if for all $\bm x' \in \mathcal X$, there exists $\bm x \in \mathcal A$ such that $\bm x \succeq_{\bm\epsilon} \bm x'$.
% \end{definition}
% Despite a rich literature on the concept of approximate Pareto sets, to our knowledge no existing BO methods provide such a guarantee on convergence to the optimal $\epsilon$-approximate Pareto set. Therefore, convergence with probability one in the limit as the evaluation budget approaches infinity is a valuable theoretical property.

% \begin{theorem}
% \label{thm:convergence}
% Let $\bm \epsilon \in \mathbb R^M_+$. Suppose that the following assumptions hold:
% \begin{enumerate}[label=(\arabic*),itemsep=5pt]
%     \item The search space $\mathcal X \subset \mathbb R^d$, where $d \in \mathbb N$, is a compact set.
%     \item The objectives and constraints are continuous.
%     \item The number of objectives and constraints is finite.
%     \item \ALG{} only considers a newly evaluated sample to be an improvement (for updating the corresponding trust region's success and failure counters) if it increases the hypervolume by at least $\delta \in \mathbb R^+$.
% \end{enumerate}
% Then \ALG{} converges to an $\bm \epsilon$-approximate Pareto set $\mathcal A$ with probability one in the limit as the evaluation budget approaches infinity.
% \end{theorem}
% Assumption (1) is a design decision when setting up the optimization problem. This work only considers bounded search spaces. Assumptions (2) and (3) are a common in global optimization \citep{spall2005, regis2007}. Assumption (4) is straightforward to enforce in \ALG{}.

% \begin{proof}
% Note that each trust region will only evaluate a finite number of samples because of assumptions (2) and (4). Hence, trust regions are guaranteed terminated and reinitialized with random points infinitely many times with an infinite evaluation budget. The random points from each re-initialization form an infinite random subsequence, and hence, we can directly leverage the results from {\citet[Theorem 3.5]{schutze2008convergence}}, which proves that random search converges to an $\epsilon$-approximate Pareto set of finite size with probability one.
% \end{proof}
% % \vfill

\bibliography{daulton_446}
\end{document}
