\documentclass[accepted]{uai2024} % for initial submission
%\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.


\setlength{\textfloatsep}{3pt}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
%\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

\usepackage{graphicx}
\usepackage{subfigure}

\usepackage{amsthm}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{bm}
\usepackage{multirow}


\usepackage{graphbox}
\usepackage{pifont}

\usepackage{makecell}

\usepackage{algorithm}
\usepackage{algpseudocode}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{corollary}{Corollary}
\newtheorem{definition}{Definition}
\newtheorem{assumption}{Assumption}
%\setlength{\textfloatsep}{3pt}

\renewcommand{\algorithmicrequire}{ \textbf{Input:}}     %Use Input in the format of Algorithm
\renewcommand{\algorithmicensure}{ \textbf{Output:}}    %UseOutput in the format of Algorithm



%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Exploring High-dimensional Search Space via Voronoi Graph Traversing}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<adzhao19@fudan.edu.cn>?Subject=VGT-UAI2024}{Aidong~Zhao}{}}
\author[1]{{Xuyang~Zhao}{}}
\author[1]{{Tianchen~Gu}{}}
%\author[1]{{Zhaori~Bi}{}}
\author[,1]{{Zhaori~Bi}{\thanks{Corresponding authors: \{zhaori\_bi, xzeng\}@fudan.edu.cn.}}}
%\author[,1]{{Zhaori~Bi}{\thanks{Corresponding authors: }}}
\author[2]{{Xinwei~Sun}{}}
\author[1]{{\\Changhao~Yan}{}}
\author[1]{{Fan~Yang}{}}
\author[,1,3]{{Dian~Zhou}{\thanks{Emeritus Professor, the University of Texas at Dallas.}}}
\author[,1]{{Xuan~Zeng}{\footnote[1]{}}}

\affil[1]{%
	    State Key Laboratory of Integrated Chips and Systems, School of Microelectronics\\
	    Fudan University\\
	    Shanghai, China
	}
\affil[2]{%
	School of Data Science\\
	Fudan University\\
	Shanghai, China
}

\affil[3]{%
	Department of Electrical Engineering\\
	University of Texas at Dallas\\
	Richardson, Texas, USA
}


%\author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2024 paper}{Jane~J.~von~O'L\'opez}{}}
%\author[1]{Harry~Q.~Bovik}
%\author[1,2]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[1]{Further~Coauthor}
%\author[3]{Further~Coauthor}
%\author[3,1]{Further~Coauthor}
% Add affiliations after the authors
%\affil[1]{%
%    Computer Science Dept.\\
%    Cranberry University\\
%    Pittsburgh, Pennsylvania, USA
%}
%\affil[2]{%
%    Second Affiliation\\
%    Address\\
%    …
%}
%\affil[3]{%
%    Another Affiliation\\
%    Address\\
%    …
%  }
  
  \begin{document}
\maketitle

\begin{abstract}
  Bayesian optimization (BO) is a well-established methodology for optimizing costly black-box functions. However, the sparse observations in the high-dimensional search space pose challenges in constructing reliable Gaussian Process (GP) models, which leads to blind exploration of the search space. We propose a novel Voronoi Graph Traversing (VGT) algorithm to extend BO to ultra high-dimensional problems. VGT employs a Voronoi diagram to mesh the design space and transform it into an undirected Voronoi graph. VGT explores the search space by iteratively performing path selection, promising cell sampling, and graph expansion operations. We introduce a UCB-based global traversal strategy to select the path towards promising Voronoi cells. 
  Then we perform local BO within the promising cell and train local GP with a neighboring subset. The intrinsic geometric boundaries and adjacency of the Voronoi graph assist in fine-tuning the trajectory of local BO sampling.
  We also present a subspace enhancement approach for the intrinsic low-dimensional problems. Experimental results, including both synthetic benchmarks and real-world applications, demonstrate the proposed approach's state-of-the-art performance for tackling ultra high-dimensional problems ranging from hundreds to one thousand dimensions.
  % 
\end{abstract}

\section{Introduction}

Black-box function optimization is a widespread problem in engineering, particularly in domains characterized by computationally expensive or time-consuming evaluations, such as integrated circuit design \citep{cVTS}, vehicle design \citep{ve1}, and drug discovery \citep{drug1}.  %cite here
Bayesian optimization (BO) \citep{BO1,BO2} is a sample-efficient global optimization method for expensive black-box problems. 
Nonetheless, scaling BO to high-dimensional (HD) spaces presents significant challenges and has become a prominent research area. 
High-dimensional BO suffers from the hurdles caused by the curse of dimensionality. Firstly, the sparse observations in the HD space compromise the reliability of Gaussian process (GP) models, making it challenging to accurately capture the manifolds of objective functions. Consequently, the imprecise GP models lead to blind exploration across the entire space. Secondly, the computational cost of GP training grows cubically with the number of observations, posing a bottleneck for HD problems. Moreover, complex heterogeneous HD problems often require a large number of observations.

The sample efficiency of BO relies on accurate GP models, which demand numerous observations and become infeasible in HD space. Many high-dimensional BO (HDBO) methods have been proposed to enhance GP reliability and improve the sampling efficiency, as reviewed in Sec.~\ref{Related work}. 
Firstly, \textit{dimension decoupling} based methods \citep{addgpucb,overlapaddgp,QFFBO,MES,EBO} fit the objective function with a set of low-dimensional additive GPs, avoiding an uncertain GP model in the HD space. Secondly, \textit{subspace embedding} based approaches \citep{RemBO,Rembo_1,HesBO,ALEBO,SAASBO,MCTSVS} embed the original HD problem into a low-dimensional subspace to obtain an effective subspace GP. However, 
the presumptions, dimension decomposability or intrinsic low-dimensionality, of these two approaches may not hold in practical problems.
Another method, TuRBO \citep{TuRBO}, enhances the local reliability of GPs through dynamic trust regions. However, achieving credible local GPs remains computationally expensive and infeasible for problems involving several hundred design variables. 

Local search methods with restart strategies are popular approaches for solving high-dimensional problems, which utilize the \textit{search direction} and \textit{step length} to iteratively navigate towards improved solutions. 
Commonly used local search methods include line search approaches like the quasi-Newton method BFGS \citep{NumericalOptimization, LBFGS} and trust region methods such as BOBYQA \citep{bobyqa}. 
However, the global convergence performance of these algorithms significantly relies on the choice of initial solutions.
%In the BFGS method, optimization proceeds by aligning with the gradient direction for descent, while adjustments to the step length are made utilizing the quasi-Hessian matrix. Conversely, the BOBYQA method dynamically regulates the step length based on the radius of trust region, with the descent direction informed by the local quadratic model. 

%In the BFGS method, the descent direction is determined by the gradient direction, while the step length is adjusted using the quasi-Hessian matrix. Conversely, the BOBYQA method adapts the step length based on the radius of the trust region, with the descent direction guided by the local quadratic model. 
%The BFGS method descent direction is the gradient direction and step length quasi-Hessian matrix. the BOBYQA method step length radius of trust region descent direction by local quadratic model. 
%\citep{NumericalOptimization} 



%Inspired by %Instead, we leverage the geometric information of each observation $\bm{x}$ and its corresponding function evaluation $f(\bm{x})$, and propose the Voronoi Graph Traversing (VGT) algorithm.

In this paper, we aim to scale BO to address multi-modal and heterogeneous problems in ultra high-dimensional input spaces. 
Characterizing function landscapes with regression models within HD spaces spanning several hundred dimensions is inherently impractical.
Consequently, we adopt an alternative strategy, steering away from the pursuit of enhancing GP fitness. 
Inspired by the key idea of local search methods, we leverage the Voronoi boundary and adjacency of each observation $(\bm{x},f(\bm{x}))$ to provide the \textit{step length} and \textit{search direction} information and guide the optimization process.
We propose the Voronoi Graph Traversing (VGT) algorithm, which employs Voronoi diagrams to segment the search space into convex Voronoi cells and utilizes adjacency information to construct a Voronoi graph to represent the design space. 
%Global exploration of the HD space is accomplished by traversing the graph via UCB. Local exploitation is achieved by performing local BO within promising Voronoi cells, facilitating trajectory correction via Voronoi cell's geometric boundaries. 
The contributions of this paper are summarized as follows.
\begin{itemize}
	\item We propose the VGT algorithm, a sample-efficient approach to enable BO to solve ultra high-dimensional problems. By decomposing the space into Voronoi cells and mapping it to a Voronoi graph, we transform the global exploration in continuous spaces into a Voronoi graph traversal problem. Then the promising cell is identified by traversing the graph via UCB.
	%\item In the global exploration phase, VGT iteratively performs path selection, promising cell sampling, and graph expansion operations to explore the search space and navigates to the promising cell. 
	\item In the local optimization phase, we employ local BO within the selected promising cell. We introduce the Voronoi Neighbored GP (VNGP) model, constructed with the Voronoi neighbors, to reduce computational cost. 
	Additionally, the natural geometric boundaries and Voronoi neighbors assist in fine-tuning the trajectory of local BO sampling.
	\item For intrinsic low-dimensional problems, we provide a local feature extraction method to capture the local manifold of the objective function and enhance sampling efficiency by targeting effective subspaces. 
	\item We assess the performance of the VGT algorithm using ultra high-dimensional benchmarks, with dimensions extending up to 1000D. The results demonstrate that VGT exhibits exceptional advantages when dealing with high-dimensional problems ranging from hundreds to one thousand dimensions.
\end{itemize}
A Python implementation of VGT is available on \url{https://github.com/adzhao072/VGT}.

%%%% insight %%%
%Different from subspace embedding methods that rely on the presumption of intrinsic low dimensionality, we aim to directly address the dense problem of high-dimensional input space with numerous local minima. Given this context, there are three fundamental requirements for the algorithm: (1) Sufficient Global Exploration. The algorithm must achieve comprehensive global exploration and escape from local optima. (2) Precise Local Exploitation. The high-dimensional space grows geometrically with the dimension. The key to solving high-dimensional problems is to focus on the most promising area and avoid excessive exploration. (3) Low Computational Complexity. The algorithm's computational complexity should remain manageable despite the high number of observations typical in high-dimensional scenarios. 
%To tackle these challenges, we introduce the VGT algorithm. First, to address the issue of global exploration in continuous high-dimensional spaces, we utilize Voronoi graphs. By decomposing the space into Voronoi cells and mapping it to the Voronoi graph, we harness the natural boundaries and connections inherent in the Voronoi diagram. Thus, we transform the global exploration in continuous spaces into a Voronoi graph traversal problem. This transformation, guided by UCB, enables us to effectively accomplish global exploration with remarkable efficiency. Secondly, employing Voronoi diagrams for domain decomposition (DD) ensures each cell contains exact one observation. In contrast to DD via TuRBO's trust regions or LaMCTS's K-means clustering, VGT offers a more refined DD technique. This precision guides the algorithm away from unfavorable neighboring cells, concentrating efforts on areas of higher potential for efficient local exploitation. Lastly, in terms of computational complexity, VGT's VNGP modeling complexity is $\mathcal{O}(K^3)$, while the acquisition function optimization complexity is $\mathcal{O}(K^2+\log N)$. Here, $N$ represents the observation count, and $K$ is the number of Voronoi neighbors. The computational complexity of VGT is lower than other BO baselines.










%\url{https://github.com/XXX/XXXXXX} (Anonymous for blind review)

\section{Related Works}
\label{Related work}
The key ideas to tackle HDBO include \textit{dimension decoupling}, \textit{subspace embedding}, and \textit{region restriction}.

\textit{Dimension decoupling} based methods rely on the assumption of dimension decomposability within the objective function. ADD-GP \citep{addgpucb} is proposed to learn the additive structure and decompose the high-dimensional space into disjoint or overlapping subspaces \citep{overlapaddgp,QFFBO,MES}. 
However, training a collection of GPs is computationally expensive and unaffordable for large numbers of observations. 
To alleviate the computational cost of GPs, various methods have emerged to approximate the GP kernel with Fourier features \citep{QFFBO,RFF_NIPS2007,RFF_NIPS2015,EBO, VFF}. 
Nevertheless, the challenges of expensive computation and unknown dimension structure still hinder their application in high-dimensional cases.


\textit{Subspace embedding} is a currently popular method that projects the high-dimensional problem into a low-dimensional subspace based on the assumption of intrinsic low dimensionality. Linear embedding methods, including RemBO \citep{RemBO,Rembo_1}, HesBO \citep{HesBO} and ALEBO \citep{ALEBO}, cast the problem into a randomly selected linear subspace and perform BO within the subspace.
SAASBO \citep{SAASBO} and MCTS-VS \citep{MCTSVS} aim to improve BO's sample efficiency by identifying sparse effective variables.
Additionally, other methods focus on learning non-linear feature spaces with neural networks \citep{VAE_BO,DGM_NIPS2020,DAE_NEURIPS2022}.
%, such as variational auto-encoder (VAE) \citep{VAE_BO}, deep generative model (DGM) \citep{DGM_NIPS2020} and deep auto-encoder (DAE) \citep{DAE_NEURIPS2022}.


\textit{Region restriction} is an effective approach for directly managing the high-dimensional input space. 
TuRBO \citep{TuRBO} confines the optimization within dynamically adjusted hyper-rectangular trust regions, which resists blind exploration across the entire search space.
Extensions of TuRBO have been proposed for categorical and mixed variables \citep{CMS_TuRBO}, as well as for faster local descent \citep{MCTD}.
Another approach, LA-MCTS \citep{LAMCTS,LaP3}, introduces an SVM-based hierarchical space partition and balances the exploration and exploitation via Monte Carlo tree search (MCTS). 












% Voronoi Optimistic Optimization (VOO) \citep{VOO} applies MCTS and Voronoi partition to guide the sampling in the high-dimensional space. 
%Voronoi-based method 

%best high-dimensional algorithm



\begin{figure}[ht] 
	\begin{center}
		\includegraphics[width=0.5\textwidth]{figures/alg_frame/VoronoiGraph.pdf} 
		%\vspace{-2mm} %,height=0.4\textwidth
		\caption{An illustration of a Voronoi graph $\mathcal{G}(\mathcal{V},\mathcal{E})$. The edge set $\mathcal{E}$ is determined by Delaunay triangulation. The neighbor set corresponding to node $v_7$ is $\mathcal{N}(v_7)=\{v_4,v_5,v_6,v_7\}$.} 
		\label{fig:01.31}
	\end{center}
\end{figure}

\section{Problem Setup and Background}
\label{Preliminaries}
\paragraph{Problem setup.} 
We consider the following black-box function optimization problem: 
\begin{equation}
	\label{opt problem}%
	%\bm{x}^* =\underset{\bm{x}\in R^D}{\arg\min} \ f(\bm{x}),
	\bm{x}^* =\underset{\bm{x}\in \mathcal{X}}{\arg\min} \ f(\bm{x}),
\end{equation}
where $\bm{x}$ represents the input variable, $\mathcal{X}=[0,1]^D$ is the normalized search space, $f:\mathcal{X}\rightarrow \mathbb {R}$ denotes the objective function that incurs computationally expensive evaluations, and $\bm{x}^*$ is the optimal parameter that achieves the minimal function value. 




\paragraph{Voronoi Diagram.} 
The Voronoi diagram, also known as Dirichlet tessellation, is a geometric representation that partitions space based on the Euclidean distance to a given set of observations or seeds. Consider a set of $n$ observations denoted as $\mathcal{V}=\{v_i;i=0,\cdots,n-1\}$ in the space $\mathcal{X}$. The Voronoi diagram divides $\mathcal{X}$ into $n$ convex polytopes known as Voronoi cells. An observation $v_j$ serves as the site for its corresponding Voronoi cell $\text{Vor}(v_j)$. The Voronoi cell $\text{Vor}(v_j)$ covers the region that is closer to $v_j$ than any other observation in $\mathcal{V}$ \citep{ComputationalGeometry}:
% The Voronoi diagram provides a natural spatial partition based on the Euclidean distance to the given set of observations or seeds.
% Let's consider a set of $n$ observations denoted as $\mathcal{D}=\{v_i;i=0,\cdots,n-1\}$ in $\mathcal{X}$. The Voronoi diagram divides the space $\mathcal{X}$ into $n$ convex polygons called Voronoi cells. Each observation $v_j$ serves as the site of its corresponding Voronoi cell $Vor(v_j)$. $Vor(v_j)$ covers the region that is closer to $v_j$ than any other observation in $\mathcal{D}$ \citep{ComputationalGeometry}:
\begin{equation}
	\label{Vor cell}%
	\text{Vor}(v_j)=\{ \bm{x}\in \mathcal{X}|\forall v_i\in \mathcal{V}, ||\bm{x}-v_j||\le ||\bm{x}-v_i||\}.
\end{equation}
The adjacent cells that share common Voronoi boundaries are called Voronoi neighbors. Given an observed dataset $\mathcal{V}$, the Voronoi diagram $\text{Vor}(\mathcal{V})$ is uniquely determined. The neighborhood relationships within the Voronoi diagram can be established through the utilization of Delaunay triangulation. When two Voronoi cells share a common edge, it signifies a neighbor relationship between the corresponding nodes in $\mathcal{X}$. Fig.~\ref{fig:01.31} presents an illustration of the Voronoi diagram along with its dual, the Delaunay triangulation. 



\paragraph{Voronoi Graph.} 
Based on the Voronoi diagram, we define a Voronoi graph $\mathcal{G}(\mathcal{V},\mathcal{E})$ as an undirected graph with self-loops. The node set $\mathcal{V}$ consists of the given observations. 
Each node $v_i$ corresponds to a Voronoi cell $\text{Vor}(v_i)$. 
Each non-looped edge $\{v_i,v_j\}\in \mathcal{E}$, where $i\neq j$, represents the existence of a specific neighboring relationship between Voronoi cell $\text{Vor}(v_i)$ and $\text{Vor}(v_j)$. 
If the neighboring relationship is defined by Voronoi adjacency, the Voronoi graph $\mathcal{G}(\mathcal{V},\mathcal{E})$ extends the Delaunay triangulation by including self-loops. 
However, in the high-dimensional scenarios, the exponential increase in possible simplices, such as triangles or tetrahedrons, presents significant challenges for the efficient construction and representation of Voronoi boundaries and Delaunay triangulation, both in terms of computation and storage.
%the exponential growth of possible simplices, such as triangles or tetrahedrons, poses challenges in efficiently constructing and representing the Voronoi boundaries and Delaunay triangulation in terms of computation and storage. 
Fortunately, the VGT algorithm does not rely on the exact Voronoi neighboring relationship. For high-dimensional cases, we construct an approximate graph based on the similarity of observations without explicit Voronoi computation. As outlined in Sec.~\ref{scale HD space}, we introduce a Voronoi graph approximation technique that efficiently captures the neighborhood relationships and builds the connected Voronoi graph. 
% and constructs a connected Voronoi graph. 
%Fortunately, we can build the connected graph based on similarity of observations without explicit Voronoi computation for the HD cases. Thus, a Voronoi graph approximation technique, outlined in Sec.\ref{scale HD space}, is introduced to provide an efficient approach for obtaining the neighborhood relationships and build the connected Voronoi graph. 
% The Voronoi neighbors are the cells that share a common Voronoi edge, and this relationship is determined by the dual Delaunay triangulation. The VGT optimization algorithm, introduced in Section \ref{VGT}, leverages the boundary information of the Voronoi cells and the neighborhood relationship provided by the Delaunay triangulation.
To simplify the notation, we define the neighboring subset centered around a node $v$ as $\mathcal{N}(v) = \left\{v_i\in\mathcal{V} \mid \{v,v_i\}\in\mathcal{E}\right\}$. Note that $\mathcal{N}(v)\subseteq \mathcal{V}$ includes both the node $v$ itself (via its self-loop) and its corresponding neighbors.




% For the sake of simplicity, we denote the neighboring subset centered around $v$ as $\mathcal{N}(v) = \{v\}\cup Neigh(v)$, which includes the node $v$ and its Voronoi neighbors.
%The set of Voronoi neighbors of a site $v$ is denoted as
%The VGT algorithm is designed to enhance the traversal of the Voronoi graph by leveraging the connectivity between Voronoi cells.


\paragraph{Sliced Inverse Regression.}
Sliced Inverse Regression (SIR) is a supervised method to discover the effective dimension reduction (EDR) directions, particularly in scenarios where there is a limited number of observations in the high-dimensional search space ($n\ll D$). In SIR, a regression model is defined as 
\begin{equation}
y=f(\bm{\beta}_1^T\bm{x},\cdots,\bm{\beta}_d^T\bm{x},\epsilon),
\end{equation}
where $\mathcal{B}=\{\bm{\beta}_1,\cdots,\bm{\beta}_d\}$ represents the $d$-dimensional EDR subspace ($d\ll D$), and $\epsilon$ denotes the regression noise. The pattern space $\mathcal{B}$ is extracted from the central inverse regression curve $E(\bm{x}|y)-E(\bm{x})$ under the linear design condition (LDC). This is achieved by solving the generalized eigenvalue problem: 
\begin{equation}
\Sigma_{E(\bm{x}|y)}\bm{\beta}=\lambda\Sigma_{\bm{x}}\bm{\beta},
\end{equation}
where $\Sigma_{(\cdot)}$ denotes the covariance matrix empirically estimated using the sliced observations \citep{SIR}.



%\vspace{-3mm}
\section{Voronoi Graph Traversing Method}
\label{VGT}
%\vspace{-3mm}
In this section, we introduce the Voronoi graph traversing (VGT) algorithm, a novel approach for addressing HD black-box function optimization problems. 
%\vspace{-4mm}
\subsection{Voronoi Graph Traversing}
%\vspace{-3mm}
Due to limited observations and high computational complexity, building reliable GP models in HD spaces is impractical. Thus, the sample guidance provided by the surrogate model diminishes significantly, resulting in blind exploration of HD space. Our Voronoi Graph Traversing (VGT) algorithm takes a different approach, which employs the geometric information within the Voronoi diagram to implicitly update the \textit{step length} and \textit{search direction}, planning the traversal trajectory in HD space. 




%Following the local search methods, we use Voronoi cell boundary to provide  \textit{step length} and make the newly observations located on the boundary
%\textit{search direction} VNGP model 

\begin{figure*}[ht] 
	\includegraphics[width=1.0\textwidth]{figures/alg_frame/VGT_frame.pdf}  
	%\vspace{-4mm}
	\caption{An illustration of the search procedure of VGT. \textbf{(a) Expansion of node depth:} The depth of a node is incremented if a new sample is created within its corresponding Voronoi cell. \textbf{(b) Path selection:} Select the red path $v_{5,h=6} \rightarrow v_{6,h=6}$ to a promising neighbor via UCB. \textbf{(c) Local BO sampling:} Train VNGP model with Voronoi neighbors and create new sample $v_{7}$ within promising cell $\text{Vor}(v_{6,h=6})$. \textbf{(d) Expansion \& propagation:} Create a new cell $\text{Vor}(v_{7,h=7})$, and update Voronoi graph. The depth of target node $v_6$ is incremented by 1.
	} 
	\label{fig:figure_frame}
\end{figure*}


\begin{figure}[ht] 
	\includegraphics[width=0.5\textwidth]{figures/alg_frame/VGT_sample.pdf}  %,height=0.4\textwidth
	%\vspace{-5mm}
	\caption{An illustration demonstrating how Voronoi geometric boundaries guide HD optimization.} 
	\label{fig:Sample}
\end{figure}


We partition the search space into Voronoi cells and represent it as a Voronoi graph utilizing adjacency relationships, as depicted in Fig.~\ref{fig:01.31}. The global exploration of VGT involves traversing the Voronoi graph, and navigating towards the optimal cell. For the local exploitation phase, the effective selection of \textit{step length} and \textit{search direction} determines the success of the optimization algorithm. 
We refine the BO sampling mechanism. 
The Voronoi boundaries, determined by the perpendicular bisectors between a cell and its neighbors, are considered highly informative and deserving of exploration. 
The GP posterior exhibits a higher standard deviation near these boundaries compared to regions close to existing observations. Sampling around the Voronoi boundaries maximizes the information gain about the current observations $\mathcal{V}$ \citep{UCBPE}. Hence, we anticipate using the Voronoi cell's boundary to guide the selection of the \textit{step length} of the next observation. Additionally, the \textit{search direction} is indicated by the local GP model constructed with Voronoi neighbors of the target cell.
However, computing and storing Voronoi boundaries is intractable. We use a Gaussian distribution to approximate the profile of the target Voronoi cell and sample new observations, as indicated by the dashed ellipse in Fig.~\ref{fig:Sample}. 
The sample direction of local BO is refined with the assistance of the shape and geometric boundaries of the Voronoi cells. 
For the ``bad'' observation with poorer function value, like $v_7$, the new cell $\text{Vor}(v_{7})$ extrudes the space of original cell $\text{Vor}(v_{6})$, influencing subsequent sampling directions of BO, as shown in Fig.~\ref{fig:Sample}(b).  
Conversely, successful observations, such as $v_8$, guide the algorithm towards more optimal regions, as exemplified by $v_9$ that converges near the global optimum.
By leveraging the geometric boundaries and adjacency of the Voronoi graph, VGT precisely utilizes the coordinate and function value of each observation to guide the optimization. 
%In the local exploitation phase, we construct local GP model with neighboring nodes of the target cell. 
%Recognizing that the GP model is insufficient to guide HD sampling, we resort to the shape and geometric boundaries of the Voronoi cells to strengthen and refine the search direction of local BO. 
%To preserve observation diversity, we generate new sample points near Voronoi cell boundaries, indicated by the dashed ellipse in Fig.\ref{fig:Sample}. 



\subsection{Search Procedure of VGT}
The search procedure is illustrated in Fig.~\ref{fig:figure_frame} and detailed in Algorithm~\ref{alg:VGT}, which encompasses three primary stages: (1) \textit{Path selection} creates a movement from the current node towards a promising neighboring node; (2) \textit{Local BO sampling} generates a new sample within the promising Voronoi cell; (3) \textit{Expansion} \& \textit{propagation} expands the graph with new observations and updates the reward. These steps are performed iteratively until the stopping criterion is satisfied. 
% and Algorithm \ref{alg:VGT}



The traversal algorithm commences by randomly generating an initial node $v_{0,h=0 \mid \mathcal{G}^{t=0}}$, which covers the whole search space $\mathcal{X}$. 
Here, the node depth $h$ denotes the number of visits to a node and its parents. 
The update of node depth $h$ follows the tree structure shown in Fig.~\ref{fig:figure_frame}(a). The union set of leaf nodes represents a space partition of $\mathcal{X}$ at iteration $t$. 
Then we present the detailed search procedure of VGT. 




\begin{algorithm*}[ht]
	\caption{Voronoi Graph Traversing (VGT)}
	\begin{algorithmic}[1]
		\Require  Objective function $f(\bm{x})$, search space $\mathcal{X}$, maximal iteration $T$. 
		\State Randomly sample the initial node $\mathcal{V}=\{v_{0,h=0}=\text{random}(\bm{x}_0,f(\bm{x}_0))\}$; $\mathcal{G}^{t=0}(\mathcal{V},\mathcal{E})$.  \Comment{Random Initialization}
		\State Traverse from $v_{-1,h_{-1}^*\mid \mathcal{G}^{t=0}}^* \leftarrow v_{0,h=0\mid \mathcal{G}^{t=0}}$.
		\For{$t=0$ to $T-1$}
		\State Select the promising node from neighboring subset $v_{t,h_t^* \mid \mathcal{G}^{t}}^*=\underset{v_j\in \mathcal{N}(v_{t-1,h_{t-1}^*\mid \mathcal{G}^t}^*)}{\arg\max} \text{UCB}(v_j)$. \Comment{Path Selection}
		\State Train the VNGP model with dataset $\mathcal{N}(v_{t,h_t^* \mid \mathcal{G}^{t}}^*)$.
		\State Generate sample $\bm{x}_{t+1}$ by performing BO within Voronoi cell $\text{Vor}(v_{t,h_t^* \mid \mathcal{G}^{t}}^*)$. \Comment{Sample via Local BO}
		\State Evaluate objective function and collect samples $v_{t+1,h_t^*+1} = \left(\bm{x}_{t+1},f(\bm{x}_{t+1})\right)$.
		%$\nu^{t}_{leaf}.S\!=\!\nu^{t}_{leaf}.S\cup\{(\bm{x}^t,f(\bm{x}^t)\}$; 
		\State $\mathcal{G}^{t+1} \leftarrow  \mathcal{G}^{t}.\text{append}(v_{t+1,h_t^*+1})$; $v_{t,h_t^*+1 \mid \mathcal{G}^{t+1}}^* \leftarrow v_{t,h_t^*+1 \mid \mathcal{G}^{t}}^*$; Update UCB. \Comment{Expansion \& Propagation}
		\EndFor
	\end{algorithmic}
	%\vspace{-3mm}
	\label{alg:VGT}%
\end{algorithm*}




%\vspace{-1.5mm}
\paragraph{Path Selection.} 

In each iteration $t$, VGT moves from a start node $v_{t-1,h_{t-1}^* \mid \mathcal{G}^{t}}^*$ to a promising neighbor $v_{t,h_t^* \mid \mathcal{G}^{t}}^* \in \mathcal{N}(v_{t-1,h_{t-1}^* \mid \mathcal{G}^{t}}^*)$ (e.g., from node $v_{5,h=6}$ to node $v_{6,h=6}$ in Fig.~\ref{fig:figure_frame}(b)), facilitating progress towards the global optimum. Notably, the start node $v_{t-1,h_{t-1}^* \mid \mathcal{G}^{t}}^*$ is the optimal node selected in iteration $t-1$, and is also included in $\mathcal{N}(v_{t-1,h_{t-1}^* \mid \mathcal{G}^{t}}^*)$, allowing for a ``stationary step''. 
To achieve a comprehensive exploration of the search space, we employ the Upper Confidence Bound (UCB) criterion to assess the potential of each node during the dynamic path selection process. The UCB for VGT is defined as follows \citep{UCB_graph}:
%\vspace{-1.5mm}
\begin{equation}
	\label{UCB_graph}%
	%\vspace{-1.5mm}
	\text{UCB}(v_i) = Q(v_i) + \sqrt {\frac{C_p \cdot \ln t} {h(v_i)} },
\end{equation}
where $Q(v_i)$ represents the quality of node $v_i$, and $h(v_i)$ denotes the depth of $v_i$. For simplicity, we estimate the quality of node $v_i$ as $Q(v_i)=-f(\bm{x}_i)$. The hyperparameter $C_p$ balances the exploration of under-explored areas and the exploitation of promising regions. 
The traversal algorithm in VGT specifically limits the evaluations of UCB to nodes that have established neighborhood relationships, rather than evaluating all nodes in the graph. This selective UCB evaluation strategy aims to prioritize nodes that are more likely to contribute to finding the optimal solution. By focusing on nodes with established neighborhood relationships, VGT ensures a more targeted exploration. %, avoiding sudden jumps to nodes with large uncertainty. 





\paragraph{Sample via Local BO.} 
Once a promising node is selected, we perform local BO within the corresponding Voronoi cell. To mitigate the computational complexity associated with GP modeling, we propose the Voronoi Neighbored GP (VNGP) model, which is trained with the neighbor dataset $\mathcal{N}(v_{t,h_t^* \mid \mathcal{G}^{t}}^*)$. As the GP kernel is correlated with the distance between observations, the observations located far from the target cell contribute little to local modeling. VNGP takes advantage of the spatial structure of the Voronoi graph to enable computationally efficient local modeling while maintaining accuracy. The new observation $v_{t+1,h_t^*+1}$ is sampled by optimizing the acquisition function within the irregular Voronoi cell (e.g., the blue polygonal region in Fig.~\ref{fig:figure_frame}(c)). To achieve this, we employ a Gaussian distribution (e.g., the black dashed ellipse in Fig.~\ref{fig:figure_frame}(c)) centered around the Voronoi site $v_{t,h_t^* \mid \mathcal{G}^{t}}^*$ to sample the acquisition function. The hyperparameters of the sampling distribution are tuned using random samples located within the target cell. Samples lying outside the target cell are discarded by rejection sampling.
%To achieve this, we employ the MSP-L\_BFGS\_B \citep{LBFGS} algorithm within a trust region centered on the Voronoi site $v_{t,h_t^* \mid \mathcal{G}^{t}}^*$. Notably, the acquisition function is penalized for samples lying outside the target cell. 

%\vspace{-1.5mm}
\paragraph{Expansion \& Propagation.} 
Each sample within a target cell $\text{Vor}(v_{t,h_t^* \mid \mathcal{G}^{t}}^*)$ of depth $h_t^*$ creates a new cell of depth $h_t^*+1$ while also incrementing the depth of $v_{t,h_t^* \mid \mathcal{G}^{t+1}}^*$. 
Once a new observation is sampled, the graph $\mathcal{G}^{t+1}$ is then expanded by incorporating a new Voronoi cell centered around the new observation $v_{t+1,h_{t}^*+1}$, as shown in Fig.\ref{fig:figure_frame}(d). 
The UCB is updated to refine the trajectory of the next step based on the most recent information. The \textit{expansion} \& \textit{propagation} step progressively expands the coverage of search space by incorporating the newly acquired nodes and enhances the algorithm's global exploration capability. 

By iteratively performing the aforementioned steps, the algorithm continues to explore the design space and adjust its trajectory to efficiently navigate towards the optimal region. The VNGP model is dynamically updated along with the movement and incorporation of new samples to maintain its local responsiveness and adaptability. By leveraging the VNGP model and employing the UCB selection strategy, the algorithm identifies a traversal path that progresses towards the global optimum along the valley bottom, as illustrated by the red traversing path in Fig.\ref{fig:Valley}.
%For a comprehensive traversal procedure of the VGT, please refer to Appendix \ref{VGT_detail}. %
For a visual overview of the VGT traversal procedure, please refer to Appendix \ref{VGT_detail}.






%\vspace{-1.5mm}
\subsection{Scaling to High-dimensional Search Space}\label{scale HD space}
%\vspace{-1.5mm}
In this section, we propose two key strategies, namely Voronoi graph approximation and subspace BO sampling, to tackle the challenges of scaling the VGT algorithm to high-dimensional search spaces.

%\vspace{-1.5mm}
\paragraph{Voronoi Graph Approximation.} 
Determining the Voronoi diagram and Delaunay triangulation in high-dimensional space and large-sample-budget scenarios is computationally infeasible. 
Instead of explicitly calculating the Voronoi boundaries and Delaunay connections, we can employ similarity search approaches, such as $K$-nearest neighbor search (K-NNS) \citep{10.5555/313559.313789} or approximate nearest neighbor search (ANNS) \citep{LSH, HNSW}, to discover the neighborhood relationships among observations and construct an approximate Voronoi graph. 
In this work, we utilize K-NNS to approximate the neighborhood of a given node. By identifying the $K$ nearest neighbor nodes, we can establish connections between each node and its $K$ closest neighbors, thereby forming a connected graph within the search space. 
% similarity search approaches graph
$K$ is a hyper-parameter that depends on the dimensionality of the problem. With a larger $K$, the Voronoi neighbors will be included in the $K$ nearest neighbors. For a moderate $K$, the Voronoi neighbors and the $K$ nearest neighbors often coincide. 
Additionally, in high-dimensional cases, the Voronoi boundaries are not computed explicitly. Instead, we use rejection sampling to discard candidates outside the target cell based on the property described in Eq.~\eqref{Vor cell}. Ultimately, the high-dimensional Voronoi graph approximation problem boils down to K-NNS.
% boundary: nearest neighbor search reject sampling
%summary, in LD case, VGT with explicit. in HD case K-NNS
This approximation effectively captures the local neighborhood relationships while avoiding the computational overhead associated with explicit Voronoi boundaries and Delaunay connections. 

%\vspace{-1.5mm}
\paragraph{Subspace BO Sampling.} 
In high-dimensional spaces, the localized GP model often underfits and exhibits large uncertainty, especially when the number of available samples is significantly smaller than the problem's dimension $D$. To tackle this issue, we incorporate Localized SIR (LSIR) \citep{LSIR} to capture the local EDR subspace, denoted as $\mathcal{B}^t$, by leveraging information from neighboring samples. Furthermore, experience suggests that the objective function tends to decrease along the previous descent direction, denoted as $\bm{s}^{t-1}$. Exploiting this insight, we construct the pattern subspace $\mathcal{S}^t = \{\bm{s}^{t-1}\} \cup \mathcal{B}^t$, whose dimension is significantly smaller than that of the original problem, i.e., $|\mathcal{S}^t| \ll D$. Consequently, by accurately modeling the subspace problem with a smaller number of observations, we can effectively optimize the acquisition function $\alpha(\bm{x}^t + \bm{s})$ within the subspace $\mathcal{S}^t$, subject to the step length constraint $\bm{x}^t + \bm{s} \in \text{Vor}(v_{t,h_t^* \mid \mathcal{G}^{t}}^*)$: 
%\vspace{-1.5mm}
\begin{equation}
	\label{subproblem}%
	%\vspace{-1.5mm}
	\begin{split}
		&\bm{s}^t = \arg\max_{\bm{s}\in \mathcal{S}^t} \alpha(\bm{x}^t+\bm{s}),\\
		&\text{s.t.}\ \ \bm{x}^t+\bm{s} \in \text{Vor}(v_{t,h_t^* \mid \mathcal{G}^{t}}^*),
	\end{split}
\end{equation}
where $\alpha(\cdot)$ is the acquisition function. 
% The subspace policy contributes to the exploitation of promising region and facilitates fast local descent. 
The subspace method can efficiently capture the local effective manifold of objective functions in ultra HD space and enhance local BO sampling. 
To mitigate the potential degradation of diversity caused by subspace BO, we adopt a strategy of alternating subspace sampling and full-dimension sampling. We introduce a hyper-parameter $R_p$ to represent the ratio between subspace sampling and full-dimension sampling. For sparse optimization problems, where the valid dimensions are limited, a larger value of $R_p$ can be chosen to allocate more iterations for exploiting the subspace spanned by the effective feature directions. On the other hand, for dense problems with a larger number of relevant dimensions, a smaller value of $R_p$ can be utilized to focus more on the exploration of the promising cells.

%\vspace{-3mm}
\section{Discussions}
\label{Discussions}
%\vspace{-3mm}

%\subsection{Theoretical Results}
%\vspace{-2.5mm}
%
%Analyzing the regret of VGT directly is challenging. Therefore, we give a theoretical analysis of the algorithm through the lens of Voronoi domain decomposition.
%We consider the cumulative regret $R(T)=\sum_{t=1}^{T}(f(\bm{x}_t)-f(\bm{x}^*))$, representing the summation of gaps between the optimal value $f(\bm{x}^*)$ and the observations up to iteration $T$. 
%To guide our analysis, we make the following assumptions regarding the objective function and Voronoi cells.
%\begin{assumption}
%	(Lipschitz continuity of objective function). We assume that the objective function $f:\mathcal{X}\rightarrow \mathbb{R}$ satisfies the following Lipschitz continuity condition for some constant $L>0$:
%	\begin{equation}
%		\label{Lipschitz continuous}%
%		|f(\bm{x})-f(\bm{x}')|\le L ||\bm{x}-\bm{x}'||.
%	\end{equation}
%\end{assumption}
%\begin{assumption}
%	(Shrinkage of Voronoi cells). There exists a monotonically decreasing sequence $\delta(h) > 0$, such that for any cell $\text{Vor}(v_{i,h})$ of depth $h$, we have $\sup _{\bm{x}\in \text{Vor}(v_{i,h})} ||v_{i,h}-\bm{x}|| \le \delta(h)$. 
%\end{assumption}
%\begin{assumption}
%	(Descent in the near-optimal ball). There exists a small ball $\mathcal{X}_{\epsilon}$ around $\bm{x}^*$, $\epsilon>0$, such that for any $\bm{x}\in \mathcal{X}_{\epsilon}$, $f(\bm{x})\le f(\bm{x}^*)+\epsilon$, and for any $0\le \omega\le 1$, $f(\bm{x}^*) \le f(\omega\bm{x}+(1-\omega)\bm{x}^*)\le f(\bm{x})$.
%\end{assumption}
%\begin{theorem}
%	\label{RT_upper_bound}
%	Suppose $\delta(h)=\sqrt{D}\beta^h$, where $\frac{1}{2}\le \beta<1$, and $C_p=2$. The cumulative expected regret $E[R(T)]$ is bounded by
%	\begin{equation}
%		\vspace{-2mm}
%		E[R(T)]\le \frac{16L\sqrt{D}}{\epsilon^2(1-\beta)}\ln T + \left(4+\frac{2\pi ^2}{3}\right)\frac{L\sqrt{D}}{1-\beta}.
%	\end{equation}
%\end{theorem}
%We treat each Voronoi cell as an arm and conduct a regret analysis inspired by multi-armed bandits \citep{MABs,bandits_graph}. Theorem \ref{RT_upper_bound} provides the upper bound for the cumulative expected regret, with a proof available in Appendix 2. 
%Since $E[R(T)]\in \mathcal{O}(\sqrt{D}\ln(T))$, VGT performs well for ultra HD problems with a large value of $D$. 
%For the ideal case where $\beta = \frac{1}{2}$, indicating that the new sample $\bm{x}_{t+1}$ generated in Voronoi cell $\text{Vor}(v^*_{t,h_t^*|\mathcal{G}_t})$ precisely bisects the hypervolume, the regret converges rapidly.

%\vspace{-2mm}
\subsection{Complexity Analysis}
%\vspace{-2mm}
%\paragraph{Computation Complexity Analysis.} 
The computational burden of the VGT algorithm mainly originates from two factors: training the VNGP model and optimizing the acquisition function. For each iteration $t$, fitting the VNGP model incurs a complexity of $\mathcal{O}(K^3)$. Optimizing the acquisition function involves the prediction complexity of the VNGP model, which is $\mathcal{O}(K^2)$, as well as the complexity of the nearest neighbor search (NNS). 
The complexity of NNS depends on the specific implementation. In our approach, we utilize the popular \textit{k-d} tree for NNS, which involves depth-first tree traversal and backtracking. The backtracking operation typically grows exponentially with the dimension $D$, resulting in a linear query complexity for high-dimensional problems. 
In many scenarios, due to the curse of dimensionality, the \textit{k-d} tree struggles to outperform brute-force search, which has a search complexity of $\mathcal{O}(D\cdot N)$. This work is primarily focused on enhancing the sample efficiency of HDBO, and we do not delve further into the challenges associated with high-dimensional NNS. 
Initially, the computational cost is dominated by the VNGP model, while in later iterations, NNS becomes the main factor affecting computational efficiency. For a more detailed description of the computational complexity of VGT, please refer to Appendix~\ref{detail complexity}. 

%\vspace{-2mm}
\subsection{Insights}
%\vspace{-2mm}
In this paper, we propose VGT as an efficient global optimization approach for complex and heterogeneous problems over HD search space. 
VGT divides the design space into Voronoi cells and traverses the graph to achieve global exploration. 
%By combining graph traversal with promising cell sampling, VGT guides towards global optimum along a valley with small function values, as shown in Fig.\ref{fig:Valley}. Regions with poor function values are effectively avoided with a few additional samples. 
By combining graph traversal with promising cell sampling, VGT guides the search towards the optimal region along a valley with small function values, as shown in Fig.~\ref{fig:Valley}. Regions with poor function values are effectively avoided with a few additional observations. 
While VGT operates within the localized BO framework, maintaining a global perspective is crucial for ensuring the quality of convergence. In addition to the graph traversal strategy, VGT employs a restart mechanism to enhance its global search capability. If there is no reduction in the objective function value over several consecutive iterations, the algorithm moves to the Voronoi cell with the minimal depth $h$ and creates a new search path. 

The Voronoi diagram in VGT provides a fine-grained partition of the search space, with each observation contributing to the update of the geometry of the promising region and guiding the search direction. Compared to TuRBO, which employs hyper-rectangular trust regions, VGT leverages the geometric boundaries defined by each observation, and adapts more effectively to irregular, multi-modal, and heterogeneous function landscapes. 
Compared to La-MCTS, which utilizes SVM for domain decomposition, VGT exhibits lower computational complexity and superior scalability to high-dimensional heterogeneous problems, as the SVM boundary inherently relies on the adaptation of the kernel function to the objective function landscape. 



\begin{figure}[ht] 
	\begin{center}
		\includegraphics[width=0.47\textwidth]{figures/alg_frame/Valley_VGT.pdf}  %,height=0.4\textwidth
		%\vspace{-4.5mm}
		\caption{An illustration of the traversal path of VGT. Most observations are concentrated in the promising valley. The traversal path is marked by red arrows.} 
		\label{fig:Valley}
	\end{center}
\end{figure}

\begin{figure}%[h]
	\centering
	%\vspace{-0.35cm}
	\setlength{\abovecaptionskip}{2pt}
	\subfigtopskip=1pt
	\subfigcapskip=1pt
	\subfigure{
		\includegraphics[width=0.5\columnwidth]{figures/results_low_dim/Ackley5D_v0.pdf}
		\includegraphics[width=0.5\columnwidth]{figures/results_low_dim/Griewank5D_v0.pdf}
	}
	\subfigure{
		\includegraphics[width=0.5\columnwidth]{figures/results_low_dim/Rosenbrock5D_v0.pdf}
		\includegraphics[width=0.5\columnwidth]{figures/results_low_dim/Hartmann6D_v0.pdf}
	}
	\subfigure{
		\includegraphics[width=1.0\columnwidth]{figures/results_low_dim/legend_v0.pdf}
	}
	%\vspace{-2.5mm}
	\caption{Optimization results for low-dimensional synthetic benchmarks. }
	\label{fig:low dim opt results}
	%\vspace{-2.5mm}
\end{figure}

%\vspace{-3mm}
\section{Experimental Results}
\label{Experimental Results}
%\vspace{-1mm}

We conduct a thorough evaluation of VGT's performance using a diverse set of HD experiments. Our experiments encompass both synthetic functions, such as Ackley, Griewank, Rosenbrock, and Hartmann6, and real-world applications, including vehicle design (124D MOPTA08), machine learning tasks (388D SVM training), and analog circuit optimization (36D opamp, 77D phase-locked loop).
Except for the experiments in Sec.~\ref{dummy_experiments} involving additional dummy dimensions, all other benchmarks are based on real dimensions and challenging for optimization algorithms. 


%\paragraph{Baselines.} 
To provide a comprehensive evaluation, we compare the performance of VGT against a wide range of state-of-the-art baselines, including the local BO methods MCTD \citep{MCTD}, TuRBO \citep{TuRBO} and La-MCTS \citep{LAMCTS}, the subspace embedding-based approaches MCTS-VS \citep{MCTSVS} and HesBO \citep{HesBO}, the popular evolutionary algorithm CMA-ES \citep{CMAES}, the simplex method Nelder-Mead \citep{NelderMead}, and Random Search. 
All experiments are conducted on a Linux workstation equipped with Intel Xeon Gold 6230 @2.1GHz CPUs and 128GB memory. To account for random variations, each experiment is repeated 10 times with different random seeds.
For more detailed experimental settings, sensitivity analysis of the hyper-parameters, and additional experimental results, please refer to Appendix \ref{MoreExperiments}.




\subsection{Synthetic Functions}

\subsubsection{Low-dimensional Synthetic Benchmarks}


We first provide a set of low-dimensional synthetic benchmarks to evaluate the performance of the proposed VGT algorithm, including Ackley, Griewank, Rosenbrock, and Hartmann6. The experimental results are presented in Fig.~\ref{fig:low dim opt results}. For these small-scale problems, we can explicitly compute the Voronoi boundaries and neighbors. We therefore compare the performance of \texttt{VGT-lowdim}, which uses explicit Voronoi boundaries and neighbors, with that of \texttt{VGT}, which uses the Voronoi graph approximated by K-NNS. Experimental results indicate that the Voronoi graph approximation approach proposed in Sec.~\ref{scale HD space} does not compromise VGT's sample efficiency. Both of the aforementioned VGT methods achieve superior sampling efficiency and better solutions compared to state-of-the-art baselines, with TuRBO and MCTD following behind. The performance of the subspace embedding-based approaches, HesBO and MCTS-VS, is inferior to the local BO methods.







\begin{figure*}[ht]
	\centering
	%\vspace{-0.35cm}
	\setlength{\abovecaptionskip}{1pt}
	\subfigtopskip=2pt
	\subfigbottomskip=2pt
	\subfigcapskip=0pt
	
	\subfigure{
		\includegraphics[width=0.66\columnwidth]{figures/results_funcs/Ackley100D_v1.pdf}
		\includegraphics[width=0.66\columnwidth]{figures/results_funcs/Ackley200D_v1.pdf}
		\includegraphics[width=0.66\columnwidth]{figures/results_funcs/Ackley1000D_v1.pdf}
	}
	
	\subfigure{
		\includegraphics[width=0.66\columnwidth]{figures/results_funcs/Griewank100D_v1.pdf}
		\includegraphics[width=0.66\columnwidth]{figures/results_funcs/Griewank200D_v1.pdf}
		\includegraphics[width=0.66\columnwidth]{figures/results_funcs/Griewank1000D_v1.pdf}
	}
	
	\subfigure {
		\label{fig6:c}
		\includegraphics[width=1.95\columnwidth]{figures/results_funcs/legend2.pdf}
	}
	\caption{ Optimization results for high-dimensional synthetic benchmarks are presented. All benchmarks are multi-modal and challenging for global optimization algorithms, with each having a unique global optimum of 0.}
	%\vspace{-2.5mm}
	\label{fig:high dimensional opt results}
\end{figure*}





\subsubsection{High-dimensional Synthetic Benchmarks}
\label{High-dimensional Synthetic Benchmarks}
We evaluate the performance of VGT and compare it against various baselines on benchmark functions including Ackley and Griewank. Both functions are evaluated in 100D, 200D, and 1000D settings, and are challenging for global optimization algorithms. 


Fig.\ref{fig:high dimensional opt results} gives a visual comparison of VGT against the baselines. In the 100D and 200D scenarios, VGT consistently outperforms MCTD for both benchmarks, with CMA-ES and TuRBO closely following. The subspace embedding-based method HesBO and variable selection-based method MCTS-VS show unsatisfactory performance when applied to the full-dimensional problems.
%For the Rastrigin benchmark, VGT initially lags behind MCTD due to its strategy of investing more observations in exploring the search space and assessing potential regions. However, VGT eventually surpasses them and achieves the best final results.





For the 1000D ultra high-dimensional scenarios, MCTD encounters difficulties when applying its local descent strategy, resulting in premature termination. The curve of La-MCTS is also missing due to the computation time exceeding 200 hours, highlighting the computational challenges of the problem. 
%To assess the algorithm's performance in scaling to ultra high-dimensional scenarios, we conducted experiments using the Ackley1000D benchmark, where all dimensions are considered valid. %The results of comparing VGT with state-of-the-art high-dimensional methods are presented in Figure \ref{fig:uhd results}.
%In the extremely high-dimensional case, MCTD encounters difficulties when applying its local descent strategy, resulting in premature termination. The curve of LA-MCTS is also incomplete due to the computation time exceeding 100 hours, highlighting the computational challenges of the problem.
Among the compared algorithms, only VGT demonstrates efficient and stable descent for the ultra high-dimensional cases. CMA-ES and other HDBO methods, including MCTS-VS and TuRBO, lag far behind VGT. 
These results demonstrate that VGT is a highly effective algorithm for ultra HD optimization problems, outperforming other state-of-the-art methods in terms of both sample efficiency and quality of solutions.



\begin{figure}[ht]
	\centering
	%\vspace{-0.35cm}
	\subfigure{
		%\includegraphics[width=0.4\columnwidth]{figures/results_highdim/Ackley1000D.pdf}
		\includegraphics[align=c, width=0.5\columnwidth]{figures/results_highdim/Hartmann6_500D_v1.pdf}
		\includegraphics[align=c, width=0.5\columnwidth]{figures/results_highdim/Ackley10_500D_v1.pdf}
	}
	%\vspace{-2mm}
	\caption{ Experiments with additional dummy dimensions.  }
	\label{fig:hd results}
\end{figure}



\subsubsection{Scenarios with Additional Dummy Dimensions}
\label{dummy_experiments}
To evaluate the effectiveness of the proposed subspace sampling method, we conduct experiments with additional dummy dimensions in the Hartmann6D and Ackley10D functions. By extending the dimensions to 500D through the addition of independent dummy variables, we investigate two scenarios of VGT: single full-dimensional sampling \texttt{VGT} and effective subspace sampling \texttt{VGT-subspace}. 
For the subspace sampling, we set $R_p=1/1$ to enhance the subspace exploitation while also maintaining global search capability.
The numerical results, depicted in Fig.~\ref{fig:hd results}, demonstrate that \texttt{VGT-subspace} with subspace sampling exhibits the fastest descent rate among the compared algorithms, closely followed by \texttt{VGT} with full-dimensional sampling. The variable selection-based MCTS-VS shows the ability to capture the valid dimensions and achieves satisfactory results. CMA-ES also demonstrates good adaptability to these problems. However, MCTD falls behind TuRBO and is not well-suited for these intrinsically low-dimensional problems.
These findings validate the effectiveness of the subspace sampling method in VGT, which allows for sample-efficient optimization even in the presence of additional dummy dimensions.





%\vspace{-1mm}
\subsection{Real-world Applications}
For the real-world optimization problems, we focus on two analog integrated circuit optimization problems based on the open-source benchmark circuits \citep{analogbenchmark}, a 36D opamp circuit and a 77D phase-locked loop (PLL). 
Additionally, we consider a 124D soft-constrained vehicle design problem MOPTA08, as well as a 388D SVM training task. 
For these real-world problems with unknown dimensional structures, we use the parameter setting $R_p=1/4$ for subspace sampling, which allows us to explore potential EDR directions with a small number of observations. 

\begin{figure}[ht]
	\centering
	%\vspace{-0.35cm}
	\setlength{\abovecaptionskip}{2pt}
	\subfigtopskip=2pt
	\subfigbottomskip=2pt
	\subfigcapskip=0pt
	\subfigure{
		\includegraphics[width=0.5\columnwidth]{figures/results_circuits/OPAMP36D_v1.pdf}
		\includegraphics[width=0.5\columnwidth]{figures/results_circuits/PLL77D_v1.pdf}
	}
	\subfigure{
		\includegraphics[width=0.5\columnwidth]{figures/results_ml_tasks/Mopta08_124D_v1.pdf}
		\includegraphics[width=0.5\columnwidth]{figures/results_ml_tasks/SVM388D_v1.pdf}
	}
	%\vspace{-2.5mm}
	\caption{Optimization results for real-world applications. Each practical problem is transformed into a scalar minimization problem. }
	\label{fig:ML tasks opt results}
	%\vspace{-2.5mm}
\end{figure}

For the opamp circuit, the objective is to minimize the \texttt{Iddq} with three specification constraints. We formulate a scalar objective function with soft penalties to address the circuit design, which involves 36 free parameters related to transistor sizes and capacitor areas. The circuit performance is obtained from the SPICE simulator, and the objective function exhibits heterogeneity due to the piecewise device model. VGT outperforms other methods in this case. MCTD achieves a similar descent speed to TuRBO, indicating that the stochastic three-point descent of MCTD cannot provide an advantage in this scenario. The simplex method, Nelder-Mead, fails to find the feasible region. For the PLL circuit, the objective is to minimize the average current consumption (\texttt{Iddavg}) while maintaining the output peak-to-peak voltage. We focus on optimizing the charge pump and voltage-controlled oscillator (VCO) components. Behavioral models of logic gates are used to reduce simulation time. The PLL has 77 design parameters related to device sizes. Results show that VGT still outperforms other methods in this case, with HesBO following. 

For the 124D soft-constrained vehicle design problem MOPTA08 and the 388D SVM training task, VGT maintains its superiority over other methods in these benchmarks. For MOPTA08, VGT converges to the optimum with a small number of samples, while MCTD, TuRBO, MCTS-VS, and CMA-ES achieve similar final results. In the case of SVM388D, VGT outperforms other baselines by a large margin, highlighting its superiority for high-dimensional problems. MCTD, TuRBO, and CMA-ES also display good performance in high-dimensional settings, while other methods fail to find reasonable solutions. Please refer to Fig.\ref{fig:ML tasks opt results} for a visual representation of the results.



%\vspace{-1mm}
\section{Conclusions}
%\vspace{-3mm}
We propose a novel Voronoi graph traversing method for scaling BO to ultra high-dimensional input space. We utilize a UCB-based graph traversing strategy to navigate the search direction in high-dimensional space. Local exploitation efficiency is ensured by sampling within the promising Voronoi cell. Moreover, we provide an efficient subspace BO sampling by restricting BO to the effective subspace extracted using LSIR. 
Experiments on the ultra high-dimensional benchmarks spanning up to 1000D demonstrate the remarkable advantages of the VGT algorithm for solving problems in ultra high-dimensional input space. The extension of VGT to ultra high-dimensional constrained optimization, multi-objective optimization, and distributed parallel computing is a focus of future research.


\begin{acknowledgements} % will be removed in pdf for initial submission,
	% (without ‘accepted’ option in \documentclass)
	% so you can already fill it to test with the
	% ‘accepted’ class option
	%Briefly acknowledge people and organizations here.
	%\emph{All} acknowledgements go in this section.
	This research is supported partly by the National Natural Science Foundation of China (NSFC) research projects 62141407, 62304052, 92373207, and 12331009.
\end{acknowledgements}



\newpage
%\bibliographystyle{unsrt}
\bibliography{reference.bib}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\onecolumn
\appendix

















\section{Convergence Procedure of VGT}\label{VGT_detail}
%\subsection{Convergence Procedure of VGT}
In this section, we present a detailed illustration of the iterative and convergence process of the VGT algorithm, depicted in Figure \ref{fig:figure_iter}, to facilitate readers' in-depth comprehension of VGT's search mechanism in high-dimensional spaces. 



\begin{figure}[ht] 
	\includegraphics[width=1.0\textwidth]{figures/fig_appendix/VGT_12iters.pdf}  %,height=0.4\textwidth
	%\vspace{-3mm}
	\caption{An illustration of the convergence procedure in VGT. 
	} 
	\label{fig:figure_iter}
\end{figure}

In high-dimensional spaces, limited observations result in highly imprecise GP models, rendering BO prone to blind exploration. The VGT algorithm introduces Voronoi domain decomposition to partition the high-dimensional space into convex Voronoi cells. Global exploration of space is achieved through traversal along the Voronoi graph. Subsequently, the Voronoi cell's geometric boundaries are employed to direct the sampling for local BO. 
To guarantee the global coverage of the entire space, in each iteration, we aim to position the sampling for local BO as near as possible to the edge of the promising Voronoi cell, typically around the vicinity of the dashed ellipse in Figure \ref{fig:figure_iter}. 


The algorithm starts with a random initial point, e.g., $v_0$ as depicted in Figure~\ref{fig:figure_iter}(a), representing the entire search space $\mathcal{X}$.
By iteratively performing path selection, promising cell sampling, and graph expansion operations, new cells are generated. The ``good'' observations (e.g., $v_4$ in Figure~\ref{fig:figure_iter}(e)) direct the algorithm towards more promising regions (indicated by the red arrow). 
Conversely, the ``bad'' observations, like $v_1$, $v_2$, $v_3$, and $v_8$, guide the algorithm away from under-performing areas.
With the progression of iterations, the promising cell gradually contracts and converges towards the global optimum.


%\subsection{The Voronoi Graph Traversing (VGT) Algorithm}
%Algorithm \ref{alg:VGT} illustrates the algorithm flow of the proposed Voronoi Graph Traversing (VGT) algorithm. 








\section{Computational Complexity Analysis}
\label{detail complexity}
The computational burden of the VGT algorithm mainly originates from two factors: training the VNGP model and optimizing the acquisition function. 
The training and prediction complexity of the VNGP model is influenced by the number of neighbors of the selected node. In the Voronoi graph constructed with $K$ nearest neighbor search (NNS), the promising node is connected to its $K$ nearest neighbors. Consequently, the data size for training the VNGP model is $K$.

We consider a VNGP model with a training dataset $(X_K,\bm{y}_K)$ of size $K$. The predictive mean $\mu(\bm{x})$ and variance $\sigma^2(\bm{x})$ at $\bm{x}\in\mathcal{X}$ are:    
\begin{equation}
	\label{mu_cov_predict}%
	\begin{split}
		&\mu(\bm{x})= m(\bm{x}) +\bm{k}_{K}^\text{T} C_{K}^{-1}(\bm{y}_K-m(X_K)),\\
		&\sigma^2(\bm{x}) = k(\bm{x},\bm{x})-\bm{k}_{K}^\text{T} C_{K}^{-1}\bm{k}_{K},
	\end{split}
\end{equation}
where $\bm{k}_{K}\!=\![k(\bm{x},\bm{x}_0),\!\cdots\!,k(\bm{x},\bm{x}_{K-1})]^\text{T}$ and $C_K$ is the $K\times K$ covariance matrix.
The model parameters are tuned by maximizing the log marginal likelihood function:
\begin{equation}
	\label{log maiginal likelihood}%
	\begin{split}
		L(\theta)=-\frac{1}{2}\bm{y}^\text{T} C_{K}^{-1} \bm{y} -\frac{1}{2}\log\big|C_{K}\big|-\frac{K}{2}\log(2\pi).
	\end{split}
\end{equation}
Hence, training the VNGP model has a complexity of $\mathcal{O}(K^3)$ due to the covariance matrix inversion in Eq.~\eqref{log maiginal likelihood}. The prediction complexity in Eq.~\eqref{mu_cov_predict} is $\mathcal{O}(K^2)$.




Another important computational cost arises from the optimization of the acquisition function within the Voronoi cell at each iteration $t$, which involves the $\mathcal{O}(K^2)$ prediction complexity of the VNGP model and the complexity of NNS with a \textit{k-d} tree. The \textit{k-d} tree has a construction complexity of $\mathcal{O}(D\cdot N\cdot \log(N))$ and a worst-case query complexity of $\mathcal{O}(D\cdot N)$.





\section{EXPERIMENTS}\label{MoreExperiments}

\subsection{Runtime Comparison}

Table \ref{tbl:runtime} presents a comparison of the experimental results and the average runtime for a total of 2000 function evaluations.

\begin{table}[h]
	\centering
	%\vspace{-2.5mm}
	\caption{Runtime comparison of Bayesian optimization methods}
	%\vspace{-2mm}
	\label{tbl:runtime}
	\resizebox{0.98\textwidth}{!}{
		\begin{tabular}{cccccccccccc}
			\toprule
			\multirow{2}{*}{Algorithms} & \multicolumn{2}{c}{Ackley100D} &  & \multicolumn{2}{c}{Ackley200D} & & \multicolumn{2}{c}{Griewank100D} &  & \multicolumn{2}{c}{Griewank200D} \\
   \cline{2-3}\cline{5-6}\cline{8-9}\cline{11-12}
     & $f(\bm{x})$ & Runtime  & & $f(\bm{x})$ & Runtime & & $f(\bm{x})$ & Runtime &  & $f(\bm{x})$ & Runtime\\
			\midrule
			VGT & $\bm{0.44\pm 0.42}$ & \textbf{1.0h} & & $\bm{2.03\pm 0.42}$ &1.6h& &$\bm{0.90\pm 0.13}$ & 2.2h & & $\bm{195.6\pm 194.4}$ & 2.2h\\
			MCTD & $1.12\pm 0.20$ & 14.1h & & $3.28\pm 1.26$ &25.6h& &$13.1\pm 6.6$ & 4.8h & & $259.4\pm 65.1$ & 4.5h\\
			MCTS-VS & $8.33\pm 0.50$ & 1.2h & & $10.17\pm 0.49$ &\textbf{1.2h}& &$527.0\pm 60.9$ & \textbf{1.2h} & & $1327.0\pm 55.4$ & \textbf{47min}\\
                TuRBO & $5.01\pm 0.30$ & 1.5h & & $7.78\pm 0.22$ &1.4h& &$107.3\pm 30.6$ &1.6h & & $703.7\pm 26.7$ & 53min\\
			LaMCTS & $11.93\pm 0.12$ & 5.3h &  & $12.72\pm 0.15$ & 10.9h & &$1307.0\pm 192.5$ & 4.9h & & $3162.5\pm 163.8$ & 6.6h\\
                LaMCTS-TuRBO & $9.66\pm 0.52$ & 3.7h &  & $11.48\pm 0.11$ &2.6h& &$703.9\pm 68.0$ &17.7h & & $1789.3\pm 151.5$ & 7.5h\\
                GP-EI & $12.50\pm 0.13$ & 1.6h &  & $13.15\pm 0.16$ &3.9h& &$1382.9\pm 68.6$ & 3.1h & & $3203.1\pm 75.0$ & 3.7h \\
			\bottomrule
		\end{tabular}
	}
	%\vspace{-3.5mm}
\end{table}



\subsection{Additional Experimental Results}

\begin{figure}[ht]
	\centering
	%\vspace{-0.35cm}
	
	\subfigure{
		\includegraphics[width=0.33\columnwidth]{figures/results_funcs/Rosenbrock100D_v1.pdf}
		\includegraphics[width=0.33\columnwidth]{figures/results_funcs/Rosenbrock200D_v1.pdf}
		\includegraphics[width=0.33\columnwidth]{figures/results_funcs/Rosenbrock1000D_v1.pdf}
	}
	
	\subfigure{
		\includegraphics[width=0.33\columnwidth]{figures/results_funcs/Rastrigin100D_v1.pdf}
		\includegraphics[width=0.33\columnwidth]{figures/results_funcs/Rastrigin200D_v1.pdf}
		\includegraphics[width=0.33\columnwidth]{figures/results_funcs/Rastrigin1000D_v1.pdf}
	}
	
	\subfigure {
		\includegraphics[width=1.0\columnwidth]{figures/results_funcs/legend2.pdf}
	}
	\caption{ Optimization results for high-dimensional synthetic benchmarks. }
	\label{fig:high dimensional opt results_appendix}
\end{figure}

For the additional experiments, we consider the valley-shaped function Rosenbrock and the multi-modal function Rastrigin. Both functions are evaluated in dimensions of 100D, 200D, and 1000D, posing challenges for global optimization algorithms. 


Fig. \ref{fig:high dimensional opt results_appendix} presents the experimental results of 10 repeated runs. For the 100D and 200D Rosenbrock function, VGT consistently outperforms MCTD, with CMA-ES and TuRBO closely following behind. CMA-ES and TuRBO exhibit comparable sample efficiency for this case. The subspace embedding-based method HesBO and variable selection-based method MCTS-VS show unsatisfactory performance when applied to full-dimensional problems. 

For the 100D and 200D Rastrigin benchmarks, VGT initially lags behind MCTD due to its strategy of investing more observations in exploring the search space and assessing potential regions. However, VGT eventually surpasses it and achieves the best final solutions.

For the ultra-high-dimensional 1000D scenarios, MCTD still fails to handle the search space. Similarly, the local BO method LA-MCTS also fails due to its high computation cost caused by the SVM boundary for domain decomposition.
In contrast, the proposed VGT algorithm continues to lead the other baselines, and its margin over them is much larger in the high-dimensional cases. This demonstrates that the VGT algorithm exhibits much higher sampling efficiency for high-dimensional problems.

Fig. \ref{fig:samples} displays a scatter plot of observations, using Ackley200D as an example.
The distribution of observations in the local BO methods TuRBO and MCTD is relatively concentrated throughout the optimization process, limiting the algorithm's global exploration ability in the early stage. 
Conversely, the distribution of observations of the variable selection method MCTS-VS is overdispersed, indicating its weak local exploitation ability and difficulty in achieving rapid descent in the objective function value. 
In contrast, the observations of VGT are dispersed in the early stage, providing more exploration of the search space. As the iteration progresses, the observations tend to concentrate during the late stage, facilitating more focused exploitation of the search space.


\begin{figure}[ht]
	\centering
	%\vspace{-0.35cm}
	
	\subfigure{
	\includegraphics[width=0.6\columnwidth]{figures/hyperpara/samples.pdf}
}
	
	\caption{ Scatter plot of observations, using Ackley200D as an example. }
	\label{fig:samples}
\end{figure}

\subsection{Sensitivity Analysis of Hyper-parameters}

%Fig. \ref{fig:hyperpara} 

We further investigate the sensitivities of the hyper-parameters of VGT, including $C_p$ for exploration and exploitation balance, the number of neighbors $K$ used for approximating the Voronoi graph, and the subspace exploitation ratio $R_p$. The corresponding experimental results are visualized in Fig.~\ref{fig:hyperpara}.


\paragraph{Exploration \& exploitation balance parameter $C_p$} The hyper-parameter $C_p$ balances the exploitation in the best cell and the exploration of sparse areas with fewer visits. 
A large value of $C_p$ prioritizes exploration over exploitation, leading to the algorithm traversing under-explored regions of the search space. This can result in a reduced convergence rate, as shown in Fig. \ref{para:Cp} for $C_p = 1$ and $C_p = 5$. Setting $C_p$ within a reasonable range generally does not significantly impact the optimization ability of the algorithm. 

\paragraph{Number of neighbors $K$ } The hyper-parameter $K$ determines the number of nearest neighbors used to approximate the Voronoi graph, which directly affects the performance and computational complexity of the algorithm. 
A small value of $K$ can lead to a poorly fitted VNGP model, which is not instructive for the optimization process, e.g., $K=20$ in Fig.~\ref{para:K}.
Meanwhile, a larger value of $K$ improves accuracy at the cost of increased computational burden. Therefore, selecting an appropriate value of $K$ according to the problem size is important to achieve a balance between performance and computational efficiency.
For example, the choice $K=80$ for the 200D problem in Fig. \ref{para:K} yields satisfactory results. 
However, if $K$ is further increased, the improvement in sampling efficiency is not significant.
%The choice of $K$ should strike a balance between computational efficiency and accuracy based on the specific characteristics of the problem and available computational resources.


\paragraph{Subspace exploitation ratio $R_p$} 
The parameter $R_p$ controls the ratio to perform subspace BO sampling. It is essential to select an appropriate $R_p$ according to the intrinsic dimension of the problem.
A larger value of $R_p$ can lead to a degradation of the dimension ``diversity'' of the observations, which can cause the optimization to become trapped in a sub-optimal region.
In our experiments, we observe that just a small $R_p$ can effectively extract the EDR subspace and accelerate the optimization process for problems with redundant dimensions. 
For instance, setting $R_p=1/3$ is sufficient for the Ackley10\_500D problem in Fig. \ref{para:Rp}. 



\begin{figure}[htbp]
	\centering
	%\vspace{-0.35cm}
	
	\subfigure[$C_p$]{
		\label{para:Cp}
		\includegraphics[width=0.315\columnwidth]{figures/hyperpara/Cp_para_v1.pdf}
	}
	\subfigure[$K$]{
		\label{para:K}
		\includegraphics[width=0.315\columnwidth]{figures/hyperpara/K_para_v1.pdf}
	}
	\subfigure[$R_p$]{
		\label{para:Rp}
		\includegraphics[width=0.315\columnwidth]{figures/hyperpara/Rp_para_v1.pdf}
	}
	
	\caption{ Sensitivity analysis of hyper-parameters. }
	\label{fig:hyperpara}
\end{figure}


\subsection{Experimental Settings}\label{Experimental Settings}
We use the open-source implementations of the baselines provided by the authors: MCTD\footnote{\url{https://github.com/yazhai/mctd}}, MCTS-VS\footnote{\url{https://github.com/lamda-bbo/MCTS-VS}}, TuRBO\footnote{\url{https://github.com/uber-research/TuRBO}}, LA-MCTS\footnote{\url{https://github.com/facebookresearch/LaMCTS}} and HesBO\footnote{\url{https://github.com/aminnayebi/HesBO}}.
For CMA-ES, we use the \texttt{pycma} library\footnote{\url{https://github.com/CMA-ES/pycma}}, and for Nelder Mead, we use the Python implementation\footnote{\url{https://github.com/fchollet/nelder-mead}}. We adopt the default hyper-parameter settings by the authors. The detailed experimental configuration is as follows:


\paragraph{MCTD} We use the authors' default parameter settings with $C_d=10$ for the weight of recent improvement, $C_p=0.5$ for the weight of exploration, $C_p'=0.1$ for branch exploration and $C_d''=50,C_p''=0.1$ for leaf exploration.

\paragraph{TuRBO} To achieve better performance, we consider a single trust region for TuRBO. The batch size is set to 50 for the 1000D benchmarks and 20 for other benchmarks. 

\paragraph{MCTS-VS} We use the authors' default parameter settings with $k=20$, $C_p=0.1$, feature batch size $N_v=2$ and sample batch size $N_s=3$.

\paragraph{LA-MCTS} We use the parameter settings leaf size=20, $C_p=0.1$ and gamma type=``auto''.
For the Ackley, Rastrigin and SVM388D benchmarks, the poly kernel is used for the boundary. For other benchmarks, the RBF kernel is used.
For LaMCTS-TuRBO, we use 20 initial points and a total of 50 evaluations for each TuRBO iteration.

\paragraph{HesBO} We set the low dimension $d=20$ and use the box size $[-0.5,0.5]^d$ to reduce blind exploration.

\paragraph{CMA-ES} We run CMA-ES with $\sigma=0.1$ and the default population size $p=4+\lfloor 3\cdot\log D\rfloor$.
\paragraph{Nelder Mead} We use the parameter settings with $\alpha=4,\gamma=8,\rho=0.1$ and $\sigma=0.1$.

%\subsection{Experimental Settings}
%\subsection{VGT}
%\paragraph{VGT}
The local BO of VGT is implemented with the \texttt{GPyTorch}\footnote{\url{https://github.com/cornellius-gp/gpytorch}} library and EI is used as the acquisition function. We use the synthetic functions from the SFU benchmarks\footnote{\url{https://www.sfu.ca/~ssurjano/optimization.html}}, the IEEE analog benchmark circuits\footnote{\url{https://sagroups.ieee.org/2427/analogue-benchmark-circuits/}}, and the vehicle design problem Mopta\_08 and SVM training task used in prior work\footnote{\url{https://arxiv.org/pdf/2103.00349.pdf}}. The detailed experimental setups and hyper-parameters of VGT are summarized in Table~\ref{tbl:function_set}.

%analog circuit %\footnote{\url{https://sagroups.ieee.org/2427/analogue-benchmark-circuits/}}


\begin{table}[ht]
	\centering
	%\vspace{-2.5mm}
	\caption{Summary of experimental settings}
	%\vspace{-2mm}
	\label{tbl:function_set}
	\resizebox{0.98\textwidth}{!}{
		\begin{tabular}{cccccccc}
			\toprule
			Benchmarks & Dimension ($D$) & Search space  & Initial points & Iteration ($T$)& $K$ & $C_p$ & $R_p$\\
			\midrule
			Ackley5D & 5 & $[-5,10]^{5}$ & 10 & 60 &20&0.1&$-$ \\
			Griewank5D & 5 & $[-500,500]^{5}$ & 10 & 60 &20&0.1&$-$ \\
			Rosenbrock5D & 5 & $[-2.048,2.048]^{5}$ & 10 & 60 &20&2&$-$ \\
			Hartmann6D & 6 & $[0,1]^{6}$ & 10 & 60 &20&0.1&$-$ \\
			\midrule
			Ackley100D & 100 & $[-5,10]^{100}$ & 50 & 2000 &100&0.1&$-$ \\
			Ackley200D&200& $[-5,10]^{200}$ &50 &2000 &160&0.1&$-$ \\
			Ackley1000D&1000& $[-5,10]^{1000}$ &50 & 5000  &300&0.1&$-$ \\
			Griewank100D&100& $[-500,500]^{100}$ &50 & 2000 &100&0.01&$-$ \\
			Griewank200D&200& $[-500,500]^{200}$ &50 & 2000 &160&0.01&$-$ \\
			Griewank1000D&1000& $[-500,500]^{1000}$ &50 & 5000 &300&0.01&$-$ \\
			\midrule
			Rosenbrock100D&100& $[-2.048,2.048]^{100}$ &50 & 2000 &100&2&$-$ \\
			Rosenbrock200D&200& $[-2.048,2.048]^{200}$ &50 & 2000 &160&2&$-$ \\
			Rosenbrock1000D&1000& $[-2.048,2.048]^{1000}$ &50 & 5000 &300&2&$-$ \\
			Rastrigin100D&100& $[-5.12,5.12]^{100}$ &50 & 2000 &100&5&$-$ \\
			Rastrigin200D&200& $[-5.12,5.12]^{200}$ &50 & 2000 &160&10&$-$ \\
			Rastrigin1000D&1000& $[-5.12,5.12]^{1000}$ &50 & 5000 &300&10&$-$ \\
			\midrule
			Hartmann6\_500D&500& $[0,1]^{500}$ &10 & 1000 &30 & 0.1 & $1/1$ \\
			Ackley10\_500D&500& $[-5,10]^{500}$ &10 & 1000 & 40 & 0.1 & $1/1$ \\
			\midrule
			OPAMP36D&36&$[0,1]^{36}$ & 10 & 500 &60&0.1&$1/4$\\
			PLL77D&77&$[0,1]^{77}$ & 10 & 500 &40&0.1&$1/4$\\
			Mopta08\_124D&124&$[0,1]^{124}$ & 10 & 1000 &80&0.05&$1/4$\\
			SVM388D&388& $[0,1]^{388}$ & 10 & 2000 &120&0.001&$1/4$\\
			\bottomrule
		\end{tabular}
	}
	%\vspace{-3.5mm}
\end{table}




\vfill

\end{document}
