
\begin{figure*}[htb]
   \centering
   %\includegraphics[width=0.7\columnwidth,height=!]{figures/synth_100d.pdf}
      \includegraphics[width=0.75\linewidth,height=!]{figures/100d-synth.pdf}
   %\includegraphics[width=0.9\columnwidth,height=!]{figures/100d-synth-3.pdf}
  % \vspace{-1mm}
   \caption{Performance (lower is better) over high dimensional synthetic problems: Levy (left) and Rastrigin (right)}
   \label{fig:100d}
  % \vspace{-1mm}
 \end{figure*}
\begin{figure*}[htb]
\begin{center}
  \includegraphics[width=1.0\linewidth,height=!]{figures/highDims.png}
\end{center}
  % \vspace{-3mm}
  \caption{Performance over medium-size dimensional problems: 36D (left) and 56D (middle) additive functions (lower is better) and the 60D rover trajectory planning (right - higher is better)}
  \label{fig:highDims}
\end{figure*}

\section{Numerical Experiments}\label{sec:num_exp}
\input{ablation/ablation}


\begin{figure*}[htb]
\begin{center}
  \includegraphics[width=0.7\linewidth,height=!]{figures/lunar-robot.png}
%   \includegraphics{lunar-robot.png}
\end{center}
  % \vspace{-5mm}
  \caption{Performance (higher is better) over the lunar landing (left) and robot pushing (right) problems}
  \label{fig:lunar-robot}
  % \vspace{-3mm}
\end{figure*}

 \begin{figure*}[htb]
  \centering
  \includegraphics[width=0.95\linewidth,height=!]{figures/synthetic.png}
%   \includegraphics{synthetic.png}
  % \vspace{-5mm}
  \caption{Performance on 10D (top) and 30D (bottom) synthetic 
  % black-box 
  functions: Ackley (left), Levy (middle) and Rastrigin (right)}
  \label{fig:synthetic}
  % \vspace{-1mm}
\end{figure*}

\subsection{Comparisons with other methods}\label{s:exp}
%After tuning the hyper-parameters of CobBO over a number of commonly used benchmarks, 
A default configuration for CobBO is used for all of the experiments. 
%The values are specified in the supplementary materials together with more experiments. 
%Extensive experiments show that 
CobBO performs on par with or outperforms a collection of state-of-the-art methods. 
%This further demonstrates the robustness of CobBO.
Most of the experiments are conducted using the same settings as in TuRBO~\cite{turbo2019}, where it is compared with a comprehensive list of baselines, including BFGS, BOCK~\cite{bock2018}, BOHAMIANN, CMA-ES~\cite{cmaes}, BOBYQA, EBO~\cite{wang18aistats}, GP-TS, HeSBO~\cite{chaudhuri2019}, Nelder-Mead and random search. 
To avoid repetition, we only show TuRBO and CMA-ES, which achieve the best performance among this list, and additionally compare with BADS~\cite{luigi2017},  
% HDBBO~\cite{zi2017}, SIR~\cite{miao2019},
Tree Parzen Estimator (TPE)~\cite{TPE2011} and Adaptive TPE (ATPE)~\cite{ATPE}. 
%The python code of the experiments will be made publicly available, together with the implementation of CobBO.  
%Since CobBO is designed for high dimensional problems, we benchmark the performance in Section~\ref{ss:highD} in high dimensions. To show that it also works in low dimensions, we conduct the low dimensional tests in Section~\ref{ss:lowDtest}.
As mentioned in Section~\ref{sec:related_work}, the embedding algorithms (e.g., REMBO~\cite{ziyuw2016} and ALEBO~\cite{letham2020}) and CobBO are based on different assumptions, which are
compared in Appendix~1. Appendix~2 presents the comparison with LineBO~\cite{linebo}.
%and thus complement each other, 
%We repeat each experiment independently for 30 times to get the 95\% confidence intervals. 
%, and d-KG~\cite{wujian2017}.
%Though LineBO~\cite{linebo} and DROPOUT~\cite{dropoutbo} are also based on subspace selection,
%they do not show comparable performance.  
%Confidence intervals are computed with the results of 30 independent experiments.

\subsubsection{High dimensional tests}\label{ss:highD}

Since each experiment in this section takes a long time to run, confidence intervals ($95\%$) over 10 independent repetitions of each problem are shown.

\noindent \emph{The 100 dimensional synthetic black-box functions (minimization):} %\label{sec:100d}
We minimize the Levy and Rastrigin functions on $[-5, 10]^{100}$ with $300$ initial points.  These two problems are challenging since they have no redundant dimensions. % in high dimensions. 
TuRBO is configured with $1$ trust region and a batch size of $100$.
 Fig.~\ref{fig:100d} (left) shows that CobBO can greatly reduce the trial complexity. 
  For Levy and Rastrigin, CobBO surpasses the final solutions of all the other methods within $2,000$ and $5,000$ trials for a total budget of $10,000$ trials, respectively. 
  The comparison with REMBO is presented separately in Appendix~1. 
%  For Levy, it finds solutions close to the final one within $1,000$ trials, and eventually reach the best solution among all the algorithms tested.
%  For Rastrigin, within $1,000$ trials CobBO surpasses the final solutions of all the other methods.  REMBO is especially compared in Section~\ref{ss:alebo}. 
 %eventually with a large margin.
 %is excluded in this comparison, since it is designed for problems with low effective dimensions but Ackley 100D is not. See the comparisons in Section~\ref{ss:alebo}. 
  
 
In order to highlight the difference in running time, we test Ackley 200D with $10,000$ trials. For a fair comparison, we change the configuration so that both TuRBO and CobBO have the same batch size of $1$. CobBO runs for $12.8$ CPU hours and TuRBO-1 runs for more than $80$ CPU hours or $9.6$ \emph{GPU} hours. Other methods either take too long to make progress or find far worse solutions.
 

\noindent \emph{Additive latent structure (minimization):}
As mentioned in Section~\ref{sec:related_work}, additive latent structures have been explored for tackling challenges in high dimensions.
%which however incur a high computational cost~\cite{chaudhuri2019}.   %For $x=(x_1, x_2, x_3, x_4)$,  
We construct two additive functions. The first one has 36 dimensions, defined as  
 $f_{36}(x)=\mathrm{Ackley}(x_1) + \mathrm{Levy}(x_2) + \mathrm{Rastrigin}(x_3) + \mathrm{Hartmann}(x_4)$, where the first three terms express the exact functions and domains described in Section~\ref{ss:lowDtest},  with the Hartmann function defined over $[0, 1]^{6}$. 
 The second has 56 dimensions, defined as 
 $f_{56}(x) = \mathrm{Ackley}(x_1) + \mathrm{Levy}(x_2) + \mathrm{Rastrigin}(x_3) + \mathrm{Hartmann}(x_4) +\mathrm{Rosenbrock}(x_5)+\mathrm{Schwefel}(x_6)$, 
 where the first four terms are the same as those of $f_{36}$, with the Rosenbrock and Schwefel functions defined over $[-5,10]^{10}$ and $[-500,500]^{10}$, respectively. 

We compare CobBO with TPE, ATPE, BADS, CMA-ES and TuRBO, each with $100$ initial points. 
Specifically, TuRBO is configured with 15 trust regions and a batch size of 50 for $f_{36}$ and $100$ for $f_{56}$. 
ATPE is excluded for $f_{56}$ as it takes more than 24 hours per run to finish. 
The results are shown in Fig.~\ref{fig:highDims}, where CobBO quickly finds the best solutions for both $f_{36}$  and $f_{56}$.


Among the other methods compared in Fig.~\ref{fig:highDims}, BADS performs close to CobBO. 
ATPE outperforms TPE, TuRBO and CMA-ES on $f_{36}$. 
TuRBO surpasses TPE and CMA-ES on $f_{36}$ eventually, while TPE and CMA-ES converge faster than TuRBO on $f_{56}$.

% \begin{figure}[htb]
% \begin{center}
%   \includegraphics[width=0.75\columnwidth,height=!]{figures/medium_v.png}
% %   \includegraphics{highDims.png}
% \end{center}
%   \caption{Performance over medium-size dimensional problems: 56D additive functions (upper) and the 60D rover trajectory planning (lower)}
%   \label{fig:highDims}
% \end{figure}

\noindent \emph{Rover trajectory planning (maximization):} 
This problem (60 dimensions) is introduced in~\cite{wang18aistats}. 
The objective is to find a collision-avoiding trajectory consisting of a sequence of 30 positions in a 2-D plane. 
%$[0,1]^{2}$. 
We compare CobBO with TuRBO, TPE and CMA-ES with a budget of $20,000$ evaluations and
$200$ initial points. 
TuRBO is configured with $15$ trust regions and a batch size of $100$, as in~\cite{turbo2019}. 
ATPE, BADS and REMBO are excluded for this problem, as they all take more than 24 hours per run. The result is shown in Fig.~\ref{fig:highDims} (right). CobBO reaches the best solution with fewer evaluations than TuRBO, while TPE and CMA-ES reach inferior solutions.








% \noindent \emph{The 200-dimensional Levy and Ackley functions (minimization):}
% We minimize the Levy and Ackley functions over $[-5, 10]^{200}$ with $500$ initial points. 
% TuRBO-1 is configured with $1$ trust region and a batch size of $100$.

%  Fig.~\ref{fig:200d} shows that CobBO can dramatically
%  reduce the trial complexity. 
%  For Levy, it quickly finds solutions close to the optimal within $1,000$ trials. 
%  All of the other tested algorithms take more than $10,000$ trials and still cannot obtain a comparable solution. 
%  For Ackley, CobBO reaches 4.0 within $1,800$ trials, while CMA-ES requires $7,000$ trials. 
%  TuRBO 
%  \textcolor{red}{(with a batch size of 100 \cite{turbo2019})}  and TPE cannot find a comparable solution within $10,000$ trials. 

%  For Levy, in Fig.~\ref{fig:low_medium_high} (upper right), CobBO reaches $100$ within $2,000$ trials, while CMA-ES and TuRBO 
%  obtain $200$ after $8,000$ trials. TPE cannot find a comparable solution within $10,000$ trials in this case. 
%  For Ackley, in Fig.~\ref{fig:low_medium_high} (lower right), TuRBO, CMA-ES and CobBO converge to the mean best values of $4.53$, $3.33$ and $2.91$ respectively after $20,000$ trials. To be consistent with Levy, we present the first $10,000$ steps that also highlight the effectiveness of CobBO at relatively low query budgets for high dimensional functions. %CobBO reaches the best solution among all of the algorithms tested. 
% %  The appealing trial complexity of CobBO suggests that it can be applied in a hybrid method, e.g., used in the first stage of the query process combined with gradient estimation methods or CMA-ES.





%   Furthermore, note that CobBO's sample variance for the Levy function across $10$ independent experiments is extremely low, as can be seen in Fig.~\ref{fig:200d}. 
  
  
\subsubsection{Low dimensional tests}\label{ss:lowDtest}
 To evaluate the performance of CobBO on low dimensional problems, we use two challenging problems of lunar landing~\cite{turbo2019}  and robot pushing~\cite{wang18aistats}, as well as classic synthetic black-box functions~\cite{TestProblems2013},  by following the setup in~\cite{turbo2019} for most of the experiments. Confidence intervals ($95\%$) over 30 independent repetitions of each problem are shown.
 
 


% \noindent \emph{The 30-dimensional classic functions:}
% We compare CobBO with TuRBO, BADS, TPE, ATPE and CMA-ES on the 30 dimensional versions of the Ackley, Levy and Rastrigin functions 
% introduced in Section \ref{ss:lowDtest}. %(except the Hartmann function that is defined to be fixed 6 dimensional)

% %   \begin{figure}[!ht]
% %   \centering
% %   \includegraphics[width=1.\columnwidth,height=!]{supplementary/30D-tests.png}
% %   \caption{Medium dimensional problems: Ackley (left), Levy (middle) and Rastrigin (right)} 
% %   \label{fig:30D-tests}
% %  \end{figure}
% As shown in Fig.~\ref{fig:synthetic}, CobBO finds the global optima of Ackley the Levy, and the best results for Rastrigin. 
% BADS is competitive with CobBO on Ackley and Levy, while it performs next to CobBO on Rastrigin.  
% CMA-ES outperforms TuRBO, TPE and ATPE on Ackley, and is comparable to TPE on the other two problems. 


 
\noindent \emph{Lunar landing (maximization):}
This controller learning problem ($12$ dimensions) is provided by the OpenAI gym and evaluated in~\cite{turbo2019}.
%The controller of a lunar lander decides whether or not to fire the booster engine and the firing direction during landing,   
%based on the current status of the lander in each frame. 
%The average performance of the controller is evaluated by simulations over %a fixed constant set of 
%50 randomly generated terrains and initial states. 
Each algorithm has 50 initial points and a budget of $1,500$ trials. 
TuRBO is configured with 5 trust regions and a batch size of 50 as in~\cite{turbo2019}.   
Fig.~\ref{fig:lunar-robot} shows that, among the $30$ independent tests, CobBO quickly exceeds $300$ along some good sample paths, outperforming other algorithms. 

\noindent \emph{Robot pushing (maximization):}
This control problem (14 dimensions) is introduced in~\cite{wang18aistats} and extensively tested in~\cite{turbo2019}.  We follow the setting in~\cite{turbo2019}, where TuRBO is configured with a batch size of 50 and 15 trust regions with 30 initial points each.  
We exclude REMBO, which takes too long per run (more than $24$ hours).  
Each experiment has a budget of $10,000$ evaluations.
On average CobBO exceeds 10.0 within 5,500 trials, while TuRBO requires about 7,000, 
as shown in Fig.~\ref{fig:lunar-robot}.
TPE and ATPE converge to around 9.0, outperforming BADS and CMA-ES with large margins. 
The latter two exhibit large variations and get stuck at local optima.


% CobBO finds the best results for the robot pushing problem, 
% slightly outperforming TuRBO, as shown in Fig. ~\ref{fig:control-additive}. 
% Both TPE and ATPE  are less competitive but still outperform BADS and CMA-ES with large margins. 
% The latter two algorithms show large variations and get stuck in suboptima at very early stages.


\noindent \emph{Classic synthetic black-box functions (minimization):}
Three popular synthetic functions ($10$ and $30$ dimensions) are chosen, including Ackley over $[-5, 10]^{10}$ and $[-5, 10]^{30}$, Levy over both $[-5, 10]^{10}$ and $[-5, 10]^{30}$, and Rastrigin over both $[-3, 4]^{10}$ and $[-3, 4]^{30}$.
%, and Hartmann(6D) with domain $[0, 1]^{6}$
%Each experiment has a budget of $500$ evaluations. 
TuRBO is configured identically to~\cite{turbo2019}, with a batch size of $10$ and $5$ concurrent trust regions where each has $10$ initial points. 
%\niv{What does it mean "$5$ trust regions" ?}
The other algorithms use $20$ initial points. 
The results are shown in Fig.~\ref{fig:synthetic}. CobBO shows competitive or better performance for all of these problems.
It finds the global optima on Ackley and Levy, and clearly outperforms the other algorithms for the difficult Rastrigin function. 
Notably, BADS, which is more suitable for low dimensions as commented in~\cite{luigi2017}, performs close to CobBO except on Rastrigin. 
TuRBO performs better than TPE and worse than BADS. ATPE outperforms TPE. % and is close to CobBO on Levy.
CMA-ES eventually catches up with TPE, ATPE and REMBO on Ackley.
For $10$ dimensions, REMBO appears unstable with large variations and is trapped at local optima. 
For $30$ dimensions, REMBO is excluded as it takes too long to finish; see Appendix~1.
%(more than 24 hours per experiment in this case). 





