\onecolumn
\icmltitle{
CobBO: Coordinate Backoff Bayesian Optimization - 
Supplementary Materials}
\appendix

% \section{More details on the key/auxiliary features and additional ablation studies} %IMPACTS OF THE KEY/AUXILIARY FEATURES} 

% % For brevity, we abuse the notation and simply write 
% % $ R\left( \hat{\mathcal{X}}_t, \mathcal{H}_t\right)=$. 
% % Their function values $\{\hat{F}_i\}$ are constructed and smoothed by RBF interpolation, hence giving $\hat{\mathcal{H}}_t=\{ (\hat{x}_i, \hat{f}_i), 1\leq i \leq t\}$.
% % Explicitly,  to optimize $f(x)$ in $\Omega_t$,  we 
% % compute the posterior of $\hat{f}(x)$ by conditioning on $\left\{ \left( \hat{x}_i, \hat{y}_i  \right)_{i=1}^{\hat{t}} \right\}$. 

% \subsection{Complement to the key features of CobBO}
%  \textbf{Escaping stagnant local optima:} 
%  In order to escape stagnant local optima, CobBO has two methods. The first method is to change $V_t$, as described in Section~2 of the paper. The threshold $\Theta_1$ for the number of consecutive fails $q_t$ before changing $V_t$ is set to $70$ if the total
%  trial budget is larger than $2000$ otherwise $\Theta_1=35$. 
%  The second method is decrease the function values around the stagnant local optima. Specifically, when the number of consecutive trials $\Theta_2$ that fail to improve the optimization process, e.g., $\Theta_2=50$ if the total
%  trial budget is larger than $2000$ otherwise $\Theta_2=25$, we temporary decrease the function values around the best point observed so far.  By doing so, the Gaussian process regression could encourage to explore other potentially more promising areas. 
 
%  \textbf{Acquisition functions}
% Typical acquisition functions include the expected improvement (EI)~\cite{marchuk1975,jones1998}, the upper confidence bound (UCB)~\cite{peter2003,srinivas2010,srinivas2012}, the entropy search~\cite{henniq2012,henrandez2014,ziw2017},  and the knowledge gradient~\cite{frazier2008,scott2011,wu2016}.  
%  Based on those candidates, CobBO uses ensemble learning for the applied acquisition function. Specifically, we use a bandit approach to select the acquisition functions by measuring the number of queried points that improves the observed function values.  
%  In addition, for UCB, the upper confidence bound typically is constructed as $\mu+\kappa \sigma$
%  where $\mu$ and $\sigma$ represent the estimated mean and variance, respectively. We choose the parameter $\kappa$ as a periodical function of $q_t$ so that $\kappa$ varies with $q_t$ within an interval, e.g., $\kappa \in [2.0, 4.0]$.
 
 \section{Default hyper-parameter configuration}
\label{sec:defalt_conf}
Table~\ref{table:hyperparameters} lists the default configuration of CobBO, which is used in all the experiments in this paper.
\begin{table}[h!]
\centering
\begin{tabular}{ |c|c|c| } 
\hline
Hyperparameter & Description & Default Value \\
\hline \hline
$\Theta$ & The threshold for the number of consecutive fails $q_t$ before changing $V_t$ & $60$ if $T>2000$ else $30$\\ 
\hline
$\alpha$ & Increase multiplicative ratio for the coordinate distribution update & $2.0$\\ 
\hline
$\beta$ & Decay multiplicative ratio for the coordinate distribution update  & $1.1$\\ 
\hline
$p$ & Probability for selecting coordinates with the largest $\pi_t$ values & $0.3$\\ 
\hline
$\kappa_S$ & \makecell{The threshold for the virtual clock value $K_t$ \\before shrinking the coarse trust region $\Omega_{S}$} & $30$\\ 
\hline
$\kappa_F$ & \makecell{The threshold for the number of consecutive fails $q_t$ before \\shrinking the fine trust region $\Omega_{F}$ on the fast time scale} & $6$\\ 
%\hline
%$\tau_S$ & The number of consecutive fails $q_t$ in the coarse trust region $\Omega_{S}$ & $8$\\ 
\hline
$\tau_F$ & The number of consecutive fails $q_t$ in the fine trust region $\Omega_{F}$  & $6$\\ 
\hline
$\delta$ & The relative improvement threshold governing the virtual clock update rule & $0.1$\\ 
\hline
& Gaussian process kernel & Mat\'ern 5/2 \\
\hline
\end{tabular}
\caption{CobBO's hyperparameter configuration for all of the experiments}
\label{table:hyperparameters}
\end{table}

% \subsection{Auxiliary features of CobBO}\label{ss:auxiliary}
% Further smoothness and acceleration can be achieved by filtering out clustered queried points, as alternating between adaptive trust regions promotes exploration in the interior of the domain and assists in escaping local optima.

% %Computational growth with query budget}: 
% The runtime of each iteration for Gaussian process regression scales cubically in the number of queried points. The computational complexity could grow prohibitively high and prevent the usage beyond a limited query budget. 
% It is possible to bring the complexity down to be quadratic by carefully handling the Cholesky factorization~\cite{bayesopt,lazygaussian2020}, or even linear by assuming additive structures~\cite{mutny2018}. Nevertheless, these methods are not generally applicable for our purpose.
% Instead, we resort to approximate Gaussian process regression~\cite{candela2005,bui2017}, using less points to describe the prior. 

% \textbf{Data filtering by K-means classification:}
% Dealing with the cubic computation cost in queries~\cite{snoek2012}, instead of using the sophisticated approximated Gaussian process regression~\cite{candela2005,bui2017}, above some quantity of aggregated observations, e.g. $1000$, we leverage the K-means algorithm~\cite{macqueen1967some} for discarding clustered points.
% Specifically, we only keep the point of maximal value within each cluster. 
%  Intuitively, if two nearby points have close function values,  discarding the smaller one for a maximization problem seems innocuous. Sometimes, it could even be better, since Bayesian optimization assumes the function $f(x)$ to be smooth, from a reproducing kernel Hilbert space~\cite{bull2011}.
 


% \textbf{Batch queries:} %\label{ss:batch}
% Due to sampling subspaces,  CobBO can be easily paralleled in a batch mode.  
% Specifically, we can sample multiple coordinate subspaces, each containing the latest observed pivot point $V_t$. 
% Since the batch mode does not require synchronization, multiple concurrent subspaces may not necessarily use an identical $V_t$.
% In principle, we can integrate other batch methods~\cite{turbo2019,desautels14,emile2013,javier2016,tarun2016,javad2010,desautels14,wilson2017reparameterization} with CobBO.

\section{Implementation} %IMPLEMENTATION}
The proposed CobBO algorithm is implemented in Python~3. \\
An implementation of CobBO is available at: \url{https://github.com/Alibaba-MIIL/CobBO}.

% \subsection{Ablation of the backoff stopping rule and formation of trust regions}
% CobBO is configured with the default hyper-parameter configuration specified in section~\ref{sec:defalt_conf}, including those governing a stopping rule for determining the number of consistent queries and the strategies to form coarse and fine trust regions on slow and fast time scales, respectively. 
% In order to compare the impact of different configurations, we test the following combinations. 
% \begin{itemize} %\vspace{-3mm}%\setlength\itemsep{0em}
% \item Consistent query $\in \{\rm{stopping\;rule}, \;\rm{fixed\; constant}\; q_{\rm{max}}\}$ %with $q_{\rm{max}}$ being the maximum number of consistent queries
% \item $S \in\{\rm{true},\rm{false}\}$, whether or not to employ coarse trust regions on a slow time scale
% \item $F \in\{\rm{true},\rm{false}\}$, whether or not to employ refined trust regions on a fast time scale
% \end{itemize}%\vspace{-3mm}
% %
% The fixed constant $q_{\rm{max}}$ represents the maximum number of consistent queries that can be continuously imposed to the 
% currently selected coordinate subspace. 
% It induces a tradeoff between exploiting the potential of the current coordinate subspace and exploring other subspaces. 
% %Conceptually, more consistent queries exploit the potential of the coordinate subspace, at the risk of missing better solutions of other subspaces due to the limited total budget. 
% %
% When coarse trust regions are enabled on a slow time scale (i.e., $S=\rm{true}$), the procedure exploits a neighborhood of $V_{t}$ instead of the full domain. 
% %
% If fine trust regions are formed on a fast time scale (i.e., $F=\rm{true}$), the Bayesian optimization better exploits the selected regions centered at~$V_{t}$. 
% The alternation between coarse and fine trust regions can help distributing new queries in both this centered area as well as near the boundary. 
% %
% %Coarse trust regions can be considered as a trade-off between the refined small trust regions and the original domain.  
% We conduct extensive experiments to empirically demonstrate the contribution of these features to the performance of CobBO. 



% We apply CobBO on 30 dimensional synthetic functions (Ackley, Levy and Rastrigin) and the robot pushing problem using $5$ different configurations, as shown in Table \ref{table:settings}:

% \begin{table}[hbt]
% %\caption{Table Caption}
% \label{tab:settings}
% \begin{center}
% \begin{tabular}{lcccccc}
% % \hline
% %                   &  $\rm{CobBO}^{\ast}$  & $\rm{CobBO}^{1}$ & $\rm{CobBO}^{2}$ & $\rm{CobBO}^{3}$ & $\rm{CobBO}^{4}$ & $\rm{CobBO}^{5}$ \\ 
% % \hline
% % $q_{\rm{max}}$    & stopping rule      &  stopping rule         & stopping rule        & stopping rule       &  1       & 15 \\
% % $S$               &true  &  false    & true   &  false  & true    & true  \\
% % $F$               & true  &  false    & false  &  true   &  true   & true   \\
% % \hline
% \hline
%                   & $\rm{CobBO}^{1}$ & $\rm{CobBO}^{2}$ & $\rm{CobBO}^{3}$ & $\rm{CobBO}^{4}$ & $\rm{CobBO}^{5}$ \\ 
% \hline
% $q_{\rm{max}}$    &  stopping rule         & stopping rule        & stopping rule       &  1       & 15 \\
% $S$               &  false    & true   &  false  & true    & true  \\
% $F$               &  false    & false  &  true   &  true   & true   \\
% \hline
% \end{tabular}
% \end{center}
% \caption{CobBO with different configurations}
% \label{table:settings}
% \end{table}


% \subsubsection{Ablation over 30 dimensional synthetic problems}
% \label{sec:ablation_synthetic}
% %Note that $\rm{CobBO}^{\ast}$ is the default setting that we have used to generate the experimental results in the main part of this paper. 
% %Based on the previous setup, we
% We assign a budget of $2,500$ function evaluations to Ackley, Levy and Rastrigin, and $7,000$ function evaluations to the robot pushing problem.
% For each configuration, confidence intervals ($95\%$) over repeated 30 independent experiments for each problem are shown.
% The tested value $q_{\rm{max}}$ is chosen to be $2$ for $2,500$ function evaluations and $3$ for $7,000$. 

% \begin{figure}[hbt]
% \begin{center}
% \includegraphics[width=0.98\columnwidth,height=!]{app-synthetic-30}
% \end{center}
% \caption{Performance of different configurations over synthetic problems of $30$ dimensions: Ackley (left), Levy (middle) and Rastrigin (right)}
% \label{fig:d30}
% \end{figure}

% % The different configurations tested yield similar performance over these three synthetic problems, as shown in Fig.~\ref{fig:d30}. This indicates that in those cases CobBO is not sensitive to the differences in the configurations.
% % However, small differences still exist for the experiments. 

% $\rm{CobBO}^{5}$, of a larger $q_{\rm{max}}$ value, performs slightly worse than $\rm{CobBO}^{3}$ and  $\rm{CobBO}^{4}$, 
% but better than  $\rm{CobBO}^{1}$ and  $\rm{CobBO}^{2}$. This implies that $q_{\rm{max}}$ and $F$ have stronger impacts on the performance than $S$ over the examined cases. 
% %
% When the fast trust region feature is enabled ($F = \rm{true}$),   
% $\rm{CobBO}^{3}$ encourages more exploitation within smaller neighborhoods around the current best solutions, and consistently outperforms $\rm{CobBO}^{1}$ and $\rm{CobBO}^{2}$ on all three problems.


% \subsubsection{Ablation over the robot pushing problem}
% \label{sec:ablation_robot}

% \begin{figure}[hbt]
% \begin{center}
% \includegraphics[width=0.6\columnwidth,height=!]{rpush}
% \end{center}
% \caption{Performance of different configurations on the robot pushing problem}
% \label{fig:push}
% \end{figure}
% For the robot pushing problem, shown in Fig.~\ref{fig:push}, 
% % the results of the $5$ configurations are not significantly different from each other either. 
% % Specifically, 
% $\rm{CobBO}^3$ slightly outperforms the rest on average, similar to the experiments shown in Fig~\ref{fig:d30}. 
% $\rm{CobBO}^5$ performs badly, possibly due to its excessive exploitation of the selected coordinate subspaces. 
% Different from the observations made in section~\ref{sec:ablation_synthetic}, $\rm{CobBO}^1$ and $\rm{CobBO}^2$ find better solutions than $\rm{CobBO}^4$ and $\rm{CobBO}^5$ on average. 
% This suggests that properly, and presumably adaptively, balancing exploitation and exploration, e.g. through the formation of trust regions and the allocation of proper query budgets across selected subspaces, can impact the performance.
% The default configuration, detailed in table~\ref{table:hyperparameters}, includes fine trust regions. In this experiment, such configurations do not perform as well as  $\rm{CobBO}^1$ and $\rm{CobBO}^2$. 
% This indicates that better adaptive algorithms can be designed to further improve the performance of CobBO. 

\section{Further ablation of escaping local optima}
CobBO is described in Algorithm~1, where Line~8 is about escaping local maxima by changing the pivot point $V_t$ when the number of consecutive fails exceeds a threshold, i.e., $q_t>\Theta$.
In this case, we decrease the observed function value at $V_{t}$ and set $V_{t+1}$ as a selected sub-optimal random point in $\mathcal{X}_t$. Specifically, we randomly sample $5$ points in $\mathcal{X}_t$ with their values above the median and pick the one furthest away from $V_{t}$. 

%  In order to escape stagnant local optima, CobBO has two methods. The first method is to change $V_t$, as described in Section~2 of the paper. The threshold $\Theta_1$ for the number of consecutive fails $q_t$ before changing $V_t$ is set to $70$ if the total
%  trial budget is larger than $2000$ otherwise $\Theta_1=35$. 
%  The second method is decrease the function values around the stagnant local optima. Specifically, when the number of consecutive trials $\Theta_2$ that fail to improve the optimization process, e.g., $\Theta_2=50$ if the total
%  trial budget is larger than $2000$ otherwise $\Theta_2=25$, we temporary decrease the function values around the best point observed so far.  By doing so, the Gaussian process regression could encourage to explore other potentially more promising areas.


We use the experiments on the Levy and Ackley functions of 100 dimensions, as described in Section~3.2, to compute the fraction of queries that improve
 the already observed maximal points due to changing~$V_t$ according to Line~8.
 
 \begin{table}[h!]
\centering
\begin{tabular}{ |c|c|c| } 
\hline
Problem & Average \# improved queries & Average \# improved queries due to escaping\\
\hline \hline
Ackley & 228 & 15.3 \\
\hline
Levy & 155 & 3\\
\hline
\end{tabular}
\caption{The number of improved queries due to escaping local maxima}
\label{table:escaping}
\end{table}

We observe that optimizing the Levy function yields very few queries that improve the maximal points by changing the pivot point, while optimizing the Ackley function can benefit more from that.  

\section{Forming trust regions on two time scales}
CobBO alternates between the two trust regions according to a duty cycle determined by $\kappa_F$ and $\tau_F$ as specified by Algorithm~1 and Table~\ref{table:hyperparameters}.  %~\ref{alg:trust_region}.
%\input{algorithm/trust_region_algo}
The formation of trust regions is triggered when a virtual clock $K_t$, expressing the progress of the optimization, reaches certain thresholds.
Specifically, the virtual clock evolves as follows:
  \begin{align*}
    K_{t+1}=
    \begin{cases}
		K_t + 1	 & \text{if } \Delta_t \leq 0 \\
	   % \gamma_t(\Delta_t, ||x_t - x_{t-1}||) \cdot K_t & \text{if } 0 < \Delta_t \leq \delta \\
	    \gamma_t(\Delta_t, x_t, x_{t-1}) \cdot K_t & \text{if } 0 < \Delta_t \leq \delta \\
		0	 & \text{if } \Delta_t > \delta\\
	 \end{cases} 
 \end{align*}
which is described in equation~(3) in the main body of the paper.
%  where $\Delta_t = \frac{M_t - M_{t-1}}{\left|M_{t-1}\right|}$ is the relative improvement and for example, 

%  \begin{align*}
% 	 \gamma_t(\Delta, x_t, x_{t-1}) = \left(1-\frac{\Delta}{\delta}\right) \cdot \left(1 - \frac{||x_t - x_{t-1}||}{\sqrt{|C_t|}} \right).
%  \end{align*}

  
\begin{algorithm}[tbh]\vspace{0.0mm}
    \label{alg:trust_region}
% 	\SetAlgoLined
    % \textbf{Input}: Current virtual Clock $K_t$\\
    \textbf{Parameters}: \\
    \hspace{0.5cm} Slow/fast thresholds $\kappa_{S/F}$ respectively\\
    \hspace{0.5cm} Fast duty cycle $\tau_{F}$\\
    % Current observed value $y_t$ \\
    % Previous best value $M_{t-1}$ \\
    % Consecutive fails to improve $q_t$ \\
    \textbf{Init}: $\Omega_{0}, \tilde{\Omega}_{0} \leftarrow \Omega$ \\
    \uIf{$y_t > M_{t-1}$} {
        $\tilde{\Omega}_{t} \leftarrow$ Double $\tilde{\Omega}_{t-1}$ around $V_t$ \\
        $\Omega_{t} \leftarrow \tilde{\Omega}_{t}$ [$\tilde{\Omega}_{t}$ is the trust region formed on the slow time scale]
    }
    \uElseIf{$K_t==\kappa_S$}{
        $\tilde{\Omega}_{t} \leftarrow$ Halve $\tilde{\Omega}_{t-1}$ around $V_t$ \\
        $\Omega_{t} \leftarrow \tilde{\Omega}_{t}$\\
        Reset $K_t = 0$
    }
    \uElse{
        $\tilde{\Omega}_t \leftarrow \tilde{\Omega}_{t-1}$\\
        \uIf{$mod\left(K_t, \kappa_F+\tau_F\right)== \kappa_F-1$}{
         $\Omega_{t} \leftarrow$ Halve $\Omega_{t-1}$ around $V_t$
     }
     
     \uElseIf{$mod\left(K_t, \kappa_F+\tau_F\right)==  \kappa_F+\tau_F -1$}{
         $\Omega_{t} \leftarrow \tilde{\Omega}_{t}$}
     \uElse{$\Omega_{t} \leftarrow \Omega_{t-1}$}
    }
    
     
    %\uIf{
    %     $\tilde{\Omega}_t \leftarrow \Omega_{F_t}$
    %}
    % $\tilde{\Omega}_{t} \leftarrow \Omega_{S_t} \textbf{ If } mod\left(q_t, \tau_S+\tau_F\right) < \tau_S\textbf{ Else } \Omega_{F_t}\$
    % \IfThenElse {$mod\left(q_t, \tau_S+\tau_F\right) < \tau_S$}% If ...
    %   {$\Omega_{S_t}$}% ...then...
    %   {$\Omega_{F_t}$}% ...else...
      
    \textbf{Output}: Trust Region $\Omega_{t}$
	\caption{FormTrustRegions($K_t$,$y_t$,$M_{t-1}$)}
% 	\caption{FormTrustRegionsPolicy($K_t$, $\kappa_S$, $\kappa_F$, $\tau_S$, $\tau_F$, $y_t$, $M_{t-1}$, $q_t$)}
\end{algorithm}
\setlength{\textfloatsep}{0pt}

% The threshold $\kappa_S$ is not necessarily a constant. To adapt to different optimization problems, we choose $\kappa_S$ to depend on $\eta_t$ the number of times $K_t$ has consecutively reached $\kappa_S$. 
% When $\eta_t$ crosses a certain threshold,
% %that depends on the query budget $T$ and the problem dimension $D$,
% CobBO assumes being trapped in a local optimum~\cite{qin2017,bull2011,snoek2012}. 
% In this case, it 
% %randomly samples a point  reduces the function values in $\mathcal{H}_t$ within a small region around $V_t$, and 
% sets $V_{t+1}$ as
%  one of the already queried top points in $\mathcal{X}_t$ far away from $V_t$, and repeats the entire process
%  by starting with the full domain $\Omega$
%  and $\eta_{t+1}=0$.
 In addition, when the number of queried points exceeds a threshold, e.g., $70\%$ of the query budget, we shrink the total space~$\Omega$ each time the fraction of queried points increases by $10\%$.


\section{Additional experiments}
We provide more experiments for demonstrating the performance of CobBO. Confidence intervals ($95\%$) are computed by repeating $30$ and $10$ independent experiments for the medium-sized functions and the $200$-dimensional functions, respectively.
 
 
   \begin{figure}[bht]\vspace{-0mm}
   \centering
   \includegraphics[width=1.0\columnwidth,height=!]{supplementary/30D-tests.png}\vspace{-3mm}
   \caption{Performance over medium dimensional problems: Ackley (left), Levy (middle) and Rastrigin (right)} 
   \label{fig:30D-tests}
 \end{figure}
\textbf{Medium-sized synthetic black-box functions (minimization):}
We test three synthetic functions ($30$ dimensions), including Ackley on $[-5, 10]^{30}$, Levy on $[-5, 10]^{30}$, and Rastrigin on $[-3, 4]^{30}$. In addition, 
we add experiments for an additive function of $36$ dimensions, defined as  $f_{36}(x)=\mathrm{Ackley}(x_1) + \mathrm{Levy}(x_2) + \mathrm{Rastrigin}(x_3) + \mathrm{Hartmann}(x_4)$, where the first three terms express the same functions over the same domains specified in Section~3.1 of this paper, with the Hartmann function over $[0, 1]^{6}$. 
TuRBO is configured identically to Section~3.1, with a batch size of 10 and 5 trust regions with 10 initial points each. The other algorithms use 20 initial points.
The results are shown in Figs.~\ref{fig:30D-tests} and~\ref{fig:additive-36D}, where CobBO shows competitive or better performance compared to all of the methods tested across all of these problems.
 \begin{figure}[htb]\vspace{-0mm}
   \centering
   \includegraphics[width=0.7\columnwidth,height=!]{supplementary/additive-36D.png}\vspace{1mm}
   \caption{Performance over an additive function of 36 dimensions}\vspace{5mm}
   \label{fig:additive-36D}
 \end{figure}

\textbf{The 200-dimensional Levy and Ackley functions (minimization):}
We minimize the Levy and Ackley functions over $[-5, 10]^{200}$ with $500$ initial points. 
TuRBO is configured with $15$ trust regions and a batch size of $100$.
These two problems are challenging and have no redundant dimensions. 
 For Levy, in Fig.~\ref{fig:200d} (left), CobBO reaches $100.0$ within $2{,}000$ trials, while CMA-ES and TuRBO 
 obtain $200.0$ after $8{,}000$ trials. TPE cannot find a comparable solution within $10{,}000$ trials in this case. 
 For Ackley, in Fig.~\ref{fig:200d} (right), CobBO reaches the best solution among all of the algorithms tested. 
 The appealing trial complexity of CobBO suggests that it can be applied in a hybrid method, e.g., used in the first stage of the query process combined with gradient estimation methods or CMA-ES.
%  Note that the variance for the Levy function across $10$ independent experiments is very small, as shown in Fig.~\ref{fig:200d-zoomin}. 

  \begin{figure}[htb]\vspace{-0mm}
   \centering
   \includegraphics[width=0.8\columnwidth,height=!]{supplementary/200d-synth.png}\vspace{-4mm}
   \caption{Performance over high dimensional synthetic problems: Levy (left) and Ackley (right)}\vspace{-0mm}
   \label{fig:200d}
 \end{figure}
% \vspace{-3mm}
%  \begin{figure}[htb]
%   \centering
%   \includegraphics[width=0.5\columnwidth,height=!]{200d-zoomin.png}\vspace{-4mm}
%   \caption{A closer look at the performance over the high dimensional synthetic Levy problem}\vspace{-0mm}
%   \label{fig:200d-zoomin}
%  \end{figure}
