\documentclass[accepted]{uai2022}

\usepackage[american]{babel}

\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

\usepackage{amssymb}
\usepackage{amsthm}
\usepackage{algorithmic}
\usepackage{algorithm}
\usepackage{subfigure}

\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}

\newcommand{\secref}[1]{Section~\ref{#1}}
\newcommand{\figref}[1]{Figure~\ref{#1}}
\newcommand{\tabref}[1]{Table~\ref{#1}}
\newcommand{\algref}[1]{Algorithm~\ref{#1}}
\newcommand{\thmref}[1]{Theorem~\ref{#1}}
\newcommand{\lemref}[1]{Lemma~\ref{#1}}

\newcommand{\argmax}{\operatornamewithlimits{\arg \max}}
\newcommand{\argmin}{\operatornamewithlimits{\arg \min}}

\newcommand{\convex}{{\textrm{Conv}}}
\newcommand{\mutual}{{I}}

\newcommand{\bc}{{\mathbf{c}}}
\newcommand{\bff}{{\mathbf{f}}}
\newcommand{\bk}{{\mathbf{k}}}
\newcommand{\bx}{{\mathbf{x}}}
\newcommand{\by}{{\mathbf{y}}}

\newcommand{\bC}{{\mathbf{C}}}
\newcommand{\bI}{{\mathbf{I}}}
\newcommand{\bK}{{\mathbf{K}}}
\newcommand{\bL}{{\mathbf{L}}}
\newcommand{\bR}{{\mathbf{R}}}
\newcommand{\bX}{{\mathbf{X}}}
\newcommand{\bY}{{\mathbf{Y}}}

\newcommand{\calC}{{\mathcal{C}}}
\newcommand{\calH}{{\mathcal{H}}}
\newcommand{\calN}{{\mathcal{N}}}
\newcommand{\calO}{{\mathcal{O}}}
\newcommand{\calX}{{\mathcal{X}}}

\newcommand{\bbR}{{\mathbb{R}}}


\title{Combinatorial Bayesian Optimization\\with Random Mapping Functions to Convex Polytopes}

\author[1]{\href{mailto:<jtkim@postech.ac.kr>?Subject=Your UAI 2022 paper}{Jungtaek Kim}{}}
\author[2]{Seungjin Choi}
\author[1]{Minsu Cho}

\affil[1]{%
POSTECH\\
South Korea
}
\affil[2]{%
Intellicode\\
South Korea
}


\begin{document}
\maketitle

\begin{abstract}
Bayesian optimization is a popular method for solving the problem of global optimization 
of an expensive-to-evaluate black-box function.
It relies on a probabilistic surrogate model of the objective function,
upon which an acquisition function is built to determine where next to evaluate the objective function.
In general, Bayesian optimization with Gaussian process regression operates on a continuous space.
When input variables are categorical or discrete, extra care is needed.
A common approach is to use one-hot encoded or Boolean representation for categorical variables 
which might yield a combinatorial explosion problem.
In this paper we present a method for Bayesian optimization in a combinatorial space,
which can operate well in a large combinatorial space.
The main idea is to use a random mapping which embeds the combinatorial space 
into a convex polytope in a continuous space, on which all essential process is performed
to determine a solution to the black-box optimization in the combinatorial space.
We describe our combinatorial Bayesian optimization algorithm and present its regret analysis.
Numerical experiments demonstrate that our method shows satisfactory performance compared to existing methods.
\end{abstract}

\section{Introduction}\label{sec:introduction}

Bayesian optimization~\citep{BrochuE2010arxiv,ShahriariB2016procieee,FrazierPI2018arxiv} 
is a principled method for solving the problem of optimizing a black-box function which is expensive to evaluate.
It has been quite successful in various applications of machine learning~\citep{SnoekJ2012neurips}, 
computer vision~\citep{ZhangY2015cvpr}, 
materials science~\citep{FrazierP2016ismdd}, biology~\citep{GonzalezJ2014bayesopt},
and synthetic chemistry~\citep{ShieldsBJ2021nature}.
In general, Bayesian optimization assumes inputs (decision variables) are real-valued vectors in a Euclidean space,
since Gaussian process regression is typically used as a probabilistic surrogate model and
the optimization of an acquisition function is a continuous optimization problem.
However, depending on problems, input variables can be of different structures. 
For instance, recent work includes
Bayesian optimization over sets \citep{GarnettR2010ipsn,BuathongP2020aistats,KimJ2021ml}, 
over combinatorial inputs \citep{HutterF2011lion,BaptistaR2018icml,OhC2019neurips}, or 
over graph-structured inputs \citep{CuiJ2018arxiv,OhC2019neurips}.

It is a challenging problem to find an optimal configuration among a finite number of choices,
especially when the number of possible combinations is enormous \citep{GareyMR1979book}.
This is even more challenging when the objective function is an expensive-to-evaluate black-box function.
This is the case where inputs are categorical or discrete,
referred to as \emph{combinatorial Bayesian optimization}, which has been studied
by \citet{BaptistaR2018icml,BuathongP2020aistats} recently.
However, the methods by~\citet{HutterF2011lion,BaptistaR2018icml} employ one-hot encoded 
or Boolean representations to handle categorical or discrete input variables,
which often suffer from the curse of dimensionality.

In this paper we present a scalable method for combinatorial Bayesian optimization,
alleviating combinatorial explosion.
Motivated by earlier work on online decision making in combinatorial spaces \citep{RajkumarA2014neurips},
we solve combinatorial optimization in a convex polytope,
employing an injective mapping from a categorical space to a real-valued vector space.
However, in contrast to the work~\citep{RajkumarA2014neurips},
we use a random projection to embed the categorical space into the real-valued vector space
on which all essential process is performed to solve the combinatorial Bayesian optimization.
We show ours achieves a sublinear regret bound with high probability.

\paragraph{Contributions.}
We provide a more general perspective than the existing methods, 
by defining the problem on a convex polytope, a generalization of the 0-1 polytope.
Based on this perspective, we propose a combinatorial Bayesian optimization strategy 
with a random mapping function to a convex polytope and its lookup table, 
which is referred to as CBO-Lookup.
Finally, we guarantee that our method has a sublinear cumulative regret bound, 
and demonstrate that our method shows satisfactory performance compared to other methods in various experiments.


\section{Related Work}\label{sec:related}

Bayesian optimization on structured domains, which is distinct 
from a vector-based Bayesian optimization~\citep{MockusJ1978tgo,JonesDR1998jgo}, 
attracts considerable attention from the Bayesian optimization community, due to its potential 
for novel practical applications such as sensor set selection~\citep{GarnettR2010ipsn}, 
hyperparameter optimization~\citep{HutterF2011lion,WangZ2016jair}, 
aero-structural problem~\citep{BaptistaR2018icml}, 
clustering initialization~\citep{KimJ2021ml}, 
and neural architecture search~\citep{OhC2019neurips}.
This line of research discovers a new formulation of Bayesian optimization tasks, 
by defining surrogates and acquisition functions on structured domains.

Especially, \citet{HutterF2011lion,WangZ2016jair,BaptistaR2018icml} 
focus on a combinatorial space at which all available items are defined 
as discrete or categorical variables,
and solve this combinatorial problem by formulating on a binary domain, 
because any combination on a combinatorial space is readily expressed as binary variables.
\cite{HutterF2011lion} propose a method based on random forests~\citep{BreimanL2001ml} 
to find an optimal algorithm configuration, 
where a combination is composed of continuous and categorical variables.
\cite{WangZ2016jair} suggest a high-dimensional Bayesian optimization method 
that is able to handle combinations with scaling and rounding.
\cite{BaptistaR2018icml} solve an optimization problem over a discrete structured domain 
with sparse Bayesian linear regression and their own acquisition function.
These approaches (or the variants of these methods) are compared to our method 
in the experiment section.
\cite{DadkhahiH2020kdd} solve a combinatorial optimization problem 
with multilinear polynomials and exponential weight updates.
In particular, the weights are updated using monomial experts' advice.
\cite{DeshwalA2021aaai} propose an efficient approach to solving combinatorial optimization 
with Mercer features for a diffusion kernel.

To compute a kernel value over combinations, 
a kernel with one-hot encoding~\citep{DuvenaudD2014thesis} 
can be directly employed.
\cite{HutterF2011lion,FeurerM2015neurips} show 
this approach performs well in the applications of hyperparameter optimization.
However, as in the experimental results reported by \cite{BaptistaR2018icml}, 
it underperforms in the optimization tasks of combinatorial structures.
Furthermore, 
in this work, we will employ a baseline with 
Aitchison and Aitken (AA) kernel~\citep{AitchisonJ1976biometrika}, 
defined on a discrete space.
This AA kernel is symmetric and positive definite~\citep{MussaHY2013jmr}, 
which satisfies the requirements of positive-definite kernels~\citep{ScholkopfB2002book}.


\section{Combinatorial Bayesian Optimization}\label{sec:formulation}

We consider the problem of minimizing a black-box function 
$f(\bc): \calC \to \bbR,$ where the input $\bc=[c_1, \ldots, c_k]^{\top}$ is a collection 
of $k$ categorical (or discrete)\footnote{Only nominal variables are considered in this paper.} variables 
with each variable $c_i$ taking one of $N_i$ distinct values:
\begin{equation}
	\bc^\star = \argmin_{\bc \in \calC} f(\bc),
	\label{eqn:global_opt}
\end{equation}
where the cardinality of the combinatorial space $\calC$ is
\begin{equation}
	\lvert \calC \rvert = N_1 \times \cdots \times N_k = N.
\end{equation}
Bayesian optimization is an efficient method for solving the black-box optimization \eqref{eqn:global_opt},
where a global solution $\bc^{\star}$ is found by repeating the construction of a surrogate and 
the optimization of an acquisition function.

Bayesian optimization estimates a surrogate function $\widehat{f}$ in order to 
determine a candidate of global solution $\bc^\dagger$,
\begin{equation}
    \bc^\dagger = \argmax_{\bc \in \calC} a(\bc; \widehat{f}(\bc; \bC, \by)),
\end{equation}
where $a$ is an acquisition function, 
$\bC$ is previously observed input matrix $[\bc_1, \ldots, \bc_t] \in \calC^{t}$,
and $\by$ is the corresponding noisy output vector $[y_1, \ldots, y_t]^\top \in \bbR^t$;
\begin{equation}
	y = f(\bc) + \epsilon,
\end{equation}
where $\epsilon$ is an observation noise.
This formulation has two technical challenges in (i) modeling a surrogate function over $\bc$ 
and (ii) optimizing an acquisition function over $\bc$.
We thus suggest a combinatorial Bayesian optimization method to solve this problem.

Instantaneous regret $r_t$ at iteration $t$ measures a discrepancy 
between function values over $\bc^\dagger_t$ and $\bc^\star$:
\begin{equation}
    r_t = f(\bc^\dagger_t) - f(\bc^\star),
    \label{eqn:instant_regret}
\end{equation}
where $\bc^\dagger_t$ is the candidate determined by Bayesian optimization at iteration $t$.
Using \eqref{eqn:instant_regret}, we define a cumulative regret:
\begin{equation}
	R_T = \sum_{t = 1}^T r_t,
\end{equation}
and the convergence of Bayesian optimization is validated by proving that the cumulative regret is sublinear, 
$\lim_{T \to \infty} R_T / T = 0$~\citep{SrinivasN2010icml,ChowdhurySR2017icml}.


A widely-used method to handle these categorical variables in Bayesian optimization 
is to use one-hot encoded representation, converting $\bc$ into a vector of length $N$
with a single element being one and the remaining entries being zeros~\citep{DuvenaudD2014thesis,HutterF2011lion,WangZ2016jair}.
In such a case, we need a strong constraint that makes
the next candidate one-hot encoded.

Alternatively, $\bc$ is represented as a Boolean vector in $\{0,1\}^m$, 
where $m$ is the smallest integer which satisfies $2^m \geq N$.
Since the optimization of an acquisition function yields real-valued variables,
these real values are converted into a Boolean vector by rounding to 0 or 1,
before the objective function is evaluated at the corresponding categorical input.
In fact, we can interpret this approach as a Bayesian optimization method operating in a
0-1 polytope induced by an injective mapping $\phi: \calC \rightarrow \{0,1\}^m$.

This idea was generalized to solve the online decision making problem in general combinatorial spaces
\citep{RajkumarA2014neurips},
introducing an injective mapping $\phi: \calC \rightarrow \calX$,
where $\calX = \convex(\{ \phi( \bc ) \mid \bc \in \calC \})$ is a convex polytope
and $\convex(\cdot)$ denotes a convex hull of the given set \citep{ZieglerGM1993polytopes}.
The low-dimensional online mirror descent algorithm was developed for the case where
costs are linear in a low-dimensional vector representation of the decision space \citep{RajkumarA2014neurips}.
Motivated by \citet{RajkumarA2014neurips}, we take a similar approach to tackle the combinatorial Bayesian optimization.
The main contributions, which are distinct from existing work, are summarized below.
\begin{itemize}
\item We use a random mapping for
\begin{equation}
	\phi: \calC \rightarrow \calX=\convex(\{ \phi(\bc) \mid \bc \in \calC \}),
\end{equation}
to embed Boolean space into a low-dimensional vector space on which the construction of a probabilistic surrogate model
and the optimization of an acquisition function are performed. 
Such random projection was not considered by \citet{RajkumarA2014neurips}.
\item A random embedding was employed to solve the Bayesian optimization in a very high-dimensional space \citep{WangZ2016jair},
but the combinatorial structure and the corresponding regret analysis were not considered.
We show that even with a random projection, our combinatorial Bayesian optimization algorithm achieves a sublinear regret bound
with very high probability.
\end{itemize}


\section{Proposed Method}\label{sec:method}

In this section, we present our combinatorial Bayesian optimization algorithm, which is
summarized in \algref{alg:method}.
We introduce an injective mapping 
\begin{equation}
\phi: \calC \rightarrow \widehat{\calX},
\end{equation}
where $\widehat{\calX} = \{ \bx \mid \bx = \phi(\bc), \ \forall \bc \in \calC \}$.
With the mapping $\phi$, the black-box optimization problem \eqref{eqn:global_opt} 
can be written as
\begin{equation}
    \bc^\star = \phi^{-1}\Bigl( \argmin_{\bx \in \widehat{\calX}} f(\phi^{-1}(\bx)) \Bigr),
    \label{eqn:global_opt_phi}
\end{equation}
where $\phi^{-1}$ is a left inverse of injective function $\phi$.

At iteration $t$, we construct a surrogate model $\widehat{f}(\bx; \phi, \bC_t, \by_t)$ for $\bx \in \calX$
using Gaussian process regression on the currently available dataset $\{\bC_t, \by_t\} = \{ (\bc_i, y_i) \mid i=1,\ldots, t\}$,
in order to estimate the underlying objective function $f(\phi^{-1}(\bx))$.
Then we optimize the acquisition function $\widetilde{a}(\bx)$ over $\calX$ (instead of $\calC$),
which is built using the posterior mean and variance calculated by
the surrogate model $\widehat{f}(\bx; \phi, \bC_t, \by_t)$,  to determine 
where next to evaluate the objective: 
\begin{equation}
    \bx^\dagger_t = \argmax_{\bx \in \widehat{\calX} \subseteq \calX} \widetilde{a}(\bx; \widehat{f}(\bx; \phi, \bC_t, \by_t)),
\end{equation}
where $\calX=\convex(\{ \phi(\bc) \mid \bc \in \calC \})$.
The underlying objective function is evaluated at $\bc^\dagger_t = \phi^{-1}(\bx^\dagger_t)$.


\begin{algorithm}[t]
	\caption{Combinatorial Bayesian Optimization with Random Mapping Functions}
	\label{alg:method}
	\begin{algorithmic}[1]
		\REQUIRE Combinatorial search space $\calC$,
		    time budget $T$, 
		    unknown, but observable objective function $f$,
		    mapping function $\phi$.
		\ENSURE Best point $\bc_{\textrm{best}}$.
		\STATE Initialize $\bC_1 \subset \calC$.
		\STATE Update $\by_1$ by observing $\bC_1$.
		\FOR {$t = 1, \ldots, T$}
		    \STATE Estimate a surrogate function $\widehat{f}(\bx; \phi, \bC_t, \by_t)$.
		    \STATE Acquire a query point:
		    \begin{equation}
		    	\bx^\dagger_t = \argmax_{\bx \in \widehat{\calX} \subseteq \calX} \widetilde{a}(\bx; \widehat{f}(\bx; \phi, \bC_t, \by_t)).
                     \end{equation}
		    \STATE Recover $\bx^\dagger_t$ to $\bc^\dagger_t$.
		    \STATE Observe $\bc_t^\dagger$.
		    \STATE Update $\bC_t$ to $\bC_{t + 1}$ and $\by_t$ to $\by_{t + 1}$.
		\ENDFOR
		\STATE Find the best point and its function value:
		\begin{equation}
		    (\bc_{\textrm{best}}, y_{\textrm{best}}) = \argmin_{(\bc, y) \in (\bC_T, \by_T)} y.
		\end{equation}
		\STATE \textbf{return} $\bc_{\textrm{best}}$
	\end{algorithmic}
\end{algorithm}


Before we describe the use of random mapping for $\phi$, we summarize a few things that should be noted:
\begin{itemize}
\item 
We assume a mapping $\phi: \calC \to \widehat{\calX}$ is injective, so it can be reversed by
its left inverse $\phi^{-1}: \widehat{\calX} \to \calC$. Note that $\lvert \calC \rvert=\lvert \widehat{\calX}\rvert=N$.
\item 
To solve the combinatorial Bayesian optimization, we run essential procedures in $\calX$,
which is a convex polytope that includes $\widehat{\calX}$. Note that $\calX$ is supposed to be convex and compact,
so is an appropriate feasible decision space for Bayesian optimization \citep{SrinivasN2010icml}.
\item 
Suppose that we are given a positive-definite kernel $k: \calX \times \calX \to \bbR$, i.e.,
$k(\bx, \bx') = \langle \varphi(\bx), \varphi(\bx') \rangle_{\calH_{\widehat{\calX}}}$
where $\varphi(\cdot)$ is a feature map into a RKHS $\calH_{\widehat{\calX}}$.
Then we define a kernel on a RKHS $\calH_{\calC}$
\begin{equation}
\label{eq:ktilde}
\widetilde{k}(\bc, \bc') = \left\langle \varphi(\phi(\bc)), \varphi(\phi(\bc')) \right\rangle_{\calH_{\calC}},
\end{equation}
which is also positive-definite. $\calH_{\calC}$ and $\calH_{\widehat{\calX}}$ are isometrically isomorphic.
\item 
We use $\widetilde{k}$ defined in \eqref{eq:ktilde} as a covariance function for Gaussian process regression over $\phi(\bc)$
which is applied to estimate the surrogate model.
\end{itemize}


\subsection{Random mapping and left inverse}
Now we consider a random mapping $\phi$ on $\calC$ into $\bbR^{d}$. 
In practice, we first convert categorical variables $\bc \in \calC$ into Boolean vectors $\mathbf{b} \in \{0, 1\}^{m}$
\citep{MatousekJ2002book}, 
where $m$ is the smallest integer that satisfies $2^m \geq N$.
Then we construct a random matrix $\bR \in \bbR^{d \times m}$ to transform Boolean vectors $\mathbf{b}$
into $d$-dimensional real-valued vectors, i.e.,
\begin{equation}
\phi(\bc) = \bR \mathbf{b},
\end{equation}
where $\mathbf{b}$ is a Boolean vector corresponding to $\bc$.

There exists a low-distortion embedding from $\{0, 1\}^m$ into $\bbR^d$ 
\citep{TrevisanL2000siamjc,IndykP2017hdcg}.


We define a random mapping function, which is supported by 
the theoretical foundations of general random mapping functions.
First of all, the existence of transformation from $\calC$ to $\bbR^d$ can be shown.
To construct the mapping function that satisfies the aforementioned properties, 
a combination on $\calC$ is transformed to a Boolean vector on $\{0, 1\}^{m}$, where 
$m$ is the smallest dimensionality that satisfies $2^m \geq N$, 
which is always possible~\citep{MatousekJ2002book}.
Then, we can embed $\{0, 1\}^m$ into Euclidean space 
using low-distortion embedding~\citep{TrevisanL2000siamjc,IndykP2017hdcg}.
Therefore, there exists a transformation from $\calC$ to $\bbR^d$.

Since Bayesian optimization models a surrogate based on similarities between two covariates, 
we need to define a covariance function on $\calX$, which is an appropriate 
representative of the similarities between two inputs on $\calC$.

\begin{lemma}
\label{lem:random}
    For $\varepsilon \in (0, 1)$ and $d \in \Omega(\log t / \varepsilon^2)$, 
    by the Johnson--Lindenstrauss lemma~\citep{JohnsonWB1984cm}, 
    Proposition~2.3 in \citep{TrevisanL2000siamjc}, and 
    the existence of low-distortion transformation 
    from $\calC$ to $\bbR^d$, 
    a random mapping $\phi: \calC \to \calX$ 
    preserves a similarity between any two covariates on $\calC$:
    \begin{equation}
        (1 - \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_1 
        \leq \| \phi(\bc) - \phi(\bc') \|_2^2 
        \leq (1 + \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_1,
    \end{equation}
    where Boolean vectors $\mathbf{b}, \mathbf{b}' \in \{0, 1\}^m$ are equivalent to some $\bc, \bc' \in \calC$ 
    and $t$ is the current iteration.
\end{lemma}

\begin{proof}
Suppose that we have two mapping functions, $\phi_1: \mathcal{C} \to \{0, 1\}^m$ and $\phi_2: \{0, 1\}^m \to \mathbb{R}^d$.
A function $\phi_1$ is a bijective function since it maps combinatorial variables to their unique binary variables.
Moreover, in Proposition~2.3 in \citep{TrevisanL2000siamjc}, 
$\| \mathbf{b} - \mathbf{b}' \|_p = \| \mathbf{b} - \mathbf{b}' \|_{\textrm{Hamming}}^{1/p}$ is shown, 
where $\mathbf{b}, \mathbf{b}' \in \{0, 1\}^m \subseteq \bbR^m$, 
and $\|\cdot\|_{\textrm{Hamming}}$ is the Hamming distance.
Then, \lemref{lem:random} is readily proved:
\begin{align}
	\| \phi(\bc) - \phi(\bc') \|_2^2 &= \| \phi_2(\mathbf{b}) - \phi_2(\mathbf{b}') \|_2^2 \nonumber\\
	&\geq (1 - \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_2^2 \nonumber\\
	&= (1 - \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_{\textrm{Hamming}} \nonumber\\
	&= (1 - \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_1,
	\label{eqn:lemma_1_left}
\end{align}
and
\begin{align}
	\| \phi(\bc) - \phi(\bc') \|_2^2 &= \| \phi_2(\mathbf{b}) - \phi_2(\mathbf{b}') \|_2^2 \nonumber\\
	&\leq (1 + \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_2^2 \nonumber\\
	&= (1 + \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_{\textrm{Hamming}} \nonumber\\
	&= (1 + \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_1,
	\label{eqn:lemma_1_right}
\end{align}
by the Johnson--Lindenstrauss lemma, 
Proposition~2.3 in \citep{TrevisanL2000siamjc}, 
and the existence of low-distortion transformation, 
where $\phi(\cdot) = \phi_2(\phi_1(\cdot))$.
Therefore, by \eqref{eqn:lemma_1_left} and \eqref{eqn:lemma_1_right}, 
the following is satisfied: 
    \begin{equation}
        (1 - \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_1 
        \leq \| \phi(\bc) - \phi(\bc') \|_2^2 
        \leq (1 + \varepsilon) \| \mathbf{b} - \mathbf{b}' \|_1,
    \end{equation}
    which concludes the proof of \lemref{lem:random}.
\end{proof}

\lemref{lem:random} encourages us to define a solid mapping function and its adequate inverse, 
in contrast to the previous work~\citep{WangZ2016jair}.
This method~\citep{WangZ2016jair}, which is referred to as REMBO, converts 
from a query point $\bx$ 
to a variable on 0-1 polytope with random matrix in a generative perspective, 
and then determines a combination with rounding the variable.

Compared to REMBO, we suggest a mapping function from $\calC$ to $\calX$ 
with a uniformly random matrix $\bR \in \bbR^{d \times m}$, of which the left inverse is computed by 
the Moore--Penrose inverse $\bR^+ \in \bbR^{m \times d}$, followed by rounding to $\{0, 1\}^m$.
The Bayesian optimization method using this random matrix and its Moore--Penrose inverse is referred to 
as \emph{CBO-Recon} from now on, 
because it determines a query combination through reconstruction.

Furthermore, we design a mapping function with a lookup table $\bL$, constructed by 
a uniformly random matrix $\bR$.
Each row of the table is a key and value pair, where
key and value indicate a combination and its embedding vector by $\bR$, respectively.
After determining a query point on $\calX$, we find the closest point in $\bL$ 
and then recover it to one of the possible combinations on $\calC$.
We call it \emph{CBO-Lookup}.

By these definitions and properties, we analyze our algorithm in terms of regrets 
and discuss more about our method.


\section{Regret Analysis}\label{sec:analysis}

We analyze the regret bound of combinatorial Bayesian optimization, 
to validate the convergence quality.
Following the foundations of the existing theoretical 
studies~\citep{SrinivasN2010icml,ChowdhurySR2017icml,ScarlettJ2018icml}, 
we use Gaussian process upper confidence bound (GP-UCB) as an acquisition function over $\bx$:
\begin{equation}
    \widetilde{a}(\bx; \widehat{f}(\bx; \phi, \bC_t, \by_t)) = \mu_t(\bx) - \beta_t \sigma_t(\bx),
    \label{eqn:gp_ucb}
\end{equation}
where $\bC_t \in \calC^{t}$, $\by_t \in \bbR^{t}$, and $\beta_t$ is a trade-off hyperparameter 
at iteration $t$.
To be clear, by the form of \eqref{eqn:gp_ucb},
we have to minimize \eqref{eqn:gp_ucb} to find the next query point where an objective function is minimized.
Before providing the details, the main theorem is described first.

\begin{theorem}
    \label{thm:upper}
    Let $\delta \in (0, 1)$, $\varepsilon \in (0, 1)$, and $\beta_t \in \calO(\sqrt{\log (\delta^{-1} N t)})$.
    Suppose that a function $f$ is on a RKHS $\calH_\calC$ and 
    a kernel $k$ is bounded, $k(\cdot, \cdot) \in [0, k_{\textrm{max}}]$.
    Then, the kernel-agnostic cumulative regret of combinatorial Bayesian optimization 
    is upper-bounded with a probability at least $1 - \delta$:
    \begin{equation}
        R_T \in \calO\left( \sqrt{(\sigma_n^{2} + \varepsilon^2) N T \log (\delta^{-1} N T)} \right),
    \end{equation}
    where $\sigma_n^2$ is the variance of observation noise and 
    $N$ is the cardinality of $\calC$.
\end{theorem}

The intuition of \thmref{thm:upper} is three-fold.
First, it is a sublinear cumulative regret bound, since $\lim_{T \to \infty} R_T / T = 0$, 
which implies that the average regret $R_T / T$ vanishes as $T$ goes to infinity.
Second, importantly, \thmref{thm:upper} shows if $N$ increases, the upper bound of cumulative regret 
increases.
It means that we require additional time to converge the cumulative regret 
if the cardinality of $\calC$ is huge.
Moreover, our regret bound is related to a distortion level $\varepsilon^2$ by a random map and an observation noise level $\sigma_n^2$, 
which implies that small $\varepsilon^2$ and $\sigma_n^2$ yield a tighter upper bound than large values.
See the discussion section for the details of this theorem.

\subsection{On \thmref{thm:upper}}

From now on, we introduce the lemmas required to prove the main theorem.

\begin{lemma}
    \label{lem:bound_mean_std}
    Let $\delta \in (0, 1)$ and $\beta_t = \calO(\sqrt{\log (\delta^{-1} N t)})$ at iteration $t$.
    Under~\lemref{lem:random} and the properties mentioned above, 
    the following inequality is satisfied:
    \begin{equation}
        \lvert f(\bc) - \mu_t(\bx)\rvert \leq \beta_t \sigma_t(\bx),
        \label{eqn:f_mu_sigma}
    \end{equation}
    for all $\bx \in \widehat{\calX}$, where $\widehat{\calX}$ is a subset of the convex polytope $\calX$, 
    with a probability at least $1 - \delta$.
    In addition, using \eqref{eqn:f_mu_sigma}, 
    an instantaneous regret $r_t$ is upper-bounded by $2 \beta_t \sigma_t(\bx_t^\dagger)$,
    where $\bx_t^\dagger$ is a query point at $t$.
\end{lemma}

\begin{proof}
\lemref{lem:bound_mean_std} is originally proved in \citep{SrinivasN2010icml}, 
and discussed more in \citep{ScarlettJ2018icml};
see Lemma~1 in~\citep{ScarlettJ2018icml} for more details.
By \eqref{eqn:f_mu_sigma}, 
the upper bound of instantaneous regret can be straightforwardly proved 
by the definition of GP-UCB and \lemref{lem:bound_mean_std}:
\begin{align}
    r_t &= f(\phi^{-1}(\bx_t^\dagger)) - f(\bc^\star) \nonumber\\
    &\leq f(\phi^{-1}(\bx_t^\dagger)) - \mu_t(\bx_t^\dagger) + \beta_t \sigma_t(\bx_t^\dagger) \nonumber\\
    &\leq 2 \beta_t \sigma_t(\bx_t^\dagger),
\end{align}
which concludes this proof.
\end{proof}

\begin{lemma}
    \label{lem:mutual}
    Let $\epsilon$ be an observation noise, which is sampled from $\calN(0, \sigma_n^2)$.
    Inspired by~\citep{SrinivasN2010icml}, we define a mutual information 
    between noisy observations and function values over $\bC_t$,
    \begin{equation}
        \mutual(\by_t; \bff_t) = I([y_1, \ldots, y_t]; [f(\bc_1), \ldots, f(\bc_t)]).
    \end{equation}
    Suppose that the mutual information is upper-bounded by the maximum mutual information 
    determined by some $\bC^\ast \subset \calC$, which satisfies $\lvert \bC^\ast \rvert = t$:
    \begin{equation}
        \mutual(\by_t; \bff_t) \leq \mutual(\by^\ast; \bff^\ast),
    \end{equation}
    where $\by^\ast$ and $\bff^\ast$ are noisy function values and true function values over $\bC^\ast$, 
    respectively.
    Then, the mutual information $\mutual(\by_t; \bff_t)$ is upper-bounded by the entropy over $\by^\star$,
    \begin{equation}
        \mutual(\by_t; \bff_t) \leq H(\by^\star),
    \end{equation}
    where $\by^\star$ is noisy function values for $\bC^\star \subset \calC$ that satisfies $\lvert \bC^\star \rvert = N$.
\end{lemma}

\begin{proof}
Given an objective function $f$ and a variance of observation noise $\sigma_n^2$, 
we consider the mutual information 
between noisy observations and function values over $\bC_t$,
\begin{equation}
    \mutual(\by_t ; \bff_t) = I([y_1, \ldots, y_t]; [f(\bc_1), \ldots, f(\bc_t)]),
\label{eqn:sup_mutual}
\end{equation}
where $y_i = f(\bc_i) + \epsilon$ and $\epsilon \sim \calN(0, \sigma_n^2)$, for $i \in [t]$.
As discussed in~\citep{SrinivasN2010icml}, 
the mutual information \eqref{eqn:sup_mutual} is upper-bounded by the maximum mutual information 
determined by some $\bC^\ast \subset \calC$, which satisfies $\lvert \bC^\ast \rvert = t$:
\begin{equation}
    \mutual(\by_t; \bff_t) \leq \mutual(\by^\ast; \bff^\ast),
\end{equation}
where $\by^\ast$ and $\bff^\ast$ are noisy function values and true function values for $\bC^\ast$, 
respectively.
By the definition of mutual information, the following inequality is satisfied:
\begin{equation}
    \mutual(\by^\ast; \bff^\ast) \leq H(\by^\ast),
\end{equation}
where $H$ is an entropy.
Then, since $H(\by^\star) = H(\by^\ast) + H(\by^\star \cap (\by^\ast)^c)$ 
where $\by^\star$ is noisy function values for $\bC^\star = \calC$, 
the mutual information \eqref{eqn:sup_mutual} is upper-bounded by the entropy over $\by^\star$, 
\begin{equation}
    \mutual(\by_t; \bff_t) \leq H(\by^\star),
\end{equation}
which concludes the proof of this lemma.
\end{proof}

Using the lemmas described above, the proof of \thmref{thm:upper} can be provided.
For notational simplicity, $\bx_t^\dagger$ 
and $k(\bx, \bx)$ 
are written as $\bx_t$ and $k_{\textrm{max}}$, respectively.

\begin{proof}
    A cumulative regret $R_T$ can be described as below:
    \begin{align}
        R_T &= \sum_{t = 1}^T r_t \nonumber\\
        &\leq 2 \beta_T \sum_{t = 1}^T \sigma_t(\bx_t) \nonumber\\
        &\leq 2 \beta_T \left( T\sum_{t = 1}^T \sigma_t^2(\bx_t) \right)^{\frac{1}{2}},
        \label{eqn:R_T}
    \end{align}
    by \lemref{lem:bound_mean_std} and the Cauchy--Schwarz inequality.
    By \lemref{lem:random}, we know a covariate $\bx_t \in \calX$ 
    is distorted within 
    a factor of $(1 \pm \varepsilon)$ from the original space $\calC$.
    Here, from this proposition, we assume $\bx_t$ has an input noise, sampled from $\calN(\mathbf{0}, \varepsilon^2 \mathbf{I})$.
    As discussed by \cite{MchutchonA2011neurips}, 
    under this assumption and the Taylor expansion up to the first-order term, 
    the term $\bK(\bX, \bX) + \sigma_n^2 \bI$ in 
    the posterior mean and variance functions~\citep{RasmussenCE2006book} 
    is replaced with
    \begin{equation}
        \bK(\bX, \bX) + \sigma_n^2 \bI + \varepsilon^2 \textrm{diag} \left(\Delta_{\widehat{f}} \Delta_{\widehat{f}}^\top \right),
    \end{equation} 
    where $\bX \in \bbR^{t \times d}$ is the previously observed inputs, 
    $\Delta_{\widehat{f}} \in \bbR^{t \times d}$ is a derivative matrix of a surrogate model $\widehat{f}$ over $\bX$,\footnote{Gaussian process is at least once differentiable with at least once differentiable kernels.} 
    and $\textrm{diag}(\cdot)$ returns a diagonal matrix;
    see the work of \cite{MchutchonA2011neurips} for the details.
    Using this, 
    \eqref{eqn:R_T} is 
    \begin{align}
        R_T &\leq 2 \beta_T \sqrt{ 4 (\sigma_n^{2} + c \varepsilon^2)} \nonumber\\
        & \quad\times \left( T \sum_{t = 1}^T \frac{1}{2} \log(1 + (\sigma_n^{2} + c \varepsilon^2)^{-1} \sigma_t^2(\bx_t)) \right)^{\frac{1}{2}} \nonumber\\
        &\leq 4 \beta_T \sqrt{(\sigma_n^{2} + c \varepsilon^2) T \, \mutual(\by^\ast; \bff^\ast)} \nonumber\\
        &\leq 4 \beta_T \sqrt{(\sigma_n^{2} + c \varepsilon^2) T \, H(\by^\star)},
        \label{eqn:R_T_entropy}
    \end{align}
    where $c = \max(\textrm{diag}(\Delta_{\widehat{f}} \Delta_{\widehat{f}}^\top))$, 
    by \lemref{lem:mutual}, Lemma~5.3 in~\citep{SrinivasN2010icml}, 
    and $\alpha \leq 2 \log(1 + \alpha)$ for $0 \leq \alpha \leq 1$.
    Finally, by the definition of entropy and Hadamard's inequality, 
    \eqref{eqn:R_T_entropy} is expressed as
    \begin{align}
        R_T &\leq 4 \beta_T \sqrt{(\sigma_n^{2} + c \varepsilon^2) T \log \lvert 2 \pi e \Sigma \rvert} \nonumber\\
        &\leq 4 \beta_T \sqrt{(\sigma_n^{2} + c \varepsilon^2) N T \log (2 \pi e k_{\textrm{max}})} \nonumber\\
        &\in \calO \left( \beta_T \sqrt{(\sigma_n^{2} + \varepsilon^2) N T} \right) \nonumber\\
        &= \calO \left( \sqrt{(\sigma_n^{2} + \varepsilon^2) N T \log (\delta^{-1} N T)} \right),
    \end{align}
    where $N$ is the cardinality of $\calC$ and $k_{\textrm{max}}$ is the maximum value of kernel $k$.
\end{proof}


\section{Discussion}\label{sec:discussion}

In this section we provide more detailed discussion on combinatorial Bayesian optimization and our methods.

\paragraph{Acquisition function optimization.}
Unlike the existing methods and other baselines, 
CBO-Recon and CBO-Lookup should find a query point 
on convex polytope $\calX$.
If a random matrix $\bR$ is constructed by a distribution supported on a bounded interval, 
$\calX$ is naturally compact.
Thus, CBO-Recon and CBO-Lookup optimize an acquisition function on 
the convex space determined by $\bR$.

\paragraph{Rounding to binary variables.}
To recover a covariate on $\bbR^d$ to $\calC$, 
rounding to binary variables is required in REMBO and CBO-Recon as well as other existing methods.
These methods must choose a proper threshold between 0 and 1 
after scaling them between that range~\citep{HutterF2011lion,WangZ2016jair}, 
but choosing a rule-of-thumb threshold is infeasible because an objective is unknown 
and we cannot validate the threshold without direct access to the objective.
On the contrary, CBO-Lookup does not require any additional hyperparameter 
for rounding to binary variables.

\paragraph{$L_1$ regularization for sparsity.}
As proposed and discussed in~\citep{BaptistaR2018icml,OhC2019neurips}, 
the objective for combinatorial optimization is penalized by 
$L_1$ regularization.
This regularization technique is widely used in machine learning 
in order to induce sparsity~\citep{TibshiraniR1996jrssb} and avoid over-fitting.
In this discussion, we rethink this sparsification technique for combinatorial structure.
A representation on the Hamming space is not always sparse, 
because, for example, a solution can be $[1, 1, \ldots, 1]$, which is not sparse.

\paragraph{Complexity analysis.}
The time complexity of common Bayesian optimization is $\calO(T^3)$ 
where $T$ is the number of iterations (i.e., we assume the number of observations is proportional to 
$T$, because it evaluates a single input at every iteration), due to the inverse of covariance matrix.
Our CBO-Lookup holds the time complexity of common Bayesian optimization, 
and moreover with the Cholesky decomposition it can be reduced 
to $\calO(T^3 / 6)$~\citep{RasmussenCE2006book}.
Compared to BOCS proposed by~\cite{BaptistaR2018icml}, 
our algorithm has lower complexity than the BOCS variants.
The CBO-Lookup algorithm requires the space complexity $\calO(md)$, 
because it constructs a lookup table.
However, we can speed up this pre-processing stage 
by preemptively constructing the table before the optimization process 
and searching the table with a hash function~\citep{NayebiA2019icml}.
In practice, the combinatorial space of the order of $2^{24}$ requires less than 2 GB.

\paragraph{Regret bound.}
As described in the analysis section, \thmref{thm:upper} supports our intuition 
in terms of $\varepsilon^2$, $\sigma_n^2$, $\delta$, $N$, and $T$.
However, since we prove the cumulative regret bound for any kernel 
that is bounded and at least once differentiable, 
it is considered as a loose bound.
According to other theoretical studies~\citep{SrinivasN2010icml,ChowdhurySR2017icml,ScarlettJ2018icml}, 
this kernel-agnostic bound is readily expanded into a tighter bound, assuming a specific kernel.
To focus on the problem of combinatorial Bayesian optimization itself and moderate the scope of this work, 
the analysis of a tighter bound is left for future work.

\section{Experiments}\label{sec:experiments}

We conduct experiments on the following combinatorial optimization problems: 
thumbs-up maximization, seesaw equilibrium, binary quadratic programming, 
Ising model sparsification, and subset selection.
We first introduce the experimental setups.

\subsection{Experimental setup}

To optimize objectives over combinations, 
we use the following combinatorial Bayesian optimization strategies:
\begin{itemize}
    \item Random: It is a simple, but practical random search strategy~\citep{BergstraJ2012jmlr}.
    \item Bin-AA: It uses a surrogate model with AA kernel~\citep{AitchisonJ1976biometrika}, over binary variables.
    \item Bin-Round: The surrogate model is defined on $[0, 1]^m$ and acquired points are determined by rounding. The threshold for rounding is $0.5$.
    \item Dec-Round: Binary representation is converted to a decimal number.
    Then, common Bayesian optimization is conducted on $[0, 2^m - 1]$.
    \item SMAC: It is a sequential model-based optimization with random forests~\citep{HutterF2011lion}.
    It optimizes on binary space.
    \item BOCS-SA/BOCS-SDP: They are proposed by \cite{BaptistaR2018icml}. We use their official implementation and follow their settings. Unless otherwise specified, $\lambda$ is set to zero.
    \item COMBO: This approach~\citep{OhC2019neurips} constructs a surrogate function on the space of Cartesian product of graphs. Each graph corresponds to a categorical variable and a diffusion kernel over graphs is used to measure the similarity between combinations.
    \item REMBO/CBO-Recon: These strategies find the next query point on $\bbR^d$ and recover it to a combination with rounding.
    To round to a binary variable, thresholds for REMBO and CBO-Recon are set to $0.25$ and $0.02$, respectively. If it is not specified, we use $d = 20$.
    \item CBO-Lookup: It determines the next query point after mapping a combination to $\bbR^d$, 
    and recovers it to a combination with a pre-defined lookup table. We use the same setting of REMBO and CBO-Recon for $d$.
\end{itemize}

To compare all the methods fairly, we set the same initializations across all the methods, 
by fixing a random seed for the same runs.
The number of initializations is set to 1 for most of the experiments.
Gaussian process regression with Mat\'ern 5/2 kernel is employed as a surrogate model 
for Bin-Round, Dec-Round, REMBO, CBO-Recon, and CBO-Lookup.
GP-UCB is used as the acquisition function for Bin-AA, REMBO, CBO-Recon, and CBO-Lookup.
For COMBO, we use the official implementation provided by~\cite{OhC2019neurips},
but we modify some setups, e.g., initial points, for fair comparisons.
The expected improvement criterion is applied for Bin-Round and Dec-Round.
The hyperparameters of Gaussian process regression are optimized by marginal likelihood maximization.
All the experiments are repeated 10 times unless otherwise specified.

\begin{figure}[t]
\centering
    \subfigure[Thumbs-up (20D)]{
        \includegraphics[width=0.225\textwidth]{figures/thumbsup_20.pdf}
        \label{subfig:thumbsup}
    }
    \subfigure[Seesaw (24D)]{
        \includegraphics[width=0.225\textwidth]{figures/seesaw_24.pdf}
        \label{subfig:seesaw}
    }
    \caption{Bayesian optimization results on thumbs-up maximization and seesaw equilibrium. All the experiments are repeated 10 times.}
    \label{fig:exp_1}
\end{figure}

\begin{figure}[t]
\centering
    \subfigure[BQP (10D, $\lambda = 1.0$)]{
        \includegraphics[width=0.225\textwidth]{figures/bqp_10_10_0_1_0.pdf}
        \label{subfig:bqp_1}
    }
    \subfigure[Ising (24D, $\lambda = 1.0$)]{
    	\includegraphics[width=0.225\textwidth]{figures/ising_16_1_0.pdf}
	\label{subfig:ising_2}
    }
    \caption{Bayesian optimization results on binary quadratic programming and Ising model sparsification. All the experiments are repeated 10 times.}
    \label{fig:exp_2}
\end{figure}

\begin{figure}[t]
	\centering
	\subfigure[Sparse regression]{
		\includegraphics[width=0.225\textwidth]{figures/subset_24_0_01_1_0.pdf}
		\label{subfig:subset_2}
	}
	\subfigure[Sparse classification]{
		\includegraphics[width=0.225\textwidth]{figures/subset_20.pdf}
		\label{subfig:subset_3}
	}
	\caption{Results on subset selection, where $\rho = 0.01$ and $\nu = 1.0$. Dotted lines are the ones with $L_1$ regularization. All the experiments are repeated 10 times.}
	\label{fig:exp_3_4_5}
\end{figure}

\subsection{Thumbs-up maximization}
This experiment is an artificial problem of maximizing the number of thumbs-up 
where the total number of thumbs-up and thumbs-down is fixed as $m$.
As shown in \figref{subfig:thumbsup}, CBO-Lookup performs well compared to other methods.

\subsection{Seesaw equilibrium}
It is a combinatorial problem for balancing the total torques of weights on a seesaw, 
which has $m / 2$ weights on the left side of seesaw and the other $m / 2$ weights on the right side,
where $m$ is an even number.
The weight $w_i$ is uniformly chosen from $[0.5, 2.5]$ for all $i \in \{0, \ldots, m - 1\}$.
Each torque for $w_i$ is computed by $r_i w_i$ where $r_i$ is the distance from the pivot of seesaw, 
and we set a distance between $w_i$ and the pivot as $r_i = i - m / 2$ 
for the weights on the left side and $r_i = i - m / 2 + 1$ for the weights on the right side.
As presented in \figref{subfig:seesaw}, CBO-Lookup achieves the best result in this experiment.

\subsection{Binary quadratic programming}
This experiment is to minimize an objective of binary quadratic programming problem 
with $L_1$ regularization,
\begin{equation}
f(\bc) = \phi(\bc)^\top \mathbf{Q} \phi(\bc) + \lambda \| \phi(\bc) \|_1,
\end{equation}
where $\phi$ is a mapping function to binary variables, 
$\mathbf{Q} \in \bbR^{m \times m}$ is a random matrix, 
and $\lambda$ is a coefficient for $L_1$ regularization.
We follow the settings proposed by \cite{BaptistaR2018icml}.
Similar to thumbs-up maximization and seesaw equilibrium, 
CBO-Lookup shows consistent performance compared to other baselines; see \figref{subfig:bqp_1}.

\subsection{Ising model sparsification}
This problem finds an approximate distribution $q(\phi(\bc))$, 
which is close to
\begin{equation}
    p(\phi(\bc)) = \frac{\exp(\phi(\bc)^\top J^p \phi(\bc))}{\sum_{\bc'} \exp(\phi(\bc')^\top J^p \phi(\bc'))},
\end{equation}
where $\phi$ is a mapping function to $\{-1, 1\}^m$ 
and $J^p$ is a symmetric interaction matrix for Ising models.
To find $q(\phi(\bc))$, we compute and minimize the Kullback-Leibler divergence between two distributions 
in the presence of $L_1$ regularization.
Our implementation for this experiment refers to the open-source repository of \cite{BaptistaR2018icml}.
CBO-Lookup is comparable to or better than other methods, as shown in \figref{subfig:ising_2}.

\subsection{Subset selection}
A regularization technique such as Lasso~\citep{TibshiraniR1996jrssb} has been 
widely used to solve this problem, where some variables are less correlated to the other variables, 
but in this experiment we solve this variable selection using combinatorial Bayesian optimization.
Our target tasks are categorized as two classes: 
(i) a sparse regression task, i.e., \figref{subfig:subset_2}, 
and (ii) a sparse classification task, i.e., \figref{subfig:subset_3}~\citep{HastieT2015book}.
For a fair comparison, we split the datasets into training, validation, and test datasets.
The validation dataset is used to select variables using Bayesian optimization, 
and the evaluation for the selected variables of test dataset is reported.

To conduct the Bayesian optimization methods on a sparse regression task, 
we follow the data creation protocol described in~\citep{HastieT2017arxiv}.
Given the number of data $n$, the dimension of data $p$, a sparsity level $s$, 
a predictor correlation level $\rho$, a signal-to-noise ratio level $\nu$, 
our regression model is defined as $\by = \bX \boldsymbol{\beta}$, 
where $\by \in \bbR^n$, $\bX \in \bbR^{n \times p}$, and $\boldsymbol{\beta} \in \bbR^p$.
We use beta-type 2~\citep{BertsimasD2016aos} with $s$ for $\boldsymbol{\beta}$ and draw $\bX$ from $\calN(\mathbf{0}, \boldsymbol{\Sigma})$ 
where $[\boldsymbol{\Sigma}]_{ij} = \rho^{\lvert i - j \rvert}$ for all $i, j \in [p]$.
Then, we draw $\by$ from $\calN(\bX \boldsymbol{\beta}, \sigma^2 \bI)$, 
where $\sigma^2 = (\boldsymbol{\beta}^\top \boldsymbol{\Sigma} \boldsymbol{\beta}) / \nu$.

For a sparse classification, we employ Mobile Price Classification dataset~\citep{SharmaA2018kaggle}, 
of which the variables are less correlated.
The base classifier for all the experiments is a support vector machine, 
and the variables selected by Bayesian optimization are used to train and test the classifier.
As presented in \figref{subfig:subset_3}, our method shows a better result than other methods 
in this experiment.

\paragraph{Comparisons with COMBO.}
As is generally known in the Bayesian optimization community, 
COMBO~\citep{OhC2019neurips} shows consistent performance in the experiments run in this work.
Since COMBO uses graph-structured representation as the representation of categorical variables, 
it directly seeks an optimal combination on graphs by considering the connectivity between variables.
This capability allows us to effectively find the optimum.
Compared to the known sophisticated COMBO method, 
CBO-Lookup also shows consistent performance across various tasks as shown in \figref{fig:exp_1}, 
\figref{subfig:bqp_1}, \figref{subfig:ising_2}, and \figref{subfig:subset_3}, 
and guarantees a sublinear cumulative regret bound with a low-distortion embedding 
as shown in~\thmref{thm:upper}.
Moreover, our method benefits from low computational complexity and easy implementation, 
by utilizing any off-the-shelf implementation of vector-based Bayesian optimization.
In addition to these, without loss of generality, our method is capable of 
optimizing a black-box objective on a high-dimensional mixed-variable space 
by embedding a high-dimensional mixed-variable space to a low-dimensional continuous space.
Unfortunately, it is unlikely that COMBO can solve the optimization problem defined on a high-dimensional mixed-variable space.

\section{Conclusion}\label{sec:conclusion}

In this work, we analyze the aspect of combinatorial Bayesian optimization 
and highlight the difficulty of this problem formulation.
Then, we propose a combinatorial Bayesian optimization method with a random mapping function, 
which guarantees a sublinear cumulative regret bound.
Our numerical results demonstrate that our method shows satisfactory performance compared to other existing methods 
in various combinatorial problems.

\begin{contributions}
J. Kim led this work, suggested the main idea of this paper, implemented the proposed methods, and wrote the paper.
S. Choi and M. Cho are co-corresponding authors and the order of them is randomly determined.
S. Choi and M. Cho advised this work, improved the proposed methods, and wrote the paper.
Along with these contributions, S. Choi introduced the fundamental backgrounds of this work,
and M. Cho substantially revised the paper.
\end{contributions}

\begin{acknowledgements}
This work was supported by Samsung Research Funding \& Incubation Center of Samsung Electronics under Project Number SRFC-TF2103-02.
\end{acknowledgements}

\bibliography{kim_37}

\end{document}
