
\section{Theoretical Analysis}\label{sec: analysis}
\begin{figure*}
  \centering
    {
    \includegraphics[trim={8cm 2cm 7cm 2.5cm}, width=.85\textwidth]
    {./fig/simple_regret_scan_uai.pdf}
    }
\caption{In the first row, black dots and purple dots show the infeasible region and the feasible region, respectively. Each column corresponds to a certain threshold choice for the single constraint $c(\instance) = |\instance+0.7|^{1/2}$ in the Rastrigin-1D-1C task. The search space contains a certain portion of the feasible region, denoted on each figure and title. The first row shows the distribution of 1000 samples from the noise-free objective function, and the figures differ in their feasible regions. The second row shows the corresponding simple regret curves. We test each method with 15 independent trials and impose observation noise sampled from $\normal{(0, 0.1)}$, which is not shown in the first row. The scaling and length scale of the GPs are learned via maximum likelihood estimation.
}   
\label{fig:exps:scan_res}
\end{figure*}
We first state a few assumptions that provide insights into the convergence properties of \algname. 
The first one follows \citet{srinivas2009gaussian} as a standard assumption for BO.
\begin{assumption} \label{apt: sample_gp}
The objective and constraints are sampled from independent Gaussian processes. Formally, for all $t < T$ and $\instance \in \searchSpace$, $f(\instance)$ is a sample from $\mathcal{GP}_{\globalf, t}$, and $\cFunc_\conIdx (\instance)$ is a sample from $ \GP_{\cFunc_\conIdx, t} $, for all $\conIdx\in\conSpace$.
\end{assumption}
\revise{The second one assumes that the global optimum lies inside the feasible region, for the reason discussed in \remref{rem: interior}. }

\begin{assumption} \label{apt: exist_star}
A global optimum exists within the feasible region. The distance between this global optimum and the boundaries of the feasible regions is uniformly bounded below by $\epsilon_{\cFunc}$. More specifically, for all $\conIdx\in\conSpace$, there exists $\epsilon_\conIdx > 0$ such that $\cFunc_\conIdx(\instance^*) > \epsilon_{\conIdx}$; hence, it holds that $\cFunc_\conIdx(\instance^*) > \epsilon_{\cFunc} = \min_{\conIdx\in\conSpace}\epsilon_{\conIdx}$.
\end{assumption}
\revise{We will also show that without \assref{apt: exist_star}, it is possible to bound both the constraint violations and the regret---defined independently of feasibility---with minor adjustments as discussed in \remref{rem: boundary}. }

\begin{assumption} \label{apt: mono_ci}
Given a proper choice of $\beta_t$ that is non-increasing, the confidence intervals are consistent. Concretely, $\forall t_1 < t_2 < T$ and $\instance \in \searchSpace$, if $\beta_{t_1} \geq \beta_{t_2}$, then $\UCBit_{t_1}(\instance) \geq \UCBit_{t_2}(\instance)$ and $\LCB_{t_1}(\instance) \leq \LCB_{t_2}(\instance)$.
\end{assumption}

This is a mild assumption as long as $\beta_t$ is non-increasing, given recent work by \citet{koepernik2021consistency} showing that if the kernel is continuous and the sequence of sampling points is sufficiently dense, the variance of the posterior \GP converges to zero almost surely monotonically if the function is defined on a metric space.
If the assumption is violated, the technique of taking the intersection of all historical confidence intervals introduced by \citet{gotovos2013active} could similarly guarantee a monotonically shrinking confidence interval. That is, whenever there exist $t_1 < t_2 < T$ and $\instance\in\searchSpace$ such that $\UCBit_{t_1}(\instance) < \UCBit_{t_2}(\instance)$ or $\LCB_{t_1}(\instance) > \LCB_{t_2}(\instance)$, we let $\UCBit_{t_2}(\instance) = \UCBit_{t_1}(\instance)$ or $\LCB_{t_2}(\instance) = \LCB_{t_1}(\instance)$, respectively, to guarantee monotonicity. To allow for a plug-in of the intersection technique, and without loss of accuracy, we keep using the notation $\UCB$ and $\LCB$ without further parsing the value in the following discussion of algorithm design and theoretical analysis. The cost of violating \assref{apt: mono_ci} has been studied in Corollary 3 of \citet{zhang2023learning}. We refrain from repeating the analysis here.

The following lemma justifies the definition of the region(s) of interest $\roi_{t}$ defined in \eqref{eq:roi}. {For clarity, we denote $\discreteROI = \discreteSet \cap \roi_t$, and $CI_{\globalf^*, t } = [\max_{\instance \in \discreteROI}\LCB_{t}(\instance),
\max_{\instance \in \discreteROI}\UCBit_{t}(\instance)]$. }
{
\begin{lemma}\label{lem: roi}
Under the assumptions above, the regions of interest $\roi_{t}$, as defined in \eqref{eq:roi}, contain the global optimum with high probability. Formally, for all $\delta \in (0,1)$, $T \geq t\geq 1$, and any finite discretization $\discreteSet$ of $\searchSpace$ that contains the optimum $\instance^* = \argmax_{\instance\in \searchSpace}f(\instance)$ where $\cFunc_\conIdx(\instance^*) > \epsilon_{\cFunc}$ for all $\conIdx\in\conSpace$ and $\beta_t=2\log(2(\conNum+1)\vert \discreteSet \vert \pi_t/ \delta)$ with $\sum_{T\geq t\geq 1}\pi_t^{-1} = 1$,  we have $\Pr{\instance^* \in \discreteROI} \geq 1-\delta$.
\end{lemma}
}

To guarantee that $\beta_t$ is non-increasing, we could let $\pi_t = T$, and therefore $\beta=2\log(\frac{2(\conNum+1)\vert \discreteSet \vert T}{\delta})$ is a constant. The lemma shows that with a proper choice of prior and $\beta$,
the $\roi_{\globalf, t}$ remains nonempty during optimization.

Next, we define the maximum information gain about function $f$ after $T$ rounds:
$\maxInfo_{f, T} = \max_{\actionSet\subset \discreteSet: \vert \actionSet \vert=T}{\mutualinfo{y_\actionSet; f_\actionSet}}$ and
\begin{equation}\label{eq:gammaT}
    \widehat{\maxInfo_T} = \sum_{g \in \{\globalf\}\cup \{\cFunc_\conIdx\}_{\conIdx\in\conSpace}}{\maxInfo_{g, T}}.
\end{equation}

In the following, we show that we could bound the simple regret of $\algname$ after sufficient rounds. Concretely, in \thmref{thm: width}, we provide an upper bound on the width of the confidence interval for the global optimum $f^*=f(\instance^*)$.



{
\begin{theorem}\label{thm: width}
 Under the aforementioned assumptions, with a constant $\beta=2\log(\frac{2(\conNum+1)\vert \discreteSet \vert T}{\delta})$ and the acquisition function from $\algoref{alg:main}$, there exists an $\epsilon_\globalf \leq \epsilon_\cFunc$, such that after at most $T \geq \frac{\beta \widehat{\maxInfo_T} C_1}{\epsilon_\globalf^2}$ iterations, we have $\Pr{\vert CI_{\globalf^*, T}\vert \leq \epsilon_\globalf, \globalf^* \in CI_{\globalf^*, T }} \geq 1 - \delta$.
    Here, $C_1 = 8/\log(1+\sigma^{-2})$.
\end{theorem}
}

Note that $\beta \widehat{\maxInfo_T} C_1$ is sublinear with respect to $T$.
One direct result of \thmref{thm: width} is that if every point in $\discreteSet$ that lies in the feasible set defined by the unknown constraints, except for the global optimum, has a suboptimality gap in the reward, then after sufficiently many queries, the algorithm will identify $\instance^*$ as the only point in the ROI. In that case, \algname will only query $\instance^*$ and achieve zero regret afterward.


\begin{cor}\label{cor: zero-regret}
    We assume the aforementioned conditions hold, and $\forall \instance \in \discreteSet$, when $ \forall \conIdx\in\conSpace$, $\cFunc_\conIdx(\instance) > 0$, $\instance \neq \instance^*$, it holds that $\exists \epsilon_\cFunc \geq 2\epsilon_\globalf > 0$, $\globalf^* - \globalf(\instance) > 2\epsilon_\globalf$. In addition, we use $\beta=2\log(\frac{2(\conNum+1)\vert \discreteSet \vert T}{\delta})$ and the acquisition function from $\algoref{alg:main}$. After at most $t \geq \frac{\beta \widehat{\maxInfo_t} C_1}{{\epsilon}_{\globalf}^2}$ iterations, we have $\Pr{\regret_{t} = 0} \geq 1-\delta$. Here, $C_1 = 8/\log(1+\sigma^{-2})$ and $t \leq T$.
\end{cor}

Similarly, if a group of suboptimal candidates lies in the feasible area and is sufficiently close to $\instance^*$, then \assref{apt: exist_star} also holds for those suboptimal points. Under this condition, the algorithm achieves a sublinear cumulative regret after identifying this near-optimal region.

\begin{cor}\label{cor: cum-regret}
    We assume the aforementioned conditions hold, and $\forall \instance \in \discreteSet$, when $ \forall \conIdx\in\conSpace$, $\cFunc_\conIdx(\instance) > 0$, $\instance \neq \instance^*$, $\exists \epsilon_\cFunc \geq \epsilon_\globalf > 0$, $\globalf^* - \globalf(\instance) \leq 2\epsilon_\globalf$, it holds that $\forall \conIdx\in\conSpace$, $\cFunc_\conIdx(\instance) \geq \epsilon_{\cFunc}$. In addition, we use $\beta=2\log(\frac{2(\conNum+1)\vert \discreteSet \vert T}{\delta})$ and the acquisition function from $\algoref{alg:main}$. After at most $t' \geq \frac{\beta \widehat{\maxInfo_{t'}} C_1}{{\epsilon}_{\globalf}^2}$ iterations, we have $\Pr{\sum_{t=t'}^{T}\bigl(\reward(\instance^*) - \reward(\instance_t)\bigr) \leq {\sqrt{(T-t')\beta\maxInfo_{T}{C_1}}}} \geq 1-\delta$. Here, $C_1 = 8/\log(1+\sigma^{-2})$ and $t' \leq T$.
\end{cor}


{
Following the proof of \thmref{thm: width}, with \lemref{lem:acqBound}, we can show that the algorithm can identify infeasibility when all points in the search space violate at least one of the constraints by at least $\epsilon'_{\cFunc}$. Concretely, if for all $\instance \in \searchSpace$ there exists $\conIdx\in\conSpace$ such that $\cFunc_\conIdx(\instance) < -\epsilon'_{\cFunc}$, then with high probability the identified $\discreteSet_{\roi_T} = \emptyset$.
}

\begin{cor}\label{cor: feasibility}
    When the assumptions except for \assref{apt: exist_star} hold, $\forall \instance \in \searchSpace$, if $\exists \conIdx\in\conSpace$, $\cFunc_\conIdx(\instance) < -\epsilon'_{\cFunc}$, then with a constant $\beta=2\log(\frac{2(\conNum+1)\vert \discreteSet \vert T}{\delta})$ and the acquisition function from $\algoref{alg:main}$, after at most $T \geq \frac{\beta \widehat{\maxInfo_T} C_1}{{\epsilon'}_{\cFunc}^2}$ iterations, we have $\Pr{\discreteSet_{\roi_T} = \emptyset} \geq 1 - \delta$. Here, $C_1 = 8/\log(1+\sigma^{-2})$.
\end{cor}


The above algorithm and theoretical results assume that a discretization $\discreteSet$ is given, but are compatible with any density of the discretization. This means that with additional assumptions on the underlying functions, we could adapt the algorithm to a continuous setting by taking a sufficiently dense discretization on a proper embedding space.\footnote{%
With additional assumptions on the regularity of the underlying function, we derive the analogous analysis on continuous search space in \appref{sec:continuous}.%
}

\revise{
\begin{rem}\label{rem: boundary}
    If the goal is to find the boundary optimum despite the feasibility concerns highlighted in \remref{rem: interior}, a practical approach is to uniformly shift the constraints by a small amount ${\epsilon}_{\cFunc}$ to satisfy \assref{apt: exist_star} with the modified constraints. Formally,  $ \forall \conIdx\in\conSpace$, $\cFunc'_\conIdx(\instance) = \cFunc_\conIdx(\instance) + {\epsilon}_{\cFunc}$. Then, running \algname with these adjusted constraints, $\cFunc'_\conIdx$, instead of the original $\cFunc_\conIdx$, yields similar guarantees as those in \thmref{thm: width} and \corref{cor: cum-regret}, with a high probability that any instantaneous violations of the original constraints are uniformly bounded by ${\epsilon}_{\cFunc}$. Further details are discussed in \appref{sec:boundary}.
\end{rem}}

