% \documentclass{uai2025} % for initial submission

\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
% % There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
%                                          Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
%                                           ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

% Choose your variant of English; be consistent

% \usepackage[british]{babel}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
% \usepackage{siunitx} % for proper typesetting of numbers and units

\usepackage[american]{babel}
\usepackage[utf8]{inputenc}
% \usepackage[hidelinks]{hyperref}
\usepackage{multirow}
\usepackage{url}
\usepackage{algorithm}
\usepackage{algpseudocode}
\usepackage{physics}
\usepackage{xcolor}
\usepackage{amsmath, amsthm, amsfonts, amssymb}
\usepackage{mathtools}
\usepackage{graphicx}
\usepackage{thmtools}
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{subcaption}

% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

% Self-defined macros
\DeclareMathOperator*{\argmax}{argmax}
\DeclareMathOperator*{\argmin}{argmin}
\DeclareMathOperator*{\supremum}{sup}
\theoremstyle{plain}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{condition}[theorem]{Condition}
\newtheorem{corollary}[theorem]{Corollary}
\theoremstyle{definition}
\newtheorem{definition}[theorem]{Definition}
\newtheorem{assumption}[theorem]{Assumption}
\theoremstyle{remark}
\newtheorem{remark}[theorem]{Remark}
\newtheorem{sublemma}{Lemma}[subsection]
\newtheorem{subcorollary}[sublemma]{Corollary}
\renewcommand{\arraystretch}{2}
\allowdisplaybreaks
\raggedbottom

\title{Black-box Optimization with Unknown Constraints via Overparameterized Deep Neural Networks}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:d.phantrong@deakin.edu.au}{Dat Phan-Trong}\thanks{Corresponding author: d.phantrong@deakin.edu.au}}
\author[2]{Hung The Tran}
\author[1]{Sunil Gupta}
% Add affiliations after the authors
\affil[1]{%
    Deakin Applied Artificial Intelligence Initiative\\
    Deakin University\\
    Australia
}
\affil[2]{%
    AI Center, VNPT Media \\
    Vietnam
}
  
\begin{document}
\maketitle

\begin{abstract}
Optimizing expensive black-box functions under unknown constraints is a fundamental challenge across a range of real-world domains, such as hyperparameter tuning in machine learning, safe control in robotics, and material or drug discovery. In these settings, each function evaluation may be costly or time-consuming, and the system may need to operate within unknown or difficult-to-specify safety boundaries. We apply the Expected Improvement (EI) acquisition function to select the next samples within a feasible region, determined by Lower Confidence Bound (LCB) conditions for all constraints. The LCB approach guarantees constraint feasibility, while EI efficiently balances exploration and exploitation, especially when the feasible regions are much smaller than the overall search space. To model both the objective function and constraints, we use Deep Neural Networks (DNNs) instead of Gaussian Processes (GPs) to improve scalability and handle complex structured data. We provide a theoretical analysis showing our method's convergence using recent Neural Tangent Kernel (NTK) theory. Under regularity conditions, both cumulative regret and constraint violation are bounded by the maximum information gain, with equivalent upper bounds to GP-based methods. To validate our algorithm, we conduct experiments on synthetic and real-world benchmarks, showing its benefit over recent methods in black-box optimization with unknown constraints.
\end{abstract}

\section{Introduction}
\label{sec:intro}
Global optimization of expensive black-box functions (Black-box Optimization, or BO) is a ubiquitous challenge in machine learning, control systems, and material design fields. These tasks often involve non-convex, multi-modal, and costly-to-evaluate functions, necessitating efficient exploration of the search space. Bayesian Optimization has emerged as a widely adopted model-based approach to address this. Bayesian Optimization builds a surrogate model, typically a Gaussian Process (GP), to approximate the unknown objective function from observed data points. This model guides the selection of new points, balancing exploration (sampling uncertain regions) and exploitation (focusing on promising areas). Classical techniques within Bayesian Optimization include Probability of Improvement (PI) \citep{kushner1964new}, Expected Improvement \citep{mockus1978application}, Gaussian Process Upper Confidence Bound (GP-UCB) \citep{srinivas2009gaussian}, and information-theoretic approaches such as Entropy Search (ES) \citep{hennig2012entropy} and Predictive Entropy Search (PES) \citep{hernandez2014predictive}.

As real-world problems often involve constraints that are also black-box in nature, Constrained Black-box Optimization (CBO) has become a vital extension of BO. CBO methods adjust the acquisition function to account for these constraints, seeking feasible solutions that satisfy the conditions while optimizing the objective function. A prominent method in CBO is the Expected Improvement with Constraints (cEI), first introduced by \citet{schonlau1998global} and later extended by \citet{gardner2014bayesian} and \citet{gelbart2014bayesian}. cEI integrates feasibility into the acquisition function, directing the optimization process toward regions where feasible solutions are likely. \citet{letham2019constrained} further improved cEI by using a quasi-Monte Carlo approximation to better manage observation noise, enhancing its effectiveness in noisy environments.

EI-based methods for constrained optimization face several challenges. When no feasible point exists, the EI cannot be computed, leading to modifications that focus solely on finding feasible regions, ignoring the objective function. Additionally, numerical challenges further limit some methods, such as Expected Volume Reduction (EVR) and Integrated Expected Conditional Improvement (IECI), to small-dimensional problems. To address these issues, alternative methods have been proposed. For example, Predictive Entropy Search with Constraints (PESC, \citealp{hernandez2015predictive}) offers a heuristic approach that selects feasible candidates directly from the search space, reducing uncertainty more effectively. However, the computational challenges associated with quadrature calculations during sampling have limited its practical applicability. Recently, \citet{takeno2022sequential} proposed a Min-Value Entropy Search method that simplifies the sampling process, making it more tractable.

Numerical optimization has also been taken into consideration as an effective tool for solving the unknown constraint problem. The idea is to reformulate constraints into simpler unconstrained problems solved through alternating iterations. The Augmented Lagrangian method is mostly used in this category. For example, \citet{gramacy2016modeling} with Augmented Lagrangian Bayesian Optimization (ALBO) and its improvement Slack-AL \citep{picheny2016bayesian} use Augmented Lagrangian Function (ALF) to formulate unconstrained surrogate problems and then solve them using EI as an acquisition function. Recently, ADMMBO \citep{ariafar2019admmbo} first applied the ADMM technique to transform the constrained problem into an equivalent unconstrained optimization, then solved an augmented Lagrangian relaxation. However, this method requires the introduction of additional variables, leading to increased computational costs.

Recent research has explored penalty functions and primal-dual methods to handle constraint violations during optimization. For example, \citet{lu2022no} introduced a penalty-function approach that adds a penalty term for constraint violations to the objective function, transforming the constrained problem into an unconstrained one. Similarly, \citet{zhou2022kernelized} proposed a primal-dual approach that balances the trade-off between optimizing the objective and minimizing constraint violations. While these methods are promising, their effectiveness is sensitive to the choice of parameters, often requiring considerable effort in parameter tuning during implementation.

Alongside empirical advancements, recent theoretical works have started to address the absence of formal guarantees in Constrained Black-box Optimization (CBO). For example, \citet{lu2022no} introduced a penalty-based regret bound that combines the regret from the objective function with penalties for constraint violations. \citet{xu2023constrained} expanded this analysis by separately evaluating cumulative regret and constraint violations. In contrast, \citet{nguyen2023optimistic} provided a theoretical performance guarantee for CBO under unknown constraints in a \textit{decoupled} setting, where cumulative regret is calculated as the sum of both objective function regret and constraint violations.



Despite the success of previous works using Gaussian Processes (GPs) to model both objective functions and constraints, GPs struggle with poor computational scalability. The kernel matrix inversion required in GP methods has cubic complexity, which increases significantly as the number of constraints grows. In contrast, Deep Neural Networks (DNNs) have become a popular alternative in various Machine Learning tasks, offering the ability to extract rich features and scale linearly with dataset size, providing a clear advantage over GPs. Recent research has explored the use of DNNs in unconstrained optimization, including black-box function optimization in continuous search spaces~\citep{snoek2015scalable} and contextual bandit problems in discrete search spaces~\citep{zhou2020neural, zhang2021neural}. However, to the best of our knowledge, the challenge of replacing GPs with neural networks for constrained optimization involving black-box, expensive constraints while providing theoretical guarantees remains largely unaddressed.



In this paper, we provide a simple approach for black-box optimization with unknown constraints using deep neural networks. Our contributions are threefold:
\begin{itemize}
    \item We propose a DNN-based black-box optimization algorithm with unknown constraints (Neural-CBO), where both the objective function and constraints are modeled using deep neural networks. We use EI as the acquisition function to find the next samples in a feasible region which is determined using Lower Confidence Bound (LCB) satisfaction conditions to all constraints. Using LCB-based conditions guarantees that the suggested regions encompass the actual feasible regions of the constraints (under our problem setting), while still allowing for constraints exploration. Meanwhile, EI efficiently balances exploration and exploitation in optimizing the objective function, especially when the feasible regions are significantly smaller than the overall search space.

    \item We provide a theoretical analysis of our proposed Neural-CBO algorithm based on recent advances in NTK theory. Under certain regularity assumptions, we show that both the cumulative regret and the cumulative constraint violation have upper
    bounds of the form $\mathcal{O}(\gamma_T\sqrt{T})$, where $\gamma_T$ is the maximum information gain. This result is comparable to previous GP-based methods. It is worth noting that our DNN models only require a network width of $m=\Omega(T)$ for convergence.
    \item We conduct benchmarking experiments on synthetic and real-world tasks to prove our algorithm's effectiveness empirically. The numerical results indicate that our algorithm achieves competitive performance with well-known approaches.
    \end{itemize}

\section{Problem Setting}
\label{ConstrainedNeuralBO:setting}
In this paper, we tackle the problem of black-box optimization, where the search space is subject to constraints imposed by other unknown functions. These constraints arise from \emph{real-valued} feedback $c_i(\mathbf{x})$, and the constraint condition $c_i(\mathbf{x})$ is satisfied if and only if $c_i(\mathbf{x}) \leq 0$.  
Formally, this problem is defined as follows:
\begin{equation*}
    \begin{split}
        & \underset{\mathbf{x} \in \mathcal{D}}{\min} f(\mathbf{x}),  \text{ subject to }  c_i (\mathbf{x})  \leq 0, \text{ for all } i = 1, \dots, K,
    \end{split}
\end{equation*}  
where $\mathcal{D} \subset \mathbb{R}^d$ is a bounded domain, and $f$ and $\{c_i\}_{i=1}^K \colon \mathbb{R}^d \rightarrow \mathbb{R}$ are unknown functions that can be evaluated at specific points. We consider this problem in a \textbf{coupled} setting, where both the objective function and constraints are evaluated simultaneously.



\section{Neural-CBO: Neural Network Based Black-Box Optimization with Unknown Constraints}
\label{section:neural_cbo}

In this section, we present Neural-CBO, a neural network-based approach to CBO. The complete algorithm is detailed in Algorithm~\ref{alg:neural_cbo}. The key innovation of Neural-CBO lies in leveraging neural networks as substitutes for GPs, traditionally used in Bayesian Optimization, to model both the black-box objective function and constraints. We first describe the structure of the neural network surrogate model, followed by our algorithm. 
\subsection{The Neural Network for an Arbitrary Function $f_a$}
\label{section:arbitrary_nn}
Given a black-box, expensive function $f_a$, we use a fully connected neural network, denoted as $a(\mathbf{x}; \mathbf{W})$, to model $f_a$:
\begin{equation}
\label{eqn:fcn}
    a(\mathbf{x}; \mathbf{W}) = \frac{\mathbf{q}^\top}{\sqrt{m}} \mathbf{D}^{(L)}(\mathbf{x}) \mathbf{W}^{(L)} \dots \frac{1}{\sqrt{m}} \mathbf{D}^{(1)}(\mathbf{x}) \mathbf{W}^{(1)} \mathbf{x}, 
\end{equation}
where $\mathbf{q} \in \mathbb{R}^m$ is the last layer weight, $\mathbf{W}^{(1)} \in \mathbb{R}^{m \times d}$, $\mathbf{W}^{(l)} \in \mathbb{R}^{m \times m}$ for $2 \leq l \leq L$ is the weight of the $l$-th hidden layer. The matrix $\mathbf{D}^{(l)}(\mathbf{x})$ is associated with the ReLU activation function and is defined as:
\begin{equation*}
    \mathbf{D}^{(l)}(\mathbf{x}) = \text{diag}\{\mathbf{1}_{ \{ \langle w_i^{(l)}, \mathbf{h}^{(l-1)}(\mathbf{x})  \rangle \ge 0 \} } \} \in \mathbb{R}^{m \times m},
\end{equation*}
% , whose $i$-th row is denoted as $w_i^{(l)}$.  

where $w_i^{(l)}$ denotes the $i$-th row of $\mathbf{W}^{(l)}$, $m$ is the number of neurons in the $l$-th hidden layer, and $\mathbf{h}^{(l)}(\mathbf{x})$ is the output of the $l$-th
layer given by 
\begin{equation*}
    \mathbf{h}^{(l)}(\mathbf{x}) = \frac{1}{\sqrt{m}} \mathbf{D}^{(l)}(\mathbf{x}) \mathbf{W}^{(l)} \dots \frac{1}{\sqrt{m}} \mathbf{D}^{(1)}(\mathbf{x}) \mathbf{W}^{(1)} \mathbf{x},
\end{equation*}
with $\mathbf{h}^{(0)}(\mathbf{x}) = \mathbf{x}$. 

At time $t=0$, each weight matrix $\mathbf{W}^{(l)},  2 \le l \le L$ is initialized as $\begin{psmallmatrix}
\boldsymbol{\Psi} & \mathbf{0}  \\
\mathbf{0} & \boldsymbol{\Psi}
\end{psmallmatrix}
$, where $\boldsymbol{\Psi}$ is a Gaussian random matrix with independent and identically distributed (i.i.d.) standard normal entries. Additionally, the outer weights $\mathbf{q} = (\hat{\mathbf{q}}, -\hat{\mathbf{q}})^\top$ are random: each entry of $\hat{\mathbf{q}}$ is set to either $-1$ or $1$ with equal probability, and $\mathbf{q}$ remains fixed throughout the training process. This initialization method is commonly employed in the literature, as seen in works like \citet{du2018gradient, arora2019fine}, and it
can be verified that, with this initialization scheme, $a(\mathbf{x}; \mathbf{W}_0) = 0$, for all input $\mathbf{x}$. 

The neural network is trained by running the stochastic gradient descent on the streaming data in \textit{one pass}. In particular, given the initialization $\{\mathbf{W}_0^{(l)} \}_{l=1}^L$ and last layer weight $\mathbf{q}$, the $l$-th layer weight matrix at the $t$-th iteration is updated by minimizing the $L_2$ loss as:
\begin{equation}
    \label{eqn:train_NN}
    \mathbf{W}_{t+1}^{(l)} = \mathbf{W}_{t}^{(l)} + \alpha_t (y_t - a(\mathbf{x}_t; \mathbf{W}_t)) \frac{\partial a(\mathbf{x}_t; \mathbf{W}_t)}{\partial \mathbf{W}^{(l)}},
\end{equation}
where $\alpha_t$ is the step size, and $\{\mathbf{x}_t, y_t\}$ is the observation at the $t$-th optimization iteration. 




% $\phi\colon \mathbb{R} \rightarrow \mathbb{R}$ is the ReLU activation function, and the weights $\mathbf{W}_1 \in \mathbb{R}^{m \times d}$, $\mathbf{W}_i \in \mathbb{R}^{m \times m}$ for $2 \leq i \leq L-1$, and $\mathbf{W}_L \in \mathbb{R}^{1 \times m}$ are the neural network parameters $\mathbf{W} \in \mathbb{R}^p$, with $p = md + m^2(L-2) + m$. The input dimension is $d$, where $\mathbf{x} \in \mathcal{D} \subset \mathbb{R}^d$, and the weights are initialized from a standard normal distribution $\mathcal{N}(0,1)$.

To estimate the uncertainty of the function $f_a$ modeled by $a(\mathbf{x}; \mathbf{W})$, we adopt the variance formula from recent advances in neural contextual bandits research ~\citep{zhou2020neural, kassraie2022neural}:
\begin{equation}
\label{eqn:neural_cbo_variance_formula}
    \sigma_{a,t}(\mathbf{x}) = \sqrt{\mathbf{g}_{a}(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}_{a, t-1}^{-1} \mathbf{g}_{a}(\mathbf{x}; \mathbf{W}_0)},
\end{equation}
where
\begin{align}
    \mathbf{g}_{a}(\mathbf{x}; \mathbf{W}) &= \nabla_{\mathbf{W}}a(\mathbf{x}; \mathbf{W}), \text{ and} \nonumber
\\
     \mathbf{U}_{a, t} &= \mathbf{U}_{a, t-1} + \mathbf{g}_{a}(\mathbf{x}_t; \mathbf{W}_0) \mathbf{g}_{a}(\mathbf{x}_t; \mathbf{W}_0)^\top
\end{align}

\subsection{Neural Tangent Kernel}
\label{section:NTK}
\begin{definition}
    Given an $L$-layer neural network $a(\mathbf{x}; \mathbf{W})$ with input $\mathbf{x}$ and parameter $\mathbf{W}$ as defined in Equation \eqref{eqn:fcn}, a Neural Tangent Kernel (NTK) matrix $\mathbf{H}_t$ for a sequence of weights $\{\mathbf{W}_t\}$ can be defined as:
    \begin{equation*}
        \mathbf{H}_t [i, j] \coloneqq \left \langle \frac{\partial a(\mathbf{x}_i; \mathbf{W}_t)}{\partial \mathbf{W}},  \frac{\partial a(\mathbf{x}_j; \mathbf{W}_t)}{\partial \mathbf{W}}\right \rangle = \sum_{l=1}^L \mathbf{H}^{(l)}_t[i, j],
    \end{equation*}
    where $\mathbf{H}^{(l)}_t [i, j] \coloneqq \left \langle \frac{\partial a(\mathbf{x}_i; \mathbf{W}_t)}{\partial \mathbf{W}^{(l)}},  \frac{\partial a(\mathbf{x}_j; \mathbf{W}_t)}{\partial \mathbf{W}^{(l)}}\right \rangle$ is the NTK from the $l$-th hidden layer, for all $1 \le i, j \le T$.
\end{definition}

Next, we present the common and well-established assumptions. The following assumption indicates the smoothness property of the unknown function $f_a$.
\begin{assumption}
\label{assumption:rkhs}
We assume that $f_a \in \mathcal{H}_{k_a}(\mathcal{D})$, where $\mathcal{H}_{k_a}(\mathcal{D})$ is the Reproducing Kernel Hilbert Space (RKHS) of real-valued functions defined on the domain $\mathcal{D}$. This space is induced by the Neural Tangent Kernel $k_a$, which arises from a neural network $a(\mathbf{x}; \mathbf{W})$. In particular, the RKHS $\mathcal{H}_{k_a}$ is equipped with an inner product $\langle \cdot, \cdot \rangle_{\mathcal{H}_{k_a}} $ satisfying the reproducing property: for all $f_a \in \mathcal{H}_{k_a}(\mathcal{D})$, we have 
$f_a(\mathbf{x}) = \langle f_a, k_a(\cdot, \mathbf{x}) \rangle_{\mathcal{H}_{k_a}}$. 
The induced norm is bounded and 
serves as a measure of the smoothness of $f_a$ w.r.t the kernel function $k_a$: $\norm{f_a}_{\mathcal{H}_{k_a}} = \sqrt{\langle f_a, f_a \rangle_{\mathcal{H}_{k_a}}} \leq B_a$. 
\end{assumption}


To ensure that the noise arising from querying unknown function $f_a$ remains bounded and manageable, we impose the following assumption:
\begin{assumption}
\label{assumption:subgaussian_noise}
    We assume the noises $\{\zeta_{t}\}_{t=1}^T$, where $\zeta_t = o_t - f_a(\mathbf{x}_t)$ captures the noise induced by querying the black-box, expensive function $f_a(\cdot)$, are conditionally sub-Gaussian with parameter $R_{a} > 0$:
 \begin{equation*}
         \forall t \ge 1 , \; \forall \lambda_a \in \mathbb{R}, \;  \mathbb{E}[e^{\lambda_a\zeta_t} \rvert \mathcal{F}_{a,t-1}] \le \exp\left(\frac{\lambda_a^2 R_a^2}{2}\right),
 \end{equation*}
 where $\mathcal{F}_{a, t-1}$ is the $\sigma$-algebra generated by the random variables $\{\mathbf{x}_i, \zeta_i\}^{t-1}_{i=1} \cup \{\mathbf{x}_t\}$.
\end{assumption}

\subsection{Maximum Information Gain}
Assume after $t$ steps, the model $a(\mathbf{x}; \mathbf{W})$ receives an input sequence $\mathcal{X}_t = (\mathbf{x}_1, \mathbf{x}_2, \dots, \mathbf{x}_t)$ and observes noisy rewards $\mathbf{o}_t = (o_1, o_2, \dots, o_t)$, where $o_i = f_a(\mathbf{x}_i) + \zeta_i$. The \emph{information gain} $I(\mathbf{o}_t; f_a)$ at step $t$ quantifies the reduction in uncertainty about $f_a$ after observing $\mathbf{o}_t$ and is defined as the mutual information between $\mathbf{o}_t$ and $f_a$:
\[
I(\mathbf{o}_t; f_a):=  H(\mathbf{o}_t) - H(\mathbf{o}_t \rvert f_a), 
\]
where $H$ denotes the entropy function. Following \citet{srinivas2009gaussian}, the maximum information gain for the objective $f_a$ can be calculated as: 
\[
\gamma_{a,t} = \max_{\mathcal{X}_t \subset \mathcal{D}, \lvert \mathcal{X}_t \rvert = t} \frac{1}{2} \log \det \left(\mathbf{I} + \lambda_a^{-1} \mathbf{H}_t  \right),
\]

where $\lambda_a > 0$ is a noise variance and $\mathbf{H}_t$ is the kernel matrix. In our case, $\mathbf{H}_t$ can be referred to as the NTK matrix associated with the NTK kernel defined in Section \ref{section:NTK}.

To manage the approximation error, several technical lemmas impose the following condition on the width of the neural network.
\begin{condition}
    \label{condition:network_width}
    Throughout the section, the width $m$ of each hidden layer is assumed to satisfy:
    \begin{align}
        m \ge d^9 \exp (\Omega(\nu LC^L\log T)), 
    \end{align}
for some absolute constant $C$. Besides, the step size $\alpha_t \le \frac{\nu}{t+1}$, where $\nu$ is a parameter and independent of dimension $d$ and width $m$.
\end{condition}


Before going to our main algorithm, we provide the confidence bound, which is a key component in many BO algorithms, to guide algorithm design and ensure theoretical guarantee. The lemma demonstrates that by following the network width condition stated in Condition \ref{condition:network_width},  the prediction of the trained neural network $a(\cdot;\mathbf{W}_{t-1})$ is concentrated at the actual value of the function $f_a(\cdot)$. 
\begin{restatable}{lemma}{ConfidenceBound}

\label{lemma:confidence_bound}
Let Assumptions \ref{assumption:rkhs} and \ref{assumption:subgaussian_noise} hold, and suppose a neural network $a(\mathbf{x}; \mathbf{W})$ satisfying Condition \ref{condition:network_width} is used to model an arbitrary function $f_a$. Set the step size at training step $t$ as $\alpha_t \le \frac{\nu}{(T+1)^2}$. Then, for any $\delta \in (0,1)$, with probability at least $1 - \delta - \exp (-\Omega(C^{-L} m^{1/36}))$, the following holds for all $\mathbf{x} \in \mathcal{D}$ and $1 \le t \le T$:
\begin{align*}
     & \lvert f_a(\mathbf{x}) - a(\mathbf{x}; \mathbf{W}_{t-1}) \rvert 
    \le \beta_{a,t} \sigma_{a, t-1}(\mathbf{x}) + \frac{\mathcal{E}(m)}{T+1},
    \\
    & \beta_{a,t} = \left(B_a + R_a \sqrt{\gamma_{a,t} + 2 + 2 \log(1/\delta)}\right),
    \\
    & \mathcal{E}(m) = \mathcal{O}(C^{2L} L^{3/2} m^{11/36}).
\end{align*}
\end{restatable}
Here, the coefficient $\beta_{a,t}$ controls the uncertainty of $a(\mathbf{x}; \mathbf{W}_{t-1})$ about $f_a(\mathbf{x})$ at $\mathbf{x}$, while $ \mathcal{E}(m)$ indicates the approximation error when using the neural network's output $a(\mathbf{x}; \mathbf{W})$ to learn the underlying function $f_a$.

To facilitate the following algorithm design and discussion, we introduce the lower confidence
and upper confidence bound functions w.r.t the \textit{arbitrary} function $f_a$: 
\begin{align*}
\text{LCB}_{a,t}(\mathbf{x}, \mathbf{W}_{t}) &= a(\mathbf{x}, \mathbf{W}_{t}) - \beta_{a,t} \sigma_{a,t} (\mathbf{x}) - \frac{\mathcal{E}(m)}{T+1},
\\
\text{UCB}_{a,t}(\mathbf{x}, \mathbf{W}_{t}) &= a(\mathbf{x}, \mathbf{W}_{t}) + \beta_{a,t} \sigma_{a,t} (\mathbf{x}) + \frac{\mathcal{E}(m)}{T+1},  
\end{align*}
where $\sigma_{a,t}(\mathbf{x})$ is calculated using the formula given in Equation \eqref{eqn:neural_cbo_variance_formula}. Then, with high probability,  $f_a$ is bounded by $\text{LCB}_{a,t}(\mathbf{x}, \mathbf{W}_{t})$ and $\text{UCB}_{a,t}(\mathbf{x}, \mathbf{W}_{t})$ as in the following corollary:
\begin{corollary}
\label{corrolary:f_in_lcb_ucb}
Let Assumption \ref{assumption:rkhs}, Assumption \ref{assumption:subgaussian_noise} and Condition \ref{condition:network_width} hold. Then with probability at least $1 - \delta - \exp (-\Omega(C^{-L} m^{1/36}))$, the following holds for all $\mathbf{x} \in \mathcal{D}$
and $1 \le t \le T$:
\begin{align*}
    f_a (\mathbf{x}) \in [\textnormal{LCB}_{a,t}(\mathbf{x}, \mathbf{W}_{t}), \textnormal{UCB}_{a,t}(\mathbf{x}, \mathbf{W}_{t})].
\end{align*}
\end{corollary}
\subsection{Neural-CBO Algorithm}
In the remaining parts of this paper, we refer to $v(\mathbf{x}; \boldsymbol{\theta})$ and $\{u_{c_i}(\mathbf{x}; \boldsymbol{\omega}_{c_i})\}_{i=1}^K$ as the neural network models for the unknown objective function $f$ and constraints $\{c_i\}_{i=1}^K$, respectively. 

Our algorithm starts by initializing the neural networks $v(\mathbf{x}; \boldsymbol{\theta})$ and $\{u_{c_i}(\mathbf{x}; \boldsymbol{\omega}_{c_i})\}_{i=1}^K$ using the initialization scheme described in Section \ref{section:arbitrary_nn}. We use the EI acquisition function to identify the next samples within the feasible region, determined by applying LCB-based conditions to all constraints. LCB conditions guarantee that the suggested regions include the true feasible regions of the constraints, allowing for both feasibility and exploration of the constraint boundaries. Meanwhile, EI effectively balances exploration and exploitation in the objective, which is especially important when the feasible region is significantly smaller than the overall search space. At each optimization iteration $t$ (Line~\ref{alg:line_lcb_ei} of Algorithm~\ref{alg:neural_cbo}), the next evaluation point $\mathbf{x}_t$ is determined by maximizing the acquisition function $\textsc{EI}_{f,t}(\mathbf{x})$ subject to the lower confidence bound constraints for all unknown constraints $\{c_i(\mathbf{x})\}_{i=1}^K$:
\[
\text{LCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t}) \le 0, \forall i \in [K].
\]
To handle noisy observations, we utilized the standard choice of the incumbent, which is
the best value of the mean function so far:
$\mu^+_t = \max_{\mathbf{x}_k \in \mathcal{D}_{t-1}} v(\mathbf{x}_k; \boldsymbol{\theta}_{t-1})$, where  
the evaluations of both objective function and constraints on $\mathbf{x}_k$ yield noisy observations, such as the objective value $y_k = f(\mathbf{x}_k) + \epsilon_k$ and constraint values $\{z_{c_i,k}\}_{i=1}^K$, with each constraint observation given by $z_{c_i,k} = c_i(\mathbf{x}_k) + \eta_{c_i,k}$ and $\mathcal{D}_{t-1} = \{\mathbf{x}_k, y_k, z_{c_1,k}, \dots, z_{{c_K}, k}\}_{k=1}^{t-1}$. 




The standard approach for noisy EI formulation considers the difference between the predicted function value \(v(\mathbf{x}; \boldsymbol{\theta}_{t-1})\) and the current best value of the mean function so far \(\mu^+_t\). However, due to the approximation error of the neural network model, using this standard noisy EI to select next queries may lead to suboptimal decisions. To mitigate this issue, we add \(\mathcal{E}(m)\) as a correction term, leading to the modified EI formulation:
\begin{align*}
    \text{EI}_{f,t}(\mathbf{x}) &= \mathbb{E}[\max \{0, v(\mathbf{x}; \boldsymbol{\theta}_{t-1}) - \mu^+_t + \mathcal{E}(m)\}], 
\end{align*}
and obtain the closed-form expression using a technique similar to that of \citet{tran2022regret}: 
\begin{align*}
    \text{EI}_{f,t}(\mathbf{x}) 
    &= \rho (v(\mathbf{x}; \boldsymbol{\theta}_{t-1})- \mu^+_t + \mathcal{E}(m), \sigma_{f,t}(\mathbf{x})), 
\end{align*}
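where $
  \rho(u,v) =
    \begin{cases}
      u \boldsymbol{\Phi}(\frac{u}{v}) + v \phi(\frac{u}{v}), & \text{if } v>0,
      \\
      \max \{u, 0\}, & \text{if } v=0,
    \end{cases}       
$
and $\boldsymbol{\Phi}$ and $\phi$ denote the cumulative distribution function and the probability density function of the standard normal distribution, respectively.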

Then, we update the dataset $\mathcal{D}_t = \mathcal{D}_{t-1} \cup \{\mathbf{x}_t, y_t, z_{c_1,t}, \dots, z_{{c_K}, t}\}$. The parameters $\boldsymbol{\theta}$ (for the objective function) and $\{ \boldsymbol{\omega}_{c_i} \}_{i=1}^K$ (for the constraints) are then updated separately by minimizing the $L_2$ loss on the new observation using stochastic gradient descent (SGD) described in Equation \eqref{eqn:train_NN}.

\begin{algorithm}[!ht]
\caption{Neural-CBO}
\label{alg:neural_cbo}
\textbf{Input}: The input space $\mathcal D$, the optimization budget $T$, the number of constraints $K$
\begin{algorithmic}[1]
\State Initialize neural network models parameters $\boldsymbol{\theta}_0, \{ \boldsymbol{\omega}_{c_i,0} \}_{i=1}^K$.
\State Initialize $\mathbf{U}_{f,0} =   \mathbf{I}, \mathbf{U}_{c_i,0} =   \mathbf{I}, \forall i \in [1 \dots K]$.
\For{$t = 1$ to $T$}

\State \parbox[t]{\dimexpr\linewidth-\algorithmicindent}{% 
 Choose $\mathbf{x}_t = \argmax_{\mathbf{x} \in \mathcal{D}} \textsc{EI}_{f,t}(\mathbf{x}) $ subject to $\text{LCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t}) \le 0, \forall i \in [K]$ \label{alg:line_lcb_ei}
}
\State \parbox[t]{\dimexpr\linewidth-\algorithmicindent}{% 
 Observe the noisy evaluations of objective function $y_t = f(\mathbf{x}_t) + \epsilon_t$ and constraints $\{ z_{c_i,t} = c_i(\mathbf{x}_t) + \eta_{c_i,t} \}_{i=1}^K$.
}
\State \parbox[t]{\dimexpr\linewidth-\algorithmicindent}{% 
 Update observations set $\mathcal{D}_t = \mathcal{D}_{t-1} \cup \{\mathbf{x}_t, y_t, z_{c_1,t}, \dots, z_{{c_K}, t}\}$
}
\State \parbox[t]{\dimexpr\linewidth-\algorithmicindent}{% 
 Update the neural network parameters $\boldsymbol{\theta}$ and $\{ \boldsymbol{\omega}_{c_i} \}_{i=1}^K$ using Equation \eqref{eqn:train_NN}.
}
\State \parbox[t]{\dimexpr\linewidth-\algorithmicindent}{%
 Update $\mathbf{U}_{f,t}$ and $\mathbf{U}_{c_i,t}, \forall i \in [1 \cdots K]$ separately using the rank-one update rule stated after Equation \eqref{eqn:neural_cbo_variance_formula}.
}
\EndFor


\end{algorithmic}
\end{algorithm}

\begin{figure*}[t]
    \centering
    \begin{subfigure}[t]{0.49\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/1d_toy_lcb_ei.pdf}
        \caption{LCB for constraints, EI for objective.}
        \label{fig:neural_cbo_toy_sample_ei}
    \end{subfigure}%
    \hfill
    \begin{subfigure}[t]{0.49\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/1d_toy_lcb_lcb.pdf}
        \caption{LCB for both constraints and objective.}
        \label{fig:neural_cbo_toy_sample_lcb}
    \end{subfigure}
    \caption{Minimization results for the 1D objective $f(x) = \sin(x) + \sin\left(\frac{10x}{3}\right)$ under the constraint $c(x) = (x - 7)^2 - 1 \leq 0$, using different acquisition strategies.}
    \label{fig:neural_cbo_toy_sample_compare}
\end{figure*}

Figure \ref{fig:neural_cbo_toy_sample_compare} demonstrates the minimization of a 1D objective function $f(x) = \sin(x) + \sin(\frac{10x}{3})$ under the constraint $c(x) = (x-7)^2 -1 \le 0$ at the 200th iteration. This example highlights the \textbf{three key components} of our approach: the neural network surrogate model, LCB-based constraint handling, and the use of EI as the acquisition function for the objective. The top panels of Figures \ref{fig:neural_cbo_toy_sample_ei} and \ref{fig:neural_cbo_toy_sample_lcb} display the predicted mean and variance of the objective as modeled by a deep neural network, illustrating the behavior of the variance formula in Equation \eqref{eqn:neural_cbo_variance_formula} and the operation of Algorithm \ref{alg:neural_cbo}. These plots show that the variance is lower in feasible regions and higher in infeasible regions, supporting the effectiveness of our uncertainty estimation.

To clarify our use of LCB for constraints, we show the confidence intervals for the constraint function in the lower panels of the figures. These plots demonstrate that the true constraint $c(\mathbf{x})$ remains within the predicted confidence bounds, consistent with the theoretical guarantee in Corollary \ref{corrolary:f_in_lcb_ucb}. Conditioning on the lower confidence bound allows our algorithm to reliably assess constraint satisfaction and identify feasible solutions. This accurate LCB estimation effectively guides the Expected Improvement (EI) acquisition for the objective, enabling the search to approach the true feasible minimum.

To justify our use of EI for the objective function, we compare Figure \ref{fig:neural_cbo_toy_sample_ei} (EI for the objective, LCB for constraints) with Figure \ref{fig:neural_cbo_toy_sample_lcb} (LCB for both objective and constraints). These results highlight the advantage of our acquisition strategy: while LCB for constraints reliably guides the search toward feasible regions, applying LCB to the objective leads to overly exploratory behavior. In contrast, using EI for the objective alongside LCB for constraints (LCB-EI) achieves a more effective balance between exploration and exploitation, which is particularly beneficial when the feasible region is much smaller than the overall search space.

\section{Theoretical Analysis}
\label{section:theoretical_analysis}
In this section, we provide a theoretical analysis of our algorithm to offer insights on its convergence and sampling efficiency. Since the constraints are also black-box, we analyze a bound on the constraint violations.
\subsection{Metrics}

To evaluate the performance of black-box optimization methods, much of the prior research on unconstrained Bayesian Optimization has focused on minimizing cumulative regret. The cumulative regret after $T$ iterations is defined as $R_T = \sum_{t=1}^T r_t$, where $r_t = f(\mathbf{x}_t) - f(\mathbf{x^*})$ represents the instantaneous regret, quantifying the difference between the value of the unknown function $f$ at the optimal point, $\mathbf{x}^* = \argmin_{\mathbf{x} \in \mathcal{D}} f(\mathbf{x})$, and the value of the function at point $\mathbf{x}_t$, which is selected by the algorithm at iteration $t$. However, since $f(\mathbf{x^*})$ represents the optimal value under constraints, the algorithm may sometimes sample infeasible points with lower objective values than $f(\mathbf{x^*})$. To account for this, following \citet{xu2023constrained, nguyen2023optimistic}, we adopt the \textit{positive regret} definition as
$r_t^+ = [f(\mathbf{x}_t) - f(\mathbf{x^*})]^+,$
where $[\cdot]^+ \coloneqq \max\{0, \cdot\}$. Additionally, to measure constraint satisfaction, constraint \textit{violation} is defined as $
v_{c_i,t} = [c_i(\mathbf{x}_t)]^+$. 
Then, we introduce the \textit{cumulative positive regret} for the objective function, $R_T^+$, and the \textit{cumulative violation} for each constraint, $V_{c_i, T}, \forall i \in [K]$. These metrics measure the additional cost incurred due to suboptimal decisions and violations of the constraints over time by running the algorithm.  
\begin{definition} [Cumulative Positive Regret and Cumulative Violation]
    \begin{align*}
        R_T^+ & =\sum_{t=1}^T [f(\mathbf{x}_t) - f(\mathbf{x^*})]^+        
        \\
        V_{c_i, T} &= \sum_{t=1}^T [c_i(\mathbf{x}_t)]^+
    \end{align*}
        
\end{definition}
\subsection{Detailed Assumptions for Objective Function and Constraints}

% \subsubsection{Assumption on black-box objective function and constraints}
We apply the general assumptions stated in Assumptions \ref{assumption:rkhs} and \ref{assumption:subgaussian_noise} to both the objective function and the constraints:

\begin{itemize}
    \item \textbf{Objective function}: $f \in \mathcal{H}_{k_f}(\mathcal{D})$, $\norm{f}_{\mathcal{H}_{k_f}} \leq B$,  where $k_f$ is the kernel corresponding to $v(\cdot, \boldsymbol{\theta})$. The noisy observation at step $t$ is $y_t = f(\mathbf{x}_t) +  \epsilon_t$, where $\{\epsilon_i\}_{i=1}^t$ is sub-Gaussian with parameter $R_f$ and variance $\lambda_f$.

    \item \textbf{Constraint}: $c_i \in \mathcal{H}_{k_{c_i}}(\mathcal{D}), \norm{c_i}_{\mathcal{H}_{k_{c_i}}} \leq S_i$, where $k_{c_i}$ is the kernel corresponding to $u_{c_i}(\cdot, \boldsymbol{\omega}_{c_i}), \forall i = 1, \dots, K$. The noisy observation at step $t$ is $z_{c_i,t} = c_i(\mathbf{x}_t) + \eta_{c_i, t}$, where $\{\eta_{c_i, j}\}_{j=1}^t$ is sub-Gaussian with parameter $R_{c_i}$ and variance $\lambda_{c_i}$.
\end{itemize}
We can now state our main theorem:
\begin{restatable}{theorem}{TheoremMain}
\label{theorem:main}
    Under Assumption \ref{assumption:rkhs}, Assumption \ref{assumption:subgaussian_noise} and Condition \ref{condition:network_width}, set the step size used to train the neural networks in Algorithm \ref{alg:neural_cbo} as $\alpha_t \le \frac{\nu}{(T+1)^2}$, then for any $\delta \in (0,1)$,  with probability at least $1 - \delta \exp (\Omega(C^{-L} m^{1/36}))$, the Cumulative Regret $R_T$ and Cumulative Violation $V_{c_i, T}$ after $T$ iterations are bounded as:
    \begin{align*}
        & V_{c_i, T} \le 2 \beta_{c_i,T} \sqrt{\frac{S_i T}{\log(S_i+1)} (2\gamma_{c_i,T}+1)} + 2\mathcal{E}(m),
        \\
        & R_T \le R_T^+ \le 2 \beta_{f,T} \sqrt{\frac{B T}{\log(B+1)} (2\gamma_{f,T}+1)} + 2 \mathcal{E}(m),
    \end{align*}
where $\mathcal{E}(m) = \mathcal{O}(C^{2L} L^{3/2} m^{11/36})$. In particular, by choosing $m = \Omega(d^9 \exp (\nu LC^L\log T))$, the Cumulative Regret and Cumulative Violation enjoy the following results:
\begin{align*}
        V_{c_i, T} = \mathcal{O}(\gamma_{c_i,T} \sqrt{T}), \;\;\;\;\; R_T = \mathcal{O}(\gamma_{f,T} \sqrt{T}).
    \end{align*}
\end{restatable}
\begin{remark}
Unlike previous works \citep{zhou2020neural, zhang2021neural} that require a neural network width of \( m = \Omega(T^6) \) for convergence when modeling the objective function, our paper builds on recent analyses from \citet{xu2024overparametrized}, which show that only a linear condition of \( m = \Omega(T) \) is needed. Furthermore, while \citet{xu2024overparametrized} focus on the input domain \( \mathbb{S}^{d-1} \), we can adapt to inputs \( \mathbf{x} \in \mathbb{R}^d \) with \( 0 < n_l < \|\mathbf{x}\| < n_b \) (where \( n_l \) and \( n_b \) are positive constants) without changing the order of \( T \) in the width condition for \( m \). Similar arguments are noted in \citet{du2018gradient, cao2020generalization}.
\end{remark}
\section{Experimental Results}
In this section, we demonstrate the effectiveness of our proposed Neural-CBO algorithm through its application to synthetic benchmark optimization functions as well as real-world optimization problems. Our implementation is available at \url{https://github.com/phantrdat/neural-cbo}.

\subsection{Baselines}
\label{section:baselines}
For all experiments, we compare our algorithm with the well-known Constrained EI (cEI), the extension of EI to constrained BO from \citet{gardner2014bayesian}. In addition, we compare our algorithm with recent state-of-the-art algorithms in unknown constrained BO, including ADMMBO \citep{ariafar2019admmbo}, UCB-C \citep{nguyen2023optimistic} and ConfigOpt \citep{xu2023constrained}. For our proposed Neural-CBO algorithm, we employ fully connected deep neural networks as the surrogate models for both the objective function and the constraints. Due to space constraints, \textbf{implementation details of our Neural-CBO algorithm (including the choice of neural network hyperparameters and other parameters in Algorithm \ref{alg:neural_cbo}), along with baseline implementations, are provided in Appendix \ref{section:baselines_supp}}. 


% 

  

\subsection{Synthetic Benchmark Functions}
\label{section:synthetic}
 We conducted optimization experiments on four synthetic objective functions: Branin, Ackley, Simionescu and Hartmann. The input dimension of each objective function and the corresponding number of constraints are summarized in Table \ref{table:synthetic_info}. \textbf{We present the expression of each function and its constraints in the Appendix \ref{section:exp_synthetic_supp}}. 
  \begin{table}[t]
 \caption{The input dimension and number of constraints for each synthetic objective function.}
 \vspace{0.15in}
\resizebox{\linewidth}{!}{
\begin{tabular}{|c|c|c|c|c|}
\hline
\textbf{Obj}              & Branin & Simionescu & Ackley & Hartmann  \\ \hline
\textbf{Dim}                   & 2   & 2               & 5 & 6                  \\ \hline
\textbf{Constraints}       & 1      & 1              & 2       & 1          \\ \hline
\end{tabular}
}
\label{table:synthetic_info}
\end{table}
% \looseness=-1
The noise in function evaluations follows a normal distribution with zero mean, and the variance is set to 1\% of the function range. All experiments reported here are averaged over 20 runs, each with random initialization. We report (Log10 of) the Best Positive Regret plus Violation in Figure \ref{fig:synthetic}. We present justification for the choice of this metric as well as results for other metrics in Appendix \ref{exp:further_results}. 

To ensure statistical significance, we performed one-sided $t$-tests to assess whether a baseline outperforms Neural-CBO in terms of the best positive regret plus violation. The null hypothesis is $H_0: \mu_\text{baseline} \leq \mu_{\text{Neural-CBO}}$, and the alternative hypothesis is $H_a: \mu_\text{baseline} > \mu_{\text{Neural-CBO}}$, where $\mu_\text{baseline}$ and $\mu_{\text{Neural-CBO}}$ represent the means of the (Log10
of) Best Positive Regret plus Violation values of the baseline and our proposed Neural-CBO, respectively. Note that lower values indicate better performance. We present the statistical test results for four synthetic benchmark functions and two real-world tasks (described in Sections \ref{section:gas} and \ref{section:speed}) in Table \ref{table:t-test}. Each cell in the table shows the $p$-value from the $t$-test as the first entry. To account for multiple comparisons, the Benjamini-Hochberg correction was applied, and the resulting decision is reported as the second entry. A result is labeled ``T'' if the null hypothesis is rejected, meaning that Neural-CBO is statistically better than the compared baseline. Conversely, a result is labeled ``F'' if we cannot reject the null hypothesis, meaning that the baseline and Neural-CBO are comparable.  The results in Table \ref{table:t-test} indicate that in 18 out of 24 comparisons, Neural-CBO achieves statistically better performance.

\begin{table}[ht]
\caption{One-sided $t$-tests to evaluate whether the baseline outperforms Neural-CBO in terms of the ``best positive regret plus violation'' metric.}
  \vspace{0.15in}
\resizebox{\linewidth}{!}{
\begin{tabular}{|l|c|c|c|c|}
\hline
\multicolumn{1}{|c|}{\textbf{}} & \textbf{ConfigOpt} & \textbf{cEI}  & \textbf{UCB-C} & \textbf{ADMMBO} \\ \hline
\textbf{Branin}             & (3.76e-01, F)      & (2.70e-01, F) & (3.36e-03, T)     & (2.08e-12, T)  \\ \hline
\textbf{Simionescu}             & (0.30e-01, T)      & (0.70e-01, F) & (1.42e-07, T)  & (8.53e-15, T)   \\ \hline
\textbf{Ackley}                 & (0.21e-03, T)      & (0.77e-01, F)      & (3.60e-08, T)   & (0.16e-02, T)        \\ \hline
\textbf{Hartmann}               & (3.35e-02, T)       & (2.79e-06, T) & (1.80e-11, T) & (2.61e-10, T)    \\ 
\hline
\textbf{Gas Transmission}               & (3.51e-10, T)       & (2.23e-07, T) & (5.34e-04, T) & (1.84e-11, T)    \\ 
\hline
\textbf{Speed Reducer}               & (0.30e-01, F)       & (5.83e-08, T) & (0.89e-01, T) & (1.08e-01, F)    \\ 
\hline
\end{tabular}
\label{table:t-test}
}
\end{table}


\begin{figure*}[t]
    \centering
   \includegraphics[width=\linewidth]{figures/Branin-Simionescu-GasTransmission-Ackley-Hartmann-SpeedReducer.pdf}
   % \vspace{0.15in}
    \caption{The plots show (Log10 of) the Best Positive Regret plus Violation up to step $t$, which is $\min_{t \in [T]} \{ [f(\mathbf{x}_t) - f^*]^+ + \sum_{k=1}^K [c_k(\mathbf{x}_t)]^+ \}$, comparing our proposed algorithm and four baselines. The dimension of each objective function is shown in parentheses. The left group is four synthetic functions introduced in Section \ref{section:synthetic}, while the right group is the optimization results of Gas Transmission Compressor Design and Speed Reducer Design, described in Sections \ref{section:gas} and \ref{section:speed}.}
    
    \label{fig:synthetic}
\end{figure*}

We analyze three real-world constrained black-box optimization tasks: gas transmission compressor and speed reducer designs from \citet{kumar2020test}, and a third inspired by \citet{he2018verideep}. Details of each task will follow in the upcoming sections.  
\subsection{Gas Transmission Compressor Design}
\label{section:gas}
     The main objective is to minimize operational costs or energy consumption. This requires identifying the optimal configuration of the compressor by optimizing four design variables. The problem involves $d = 4$ input dimensions and includes $K = 1$ constraint. The detailed mathematical formulation is provided in Appendix \ref{section:real_world_supp}. 
    
 \subsection{Speed Reducer Design}
\label{section:speed}
This task involves designing a speed reducer for a small aircraft engine, focusing on minimizing weight while meeting several constraints, including bending stress on gear teeth, surface stress, transverse deflections of shafts, and shaft stresses. The problem includes 7 decision variables and 11 constraints, resulting in an input dimension of $d=7$ and $K=11$ constraints. The mathematical formulation is provided in the Appendix \ref{section:real_world_supp}. We report numerical results of Section \ref{section:gas} and  \ref{section:speed} in Figure \ref{fig:synthetic} and Table \ref{table:t-test}.
\subsection{Designing Sensitive Samples for Model Tampering Detection}
\label{section:sensitive_sample}
\begin{figure}[ht!]
    \centering
\includegraphics[width=0.48\textwidth]{figures/sensitive_sample_detection_rate_with_errorbar.pdf}
    \caption{ \textbf{Detection Rates} w.r.t.\ the number of samples for the MNIST dataset. As shown in the figure, Neural-CBO can generate sensitive samples that achieve a detection rate of nearly 85\% with just 10 samples.}
    \label{fig:sensitive_sample}
\end{figure}
We examine a scenario where a company offers Machine Learning as a Service (MLaaS) and hosts its model in the cloud. In this context, an attacker with system access could alter the model by changing its weights. To detect such tampering, \citet{he2018verideep} propose generating a set of test inputs, called \emph{Sensitive Samples} $\{v_i\}_{i=1}^{N_S}$, whose outputs from the modified model will differ from those of the original. Assuming a pre-trained model $s_\varphi(\mathbf{x})$ may have been altered after being uploaded, the goal is to find sensitive samples by solving the optimization problem: 
\[
v = \argmax_\mathbf{x} \norm{\frac{\partial s_\varphi(\mathbf{x})}{\partial \varphi}}_F,
\]
where $\norm{\cdot}_F$ denotes the Frobenius norm. A detection is \emph{successful} if at least one of the $N_S$ sensitive samples shows a different top-1 prediction between the tampered and original models. To prevent attackers from evading detection, sensitive samples must resemble normal inputs. Therefore, a human-in-the-loop process is employed, where reviewers rate the realism of each sample on a scale of $(0,1)$; higher scores indicate more realistic samples. These scores serve as constraints in the optimization, where obtaining human feedback can be costly.

We utilized a pre-trained MNIST handwritten digit classification model and compared our method's performance against several baselines based on average detection rates for sensitive samples. Feasible samples were chosen based on their realistic scores. Figure \ref{fig:sensitive_sample} shows the detection rates of (feasible) sensitive samples generated by our method compared to four baselines, demonstrating that our samples achieved higher detection rates. As expected, the detection rate improves with more samples and our method is consistently competitive.


% as our baseline and tampered it by adding noise to each weight 1,000 times, resulting in 1,000 distinct models. The original model had a top-1 accuracy of $93\%$, which dropped to $87.73\% \pm 0.08\%$ after tampering. To reduce computational costs, we downscaled the images from $28 \times 28$ to $7 \times 7$, optimizing in this 49-dimensional space. After identifying the optimal points, we restored them to the original resolution to generate sensitive samples.

\section{Conclusion}
We propose a novel algorithm for black-box optimization with unknown constraints, utilizing deep neural networks as surrogate models for both the objective function and constraints. Our algorithm leverages the bounded nature of constraint values by applying LCB conditions at each iteration to ensure feasibility. We also employ EI as the acquisition function to balance exploration and exploitation, especially in scenarios where feasible regions are significantly smaller than the search space. Our theoretical analysis shows that, under mild conditions regarding neural network width, our algorithm achieves upper bounds on cumulative regret and constraint violations comparable to previous GP-based methods. We validate our approach through experiments on synthetic and real-world benchmark tasks involving structural data, with results demonstrating competitive performance against state-of-the-art methods. 


\bibliography{NeuralCBO}

\newpage

\onecolumn

\title{Black-box Optimization with Unknown Constraints via Overparameterized Deep Neural Networks\\Appendix}
\maketitle


\appendix

\section{Additional Experimental Results}
\subsection{Baselines}
\label{section:baselines_supp}
In this section, we briefly describe all baselines and our Neural-CBO implementations:
\begin{itemize}
    \item  \textbf{Constrained EI} (cEI) \citep{gardner2014bayesian} integrates feasibility into the acquisition function by multiplying the probability of feasibility into EI value at every point in the search space.
    \item \textbf{ConfigOpt} \citep{xu2023constrained}: Optimize LCB-based acquisition function for the objective, which satisfies LCB-based conditions for constraints. For cEI and ConfigOpt, we used the public implementation provided at GitHub repository: \url{https://github.com/PREDICT-EPFL/ConfigOPT}.
    \item \textbf{ADMMBO} \citep{ariafar2019admmbo}: Reformulates the constrained optimization problem into an unconstrained one using the Alternating Direction Method
    of Multipliers (ADMM) framework. As the official implementation of ADMMBO is written in Matlab and available at \url{https://github.com/SetarehAr/ADMMBO}, we use our own Python implementation based on the official implementation.  
    \item \textbf{UCB-C} \citep{nguyen2023optimistic}: 
    Similar to ConfigOpt, but using a UCB-based acquisition function for the objective. We used the implementation obtained directly from the authors.
\end{itemize}
\paragraph{Neural-CBO implementation details:}

As described in Section \ref{section:neural_cbo}, the network's weights are initialized with independent samples drawn from a normal distribution $\mathcal{N} (0, 1)$. We also initialize fixed outer weight $\mathbf{q}$ to be a symmetric Bernoulli random variable with equal probability to be $-1$ or $1$. To train the surrogate neural network models, we use a Gradient Descent optimizer as described in the main paper, with a default learning rate of $\alpha = 1\mathrm{e}{-4}$. However, $\alpha$ can be tuned within the range $(1\mathrm{e}{-4},1\mathrm{e}{-3})$ as needed. The network width depends on the tasks and is set to be $m = T$, where $T$ is the number of optimization iterations. We choose the network depth $L=2$ by default to reduce computational costs. Following Algorithm \ref{alg:neural_cbo}, we update the neural networks modeling the objective function and constraints using $\mathcal{D}_t$ after each optimization iteration with a single training pass.  


In our implementation (as well as in other baselines), we discretized the search space, computed the values of EI (acquisition function for the objective) and LCB (acquisition function for the constraints) for all candidate points, and then selected the next evaluation point 
satisfying Line \ref{alg:line_lcb_ei} in Algorithm \ref{alg:neural_cbo}. In more detail, we randomly sample 10k points and select the one with the highest acquisition function value. This approach was chosen as our paper does not focus on handling high-dimensional cases.  Alternatively, a gradient-based approach could optimize the acquisition function using the Lagrangian method to combine the objective acquisition and constraint acquisition functions into a single unconstrained optimization problem. 

To efficiently compute the inversion of matrix $\mathbf{U}_{a,t}$ in variance formula \eqref{eqn:neural_cbo_variance_formula}, we employ the Sherman-Morrison formula, taking advantage of the low-rank structure in the outer-products that add up to form that matrix. 

\subsection{Synthetic Benchmark Functions}
\label{section:exp_synthetic_supp}
We present the mathematical expressions of four synthetic objective functions and their accompanying constraints used for benchmarking in Section \ref{section:synthetic} of the main paper as follows:
\paragraph{Branin:} We adopt this function from \citet{letham2019constrained}.
\begin{align*}
f(\mathbf{x}) &= \left( \mathbf{x}_2 - \frac{5.1}{4 \pi^2} \mathbf{x}_1^2 + \frac{5}{\pi} \mathbf{x}_1 - 6 \right)^2 + 10 \left( 1 - \frac{1}{8\pi} \right) \cos(\mathbf{x}_1) + 10,
\\
\text{s.t. } c(\mathbf{x}) &= (\mathbf{x}_1 - 2.5)^2 + (\mathbf{x}_2 - 7.5)^2 - 50 \le 0 
\end{align*}
where \( \mathbf{x}_1 \in [-5, 10] \) and \( \mathbf{x}_2 \in [0, 15] \).
\paragraph{Simionescu:} 
\begin{align*}
    f(\mathbf{x}) &= 0.1\mathbf{x}_1 \mathbf{x}_2 
    \\
    \text{ s.t. }   c(\mathbf{x}) &= \mathbf{x}_1^2 + \mathbf{x}_2^2 - \left[ 1 + 0.2 \cos(8 \arctan\left( \frac{\mathbf{x}_2}{\mathbf{x}_1} \right)) \right]^2 \le 0
\end{align*}

\paragraph{Ackley:}  We inherited this function from \citet{zhang2023constrained}.
\begin{align*}
    f(\mathbf{x}) &= -20 \exp\left( -0.2 \sqrt{\frac{1}{d} \sum_{i=1}^{d} \mathbf{x}_i^2} \right)
- \exp\left( \frac{1}{d} \sum_{i=1}^{d} \cos(2 \pi \mathbf{x}_i) \right) + 20 + e
    \\
    \text{s.t. } &  \begin{cases} 
    c_1 (\mathbf{x}) = 1 - (\norm{\mathbf{x}-\mathbf{1}}_2 - 5.5)^2  & \le 0 
    \\
    c_2 (\mathbf{x}) = \norm{\mathbf{x}}_\infty^2 - 9 & \le 0
    \end{cases},
\end{align*} where $\mathbf{x} \in [-5,3]^5$. 
\paragraph{Hartmann:} This is a constrained version of the standard Hartmann test function that uses $\norm{\mathbf{x}}_2 -1 \le 0$  as the constraint. This problem comes from \citet{letham2019constrained}.
\begin{align*}    
f(\mathbf{x}) &= -\sum_{i=1}^{4} \alpha_i \exp \left( -\sum_{j=1}^{6} \mathbf{A}_{ij} (\mathbf{x}_j - \mathbf{P}_{ij})^2 \right)
\\
\text{s.t. } & c(\mathbf{x}) = \norm{\mathbf{x}}_2 -1 \le 0
\end{align*}
where \( \mathbf{x} \in [0,1]^6 \), and the constants are:
\[
\alpha = (1.0, 1.2, 3.0, 3.2), 
\mathbf{A} = \begin{bmatrix}
10 & 3 & 17 & 3.5 & 1.7 & 8 \\
0.05 & 10 & 17 & 0.1 & 8 & 14 \\
3 & 3.5 & 1.7 & 10 & 17 & 8 \\
17 & 8 & 0.05 & 10 & 0.1 & 14
\end{bmatrix}, \mathbf{P} = 10^{-4} \times \begin{bmatrix}
1312 & 1696 & 5569 & 124 & 8283 & 5886 \\
2329 & 4135 & 8307 & 3736 & 1004 & 9991 \\
2348 & 1451 & 3522 & 2883 & 3047 & 6650 \\
4047 & 8828 & 8732 & 5743 & 1091 & 381
\end{bmatrix}.
\]

\subsection{Real-world Applications:}
\label{section:real_world_supp}
\paragraph{Gas Transmission Compressor Design:}  The main objective is to minimize operational costs or energy consumption. This requires identifying the optimal configuration of the compressor by optimizing four design variables. The problem involves $d = 4$ input dimensions and includes $K = 1$ constraint. The mathematical formulation of this problem is:
\begin{align*}
    f(\mathbf{x}) &= 8.61 \times 10^5\mathbf{x}_1^{1/2} \mathbf{x}_2\mathbf{x}_3^{-2/3} \mathbf{x}_4^{-1/2} + 3.69 \times 10^4\mathbf{x}_3 + 7.72 \times 10^8 \mathbf{x}_1^{-1} \mathbf{x}_2^{0.219} - 765.43 \times 10^6\mathbf{x}_1^{-1},
    \\
    \text{s.t. }  c(\mathbf{x}) &= \mathbf{x}_4\mathbf{x}_2^{-2} + \mathbf{x}_2^{-2} - 1 \le 0
\end{align*}

\paragraph{Speed Reducer Design:}
This task involves designing a speed reducer for a small aircraft engine, focusing on minimizing weight while meeting several constraints, including bending stress on gear teeth, surface stress, transverse deflections of shafts, and shaft stresses. The problem includes 7 decision variables and 11 constraints, resulting in an input dimension of $d=7$ and $K=11$ constraints.
\begin{align*}
    f(\mathbf{x}) &= 0.7854\mathbf{x}_2
^2 \mathbf{x}_1(14.9334\mathbf{x}_3 - 43.0934 + 3.3333\mathbf{x}_3^2) +0.7854(\mathbf{x}_5\mathbf{x}_7^2 + \mathbf{x}_4\mathbf{x}_6^2) \\
    & - 1.508\mathbf{x}_1(\mathbf{x}_7^2 + \mathbf{x}_6^2) + 7.477(\mathbf{x}_7^3 + \mathbf{x}_6^3), 
    \\
    \text{s.t. } & \begin{cases} 
    c_1(\mathbf{x}) = -\mathbf{x}_1\mathbf{x}_2^2\mathbf{x}_3 + 27  & \le 0 
    \\
    c_2(\mathbf{x}) = -\mathbf{x}_1\mathbf{x}_2^2\mathbf{x}_3^2 + 397.5 & \le 0 
    \\
    c_3(\mathbf{x}) = -\mathbf{x}_2\mathbf{x}_6^4
    \mathbf{x}_3\mathbf{x}_4^{-3}+ 1.93 &\le 0 
    \\
    c_4(\mathbf{x}) = -\mathbf{x}_2\mathbf{x}_7^4
    \mathbf{x}_3\mathbf{x}_5^{-3}+ 1.93 & \le 0 
    \\
    c_5(\mathbf{x}) = 10\mathbf{x}_6^{-3} \sqrt{16.91 \times 10^6 + (745\mathbf{x}_4\mathbf{x}_2^{-1}
    \mathbf{x}_3^{-1}
    )^2} -1100 & \le 0
    \\
    c_6(\mathbf{x}) = 10\mathbf{x}_7^{-3} \sqrt{157.5 \times 10^6 + (745\mathbf{x}_5\mathbf{x}_2^{-1}
    \mathbf{x}_3^{-1}
    )^2} - 850 & \le 0
    \\
    c_7(\mathbf{x}) = \mathbf{x}_2\mathbf{x}_3 -40 & \le 0 
    \\
    c_8(\mathbf{x}) = -\mathbf{x}_1 \mathbf{x}_2^{-1} + 5 & \le 0
    \\
    c_9(\mathbf{x}) = \mathbf{x}_1 \mathbf{x}_2^{-1} - 12  & \le 0
    \\
    c_{10}(\mathbf{x}) = 1.5\mathbf{x}_6 - \mathbf{x}_4  + 1.9  & \le 0
    \\
    c_{11}(\mathbf{x}) = 1.1\mathbf{x}_7 - \mathbf{x}_5  + 1.9  & \le 0
    \\
    \end{cases},
\end{align*}

\paragraph{Designing Sensitive Samples for Detection of Model Tampering} 

As described in Section \ref{section:sensitive_sample}, we used a pre-trained MNIST digit classification model and compared our method's detection performance with several baselines. The model was tampered with by adding noise to its weights 1,000 times, producing 1,000 distinct versions. While the original model had a top-1 accuracy of 93\%, this dropped to $87.73\% \pm 0.08\%$ after tampering. To reduce computational costs, we downscaled the images from $28 \times 28$ to $7 \times 7$, optimized in this 49-dimensional space, and then restored them to the original resolution to generate sensitive samples.

\subsection{Further Results}
\label{exp:further_results}
\paragraph{Justification for the metric used in the main paper:}

In the main paper, we report (Log10 of) the Best Positive Regret plus Violation. To justify our choice of metrics, we consider the definitions of positive regret and violation in the context of a minimization problem and emphasize the following:

\begin{itemize}
    \item When the selected point lies outside the feasible region and its objective function value exceeds the true optimum, the combined sum of positive regret and violation is large, effectively penalizing both infeasibility and poor performance.
    \item If the selected point is infeasible but has an objective function value lower than the true optimum, the violation term dominates the sum. This ensures that the penalty remains significant, especially when the point is far from the feasible region.
    \item When the selected point is within the feasible region, even if its objective function value is suboptimal, it still provides useful guidance by steering the algorithm toward feasible solutions, promoting further exploration in the correct direction.
\end{itemize}

To provide a more comprehensive evaluation of each method, we report results using three metrics: \textbf{Cumulative Positive Regret}, \textbf{Cumulative Violation}, and \textbf{Best Positive Regret for Feasible Points}. These metrics are illustrated in Figure~\ref{fig:synthetic_cumu_regret}, Figure~\ref{fig:synthetic_cumu_violation}, and Figure~\ref{fig:synthetic_best_feasible_minimum}, respectively. As shown, Neural-CBO consistently achieves strong performance across all metrics, demonstrating both rapid convergence to the optimum and effective constraint satisfaction. In particular, for the best positive regret among feasible points, Neural-CBO reliably identifies feasible minima. Note that the starting points in Figure~\ref{fig:synthetic_best_feasible_minimum} differ between methods, as each algorithm may encounter the feasible region at different iterations.

\begin{figure*}[h]
    \centering
   \includegraphics[width=\linewidth]{figures/Branin-Simionescu-GasTransmission-Ackley-Hartmann-SpeedReducer_regret.pdf}
   % \vspace{0.15in}
    \caption{The plots show Cumulative Positive Regret up to step $t$, which is $\sum_{t=1}^T [f(\mathbf{x}_t) - f^*]^+$, comparing our proposed algorithm and four baselines. The dimension of each objective function is shown in the parenthesis. The left group is four synthetic functions introduced in Section \ref{section:synthetic}, while the right group is the optimization results of Gas Transmission Compressor Design and Speed Reducer Design, described in Section  \ref{section:gas} and \ref{section:speed}.}
    \label{fig:synthetic_cumu_regret}
\end{figure*}


\begin{figure*}[h]
    \centering
   \includegraphics[width=\linewidth]{figures/Branin-Simionescu-GasTransmission-Ackley-Hartmann-SpeedReducer_violation.pdf}
   % \vspace{0.15in}
    \caption{The plots show Cumulative Violation up to step $t$, which is $\sum_{t=1}^T \sum_{k=1}^K [c_k(\mathbf{x}_t)]^+$, comparing our proposed algorithm and four baselines. The dimension of each objective function is shown in parentheses. The left group is four synthetic functions introduced in Section \ref{section:synthetic}, while the right group is the optimization results of Gas Transmission Compressor Design and Speed Reducer Design, described in Sections \ref{section:gas} and \ref{section:speed}.}
    \label{fig:synthetic_cumu_violation}
\end{figure*}

\begin{figure*}[h]
    \centering
   \includegraphics[width=\linewidth]{figures/Branin-Simionescu-GasTransmission-Ackley-Hartmann-SpeedReducer_best_feasible_minimum.pdf}
   % \vspace{0.15in}
    \caption{The plots show the Best Feasible Minimum up to step $t$, which is $\min_{1 \leq j \leq t} \{ f(\mathbf{x}_j) : c_i(\mathbf{x}_j) \le 0,\ \forall i = 1, \dots, K \}$ (where $K$ is the number of constraints), comparing our proposed algorithm and four baselines. The dimension of each objective function is shown in parentheses. The left group shows the four synthetic functions introduced in Section~\ref{section:synthetic}, while the right group shows the optimization results for Gas Transmission Compressor Design and Speed Reducer Design, described in Sections~\ref{section:gas} and~\ref{section:speed}.}
    \label{fig:synthetic_best_feasible_minimum}
\end{figure*}


\paragraph{Wall-clock Time Comparison:}

We provide the wall-clock running time of our algorithm and the considered baselines in the following table. We revisit six optimization tasks from our paper but increase the number of iterations for Hartmann ($D = 6$) and Speed Reducer ($D = 7$) to $n_{\text{iter}} = 1000$, to demonstrate the performance of neural networks when run for a large number of iterations.  While GP-based methods appear more efficient in terms of runtime with a small number of iterations, this advantage diminishes as the number of iterations increases---often a necessary condition in high-dimensional or challenging problems to approach the global optimum. Due to their cubic time complexity with respect to the number of data points, GPs exhibit significant scalability limitations. In contrast, neural networks enjoy nearly linear scaling in runtime, highlighting their suitability and efficiency for larger numbers of iterations.
\begin{table}[ht!]
\centering
\resizebox{\textwidth}{!}{%
\begin{tabular}{|l|c|c|c|c|c|c|c|}
\hline
\textbf{Objective} & \textbf{DIM} & \textbf{N\_ITERS} & \textbf{Neural-CBO} & \textbf{ConfigOpt} & \textbf{cEI} & \textbf{UCBC} & \textbf{ADMMBO} \\
\hline
Branin           & 2 & 100  & 40.63 $\pm$ 1.77   & 6.76 $\pm$ 4.54   & 6.31 $\pm$ 1.14   & 34.74 $\pm$ 5.15   & 342.24 $\pm$ 11.10 \\
\hline
Simionescu       & 2 & 100  & 54.41 $\pm$ 2.13   & 18.19 $\pm$ 0.52  & 20.77 $\pm$ 7.31  & 19.94 $\pm$ 1.25   & 253.81 $\pm$ 5.45  \\
\hline
Gas Transmission & 4 & 200  & 125.83 $\pm$ 1.87  & 6.66 $\pm$ 0.46   & 8.56 $\pm$ 0.43   & 30.01 $\pm$ 1.45   & 468.67 $\pm$ 12.40 \\
\hline
Ackley           & 5 & 200  & 85.26 $\pm$ 2.80   & 31.78 $\pm$ 4.71  & 31.44 $\pm$ 1.40  & 117.09 $\pm$ 40.86 & 309.13 $\pm$ 411.46 \\
\hline
Hartmann         & 6 & 1000 & 561.06 $\pm$ 24.00 & 927.45 $\pm$ 45.09& 472.24 $\pm$ 25.50& 2321.70 $\pm$ 781.94& 7196.00 $\pm$ 141.70 \\
\hline
Speed Reducer    & 7 & 1000 & 721.44 $\pm$ 88.25 & 1003.93 $\pm$ 67.57 & 615.10 $\pm$ 30.52 & 2545.92 $\pm$ 512.82 & 8162.02 $\pm$ 133.70 \\
\hline
\end{tabular}%
}
\caption{Wall-clock runtime (in seconds) of our algorithm and baselines across optimization tasks.}
\end{table}


\newpage

\section{Detailed Theoretical Analysis}

In this section, we provide the proofs of Lemma~\ref{lemma:confidence_bound} and Theorem~\ref{theorem:main}. Before presenting the proofs, we briefly remind the reader of existing terms and introduce new notations for convenience.
Recall that $\mathbf{g}_{a}(\mathbf{x}; \mathbf{W}) = \nabla_{\mathbf{W}}a(\mathbf{x}; \mathbf{W})$. Therefore, $\mathbf{g}_{a}(\mathbf{x}; \mathbf{W}_0)$ and $\mathbf{g}_{a}(\mathbf{x}; \mathbf{W}_t)$ will be the gradients of the neural network $a(\mathbf{x}; \mathbf{W})$ (used to model an \textit{arbitrary} function $f_a$, defined in Equation~\eqref{eqn:fcn}) at initialization and at iteration $t$, respectively.  Further, let us define terms as follows:

\begin{equation}
\label{def:linear_kernelized_terms}
\begin{split}
\mathbf{G}_{a,t-1} & = [\mathbf{g}_{a}(\mathbf{x}_1; \mathbf{W}_0),\dots, \mathbf{g}_{a}(\mathbf{x}_{t-1}; \mathbf{W}_0)] 
\\
\Bar{\mathbf{G}}_{a,t-1} & = [\mathbf{g}_{a}(\mathbf{x}_1; \mathbf{W}_{t-1}),\dots, \mathbf{g}_{a}(\mathbf{x}_{t-1}; \mathbf{W}_{t-1})] 
\\
\mathbf{U}_{a,t-1} &=  \mathbf{I} + \mathbf{G}_{a,t-1} \mathbf{G}_{a,t-1}^\top 
\\
\mathbf{F}_{a,t-1} &= [f_a(\mathbf{x}_1), \dots, f_a(\mathbf{x}_{t-1})]
\end{split}
\end{equation}
Further, it can be verified that $\mathbf{H}_0 = \mathbf{G}_{a,t-1} ^\top\mathbf{G}_{a,t-1}$, where $\mathbf{H}_0$ is the NTK matrix at initialization defined in Section~\ref{section:NTK}.  Now we are ready to prove Lemma~\ref{lemma:confidence_bound}.
\subsection{Proof of Main Results Provided in Section \ref{section:theoretical_analysis}}
\subsubsection{Proof of Lemma \ref{lemma:confidence_bound}}
\ConfidenceBound*

\begin{proof}
To prove Lemma \ref{lemma:confidence_bound}, we analyze the left-hand side as follows:
\begin{align*}
    & \lvert f_a(\mathbf{x}) - a(\mathbf{x}; \mathbf{W}_{t-1}) \rvert 
    \\
    & \le \underbrace{\lvert f_a(\mathbf{x}) - \langle \mathbf{g}_a(\mathbf{x};\mathbf{W}_{0}),\mathbf{U}_{a,t-1}^{-1}\mathbf{G}_{a,t-1}\mathbf{y}_{a,t-1} \rangle  \rvert}_{T_1} + \underbrace{\lvert a(\mathbf{x}; \mathbf{W}_{t-1}) - \langle \mathbf{g}_a(\mathbf{x};\mathbf{W}_{0}),\mathbf{U}_{a,t-1}^{-1}\mathbf{G}_{a,t-1}\mathbf{y}_{a,t-1} \rangle  \rvert}_{T_2}
\end{align*}
Here, $T_1$ represents the difference between the actual function value and the theoretical optimal solution for a linearized network. Meanwhile, $T_2$ refers to the gap between the neural network's output $a(\mathbf{x}; \mathbf{W}_{t-1})$ at iteration $t-1$ and the theoretical optimal solution for the same linearized network.
\paragraph{Bound term $T_1$:}

First, following Assumption~\ref{assumption:rkhs} in the main paper, we assume that $f_a$ is in the RKHS $\mathcal{H}_{k_a}$ with NTK kernel $k_a$, and $\mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)$  can be considered as a finite approximation of $\varphi(\cdot)$, the feature map of the NTK from $\mathbb{R}^d \rightarrow \mathcal{H}_{k_a}$. From Lemma~\ref{lemma:RKHS_expression}, there exists $f_a^* \in \mathbb{R}^p$ such that $f_a(\mathbf{x}) = \langle \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0), f_a^* \rangle = \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top f_a^*$. 
Then the term $T_1$ can be bounded as:
\begin{align*}
\label{ieqn:confidence_interval}
         T_1 &= \left \lvert f_a(\mathbf{x}) - \langle \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0), \mathbf{U}^{-1}_{a,t-1} \mathbf{G}_{a,t-1} \mathbf{y}_{a,t-1} \rangle   \right \rvert  
         \\
         & = \left\lvert f_a(\mathbf{x}) - \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top  \mathbf{U}^{-1}_{a,t-1} \mathbf{G}_{a,t-1} \mathbf{y}_{a,t-1} \right\rvert 
         \\
        & \leq \left\lvert f_a(\mathbf{x}) - \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top  \mathbf{U}^{-1}_{a,t-1}
        \mathbf{G}_{a,t-1}\mathbf{f}_{a, t-1} \right\rvert + 
        \left\lvert \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1}
        \mathbf{G}_{a,t-1} \boldsymbol{\epsilon}_{a, t-1} \right\rvert
        \\
        & = \left\lvert \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top f_a^* - \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top  \mathbf{U}^{-1}_{a,t-1} 
        \mathbf{G}_{a,t-1}
        \mathbf{G}_{a,t-1}^\top f_a^* \right\rvert + 
        \left\lvert  \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1} \mathbf{G}_{a,t-1} \boldsymbol{\epsilon}_{a, t-1}  \right\rvert
        \\
        & = \left\lvert \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \left( \mathbf{I} -  \mathbf{U}^{-1}_{a,t-1}  \mathbf{G}_{a,t-1} \mathbf{G}_{a,t-1}^\top  \right) f_a^*  \right\rvert + 
        \left\lvert  \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1} \mathbf{G}_{a,t-1} \boldsymbol{\epsilon}_{a, t-1}  \right\rvert 
        \\
        & = \left\lvert \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \left( \mathbf{I} -  \mathbf{U}^{-1}_{a,t-1} \left( \mathbf{U}_{a,t-1} - \mathbf{I} \right)  \right) f_a^*  \right\rvert +
        \left \lvert  \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1} \mathbf{G}_{a,t-1} \boldsymbol{\epsilon}_{a, t-1}  \right \rvert 
        \\
        & = \left\lvert  \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1} f_a^*  \right\rvert  + \left\lvert \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1} \mathbf{G}_{a,t-1} \boldsymbol{\epsilon}_{a, t-1}   \right\rvert 
        \\
        & \leq \norm{f_a^*}_{k_a}  \norm{   \mathbf{U}^{-1}_{a,t-1} \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)}_{k_a} + \left\lvert \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1} \mathbf{G}_{a,t-1} \boldsymbol{\epsilon}_{a, t-1}   \right\rvert \\
        & \leq  \norm{f_a^*}_{k_a}  \sqrt { \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1} \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)}  + \left\lvert \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top \mathbf{U}^{-1}_{a,t-1} \mathbf{G}_{a,t-1} \boldsymbol{\epsilon}_{a, t-1}   \right\rvert 
        \\
        & \leq \sqrt{2}B_a \sigma_{a,t}(\mathbf{x}) + \sigma_{a,t}(\mathbf{x}) R_a \sqrt{\log \det (\mathbf{I} + \mathbf{H}_0) + 2 \log(1/\delta)} 
        \\
        & \le \sqrt{2}B_a \sigma_{a,t}(\mathbf{x}) +  R_a \sqrt{\gamma_{a,t} + 2 + 2 \log(1/\delta)} \sigma_{a,t}(\mathbf{x}) 
        \\
        & = \left(\sqrt{2}B_a + R_a \sqrt{\gamma_{a,t} + 2 + 2 \log(1/\delta)}\right)\sigma_{a,t}(\mathbf{x})
\end{align*}
where the first inequality uses the triangle inequality and the fact that $\mathbf{y}_{a,t-1}= \mathbf{f}_{a, t-1} + \boldsymbol{\epsilon}_{a, t-1}$. The second inequality follows from the reproducing property of functions in the RKHS, and the fourth equality uses the definition of $\mathbf{U}_{a,t-1}$ in Equation~\eqref{def:linear_kernelized_terms}. The last two inequalities use Lemma~\ref{lemma:noise_affeted_bound} and Lemma~\ref{lemma:log_det_Kt_bound}, respectively.

\paragraph{Bound term $T_2$:}
To bound term $T_2$, we again divide $T_2$ into two terms:
\begin{align*}
    T_2 &=\lvert a(\mathbf{x}; \mathbf{W}_{t-1}) - \langle \mathbf{g}_a(\mathbf{x};\mathbf{W}_{0}),\mathbf{U}_{a,t-1}^{-1}\mathbf{G}_{a,t-1}\mathbf{y}_{a,t-1} \rangle  \rvert 
    \\
    & \le \underbrace{\lvert a(\mathbf{x}; \mathbf{W}_{t-1}) - \langle \mathbf{g}_a(\mathbf{x};\mathbf{W}_0), \mathbf{W}_{t-1} - \mathbf{W}_0 \rangle \rvert}_{T_2^\prime} + \underbrace{\lvert \langle \mathbf{g}_a(\mathbf{x};\mathbf{W}_0), \mathbf{W}_{t-1} - \mathbf{W}_0 \rangle  - \langle \mathbf{g}_a(\mathbf{x};\mathbf{W}_{0}),\mathbf{U}_{a,t-1}^{-1}\mathbf{G}_{a,t-1}\mathbf{y}_{a,t-1} \rangle   \rvert}_{T_2^{\prime \prime}}
    \\
    & \le C^{2L} L^{3/2} m^{11/36}/ (T+1) + C_1^{2L} L^{1/2} m^{-1/36} 
\end{align*}
Here, $T_2^{\prime}$ is the difference between the network output and its linear approximation, while $T_2^{\prime \prime}$ indicates the gap between the network's linear approximation and the theoretical optimal solution for a linearized network. The first inequality is the triangle inequality, and the second uses Lemma~\ref{lemma:network_output_vs_lin_approx} and Lemma~\ref{lemma:lin_approx_vs_theoretical_regression_sol}. Combining the bounds on $T_1$ and $T_2$, then given any $\delta \in (0,1)$,  with probability at least $1 - \delta \exp (\Omega(C^{-L} m^{1/36}))$, we have:
\begin{align*}
     \lvert f_a(\mathbf{x}) - a(\mathbf{x}; \mathbf{W}_{t-1}) \rvert \le  \left(\sqrt{2}B_a + R_a \sqrt{\gamma_{a,t} + 2 + 2 \log(1/\delta)}\right)\sigma_{a, t-1}(\mathbf{x}) + \frac{\mathcal{E}(m)}{T+1}, 
\end{align*}
where $\mathcal{E}(m) = \mathcal{O}(C^{2L} L^{3/2} m^{11/36})$.
\end{proof}

\subsubsection{Proof of Theorem \ref{theorem:main}}

\TheoremMain*

\begin{proof}

We now bound the cumulative regret $R_T$ and the cumulative violation $V_{c_i,T}$ of each constraint in turn:
\paragraph{Bound Cumulative Regret $R_{T}$:}
We utilize some results from \citet{tran2022regret} presented in Lemma \ref{lemma:objective_rt} and Lemma \ref{lemma:improvement_f} in Section \ref{section:technical_lemmas} to bound our cumulative regret. We obtain an upper bound for the Cumulative Regret $R_T$ as follows:
\begin{align*}
    R_T & \le R_T^+ = \sum_{t=1}^T r_t^+
    \\
    & \le \sum_{t=1}^T \left(C + \sqrt{2}\pi(B + \sqrt{2}) \right)\left(\text{Im}_t (\mathbf{x})+ \beta_{f,t} \sigma_{f,t-1}(\mathbf{x}_t) \right)
    \\
    & \le \sum_{t=1}^T \left(C + \sqrt{2}\pi(B + \sqrt{2}) \right) \text{Im}_t (\mathbf{x}) +  \sum_{t=1}^T  \left(C + \sqrt{2}\pi(B + \sqrt{2}) \right) \beta_{f,t} \sigma_{f,t-1}(\mathbf{x}_t) 
    \\
    & \le \left(C + \sqrt{2}\pi(B + \sqrt{2}) \right) \sum_{t=1}^T \text{Im}_t (\mathbf{x}) +  \left(C + \sqrt{2}\pi(B + \sqrt{2}) \right) \beta_{f,T} \sum_{t=1}^T \sigma_{f,t-1}(\mathbf{x})
    \\
    & \le \left(C + \sqrt{2}\pi(B + \sqrt{2}) \right) \sum_{t=1}^T \text{Im}_t (\mathbf{x}) +  \left(C + \sqrt{2}\pi(B + \sqrt{2}) \right) \beta_{f,T} \max \left(\sum_{t=1}^T \sigma_{f,t-1}(\mathbf{x}), B \right) 
    \\
    & \le 2 \beta_{f,T} \sqrt{\frac{B T}{\log(B+1)} (2\gamma_{f,T}+1)} + 2 \mathcal{E}(m)
\end{align*}
The first inequality is from Lemma \ref{lemma:objective_rt} and the last inequality is due to Lemma \ref{lemma:improvement_f} and Lemma \ref{lemma:min_sigma}. 
\paragraph{Bound Cumulative Violation $V_{c_i, T}$:}
\begin{align*}
V_{c_i, T}  & = \sum_{t=1}^T [c_i(\mathbf{x}_t)]^+ \\
& = \sum_{t=1}^T[c_i(\mathbf{x}_t) - \text{LCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t}) + \text{LCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t})]^+
\\
& \le \sum_{t=1}^T[c_i(\mathbf{x}_t) - \text{LCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t})]^+  + \sum_{t=1}^T [\text{LCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t})]^+ 
\\
& = \sum_{t=1}^T[c_i(\mathbf{x}_t) - \text{LCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t})]^+
\\
& \le \sum_{t=1}^T[\text{UCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t}) - \text{LCB}_{c_i,t}(\mathbf{x}, \boldsymbol{\omega}_{c_i,t})]^+
\\
& \le 2 \beta_{c_i,T} \sum_{t=1}^T \sigma_{c_i,t}(\mathbf{x})  + \frac{2 \mathcal{E}(m)}{T+1} 
\\ 
& \le 2 \beta_{c_i,T} \max\left (\sum_{t=1}^T \sigma_{c_i,t}(\mathbf{x}), S_i \right)  + \frac{2 \mathcal{E}(m)}{T+1}
\\
& \le 2 \left(S_i + R_a \sqrt{\gamma_{a,T} + 2 + 2 \log(1/\delta)} \right) \sqrt{\frac{ S_i T}{\log(S_i+1)} (2\gamma_{c_i,T}+1)} + 2 \mathcal{E}(m)
\end{align*}
The first inequality follows from the fact that $[a+b]^+ \le [a]^+ + [b]^+, \forall a,b \in \mathbb{R}$. The second equality is from the feasibility condition in Algorithm~\ref{alg:neural_cbo}. The second inequality is from Corollary~\ref{corrolary:f_in_lcb_ucb} and the last inequality is from Lemma~\ref{lemma:confidence_bound}.
\end{proof}



\subsection{Technical Lemmas}
\label{section:technical_lemmas}
The following lemmas provide an upper bound on the instantaneous regret $r_t$ and an upper bound on the cumulative improvement function $\text{Im}_t (\mathbf{x})$: 
\begin{lemma}[Lemma 10, \citep{tran2022regret}]
    \label{lemma:objective_rt}
    There exists a constant $C > 0$ such that
    \begin{align*}
        r_t \le r_t^+ \le \left(C + \sqrt{2}\pi(B + \sqrt{2}) \right)\left(\text{Im}_t (\mathbf{x})+ \beta_{f,t} \sigma_{f,t-1}(\mathbf{x}_t) \right),
    \end{align*}
    where $\text{Im}_t (\mathbf{x}) = \max(0, f(\mathbf{x}_t) - \mu_t^+ + \mathcal{E}(m)), $ and $\mu^+_t = \max_{\mathbf{x}_k \in \mathcal{D}_{t-1}} v(\mathbf{x}_k; \boldsymbol{\theta}_{t-1})$ is the best value of the mean objective function so far. 
\end{lemma}

\begin{lemma}[Lemma 14, \citep{tran2022regret}]
\label{lemma:improvement_f}
    Pick $\delta \in (0, 1)$. Then with probability at least $1 - \delta$ we have that:
    \[ \sum_{t=1}^T {\text{Im}}_t (\mathbf{x}_t) = \mathcal{O}(\beta_T \sqrt{T\gamma_T} ) + \mathcal{E}(m). \]
\end{lemma}

The following lemma gives the concentration of NTK at the initialization of the neural network introduced in Equation \eqref{eqn:fcn}.
\begin{lemma}[Theorem 1, \citep{xu2024overparametrized}]
\label{lemma:H_to_Phi}
Under Gaussian initialization, for $m \ge Cd^2 \exp(L^2)$ for some constant $C$, there exist constants $C_1, C_2$ and $C_3$ such that, with probability at least $1 - \exp (-C_1 m^{1/3})$,
\[
\norm{\mathbf{H}^{(l)}_0 - \boldsymbol{\Phi}^{(l)}}_\infty \le C_2 \left(\frac{C_3^L}{m^{1/6}} + \sqrt{\frac{dL\log m}{m}}\right), \forall 1\le l \le L,
\]
where $\boldsymbol{\Phi}^{(l)}$ is a deterministic kernel matrix. For more details about the recursive definition of $\boldsymbol{\Phi}^{(l)}$, see Section 4.1 of \citet{xu2024overparametrized}.
\end{lemma}
The next lemma shows the bound of difference between the gradient of neural network $a(\mathbf{x}; \mathbf{W})$ at step $t$ and initialization.
\begin{lemma}[Proposition 9, \citep{xu2024overparametrized}] Consider the neural network introduced in Equation~\eqref{eqn:fcn} and assume that Condition~\ref{condition:network_width} holds. With probability $1 - \exp (\Omega(C^{-L} m^{1/36}))$, for any sample path $\{ \mathbf{x}_s, y_s\}_{s=0}^{T-1}$, all $t\le T$, we have
\begin{align*}
\begin{split}
    \sup_\mathbf{x} \norm{\mathbf{g}_{a} (\mathbf{x}, \mathbf{W}_t) - \mathbf{g}_{a}(\mathbf{x}; \mathbf{W}_0)}_2 & \le C_1^{2L} L^{1/2}m^{-1/36}
    \\
    \norm{\mathbf{g}_{a}(\mathbf{x}; \mathbf{W}_0)}_2 &\le C_2^L L^{1/2}  ,
\end{split}    
\end{align*}
for some constants $C_1$ and $C_2$. 
\end{lemma}
The next lemma shows the reproducing property of function $f_a$ being assumed to belong to RKHS induced by NTK kernel $k_a$ of the neural network $a(\mathbf{x}; \mathbf{W})$ introduced in Equation \eqref{eqn:fcn}.
\begin{lemma}
    \label{lemma:RKHS_expression}
    Let $f_a$ be a member of $\mathcal{H}_{k_a}$ with bounded RKHS norm $\norm{f_a}_{\mathcal{H}_{k_a}} \le B_a$. Assume that the network width of the model used to estimate function $f_a (\cdot)$ satisfies the Condition \ref{condition:network_width}, 
    then $\forall \mathbf{x} \in D$, there exists $f_a^* \in \mathbb{R}^p$, where $p=md+m^2(L-2)+m$ such that:
    \[
    f_a(\mathbf{x}) = \langle \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0), f_a^* \rangle = \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0)^\top f_a^*
    \]

\end{lemma}

\begin{proof}[Proof of Lemma \ref{lemma:RKHS_expression}]
    Due to Lemma~\ref{lemma:H_to_Phi}, with probability at least $1 - \exp (-C_1 m^{1/3} \log L)$, we have:
    \[
    \norm{\mathbf{H}_0 - \boldsymbol{\Phi}}_\infty \le C_2 L \left(\frac{C_3^L}{m^{1/6}} + \sqrt{\frac{dL\log m}{m}}\right).
    \] 
    It is noted that following our definition,  $\mathbf{H}_0 = \mathbf{G}_{a,t} ^\top \mathbf{G}_{a,t}$. That leads to:
    \begin{align*}
        \frac{1}{\sqrt{m}}\norm{\mathbf{G}_{a,t} ^\top \mathbf{G}_{a,t} - \boldsymbol{\Phi}}_F & \le \frac{t}{\sqrt{m}} \norm{\mathbf{G}_{a,t} ^\top \mathbf{G}_{a,t} - \boldsymbol{\Phi}}_\infty 
        \\
        & \le \frac{t C_2 L}{\sqrt{m}} \left(\frac{C_3^L}{m^{1/6}} + \sqrt{\frac{dL\log m}{m}}\right)  \le \lambda_0,
    \end{align*}
    where $\lambda_0$ is a constant which is independent of $m$. The last inequality is from the choice of $m$ in Condition~\ref{condition:network_width}. 
    Then, we have:
    \begin{align*}
        \frac{1}{\sqrt{m}}\mathbf{G}_{a,t} ^\top \mathbf{G}_{a,t} & \succcurlyeq  \frac{1}{\sqrt{m}} \left(\boldsymbol{\Phi} - 
        \norm{\mathbf{G}_{a,t} ^\top \mathbf{G}_{a,t} - \boldsymbol{\Phi}}_F \mathbf{I} \right)
        \\
        & \succcurlyeq \frac{1}{\sqrt{m}} \left(\boldsymbol{\Phi} - 
        \lambda_0 \mathbf{I} \right) \succ 0, 
    \end{align*}
    which implies that $\mathbf{G}_{a,t} ^\top \mathbf{G}_{a,t}$ is positive definite.  Thus, suppose the singular value decomposition of $\mathbf{G}_{a,t}$ is $\mathbf{G}_{a,t} = \mathbf{P}_{a,t} \mathbf{A}_{a,t} \mathbf{Q}_{a,t}^\top$; then by choosing $f_a^* = \mathbf{P}_{a,t} \mathbf{A}_{a,t}^{-1} \mathbf{Q}_{a,t}^\top \mathbf{F}_{a,t}$, we have
    \[
    \mathbf{G}_{a,t}^\top f_a^* =  \mathbf{Q}_{a,t} \mathbf{A}_{a,t} \mathbf{P}_{a,t}^\top \mathbf{P}_{a,t} \mathbf{A}_{a,t}^{-1} \mathbf{Q}_{a,t}^\top \mathbf{F}_{a,t}  = \mathbf{F}_{a,t}, 
    \]
    which indicates that for any $\mathbf{x}$, $\langle \mathbf{g}_a(\mathbf{x}; \mathbf{W}_0), f_a^*\rangle = f_a(\mathbf{x})$.
\end{proof}
Let $\mathbf{z}_t^{(l)} (\mathbf{x})$ measure the sensitivity of the output from the $l$-th hidden layer and be defined as:
\begin{align*}
    \begin{split}
        [\mathbf{z}_t^{(l)}(\mathbf{x})]^\top &= \left [ \frac{\partial a(\mathbf{x}; \mathbf{W}_t)}{\partial \mathbf{h}_t^{(l)}(\mathbf{x})}\right]^\top \\
        &= \mathbf{q}^\top \frac{1}{\sqrt{m}} \mathbf{D}_t^{(L)}(\mathbf{x}) \mathbf{W}_t^{(L)} \dots \frac{1}{\sqrt{m}} \mathbf{D}_t^{(l+1)}(\mathbf{x}) \mathbf{W}_t^{(l+1)}, 
    \end{split} 
\end{align*}
Then the following lemma provides the bound of the difference between $\mathbf{z}_t^{(l)}(\mathbf{x})$ and $\mathbf{z}_0^{(l)}(\mathbf{x})$:
\begin{lemma}[Lemma 12, \citep{xu2024overparametrized}]
\label{lemma:nn_sensitivity_bound}
Consider the neural network introduced in Equation~\eqref{eqn:fcn} and assume that Condition~\ref{condition:network_width} holds. With probability $1 - \exp (\Omega( C_1^{-L+l} m^{1/36}))$, for layer $l$ and any sample path $\{ \mathbf{x}_s, y_s\}_{s=0}^{T-1}$, with all $t\le T$, we have:
\begin{align*}
    \sup_\mathbf{x} \norm{\mathbf{z}_t^{(l)}(\mathbf{x}) - \mathbf{z}_0^{(l)}(\mathbf{x})}_2 & \le \mathcal{O}(C_1^{2L-l} m^{17/36}) 
    \\
    \sup_\mathbf{x} \norm{\mathbf{z}_0^{(l)}(\mathbf{x})}_2 &  \le C_2^{L-l} \sqrt{m}
    \\
    \sup_\mathbf{x} \norm{\mathbf{z}_t^{(l)}(\mathbf{x})}_2 &  \le C_3^{2L-l-1} \sqrt{m}
\end{align*}
for some absolute constants $C_1, C_2, C_3$.
\end{lemma}



The following lemma provides the bound on the difference between neural network weights and output at initialization and at step $t$:

\begin{lemma}[Lemma 10, \citep{xu2024overparametrized}]
\label{lemma:weights_and_output_bounds}
Consider the neural network introduced in Equation~\eqref{eqn:fcn} and assume that Condition~\ref{condition:network_width} holds. Setting the step size at training step $t$ as $\alpha_t \le \frac{\nu}{(T+1)^2}$, then with probability $1 - \exp (\Omega( C^{-L} m^{1/36}))$, for any sample path $\{ \mathbf{x}_s, y_s\}_{s=0}^{T-1}$, all $t\le T$, we have:
\begin{align*}
    \norm{\mathbf{W}_t^{(l)} - \mathbf{W}_0^{(l)}}_2 & \le \frac{m^{1/3}L^{1/2}}{T+1} \\
    \norm{\mathbf{W}_0^{(l)}}_2, \norm{\mathbf{W}_t^{(l)}}_2 & \le \mathcal{O}(\sqrt{m})
    \\
    \sup_\mathbf{x} \norm{\mathbf{h}_t^{(l)}(\mathbf{x}) - \mathbf{h}_0^{(l)}(\mathbf{x})}_2 & \le \frac{C_3^l}{m^{1/6}}, 
\end{align*}
for some absolute constant $C_3$.
\end{lemma}


The following lemmas provide bound on the technical terms used in Lemma \ref{lemma:confidence_bound}.

\begin{lemma} Let $a(\mathbf{x}; \mathbf{W})$ be the neural network defined in Equation~\eqref{eqn:fcn}. Then, with probability $1 - \exp (\Omega(C^{-L} m^{1/36}))$, we have: 
\label{lemma:network_output_vs_lin_approx}
    \begin{align*}
        \lvert a(\mathbf{x}, \mathbf{W}_{t-1}) - a(\mathbf{x}, \mathbf{W}_{0}) - \langle \mathbf{g}_{a}(\mathbf{x}, \mathbf{W}_0), \mathbf{W}_{t-1} - \mathbf{W}_0 \rangle \rvert \le \mathcal{O}(C^{2L} L^{3/2} m^{11/36})
    \end{align*}
\end{lemma}

\begin{proof}[Proof of Lemma \ref{lemma:network_output_vs_lin_approx}]


Recall that 
\begin{equation*}
    \mathbf{h}^{(l)}(\mathbf{x}) = \frac{1}{\sqrt{m}} \mathbf{D}^{(l)}(\mathbf{x}) \mathbf{W}^{(l)} \dots \frac{1}{\sqrt{m}} \mathbf{D}^{(1)}(\mathbf{x}) \mathbf{W}^{(1)} \mathbf{x},
\end{equation*}
Then, by direct calculation, we have
\begin{align*}
 \frac{\partial a(\mathbf{x}; \mathbf{W}_0)}{\partial \mathbf{W}^{(l)}}  &= \frac{\mathbf{q}^\top}{\sqrt{m}} \mathbf{D}_0^{(L)}(\mathbf{x}) \mathbf{W}_0^{(L)} \dots \frac{1}{\sqrt{m}} \mathbf{D}_0^{(l)}(\mathbf{x}) \left[\mathbf{h}_0^{(l-1)}(\mathbf{x}) \right] ^\top
 \\
 &= \frac{1}{\sqrt{m}} [\mathbf{z}_0^{(l)}(\mathbf{x})]^\top \mathbf{D}_0^{(l)}(\mathbf{x})\left[\mathbf{h}_0^{(l-1)}(\mathbf{x}) \right] ^\top,
\end{align*}
and 
\begin{align*}
    \mathbf{g}_{a}(\mathbf{x}; \mathbf{W}_0) &= \left [ \frac{\partial a(\mathbf{x}; \mathbf{W}_0)}{\partial \mathbf{W}^{(1)}}, \frac{\partial a(\mathbf{x}; \mathbf{W}_0)}{\partial \mathbf{W}^{(2)}}, \dots, \frac{\partial a(\mathbf{x}; \mathbf{W}_0)}{\partial \mathbf{W}^{(L)}} \right]
\end{align*}
We also rewrite $a(\mathbf{x}; \mathbf{W}_{t-1})$ and $a(\mathbf{x}; \mathbf{W}_{0})$  as:
\begin{align*}
    a(\mathbf{x}; \mathbf{W}_{t-1}) &= \frac{\mathbf{q}^\top}{\sqrt{m}} \mathbf{D}_{t-1}^{(L)}(\mathbf{x}) \mathbf{W}_{t-1}^{(L)} \dots \frac{1}{\sqrt{m}} \mathbf{D}_{t-1}^{(1)}(\mathbf{x}) \mathbf{W}_{t-1}^{(1)} (\mathbf{x}) 
    \\
    & = \frac{1}{\sqrt{m}} \mathbf{z}_{t-1}^{(l)} (\mathbf{x}) \mathbf{D}_{t-1}^{(l)} (\mathbf{x}) \mathbf{W}_{t-1}^{(l)}  \mathbf{h}_{t-1}^{(l-1)} (\mathbf{x}),
    \\
    a(\mathbf{x}; \mathbf{W}_{0}) &= \frac{\mathbf{q}^\top}{\sqrt{m}} \mathbf{D}_{0}^{(L)}(\mathbf{x}) \mathbf{W}_{0}^{(L)} \dots \frac{1}{\sqrt{m}} \mathbf{D}_{0}^{(1)}(\mathbf{x}) \mathbf{W}_{0}^{(1)} (\mathbf{x})
    \\
    & = \frac{1}{\sqrt{m}} \mathbf{z}_{0}^{(l)} (\mathbf{x}) \mathbf{D}_{0}^{(l)} (\mathbf{x}) \mathbf{W}_{0}^{(l)}  \mathbf{h}_{0}^{(l-1)} (\mathbf{x})
    % \\
    % & = \frac{1}{L} \sum_{l=1}^L \mathbf{z}_{t-1}^{(l-1)} \mathbf{h}_{t-1}^{(l-1)}(\mathbf{x}) 
    % \\
    % & = \frac{1}{L} \sum_{l=1}^L \mathbf{z}_{t-1}^{(l-1)} \left[ \mathbf{h}_{t-1}^{(l-1)}(\mathbf{x})  - \mathbf{h}_{0}^{(l-1)}(\mathbf{x}) \right]  + \frac{1}{L} \sum_{l=1}^L \frac{1}{\sqrt{m}}\mathbf{z}_{t-1}^{(l)} \mathbf{D}_{t-1}^{(l)}(\mathbf{x}) \mathbf{W}_{t-1}^{(l)}(\mathbf{x}) \left[ \mathbf{h}_{0}^{(l-1)}(\mathbf{x}) \right]
\end{align*} 

Using the technique in the proof of Lemma 8.2 in \citet{allen2019convergence},  there exist diagonal matrices $\widehat{\mathbf{D}}^{(l)} (\mathbf{x}) = \mathbf{D}_{t-1}^{(l)}(\mathbf{x}) - \mathbf{D}_{0}^{(l)}(\mathbf{x})   \in \mathbb{R}^{m \times m}, \forall 1\le l \le L$ with entries in
$[-1,1]$ such that:
\begin{align*}
    &  a(\mathbf{x}, \mathbf{W}_{t-1}) - a(\mathbf{x}, \mathbf{W}_{0}) 
    \\
    & = \frac{1}{\sqrt{m}} \sum_{l=1}^L \left [ \prod_{r = l+1}^L \left(\widehat{\mathbf{D}}^{(r)} (\mathbf{x}) + \mathbf{D}_{t-1}^{(r)} (\mathbf{x}) \right) \mathbf{W}_{t-1}^{(r)} \right] \left (\widehat{\mathbf{D}}^{(l)} (\mathbf{x}) + \mathbf{D}_{t-1}^{(l)} (\mathbf{x}) \right) (\mathbf{W}_{t-1}^{(l)} - \mathbf{W}_{0}^{(l)}) \mathbf{h}_{0}^{(l-1)} (\mathbf{x})
    \\
    & = \frac{1}{\sqrt{m}} \sum_{l=1}^L  \widehat{\mathbf{z}}_{t-1}^{(l)} \left (\widehat{\mathbf{D}}^{(l)} (\mathbf{x}) + \mathbf{D}_{t-1}^{(l)} (\mathbf{x}) \right) \left( \mathbf{W}_{t-1}^{(l)} - \mathbf{W}_{0}^{(l)} \right) \mathbf{h}_{0}^{(l-1)} (\mathbf{x})
\end{align*}
Furthermore, we have 
\begin{align*}
    \langle \mathbf{g}_{a}(\mathbf{x}, \mathbf{W}_0), \mathbf{W}_{t-1} - \mathbf{W}_0 \rangle &= \frac{1}{\sqrt{m}}\sum_{l=1}^L  \mathbf{z}_0^{(l)}(\mathbf{x}) \mathbf{D}_0^{(l)}(\mathbf{x}) \left(\mathbf{W}_{t-1}^{(l)} -  \mathbf{W}_0^{(l)}\right) \mathbf{h}_0^{(l-1)}(\mathbf{x}) 
\end{align*}

Substituting the above expressions, we get
\begin{align*}
    & \lvert a(\mathbf{x}, \mathbf{W}_{t-1}) -  a(\mathbf{x}, \mathbf{W}_{0}) - \langle \mathbf{g}_{a}(\mathbf{x}, \mathbf{W}_0), \mathbf{W}_{t-1} - \mathbf{W}_0 \rangle \rvert 
    \\
    & = \frac{1}{\sqrt{m}} \sum_{l=1}^L  \widehat{\mathbf{z}}_{t-1}^{(l)} (\mathbf{x}) \left (\widehat{\mathbf{D}}^{(l)} (\mathbf{x}) + \mathbf{D}_{t-1}^{(l)} (\mathbf{x}) \right) \left( \mathbf{W}_{t-1}^{(l)} - \mathbf{W}_{0}^{(l)} \right) \mathbf{h}_{0}^{(l-1)} (\mathbf{x})
    \\ 
    & - \frac{1}{\sqrt{m}}\sum_{l=1}^L  \mathbf{z}_0^{(l)}(\mathbf{x}) \mathbf{D}_0^{(l)}(\mathbf{x}) \left(\mathbf{W}_{t-1}^{(l)} -  \mathbf{W}_0^{(l)}\right) \mathbf{h}_0^{(l-1)}(\mathbf{x})
    \\
    & \le \frac{1}{\sqrt{m}}\sum_{l=1}^L \norm{\widehat{\mathbf{z}}_{t-1}^{(l)} - \mathbf{z}_0^{(l)}(\mathbf{x})}_2  \norm{\mathbf{W}_{t-1}^{(l)} -  \mathbf{W}_0^{(l)}}_2 \norm{\mathbf{h}_{0}^{(l-1)}}
    \\
    & \le L m^{-1/2} L^{1/2} m^{1/3} C^{2L} m^{17/36} / (T+1)
    \\ 
    & \le (C^{2L} L^{3/2} m^{11/36}) / (T+1).
\end{align*} 
The first inequality uses the triangle inequality. The second inequality is from Lemma~\ref{lemma:nn_sensitivity_bound} and Lemma~\ref{lemma:weights_and_output_bounds}.
\end{proof}

\begin{lemma} Let $a(\mathbf{x}; \mathbf{W})$ be the neural network defined in Equation~\eqref{eqn:fcn}. Then, with probability $1 - \exp (\Omega(C^{-L} m^{1/36}))$, we have: 
\label{lemma:lin_approx_vs_theoretical_regression_sol}
    \begin{align*}
    \lvert \langle \mathbf{g}_a(\mathbf{x};\mathbf{W}_0), \mathbf{W}_{t-1} - \mathbf{W}_0 - \mathbf{U}_{a,t-1}^{-1} \mathbf{G}_{a,t-1} \mathbf{y}_{t-1} \rangle \rvert \le C_1^{2L} L^{1/2} m^{-1/36}.
    \end{align*}
\end{lemma}
\begin{proof}[Proof of Lemma \ref{lemma:lin_approx_vs_theoretical_regression_sol}]
Using the model update formula given in Equation \eqref{eqn:train_NN}, we have 
\begin{align*}
    \mathbf{W}_{t-1} - \mathbf{W}_0 &= (\mathbf{W}_{t-1} - \mathbf{W}_{t-2}) + (\mathbf{W}_{t-2} - \mathbf{W}_{t-3})  + \dots + (\mathbf{W}_1 - \mathbf{W}_0) 
    \\
    &= \sum_{i=1}^{t-1}  (\mathbf{W}_{i} -  \mathbf{W}_{i-1}) 
    \\
    &= \sum_{i=1}^{t-1}  \alpha_i \left(y_i - a(\mathbf{x}_i, \mathbf{W}_{i-1})\right) \nabla_\mathbf{W} a(\mathbf{x}_i, \mathbf{W}_{i-1}) 
    \\
    & = \sum_{i=1}^{t-1}  \alpha_i \left(y_i - a(\mathbf{x}_i, \mathbf{W}_{i-1})\right) \mathbf{g}_{a, i-1} (\mathbf{x}_i, \mathbf{W}_{i-1}) 
    \\
    & = \alpha \mathbf{\Bar{G}}_{a, t-1} (\mathbf{y}_{t-1} - \mathbf{A}_{t-1}),
\end{align*}
where $\mathbf{A}_{t-1} = [a(\mathbf{x}_1, \mathbf{W}_1), \dots, a(\mathbf{x}_{t-1}, \mathbf{W}_{t-1})] \in \mathbb{R}^{t-1}$.  Then we have:
\begin{align*}
    & \lvert \mathbf{W}_{t-1} - \mathbf{W}_0 - \mathbf{U}_{a,t-1}^{-1}
    \mathbf{G}_{a,t-1} \mathbf{y}_{t-1} \rvert 
    \\
    & = \lvert  \alpha \mathbf{\Bar{G}}_{a, t-1} (\mathbf{y}_{t-1} - \mathbf{A}_{t-1}) - \mathbf{U}_{a,t-1}^{-1} \mathbf{G}_{a,t-1} \mathbf{y}_{t-1} \rvert 
    \\
    & = \lvert  \alpha (\mathbf{\Bar{G}}_{a, t-1} - \mathbf{G}_{a, t-1}) (\mathbf{y}_{t-1} - \mathbf{A}_{t-1}) + \alpha \mathbf{G}_{a, t-1} (\mathbf{y}_{t-1} - \mathbf{A}_{t-1}) - (\mathbf{I} + \mathbf{G}_{a,t-1} \mathbf{G}_{a,t-1}^\top)^{-1} \mathbf{G}_{a,t-1} \mathbf{y}_{t-1} \rvert
    \\
    & = \lvert  \alpha (\mathbf{\Bar{G}}_{a, t-1} - \mathbf{G}_{a, t-1}) (\mathbf{y}_{t-1} - \mathbf{A}_{t-1}) + \alpha \mathbf{G}_{a, t-1} (\mathbf{y}_{t-1} - \mathbf{A}_{t-1}) - \mathbf{G}_{a,t-1}(\mathbf{I} + \mathbf{G}_{a,t-1}^\top \mathbf{G}_{a,t-1} )^{-1} \mathbf{y}_{t-1} \rvert
    \\
    & \le \norm{\alpha (\mathbf{\Bar{G}}_{a, t-1} - \mathbf{G}_{a, t-1}) (\mathbf{y}_{t-1} - \mathbf{A}_{t-1})}_2 + \alpha \norm{\mathbf{G}_{a, t-1}}_2 \norm{ \mathbf{y}_{t-1} \left[\mathbf{I} - (\alpha\mathbf{I} + \alpha \mathbf{G}_{a,t-1}^\top \mathbf{G}_{a,t-1} )^{-1} \right] - \mathbf{A}_{t-1}}_2
    \\
    & \le \lvert \alpha \rvert \sqrt{t} \norm{(\mathbf{\Bar{G}}_{a, t-1} - \mathbf{G}_{a, t-1})}_2 + \lvert \alpha \rvert \sqrt{t} \norm{(\mathbf{G}_{a, t-1})}_2
     \\
    & \le C_1^{2L} L^{1/2} m^{-1/36}
\end{align*}
The first inequality is from the triangle inequality and the last inequality is due to the choice of $\alpha = \frac{\nu}{(T+1)^2}$, where $\nu$ is a parameter and independent of
dimension $d$. 
Therefore, we have:
\begin{align*}
    & \lvert \langle \mathbf{g}_a(\mathbf{x};\mathbf{W}_0), \mathbf{W}_{t-1} - \mathbf{W}_0 - \mathbf{U}_{a,t-1}^{-1} \mathbf{G}_{a,t-1} \mathbf{y}_{t-1} \rangle \rvert 
    \\
    & \le \norm{\mathbf{g}_a(\mathbf{x};\mathbf{W}_0)}_2 \norm{\mathbf{W}_{t-1} - \mathbf{W}_0 - \mathbf{U}_{a,t-1}^{-1} \mathbf{G}_{a,t-1} \mathbf{y}_{t-1}}_2 
    \\
    & \le C_1^{2L} L^{1/2} m^{-1/36} 
\end{align*}
\end{proof}








\begin{lemma}[Theorem 1, \citet{chowdhury2017kernelized}]
\label{lemma:noise_affeted_bound}

Let $\{\boldsymbol{\epsilon}_{a, t}\}_{t=1}^\infty$ be a real-valued stochastic process such that for some $R_a \geq 0$ and for all $t \geq 1$, $\boldsymbol{\epsilon}_{a, t}$ is $\mathcal{F}_{a, t-1}$-measurable and $R_a$-sub-Gaussian
conditioned on $\mathcal{F}_{a, t-1}$. Recall $\mathbf{H}_{0}$ defined in Definition~\ref{def:linear_kernelized_terms}. For a given
$\eta > 0$ and $\delta \in (0,1)$, with probability at least $1 - \delta$, the following holds for all $t \geq 1$:
\[
\boldsymbol{\epsilon}_{a, t}^\top ((\mathbf{H}_0 + \eta\mathbf{I})^{-1}+\mathbf{I})^{-1} \boldsymbol{\epsilon}_{a, t}
\leq R_a^2 \log \det ((1+\eta)\mathbf{I} + \mathbf{H}_0) + 2R_a^2 \log(1/\delta).
\]
\end{lemma}

\begin{lemma}
\label{lemma:log_det_Kt_bound}
Let $\delta \in (0,1)$. If the network width $m$ satisfies  Condition \ref{condition:network_width}, then with probability at least $1-\delta$, the following holds for every $t \in [T]$:
\[ \log \det (\mathbf{I} + \mathbf{H}_0) \le 2\gamma_{a,t} + 1,\]
where $\gamma_{a,t}$ is the maximum information gain associated with the NTK kernel $k_a$.
\end{lemma}
\begin{proof}[Proof of Lemma \ref{lemma:log_det_Kt_bound}]

From the definition of $\mathbf{H}_0$ and Lemma~B.7 of \citet{zhou2020neural}, we have that
\begin{equation*}
\begin{split}
    \log \det(\mathbf{I}+ \mathbf{H}_{0})
    & = \log\det \left(\mathbf{I}+\sum_{t=1}^{T}\mathbf{g}({\mathbf{x}_t};\boldsymbol{\theta}_0)\mathbf{g}({\mathbf{x}_t};\boldsymbol{\theta}_0)^\top \right) \\
    & = \log \det(\mathbf{I}+ \boldsymbol{\Phi} + (\mathbf{H}_0 - \boldsymbol{\Phi}))\\
    & \leq \log \det (\mathbf{I}+\boldsymbol{\Phi})  + \langle (\mathbf{I}+\boldsymbol{\Phi})^{-1}, (\mathbf{H}_0 - \boldsymbol{\Phi}) \rangle \\
    & \leq \log \det (\mathbf{I}+\boldsymbol{\Phi})  + \norm{(\mathbf{I}+\boldsymbol{\Phi})^{-1}}_F \norm{ (\mathbf{H}_0 - \boldsymbol{\Phi})}_F \\
    & \leq 2 \gamma_{a,t} + 1, 
\end{split}
\end{equation*}
where the first equality is from the definition of $\mathbf{H}_0$ in Definition~\ref{def:linear_kernelized_terms}, the first inequality is from the concavity of the $\log \det(\cdot)$ function, and the second inequality is from the fact that $\langle \mathbf{A}, \mathbf{B} \rangle \le \norm{\mathbf{A}}_F \norm{\mathbf{B}}_F$. The third inequality is from the choice of $m$ in Condition~\ref{condition:network_width}, combined with Lemma~\ref{lemma:H_to_Phi} and Lemma~3 in \citet{chowdhury2017kernelized}.
\end{proof}

\begin{lemma}[Lemma 8, \citet{phan2023neuralbo}]
\label{lemma:min_sigma}
Consider the neural network $a(\cdot;\boldsymbol{\theta})$ introduced in Equation~\eqref{eqn:fcn} and suppose the width $m$ of the neural network satisfies Condition~\ref{condition:network_width}. Then 
\begin{equation*}
    \sum_{t=1}^T \min(\sigma_{a,t}(\mathbf{x}_t),B) \leq \sqrt{\frac{ BT}{\log(B+1)} (2\gamma_{a,T}+1)}.
\end{equation*}
\end{lemma}



\end{document}
