\documentclass{uai2025} % for initial submission
%\documentclass[accepted]{uai2025} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2025} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2025} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

% \usepackage[colorlinks,citecolor=blue,urlcolor=blue,linkcolor=blue,linktocpage=true]{hyperref}
\PassOptionsToPackage{colorlinks,citecolor=blue,urlcolor=blue,linkcolor=blue,linktocpage=true}{hyperref} 

%% Some suggested packages, as needed:
% \usepackage{natbib} % has a nice set of citation styles and commands
\usepackage[square,numbers,sort]{natbib}
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams

% \usepackage[normalem]{ulem}
% \usepackage{enumitem}



\usepackage{wrapfig}
\usepackage{comment}
% \usepackage{geometry}
% \geometry{margin=1in}

% \usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic,algorithm}

\usepackage{microtype}
% \usepackage{graphicx}
% \usepackage{subfigure}
% \usepackage{booktabs}

% \usepackage{comment}

\usepackage{multirow,makecell}
% \usepackage[american]{babel}
\usepackage{amsmath,amssymb,amsthm,bm}
\usepackage{mathtools}
\usepackage{enumerate}
\usepackage{enumitem}
\usepackage{subfigure}
\usepackage{color}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{xcolor}         % colors

% \makeatletter
% \newcommand{\printfnsymbol}[1]{%
% 	\textsuperscript{\@fnsymbol{#1}}%
% }
% \makeatother

% Theorem-like environments (amsthm is loaded above).
% `theorem` owns the counter; every other environment below is declared with
% the optional argument [theorem] and therefore shares that counter, so
% numbering runs consecutively across all types (Theorem 1, Lemma 2, ...).
% No \theoremstyle is selected here (the switch below is commented out), so
% all environments use whichever theorem style is active at this point.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}[theorem]{Lemma}
\newtheorem{definition}[theorem]{Definition}
% \theoremstyle{definition}
\newtheorem{remark}[theorem]{Remark}
% \newtheorem{remark-star}{Remark}
% \newtheorem{remark-star-1}{Remark}
% \newtheorem*{observe}{Observation}
\newtheorem{conjecture}[theorem]{Conjecture}
\newtheorem{assumption}[theorem]{Assumption}
\newtheorem{observation}[theorem]{Observation}
\newtheorem{fact}[theorem]{Fact}
\newtheorem{corollary}[theorem]{Corollary}
\newtheorem{proposition}[theorem]{Proposition}
\newtheorem{claim}[theorem]{Claim}
\newtheorem{example}[theorem]{Example}
% \newtheorem*{proof-sketch}{Proof Sketch}

% Upright operator names for optimization problems.
% Each name is wrapped in \mathop, so TeX treats it as a large operator:
% a subscript such as \argmax_{x \in \cX} is set underneath in display style.
\newcommand{\argmin}{\mathop{\mathrm{argmin}}}
\newcommand{\argmax}{\mathop{\mathrm{argmax}}}
\newcommand{\minimize}{\mathop{\mathrm{minimize}}}
% "subject to": the two words are separated by two thin spaces (\,\,).
\newcommand{\st}{\mathop{\mathrm{subject\,\,to}}}

% Colour helpers: typeset the argument in red / blue text.
\newcommand{\red}[1]{\textcolor{red}{#1}}
\newcommand{\blue}[1]{\textcolor{blue}{#1}}

% Inline author notes: the argument is set in italics, in a per-author colour,
% and prefixed with the author's name tag in square brackets.
\newcommand{\chong}[1]{\textit{\textcolor{purple}{[Chong]: #1}}} % Chong's notes
\newcommand{\yuxin}[1]{\textit{\textcolor{orange}{[Yuxin]: #1}}} % Yuxin's notes
\newcommand{\hwanwoo}[1]{\textit{\textcolor{blue}{[Hwanwoo]: #1}}} % Hwanwoo's notes

%% make lists small
% \denselist sets \itemsep to 0pt and gives \topsep and \partopsep negative
% values (-10pt and -6pt). Issue it right after \begin{itemize} or
% \begin{enumerate} so the assignments stay local to that list.
\newcommand{\denselist}{\itemsep 0pt\topsep-10pt\partopsep-6pt}


% Math shorthands. They are defined with \def, which performs no
% "already defined" check and silently replaces any existing command.
% NOTE(review): \P, \j, \th and \dim are also names of standard LaTeX commands
% (pilcrow sign, dotless j, thorn, and the \dim operator); after these lines
% the originals are no longer reachable -- confirm nothing relies on them.

% Blackboard-bold symbols.
\def\R{\mathbb{R}}
\def\E{\mathbb{E}}
\def\P{\mathbb{P}}
% NOTE(review): \mathbbm is not provided by any package loaded in the visible
% preamble (it normally comes from the bbm package); \1 will raise an
% "undefined control sequence" error on use unless the class supplies it -- confirm.
\def\1{\mathbbm{1}}
% Parenthesised index "(j)"; the extra braces keep it a single group when
% used as a sub/superscript, e.g. x^\j.
\def\j{{(j)}}
% Upright (roman) names for statistical and linear-algebra quantities.
% These use plain \mathrm, so they carry no operator spacing of their own.
\def\Dis{\mathrm{Dis}}
\def\kl{\mathrm{KL}}
\def\Cov{\mathrm{Cov}}
\def\Var{\mathrm{Var}}
\def\half{\frac{1}{2}}
\def\th{\mathrm{th}}
\def\tr{\mathrm{tr}}
\def\df{\mathrm{df}}
\def\dim{\mathrm{dim}}
\def\col{\mathrm{col}}
\def\row{\mathrm{row}}
\def\nul{\mathrm{null}}
\def\rank{\mathrm{rank}}
\def\nuli{\mathrm{nullity}}
\def\spa{\mathrm{span}}
\def\sign{\mathrm{sign}}
\def\supp{\mathrm{supp}}
\def\diag{\mathrm{diag}}
\def\aff{\mathrm{aff}}
\def\conv{\mathrm{conv}}
% Accented symbols: an "h" prefix is a hat, a "t" prefix is a tilde.
\def\hy{\hat{y}}
\def\ty{\tilde{y}}
\def\hbeta{\hat{\beta}}
\def\tbeta{\tilde{\beta}}
% NOTE(review): \htheta puts the hat on capital \Theta, whereas \btheta below
% uses lowercase \theta -- confirm the capital is intended.
\def\htheta{\hat{\Theta}}
\def\halpha{\hat{\alpha}}
\def\btheta{\boldsymbol{\theta}}
\def\hf{\hat{f}}
\def\hmu{\hat{\mu}}
% The remaining macros carry an extra pair of braces so the accented symbol
% forms one group and can be used directly as a sub/superscript.
\def\hlambda{{\hat{\lambda}}}
\def\heta{{\hat{\eta}}}
\def\hR{{\widehat{R}}}

% Calligraphic capitals: \cA ... \cZ expand to \mathcal{A} ... \mathcal{Z}
% (math mode only). For example, \cX denotes the search space in the text.
% Defined with \def, so an existing command of the same name would be
% replaced without warning.
\def\cA{\mathcal{A}}
\def\cB{\mathcal{B}}
\def\cC{\mathcal{C}}
\def\cD{\mathcal{D}}
\def\cE{\mathcal{E}}
\def\cF{\mathcal{F}}
\def\cG{\mathcal{G}}
\def\cH{\mathcal{H}}
\def\cI{\mathcal{I}}
\def\cJ{\mathcal{J}}
\def\cK{\mathcal{K}}
\def\cL{\mathcal{L}}
\def\cM{\mathcal{M}}
\def\cN{\mathcal{N}}
\def\cO{\mathcal{O}}
\def\cP{\mathcal{P}}
\def\cQ{\mathcal{Q}}
\def\cR{\mathcal{R}}
\def\cS{\mathcal{S}}
\def\cT{\mathcal{T}}
\def\cU{\mathcal{U}}
\def\cV{\mathcal{V}}
\def\cW{\mathcal{W}}
\def\cX{\mathcal{X}}
\def\cY{\mathcal{Y}}
\def\cZ{\mathcal{Z}}

%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
% \swap[<sep>]{<a>}{<b>} typesets <b>, then <sep>, then <a>, i.e. the two
% mandatory arguments in reverse order. <sep> is optional and defaults to "-".
\newcommand{\swap}[3][-]{#3#1#2} % just an example

\title{Bayesian Optimization with Inexact Acquisition:\\Is Random Grid Search Sufficient?}

% The standard author block has changed for UAI 2025 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
% Hwanwoo Kim \\
% \\
% \texttt{hwkim@uchicago.edu}
% \And
% Chong Liu \\
% Data Science Institute\\
% University of Chicago\\
% Chicago, IL, USA\\
% \texttt{chongl@uchicago.edu}\\ 
% \And
% Yuxin Chen\\
% Department of Computer Science\\
% University of Chicago\\
% Chicago, IL, USA\\
% \texttt{chenyuxin@uchicago.edu}
% \author[1]{\href{mailto:<jj@example.edu>?Subject=Your UAI 2025 paper}{Jane~J.~von~O'L\'opez}{}}
\author[1]{Hwanwoo Kim}
\author[2]{Chong Liu}
\author[3]{Yuxin Chen}
% Add affiliations after the authors
\affil[1]{%
    Department of Statistical Science\\
Duke University\\
Durham, NC, USA
}
\affil[2]{%
    Department of Computer Science\\
University at Albany, State University of New York\\
Albany, NY, USA
}
\affil[3]{%
    Department of Computer Science\\
University of Chicago\\
Chicago, IL, USA
  }
  
\begin{document}
\maketitle

\begin{abstract}
Bayesian Optimization (BO) is a widely used iterative algorithm for optimizing black-box functions. Each iteration requires maximizing an acquisition function, such as the Upper Confidence Bound (UCB) or a sample path from the Gaussian process (GP) posterior, as in Thompson Sampling (TS). However, finding an exact solution to these maximization problems is often intractable and computationally expensive. Reflecting such realistic situations, in this paper, we delve into the effect of inexact maximizers of the acquisition functions. Defining a measure of inaccuracy in acquisition solutions, we establish cumulative regret bounds for both GP-UCB and GP-TS without requiring exact solutions of acquisition function maximization. Our results show that under appropriate conditions on accumulated inaccuracy, inexact BO algorithms can still achieve sublinear cumulative regret. Motivated by these findings, we provide both theoretical justification and numerical validation for random grid search as an effective and computationally efficient acquisition function solver. 
\end{abstract}

% \begin{keywords}%
% Bayesian optimization, Acquisition function, Inexact optimization, Gaussian process, Thompson sampling, Bandits
% \end{keywords}

%\yuxin{Todos: full pass on Intro; Josh: Section 3; Chong: Section 4}

\section{Introduction}\label{sec:intro}
Bayesian Optimization (BO) is a class of machine learning-based black-box optimization strategies for finding a global optimum of a real-valued function $f$. Typically, $f$ is a function whose closed-form description is not available but which can be evaluated, albeit at a high computational cost.
% Naturally, the learner does not necessarily have access to the gradient information of $f$, which is often required to use first/second-order optimization algorithms. The only available source of information about the objective function $f$ is through its evaluations, which can be noise-corrupted.
Due to its black-box nature, BO has demonstrated its efficacy across a broad spectrum of practical applications, such as tuning hyperparameters in deep learning \citep{wu2020practical,turner2021bayesian,kandasamy2020tuning}, searching for neural network architectures \citep{kandasamy2018neural,zhou2019bayesnas,white2021bananas}, designing materials \citep{frazier2016bayesian,zhang2020bayesian,lei2021bayesian}, and discovering new drugs \citep{korovina2020chembo,bellamy2022batched,colliandre2023bayesian}. 
% In the domain of deep learning hyperparameter optimization, the objective function $f$ is employed to assess the validation accuracy subsequent to the training phase of a deep learning model. The input variable $x$ encapsulates a vector of hyperparameters pertinent to the training process, which encompasses elements such as the learning rate, batch size, and the total number of training iterations. Within the context of drug discovery, $f$ is indicative of a drug candidate's binding efficacy, and $x$ comprises the experimental conditions like temperature, pressure, humidity, and solution concentration necessary for its synthesis. In these scenarios, the exact form or gradients of the objective function $f$ are unknown to the learner, and evaluating $f$ is computationally intensive. For example, completing a 90-epoch training run of ResNet-50 on ImageNet-1k using an NVIDIA M40 GPU requires 14 days \citep{you2018imagenet}, equating to a single evaluation of the objective function in hyperparameter optimization. Similarly, a comprehensive drug screening process involves around 80–100 distinct procedures, with a leading pharmaceutical firm capable of conducting only 50–100 such screenings annually \citep{smith2002screening}.

% \subsection{The Bayesian Optimization Framework}



BO follows a sequential decision-making process, where the outcome of each iteration informs the selection of the next evaluation point. After each step, the surrogate model is updated, thereby guiding the selection of the next point for function evaluation. This process is built around two key components: (1) updating a Gaussian process (GP) surrogate model and (2) optimizing the acquisition function to decide the next evaluation point. The Gaussian process surrogate is favored for modeling the objective function $f$ due to its capability to iteratively incorporate all previous observations, possibly corrupted by noise, thus progressively improving the model. The acquisition function $\alpha_t$ directs the optimization by solving for the next evaluation location, denoted by $x_t$,
\begin{align*}
x_t =\argmax_{x \in \cX} \alpha_t (x).
\end{align*}
%It navigates the balance between exploring search space and exploiting an accurately estimated surrogate model. 
This function balances the exploration of the search space with the exploitation of the accurately estimated surrogate model. 
%It is also called inter optimization in Bayesian optimization while the whole objective function optimization is called outer optimization. 
In BO, this is referred to as \textit{inner optimization}, while optimization of the whole objective function is termed \textit{outer optimization}. 
%
Various strategies for designing the acquisition function exist, with popular options including Expected Improvement (EI) \citep{jones1998efficient}, Knowledge Gradient (KG) \citep{frazier2009knowledge}, Probability of Improvement (PI) \citep{kushner1964new}, Upper Confidence Bound (UCB) \citep{srinivas2010gaussian}, and Thompson Sampling (TS) \citep{thompson1933likelihood}, each offering a unique approach to managing the exploration-exploitation balance. 

% \hwanwoo{Is there any way to cite this box?}
% \chong{I don't know, but we only need to cite it once in Introduction and later we will just use Algorithm blocks in Section 3.}

% \fbox{\parbox{0.98\columnwidth}{Bayesian optimization framework \citep{frazier2018tutorial}:
% \small
% \noindent
% \begin{itemize}\denselist
% \item Step 1: Place a Gaussian Process (GP) prior to $f$.
% \item Step 2: Calculate GP posterior using all data points.
% \item Step 3: Obtain $x_t$ as the maximizer of the acquisition function $\alpha_t$ w.r.t. %which is calculated by 
% the GP posterior.
% \item Step 4: Observe $y_t = f(x_t)$ or $y_t = f(x_t)+\epsilon_t$, where $\epsilon_t$ is a random observation noise.
% \item Step 5: Go back to Step 2 and repeat.
% \vspace{-.5em}
% \end{itemize}}}

%The acquisition function $\alpha_t$ guides the optimization process by determining the next point to sample and it is updated at each iteration. A successful acquisition is able to balance the exploration-exploitation trade-off. Here the exploration means sampling where the model is uncertain and exploitation means sampling where the model predicts a high objective function value. There are many ways to build an acquisition function and common choices include Expected Improvement (EI) \citep{jones1998efficient}, Knowledge Gradient (KG) \citep{frazier2009knowledge}, Probability of Improvement (PI) \citep{kushner1964new}, and Upper Confidence Bound (UCB) \citep{srinivas2010gaussian}, and Thompson Sampling (TS) \citep{thompson1933likelihood}.

%Among all these popular choices of acquisition functions, all of them are required to be optimized precisely, i.e.,
%\begin{align*}
%x_t =\argmax_{x \in \cX} \alpha_t (x).
%\end{align*}
%The optimization of the acquisition function $\alpha_t$ is generally considered more straightforward and less costly than optimizing the objective function $f$, thanks to its foundation on Gaussian process modeling, which provides a more manageable framework compared to the blackbox objective function. 

\subsection{Inexact Acquisition Maximization}

Optimizing the acquisition function $\alpha_t$ is generally perceived as more straightforward and cost-effective than optimizing the objective function $f$. This is largely due to its reliance on Gaussian process modeling, which offers a more tractable framework than the complex, black-box objective function itself. 
However, theoretical approaches often require an \textit{exact solution} $x_t$ for establishing regret bounds, a task that becomes particularly difficult in practice due to the possibly non-convex nature of the acquisition function. This situation raises an important question:
\begingroup
% \addtolength\leftmargini{-0.4in}
\begin{quote}
What are the implications of %imprecisely optimizing acquisition functions 
\textit{inexact} acquisition function maximization in BO?
\end{quote}
\endgroup

Despite the widespread success of Bayesian optimization, this question has been surprisingly overlooked for an extended period and remains unexplored. Moreover, beyond Bayesian optimization, this problem is widespread in bandits. Take the classical LinUCB algorithm \citep{abbasi2011improved} as an example: its acquisition function optimization requires an exact solution to an optimization over both the context and the model parameter, which is highly intractable in practice. The problem becomes even worse for generalized linear bandits and non-linear bandits. Recently, there has been a growing line of research on bandits and global optimization with neural network approximation \citep{zhou2020neural,zhang2020neural,dai2022sample}, where the inexact acquisition function optimization problem still exists. In real-world applications, algorithms commonly employ quasi-Newton methods or random search to optimize the acquisition function, resulting in an approximate solution. This practice diverges from the theoretical assumptions of Bayesian optimization, which typically presume that an exact solution $x_t$ is attainable at every iteration $t$. Therefore, a systematic study of this problem is needed to understand what theoretical guarantees one can provide when an exact acquisition function solution cannot be obtained. In Table~\ref{tab:related}, we summarize existing works that address inexactness in bandit optimization and point out the unique position of our paper within this landscape. In the subsection below, we discuss related works. 

%More broadly, for bandit optimization tasks, including multi-armed bandits and Bayesian optimization, there exist different kinds of inexactness. Besides inexact acquisition function optimization, inexactness happens in model misspecification where objective function doesn't sit within the modeling function class, and approximate posterior where exact posterior distributions cannot be established. %\hwanwoo{Can we move this to related works? It doesn't seem like crucial for our paper}


\subsection{Related Works}\label{sec:rw}
%\chong{add randome sampling and discretization related work. Add BO with approximate posterior related work (X before).}\yuxin{added RS}
\textbf{Optimization of acquisition functions.}
%Acquisition function optimization remains a ignored problem 
The optimization of acquisition functions has been a long-neglected issue in BO. %for a long time, and there is very limited existing work studying it. 
\citet{wilson2018maximizing} first studied acquisition function maximization in Bayesian optimization. Since acquisition functions are usually non-convex and high-dimensional, %and intractable, 
they focused on how to \emph{approximately} optimize acquisition functions. They found that acquisition functions estimated
via Monte Carlo integration are consistently amenable to gradient-based optimization, and that EI and UCB can be maximized by greedy approaches. Although both their work and this paper study the inexact acquisition function optimization problem, our research distinguishes itself by concentrating on the impact that inexact solutions have on Bayesian optimization.

% \begin{table*}[t]
% \centering
%   % \begin{tabular}{l*{2}{c}}
%   \begin{tabular}{p{3cm}p{4cm}p{6cm}p{3cm}}
%     \toprule
%     & \multicolumn{3}{c}{\textbf{Inexactness}} \\
%     \cmidrule(lr){2-3}
%      \textbf{setting} & \textit{Model specification} & \textit{Computational approximation} \\
%     \midrule
%     \multirow{ 2}{*}{
%     \textit{(Contextual) MAB}} & Linear function approx. &  Approximate posterior (TS) \\ 
%     ~   & \citep{lattimore2020learning,foster2020adapting} & \citet{phan2019thompson} \\ &\\
%     \multirow{2}{*}{\textit{Bayesian optimization}} &  Kernel misspecification &  Approximate acquisition optimization\\ %\multirow{ 2}{*}{\textbf{(this work)}} \\
%     & \citet{bogunovic2021misspecified} & \textbf{(this work)} \\
%     \bottomrule
%     \end{tabular}
%   \end{table*}


\begin{table*}[t]
\centering
% \resizebox{6in}{!}{
\begin{tabular}{cccc}
%\noalign{\smallskip} 
\toprule \noalign{\smallskip}
\textbf{Problem settings}               & Model misspecification & Approximate posterior & Inexact acquisition function \\ \noalign{\smallskip} \hline \noalign{\smallskip}
\textit{Multi-armed bandits}   & \citet{lattimore2020learning}                 & \citet{phan2019thompson}               & \citet{wang2018thompson}                \\
&\citet{foster2020adapting}  & \citet{lu2017ensemble} &  \citet{kong2021hardness};~\citet{perrault2022combinatorial}              \\
%& & & \citet{perrault2022combinatorial}\\ 
\noalign{\smallskip} \hline \noalign{\smallskip}
\textit{Bayesian optimization} & \citet{bogunovic2021misspecified}           & \citet{vakili2021scalable, vakili2022improved}               & \blue{This paper}                     \\ \noalign{\smallskip} \bottomrule \noalign{\smallskip}
\end{tabular}
%}
\caption{Theoretical study of inexactness in bandit optimization. 
% $\times$ means the problem is not well defined %where the first $\times$ is because there is no model mapping from context to reward and $\times$ is 
% because posterior can be exactly computed under the Gaussian process framework. %$?$ means the problems are still open.
}
\label{tab:related}
\end{table*}

Inexact acquisition function optimization has also been explored in the context of combinatorial semi-bandits, where the decision-making process involves choosing from a combinatorial set of actions to optimize a reward function under constraints. For instance, \citet{xu2021simple} and \citet{ross2013learning} investigate the intricacies of making efficient selections amidst a combinatorial structure of actions by leveraging contextual information to inform the selection of action subsets. These works consider an $\alpha$-approximate oracle \citep{kakade2007playing}, focusing on a relaxed version of regret to account for the inexactness of inner optimization. Here, regret is defined as the difference between an approximation of the best reward and the reward of the pulled (super-)arm. \citet{wang2018thompson} show that, in general, one cannot achieve no-regret with an approximation oracle in Thompson sampling, even for the classical multi-armed bandit (MAB) problem. \citet{kong2021hardness} reveal that linear approximation regret for combinatorial Thompson sampling is pathological, while \citet{perrault2022combinatorial} study a specific condition on the approximation oracle, allowing a reduction to the exact oracle analysis and thus attaining sublinear regret for combinatorial semi-bandits. %In this work, we focus on Gaussian process bandits, and provide conditions under which (variants of) classic BO algorithms exhibit sublinear regret. %.. \yuxin{complete the dicussion from here}
In contrast to these works concerning multi-armed bandits, this work focuses on Gaussian process bandits and proposes sufficient conditions that allow classic BO algorithms, or their variants, to achieve sublinear regret. 


\textbf{Random sampling and discretization methods.}
Random discretization methods have been popular approaches in BO to address the challenges posed by non-convex and high-dimensional acquisition functions. One source of inexactness in acquisition function optimization is due to the discretization inherent in these methods. \citet{bergstra2012random} demonstrate that random search is often more effective than grid search for hyper-parameter optimization, particularly as the dimensionality of the problem increases. They show that random search explores a larger, less promising configuration space, but still finds better models within a smaller fraction of the computation time compared to grid search. This approach is beneficial in BO as well, where the curse of dimensionality makes grid-based methods impractical. Our work provides a theoretical understanding of when random sampling works well in the context of inexact acquisition function maximization.
More recently, \citet{gramacy2022triangulation} propose using candidates based on a Delaunay triangulation of the existing input design for Bayesian optimization. These ``tricands'' outperform both numerically optimized acquisitions and random candidate-based alternatives on benchmark synthetic and real simulation experiments. Similarly, \citet{wycoff2024voronoi} introduce the use of candidates lying on the boundary of the Voronoi tessellation of current design points. This approach significantly improves the execution time of multi-start continuous search without a loss in accuracy, by efficiently sampling the Voronoi boundary without explicitly generating the tessellation, thus accommodating large designs in high dimension. These methods leverage geometric structures to provide more efficient candidate sets for the inner optimization process, aligning with our focus on inexact acquisition function optimization.


\textbf{Misspecified and approximate inference in bandits. %optimization.
}
Besides acquisition functions (inner optimization), inexactness also occurs in the (outer) optimization loop of bandit problems, %there misspecified bandit work focuses on inexact 
where the objective function class is misspecified with respect to the modeling function class. For example, in misspecified linear bandits \citep{lattimore2020learning}, the objective function is non-linear, while misspecified Gaussian process bandit optimization \citep{bogunovic2021misspecified} studies the scenario where the objective function lies outside a function class of bounded RKHS norm. 
%In both cases, they focused on inexactness in outer optimization rather than inexact inner optimization, which is the focus of this paper.

\citet{phan2019thompson} studied the effects of approximate inference on the performance of Thompson sampling in $k$-armed bandit problems where only an approximate posterior distribution is available. With the $\alpha$-divergence governing the difference between the approximate and true posterior distributions, they proposed a new algorithm that achieves sublinear regret in this challenging scenario. Our work differs from theirs in two ways. First, their work cannot be directly applied to our setting since, under the Gaussian process assumption, the posterior can be calculated exactly, so there is no room for approximate inference. Second, our study of Thompson sampling focuses on inexact solutions to the acquisition function sampled from the posterior distribution rather than on approximations of the posterior distribution itself.


\subsection{Our Contributions}
%In this paper, we aim to fill in the gap in inexact bandit optimization by investigating the effects of inexact optimization of the acquisition function within the Bayesian optimization framework. 
In this paper, we address the problem of inexact acquisition function maximization in Bayesian optimization by analyzing its impact on regret and demonstrating that random grid search is a theoretically justified and practical acquisition solver. We focus on two popular Bayesian optimization algorithms, GP-UCB \citep{srinivas2010gaussian} and GP-TS \citep{chowdhury2017kernelized}, and study their cumulative regret bounds when acquisition function maximization problems are not perfectly solved. Our key contributions are summarized as follows.
\begin{itemize}
    \item To the best of our knowledge, our paper is the first to theoretically study the effect of the inexact acquisition function solutions in BO. Despite its widespread practical use, the effect of inexact acquisition maximization has been largely overlooked in the literature. %which was ignored for a long time. %Through our study, we systematically answer the question ``Under what condition an inexact solution to acquisition function optimization still works for Bayesian optimization?''
    Our work systematically answers the question: ``How do inexact acquisition solutions impact the regret of BO algorithms, and under what conditions do they still guarantee convergence?'' More broadly, 
    our results bridge an important gap in inexact bandit optimization, complementing prior studies on model misspecification and approximate inference in multi-armed bandits and Bayesian optimization (see Table~\ref{tab:related}).
    %existing works have studied multi-armed bandits or Bayesian optimization where the model is misspecified, the posterior distribution is inexact, or an exact acquisition function maximizer cannot be obtained  Our study closes a critical gap in inexact bandit optimization.
    \item Formally, we introduce a measure of inaccuracy in acquisition solutions, termed \emph{accumulated inaccuracy}, and establish cumulative regret bounds for both inexact GP-UCB and GP-TS. Our analysis quantifies how the cumulative impact of inexact acquisition solutions influences regret and establishes sufficient conditions under which %Our bounds show that under some conditions on accumulated inaccuracy, 
    inexact BO algorithms can still achieve sublinear regret. These results generalize classical BO regret bounds to more realistic settings where exact acquisition maximization is infeasible.
    \item We theoretically justify random grid search as a valid acquisition solver for BO. %Our results provide the first theoretical validation for random grid search, showing that even with a linear growth in grid size, one can still achieve sublinear regret. \yuxin{choudhury has a loosened bound on this as well}
    Our regret bounds show that even with a linearly growing grid size $|\mathcal{X}_t| = \Theta(t)$, random grid search achieves sublinear regret. This significantly relaxes the condition in prior work \citep{chowdhury2017kernelized}, which required a far larger grid size of order $t^{2d}$ (growing exponentially in the dimension $d$), demonstrating that random grid search is both computationally efficient and theoretically grounded. 
    \item %Our experimental results also validate the effectiveness of random grid search in solving UCB-type acquisition function maximization.  
    We empirically validate the efficiency of random grid search over quasi-Newton methods for BO. Our experiments confirm that random grid search not only maintains strong regret performance but also offers substantial computational savings when solving UCB-type acquisition functions, reinforcing its practical viability as an acquisition function solver.
    % \item More formally, we define a measure of inaccuracy in acquisition solution, called \textit{Pseudo Accumulated Inaccuracy} (PAI), and a novel \textit{Multiplicative Factor Condition} (MFC), which permits inexact acquisition function solutions but also captures their behaviors.
    % \item %The original GP-TS algorithm is not guaranteed to be zero-regret when exact acquisition function solutions cannot be obtained. We propose a new Robust GP-TS algorithm and prove that under MFC, the classical GP-UCB and the Robust GP-TS algorithms are still able to achieve sublinear cumulative regret bounds. 
    % \item We show that under this notion of MFC, the classical GP-UCB and a robust version of GP-TS algorithms are still able to achieve sublinear cumulative regret bounds. The proofs are based on a newly defined posterior distribution and a novel set of saturated actions for Thompson sampling, which allows the Robust GP-TS algorithm to work under MFC.
\end{itemize}
% \let\enit@after\relax


% \chong{New structure:
% \begin{enumerate}
%     \item Introduction
%     \item Preliminaries
%     \item Bayesian Optimization with Random Discretization %Random Sampling and its theoretical guarantees
%     \item \sout{Experiments}
%     \item Bayesian Optimization under the Multiplicative Factor Condition
%     \begin{itemize}
%         \item A General Inexactness Condition
%         \item Regret Analysis of GP-UCB under MFC
%         \item Robust GP-TS Algorithm and Its Regret Analysis under MFC
%     \end{itemize}
%       % Additional discussion on inexactness condition for non-random sampling methods
%     \item Conclusion
% \end{enumerate}
% }



% ..\\
% ..\\
% ..\\
% approximate GP \citep{wilson2020efficiently} \citep{lin2023sampling}

% - Online submodular maximization

% - Combinatorial bandits / semibandits (with approximation)
% - Cost-Efficient Online Decision Making: A Combinatorial Multi-Armed Bandit Approach \url{https://arxiv.org/abs/2308.10699}


% - Learning Policies for Contextual Submodular Prediction \url{http://www.yisongyue.com/publications/icml2013_scp.pdf}

\section{Preliminaries}\label{sec:pre}
\subsection{Problem Setup and Notations}\label{sec:problem}
We consider a global optimization problem where the goal is to maximize an objective function $f: \cX \rightarrow [0, \infty)$. We use the following notations
\begin{align*}
x^* &= \argmax_{x \in \cX} f(x), \quad f^* =f(x^*),
\end{align*}
and assume $\cX \subseteq [-b, b]^d$ is a compact and convex search space, which could be viewed as a set of actions or arms. Unlike the first and second-order optimization methods where one needs an analytic expression or derivative information of $f$, we allow $f$ to be a blackbox function, whose closed-form expression for the function or the derivative is not necessarily known. Only through evaluations of the function $f$, which could be contaminated by random noise, we aim to identify the maximum of $f$.

To facilitate theoretical understanding of BO methods, we assume the objective function $f$ belongs to the reproducing kernel Hilbert space (RKHS) \citep{aronszajn1950theory,wahba1990spline,berlinet2011reproducing} corresponding to a positive semi-definite kernel function $k: \cX \times \cX \to \mathbb{R}$, denoted by $\mathcal{H}_k$. Following \citep{chowdhury2017kernelized}, we restrict our attention to a set of functions $f$ whose RKHS norm is bounded by some constant $B \in \mathbb{R}$. Two important assumptions which will be used throughout the paper are stated below.

\begin{assumption}\label{assump:kernel_less_one}
    The kernel $k(x, x) \le 1$ for all $x \in \cX$.
\end{assumption}

\begin{assumption}\label{assump:kernel_smooth}
    We consider the kernel $k$ to be either a squared-exponential kernel or a Mat\'ern kernel with smoothness parameter $\nu \ge 2$.
\end{assumption}
% In this paper, we work with Square-Exponential (SE) kernel and Mat\'ern kernel with smoothness parameter $\nu\geq 2$.

% Among numerous existing kernel functions, two popular kernel functions of our interest
% \footnote{The key assumption we need for the kernel function is: $|k(x,x)| \le 1$, for all $x \in \cX$. For example, the linear kernel also works if we appropriately rescale the data.}, 
% are the Square-Exponential (SE) and Mate\'rn kernel functions.
% , whose forms are given below:
% \begin{align*}
%     k_{\text{SE}}(x, x') &= \exp\left(-\frac{\|x-x'\|_2^2}{2\ell^2}\right), \\
%     k_{\text{Mate\'rn}}(x, x') &= \frac{1}{\Gamma(\nu)2^{\nu-1}}\left(\frac{\sqrt{2\nu}\|x-x'\|_2}{\ell}\right)^\nu B_\nu \left(\frac{\sqrt{2\nu}\|x-x'\|_2}{\ell} \right),
% \end{align*}
% where $\|x-x'\|_2$ indicates the Euclidean norm between $x, x' \in \cX$, $\ell > 0$ is a length-scale parameter, $\nu > 0$ is a smoothness parameter, $\Gamma$ is the gamma function and $B_\nu$ is the modified Bessel function of the second kind. 
% An interesting relationship between two kernel functions is that, after some rescaling, one can show that as the smoothness parameter $\nu$ tends to $\infty$, the Mate\'rn kernel converges to the Square-Exponential kernel \citep{williams2006gaussian,stein2012interpolation}.

% Formalizing details of the Bayesian optimization framework in Section \ref{sec:intro}, 
At each $t^{\text{th}}$ round, we select $x_t$ by maximizing an acquisition function $\alpha_t$ and observe a reward
\begin{align*}
    y_t = f(x_t) + \epsilon_t.
\end{align*}
Here we impose popular probabilistic assumptions on the noise $\epsilon_t$ \citep{abbasi2011improved,agrawal2013thompson}. To be more specific, we assume $\epsilon_t$ to be conditionally $R$-sub-Gaussian \citep{lattimore2020bandit}, i.e., $\forall \lambda \in \mathbb{R}, ~\mathbb{E}[e^{\lambda\epsilon_t}|\mathcal{F}'_{t-1}] \le \exp(\lambda^2 R^2/2)$, where $\mathcal{F}'_{t-1}$ is the $\sigma$-algebra generated by $\{x_1,\cdots, x_t, \epsilon_1, \cdots, \epsilon_{t-1}\}$. Then we assess the performance of a BO algorithm over $T$ iterations based on the cumulative regret $R_T$, given by
\begin{align*}
R_T = \sum_{t=1}^T \left(f^* - f(x_t)\right),
\end{align*}
where $r_t = f^* - f(x_t)$ is referred to as an instantaneous regret at round $t$. The BO algorithm is said to be a zero-regret algorithm if $\lim_{T\rightarrow \infty} R_T/T = 0$ and typical theoretical analyses \citep{srinivas2010gaussian,chowdhury2017kernelized,vakili2021information} aim to show sublinear growth of $R_T$ as the zero-regret algorithm will guarantee the convergence of the algorithm to the maximum. This can be seen from the fact that the simple regret, given by $f^*-\max_{t=1, \cdots, T} f(x_t)$, is bounded above by the average cumulative regret $R_T/T$. Throughout the paper, we use standard big $\mathcal{O}$ notation that hides universal constants, and we use $\tilde{\mathcal{O}}$ to hide all logarithmic factors as well as all polynomial factors in problem-specific parameters.

% \begin{assumption}\label{assum:domain}
% The compact domain $\mathcal{X}$ is a rectangle inside $\mathbb{R}^d$.
% \end{assumption}

% \begin{assumption}\label{assum:kernel}
% The kernel function $k$ is continuously differentiable with respect to each coordinate with $|k(x,x)| \le 1, \forall x \in \mathcal{X}$.
% \end{assumption}

%Throughout the paper, we make the following set of assumptions and notations: 2) kernels are twice differentiable with $L = \sup_{x \in \mathcal{X}} \sup_{j \in [d]} \left(\frac{\partial^2 k(p,q)}{\partial p_j \partial q_j}|_{p=q=x} \right)^{1/2}$. Also, we use standard big $\mathcal{O}$ notation that hide universal constants; and to improve the readability, we use $\tilde{\mathcal{O}}$ to hide all logarithmic factors as well as all polynomial factors in problem-specific parameters

\subsection{GP-UCB and GP-TS Algorithms}

% \hwanwoo{Please state the assumptions on kernel here: 1) $k(x,x) \le 1$ and 2) $k$ twice-differentiable(can you check the Chowdhury's paper and their requirement?)}
% \chong{why assume them here for these two algorithms? They are used throughout this paper.}
% \hwanwoo{Yes, so I think it is better to present them early so that we don't need to keep mention it later} \chong{ok, then we are on the same boat.}

% \hwanwoo{I think we need to discuss the cumulative regret here and sublinearity to demonstrate the convergence of the algorithm}
% \chong{but cumulative regret is part of the problem setup and is independent to GP-UCB and GP-TS.}

% \hwanwoo{I meant the definition of cumulative regret and its relationship to simple regret and convergence, oh I guess this is already done, sorry, perhaps rearranging things might clarify things} \chong{ok, I'll rearrange them and make them clear.}

In the midst of numerous Bayesian optimization methods, two particular strategies of interest are Gaussian Process-Upper Confidence Bound (GP-UCB) introduced in \citet{srinivas2010gaussian} and Gaussian Process-Thompson Sampling (GP-TS) proposed by \citet{chowdhury2017kernelized}. More formally, to design BO algorithms, one typically imposes a zero-centered GP prior on the target objective function $f$ and models the random noise through a Gaussian random variable with variance $\tau$.
Conditioning on all $t-1$ observations prior to obtaining the $t^{\text{th}}$ observation, the posterior mean $\mu_{t-1}(x)$ and variance $\sigma^2_{t-1}(x)$ of the Gaussian process are given by
\begin{align*}
    \mu_{t-1}(x) &= k_{t-1}(x)^T (K_{t-1} + \tau I)^{-1} Y_{t-1}, \\
    \sigma^2_{t-1}(x) &= k(x,x) - k_{t-1}(x)^T (K_{t-1} + \tau I)^{-1} k_{t-1}(x),
\end{align*}
where
\begin{align*}
    k_{t-1}(x) &= [k(x_1, x), \cdots, k(x_{t-1}, x)]^T,\\
    Y_{t-1} &= [y_1, \cdots, y_{t-1}]^T,\\
    K_{t-1} &= [k(x, x')]_{x, x' \in \{x_1, \cdots, x_{t-1}\}}.
\end{align*}
In other words, the posterior distribution of $f$ conditioning on $\{(x_i, y_i)\}_{i=1}^{t-1}$ is given by the Gaussian process with mean function $\mu_{t-1}$ and posterior variance $\sigma^2_{t-1}$.
% \begin{figure*}[!htbp]
% 	\centering
% 	\begin{minipage}{0.48\linewidth}\centering
%  GP-UCB
% \caption{GP-UCB \citep{srinivas2010gaussian, chowdhury2017kernelized} \label{alg:gpucb}}
% 		\begin{algorithmic}[1]
% \STATE {\bf Input}: Kernel $k;$ Total number of iterations $T;$ Initial design points $X_0;$ Initial observations $Y_0$.
%     \STATE Construct $\mu_{0}(x)$ and $\sigma_{0}(x)$ using $X_0, Y_0$.
%     %\STATE {\bf For} $t = 1, \ldots, T$ {\bf do}: 
%     \FOR{$t = 1, \ldots, T$}
%     %\begin{enumerate}
%     \STATE Set $\beta_t=B+R\sqrt{2(\gamma_{t-1}+1 +\log(1/\delta))}$.
%     \STATE  Set
% $x_t=\argmax_{x \in \cX} \mu_{t-1}(x)+\beta_t\sigma_{t-1}(x)$.
%     \STATE  Observe $y_t = f(x_t) + \epsilon_t$.
%     \STATE  Set $X_t = X_{t-1} \cup \{
% x_t\}$ and $Y_t = Y_{t-1} \cup \{
% y_t\}$.
%     \STATE  Update $\mu_{t}(x)$ and $\sigma_{t}(x)$ using $X_t$ and $Y_t.$
%     \ENDFOR
%     %\end{enumerate}
%     % \STATE {\bf End For}
% % \FOR{$t = 1,...,T$}
% % \STATE Set $\beta_t=B+R\sqrt{2(\gamma_{t-1}+1 +\log(1/\delta))}$.
% % \STATE Obtain 
% % \STATE Observe $y_t=f(x_t)+\epsilon_t$.
% % \STATE Perform update to obtain $\mu_{t-1}(x)$ and $\sigma_{t-1}(x)$.
% % \ENDFOR
% \end{algorithmic}
% 	\end{minipage}
% 	\begin{minipage}{0.48\linewidth}\centering
% 		\begin{algorithmic}[1]
% \STATE {\bf Input}: Kernel $k;$ Total number of iterations $T;$ Initial design points $X_0;$ Initial observations $Y_0$.
%     \STATE Construct $\mu_{0}(x)$ and $\sigma_{0}(x)$ using $X_0, Y_0$.
%     % \STATE {\bf For} $t = 1, \ldots, T$ {\bf do}:
%     \FOR%{\bf For} 
%     {$t = 1, \ldots, T$}
%     \STATE Set $\beta_t=B+R\sqrt{2(\gamma_{t-1}+1 +\log(2/\delta))}$.
%     \STATE Sample $f_t(\cdot)$ from $\mathcal{GP}_D\left(\mu_{t-1}(\cdot), \beta^2_t \sigma^2_{t-1}(\cdot) \right)$.
%     \STATE Choose the current decision set $\mathcal{X}_t \subset \mathcal{X}$ of size $|\mathcal{X}_t| = (2BLbdt^2)^d$.
%     \STATE  Set
% $x_t = \arg\max_{x \in \mathcal{X}_t}  f_t(x)$.
%     \STATE  Observe $y_t = f(x_t) + \epsilon_t$.
%     \STATE  Set $X_t = X_{t-1} \cup \{
% x_t\}$ and $Y_t = Y_{t-1} \cup \{
% y_t\}$.
%     \STATE Update $\mu_{t}(x)$ and $\sigma_{t}(x)$ using $X_t$ and $Y_t.$
%     \ENDFOR
% \end{algorithmic}
% 	\end{minipage}
% % \caption{GP-UCB}\label{fig:syn}
% \end{figure*}

% \begin{figure}
% \begin{minipage}{0.46\textwidth}
% \vspace{-.2in}
\begin{algorithm}[!htbp]
\caption{GP-UCB \citep{srinivas2010gaussian, chowdhury2017kernelized} \label{alg:gpucb}}
% \textbf{Inputs:} Parameters $B, R, \lambda, \delta$, Kernel $k$
\begin{algorithmic}[1]
\STATE {\bf Input}: Kernel $k;$ Total number of iterations $T;$ Initial design points $X_0;$ Initial observations $Y_0$.
    \STATE Construct $\mu_{0}(x)$ and $\sigma_{0}(x)$ using $X_0, Y_0$.
    %\STATE {\bf For} $t = 1, \ldots, T$ {\bf do}: 
    \FOR{$t = 1, \ldots, T$}
    %\begin{enumerate}
    \STATE $\beta_t\leftarrow B+R\sqrt{2(\gamma_{t-1}+1 +\log(1/\delta))}$.
    \STATE 
$x_t\leftarrow \argmax_{x}% \in \cX} 
\mu_{t-1}(x)+\beta_t\sigma_{t-1}(x)$.
    \STATE  Observe $y_t = f(x_t) + \epsilon_t$.
    \STATE  $X_t = X_{t-1} \cup \{
x_t\}, Y_t = Y_{t-1} \cup \{
y_t\}$.
    \STATE  Update $\mu_{t}(x)$ and $\sigma_{t}(x)$ using $X_t, Y_t$.
    \ENDFOR
    %\end{enumerate}
    % \STATE {\bf End For}
% \FOR{$t = 1,...,T$}
% \STATE Set $\beta_t=B+R\sqrt{2(\gamma_{t-1}+1 +\log(1/\delta))}$.
% \STATE Obtain 
% \STATE Observe $y_t=f(x_t)+\epsilon_t$.
% \STATE Perform update to obtain $\mu_{t-1}(x)$ and $\sigma_{t-1}(x)$.
% \ENDFOR
\end{algorithmic}
\end{algorithm}
% \end{minipage}
% \hspace{0.02\textwidth}
% \begin{minipage}{0.51\textwidth}
% \vspace{-.2in}
\begin{algorithm}[!htbp]
% \begin{algorithm}[h]
\caption{GP-TS \citep{chowdhury2017kernelized} \label{alg:gpts}}
% \textbf{Inputs:} Parameters $B, R, \lambda, \delta$, Kernel $k$
\begin{algorithmic}[1]
\STATE {\bf Input}: Kernel $k;$ Total number of iterations $T;$ Initial design points $X_0;$ Initial observations $Y_0$.
    \STATE Construct $\mu_{0}(x)$ and $\sigma_{0}(x)$ using $X_0, Y_0$.
    % \STATE {\bf For} $t = 1, \ldots, T$ {\bf do}:
    \FOR%{\bf For} 
    {$t = 1, \ldots, T$}
    \STATE $\beta_t\leftarrow B+R\sqrt{2(\gamma_{t-1}+1 +\log(2/\delta))}$.
    \STATE Sample $f_t(\cdot)\sim\mathcal{GP}_D\left(\mu_{t-1}(\cdot), s_t^2 \right)$, with $s_t^2(\cdot) = \beta^2_t \sigma^2_{t-1}(\cdot)$  .
    \STATE Choose the current decision set $\mathcal{X}_t \subset \mathcal{X}$ of size $|\mathcal{X}_t| = (2BLbdt^2)^d$.
    \STATE  Set
$x_t = \arg\max_{x \in \mathcal{X}_t}  f_t(x)$.
    \STATE  Observe $y_t = f(x_t) + \epsilon_t$.
    \STATE  $X_t = X_{t-1} \cup \{x_t\}, Y_t = Y_{t-1} \cup \{y_t\}$.
    \STATE Update $\mu_{t}(x)$ and $\sigma_{t}(x)$ using $X_t, Y_t$.
    \ENDFOR
\end{algorithmic}
\end{algorithm}
% \end{minipage}
% \end{figure}


% \chong{maybe we can put two algorithms left and right to save some space?}
For the choice of an acquisition function to facilitate the Bayesian optimization strategy, \citet{srinivas2010gaussian} considered a UCB function of the form
$$
\alpha_t(x) \coloneqq \mu_{t-1}(x) + \beta_t\sigma_{t-1}(x),
$$
where $\beta_t$ is chosen to balance exploitation (picking points with high function values) and exploration (picking points where the prediction based on posterior distribution is highly uncertain). Reflecting on the choice of an acquisition function, \citet{srinivas2010gaussian} named the BO strategy based on the UCB function GP-UCB, which has been popular in both theoretical and empirical studies. In the meantime, \citet{chowdhury2017kernelized} proposed another BO strategy under the name of GP-TS, which leverages the acquisition function of the form
$$
\alpha_t(x) \coloneqq f_t(x), 
$$
\looseness -1 where $f_t$ is a sample path from the posterior Gaussian process with the mean function $\mu_{t-1}$ and variance function $\beta_t^2 \sigma^2_{t-1}$. Such a strategy has been widely used under the name of Thompson sampling in BO and Bandit literature \citep{agrawal2013thompson,russo2014learning,kandasamy2015high}. In practice, optimizing a randomly drawn sample path $f_t$ is just as hard as finding the optimum of the target objective $f$. In reflection of such difficulty, a more practical version of the aforementioned TS-based BO strategy through discretizing the search space for the acquisition function was proposed in \citet{chowdhury2017kernelized}.


% \begin{wrapfigure}{L}{0.48\textwidth}
% \begin{minipage}{0.48\textwidth}
% \vspace{-.2in}
% \begin{algorithm}[H]
% % \begin{algorithm}[h]
% \caption{GP-TS \citep{chowdhury2017kernelized} \label{alg:gpts}}
% % \textbf{Inputs:} Parameters $B, R, \lambda, \delta$, Kernel $k$
% \begin{algorithmic}[1]
% \STATE {\bf Input}: Kernel $k;$ Total number of iterations $T;$ Initial design points $X_0;$ Initial observations $Y_0$.
%     \STATE Construct $\mu_{0}(x)$ and $\sigma_{0}(x)$ using $X_0, Y_0$.
%     % \STATE {\bf For} $t = 1, \ldots, T$ {\bf do}:
%     \FOR%{\bf For} 
%     {$t = 1, \ldots, T$}
%     \STATE Set $\beta_t=B+R\sqrt{2(\gamma_{t-1}+1 +\log(2/\delta))}$.
%     \STATE Sample $f_t(\cdot)$ from $\mathcal{GP}_D\left(\mu_{t-1}(\cdot), \beta^2_t \sigma^2_{t-1}(\cdot) \right)$.
%     \STATE Choose the current decision set $\mathcal{X}_t \subset \mathcal{X}$ of size $|\mathcal{X}_t| = (2BLbdt^2)^d$.
%     \STATE  Set
% $x_t = \arg\max_{x \in \mathcal{X}_t}  f_t(x)$.
%     \STATE  Observe $y_t = f(x_t) + \epsilon_t$.
%     \STATE  Set $X_t = X_{t-1} \cup \{
% x_t\}$ and $Y_t = Y_{t-1} \cup \{
% y_t\}$.
%     \STATE Update $\mu_{t}(x)$ and $\sigma_{t}(x)$ using $X_t$ and $Y_t.$
%     \ENDFOR
% \end{algorithmic}
% \end{algorithm}
% \end{minipage}
% \end{wrapfigure}


For the theoretical analysis, both \citet{srinivas2010gaussian, chowdhury2017kernelized} considered an increasing sequence for the choice of $\beta_t$, and in particular, \citet{chowdhury2017kernelized} set $\beta_t$ to grow at a rate of $\mathcal{O}(\sqrt{\gamma_T})$ where $\gamma_T$ is a kernel-dependent quantity known as the maximum information gain at time $T$, given by
$$
\gamma_T = \max_{\cX_T \subset \cX: |\cX_T| = T} \frac{1}{2}\log\det(I_T + \tau^{-1}K_T).
$$
The formal algorithms based on the upper confidence bound and Thompson sampling are provided in Algorithm~\ref{alg:gpucb} and Algorithm~\ref{alg:gpts}, respectively. Furthermore, the state-of-the-art growth rate of the maximum information gain is provided in \citet{vakili2021information}, which states that $\gamma_T = \mathcal{O}(T^{\frac{d}{d+2\nu}} \log^{\frac{2\nu}{2\nu+d}}(T))$ for the Mat\'ern kernel and $\gamma_T = \mathcal{O}(\log^{d+1}(T))$ for the SE kernel. Although we work with UCB and TS in this paper, our analysis can be extended to more acquisition functions.


\section{Regret bounds under inexact acquisition function maximization
}\label{sec:inexact}

Although theoretical developments in BO often assume that the acquisition maximization is solved perfectly, this is rarely the case in practice. For example, when optimizing the UCB acquisition function—whose gradient information is typically available—a popular approach is to use quasi-Newton methods with multiple starting points \citep{frazier2018tutorial, gramacy2020surrogates, pourmohamad2021bayesian}. While these methods have proven effective for convex problems, the UCB acquisition function is usually nonconvex. Consequently, factors such as the number of starting points, the total number of optimization iterations, and the stopping criteria all impact the quality of the acquisition solution obtained.

In TS-based BO methods, it is common to use a discretized subset of the action space because gradients for the sample paths are difficult to obtain. In these cases, a sorting algorithm is used to select the maximum value from the posterior sample path within the chosen grid \citep{kandasamy2018parallelised, chowdhury2017kernelized}. Although theoretical studies have shown that the TS strategy converges to the optimal function value, the grid size must grow exponentially with the number of dimensions \citep{chowdhury2017kernelized}. Moreover, the granularity of the discretization heavily influences the computational cost. As a result, typical implementations use the finest discretization possible within the available computational budget, which can lead to inexact acquisition solutions even with grid search.

In this section, we examine how a series of inexact acquisition function maximizations affects BO strategies. Although our primary focus is on GP-UCB and GP-TS, our analysis is not limited to any specific solver. We introduce a measure to quantify the accumulated inaccuracies caused by these inexact maximizations, with the goal of understanding how they impact the existing cumulative regret bounds. 

\subsection{Accumulated inaccuracy}
We first introduce a measure for the inaccuracies that arise when solving acquisition function optimization problems. Let 
$
\alpha_t^* = \max_{x \in \cX} \alpha_t(x)
$
be the maximum value of the acquisition function at the $t^{\text{th}}$ iteration, which is at least as large as $\alpha_t(x_t)$,
where $x_t$ is the action selected at that iteration. For our analysis, we assume that the acquisition functions 
$\{\alpha_t\}_{t=1}^T$ are nonnegative and always achieve a strictly positive maximum. This assumption can be easily met by adding a positive constant to the acquisition function at each iteration or by appropriately restricting the search space. A detailed justification for the existence of such a constant is provided in Appendix~\ref{sec:app:gpucb}. We emphasize that this modification does not change the chosen action nor does it incur any additional computational cost.

At each iteration, we quantify the accuracy of the acquisition optimization solution by the ratio of the acquisition function value at the selected action to its maximum value:
$$
\eta_t \coloneqq \frac{\alpha_t(x_t)}{\alpha_t^*} \in [0,1],
$$
so that $\eta_t = 1$ if the acquisition function is maximized perfectly. In other words, a larger value of $\eta_t$ indicates a more accurate solution to the $t^{\text{th}}$ acquisition optimization problem. The overall impact of inaccurate solutions across iterations is then captured by the accumulated inaccuracy,
$$
M_T = \sum_{t=1}^T (1-\eta_t) \in [0, T],
$$
which represents the total inaccuracy permitted over 
$T$ rounds of Bayesian optimization. Using this measure of accumulated inaccuracy, we establish cumulative regret bounds for GP-UCB and GP-TS in the remainder of this section, in contrast to existing theoretical works that assume perfect acquisition solutions.

\subsection{Regret Bounds}\label{sec:gpucb}

Recall that for the kernel function under consideration, we have $|k(x, x')| \le 1$. Furthermore, the $t^{\text{th}}$ action $x_t$ satisfies $\alpha^*_t \coloneqq \max \alpha_t(x) \ge  \alpha_t(x_t) \ge \eta_t \alpha^*_t$, where $\alpha_t(x) = \mu_{t-1}(x) + \beta_t \sigma_{t-1}(x)$. We denote by $\cH_k$ the RKHS associated with a chosen kernel $k$. We then establish cumulative regret bounds for the GP-UCB algorithm in the presence of inexact acquisition function solutions.

\begin{theorem}\label{thm:GP_UCB_MFC}
Under assumptions \ref{assump:kernel_less_one} and \ref{assump:kernel_smooth}, suppose an objective function $f \in \mathcal{H}_k$ with $\|f\|_{\mathcal{H}_k} \le B$. The inexact GP-UCB algorithm with $\beta_t = B + R\sqrt{2(\gamma_{t-1} + 1 + \log(1/\delta))}$ and an accumulated inaccuracy $M_T$ yields a cumulative regret bound of the form,
$$
R_T = \mathcal{O}\left(\gamma_T\sqrt{T}  + M_T\sqrt{\gamma_T} \right),
$$ 
with probability at least $1-\delta$.
\end{theorem}

Note that when $M_T=0$ (exact maximization), the cumulative bound above recovers standard regret guarantees
%. The cumulative regret bound above aligns with existing bounds 
for GP-UCB \citep{srinivas2010gaussian, chowdhury2017kernelized}. The additive term $M_T\sqrt{\gamma_T}$ accounts for the effect of inaccurate acquisition solutions. This implies that if the accumulated inaccuracy $M_T$ and the maximum information gain $\gamma_T$ do not grow too quickly so that $\frac{M_T\sqrt{\gamma_T}}{T} \to 0$ as $T\to\infty$, then the inexact GP-UCB algorithm will converge to the optimal solution. For example, with a squared-exponential kernel, it has been shown that $\gamma_T$ grows polylogarithmically \citep{vakili2021information}. Theorem~\ref{thm:GP_UCB_MFC} thus indicates that the inexact GP-UCB algorithm converges asymptotically to the optimal function value, provided that the accumulated inaccuracy $M_T$ is sublinear up to polylogarithmic factors, i.e., $M_T\sqrt{\gamma_T} = o(T)$.

One can also establish a similar result for the GP-TS algorithm, as stated below. In contrast to the GP-TS algorithm introduced in \citep{chowdhury2017kernelized}, our formulation accounts for the extra uncertainty induced by inexact solutions through the use of sample paths obtained from the GP posterior with an enlarged variance.

\begin{theorem}\label{thm:GP_TS_MFC}
Under assumptions \ref{assump:kernel_less_one} and \ref{assump:kernel_smooth}, suppose an objective function $f \in \mathcal{H}_k$ with $\|f\|_{\mathcal{H}_k} \le B$. The inexact GP-TS algorithm with variance $s^2_{t-1}(x) = \left(\beta_t \sigma_{t-1}(x) + v_t\right)^2$, $v_t = (\frac{1}{\eta_t}-1)B$ and an accumulated inaccuracy $M_T$ yields a cumulative regret bound of the form
$$
R_T = \mathcal{O}\left(\gamma_T\sqrt{T \log T} + M_T \sqrt{\gamma_T\log T}\right),
$$ 
with probability $1-\delta$. 
\end{theorem}

The above regret bound matches the existing bound for GP-TS \citep{chowdhury2017kernelized}, up to an additive term of $M_T\sqrt{\gamma_T\log T}$ that captures the inaccuracies in the acquisition function solutions. Similar to the GP-UCB case, if the accumulated inaccuracy $M_T$ and the maximum information gain $\gamma_T$ do not increase too rapidly so that $M_T\sqrt{\gamma_T \log T} /T \to 0$ as $T\to\infty$,
then Theorem \ref{thm:GP_TS_MFC} shows that the inexact GP-TS algorithm will achieve a sublinear cumulative regret bound.

An overall implication of the theorems established in this section is that BO strategies can retain asymptotic convergence guarantees even without exact acquisition function solutions, provided that the accuracy of these solutions improves over time. This insight naturally motivates the exploration of BO strategies that are computationally more efficient by reducing the effort spent on acquisition function maximizations while preserving convergence guarantees.

\section{Solving acquisition function through random grid search}\label{sec:random_grid}

Building on the insights from Section \ref{sec:inexact}, we further investigate a simple yet computationally efficient approach: a random grid search for acquisition function maximization. In this method, to maximize an acquisition function $\alpha_t$, one randomly samples $t$ points from the search space and selects the one with the highest acquisition function value. This approach has been employed in numerous BO problems and has demonstrated its effectiveness \citep{kandasamy2018parallelised, pourmohamad2021bayesian}. % even though it has not been rigorously justified theoretically.
Importantly, our results provide the first theoretical validation for this approach, showing that even with a linear growth in grid size, one can still achieve sublinear regret.

Concretely, we rigorously analyze the GP-UCB and GP-TS algorithms using a random grid search for acquisition maximization and derive their cumulative regret bounds. As noted in the previous section, the accuracy of the acquisition solver must improve over iterations. To achieve this, we employ a sequence of random grids $\{\cX_t\}_{t \in \mathbb{N}}$ whose size grows linearly with the iteration count, i.e., $|\cX_t| = \Theta(t)$. In combination with the TS acquisition function, such a strategy of increasing the grid size has proven both empirically successful and robust \citep{kandasamy2018parallelised}.

To precisely define our acquisition function optimization procedure, consider a grid of random samples $\cX_t \subset \cX$ that serves as the search space for the $t^{\text{th}}$ acquisition function optimization. At each $t^{\text{th}}$ iteration, we set
\begin{align*}
x_t \coloneqq \begin{cases}
    \argmax_{x \in \mathcal{X}_t} \mu_{t-1}(x) + \beta_t \sigma_{t-1}(x) \quad \text{for GP-UCB} \\
    \argmax_{x \in \mathcal{X}_t} f_{t}(x) \quad \text{for GP-TS}.
\end{cases}
\end{align*}
We refer to the first strategy as random grid GP-UCB and the second as random grid GP-TS. For the choice of random grid, we make the following assumption and state our cumulative regret bound results. 

% \begin{assumption}\label{assum:grid}
% At iteration $t$, $\mathcal{X}_t$ is a collection of $t$ random samples drawn uniformly from $\cX$. Furthermore, we assume that the set of uniform random grids $\{\cX_t\}_{t \in \mathbb{N}}$ are independent. 
% \end{assumption}
\begin{assumption}\label{assum:grid}
At iteration $t$, $\mathcal{X}_t$ consists of $t$ independent samples drawn uniformly from $\cX$. Additionally, the sequence of random grids $\{\cX_t\}_{t \in \mathbb{N}}$ is independent across iterations.
\end{assumption}

\begin{remark}
In fact, our results can be applied to a non-uniform sample grid, provided that the probability density of the random samples is strictly positive in the search space. However, since uniform random sampling is the easiest to implement in practice, we present our results under the uniform random sample grid assumption.
\end{remark}


\begin{theorem}\label{thm:random_UCB}
Under assumptions \ref{assump:kernel_less_one}, \ref{assump:kernel_smooth} and \ref{assum:grid}, suppose $f \in \mathcal{H}_k$ with $\|f\|_{\mathcal{H}_k} \le B$. With probability at least $1-\delta$, the random grid GP-UCB with $\beta_t = B + R\sqrt{2(\gamma_{t-1} + 1 + \log(2/\delta))}$ yields 
\begin{align*}
R_T = \mathcal{O}\left(\gamma_T \sqrt{T} + T^{\frac{d-1}{d} + \xi}\right),
\end{align*}
for some arbitrarily small $\xi > 0$.
\end{theorem}

\begin{theorem}\label{thm:random_TS}
Under assumptions \ref{assump:kernel_less_one}, \ref{assump:kernel_smooth} and \ref{assum:grid}, suppose $f \in \mathcal{H}_k$ with $\|f\|_{\mathcal{H}_k} \le B$. With probability at least $1-\delta$, the random grid GP-TS with $\beta_t = B + R\sqrt{2(\gamma_{t-1} + 1 + \log(3/\delta))}$ yields 
\begin{align*}
R_T = \mathcal{O}\left(\gamma_T \sqrt{T \log T} + T^{\frac{d-1}{d} + \xi} \right),
\end{align*} 
for some arbitrarily small $\xi > 0$.
\end{theorem}
\begin{remark}
Similar to the regret bounds from Section \ref{sec:inexact}, our cumulative regret bounds now incorporate an additional term that accounts for the slight inaccuracies introduced by the random grid search. In both Theorem \ref{thm:random_UCB} and Theorem \ref{thm:random_TS}, these effects are represented by the term $T^{\frac{d-1}{d}+\xi}$. Importantly, our results demonstrate that even a simple random grid search is sufficient for acquisition function maximization, ensuring that Bayesian optimization converges asymptotically to the global optimum under a suitable growth rate of the maximum information gain $\gamma_T$.
\end{remark}


Our theoretical results indicate that a random grid search is sufficient for optimizing UCB and TS acquisition functions to achieve sublinear regret bounds for both GP-UCB and GP-TS. Although many successful implementations of random grid search for acquisition function maximization exist, there has been little theoretical justification for this approach. The closest work we are aware of is by \citet{chowdhury2017kernelized}, which incorporated a grid search within the GP-TS algorithm to solve the acquisition function optimization problem. While their analysis guarantees a sublinear regret bound for certain kernels, it requires that the grid size grows at a rate of 
$t^{2d}$, where $t$ is the iteration index and $d$ is the problem's dimension. This requirement is computationally demanding; indeed, in the same paper, the authors used a fixed grid size for numerical experiments, highlighting the gap between theoretical developments and practical implementations. %Unlike their results, our regret bounds justify the empirical success of the TS acquisition function when using a random grid search that grows linearly with the iteration number, as demonstrated in \citep{kandasamy2018parallelised}. 
This result not only demonstrates the theoretical soundness of using random grid search for acquisition maximization but also provides rigorous justification for the empirical findings in \citep{kandasamy2018parallelised}, which observed that a random grid search of order $d^2t$ is robust and efficient for TS algorithms. Furthermore, as Section~\ref{sec:exp} will show, the computational efficiency of the random grid search approach offers a significant practical advantage over more complex solvers such as quasi-Newton methods, without sacrificing convergence guarantees.

\section{Experiments}\label{sec:exp}

\begin{figure*}[!t]
    \centering
      \includegraphics[width=.93\linewidth]{UAI2025/BO_UCB_GRID_UPDATED.png}
      \vspace{-2mm}
    \caption{Cumulative regret comparison between acquisition function solvers.}\label{fig:acq_cum}
    \vspace{-1mm}
\end{figure*}
In this section, we conduct numerical experiments on six benchmark functions to demonstrate the effectiveness of random grid search combined with a UCB acquisition function. We refer the reader to \citep{kandasamy2018parallelised} for a demonstration of the effectiveness of random grid search when using a TS acquisition function. The test functions include Branin (2D on $[-5, 10] \times [0, 15]$), Rastrigin (3D on $[-5.12, 5.12]^3$), Hartmann3 (3D on $[0,1]^3$), Hartmann4 (4D on $[0,1]^4$), Levy (5D on $[-10, 10]^5$), and Hartmann6 (6D on $[0,1]^6$). For each problem, a set of initial design points is generated using a Sobol sequence. These points are scaled to the appropriate domain. We vary the number of initial design points by problem (20 for Branin, 30 for Rastrigin and Hartmann3, 40 for Hartmann4, 50 for Levy, and 60 for Hartmann6) reflecting the dimension of the objective function. After initialization, BO algorithms with different choices of acquisition function solvers are performed for a predetermined number of iterations: 80 iterations for Branin, 100 for Rastrigin, Hartmann3, and Hartmann4, 150 for Levy, and 200 for Hartmann6. In each iteration, a Gaussian process with a Mat\'ern kernel is fitted to the available data, and a UCB acquisition function is optimized with an exploration parameter $\beta_t = \sqrt{\log(t+2)}$ for each iteration $t$. The optimization of this acquisition function is carried out using one of four inner optimization methods: a uniform random grid search (abbreviated as Uniform), the quasi-Newton limited-memory Broyden–Fletcher–Goldfarb–Shanno method with bound constraints (abbreviated as L-BFGS-B), the derivative-free Nelder–Mead simplex method (abbreviated as NM), or a conjugate gradient (abbreviated as CG) method. For the uniform sample grid size $|\cX_t|$, we set it to be $100t$, which scales linearly in terms of the number of iterations. 
Each method is evaluated over 20 independent experiments (with varying random seeds) to provide a statistically valid comparison in terms of both cumulative regret and computational time. 


\begin{figure}[!htbp]
  \centering
  \includegraphics[width=\linewidth]{UAI2025/BO_COMP_TIME_GRID_UPDATED.png}
  % \vspace{-5mm}
  \caption{Computational time comparison between acquisition function solvers.}\label{fig:acq_comp_time}
  \vspace{-1mm}
\end{figure}



Figure \ref{fig:acq_cum} shows that, across a range of dimensions and objective landscapes, the uniform sample grid search consistently achieves competitive cumulative regret and keeps pace with, or in some cases outperforms, the more complex optimization routines. For instance, in the 2D Branin problem, the uniform sample grid search’s cumulative regret curve is superior to that of other sophisticated optimization algorithms over the entire horizon, demonstrating rapid improvement from the outset. On the more challenging 3D Rastrigin, 3D Hartmann3, and 4D Hartmann4 functions, uniform sample grid search continues to exhibit a steady reduction in regret, closely matching or surpassing other solvers. Even for the higher-dimensional Levy (5D) and Hartmann6 (6D) problems, the uniform sample grid search approach remains impressively competitive, achieving a final cumulative regret that is comparable to, if not better than, the gradient-based methods. Figure \ref{fig:acq_comp_time} further highlights the uniform sample grid search’s practical advantages: it maintains one of the lowest average runtimes across all problems, standing in stark contrast to NM, which exhibits significantly longer runtimes (especially in the 5D and 6D settings). In fact, uniform sample grid search’s computation times remain modest even in higher dimensions, making it an appealing choice when balancing rapid progress in regret reduction with efficient use of computational resources.


% \vspace{-3mm}
\section{Conclusion}\label{sec:con}
\goodbreak

In this paper, to the best of our knowledge, we study the inexact acquisition function maximization problem for BO for the first time, %which is an ignored problem in Bayesian optimization for a long time. 
a topic that has been largely overlooked. %Although some work studied acquisition function optimization, most existing work just assumes that exact acquisition function optimization solution can be obtained such that rigorous upper bounds can be established in theory. 
% Our study also contributes to inexact bandit optimization problems in general.
Existing works predominantly operate under the assumption that an exact solution is attainable, thus allowing for the establishment of rigorous theoretical upper bounds. However, obtaining such exact solutions is notably challenging in practical applications because acquisition functions are usually non-convex, %and non-linear 
which makes them highly intractable. 

To address this discrepancy, formally, we define a measure of inaccuracy in acquisition solution, accumulated inaccuracy, and we establish the cumulative regret bounds of both inexact GP-UCB and GP-TS. Our bounds show that under some conditions on accumulated inaccuracy, inexact BO algorithms can still achieve sublinear regrets. Our results provide the first theoretical validation for random grid search, showing that even with a linear growth in grid size, one can still achieve sublinear regret. Our experimental results also validate the effectiveness of random grid search in solving UCB-type acquisition function maximization.

% To address this discrepancy, our paper first study random sampling for acquisition function maximization. Although it only leads to inexact solutions to acquisition function, random sampling can still achieve zero-regret for both GP-UCB and GP-TS, which means they are able to find the global optimum. Our analysis explains that why random sampling can still be successful in some tasks in practice but also points out that random sampling clearly suffers from the curse of dimensionality. More formally, we define a novel condition, Multiplicative Factor Condition (MFC), and inaccuracy measure, Pseudo Accumulated Inaccuracy (PAI), to encapsulate scenarios that permit but constrain the inexact solutions to acquisition function optimization. Under the MFC, we prove that both classical GP-UCB and the new Robust GP-TS algorithms achieve sublinear regrets. The MFC, therefore, is the first condition under which general inexact solutions still lead to zero-regret performances.
% In fact, the inexact acquisition function optimization problem is not unique in Bayesian optimization, and there are some related works \citep{wang2018thompson,kong2021hardness,perrault2022combinatorial}.

Although our work is limited to GP-UCB and GP-TS, our analysis can be extended to more acquisition functions with similar structural properties. Additionally, our framework opens avenues for studying adaptive inexact optimization strategies, where the computational effort allocated to acquisition function maximization is dynamically adjusted based on accumulated inaccuracy. We hope our study could encourage more systematic and theoretical research on inexact acquisition function optimization and prompt further empirical investigation into the efficient allocation of computational resources for BO.

\bibliography{bib}



% \begin{ack}
% Use unnumbered first level headings for the acknowledgments. All acknowledgments
% go at the end of the paper before the list of references. Moreover, you are required to declare
% funding (financial activities supporting the submitted work) and competing interests (related financial activities outside the submitted work).
% More information about this disclosure can be found at: \url{https://neurips.cc/Conferences/2024/PaperInformation/FundingDisclosure}.

% Do {\bf not} include this section in the anonymized submission, only in the final paper. You can use the \texttt{ack} environment provided in the style file to automatically hide this section in the anonymized submission.
% \end{ack}

% \section*{References}

% References follow the acknowledgments in the camera-ready paper. Use unnumbered first-level heading for
% the references. Any choice of citation style is acceptable as long as you are
% consistent. It is permissible to reduce the font size to \verb+small+ (9 point)
% when listing the references.
% Note that the Reference section does not count towards the page limit.
% \medskip


% {
% \small


% [1] Alexander, J.A.\ \& Mozer, M.C.\ (1995) Template-based algorithms for
% connectionist rule extraction. In G.\ Tesauro, D.S.\ Touretzky and T.K.\ Leen
% (eds.), {\it Advances in Neural Information Processing Systems 7},
% pp.\ 609--616. Cambridge, MA: MIT Press.


% [2] Bower, J.M.\ \& Beeman, D.\ (1995) {\it The Book of GENESIS: Exploring
%   Realistic Neural Models with the GEneral NEural SImulation System.}  New York:
% TELOS/Springer--Verlag.


% [3] Hasselmo, M.E., Schnell, E.\ \& Barkai, E.\ (1995) Dynamics of learning and
% recall at excitatory recurrent synapses and cholinergic modulation in rat
% hippocampal region CA3. {\it Journal of Neuroscience} {\bf 15}(7):5249-5262.
% }


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\newpage
\onecolumn
\appendix

\section{Proof for Theoretical Statements}
\subsection{Assumptions }
We first list out necessary assumptions and key lemmas to establish theorems stated in the manuscript. 
\begin{assumption}\label{append_assump:kernel_less_one}
    The kernel $k(x, x) \le 1$ for all $x \in \cX$.
\end{assumption}

\begin{assumption}\label{append_assump:kernel_smooth}
    We consider the kernel $k$ to be either a square-exponential kernel or a Mat\'ern kernel with smoothness parameter $\nu \ge 2$.
\end{assumption}

\begin{assumption}\label{append_assum:grid}
At iteration $t$, $\mathcal{X}_t$ is a collection of $t$ random samples drawn uniformly from $\cX$. Furthermore, we assume that the set of uniform random grids $\{\cX_t\}_{t \in \mathbb{N}}$ are independent. 
\end{assumption}

\begin{lemma}[Theorem 2 of \citet{chowdhury2017kernelized}]\label{lem:conf} Suppose $f \in \mathcal{H}_k$ with $\| f\|_{\mathcal{H}_k} \le B$. Then the following statement holds with probability at least $1-\delta$, for all $x \in \mathcal{X}$ and $t \in \mathbb{N}$,
\begin{align*}
|f(x) - \mu_{t-1}(x)| \leq \beta_t \sigma_{t-1}(x),
\end{align*}
where $\beta_t = B + R\sqrt{2(\gamma_{t-1} + 1 + \log(1/\delta))}$.
\end{lemma}

\begin{lemma}[Lemma 4 of \citet{chowdhury2017kernelized}]\label{lem:sigma_bound} Let $\{z_1, \cdots, z_T\}$ be the points selected by an arbitrary BO strategy. Then the following holds, 
\begin{align*}
\sum_{t=1}^T \sigma_{t-1}(z_t) \leq \sqrt{4(T+2)\gamma_T}
\end{align*}
where $\gamma_T$ is the maximum information gain. 
\end{lemma}

\begin{lemma}[Proposition 4 of \citet{helin2023introduction}]\label{lem:gap}
Let $\tilde x_1,\dots,\tilde x_t$ be $t$ uniform random samples from a hyperrectangle $\cX \subseteq \mathbb{R}^d$. Define $h_t = \sup_{x \in \cX} \inf_{i=1,\dots,t} \|x - \tilde x_i\|$. Then there exists $t^*$ such that $\forall t \ge t^*$, 
\begin{align*}
\E [h_t] = \mathcal{O}(t^{-\frac{1}{d}+\xi}),
\end{align*}
where $\xi > 0$ is an arbitrarily small positive constant.
\end{lemma}

In the following lemma, we establish a high-probability upper bound on the cumulative discrepancy between the random grid $\cX_t$ and the search space $\cX$.
\begin{lemma}\label{lemma:grid_disc}
    Let $h_t = \sup_{x \in \mathcal{X}}\inf_{\tilde x_i \in \mathcal{X}_t} \|x-\tilde x_i\|$. For $\delta > 0$ small, with probability at least $1-\delta$, we have
    $$
        \sum_{t=1}^T h_t \le \sum_{t=1}^{t^*} h_t +  \frac{CT^{\frac{d-1}{d} + \xi}}{\delta} ,
    $$
    for some $C > 0$, $\xi > 0$, and $t^* \in \mathbb{N}$.
\end{lemma}
\begin{proof}
    From Lemma \ref{lem:gap}, we know that there exist $C > 0$, $\xi > 0$, and $t^* \in \mathbb{N}$ such that $\mathbb{E}[h_t] \le C t^{-\frac{1}{d} + \xi}$ for all $t \ge t^*$. By Markov's inequality, we have
    $$
     \mathbb{P}\left[\sum_{t=t^*}^T h_t > \frac{CT^{\frac{d-1}{d} + \xi}}{\delta}\right] \le \frac{\mathbb{E}\left[\sum_{t=t^*}^T h_t \right] }{CT^{\frac{d-1}{d} + \xi}} \delta \le \delta
    $$
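    where the second inequality uses $\mathbb{E}[\sum_{t=t^*}^T h_t] \le C\sum_{t=t^*}^T t^{-\frac{1}{d}+\xi} = \mathcal{O}\left(T^{\frac{d-1}{d}+\xi}\right)$, so that $\mathbb{E}[\sum_{t=t^*}^T h_t]  \le CT^{\frac{d-1}{d}+\xi} $ after enlarging the constant $C$ if necessary. Since $h_{t^*} \ge 0$, we have $\sum_{t=1}^T h_t \le \sum_{t=1}^{t^*} h_t + \sum_{t=t^*}^T h_t$, which yields the claim.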
\end{proof} 


\subsection{GP-UCB/GP-TS with inexactness}\label{sec:app:gpucb}
We first provide justifications for the nonnegativity of the UCB acquisition function after the constant shift. To this end, we show the existence of a constant $C$ such that $\alpha_t + C$ is non-negative on the search space $\cX$.

\begin{lemma}
    For some $\delta > 0$, a constant shifted $t^{\text{th}}$ acquisition function of the GP-UCB algorithm, given by
    $$
        \alpha_t(x) + C = \mu_{t-1}(x) + \beta_t \sigma_{t-1}(x) + C
    $$
    is non-negative on $\cX$ with probability at least $1-\delta$, for some constant $C \in \mathbb{R}$. 
    \end{lemma}
\begin{proof}
From Lemma \ref{lem:conf}, we know that
$$
f(x) \le \mu_{t-1}(x) + \beta_t \sigma_{t-1}(x),
$$
for all $x \in \cX$ with probability $1-\delta$. Therefore, 
$$
0 \le f(x) + |f(x)|  \le  \mu_{t-1}(x) + \beta_t \sigma_{t-1}(x) + |f(x)|.
$$
Since $\sup_{x}|f(x)| \le \|f\|_{\mathcal{H}_k} \le B$, we have that $ \mu_{t-1}(x) + \beta_t \sigma_{t-1}(x) + B \ge 0$.
\end{proof}
We next provide justifications for the nonnegativity of the TS acquisition function after the constant shift. To this end, we show the existence of a constant $C_t$ such that $\alpha_t + C_t$ is non-negative on the search space $\cX_t$.
\begin{lemma}
    For some $\delta > 0$, the constant shifted $t^{\text{th}}$ acquisition function of the robust GP-TS algorithm, given by
    $$
        \alpha_t(x) + C_t = f_{t}(x)  + C_t
    $$
    is non-negative on $\cX_t$ with probability $1-\delta$, for some sequence $C_t$. 
    \end{lemma}
\begin{proof}
From Lemma 5 of \citep{chowdhury2017kernelized}, conditioning on the history until time $t$, i.e., $\cH_t:=\{(x_1, y_1), \cdots, (x_t, y_t)\}$, we know that
$$
\forall x \in \mathcal{X}_t, \quad |f_t(x)-\mu_{t-1}(x)| \le \sqrt{2\log(|\mathcal{X}_t|/\delta)}s_{t-1}(x),
$$
with probability $1-\delta/2$. Therefore, from the definition $s_{t-1}(x) = \beta_t \sigma_{t-1}(x) + v_t$, together with $\beta_t \le \beta_T$ and $\sigma_{t-1}(x) \le 1$, we have
\begin{equation}\label{lem12:first}
\mu_{t-1}(x) - \sqrt{2\log(|\mathcal{X}_t|/\delta)}(\beta_T + v_t) \le \mu_{t-1}(x) - \sqrt{2\log(|\mathcal{X}_t|/\delta)}s_{t-1}(x) \le f_t(x),
\end{equation}
where $v_t = \left(\frac{1}{\eta_t}-1\right)B$. Using Lemma \ref{lem:conf} with $\beta_t\sigma_{t-1}$ replaced with $s_{t-1}$, with probability at least $1-\delta/2$
\begin{equation}\label{lem12:second}
-B - (\beta_T+v_t) \le f(x) - s_{t-1}(x)\le\mu_{t-1}(x)
\end{equation}
holds for all $x \in \cX$.
Combining \eqref{lem12:first} and \eqref{lem12:second}, we observe that
$$
f_t(x) + (1+\sqrt{2\log(|\mathcal{X}_t|/\delta)})(\beta_T + v_t) + B \ge 0,
$$
for all $x \in \cX_t$ with probability at least $1-\delta$.
\end{proof}


\begin{theorem}[Restatement of Theorem \ref{thm:GP_UCB_MFC}]\label{append_thm:GP_UCB_MFC}
Under assumptions \ref{append_assump:kernel_less_one} and \ref{append_assump:kernel_smooth}, suppose an objective function $f \in \mathcal{H}_k$ with $\|f\|_{\mathcal{H}_k} \le B$. The inexact GP-UCB algorithm with $\beta_t = B + R\sqrt{2(\gamma_{t-1} + 1 + \log(1/\delta))}$ and an accumulated inaccuracy $M_T$ yields a cumulative regret bound of the form,
$$
R_T = \mathcal{O}\left(\gamma_T\sqrt{T}  + M_T\sqrt{\gamma_T} \right),
$$ 
with probability $1-\delta$.
\end{theorem}

\begin{proof}
From Theorem 2 of \citet{chowdhury2017kernelized}, we know that
\begin{align*}
     \mu_{t-1}(x^*) - \beta_t \sigma_{t-1}(x^*) &\le f(x^*) \le \mu_{t-1}(x^*) + \beta_t \sigma_{t-1}(x^*), \\
      \mu_{t-1}(x_t) - \beta_t \sigma_{t-1}(x_t) &\le f(x_t) \le \mu_{t-1}(x_t) + \beta_t \sigma_{t-1}(x_t)
\end{align*}
hold with probability $1-\delta$. Then
\begin{align*}
    f(x^*) - f(x_t) &\le \mu_{t-1}(x^*) + \beta_t \sigma_{t-1}(x^*) - \mu_{t-1}(x_t) + \beta_t \sigma_{t-1}(x_t) \\
    &= \alpha_{t}(x^*) - \alpha_{t}(x_t) + 2\beta_t \sigma_{t-1}(x_t) \\
    &= (1-\eta_t) \alpha_t^* + 2\beta_t \sigma_{t-1}(x_t),
\end{align*}
where the first equality follows from the definition of the UCB acquisition function, and the second equality is due to the fact that $\alpha_{t}(x_t) = \eta_t \alpha_t(x^*)$.
Since $\mu_{t-1}(x) \le f(x) + \beta_t \sigma_{t-1}(x)$, for all $x \in \mathcal{X}$, we conclude that, with probability $1-\delta$,
\begin{align*}
\alpha_t^* \le \max_{x\in\mathcal{X}} [f(x) + 2\beta_T \sigma_{t-1}(x) ]\le B + 2\beta_T,
\end{align*}
where the second inequality follows from the fact that $\sup_{x \in \mathcal{X}}|f(x)| \le \|f\|_{\mathcal{H}_k} \le B$, and for all $x \in \mathcal{X}, \sigma_{t-1}(x) \le 1$. Therefore, we conclude that
\begin{align*}
    R_T &= \sum_{t=1}^T f(x^*) - f(x_t) %\\ 
    %&
    \le (B+2\beta_T) \sum_{t=1}^T (1-\eta_t) + 2\beta_T \sum_{t=1}^T \sigma_{t-1}(x_t).
\end{align*}
From Lemma \ref{lem:sigma_bound}, we arrive at the conclusion.
\end{proof}

A similar analysis can be established for the GP-TS algorithm with an enlarged variance $s^2_{t-1}(x) = \left(\beta_t \sigma_{t-1}(x) + v_t\right)^2$, where $v_t = (\frac{1}{\eta_t}-1)B$. We omit the proof of the following statement, as it substantially overlaps with the approach taken in \citep{chowdhury2017kernelized} up to a variance factor adjustment.  

\begin{theorem}[Restatement of Theorem \ref{thm:GP_TS_MFC}]\label{append_thm:GP_TS_MFC}
Under assumptions \ref{append_assump:kernel_less_one} and \ref{append_assump:kernel_smooth}, suppose an objective function $f \in \mathcal{H}_k$ with $\|f\|_{\mathcal{H}_k} \le B$. The inexact GP-TS algorithm with variance $s^2_{t-1}(x) = \left(\beta_t \sigma_{t-1}(x) + v_t\right)^2$, $v_t = (\frac{1}{\eta_t}-1)B$ and an accumulated inaccuracy $M_T$ yields a cumulative regret bound of the form
$$
R_T = \mathcal{O}\left(\gamma_T\sqrt{T \log T} + M_T \sqrt{\gamma_T\log T}\right),
$$ 
with probability $1-\delta$.
\end{theorem}




\subsection{Random Grid Search based GP-UCB}\label{sec:app:random_ucb}
\begin{theorem}\label{append_thm:random_UCB}
Under assumptions \ref{append_assump:kernel_less_one}, \ref{append_assump:kernel_smooth} and \ref{append_assum:grid}, suppose $f \in \mathcal{H}_k$ with $\|f\|_{\mathcal{H}_k} \le B$. With probability at least $1-\delta$, the random grid GP-UCB with $\beta_t = B + R\sqrt{2(\gamma_{t-1} + 1 + \log(2/\delta))}$ yields 
\begin{align*}
R_T = \mathcal{O}\left(\gamma_T \sqrt{T} + T^{\frac{d-1}{d} + \xi}\right),
\end{align*}
for some arbitrarily small $\xi > 0$.
\end{theorem}
\begin{proof}
Note that 
\begin{align*}
R_T &= \sum_{t=1}^T f(x^*) - f(x_t)\\
&= \sum_{t=1}^T f(x^*) - f([x^*]_t) + f([x^*]_t)  - f(x_t),
\end{align*}
where $[x^*]_t$ is the point closest to $x^*$ in $\mathcal{X}_t$. By Lemma \ref{lem:conf}, for all possible realizations of $\mathcal{X}_1, \cdots, \mathcal{X}_T$, with probability at least $1-\delta/2$, we have
\begin{align*}
\mu_{t-1}([x^*]_t) - \beta_t \sigma_{t-1}([x^*]_t) &\leq f([x^*]_t) \leq \mu_{t-1}([x^*]_t) + \beta_t \sigma_{t-1}([x^*]_t) \\
\mu_{t-1}(x_t) - \beta_t \sigma_{t-1}(x_t) &\leq f(x_t) \leq \mu_{t-1}(x_t) + \beta_t \sigma_{t-1}(x_t).
\end{align*}
From the definition of $x_t$, we know $\mu_{t-1}([x^*]_t) + \beta_t \sigma_{t-1}([x^*]_t) \le \mu_{t-1}(x_t) + \beta_t \sigma_{t-1}(x_t)$. Therefore, we have
$$
f([x^*]_t) - f(x_t) \le 2\beta_t \sigma_{t-1}(x_t).
$$
Furthermore, from Lemma 1 of \citep{de2012regret}, we know that $f(x^*) - f([x^*]_t) \le C \|x^*-[x^*]_t\|$ for some constant $C > 0$. Since $\|x^*-[x^*]_t\| \le h_t$ for $h_t := \sup_{x \in \cX} \inf_{\tilde x_i \in \cX_t}\|x-\tilde x_i\|$, we have
$$
R_T \le C\sum_{t=1}^T h_t + 2\beta_T\sum_{t=1}^T\sigma_{t-1}(x_t).
$$
Then by Lemma \ref{lemma:grid_disc}, we know that with probability at least $1-\delta/2$, $C\sum_{t=1}^T h_t = \mathcal{O}\left(T^{\frac{d-1}{d} + \xi}\right)$. Combined with Lemma \ref{lem:sigma_bound} and invoking the union bound, we have the result.
\end{proof}

\subsection{Random Grid Search based GP-TS}\label{sec:aux}
In this section, we list all additional definitions and lemmas we use in the proof of the regret bound for random grid search based GP-TS. Since our approach closely follows that of \citet{chowdhury2017kernelized}, many of the preliminary lemmas we list here can be proven in an analogous fashion. We adjust and restate the lemmas and proofs where needed.


\begin{definition}
We define the filtration $\mathcal{F}_t = \sigma\left\{(x_1, y_1, \cX_2), \cdots, (x_t, y_t, \cX_{t+1})\right\}$ as the $\sigma$-algebra generated by the collection of evaluation locations, function evaluations and search grids observed until time $t$. Note that conditional on $\cF_t$, the search grid is no longer random.
\end{definition}



\begin{lemma}[Lemma 5 of \citep{chowdhury2017kernelized}]\label{lem:thompson_gap}
For all $t \in \mathbb{N}$, assume $|\mathcal{X}_t| = ct$ with $c > 0$. Then
$$
\mathbb{P}\left[\forall x \in \mathcal{X}_t, |f_t(x) - \mu_{t-1}(x)| \le \beta_t\sqrt{2\log(ct^3)}\sigma_{t-1}(x) \Big|\mathcal{F}_{t-1} \right] \ge 1 - 1/t^2,
$$
where $\delta \in (0,1)$ and $\beta_t = B + R\sqrt{2(\gamma_{t-1} + 1 + \log(3/\delta))}$.
\end{lemma}



\noindent We introduce some definitions here.
\begin{definition}\label{def:CT}
$\forall t \ge 1$, $\tilde c_t = \sqrt{6\log t + 2\log c}$ and $c_t = \beta_t(1+\tilde c_t)$.
\end{definition}

\begin{definition}
We define the following two events:
\begin{align*}
E^f(t) &= \left\{\forall x \in \mathcal{X}, |\mu_{t-1}(x)-f(x)| \le \beta_t\sigma_{t-1}(x) \right\} \\
E^{f_t}(t) &= \left\{\forall x \in \mathcal{X}_t, |f_t(x)-\mu_{t-1}(x)| \le \beta_t \tilde c_t 
\sigma_{t-1}(x)\right\}
\end{align*}
\end{definition}

\begin{definition}
Given a grid $\mathcal{X}_t$, define the set of saturated points to be
    $$
    S_t \coloneqq \{x\in \mathcal{X}_t: \Delta_t(x) > c_t \sigma_{t-1}(x)\},
    $$
    where $[x^*]_t$ is the point closest to $x^*$ in $\mathcal{X}_t$ and $\Delta_t(x) : = f([x^*]_t)-f(x)$. Notice that conditioning on $\mathcal{X}_t$, $[x^*]_t \in \mathcal{X}_t \setminus S_t$.
\end{definition}

\begin{lemma}[Lemma 6 of \citep{chowdhury2017kernelized}]\label{lemma::event_prob}
    Suppose $|\mathcal{X}_t| = ct, ~c > 0$. For $\delta \in (0,1)$, the following statements hold.
    \begin{itemize}
        \item $\mathbb{P}[\forall~ t \ge 1, E^f(t)] \ge 1 - \delta/3$
        \item $\mathbb{P}[E^{f_t}(t)| \mathcal{F}_{t-1}] \ge 1-1/t^2$
    \end{itemize}
\end{lemma}

\begin{lemma}[Lemma 7 of \citet{chowdhury2017kernelized}]\label{lemma::prob_lower}
    For any filtration $\mathcal{F}_{t-1}$ such that $E^f(t)$ is true,
    $$
    \mathbb{P}\left[f_t(x) > f(x)\Big|\mathcal{F}_{t-1}\right] \ge \eta\coloneqq \frac{1}{4e\sqrt{\pi}} > 0,
    $$
    holds for any $x \in \mathcal{X}$.
\end{lemma}

\begin{lemma}[Lemma 8 of \citet{chowdhury2017kernelized}]\label{lemma:prob_good_event}
   For any filtration $\mathcal{F}_{t-1}$ such that $E^f(t)$ is true,
    $$
    \mathbb{P}[x_t \in \mathcal{X}_t \setminus S_t| \mathcal{F}_{t-1}] \ge \eta - 1/t^2.
    $$
\end{lemma}


\begin{lemma}\label{lemma::exp_reg_bound}
    For any filtration $ \mathcal{F}_{t-1}$ such that $E^f(t)$ is true, we have
    \begin{align*}
    \mathbb{E}[\Delta_t(x_t) | \mathcal{F}_{t-1}] \le \frac{11c_t}{\eta} \mathbb{E}[\sigma_{t-1}(x_t)|  \mathcal{F}_{t-1}] + \frac{2B}{t^2},
    \end{align*}
    where $\Delta_t(x_t) := f([x^*]_t)-f(x_t)$.
\end{lemma}
\begin{proof}
Given a grid $\mathcal{X}_t$, let $\bar x_t = \argmin_{x \in \mathcal{X}_t \setminus S_t}\sigma_{t-1}(x)$. From the law of total expectation and positivity of the $\sigma_{t-1}$, we have 
\begin{align}
\mathbb{E}[\sigma_{t-1}(x_t)| \mathcal{F}_{t-1}] &\ge \mathbb{E}[\sigma_{t-1}(x_t)| \mathcal{F}_{t-1}, x_t \in \cX_t \setminus S_t]\mathbb{P}[x_t \in \cX_t \setminus S_t| \cF_{t-1}] \nonumber \\ &\ge \mathbb{E}[\sigma_{t-1}(\bar x_t)|\cF_{t-1}] (\eta-1/t^2), \label{ineq::lower_sigma}
\end{align}
where the second inequality follows from the definition of $\bar x_t$ and Lemma \ref{lemma:prob_good_event}. 

To control $\Delta_t(x_t) = f([x^*]_t) - f(x_t)$, recall that if $E^f(t)$ and $E^{f_t}(t)$ are both true, we know that
\begin{equation}\label{eq::up_down_f}
  \text{for all } x \in \mathcal{X}_t, \quad f_t(x) - c_t \sigma_{t-1}(x) \le f(x) \le f_t(x) + c_t \sigma_{t-1}(x).  
\end{equation}
Notice that
\begin{align*}
\Delta_t(x_t) 
&= f([x^*]_t) - f(x_t) \\
&= f([x^*]_t) - f(\bar x_t) +  f(\bar x_t) - f(x_t)\\
&\le \Delta_t(\bar x_t) + f_t(\bar x_t) + c_t \sigma_{t-1}(\bar x_t) - f_t(x_t) +  c_t \sigma_{t-1}(x_t) \\
&\le  c_t (2\sigma_{t-1}(\bar x_t) + \sigma_{t-1}(x_t)) + f_t(\bar x_t) - f_t(x_t)\\
&\le  c_t (2\sigma_{t-1}(\bar x_t) + \sigma_{t-1}(x_t)) 
\end{align*}
where the first inequality is due to \eqref{eq::up_down_f}, the second inequality follows from the fact that $\bar x_t \notin S_t$, and the last inequality comes from the definition of $x_t$.

Therefore, we have
\begin{align*}
\mathbb{E}[\Delta_t(x_t)|\cF_{t-1}] 
&\le 2c_t \mathbb{E}[\sigma_{t-1}(\bar x_t) |\cF_{t-1}] + c_t \mathbb{E}[\sigma_{t-1}(x_t) |\cF_{t-1}] + 2B \mathbb{P}\left[E^{f_t}(t)^c|\cF_{t-1}\right]\\
&\le \frac{2c_t}{\eta-1/t^2} \mathbb{E}[\sigma_{t-1}(x_t) |\cF_{t-1}] + c_t \mathbb{E}[\sigma_{t-1}(x_t) |\cF_{t-1}] + \frac{2B}{t^2} \\
&\le \frac{11c_t}{\eta} \mathbb{E}[\sigma_{t-1}(x_t) |\cF_{t-1}] + \frac{2B}{t^2},
\end{align*}
where we used the fact $\sup_{x \in \cX}\Delta_t(x) \le 2B$, which can be deduced from the fact $\sup |f(x)| \le \|f\|_{\cH_k} \le B$, in the first inequality. The second inequality follows from the inequality in \eqref{ineq::lower_sigma} and Lemma \ref{lem:thompson_gap}. The third inequality is due to the fact $\frac{1}{\eta-1/t^2} \le \frac{5}{\eta}$.
\end{proof}

\noindent Next, we define random variables and associated filtration to invoke concentration inequality for super-martingales.
\begin{definition}\label{def::sup_martingale}
Let $Y_0 = 0$, and for all $t \in \{1, \cdots, T\}$,
     \begin{align*}
    \bar{\Delta}_t(x_t) &= \Delta_t(x_t) \mathbb{I}\left\{E^f (t)\right\} \\
    Z_t &= \bar{\Delta}_t(x_t) -\frac{11c_t}{\eta} \sigma_{t-1}(x_t) - \frac{2B}{t^2}\\ 
    Y_t &= \sum_{s=1}^t Z_s
\end{align*}
 \end{definition}




\noindent From the definition, and by Lemma \ref{lemma::exp_reg_bound}, we deduce the following result, which we formally state as lemma.

\begin{lemma}\label{lemma::sup_martingale}
$(Y_t)_{t=0}^T$ is a super-martingale process with respect to filtration $\mathcal{F}_{t}$.
\end{lemma}

\begin{proof}
It suffices to show that for all $t \in \{1, \cdots, T\}$ and any $\mathcal{F}_{t-1}$, $\mathbb{E}\left[Y_t - Y_{t-1} | \cF_{t-1} \right] \le 0$. Note that
\begin{equation}\label{eq::super_martingale}
\mathbb{E}\left[Y_t - Y_{t-1} | \cF_{t-1} \right] = \mathbb{E}\left[Z_t | \cF_{t-1} \right] = \mathbb{E}\left[ \bar \Delta_t(x_t)| \cF_{t-1}\right] - \frac{11c_t}{\eta}\mathbb{E}\left[\sigma_{t-1}(x_t)| \cF_{t-1}\right] - \frac{2B}{t^2}.
\end{equation} 
If $E^f(t)$ is false, we have $\mathbb{E}\left[ \bar \Delta_t(x_t)| \cF_{t-1}\right] = 0$, which shows that \eqref{eq::super_martingale} is less than or equal to zero. On the other hand, if $E^f(t)$ is true, from Lemma \ref{lemma::exp_reg_bound}, we can again conclude that \eqref{eq::super_martingale} is less than or equal to zero.
\end{proof}




\begin{lemma}\label{lemma:discrep_TS_bound}
Given any $\delta > 0$,
\begin{align*}
    \sum_{t=1}^T \Delta_t(x_t) &\le \frac{11c_T}{\eta}\sum_{t=1}^T\sigma_{t-1}(x_t) + \frac{2B\pi^2}{6} + \frac{4B+11c_T}{\eta}\sqrt{ 2T \log(3/\delta) },
\end{align*}
 with probability at least $1-2\delta/3$.
\end{lemma}

\begin{proof}
By construction, 
$$
|Y_t-Y_{t-1}| = |Z_t| \le \left|\bar\Delta_t(x_t)\right| + \frac{11c_t}{\eta} \sigma_{t-1}(x_t) + \frac{2B}{t^2} \le 2B + \frac{11c_t}{\eta} + \frac{2B}{t^2} \le \frac{4B + 11c_t}{\eta}
$$
where the first inequality is due to the triangle inequality. The second inequality comes from the fact that $\left|\bar\Delta_t(x_t)\right| \le 2\sup_{x \in \mathcal{X}} |f(x)| \le 2\|f\|_{\cH_k} \le 2B$ and $\sigma_{t-1}(x) \le 1$ for all $x \in \mathcal{X}$. The third inequality follows from $\eta \le 1$. By the Azuma--Hoeffding inequality, with probability at least $1-\delta/3$, we have
\begin{align*}
    Y_T - Y_0 = \sum_{t=1}^T \bar \Delta_t(x_t) -\sum_{t=1}^T \frac{11c_t}{\eta} \sigma_{t-1}(x_t) - \sum_{t=1}^T \frac{2B}{t^2} \le \sqrt{2\log(3/\delta)\sum_{t=1}^T \frac{(4B+11c_t)^2}{\eta^2}}.
\end{align*}
In other words, we have
\begin{align*}
\sum_{t=1}^T \bar \Delta_t(x_t) &\le \sum_{t=1}^T \frac{11c_t}{\eta} \sigma_{t-1}(x_t) + \sum_{t=1}^T \frac{2B}{t^2} + \sqrt{2\log(3/\delta)\sum_{t=1}^T \frac{(4B+11c_t)^2}{\eta^2}} \\
&\le \frac{11c_T}{\eta}\sum_{t=1}^T  \sigma_{t-1}(x_t) + \frac{2B \pi^2}{6} + \frac{4B+11c_T}{\eta}\sqrt{ 2T \log(3/\delta) } 
\end{align*}
with probability at least $1-\delta/3$. From Lemma \ref{lemma::event_prob}, we know $E^f(t)$ holds for all $t \ge 1$ with probability at least $1- \delta/3$. In other words, by definition, $\Delta_t(x_t) = \bar \Delta_t(x_t)$ for all $t \ge 1$ with probability at least $1- \delta/3$. Applying the union bound, we obtain the statement.
\end{proof}


\begin{theorem}[Restatement of Theorem \ref{thm:random_TS}]
Under assumptions \ref{append_assump:kernel_less_one}, \ref{append_assump:kernel_smooth} and \ref{append_assum:grid}, suppose $f \in \mathcal{H}_k$ with $\|f\|_{\cH_k} \le B$, for some $B >0$. With probability at least $1-\delta$, the random grid GP-TS with $\beta_t = B + R\sqrt{2(\gamma_{t-1} + 1 + \log(3/\delta))}$ yields 
\begin{align*}
R_T = \mathcal{O}\left(\gamma_T \sqrt{T \log T} + T^{\frac{d-1}{d} + \xi} \right),
\end{align*} 
for some arbitrarily small $\xi > 0$.
\end{theorem}
\begin{proof}
Note that
\begin{align*}
R_T &= \sum_{t=1}^T f(x^*) - f(x_t) \\
&= \sum_{t=1}^T f(x^*) - f([x^*]_t) + f([x^*]_t) - f(x_t) \\
&= \sum_{t=1}^T f(x^*) - f([x^*]_t) + \sum_{t=1}^T \Delta_t(x_t) \\
&\le C \sum_{t=1}^T \|x^*-[x^*]_t\|_2 + \sum_{t=1}^T \Delta_t(x_t) \\
& \le C \sum_{t=1}^T h_t + \sum_{t=1}^T \Delta_t(x_t),
\end{align*}
where the first inequality follows from the Lipschitzness of $f$, which was shown in Lemma 1 of \citep{de2012regret}, and the second inequality is due to the definition of fill distance $h_t = \sup_{x \in \mathcal{X}}\inf_{\tilde x_i \in \mathcal{X}_t} \|x-\tilde x_i\|$. From Lemma \ref{lemma:grid_disc} applied with $\delta/3$ in place of $\delta$, we know that, with probability at least $1-\delta/3$, the leading term $C \sum_{t=1}^T h_t = \mathcal{O}\left(T^{\frac{d-1}{d} + \xi}\right)$. For the second term, note that from Lemma \ref{lemma:discrep_TS_bound}, with probability at least $1-2\delta/3$,
\begin{align*}
    \sum_{t=1}^T \Delta_t(x_t) &\le \frac{11c_T}{\eta}\sum_{t=1}^T\sigma_{t-1}(x_t) + \frac{2B\pi^2}{6} + \frac{4B+11c_T}{\eta}\sqrt{ 2T \log(3/\delta) } \\
    &= \mathcal{O}\left(c_T \sqrt{4(T+2)\gamma_T} + c_T\sqrt{T}\right)
\end{align*}
where the last equality is due to Lemma
\ref{lem:sigma_bound}. From the definition of $c_t$, we know $c_T = \mathcal{O}(\sqrt{\gamma_T \log T})$. Since the leading term dominates, we have
$$
\sum_{t=1}^T \Delta_t(x_t) = \mathcal{O}\left(\gamma_T \sqrt{T \log T}\right).
$$
Applying union bound and combining everything, we get the result.
\end{proof}




% \section{Limitations and Broader Impacts}
% Since this is a fully theoretical paper, the discussion on its limitations focuses only on its assumptions. Section \ref{sec:random} assumes random sampling samples $t$ data points at the $i$-th iteration of Bayesian optimization, which can be hard when $t$ is large. However, $t$ is usually not too large in practice since one of motivations of Bayesian optimization is to reduce number of iterations, especially for scientific discovery problems. Other assumptions made in this paper, like assumptions on kernels and objective functions, usually follow existing work so they are not considered to be too strong to weaken our contributions.

% Given the theoretical nature of this work, there is no immediate concern in terms of broader impacts. %Again, since this is a theoretical paper, nothing should be worried about immediately. 
% For positive societal impacts, we hope our work could help the community deeply understand inexact acquisition function maximization in Bayesian optimization and develop more robust Bayesian optimization algorithms for real-world applications. For potential negative societal impacts, when an algorithm inspired by this work is going to be deployed in practice, safety, privacy, and fairness should be seriously considered.



% Optionally include supplemental material (complete proofs, additional experiments and plots) in appendix. All such materials \textbf{SHOULD be included in the main submission.}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\end{document}