%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% PACKAGING AND IMPORTS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\PassOptionsToPackage{colorlinks=true, linkcolor=blue, pdfborder={0 0 0}}{hyperref}
\documentclass[accepted]{uai2022} 

%% Choose your variant of English; be consistent
\usepackage[american]{babel}

% fonts and math
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{bm}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{mathtools} % amsmath with fixes and additions
\usepackage{amssymb}
\usepackage{amsfonts}
\DeclareMathOperator{\sign}{sign}

%% bibliography
% \usepackage[square]{natbib} % has a nice set of citation styles and commands
%     \bibliographystyle{plainnat}
%     \renewcommand{\bibsection}{\subsubsection*{References}}

% theorems, lemmas, models
\usepackage{thmtools}
\input{notation}
\newcommand{\model}[1]{{\fontfamily{\ttdefault}\selectfont #1}}

% % % citations and bibliography
\usepackage[style=authoryear-comp, natbib=true]{biblatex}
\addbibresource{warren_734.bib}
\setlength\bibitemsep{1.5\itemsep}
\AtEveryBibitem{%
    \clearfield{issn}% Remove issn
    \clearfield{doi} % Remove doi
    \clearfield{url}%
    \clearfield{urldate}%
    \clearfield{urlyear}%
    \clearfield{urlmonth}%
    \clearfield{review}%
    \clearfield{series}%
    \clearfield{note}%
    \clearfield{isbn}%
    \clearfield{urlmonth}%
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% TITLE, AUTHORS, AND ABSTRACT %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\title{Generalized Bayesian Quadrature with Spectral Kernels}

% Add authors
\author[1]{\href{mailto:<hwar3277@uni.sydney.edu.au>?Subject=Your UAI 2022 paper}{Houston Warren}{}}
\author[2]{\href{mailto:<rafael.oliveira@sydney.edu.au>?Subject=Your UAI 2022 paper}{Rafael Oliveira}{}}
\author[1, 3]{\href{mailto:<fabio.ramos@sydney.edu.au>?Subject=Your UAI 2022 paper}{Fabio Ramos}{}}
% Add affiliations after the authors
\affil[1]{%
    School of Computer Science\\
    The University of Sydney\\
    Sydney, Australia
}
\affil[2]{%
    Brain and Mind Centre\\
    The University of Sydney\\
    Sydney, Australia
}
\affil[3]{%
    NVIDIA, USA
}
  
  \begin{document}
\maketitle

\begin{abstract}
     Bayesian probabilistic integration, or Bayesian quadrature (BQ), has arisen as a popular means of numerical integral estimation with quantified uncertainty for problems where computational cost limits data availability. BQ leverages flexible Gaussian processes (GPs) to model an integrand which can be subsequently analytically integrated through properties of Gaussian distributions. However, BQ is inherently limited by the fact that the method relies on a restricted set of kernels for the GP model of the integrand, reducing the flexibility of the method in modeling varied integrand types. In this paper, we present generalized Bayesian quadrature (GBQ), a form of Bayesian quadrature that allows for the use of \textit{any} shift-invariant kernel in the integrand GP model while still maintaining the analytical tractability of the integral posterior, increasing the flexibility of BQ methods to address varied problem settings. Additionally, our method enables integration with respect to a uniform expectation, effectively computing definite integrals of challenging integrands. We derive the theory and error bounds for this model, as well as demonstrate GBQ's improved accuracy, flexibility, and data efficiency, compared to traditional BQ and other numerical integration methods, on a variety of quadrature problems.
\end{abstract}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% INTRODUCTION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Introduction}\label{sec:intro}
Methods for estimation of non-analytical integrals through numerical methods play a key role across a broad spectrum of scientific fields, but these methods are often computationally expensive in nature. Methods such as finite elements or finite volumes, which are widely used in physical simulation to integrate partial differential equations, or Monte Carlo estimation, which is widely used in Bayesian statistics for estimation of posteriors, require a large number of function evaluations to reach a desired level of accuracy. In addition, many numerical integration methods fail to provide uncertainty quantification on their estimates, which is crucial in the applied settings in which physical simulation is often used. 

Bayesian quadrature (BQ) \citep{diaconis_bayesian_1988,ohagan_bayeshermite_1991} is a probabilistic method which can remedy these concerns by offering strong performance on computationally-limited small data while admitting robust uncertainty bounds. BQ takes the form of a traditional quadrature rule:
\begin{equation}\label{eq:quadrature}
    \int f(\x) p(\x) d\x \approx \sum_{i=1}^n w_i f(\x_i),
\end{equation}
for $n$ evaluations of the function $f$, where weights $w_i \sim p(\x)$ are instead learned through manipulation of a Bayesian non-parametric Gaussian process (GP) \citep{rasmussen_gaussian_2006} model on observations of the integrand $f(\x)$. 

The use of such a Bayesian non-parametric model for learning weights leverages the ability for GPs to perform well under data-scarcity as well as quantify uncertainty in a principled manner. In addition, the Gaussian nature of this model allows for the integral estimate of $f$ to be a simple analytical integration of the GP prior on $f$ using well-known characteristics of multi-variate Gaussian distributions. Previous work \citep{ghahramani_bayesian_2003, kandasamy_bayesian_2015} has clearly demonstrated computational efficiency of BQ versus traditional methods such as Monte Carlo integration when the data dimensionality $d < 10$.

A chief advantage of using GPs in any probabilistic learning setting is the flexibility of choice of the GP kernel function $k$, which allows for a practitioner to inject domain knowledge of the problem space into the GP model. Characteristics such as data smoothness or periodicity can easily be applied through choice or composition of specific kernel functions tailored to these settings.

However, the traditional BQ formulation hamstrings this flexibility by limiting the choice of kernel in the integrand GP to only a small subset of kernels with known analytical kernel means, such as Gaussian or polynomial kernels. For well-known kernels that may not be analytically tractable in the BQ setting, but nonetheless might better model an integrand, traditional numerical quadrature methods must be used, reducing the computational efficiency that BQ offers. The question naturally arises of how practitioners might enable the full suite of kernel choices for use in the GP integrand prior while still maintaining the analytical tractability in the BQ setting, to most efficiently produce an accurate estimate to the integral of $f$.

In this paper, we expand on the literature of BQ and propose a solution to the problem of kernel choice with generalized Bayesian quadrature (GBQ), a method derived from random Fourier features (RFFs) by which any shift-invariant kernel can be used in the GP integrand prior while still allowing for analytical tractability in the BQ setting. By allowing for both kernel flexibility and analytical integration, we expand upon the ability of traditional BQ to model a variety of integrand types while still maintaining the computational efficiency BQ offers. We summarize our contributions here:

\paragraph{Contributions}
\begin{itemize}
    \item We propose generalized Bayesian quadrature (GBQ), a method of Bayesian quadrature that allows for the use of \textit{any} shift-invariant kernel in the GP model of the integrand while still admitting an analytical estimate of the integral posterior mean and variance.
    \item We show that GBQ can directly be used to compute integrals over Gaussian and uniform measures within the same framework.
    \item We derive an upper bound on the error of this approximation as a function of data availability.
    \item We outline the assumptions under which GBQ shares the computational complexity of traditional BQ.
    \item We demonstrate the accuracy and flexibility of this quadrature method versus traditional BQ, as well as data-efficiency versus typical Monte Carlo integration, on a selection of relevant domain problems.
\end{itemize}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% RELATED WORK %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Related Works}\label{sec:related}

Quadrature methods of the type in equation~\eqref{eq:quadrature} are well-studied due to their importance to a variety of fields, and there is a deep literature dating back centuries on methods for numerically approximating integrals. We will briefly review here relevant methods in relation to Bayesian quadrature.

Rather than deterministic quadrature weighting, various probabilistic quadrature approaches have been proposed \citep{oates_modern_2019} for integration when model observations are expensive, with one of the most popular methods being Bayesian quadrature. Many extensions to vanilla BQ have been developed over the years to improve performance and provide theoretical guarantees \citep{kennedy_bayesian_1998, briol_frank-wolfe_2015, acerbi_variational_2018, belhadji_kernel_2019}. Other applications include use in multi-fidelity modeling \citep{gessner_active_2020}, Bayesian posterior estimation \citep{osborne_bayesian_2012, gunter_sampling_2014}, Bayesian optimization \citep{nguyen_distributionally_2020}, and model selection \citep{osborne_active_2012, chai_automated_2019}. 

The derivation of analytical forms, or empirical approximation, of kernel means, which is a significant component of the BQ formulation, is a problem that appears in numerous other fields. Namely, kernel mean embedding \citep{muandet_kernel_2017}, deep Gaussian processes \citep{damianou_deep_2013}, and neural operators \citep{kovachki_neural_2021, li_fourier_2021} all attempt to do so through various means. There also exist empirical methods for the estimation of kernel means using random Fourier features \citep{muandet_kernel_2017}, as well as strong theoretical connections between the very concept of kernel-based quadrature and random Fourier features \citep{bach_equivalence_2017}. In a related manner, methods have been proposed that seek to implement Fourier feature kernels through quadrature based methods \citep{mutny_efficient_2018}. However, to our knowledge, no methods directly solve kernel integrals analytically in the BQ setting using RFFs, as we propose to do in this paper. 

Similar to RFFs, spectral mixture kernels (SMKs) \citep{wilson_gaussian_2013, oliva_bayesian_2016} also model shift-invariant kernels as the spectral transform of a probability measure. In the SMK case, this measure is a Gaussian mixture model, which can be shown to asymptotically approximate any stationary kernel as the number of mixture components increases. While BQ has found applications in constructing hyper-kernels by marginalizing SMKs over mixture priors \citep{hamid_marginalising_2022}, their use within BQ has been limited.

The method which shares the most overlap with this work is the Fourier neural operator (FNO) \citep{li_fourier_2021}, which, as a part of a larger deep neural network architecture, estimates the convolution of shift-invariant kernels with a probability measure using parameters in Fourier space. While we take a similar approach to deriving kernel means using Fourier frequencies, the overall frameworks differ, with GBQ existing in the Gaussian process framework, thus offering uncertainty estimates for integral posteriors, while FNOs exist within a deterministic neural network architecture. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% METHODS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Preliminaries}\label{sec:preliminaries}
\subsection{Bayesian Quadrature}

We will now review various preliminary methods upon which GBQ is built, starting with Bayesian quadrature.

BQ assumes we have a function $f$ that we are trying to integrate and a dataset $\mathcal{D} = \{\bm{x}_i, y_i\}_{i = 1}^n$ with $n$ noisy observations of $f$, where $\bm{x} \in \mathcal{R}^d$, $y_i = f(\bm{x}_i) + \epsilon$, and $\epsilon$ is i.i.d.\ normally distributed noise. Typically, $f$ is computationally expensive to evaluate, implying a small $n$ and highlighting the need for uncertainty estimation in the final integral approximation. BQ does this by first placing a Gaussian process \citep{rasmussen_gaussian_2006} prior on $f$, which we will briefly review here. 

\paragraph{Gaussian Processes}
Gaussian processes are a Bayesian non-parametric method which model the target data generation function $f$ we are attempting to learn as a joint multivariate Gaussian of the form:
\begin{align}
    f \sim \mathcal{GP}(\bm{\mu}(\bm{x}), k_{\bm{\theta}}(\bm{x}, \bm{x}^\prime)), \\
    \bm{y} = f(\bm{x}) + \epsilon,
\end{align}
where $k_{\bm{\theta}}$ is a positive semi-definite \textit{kernel function} with hyper-parameters $\bm{\theta}$, and $\bm{\mu}$ is a mean function. In the above, we assume an additive and independent Gaussian noise observation model with $\epsilon \sim {\mathcal N}(0, \sigma^2 I)$, where $\bm{y}$ are noisy observations with standard deviation $\sigma$. The kernel $k$ is typically chosen {\em a-priori} to encode known characteristics of the data $\mathcal{D}$ such as periodicity and smoothness.

For inference, the posterior-predictive distribution of $f_*$ for a new data point $\bm{x}_*$, given the training data $\mathcal{D} = \{\bm{x}_i, y_i\}_{i=1}^n$ and Gram matrix $\bm{K}_{\bm{x}\bm{x}} = k_{\bm{\theta}}(\bm{x}, \bm{x}^\prime)$, $\forall \, \bm{x}, \bm{x}^\prime$, is given by $\mathcal{N}(\mu(f_*), \mathrm{Cov}(f_*))$ where
\begin{align}\label{eq:gpmean}
    \mu(f_*) = \bm{K}_{*x}(\bm{K}_{xx} + \sigma^2 \bm{I})^{-1}\bm{y},\\
    \mathrm{Cov}(f_*) = \bm{K}_{**} - \bm{K}_{*x} (\bm{K}_{xx} + \sigma^2 \bm{I})^{-1}\bm{K}_{x*}. \label{eq:gpcov}
\end{align}
In BQ, by setting a GP prior on the integrand $f$ we can leverage the ability of GPs to flexibly and accurately model functions with uncertainty on small data, but it is also advantageous in that we can directly and analytically integrate the integrand GP prior. This is performed using well-known characteristics of Gaussian distributions in order to form a posterior estimate $\fint$ of the integral of $f$. 

Formally, the mean of the BQ estimate of $\fint$ is the expected value over measure $p(\x)$ of the posterior mean of the GP prior \eqref{eq:gpmean} on $f$:
\begin{equation}
    \begin{split} \label{eq:bqmean}
         \fint & = \int_{\x \in \mathcal{R}} k(\x, \bm{X})^{T} \bm{K}^{-1} \bm{y} \, p(\x) \, d\x \\
         & = \bm{y}^T \bm{K}^{-1} \int_{\x \in \mathcal{R}} k(\x, \bm{X}) \, p(\x) \, d\x \\
         & = \mu_{\x}(\bm{X})^T \bm{K}^{-1} \bm{y},
    \end{split}
\end{equation}
where $\mu_{\x}(\bm{X}) = [\mu_{\x}(\bm{x}_1) \dots \mu_{\x}(\bm{x}_n)]$ can be seen as the \textit{kernel mean} over measure $p(\x)$. The variance of this estimate is:
\begin{equation}\label{eq:bqcov}
    \mathbb{V}(\fint) = \int_{\bm{X} \in \mathcal{R}^d}\mu_{\x}(\bm{X}) p(\bm{X}) \, d\bm{X}
\end{equation}
which is notably independent of prior observations $\bm{X}$. 

The mean formulation mirrors that of standard quadrature methods shown in equation \eqref{eq:quadrature}, differing in that weights $\mu_{\x}(\bm{X})^T \bm{K}^{-1}$ are the result of probabilistic learning on observed data $\mathcal{D}$ and associated kernel choice, rather than decided {\em a priori} or by a heuristic.

Under a very limited selection of kernel and sampling measure choices, the mean \eqref{eq:bqmean} and variance \eqref{eq:bqcov} can be calculated analytically \citep{briol_probabilistic_2019}. Most commonly, a Gaussian kernel and Gaussian distribution for the measure $p(\x)$, as proposed by \citet{ohagan_bayeshermite_1991}, is one such case. It is also prudent to note that the measure distribution can be fluid while retaining analytical tractability through use of importance sampling \citep{ghahramani_bayesian_2003, briol_sampling_2017}, while the choice of kernel is more restricted.

In BQ, the limitation of the kernel to certain forms dependent on known closed-form analytical integration over the measure $p(\x)$ gives up one of the greatest advantages of the GP prior: flexible selection of kernels for specific domains. To alleviate this issue, GBQ introduces random Fourier features into the BQ formulation for parametrization of the GP kernel.

\subsection{Random Fourier Features}

As we shall see in Section~\ref{sec:GBQ}, random Fourier features enable the use of \textit{any} shift-invariant kernel in the BQ-GP prior without sacrificing the analytical tractability of the integral posterior. This greatly increases the flexibility of BQ to perform under a variety of problem conditions for which different kernels may be necessary.

Random Fourier features are obtained from the spectral representation of shift-invariant kernels given by Bochner's theorem:
%
\begin{theorem}[Bochner's theorem (\cite{rudin_fourier_2011})]\label{boch}
A shift-invariant kernel $k(\bm{x}, \bm{x}^\prime) = k(\bm{x} - \bm{x}^\prime)$ is positive-definite if and only if it is the Fourier transform of a non-negative measure.
\end{theorem}

Theorem~\ref{boch} is the building block upon which \citet{rahimi_random_2008} introduce random Fourier features (RFFs), which provide a practical means of applying Bochner's theorem to estimate kernel functions in finite dimensions. Following the derivation of \citet{rahimi_random_2008}, if the probability density $p(\bm{\omega})$ is the Fourier transform of $k$, then:
\begin{equation}\label{eq:rff}
    \begin{split}
        k(\bm{x} - \bm{x}^\prime) & = \int_{\mathcal{R}^d} p(\bm{\omega}) e^{j\bm{\omega}^T(\bm{x} - \bm{x}^\prime)} \, d\bm{\omega} \\
        & = \int_{\mathcal{R}^d} p(\bm{\omega}) \cos(\bm{\omega}^T(\bm{x} - \bm{x}^\prime)) \, d\bm{\omega}.
    \end{split}
\end{equation}
For brevity, equation \eqref{eq:rff} provides the formulation for the case that the kernel and data $\bm{x}$ are real-valued, but an alternative formulation exists for the case they are not. 

It can be easily seen that the kernel function $k$ is entirely defined by the choice of density $p(\bm{\omega})$, and several common kernels have known associated densities. For example, if $p(\bm{\omega})$ is multivariate isotropic Gaussian, then \eqref{eq:rff} represents the radial basis function (RBF) kernel. By drawing from the associated $p(\bm{\omega})$ for our choice of kernel, RFFs approximate \eqref{eq:rff} with Monte Carlo by:
\begin{equation}\label{eq:rffmc}
    k(\bm{x}, \bm{x}^\prime) = k(\bm{x} - \bm{x}^\prime) \approx \frac{1}{R} \sum_{r=1}^R \cos(\bm{\omega}_r^T(\bm{x} - \bm{x}^\prime))
\end{equation}
where $R$ is the number of Monte Carlo samples or \textit{Fourier features}.

Alternatively, we can directly parametrize these features $\bm{\omega}$ as GP hyperparameters, which allows for optimal kernels to be learned during training to best adapt to specific problem settings \citep{oliva_bayesian_2016, chang_data-driven_2017, tompkins_black_2019, zhen_learning_2020}.

\section{Generalized Bayesian Quadrature}\label{sec:GBQ}

We build upon these concepts to devise our method, generalized Bayesian quadrature, which enables flexible Bayesian quadrature for use with any arbitrary shift-invariant kernel while maintaining analytical tractability of the kernel mean $\mu_{\x}(\bm{X})$. We begin by showing that a Gaussian density can be approximated with RFFs, which will lead to analytical tractability for general shift-invariant kernels. 

\subsection{Probability Density Functions as RFF Kernels}
% rewrite p as approximately q
Analytical tractability of the BQ mean in \eqref{eq:bqmean} for any kernel represented by RFFs can be achieved by reformulating the kernel mean measure $p(\bm{x})$ as an RFF as well. In general, we can turn any positive-definite probability density function $\pDensity:\locDomain\to[0,\infty)$ on $\locDomain\subseteq\R^\locDim$ into a stationary kernel via the following construction:
\begin{equation}
    \begin{split}
        k_\pDensity: \locDomain\times\locDomain &\longrightarrow \R,\\
        k_\pDensity(\location,\location')  &\longmapsto
        \begin{cases}
            \pDensity(\location-\location')\,, &\location-\location'\in\locDomain,\\
            0\,, &\location - \location' \notin\locDomain\,.
        \end{cases}
    \end{split}
    \label{eq:density-kernel}
\end{equation}
It is easy to verify that a kernel defined as in equation~\eqref{eq:density-kernel} is translation-invariant and positive-definite whenever $\pDensity$ is. As examples of distributions with positive-definite densities we have the Gaussian and the Student-T \citep{rossberg_positive_1995}. Given that many probability distributions can be approximated by these densities, or mixtures of them, kernel modeling of distributions as in \eqref{eq:density-kernel} has a wide range of potential applicability.

\paragraph{RFF Representation of the Gaussian}
Given that an RBF kernel represents an un-normalized Gaussian, by sampling $\bm{\rho}$ from $\mathcal{N}(\bm{0}, \bm{I})$ and using a multivariate Gaussian normalizing constant $\tau^{-1} = [(2\pi)^d |\bm{\Sigma}|]^{-1/2}$, where $\bm{\Sigma}$ is the lengthscale matrix for features $\bm{\rho}$, we can formulate an RFF kernel approximation of a Gaussian density function $q(\x)$, which becomes exact as $R\to\infty$:
\begin{equation} \label{eq:rffdist}
    \begin{split}
        p(\x) \approx q(\x) &= \tau^{-1} \exp\{- |\x - \bm{\mu}|^2 \} \\
        & \approx [(2\pi)^d |\bm{\Sigma}|]^{-1/2} \frac{1}{R} \sum_{r=1}^R \cos(\bm{\rho}_r^T(\bm{x} - \bm{\mu})).
    \end{split}
\end{equation}
This form allows for the use of simple trigonometric identities to form an analytically integrable kernel mean formulation \eqref{eq:bqmean} over a Gaussian measure, which we will shortly demonstrate.

\subsection{Generalized Bayesian Quadrature Posterior}

We now reformulate the BQ mean and variance by substituting the RFF formulations of both the kernel and measure in equations \eqref{eq:rffmc} and \eqref{eq:rffdist} into the BQ mean in equation \eqref{eq:bqmean}.
\begin{multline}
    \fint = \bm{y}^T \bm{K}^{-1} \int_{\x \in \mathcal{R}} \frac{1}{R} \sum_{r=1}^R \cos(\bm{\omega}_r^T(\x - \bm{X})) \\
    \times [(2\pi)^d |\bm{\Sigma}|]^{-1/2} \frac{1}{Z} \sum_{z=1}^Z \cos(\bm{\rho}_z^T(\x - \bm{\mu})) d\x
\end{multline}
The trigonometric form of both the kernel and measure distribution in this setting allows for the application of basic identities to rewrite the integrand as a linear function. Using the identity $\cos(\alpha)\cos(\beta) = \cos(\alpha + \beta) / 2 + \cos(\alpha - \beta) / 2$, and simple properties regarding the anti-derivatives of trigonometric functions, we arrive at the following definition of GBQ over an approximated Gaussian measure $q(\x)$.

\begin{definition}[Generalized Bayesian Quadrature Over Gaussian Measures]\label{def:GBQGauss}
    Given $n$ noisy observations $\{\x_i, y_i\}_{i=1}^n = \{\bm{X}, \bm{y}\}$ of a function $f$ where $\x_i \in \mathcal{R}^d$, a kernel function $k$ parametrized through random Fourier frequencies $\bm{\omega} \in \mathcal{R}^{R \times d}$ sampled from density $p(\bm{\omega})$, a Gaussian measure approximation $q(\x)$ parametrized by Fourier frequencies $\bm{\rho} \in \mathcal{R}^{Z \times d}$ sampled from $\mathcal{N}(\bm{0}, \bm{I})$, and kernel matrix $\bm{K} = [k(\location_i, \location_j^\prime)]_{i,j=1}^\nObs \in \R^{\nObs\times\nObs}$, the GBQ estimate $\fint$ of the mean of the integral of $f$ over domain $\bm{a} \leq \x \leq \bm{b}$ is:
    \begin{equation}
        \fint = \mu_{\x}(\bm{X})^T \bm{K}^{-1} \bm{y} ,
    \end{equation}
    \begin{equation}\label{eq:gaussgbqmean}
        \begin{split}
        \mu_{\x}&(\bm{X}) = \\
            % \Bigg[
                &L \sum_{r=1}^R \sum_{z=1}^Z \frac{h^d(\x^T(\bm{\omega}_r + \bm{\rho}_z) - (\bm{\omega}_r^T\bm{X} + \bm{\rho}_z^T\bm{\mu}))}
                {\prod_{j=1}^d (\omega_r^j + \rho_z^j)}
                \Bigg|_{\bm{a}}^{\bm{b}} \, +\\
                &L \sum_{r=1}^R \sum_{z=1}^Z \frac{h^d(\x^T(\bm{\omega}_r - \bm{\rho}_z) - (\bm{\omega}_r^T\bm{X} - \bm{\rho}_z^T\bm{\mu}))}
                {\prod_{j=1}^d(\omega_r^j - \rho_z^j)}
            \Bigg|_{\bm{a}}^{\bm{b}} \, ,
        \end{split} 
    \end{equation}
    where $d$ is the dimensionality of $\x$, and $h^d$ is the function at the $d$-th index of the repeating series $h = [\sin, -\cos, -\sin, \cos, \sin, \dots]$. The normalization constant $L$ is defined as:
    \begin{equation}\label{eq:bqnormconstant}
        L = (2RZ \times Q_{\bm{a}}^{\bm{b}})^{-1}[(2\pi)^d |\bm{\Sigma}|]^{-1/2} \, ,
    \end{equation}
    where $Q_{\bm{a}}^{\bm{b}} = \int_{\bm{a}}^{\bm{b}} q(\x) d\x$ is an estimate of the CDF of the RFF-parametrized Gaussian $q(\x)$, which is analytically calculable from \eqref{eq:rffdist}.\footnote{See the supplement for the derivation.}
\end{definition}

See the supplement for the full proof, variance derivation, and details of an algorithm for efficient implementation. In addition, the supplement provides a GBQ formulation over uniform measures, which equates to direct integration of the GP integrand $\Bar{f}$. Through Definition~\ref{def:GBQGauss} we obtain an analytical posterior for $\fint$ and $\mathbb{V}(\fint)$ that allows for flexible kernel choice through the use of RFFs.

\subsection{Approximation Error}
\subsubsection{Gaussian Process and Random Fourier Features Error Bounds}
The approximation error of GBQ extends from well-known error bounds derived from the literature of RFFs and BQ respectively. We present here an abbreviated form of this proof, the full version of which can be found in the supplement. 

We begin with the following lemma outlining the error of the GP estimate $\Bar{f}$ to the integrand $f$ under the assumption $f$ is a member of the Hilbert space $\Hspace_k$ defined by kernel $k$:

\begin{lemma}[{\citet[Theorem 1]{durand_streaming_2017}}]
\label{thr:rkhs-ucb}
Assume $\integrand\in\Hspace_k$ and that the observation noise $\obsNoise$ is $\sigma_\obsNoise$-sub-Gaussian. Then the following holds with probability at least $1-\delta$:
\begin{equation}
    \forall\nObs\in\N, |\integrand(\location) - \gpMean_\nObs(\location)| \leq \beta_k(\delta)\sigma_\nObs(\location), \forall\location\in\locDomain\,,
\end{equation}
where $\gpMean_\nObs$ and $\sigma_\nObs^2$ denote the GP posterior mean and variance given $\nObs$ observations, according to \eqref{eq:gpmean} and \eqref{eq:gpcov}, respectively, and
\begin{equation}
    \begin{split}
        \beta_k(\delta) &:= \norm{f}_k\\
        &+ \sigma_\obsNoise\sqrt{\frac{2}{\gpnoisefactor}\log\left(\frac{\det(\eye + \gpnoisefactor^{-1}\mat K_\nObs )^{1/2}}{\delta}\right)},
    \end{split}
\end{equation}
with 
\begin{equation}
    \mat K_\nObs := [k(\location_i, \location_j^\prime)]_{i,j=1}^\nObs \in \R^{\nObs\times\nObs}\,.
\end{equation}
\end{lemma}

\begin{figure*}[ht!]
    \centering
    \includegraphics[scale=0.67]{1d_combined}
    \caption{Function Plots and Bounded Error Graphs for 1D Continuous and Disjoint Polynomial Quadrature Experiments.}\label{fig:1dexp}
\end{figure*}

We follow with a lemma related to the error bounds on the RFF approximation to a shift-invariant kernel $k$.

\begin{lemma}[{\citet[Proposition 1]{sutherland_error_2015}}]
\label{thr:kernel-approximation}
Let $k:\locDomain\times\locDomain\to\R$ be a continuous shift-invariant positive-definite kernel with $k(\location,\location) = 1$ and such that $\nabla^2 k(\location, \location)$ exists, for all $\location\in\locDomain\subset\R^\locDim$. Suppose $\locDomain$ is compact with diameter $\ell_\locDomain < \infty$. Denote $k$'s Fourier transform as $\pMeasure_k$, which is a probability measure, and let $\sigma_k^2 := \expectation[\norm{\vec\omega}_2^2]$ for $\vec\omega\sim\pMeasure_k$. Let $\rff{k}:\locDomain\times\locDomain\to\R$ denote $k$'s RFF approximation with $\nFeatures$ frequencies according to \eqref{eq:rffmc}.
% For any $\error > 0$, let:
% \begin{align}
%     \begin{split}
%         \alpha_\error &:= \min\Bigg(1,\\
%         &\sup_{\location,\location'\in\locDomain} \frac{1}{2} + \frac{k(2\location, 2\location')}{2} - k(\location, \location')^2 + \frac{\error}{3} \Bigg)\
%     \end{split}\\
%     \beta_\locDim &:= \left(\left(\frac{\locDim}{2}\right)^{-\frac{\locDim}{\locDim+2}} + \left(\frac{\locDim}{2}\right)^{\frac{2}{\locDim+2}}\right) 2^{\frac{6\locDim+2}{\locDim+2}}\,
% \end{align}
Then the following holds for any $0 < \error < \sigma_k\ell_\locDomain$:
\begin{multline}
    \prob{\sup_{\location, \location' \in\locDomain} |\rff{k}(\location,\location') - k(\location,\location')| \geq \error}\\
    % \leq \beta_\locDim \left(\frac{\sigma_k\ell_\locDomain}{\error}\right)^{\frac{2}{1+ \frac{2}{\locDim}}} \exp\left(-\frac{\nFeatures\error^2}{4(\locDim+2)\alpha_\error}\right)\\
    \leq 66 \left(\frac{\sigma_k\ell_\locDomain}{\error}\right)^2 \exp\left(-\frac{\nFeatures\error^2}{4(\locDim+2)}\right)\,.
\end{multline}
% where for the second statement we assume $\error \leq \sigma_k\ell_\locDomain$.
Therefore, for any $\delta \in (0, 1)$, we can achieve pointwise approximation error less than $\error$ with probability at least $1-\delta$ if:
\begin{equation}
    \begin{split}
    \nFeatures \geq \nFeatures(\error,\delta,\sigma_k) := \frac{4(\locDim+2)}{\error^2} \left( \frac{2}{1 + \frac{2}{\locDim}} \log\frac{\sigma_k \ell_\locDomain}{\error} + \log\frac{66}{\delta} \right).
\end{split}
\end{equation}
% In particular, for any $0 < \error \leq \sigma_k \ell_\locDomain$, we have:
% \begin{equation}
%     \prob{\sup_{\location, \location' \in\locDomain} |\rff{k}(\location,\location') - k(\location,\location')| \geq \error} \leq 66 \left(\frac{\sigma_k\ell_\locDomain}{\error}\right)^2 \exp\left(-\frac{\nFeatures\error^2}{4(\locDim+2)}\right)\,.
% \end{equation}
\end{lemma}

\subsubsection{Generalized Bayesian Quadrature Error}
Next, we formulate an error bound on the RFF parametrization of the Gaussian (or any arbitrary) density shown in equation \eqref{eq:rffdist}, as we build towards a final bound on GBQ.
\begin{theorem}[Error of the RFF Density Approximation]
\label{thr:density-approximation}
Let $p:\locDomain\to\R$ be a positive-definite probability density function defined on $\locDomain\subset\R^\locDim$ which is such that $\nabla^2 \pDensity(\vec 0)$ exists. Assume $\locDomain$ is compact, and let $\bound_\pDensity > 0$ be any constant such that $\bound_\pDensity \geq \max_{\location\in\locDomain}\pDensity(\location)$. Let $\rff k_\pDensity$ denote an RFF approximation with $Z \in \N$ frequencies to $k_\pDensity$ as defined in \eqref{eq:density-kernel}, and let $\rff\pDensity:\location\mapsto\rff k_\pDensity(\location,\vec 0)$, $\location\in\locDomain$. Then, for any $\error > 0$, the following holds:
\begin{multline}
    \prob{\sup_{\location\in\locDomain} |\rff\pDensity(\location) - \pDensity(\location)| \geq \bound_\pDensity\error} \\
    % \leq \beta_\locDim \left(\frac{\sigma_{k_\pDensity}\ell_\locDomain}{\error}\right)^{\frac{2}{1+ \frac{2}{\locDim}}} \exp\left(-\frac{Z_\pDensity\error^2}{4(\locDim+2)\alpha_\error}\right)\\
    \leq 66\left(\frac{\sigma_{k_\pDensity}\ell_\locDomain}{\error}\right)^2 \exp\left(-\frac{Z\error^2}{4(\locDim+2)}\right),
\end{multline}
where we assume $\error \leq \sigma_{k_\pDensity}\ell_\locDomain$, and $\sigma_{k_\pDensity}$ and $\ell_\locDomain$ are the same as defined in Lemma~\ref{thr:kernel-approximation} for $k:= \frac{1}{\bound_\pDensity}k_\pDensity$.
\end{theorem}

Finally, we combine these results to arrive at the upper bounded error for GBQ as a composition of the errors of GP approximation, RFF approximation, and RFF measure density estimation.

\begin{theorem}[Upper-Bounded Generalized Bayesian Quadrature Error]
\label{thr:sbq-error-bound}
Let $\integrand\in\Hspace_k$, where $k:\locDomain\times\locDomain\to\R$ is a positive-definite, translation-invariant kernel on $\locDomain\subset\R^\locDim$. Assume that:
\begin{enumerate}
    \item $\locDomain$ is compact with diameter $\ell_\locDomain < \infty$ and volume $v_\locDomain := \int_\locDomain\diff\location < \infty$;
    \item $k(\vec 0, \vec 0) = 1$ and $\nabla^2 k(\vec 0, \vec 0)$ exists;
    \item and $\pDensity:\locDomain\to[0,\infty)$ is a positive-definite probability density function.
\end{enumerate}
Then the following holds with probability at least $1-\delta$:
\begin{multline}
    \left\lvert \int_\locDomain \integrand(\location)\pDensity(\location)\diff\location - \int_\locDomain \rff{\gpMean}_\nObs(\location)\rff{\pDensity}(\location)\diff\location \right\rvert\\
    \leq \left(\frac{\nObs}{\gpnoisefactor}\beta_\obsNoise\left(\frac{\delta}{4}\right)\error_k + \beta_k\left(\frac{\delta}{4}\right)\max_{\location\in\locDomain}\sigma_\nObs(\location)\right)\\
    \times (1 + \bound_\pDensity\error_\pDensity v_\locDomain) + \norm{\integrand}_\infty\bound_\pDensity\error_\pDensity v_\locDomain\,,   
\end{multline}
where $\beta_\obsNoise(\delta) := \norm{\integrand}_\infty + \sigma_\obsNoise\sqrt{2\log\left(\frac{\nObs}{\delta}\right)}$,
for an RFF approximation to $k$ with $\nFeatures \geq \nFeatures\left(\error_k, \frac{\delta}{4}, \sigma_k\right)$ frequencies and an RFF approximation to $\pDensity$ with $Z \geq \nFeatures\left(\error_\pDensity, \frac{\delta}{4}, \sigma_{k_\pDensity}\right)$ frequencies, given $\error_k > 0$ and $\error_\pDensity > 0$.
\end{theorem}

We refer the reader to the supplementary material for the full proofs of Theorems~\ref{thr:density-approximation} and~\ref{thr:sbq-error-bound}.

\subsection{Complexity}

We consider here the complexity of calculating the BQ and GBQ mean integral approximation $\fint$ as in \eqref{eq:bqmean} and \eqref{eq:gaussgbqmean}. Traditional BQ over a Gaussian measure $p(\x)$ \eqref{eq:bqmean}, under the assumption $N > d$, has a mean-calculation complexity that is dominated by the operation $\bm{K}^{-1}$, which scales as $\mathcal{O}(N^3)$.

By comparison, the complexity of GBQ is dominated either by the same term or by the cost of the novel estimation of the RFF kernel mean given in Definition~\ref{def:GBQGauss}.

GBQ mean calculation with a Gaussian measure, over definite bounds of dimensionality $d$ for all $N$, scales in $\mathcal{O}(dNRZ)$ with the number of Fourier features $R$ used for kernel $k$ approximation, and the number of Fourier features $Z$ used for RFF approximation $q(\x)$ of Gaussian measure $p(\x)$. Over a uniform measure and definite bounds, which equates to direct integration of the integrand GP $\Bar{f}$, calculation of the GBQ mean scales in $\mathcal{O}(dNR)$ time.

In the case of GBQ over a Gaussian measure, if $dRZ < N^2$, GBQ mean-calculation is also $\mathcal{O}(N^3)$ as in traditional BQ. For GBQ over a uniform measure, if $dR < N^2$, we can assume the same. 

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% EXPERIMENTS %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Experiments}\label{sec:experiments}

We demonstrate here the empirical results of GBQ compared to traditional Monte Carlo quadrature methods and BQ. Specifically, we measure percent error versus the analytical integral solution, with baselines of Monte Carlo (MC) integration, quasi Monte Carlo (QMC) using Halton sequence sampling \citep{halton_efficiency_1960} over a uniform hyper-cube, and BQ with the RBF kernel and a Gaussian measure. 

For GBQ, we present results in the form of GBQ-Measure-Kernel, where the kernel is chosen from the RFF estimates to the RBF, Mat\'ern 1/2 (M1/2), Mat\'ern 3/2 (M3/2), and Mat\'ern 5/2 (M5/2), and the measure is either uniform (U) or Gaussian (G). We hold static the number of integrand observations $f(\x)$ available across all baselines and GBQ models. Additionally, we use the same GP kernel hyperparameters $\bm{\theta}$ in both BQ and GBQ, which are trained once per experiment at each $n$ and shared across all models and kernels. For Fourier features $\bm{\omega}$ and $\bm{\rho}$ in equations \eqref{eq:rffmc} and \eqref{eq:rffdist}, we sample using Halton sequences as well to produce a smoother coverage of the sample space. Finally, we implement these methods in Julia \citep{bezanson_julia_2015}, and code has been made available.\footnote{\url{https://github.com/houstonwarren/GBQ.jl}}

We note that while our experiments consider the employment of the Mat\'ern family of kernels, any shift-invariant kernel can be used in the GBQ integrand prior to adapt to a wide array of problem settings. While there are various analytical solutions to the Mat\'ern family in traditional BQ, they require a kernel-specific integral to be calculated and implemented, and do not exist over all measures $p(\x)$. We provide evidence of the flexibility of our method by showing that Mat\'ern kernels can be implemented without change of problem formulation by simply sampling features $\bm{\omega}$ according to the appropriate frequency distribution.

For all experiments, at each training size $n$ we report results as the average model-wise results over multiple runs under different random seeds, and include information on the error variance over runs. While experiments were run for all kernel-measure combinations for GBQ, for brevity we include here only those models that performed best on a given experiment. 

\begin{table*}
    \centering
    \caption{2D Polynomial of Equation \eqref{eq:2dpoly}. Integration Results (\% Error).}\label{tab:poly2dres}
    \begin{tabular}{|c|c|c|c|c|c|}
        \hline
        $N$ & QMC & BQ & GBQ-U RBF & GBQ-G RBF & GBQ-G M5/2\\
        \hline
        $10$ & $98.78 \pm 7.23$ & $8.57 \pm 6.77$ & $17.03 \pm 9.06$ & $10.27 \pm 5.32$ & $\bm{4.88 \pm 3.73}$\\
        $25$ & $76.57 \pm 16.34$ & $9.69 \pm 7.45$ & $\bm{8.32 \pm 7.16}$ & $8.53 \pm 7.39$ & $11.08 \pm 10.63$\\
        $50$ & $44.92 \pm 5.7$ & $7.81 \pm 2.64$ & $14.77 \pm 2.6$ & $7.33 \pm 3.07$ & $\bm{5.72 \pm 5.22}$\\
        $100$ & $31.02 \pm 3.46$ & $4.02 \pm 3.5$ & $\bm{1.97 \pm 0.88}$ & $4.04 \pm 2.93$ & $2.41 \pm 1.71$\\
        $250$ & $7.97 \pm 1.6$ & $1.22 \pm 1.13$ & $\bm{1.03 \pm 0.93}$ & $2.14 \pm 0.77$ & $1.86 \pm 1.1$\\
        $500$ & $6.07 \pm 0.85$ & $0.68 \pm 0.63$ & $\bm{0.49 \pm 0.53}$ & $1.34 \pm 1.6$ & $1.56 \pm 1.65$\\
        $750$ & $5.51 \pm 0.65$ & $0.73 \pm 0.26$ & $\bm{0.48 \pm 0.38}$ & $1.22 \pm 1.24$ & $1.35 \pm 1.21$\\
        $1000$ & $3.94 \pm 0.46$ & $0.41 \pm 0.26$ & $\bm{0.36 \pm 0.26}$ & $1.41 \pm 1.36$ & $1.52 \pm 1.34$\\
        \hline
    \end{tabular}
\end{table*}

\begin{table*}
    \centering
    \caption{2D Disjoint Polynomial of Equation \eqref{eq:2ddis}. Integration Results (\% Error).}\label{tab:dis2dres}
    \begin{tabular}{|c|c|c|c|c|c|}
        \hline
        $N$ & QMC & BQ & GBQ-U RBF & GBQ-U M1/2 & GBQ-G RBF\\
        \hline
        $10$ & $164.04 \pm 0.34$ & $38.42 \pm 0.72$ & $\bm{8.26 \pm 3.82}$ & $95.64 \pm 12.34$ & $30.79 \pm 3.68$\\
        $25$ & $20.28 \pm 0.75$ & $10.59 \pm 0.75$ & $\bm{2.64 \pm 1.0}$ & $5.06 \pm 4.49$ & $10.17 \pm 0.95$\\
        $50$ & $23.38 \pm 0.28$ & $26.08 \pm 0.3$ & $17.42 \pm 0.7$ & $\bm{12.96 \pm 10.34}$ & $27.14 \pm 0.69$\\
        $100$ & $26.8 \pm 0.24$ & $38.93 \pm 0.23$ & $25.06 \pm 0.3$ & $\bm{5.92 \pm 7.4}$ & $38.26 \pm 0.28$\\
        $250$ & $4.41 \pm 0.16$ & $11.99 \pm 0.16$ & $\bm{2.74 \pm 0.33}$ & $2.99 \pm 2.03$ & $12.01 \pm 0.28$\\
        $500$ & $3.48 \pm 0.09$ & $12.63 \pm 0.09$ & $3.46 \pm 0.12$ & $\bm{2.08 \pm 0.85}$ & $12.68 \pm 0.1$\\
        $750$ & $3.24 \pm 0.07$ & $12.38 \pm 0.07$ & $3.01 \pm 0.06$ & $\bm{2.02 \pm 0.58}$ & $12.24 \pm 0.1$\\
        $1000$ & $0.86 \pm 0.05$ & $9.62 \pm 0.05$ & $\bm{0.61 \pm 0.05}$ & $0.73 \pm 0.18$ & $9.48 \pm 0.08$\\
        \hline
    \end{tabular}
\end{table*}

\subsection{1D Experiments}

Our first experiment is a simple 1D polynomial, used to empirically verify our theoretical results of Section~\ref{sec:GBQ} regarding the efficacy of the GBQ method in both recreating the results of traditional BQ using the RBF kernel and demonstrating the flexibility of kernel choice that GBQ offers.

We model the integral of a polynomial of the form:
\begin{equation}
    f(x) = 0.2x^3(x-4)^2 - 3x -3 \, ,
\end{equation}
in the first case, and a disjoint version of the polynomial
\begin{equation}
    f(x) = 
    \begin{cases}
        0.2x^3(x-4)^2 - 3x -3, &  x < 2.5 \, , \\
        0.2x^3(x-4)^2 - 3x -13, & x \geq 2.5\, ,
\end{cases}
\end{equation}
in the second. The choice of the disjoint polynomial is in order to assess the value of the flexibility of GBQ in enabling varied kernel choice in BQ, and in this case we leverage Mat\'ern kernels, which typically perform better than the RBF on non-smooth data. We use 100 Fourier features in all GBQ models, and run each experiment 10 times under different seeds at each $n$ and report the aggregated mean and 95\% confidence bounds in figure \ref{fig:1dexp}.

In the first experiment, which represents a smoother polynomial, BQ and GBQ both outperform QMC in accuracy as a function of data scarcity. We can see that GBQ-G-RBF is an excellent approximation to BQ, which similarly leverages an RBF kernel over a Gaussian measure. This helps to validate our theoretical results on both the accuracy of the RFF-based integration of the RFF-RBF kernel over a Gaussian measure and the ability of RFFs to parametrize Gaussian distributions.

In the disjoint case, we see that at low $n$, GBQ has a slight advantage over BQ when using the Mat\'ern kernel, but that results converge for all methods as training size increases. While QMC achieves better error at some points, it generally displays more variance over $n$ in this experiment than the BQ and GBQ-based models.

\subsection{2D Experiments}
\begin{figure}[t!]
    \centering
    \includegraphics[scale=0.2]{2d_combined.pdf}
    \caption{Plots of 2D experiment equations \eqref{eq:2dpoly} (top) and \eqref{eq:2ddis} (bottom).}  \label{fig:2dexp}
\end{figure}
We now move to a selection of 2D experiments, first of which is estimating the integral of a polynomial of the form
\begin{multline}\label{eq:2dpoly}
    f(x, y) = -0.005x^4 * 0.1x^3 + y^5(0.02x - 0.08)\\
    - 0.001y^2 + 0.2y + 0.5
\end{multline}
over the interval $x \in [-4, 4], y \in [-2.5, 2.5]$, as well as a disjoint 2D function:
\begin{equation}\label{eq:2ddis}
    f(x, y) = 
    \begin{cases}
        e^{5x + 5y}, &  x < 0.5, y < 0.5 \, , \\
        0, & x \geq 2.5, y \geq 2.5\, ,
\end{cases}
\end{equation}
over the unit cube.

We perform both experiments over a range of training data sizes from $n = 10$ to $n = 1000$, with 5 runs per $n$ at different random seeds. All GBQ models use 300 Fourier features. Plots of these functions can be seen in Figure~\ref{fig:2dexp}, and the means and standard deviations of the results are reported in Tables~\ref{tab:poly2dres} and~\ref{tab:dis2dres}.

In both experiments, we see that GBQ methods have universally lower mean error than QMC and BQ. The best performing kernel varies across $n$, but in several cases we see that the Mat\'ern has the lowest error, supporting the case that flexibility of kernel choice is a valuable addition to the BQ method when considering both different integrand types as well as available training data. 

In the disjoint polynomial experiment, we intentionally include GBQ-G with the RBF (the BQ equivalent) in table \ref{tab:dis2dres}, even though it was not high performing among the GBQ methods, to demonstrate the potential performance enhancement GBQ offers through kernel choice. We see GBQ-G-RBF track closely with BQ, while GBQ-U with the Mat\'ern 1/2 and GBQ-U-RBF in combination perform better at all $n$, and frequently with implied worst-case error bounds well below the BQ mean error. 

\subsection{5D Experiments}

\begin{table*}
    \centering
    \caption{5D Equation \eqref{eq:5dexp} Integration Results (\% Error).}\label{tab:5d}
    \begin{tabular}{|c|c|c|c|c|}
        \hline
        $N$ & MC & BQ & GBQ-U RBF & GBQ-G RBF\\
        \hline
        $10$ & $\bm{9.67 \pm 8.43}$ & $20.39 \pm 3.85$ & $23.77 \pm 4.33$ & $20.35 \pm 3.99$\\
        $25$ & $9.32 \pm 7.7$ & $3.21 \pm 1.87$ & $6.02 \pm 2.46$ & $\bm{3.0 \pm 1.97}$\\
        $50$ & $5.57 \pm 4.14$ & $\bm{0.61 \pm 0.34}$ & $2.48 \pm 0.51$ & $0.88 \pm 0.42$\\
        $100$ & $3.81 \pm 2.1$ & $2.05 \pm 0.35$ & $\bm{0.89 \pm 0.44}$ & $2.25 \pm 0.4$\\
        $400$ & $2.74 \pm 1.7$ & $2.28 \pm 0.2$ & $\bm{0.33 \pm 0.14}$ & $2.44 \pm 0.2$\\
        $700$ & $2.39 \pm 2.43$ & $2.29 \pm 0.12$ & $\bm{0.16 \pm 0.1}$ & $2.43 \pm 0.16$\\
        $1000$ & $1.79 \pm 1.09$ & $2.22 \pm 0.08$ & $\bm{0.14 \pm 0.09}$ & $2.37 \pm 0.13$\\
        \hline
    \end{tabular}
\end{table*}

\begin{table*}
    \centering
    \caption{5D Equation \eqref{eq:5ddis} Integration Results (\% Error).}\label{tab:5ddis}\label{tbl:5ddis}
    \begin{tabular}{|c|c|c|c|c|}
        \hline
        $N$ & MC & BQ & GBQ-G RBF & GBQ-G M3/2\\
        \hline
        $10$ & $\bm{23.94 \pm 13.0}$ & $33.32 \pm 3.0$ & $33.26 \pm 3.12$ & $38.11 \pm 3.78$\\
        $25$ & $\bm{16.84 \pm 20.99}$ & $18.26 \pm 0.86$ & $17.96 \pm 1.08$ & $22.17 \pm 1.22$\\
        $50$ & $\bm{7.58 \pm 5.92}$ & $15.15 \pm 0.59$ & $14.87 \pm 0.6$ & $16.69 \pm 0.7$\\
        $100$ & $5.89 \pm 3.64$ & $\bm{1.71 \pm 0.83}$ & $2.06 \pm 1.28$ & $5.53 \pm 4.55$\\
        $400$ & $3.98 \pm 2.28$ & $1.11 \pm 0.42$ & $1.7 \pm 0.55$ & $\bm{0.79 \pm 0.64}$\\
        $700$ & $3.93 \pm 2.51$ & $1.03 \pm 0.34$ & $1.31 \pm 0.61$ & $\bm{0.85 \pm 0.44}$\\
        $1000$ & $3.24 \pm 2.15$ & $\bm{0.38 \pm 0.24}$ & $0.89 \pm 0.46$ & $0.53 \pm 0.5$\\
        \hline
    \end{tabular}
\end{table*}

We use a 5D problem from a seminal BQ paper \citep{ghahramani_bayesian_2003} to provide an initial evaluation of GBQ in higher dimensions. We model the equation:

\begin{equation}\label{eq:5dexp}
    f(\x) = 10 \sin(\pi x_1 x_2) + 20(x_3 - 0.5) + 10x_4 + 5x_5 ,
\end{equation}

as well as a disjoint variant:

\begin{equation}\label{eq:5ddis}
    f^*(\x) = 
    \begin{cases}
        f(\x)
        & x_i \leq 0.5 \, \forall \, i\, ,\\
        4 \times f(\x)
        & x_i > 0.5 \, \forall \, i \, ,
    \end{cases}
\end{equation}

where observations $y = f(\x) + \epsilon$ and $y^* = f^*(\x) + \epsilon$ have added noise $\epsilon \sim \mathcal{N}(0, \frac{1}{2})$. We perform integration methods over the 5D unit hypercube using 100 Fourier features. Shortened results are provided in tables \ref{tab:5d} and \ref{tab:5ddis} as the average and standard deviation of integral approximation percent error versus the analytical solution across 10 random seeds. Results across all $n$ are available in the supplement.

In the non-disjoint setting, GBQ methods are the highest performing across all experiments with $n > 50$. Notably, we choose to report MC as a baseline rather than QMC, as across both experiments in 5D we see a degradation of QMC methods in favor of simple MC. In the disjoint setting, MC is the highest performing at low $n$, with BQ and GBQ methods performing best at mid to high $n$.

An interesting experimental result was the importance of consistent methodology used for solving the kernel mean $\bm{\mu}_{\x}(\bm{X})$ and producing the kernel matrix $\bm{K}$, when applied in the BQ posterior mean formulation \eqref{eq:bqmean}. Anecdotally, we found that using the combination of a kernel mean derived from traditional BQ and a kernel that was estimated through RFFs, and vice-versa, produced significantly unstable posterior integral mean estimates. These results suggest the benefit of using the full-stack GBQ method with RFF parametrization of both the kernel and measure distribution in order to achieve the best experimental results.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% DISCUSSION %%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\section{Discussion}\label{sec:discussion}

In this paper, we have introduced generalized Bayesian quadrature, a method for performing Bayesian quadrature using any shift-invariant kernel while maintaining posterior tractability. We derive the upper bound on the error of this approximation, while also demonstrating the practical benefits on a selection of quadrature problems when compared to traditional numerical integration methods and baseline BQ.

More broadly, we note the wider applicability of the methods proposed in this paper. Our chief theoretical contribution comes within the framework of Bayesian quadrature, but in essence it is providing the analytical solution to a kernel mean when the kernel and measure distribution are approximated by RFFs. However, kernel means have a wide array of use cases as discussed in Section~\ref{sec:related}, and represent fertile ground for future applications of our theoretical results.

Additionally, applying GBQ over closed bounds in multiple dimensions necessitates a truncation term composed of multivariate cumulative distribution functions. To this end, we devised a method to parametrize distributions using RFFs and analytically integrate this estimate in order to produce a CDF. For many distributions which offer no closed-form multivariate CDF, this method might be of use.

Future research may look into these applications as well as extending the flexibility and computational aspects of the method. Potential extensions include learning the RFF kernel through its spectral density, leveraging low-rank GP posteriors for computational efficiency improvements in kernel matrix inversion in the BQ mean, and composing multiple levels of GBQ together into deeper architectures for applications to highly nonlinear problems. The method introduced in this paper has demonstrated both theoretical and empirical promise that will provide a solid launching point for these pursuits.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% BACK MATTER %%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{acknowledgements} 
Rafael Oliveira was supported by the Medical Research Future Fund for Applied Artificial Intelligence in Health Care grant (MRFAI000097).
\end{acknowledgements}

% \bibliography{references}
\printbibliography

\end{document}
