\begin{appendices}
\crefalias{section}{appendix}
\crefalias{subsection}{appendix}
\crefalias{subsubsection}{appendix}

\setcounter{equation}{0}
\renewcommand{\theequation}{\thesection.\arabic{equation}}

\onecolumn

% {\hrule height 1mm}
\vspace*{-28pt}
\section*{\LARGE\bf \centering Supplementary Material
}
\vspace{8pt}
{\hrule height 0.1mm}
% {\hrule height 0.3mm}
\vspace{24pt}

\section*{Table of Contents}
\vspace*{-10pt}
\startcontents[sections]
\printcontents[sections]{l}{1}{\setcounter{tocdepth}{2}}

\newpage

\section{Theoretical Results}\label{appendix:convergence_rate}

To validate our methodology, we establish a rate at which the CBQ estimator $\hat{I}_\mathrm{CBQ}$ converges to the true $I$ in the $\calL^2(\Theta, \Pb_\mathrm{tr})$ norm, $\|\hat{I}_\mathrm{CBQ} - I\|_{\calL^2(\Theta, \Pb_\mathrm{tr})}=\big(\int_\Theta (\hat{I}_\mathrm{CBQ}(\theta) - I(\theta))^2 \Pb_\mathrm{tr}(\mathrm d \theta )\big)^{1/2}$, for $\Pb_\mathrm{tr}$ such that $\theta_t \sim \Pb_\mathrm{tr}$ for $t \in \{1,\dots,T\}$ \fxb{I think $\Pb_\mathrm{tr}$ is $\mathbb{Q}$ in the main text}. This result is presented in the main text in~\Cref{thm:convergence}. In this section, we prove the more general version of~\Cref{thm:convergence} presented in the main text and intermediate results required, and expand on the technical background.

The proof primarily builds on two results:~\cite[Theorem 4]{wynne2021convergence} is used to get a bound in Stage 1, and~\cite[Theorem 4]{gogolashvili2023importance} is used to get a bound in Stage 2. Specifically,~\cite[Theorem 4]{gogolashvili2023importance} is used to establish a bound on $\|\hat{I}_\mathrm{CBQ} - I\|_{\calL^2(\Theta, \Pb_\mathrm{tr})}$ in terms of $T$ (the number of samples in $\Theta$), and the largest value for BQ variance, $\max_{t \in \{1,\dots,T\}}\sigma^2_\mathrm{BQ}(\theta_t)$. Then,~\cite[Theorem 4]{wynne2021convergence} is used to bound the variance $\sigma^2_\mathrm{BQ}(\theta_t)$ for any $t \in \{1,\dots,T\}$ in terms of $N$ (the number of samples in $\calX$). 

In~\Cref{sec:connection_to_iwkrr}, we define the weight function $w(\theta)$ that establishes a connection between Stage 2 of the method and the setting of importance-weighted kernel ridge regression in~\cite[Theorem 4]{gogolashvili2023importance}. Then in~\Cref{sec:technical_assumptions} we present technical assumptions under which both~\cite[Theorem 4]{wynne2021convergence} and~\cite[Theorem 4]{gogolashvili2023importance} for the defined $w(\theta)$ hold. Finally, in~\Cref{sec:proof_of_convergence} we prove a more general form of~\Cref{thm:convergence} for $\lambda_\calX \geq 0$ and $\theta_{1:T}$ sampled from a distribution \textcolor{red}{that does not necessarily have a density.}

\subsection{Connection to Importance-Weighted Kernel Ridge Regression}
\label{sec:connection_to_iwkrr}

Recall the CBQ estimator proposed in Section~\ref{sec:cbq},
%
\begin{align*}
    \hat{I}_\mathrm{CBQ}(\theta) = k_\Theta(\theta, \theta_{1:T})^\top \big(k_\Theta(\theta_{1:T}, \theta_{1:T}) + (\lambda_\Theta + \sigma^2_\mathrm{BQ}(\theta_{1:T})) \Id_T\big)^{-1} \hat{I}_\mathrm{BQ}(\theta_{1:T}),
\end{align*}
%
where $\lambda_\Theta\geq0$ is the regularisation parameter, and $\hat{I}_\mathrm{BQ}(\theta_t)$ and $\sigma^2_\mathrm{BQ}(\theta_t)$, for $t \in \{1, \dots, T\}$, are BQ posterior mean and variance obtained in the first stage,
%
\begin{align*}
    \hat{I}_\mathrm{BQ}(\theta_t) & = \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N}, \theta_t),\\
    \sigma^2_\mathrm{BQ}(\theta_t) &= \mathbb{E}_{X,X'\sim \mathbb{P}_\theta}[k_{\calX}(X,X')] - \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N}).
\end{align*}
%
It was pointed out in~\citep[Remark 2]{gogolashvili2023importance} (and can be seen through straightforward differentiation) that the estimator $\hat{I}_\mathrm{CBQ}$ is the minimiser of the importance-weighted kernel ridge regression loss over functions in the RKHS $\calH_\Theta$ induced by the kernel $k_\Theta$,
%
\begin{align*}
    \hat{I}_\mathrm{CBQ} = \argmin_{F \in \calH_\Theta} \Big\{ \sum_{t=1}^T \frac{\tau}{1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t)} \big(F(\theta_t) - \hat{I}_\mathrm{BQ}(\theta_t)\big)^2 + \tau \lambda_\Theta \| F \|_{\calH_\Theta}^2 \Big\},
\end{align*}
%
for any $\tau>0$.\footnote{We will keep $\tau$ as a free parameter for now, and use it in~\Cref{sec:proof_of_convergence} to ensure constraints on $\lambda_\Theta$ imposed by~\cite[Theorem 4]{gogolashvili2023importance} are satisfied} Suppose $\theta_i$ are sampled from a probability measure $\Pb_\mathrm{tr}$ on $\Theta$. Then, 
%
\begin{align}
\label{eq:p_te}
    \Pb_\mathrm{te}(A) = \int_A w(\theta)\Pb_\mathrm{tr}(\mathrm{d} \theta)
\end{align}
%
defines a positive measure $\Pb_\mathrm{te}$ on $\Theta$ for any positive $w(\theta) > 0$ for which the integral exists~\citep[ Proposition 232D]{fremlin2000measure}; further, if $w(\theta)$ is bounded, the measure is finite. Suppose we construct a $w(\theta)$ that satisfies these requirements, and is such that $w(\theta_t) = \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$. Then, since $\E [\hat{I}_\mathrm{BQ}(\theta_i)] = I(\theta_i)$, the importance-weighted loss can be considered an unbiased finite-sample approximation of
\begin{align*}
    \int_\Theta ( F(\theta) - I(\theta) )^2 \Pb_\mathrm{te}(\mathrm{d} \theta) + \frac{\tau \lambda_\Theta}{T} \| F \|^2_{\calH_\Theta}.
\end{align*}
%
Under a further assumption that the problem is well-specified, meaning $I(\theta) \in \calH_\Theta$, an upper bound on $\|\hat{I}_\mathrm{CBQ} - I\|_{\calL^2(\Theta, \Pb_\mathrm{te})}$ in terms of $T$ and $\sup_{\theta \in \Theta} w(\theta)$ was established in~\citep[Theorem 4]{gogolashvili2023importance}. To apply the result, we define $w(\theta)$ of convenient form that satisfies the requirements mentioned above, specifically $w(\theta) \in (0, W_0]$ for some $W_0<\infty$ and any $\theta \in \Theta$, and $w(\theta_t) = \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$ for all $t \in \{1, \dots,T\}$.\footnote{The integrability requirement is specific to $\Pb_\mathrm{tr}$ and will be assumed at a later stage.} Take $\sigma^{2}_\mathrm{BQ}(\theta_{t'}) = \max_{t \in \{1,\dots,T\}}\{\sigma^{2}_\mathrm{BQ}(\theta_t)\}>0$, and define 
%
\begin{align}
\begin{split}
\label{eq:weight_function}
    w(\theta) = \begin{cases}
        \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1} & \text{ if } \|\theta - \theta_t\|_\Theta  \geq \varepsilon' \text{ for all } t \in \{1,\dots,T\} \\
        \tau A_t - \tau B_t \frac{\|\theta - \theta_t\|_\Theta }{\varepsilon'}, & \text{ for } t \text{ such that } \|\theta - \theta_t\|_\Theta  < \varepsilon' \\
    \end{cases}
\end{split}
\end{align}
%
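for $\| \cdot\|_\Theta$ the Euclidean norm on $\Theta$, some fixed $0<\varepsilon' \leq \frac{1}{2}\min_{i,j \in \{1,\dots,T\},\ i \neq j} \|\theta_i-\theta_j\|_\Theta$ (so that the $\varepsilon'$-balls around distinct $\theta_t$ do not overlap and $w$ is well-defined), and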
%
\begin{align*}
    A_t = (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}  \qquad \text{and} \qquad
    B_t = (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1} - (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}.
\end{align*}
%
For $\Theta \subset \R$, such $w(\theta)$ is easily visualised, as can be seen in~\Cref{fig:wtheta_illustration}.
\begin{figure}[H]
\centering \includegraphics[width=0.6\textwidth]{figures/wtheta_example_upd.pdf}
\caption{Illustration of $w(\theta)$ for $\Theta \subset \R$}
\label{fig:wtheta_illustration}
\end{figure}
It is easy to see that $w(\theta)$ is bounded above by $\tau \max_{t \in \{1,\dots,T\}} (1 + \lambda_\Theta^{-1} \sigma^{2}_\mathrm{BQ}(\theta_{t}))^{-1} < \tau$, and below by $\tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}>0$ for any $\theta \in \Theta$, and $w(\theta_t) = \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$ as required. 

Note that the weight $w(\theta)$ constructed here is by no means a unique way to establish a useful connection between our setting of heteroscedastic GP regression, and importance-weighted kernel ridge regression. As will become evident in the proofs in~\Cref{sec:proof_of_convergence}, one could use any $w(\theta)$ provided it satisfies $w(\theta_t) = \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$ for any $t$, is bounded below by some function of $\sigma^2_\mathrm{BQ}(\theta_{t'})$, and is bounded above by an expression that does not grow in $T$ or $N$. Our proposed construction is simple and easy to visualise, and the parameter $\varepsilon'$ has no impact on the speed of convergence as we shall see in the results in~\Cref{sec:proof_of_convergence}.

\subsection{Technical Assumptions}
\label{sec:technical_assumptions}

Prior to presenting our findings, we present and justify the assumptions we have made. Throughout we use Sobolev spaces to quantify function smoothness. A Sobolev space $\calW^{2, s}(\calX, \mu)$, with $s>d/2$ and a measure $\mu$ on $\calX \subseteq \R^d$, consists of functions that satisfy certain conditions: they are square integrable under the measure $\mu$, and all weak derivatives up to and including order $s$ are also square integrable under $\mu$. Weak derivatives are a generalization of ordinary derivatives, allowing for functions that are not necessarily differentiable everywhere. 

Further, we assume the kernels $k_\Theta, k_\calX$ are Sobolev kernels, meaning they induce Hilbert spaces that are norm-equivalent to Sobolev spaces. Important examples of Sobolev kernels include Mat\'ern kernels and Stein kernels; we expand on these in~\Cref{sec:on_sobolev_kernels}, and refer to~\cite{adams2003sobolev} for an in-depth treatment of Sobolev spaces and~\cite{berlinet2011reproducing} for general RKHS theory.

The following is a more general form of the assumptions in~\Cref{thm:convergence}: specifically, we allow for the case when $\theta_{1:T}$ came from a distribution other than uniform, and do not assume $\lambda_\calX=0$.

\begin{enumerate}[label=(\alph*)]
    \item $\calX \subset \R^d$ is open, convex, and bounded.
    \label{as:app_domains_x}
    \item $\Theta\subset \R^p$ is open, convex, and bounded.
    \label{as:app_domains_theta}
    \item $\theta_t$ were sampled i.i.d. from some $\Pb_\mathrm{tr}$, and $\Pb_\mathrm{tr}$ is equivalent to the uniform distribution on $\Theta$, meaning $\Pb_\mathrm{tr}(A)=0$ for a set $A \subset \Theta$ if and only if $\operatorname{Unif}(A)=0$.
    \label{as:app_theta_samples}
    \item $x_{1:N}^t \sim \Pb_{\theta_t}$ for all $t \in \{1, \dots, T\}$.
    \label{as:app_x_samples}
    \item $\Pb_\theta$ has a density $p_\theta$ for any $\theta \in \Theta$, and the densities are such that $\inf_{\theta \in \Theta, x \in \calX} p_{\theta}(x)=\eta>0$ and $\sup_{\theta \in \Theta}\|p_{\theta}\|_{\calL^2(\calX)}=\eta_0<\infty$.
    \label{as:app_densities}
    \item $k_\calX$ is a Sobolev kernel of smoothness $s_\calX \in (d/2, s_f]$.
    \label{as:app_kernel_x}
    \item $k_\Theta$ is a Sobolev kernel of smoothness $s_\Theta \in (p/2, s_I]$.
    \label{as:app_kernel_theta}
    \item For any $\theta$, $f(x, \theta)$ lies in the Sobolev space $\calW^{2, s_f}(\calX)$.
    \label{as:app_true_f_smoothness} 
    \item $I(\theta)$ lies in the Sobolev space $\calW^{2, s_I}(\Theta)$.
    \label{as:app_true_I_smoothness}
    \item For the integral operator $L'[f](\theta)=\int_\Theta f(\theta') k_\Theta(\theta, \theta') \Pb_\mathrm{tr}(\text{d} \theta')$ that maps $f \in \calL^2(\Theta, \Pb_\mathrm{tr})$ to $L'[f] \in \calH_\Theta$, there is a $g \in \calL^2(\Theta, \Pb_\mathrm{tr})$ such that $I={L'}^r g$ for some $r \in [1/2, 1]$. We denote $R_0=\| g \|_{\calL^2(\Theta, \Pb_\mathrm{tr})}$. \fxb{As far as I can tell this operator is not defined in the paper} \masha{fixed. $L^r$ is strictly speaking still not defined. I don't think it should be, too much detail for something we ultimately don't use in the main text. And the definition is painful, it includes the eigenvalue decomposition of $L'$. I put a discussion in below.}
    \label{as:app_src}
    \item $\lambda_\Theta = cT^{1/(2r+1)}$, for $c>0$.
    \label{as:app_regulariser_theta}
    \item $\lambda_\calX \geq 0$.
    \label{as:app_regulariser_x}
\end{enumerate}

In the kernel literature, Assumption~\cref{as:app_src} is known as a \emph{source condition}, and is frequently used to quantify the difficulty of the problem: the larger $r$, the easier it is to learn $I(\theta)$ with $k_\Theta$, and consequently we may expect faster convergence. The result in~\Cref{thm:convergence} is given for $r=1/2$: by Mercer's theorem, the source condition is satisfied for $r=1/2$ since the true function $I(\theta)$ lies in the RKHS of $k_\Theta$ by~\cref{as:app_true_I_smoothness,as:app_kernel_theta}. Intuitively, the more smooth $I(\theta)$ is in relation to the RKHS of $k_\Theta$, the greater the value of $r \geq 1/2$ for which the source condition is satisfied will be. For further detail on integral operators, their $r$-powers, and the source condition, we refer to the overview in~\cite{gogolashvili2023importance}.

The rest of the assumptions correspond to assumptions in~\Cref{thm:convergence} as follows:
\cref{as:app_domains_x,as:app_domains_theta} form~\hyperref[as:domains]{A1},
\cref{as:app_theta_samples,as:app_x_samples} form a more general form of~\hyperref[as:pars_and_samples]{A2},
\cref{as:app_densities} is~\hyperref[as:densities]{A3},
\cref{as:app_kernel_x,as:app_kernel_theta} form~\hyperref[as:kernels]{A4}, and
\cref{as:app_true_f_smoothness,as:app_true_I_smoothness} are specified in the text of~\Cref{thm:convergence} prefacing the list of assumptions. Assumption~\cref{as:app_regulariser_theta} is mentioned in the text of~\Cref{thm:convergence} following the list of assumptions, and~\cref{as:app_regulariser_x} is the more general form of the condition $\lambda_\calX = 0$ in~\Cref{thm:convergence}.

Crucially, in the proofs in the next section we will see that~\cref{as:app_domains_x,as:app_domains_theta,as:app_theta_samples,as:app_x_samples,as:app_densities,as:app_kernel_x,as:app_kernel_theta,as:app_true_f_smoothness,as:app_true_I_smoothness,as:app_src,as:app_regulariser_theta,as:app_regulariser_x} imply that the setting of the model in Stage 1 satisfies the assumptions of~\cite[Theorem 4]{wynne2021convergence}, and the setting of the model in Stage 2 satisfies the assumptions of~\cite[Theorem 4]{gogolashvili2023importance}---the two key results we will use to prove the convergence rate of the estimator.

% \subsubsection{Discussion of Technical Assumptions}
% Crucially,~\cref{as:app_domains_x,as:app_domains_theta,as:app_theta_samples,as:app_x_samples,as:app_densities,as:app_kernel_x,as:app_kernel_theta,as:app_true_f_smoothness,as:app_true_I_smoothness,as:app_src,as:app_regulariser_theta,as:app_regulariser_x} imply that the setting of the model in Stage 1 satisfied the assumptions of~\cite[Theorem 4]{wynne2021convergence}, and the setting of the model in Stage 2 satisfied the assumptions of~\cite[Theorem 4]{gogolashvili2023importance}---the two key results we will use to prove the convergence rate of the estimator. We discuss and demonstrate this now.

% \paragraph{Assumptions 1-4 of~\cite[Theorem 4]{gogolashvili2023importance} for Stage 2}
% Assumption 1 states that the model is well-specified, $I(\theta) \in \calH_\Theta$, and is trivial in our case: it is well-known that $\calH_\Theta \simeq \calW^{2,\nu_\Theta+p/2}(\Theta)$: the RKHS of a Mat\'ern kernel of order $\nu_\Theta$ over an open, convex and bounded $\Theta \subset \R^p$ is norm-equivalent to the Sobolev space $W^{2,\nu_\Theta+p/2}(\Theta)$~\citep[Corollary 10.48]{Wendland2005}\footnote{Strictly speaking, the result in~\citep[Corollary 10.48]{Wendland2005} is stated for integer-order spaces only, meaning $\nu_\Theta+p/2 \in \mathbb Z$. 
% However, it can be straightforwardly extended to fractional-order Sobolev-Slobodeckij spaces using~\cite[Corollary 10.13]{Wendland2005} that says the RKHS of a Mat\'ern kernel on $\R^p$ is norm-equivalent to a Bessel potential space, which in turn is norm-equivalent to the Sobolev-Slobodeckij space by~\cite[Section 7.62]{adams2003sobolev}, and finally using an extension operator in~\cite[Theorems 6.1 and 6.7]{devore1993besov} to restrict this to open, convex and bounded subsets of $\R^p$.}. By~\cref{as:app_true_I_smoothness} $I(\theta) \in \calW^{2, s_I}(\Theta)$, and finally by the inclusion of Sobolev spaces $I(\theta) \in \calW^{2,\nu_\Theta+p/2}(\Theta)$.

% Proving that Assumptions 2, 3 and 4 in~\citep{gogolashvili2023importance} hold under~\cref{as:app_domains_x,as:app_domains_theta,as:app_theta_samples,as:app_x_samples,as:app_densities,as:app_kernel_x,as:app_kernel_theta,as:app_true_f_smoothness,as:app_true_I_smoothness,as:app_src,as:app_regulariser_theta,as:app_regulariser_x} for $w(\theta)$ requires more care; we state that they hold in the following lemmas.

% \begin{lemma}[Assumption 2 in~\citep{gogolashvili2023importance}]
% \label{lemma:assumption2}
%     Under the Assumption~\cref{as:app_src} it holds that $I=L^r g$, for the integral operator $L: \calL^2(\Theta, \Pb_\mathrm{te}) \to \calH_\Theta$ and $\Pb_\mathrm{te}$ as defined in~\Cref{eq:p_te}, and some $g \in \calL^2(\Theta, \Pb_\mathrm{te})$ of norm $R \leq \tau R_0$.
% \end{lemma}
% \begin{proof}
%     We assumed that the statement holds for $\Pb_\mathrm{tr}$ in Assumption~\cref{as:app_src}. By definition of $\Pb_\mathrm{te}$, for any $\Pb_\mathrm{te}$-integrable $g':\Theta \to \R$ it holds that $\int_\Theta g'(\theta) \Pb_\mathrm{te}{\mathrm d \theta} = \int_\Theta g'(\theta) w(\theta) \Pb_\mathrm{tr}{\mathrm d \theta}$. Since $w(\theta)$ is bounded above and below away from zero, $\calL^2(\Theta, \Pb_\mathrm{tr})$ is norm-equivalent to $\calL^2(\Theta, \Pb_\mathrm{te})$. Therefore, $g \in \calL^2(\Theta, \Pb_\mathrm{tr})$ and the statement holds for $R\leq \tau R_0$ as $w(\theta) \leq \tau$ by construction.
% \end{proof}

% \begin{lemma}[Assumption 3 in~\citep{gogolashvili2023importance}]
% \label{lemma:assumption3}
%     For $w(\theta)$ as defined in~\Cref{eq:weight_function} and $q=1$, $W= \tau$, and $\sigma^2= \| \Theta \| \tau$ it holds for all $m \in \N$, $m \geq 2$, that
%     \begin{align*}
%         \left( \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^q \leq \frac{1}{2} m! W^{m-2} \sigma^2
%     \end{align*}
% %
% \end{lemma}
% \begin{proof}
% Since $w(\theta)$ is bounded from above, $\int_\Theta w(\theta)^m \Pb_\mathrm{tr}(\mathrm d \theta) < \| \Theta \| \tau^m \max_t (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-m} < \| \Theta \| \tau$, and the result follows.
% \end{proof}

% \begin{lemma}[Assumption 4 in~\citep{gogolashvili2023importance}]
% \label{lemma:assumption4}
%     Under the Assumption~\cref{as:app_kernel_theta}, for $s'=p/(2\nu_\Theta + p)$ it holds that
%     \begin{align*}
%         E_{s'} = \max \left(1, \sup_{\lambda \in (0,1]} \sqrt{\sum_{i=1}^\infty \frac{\mu_i \lambda^{s'}}{\mu_i + \lambda}}  \right) < \infty.
%     \end{align*}
% %
% \end{lemma}
% \begin{proof}
% It is a standard result (see, for instance,~\citep[Section 3.3.4]{edmunds1996function}) that for $k_\Theta$ being a Mat\'ern kernel of order $\nu_\Theta$, the $i$-th eigenvalue decays at the rate of $i^{-\frac{2\nu_\Theta + p}{p}}$. As pointed out in the discussion after Assumption 4 in~\citep{gogolashvili2023importance}, this implies $E_{s'}< \infty$ holds for $s'=p/(2\nu_\Theta + p)$.
% \end{proof}

% \paragraph{Assumptions 1-5 of~\cite[Theorem 4]{wynne2021convergence} for Stage 1}



\subsection{Proofs}
\label{sec:proof_of_convergence}

We are now ready to state the bound on $\| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{tr})}$, which is essentially a corollary of~\cite[Theorem 4]{gogolashvili2023importance}. This bound depends on the largest BQ variance $\max_{t \in \{1,\dots,T\}}\sigma^2_\mathrm{BQ}(\theta_t)$; we obtain a bound on BQ variance $\sigma^2_\mathrm{BQ}(\theta_t)$ for any $t$ in~\Cref{thm:bound_on_bq_var}. Combining the two results gives~\Cref{thm:convergence_generalised}, which is the generalised version of~\Cref{thm:convergence}. 

Before proving our corollary, we point out that $\Pb_\mathrm{te}$ as defined in~\Cref{eq:p_te} is a finite positive measure and not necessarily a probability measure; meanwhile in the statement of~\cite[Theorem 4]{gogolashvili2023importance} $\Pb_\mathrm{te}$ is asked to be a probability measure. This is not an issue since $\Pb_\mathrm{te}$ being a probability measure is never used in the proof of~\cite[Theorem 4]{gogolashvili2023importance}---instead, the proof only asks that $\Pb_\mathrm{te}$ be a finite positive measure. We will therefore make use of Theorem 4 for a finite positive measure $\Pb_\mathrm{te}$.

\begin{cor}
\label{thm:krr_corollary}
    Suppose Assumptions~\cref{as:app_domains_theta,as:app_theta_samples,as:app_kernel_theta,as:app_true_I_smoothness,as:app_src,as:app_regulariser_theta} hold. Then, for a fixed $\delta$, there is a $T_0(\delta)>0$ such that for all $T \geq T_0(\delta)$ with probability at least $1-\delta/2$ it holds that
    \begin{align*}
        \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \leq K_0( \log(12/\delta) + K_3) (1 + c^{-1} T^{-1/(2r+1)} \sigma^2_\mathrm{BQ}(\theta_{t'})) T^{-r/(2r+1)},
    \end{align*}
%
for $\sigma^2_\mathrm{BQ}(\theta_{t'})=\max_{t \in \{1,\dots,T\}}\sigma^2_\mathrm{BQ}(\theta_t)$, $K_0 = 16 (M + \|I\|_{\calH_\Theta}) (1+\tau^{-1/2} \|\Theta\|^{1/2}) c^{-1/2}$, and $K_3 = R_0 c^{r+1/2}  (M + \|I\|_{\calH_\Theta})^{-1} (1+\tau^{-1/2} \|\Theta\|^{1/2})^{-1} /16$.
\end{cor}
\begin{proof}
    First, we show the assumptions in the Theorem hold for $R\leq \tau R_0$, the $r$ in Assumption~\cref{as:app_src}, $q=1$, $W= \tau$, and $\sigma^2= \| \Theta \| \tau$.
    
    {Assumption 1 (Existence of the target function):} As discussed in~\cite{gogolashvili2023importance}, Assumption 1 holds if the model is well-specified, $I(\theta) \in \calH_\Theta$, and the RKHS induced by $k_\Theta$ is dense in $\calL^2(\Theta, \Pb_\mathrm{te})$. Both of these conditions hold by Assumption~\cref{as:app_true_I_smoothness}: the former holds by inclusion of Sobolev spaces as $s_\Theta \leq s_I$, and the latter holds as the RKHS is a Sobolev space $\calW^{2, s_\Theta}(\Theta)$, which is dense in $\calL^2(\Theta)$ for an open and bounded $\Theta$. Finally, $\calL^2(\Theta)$ is norm-equivalent to $\calL^2(\Theta, \Pb_\mathrm{te})$ as $w(\theta)$ is bounded above and below away from zero, and $\Pb_\mathrm{tr}$ is equivalent to the uniform distribution.
    
    {Assumption 2 (The smoothness of the target function):} We assumed that the Assumption holds for $\Pb_\mathrm{tr}$ in~\cref{as:app_src}. By definition of $\Pb_\mathrm{te}$, for any $\Pb_\mathrm{te}$-integrable $g':\Theta \to \R$ it holds that $\int_\Theta g'(\theta) \Pb_\mathrm{te}(\mathrm d \theta) = \int_\Theta g'(\theta) w(\theta) \Pb_\mathrm{tr}(\mathrm d \theta)$. Since $w(\theta)$ is bounded above and below away from zero, $\calL^2(\Theta, \Pb_\mathrm{tr})$ is norm-equivalent to $\calL^2(\Theta, \Pb_\mathrm{te})$. Therefore, $g \in \calL^2(\Theta, \Pb_\mathrm{te})$ and the Assumption holds for $R\leq \tau R_0$ as $w(\theta) \leq \tau$ by construction.
    
    {Assumption 3 (Importance-weighting function):} For $w(\theta)$ as defined in~\Cref{eq:weight_function} and $q=1$, $W= \tau$, and $\sigma^2= \| \Theta \| \tau$ it holds for all $m \in \N$, $m \geq 2$, that
    \begin{align*}
        \left( \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^q \leq \frac{1}{2} m! W^{m-2} \sigma^2
    \end{align*}
    since $w(\theta)$ is bounded from above, $\int_\Theta w(\theta)^m \Pb_\mathrm{tr}(\mathrm d \theta) < \| \Theta \| \tau^m \max_t (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-m} < \| \Theta \| \tau$.
    
    {Assumption 4 (Effective dimension):} It is a standard result (see, for instance,~\citep[Section 3.3.4]{edmunds1996function}) that for a Sobolev kernel $k_\Theta$ of smoothness $s_\Theta$, the $i$-th eigenvalue decays at the rate of $i^{-2s_\Theta/p}$. As pointed out in the discussion after Assumption 4 in~\citep{gogolashvili2023importance}, this implies Assumption 4 holds for $s'=p/(2s_\Theta)$.

    Therefore by~\citep[Theorem 4]{gogolashvili2023importance}, for $\lambda=\tau \lambda_\Theta T^{-1} = \tau c T^{-(1-1/(2r+1))}$ and the weight function $w(\theta)$ defined in~\eqref{eq:weight_function}, we have that with probability at least $1-\delta/2$
    \begin{align}
    \label{eq:l2_in_pte}
        \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{te})} \leq T^{-r\beta} \left( 16 (M + \|I\|_{\calH_\Theta}) (W+\sigma E_{s'}^{1-q}) c^{-A/2} \log(12/\delta) + c^r R \right)
    \end{align}
%
    provided\footnote{We omit the definition of $E_{s'}$ intentionally as, since $q=1$, it is always raised to the power of zero in this work.}
    \begin{align}
    \label{eq:cond_on_c}
        \lambda=\tau c T^{-(1-1/(2r+1))} \leq 1, \qquad
        \tau c \geq \left( 64 (W+\sigma^2) E_{s'}^{2(1-q)} \log^2(12/\delta) \right)^{1/(1+A)},
    \end{align}
%
    for the constants $W= \tau$, $\sigma^2= \| \Theta \| \tau$, $q=1$, $r \in [1/2, 1]$, $R \leq \tau R_0$, and $A=1$, $\beta=1/(2r+1)$.
    Then, the conditions on $c$ in~\eqref{eq:cond_on_c} become
    \begin{align}
    \label{eq:condition_on_c}
        c \in [8\tau^{-1/2} \log(12/\delta) \left( 1+\| \Theta \| \right)^{1/2}, \tau^{-1} T^{1-1/(2r+1)}].
    \end{align}
%
    We denote the smallest $T$ for which this holds by $T_0$. The rate in~\eqref{eq:l2_in_pte} becomes
    \begin{align*}
        \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{te})} \leq T^{-\frac{r}{2r+1}} \left( 16 (M + \|I\|_{\calH_\Theta}) (\tau+\tau^{1/2} \|\Theta\|^{1/2}) c^{-1/2} \log(12/\delta) + \tau c^r R_0 \right).
    \end{align*}
%
    Since $\Pb_\mathrm{te}(\mathrm d \theta) = w(\theta)\Pb_\mathrm{tr}(\mathrm d \theta)$, and $w(\theta) \geq \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}>0$ for all $\theta$, it holds that
    \begin{align*}
        \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{te})} &\geq \min_{\theta \in \Theta} w(\theta)\| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \\
        &\geq \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1} \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{tr})},
    \end{align*}
    and therefore
    \begin{align*}
        &\| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \leq (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'})) T^{-r/(2r+1)} \\
        &\qquad \qquad \qquad \times \left( 16 (M + \|I\|_{\calH_\Theta}) (1+\tau^{-1/2} \|\Theta\|^{1/2}) c^{-1/2} \log(12/\delta) + c^r R_0 \right),
    \end{align*}
    %
    and we arrive at the statement of the corollary.
\end{proof}

The need to introduce $\tau$ in~\Cref{sec:connection_to_iwkrr} is clear now: without it, the condition on $c$ in~\Cref{eq:condition_on_c} may not hold. Since $\tau>0$ can be selected at will, we may set it to the smallest value for which~\Cref{eq:condition_on_c} holds.

Next, we establish a bound on $\sigma^2_\mathrm{BQ}(\theta_t)$ for any $t \in \{1,\dots,T\}$.

\begin{theorem}
\label{thm:bound_on_bq_var}
    Suppose Assumptions~\cref{as:app_domains_x,as:app_x_samples,as:app_densities,as:app_kernel_x,as:app_true_f_smoothness,as:app_regulariser_x} hold. Then, for a fixed $\delta$, there is an $N_0>0$ such that for all $N \geq N_0$ with probability at least $1-\delta/2$ it holds that
    \begin{align*}
        \sigma^2_\mathrm{BQ}(\theta_t) \leq  \lambda_\calX + \frac{4}{\delta^2}\eta^2_0 K^2_1 K_2^d N^{-1 + 2\varepsilon} \left( K_2^{s_\calX-d/2} N^{-s_\calX/d+1/2 + \varepsilon} + \lambda_\calX \right)^2
    \end{align*}
    for any $t \in \{1,\dots,T\}$, any arbitrarily small $\varepsilon>0$, and $K_1$, $K_2$ independent of $N,t,\varepsilon$.
\end{theorem}
\begin{proof}
Recall
%
\begin{align*}
    \hat{I}_\mathrm{BQ}(\theta_t) & = \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N}, \theta_t),\\
    \sigma^2_\mathrm{BQ}(\theta_t) &= \mathbb{E}_{X,X'\sim \mathbb{P}_\theta}[k_{\calX}(X,X')] - \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N}).
\end{align*}
%
We seek to bound $\sigma^2_\mathrm{BQ}(\theta_t)$. It was pointed out in~\citep[Proposition 3.8]{kanagawa2018gaussian} that the posterior variance under Gaussian noise is the worst-case error in $\calH_{\calX}^{\lambda_\calX}$, the RKHS induced by the kernel $k_\calX^{\lambda_\calX}(x, x') = k_\calX(x, x') + \lambda_\calX$. Through straightforward algebraic manipulations and using the reproducing property, one can show that
%
\begin{align}
\label{eq:variance_bound_proof_1}
    \sigma^2_\mathrm{BQ}(\theta_t) - \lambda_\calX = \mathrm{MMD}^2(\hat{\Pb}^N_\theta, \Pb_\theta; \calH_{\calX}^{\lambda_\calX})=\sup_{\|f\|_{\calH_{\calX}^{\lambda_\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|^2,
\end{align}
%
for the empirical measure $\hat{\Pb}^N_\theta = w^{\lambda_\calX}_t \delta_{x^t_{1:N}}$, where $\delta_{x^t_i}$ for all $i$ is the Dirac delta distribution, $\delta_{x^t_{1:N}} = [\delta_{x^t_1} \dots \delta_{x^t_N}]^\top$ is our usual vector notation used throughout this work, and the weights are the optimal BQ weights $w^{\lambda_\calX}_t=\left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N})$.

Since $\calH_\calX^{\lambda_\calX}$ is induced by the sum of kernels, $k_\calX^{\lambda_\calX}(x, x') = k_\calX(x, x') + \lambda_\calX$, it holds that $\calH_\calX \subseteq \calH_\calX^{\lambda_\calX}$, and $\| f  \|_{\calH_\calX^{\lambda_\calX}} \leq \| f  \|_{\calH_\calX}$~\citep[Theorem I.13.IV]{aronszajn1950theory}. Therefore, the class of functions $f$ for which $\| f  \|_{\calH_\calX} \leq 1$ is larger than that for which $\| f  \|_{\calH_\calX^{\lambda_\calX}} \leq 1$, and
%
\begin{align}
\label{eq:variance_bound_proof_2}   \sup_{\|f\|_{\calH_{\calX}^{\lambda_\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right| \leq \sup_{\|f\|_{\calH_{\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|.
\end{align}
%
Next, note that for $\hat{f}_t(x) = k_\calX(x, x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N})$,
%
\begin{align}
\begin{split}
\label{eq:variance_bound_proof_3}
   \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right| = \left| \int_\calX \left(\hat{f}_t(x) - f(x) \right) \Pb_\theta(\mathrm d x)\right| &\leq  \int_\calX \left|\hat{f}_t(x) - f(x) \right| \Pb_\theta(\mathrm d x)  \\
    & \leq \|\hat{f}_t - f\|_{\calL^2(\calX)} \|p_\theta \|_{\calL^2(\calX)},
\end{split}
\end{align}
%
where the last inequality is an application of H\"older's inequality. By Assumption~\cref{as:app_densities}, $\|p_\theta \|_{\calL^2(\calX)}$ is bounded above by $\eta_0$. In order to apply~\citep[Theorem 4]{wynne2021convergence} to bound $\|\hat{f}_t - f\|_{\calL^2(\calX)}$, we show the assumptions in the Theorem hold.
    
{Assumption 1 (Assumptions on the Domain):} An open, bounded, and convex $\calX$ satisfies the assumption, as discussed in~\cite{wynne2021convergence}.
    
{Assumption 2 (Assumptions on the Kernel Parameters) and Assumption 3 (Assumptions on the Kernel Smoothness Range):} Our setting is more specific than the one in~\citep[Theorem 4]{wynne2021convergence}: the kernel $k_\calX$ is Mat\'ern, and therefore all smoothness constants mentioned in Assumptions 2 and 3 have the same value, $s_\calX$.

{Assumption 4 (Assumptions on the Target Function and Mean Function):} The target function $f$ was assumed to have higher smoothness than $k_\calX$ in~\cref{as:app_true_f_smoothness,as:app_kernel_x}; the mean function was taken to be zero.

{Assumption 5 (Additional Assumptions on Kernel Parameters):} By \cref{as:app_kernel_x,as:app_true_f_smoothness} the smoothness of the true function $s_f \geq s_\calX >d/2$, which verifies both statements in the Assumption since all smoothness constants of the kernel are equal to $s_\calX$.

Therefore~\citep[Theorem 4]{wynne2021convergence} holds, and for $\calW_2^0(\calX)=\calL^2(\calX)$
%
\begin{align*}
    \|\hat{f}_t - f\|_{\calL^2(\calX)} \leq K_1 h_{x_{1:N}^t}^{d/2} \left( h_{x_{1:N}^t}^{s_\calX-d/2} + \lambda_\calX \right),
\end{align*}
%
for any $N$ for which the fill distance $h_{x_{1:N}^t} \leq h_0$ for some $h_0$, and $K_1$ and $h_0$ that depend on $\calX, s_f, s_\calX$. Since $x_i^t \sim \Pb_{\theta_t}$, we can guarantee that $h_{x_{1:N}^t} \leq h_0$ with high probability using~\citep[Lemma 2]{oates2019convergence}, which says that provided the density satisfies $\inf_{x} p_{\theta_t}(x)>0$, there is a $C_t$ such that $\E h_{x_{1:N}^t} \leq C_t N^{-1/d + \varepsilon}$ for an arbitrarily small $\varepsilon>0$, where $C_t$ depends on $t$ through $\inf_{x} p_{\theta_t}(x)$: the smaller $\inf_{x} p_{\theta_t}(x)$, the larger $C_t$. Since we assumed $\inf_{x, \theta} p_{\theta}(x)=\eta>0$, there is a $K_2$ such that $C_t\leq K_2$ for any $t$. Therefore, we may take $N_0$ to be the smallest $N$ for which $\E h_{x_{1:N}^t} \leq K_2 N^{-1/d + \varepsilon}$ holds, and have for all $N \geq N_0$
%
\begin{align*}
    \E_{x^t_i \sim \Pb_\theta}\|\hat{f}_t - f\|_{\calL^2(\calX)} \leq K_1 K_2^{d/2} N^{-1/2 + \varepsilon} \left( K_2^{s_\calX-d/2} N^{-s_\calX/d + 1/2 + \varepsilon} + \lambda_\calX \right).
\end{align*}
%
By Markov's inequality, for any $\delta/2 \in (0,1)$ it holds with probability at least $1-\delta/2$ that
%
\begin{align}
\label{eq:variance_bound_proof_4}
    \|\hat{f}_t - f\|_{\calL^2(\calX)} \leq \frac{2}{\delta}K_1 K_2^{d/2} N^{-1/2 + \varepsilon} \left( K_2^{s_\calX-d/2} N^{-s_\calX/d + 1/2 + \varepsilon} + \lambda_\calX \right).
\end{align}
%
Putting together~\Cref{eq:variance_bound_proof_1,eq:variance_bound_proof_2,eq:variance_bound_proof_3,eq:variance_bound_proof_4} and Assumption~\cref{as:app_densities}, we get the result,
\begin{align*}
    \sigma^2_\mathrm{BQ}(\theta_t) - \lambda_\calX &=\sup_{\|f\|_{\calH_{\calX}^{\lambda_\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|^2 \\
    &\leq \sup_{\|f\|_{\calH_{\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|^2 \\
    &\leq \sup_{\|f\|_{\calH_{\calX}} \leq 1} \|\hat{f}_t - f\|^2_{\calL^2(\calX)} \|p_\theta \|^2_{\calL^2(\calX)} \\
    &\leq \frac{4}{\delta^2}\eta^2_0 K^2_1 K_2^d N^{-1 + 2\varepsilon} \left( K_2^{s_\calX-d/2} N^{-s_\calX/d + 1/2 + \varepsilon} + \lambda_\calX \right)^2.
\end{align*}
\end{proof}

We are now ready to state our main convergence result, which is a more general version of~\Cref{thm:convergence}.

\begin{theorem}
\label{thm:convergence_generalised}
    Suppose all technical assumptions in~\Cref{sec:technical_assumptions} hold. Then for any $\delta \in (0, 1)$ there is a $T_0(\delta)>0$ and an $N_0>0$ such that for any $N \geq N_0$ and $T \geq T_0$, with probability at least $1-\delta$ it holds that
    \begin{align*}
        &\| \hat I_\mathrm{CBQ}  - I \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \leq K_0( \log(12/\delta) + K_3) \\
        & \qquad\left(1 + c^{-1} T^{-1/(2r+1)} \left( \lambda_\calX + \frac{4}{\delta^2} \eta^2_0 K^2_1 K_2^d N^{-1 + 2\varepsilon} \left( K_2^{s_\calX-d/2} N^{-s_\calX/d + 1/2 + \varepsilon} + \lambda_\calX \right)^2 \right)\right) T^{-r/(2r+1)},
    \end{align*}
    for any arbitrarily small $\varepsilon>0$, and constants $K_0, K_1, K_2, K_3$ independent of $N, T, \delta, \varepsilon$.
\end{theorem}
\begin{proof}
Recall that for any two events $A$ and $B$,
\begin{align*}
    \Pb(A \cap B) = 1 - \Pb(\neg A \cup \neg B) \geq 1 - \Pb(\neg A) - \Pb(\neg B) = \Pb(A) + \Pb(B) - 1.
\end{align*}
Taking $A$ to be the event in~\Cref{thm:krr_corollary}, and $B$ to be the event in~\Cref{thm:bound_on_bq_var},
%
\begin{align*}
        A&=\left\{\| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \leq K_0( \log(12/\delta) + K_3) (1 + c^{-1} T^{-1/(2r+1)} \sigma^2_\mathrm{BQ}(\theta_{t'})) T^{-r/(2r+1)}\right\}, \\
        B&=\left\{\text{for all $t$, }\sigma^2_\mathrm{BQ}(\theta_t) \leq  \lambda_\calX + \frac{4}{\delta^2}\eta^2_0 K^2_1 K_2^d N^{-1 + 2\varepsilon} \left( K_2^{s_\calX-d/2} N^{-s_\calX/d + 1/2 + \varepsilon} + \lambda_\calX \right)^2\right\},
\end{align*}
%
we get the result.
\end{proof}

As discussed in the main text, convergence is fastest when the regulariser $\lambda_\calX$ is set to $0$; $\lambda_\calX>0$ ensures greater stability at the cost of a lower speed of convergence. For clarity we show how~\Cref{thm:convergence} in the main text follows from the more general~\Cref{thm:convergence_generalised} by setting $\lambda_\calX=0$. 

\begin{proof}[Proof of~\Cref{thm:convergence}] Take $\lambda_\calX=0$, $C_1(\delta)=K_0( \log(12/\delta) + K_3)=\calO(\log(1/\delta))$, $C_2(\delta)=4c^{-1}K_0( \log(12/\delta) + K_3)\eta^2_0 K^2_1 K_2^{2s_\calX}/\delta^2=\calO((1/\delta^2)\log(1/\delta))$, and $r=1/2$  in~\Cref{thm:convergence_generalised}. The result follows.
\end{proof}

%\textcolor{red}{Lastly, we highlight that our proof strategy readily extends to the case where Stage 1 approximates the integrals through Monte Carlo sampling, instead of Bayesian quadrature. In that case, the variance term $\sigma^2_\mathrm{MC}(\theta_{t'})$ in~\Cref{thm:krr_corollary} may be bounded as $\calO(N^{-1})$, and the final rate takes the form $\calO(T^{-\frac{r}{2 r + 1}}) + \calO(T^{-\frac{r+1}{2r+1}} N^{-1})$. As $2s_\calX/d$ will be greater than $1$ in practice---and considerably greater if $x \mapsto f(x, \theta)$ is smooth---this shows the benefit of using BQ in Stage 1 of the method.}

\subsection{A convergence rate for KLSMC}

\masha{This is not finished!}

Recall the KLSMC estimator,
%
\begin{align}
    \hat I_\mathrm{KLSMC}(\theta) = k_\Theta(\theta, \theta_{1:T}) \left( k_\Theta(\theta_{1:T}, \theta_{1:T}) + \sigma^2 \Id_T \right)^{-1} \hat I_\mathrm{MC}(\theta_{1:T}) 
\end{align}
%
where, for $x_j^{(i)} \sim \Pb_{\theta_i}$, $j \in \{1,\dots ,n\}$, the MC estimator of the integral at $\theta_i$ is
%
\begin{align}
    \hat I_\mathrm{MC}(\theta_i) = \frac{1}{n} \sum_{j=1}^n f(x_j^{(i)}, \theta_i).
\end{align}
%
This is the setting of Gaussian process regression with misspecified likelihood. By the central limit theorem, $\hat I_\mathrm{MC}(\theta_i) - I(\theta_i)$ is approximately normally distributed, $\calN(0, \mathrm{Var}[\hat I_\mathrm{MC}(\theta_i)])$, with mean $0$ and the unknown variance of the MC estimator, $\mathrm{Var}[\hat I_\mathrm{MC}(\theta_i)]$. In general, it will not hold that $\mathrm{Var}[\hat I_\mathrm{MC}(\theta_i)]=\sigma^2$, and therefore we have Gaussian process regression with a misspecified likelihood, which is the setting of the results in~\cite[Section 4.3.1]{wynne2021convergence}.

\begin{align}
    \E_{\varepsilon_1}\dots \E_{\varepsilon_T} \|\hat I_\mathrm{MC}(\theta) - I(\theta)\|_{\calL^2(\Theta)} \leq K_1 h_{\theta_{1:T}^t}^{d/2} \left( h_{\theta_{1:T}^t}^{\nu_\Theta} + \sigma^2\right),
\end{align}

\begin{align}
    \varepsilon = [\varepsilon_1, \dots, \varepsilon_T]
\end{align}
\begin{align}
    \varepsilon_i \sim \calN(0, \mathrm{Var}[I_\mathrm{MC}(\theta_i) - I(\theta_i)])
\end{align}

\subsection{On Sobolev Kernels}
\label{sec:on_sobolev_kernels}

Throughout this work, we assume the kernels $k_\calX$ and $k_\Theta$ are Sobolev kernels, meaning they induce a Hilbert space norm-equivalent to some Sobolev space. In this section, we expand on two important examples of Sobolev kernels: Mat\'ern kernels and Stein kernels.

It is well-known that the RKHS of a Mat\'ern kernel of order $\nu_\Theta$ over an open, convex and bounded $\Theta \subset \R^p$ is norm-equivalent to the Sobolev space $W^{2,\nu_\Theta+p/2}(\Theta)$. The result for integer-order spaces, meaning $\nu_\Theta+p/2 \in \mathbb Z$, is proven in~\citep[Corollary 10.48]{Wendland2005}. This can be straightforwardly extended to fractional order Sobolev-Slobodeckij spaces, $\nu_\Theta+p/2 \in \mathbb R$, using~\cite[Corollary 10.13]{Wendland2005} that says the RKHS of a Mat\'ern kernel on $\R^p$ is norm-equivalent to a Bessel potential space, which in turn is norm-equivalent to the Sobolev-Slobodeckij space by~\cite[Section 7.62]{adams2003sobolev}, and finally using an extension operator in~\cite[Theorems 6.1 and 6.7]{devore1993besov} to restrict this to open, convex and bounded subsets of $\R^p$.

\masha{The bit here about how Langevin Stein kernels based on a Matern kernel of order $\nu$ induces a space of smoothness $\nu+d/2-1$.}

\section{Practical Considerations for Conditional Bayesian Quadrature}\label{appendix:practical_considerations}

We now discuss important practical considerations which can have significant impact on the performance of CBQ. Firstly, in \Cref{appendix:tractable_kernel_means} we discuss how to ensure a closed-form expression for kernel mean embeddings and initial errors of BQ estimators. Then, we discuss the selection of all kernel hyperparameters in \Cref{appendix:hyperparameter_selection}.

\subsection{Tractable Kernel Means}\label{appendix:tractable_kernel_means}

In the main text, we discussed the requirement for both BQ and CBQ that the kernel mean embedding $\mu$ and its integral (called the initial error) are known in closed form. A list of well-known pairs can be found in Table 1 in \citep{fx_quadrature} or the \texttt{ProbNum} package \citep{Wenger2021}. 
However, even when none of these pairs are appropriate for the problem at hand, there are still multiple solutions:

\begin{itemize}
    \item First, for a fixed $k$, when the embedding of $\mathbb{P}$ is intractable but the embedding of some other distribution $\mathbb{Q}$ is known, we can use the `importance sampling trick' which consists of writing the integral as $I=\mathbb{E}_{X \sim \mathbb{P}} [f(X)] = \mathbb{E}_{X \sim \mathbb{Q}} [g(X)]$ where $g(x)=f(x)p(x)/q(x)$ and $p,q$ are the densities of $\mathbb{P},\mathbb{Q}$. This allows us to use BQ on the integral of $g$, which is tractable by construction.

\vspace{2mm}

\item Secondly, again for a fixed $k$ and assuming that we know the quantile function $\Phi^{-1}$ of the distribution $\mathbb{P}$ and that the embedding of the uniform distribution is available, we can use the `inverse transform trick' which consists of writing $I=\mathbb{E}_{X \sim \mathbb{P}} [f(X)] = \mathbb{E}_{U \sim \mathbb{U}} [g(U)]$ where $g(u) = f(\Phi^{-1}(u))$ and $\mathbb{U}$ is a uniform distribution on some hypercube. Once again, BQ can now be applied to the transformed problem.

\vspace{2mm}

\item Finally, for any distribution $\mathbb{P}$ whose density is known up to the normalisation constant (for example most posterior distributions), specialised kernels with closed-form embeddings can be constructed. This is true of Stein reproducing kernels~\cite{anastasiou2023stein}. Suppose we have a distribution $\mathbb{P}$ with density $p:\calX \rightarrow \R^+$ and a function $f:\calX \rightarrow \R$ with the property that $\lim_{\|x\| \to \infty} p(x)f(x) = 0$. The Langevin Stein kernel $k_p: \calX \times \calX \to \R$ \citep{anastasiou2023stein} is given by:
\begin{align*}
    k_p(x,x') & := \nabla_x \log p(x)^\top k(x, x' ) \nabla_{x'} \log p(x') + \nabla_{x} \log p(x) ^\top \nabla_{x'} k(x, x') 
    \\ & \qquad + \nabla_{x'} \log p(x') ^\top \nabla_x k(x, x') + \nabla_x \cdot \nabla_{x'} k(x, x'),
\end{align*}
where $\nabla_x = (\partial/ \partial x_1, \cdots, \partial/ \partial x_d)^\top$ and $\nabla_x \cdot \nabla_{x'} k(x, x') = \sum_{i=1}^d \frac{\partial^2 k(x, x')}{\partial x_i \partial x_i'}$.
% We can define the Langevin Stein operator $T_p[f](x) = f(x) \nabla_x \log p(x)+\nabla_x f(x)$ where $\E_p[T_p[f](x)] = 0$ for sufficiently regular $f$.


The main advantage of this construction is that the mean embedding $\mu(x') = \int_{\calX} k_p(x, x')p(x)dx = 0$ by construction. 
However, this means our GP prior on $f$ encodes beliefs that the function has mean zero.
To weaken this, we can add a constant $c \in \R$; i.e., $\tilde{k}_p(x, x') = k_p(x,x') + c$, so that the kernel mean embedding becomes $\mu(x') = c$. 
The constant $c$ can then be treated as a kernel hyperparameter and estimated alongside all other parameters. 

% Stein reproducing kernels are also non-stationary, which implies prior beliefs that $f$ has different properties across different parts of $\calX$. Therefore, using a GP prior with Stein kernel as covariance function requires additional caution. Fortunately, our experiments  in \Cref{sec:experiments} and \Cref{appendix:experiments} do not exhibit a huge difference between Stein kernel and traditional kernels.

\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Model and Hyperparameter Selection}\label{appendix:hyperparameter_selection}

\fxb{My overall impression of the section below was that it was initially written in a rush and not very carefully. I have now made a number of changes to improve the phrasing, and added remarks where the description is not clear. In general, I think it is important to be very thorough with these types of sections, otherwise you are at risk of making the paper difficult to reproduce. My general approach would be to consider every single modelling choice and describe the reasoning for how to address it as clearly and concretely as possible. Right now there are still a number of important questions remaining, so someone reading our paper would not be able to reproduce our results.}


For reproducibility reasons, we now discuss our approach for model and hyperparameter selection for CBQ and baseline methods. The hyperparameter selection for CBQ boils down to the choice of GP hyperparameters at stage 1 and the choice of GP hyperparameters at stage 2. 

Firstly, to simplify the choice of means, we renormalise all our function values before performing GP regression and interpolation. This is done by first subtracting the empirical mean and then dividing by the empirical standard deviation. All of our experiments then use prior mean functions $m_\Theta$ and $m_\calX$ which are zero functions. This choice is made for simplicity, and we might expect further improvements in accuracy if more information is available.


The choice of covariance functions $k_\calX$ and $k_\Theta$ is made on a case-by-case basis in order both to encode properties we expect the target functions to have and to ensure that the corresponding kernel mean is available in closed form (as per the previous section). Once this is done, we typically still need to make a choice of hyperparameters for both kernels: lengthscales $l_\calX$, $l_\Theta$ and amplitudes $A_\calX, A_\Theta$. 
We also need to select the regularizers $\lambda_\calX, \lambda_\Theta$. We select all of these hyperparameters through empirical Bayes, which consists of maximising the log-marginal likelihood.
For the $k_\calX$ kernel, the log-marginal likelihood can be written as~\citep{GPML}:
%
\begin{align*}
& L(l_\calX, A_\calX) =  -\frac{1}{2} \log \left| k_{\calX}(x_{1:N},x_{1:N}; l_\calX, A_\calX) + \lambda_\calX \Id_N \right| - \frac{N}{2} \log(2 \pi) \\
& \quad -\frac{1}{2}(f(x_{1:N})-m_{\calX}(x_{1:N}))^\top \left(k_{\calX}(x_{1:N},x_{1:N};l_\calX, A_\calX) + \lambda_\calX \Id_N \right)^{-1} (f(x_{1:N})-m_{\calX}(x_{1:N})).
\end{align*}
The optimisation is implemented through a grid search over $\left[1.0, 10.0, 100.0, 1000.0 \right]$ for the amplitude $A_\calX$ and a grid search over $\left[0.1, 0.3, 1.0, 3.0, 10.0 \right]$ for the lengthscale $l_\calX$. 
$\lambda_\calX$ is fixed to be $0$ as suggested by \Cref{thm:convergence}.

If $k_\calX$ is a Stein kernel, we have an extra hyperparameter $c_\calX$ to select along with the lengthscale $l_\calX$ and amplitude $A_\calX$. 
For Stein kernels, we use gradient-based optimization (such as stochastic gradient descent) on the log-marginal likelihood to find the optimal values of $c_\calX, l_\calX, A_\calX$, which is implemented with the \texttt{JAX} autodiff library~\citep{jax2018github}. 
The reason we use gradient-based optimization instead of grid search for Stein kernels is that they require an accurate estimate of $c_\calX$ to work well. 
To return accurate results, grid search would require a finer grid, which is very expensive, whereas gradient-based methods require a good initialization to avoid getting stuck in local minima. Fortunately, for $c_\calX$ we know that $c_\calX = 0$ is a good initialization point since we have subtracted the empirical mean when normalizing the function values.

Additionally, it is important to note that we have $T$ kernels $k_\calX^1, \cdots, k_\calX^T$. The hyperparameters of each kernel $k_\calX^t$ need to be selected using empirical Bayes under the observations $x_{1:N}^t$, which means we need to repeat the above optimization $T$ times. 
In practice, we observe that the $k_\calX^t$ hyperparameters share similar values, allowing us to select the hyperparameters of $k_\calX^1$ and subsequently reuse them across all $T$ kernels. This is done purely for computational reasons, and we expect CBQ to perform better if the hyperparameters of each $k_\calX^t$ are optimized separately.


For the kernel $k_\Theta$, we also select the hyperparameters through empirical Bayes by maximising the log-marginal likelihood. 
%
\begin{align*}
   & L(l_\Theta, A_\Theta) =  -\frac{1}{2} \log \left| k_{\Theta}(\theta_{1:T},\theta_{1:T}; l_\Theta, A_\Theta) + \left( \lambda_\Theta + \sigma_{\mathrm{BQ}}^2(\theta_{1:T}) \right) \Id_T \right| - \frac{T}{2} \log(2 \pi)
     \\
& \quad  -\frac{1}{2} (\hat{I}_\mathrm{BQ} (\theta_{1:T})- m_{\Theta}(\theta_{1:T}))^\top \left(k_{\Theta}(\theta_{1:T}, \theta_{1:T};l_\Theta, A_\Theta) + \left( \lambda_\Theta + \sigma_{\mathrm{BQ}}^2(\theta_{1:T}) \right) \Id_T \right)^{-1} (\hat{I}_\mathrm{BQ} (\theta_{1:T})- m_{\Theta}(\theta_{1:T})).
\end{align*}
%
Similar to above, we also do a grid search over $\left[1.0, 10.0, 100.0, 1000.0 \right]$ for amplitude $A_\Theta$, a grid search over $\left[0.1, 0.3, 1.0, 3.0, 10.0 \right]$ for lengthscale $l_\Theta$ and a grid search over $\left[0.01, 0.1, 1.0 \right]$ for $\lambda_\Theta$, and we select the values that give the largest log-marginal likelihood. 

The baseline method KLSMC, which implements Monte Carlo in the first stage and kernel ridge regression in the second stage, only has hyperparameters in the second stage, analogous to CBQ, namely $A_\Theta, l_\Theta, \lambda_\Theta$.
These hyperparameters are selected with grid search to give the lowest RMSE on a separate held-out validation set.
The baseline method LSMC, which implements Monte Carlo in the first stage and polynomial regression in the second stage, has hyperparameters in the second stage, namely the regularization coefficient $\lambda_\Theta$ and the order of the polynomial $p \in \{1,2,3,4\}$.
These hyperparameters are also selected to give the lowest RMSE on a separate held-out validation set.
For baseline method importance sampling, there are no hyperparameters to select. 


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\section{Additional Experiments}\label{appendix:experiments}
In this section, we provide a more detailed description of the settings of all experiments in the main text, and we provide further results and ablation studies. All figures reported in the paper are created using the median values obtained from 20 separate runs with different random seeds. Standard error is shown as a shaded area around the median. 


\fxb{Hudson: I think this appendix is currently messy, and needs to be meticulously tidied. Overall, it should be as tidy as the main text. Could you please revisit this appendix and pay specific attention to the following: \\(i) The notation should match the notation in the experiments section in the main text. I had to change this slightly because the experiments section was not consistent with the main text, but now it should be, so I think its safe for you to make the change. \\(ii) Ensure that the figures are appropriately formatted (see the guide to academic papers in our Teams folder) - especially the size of the fonts is a bit too small now.\\
(iii) For each experiment, it would help if you could have one paragraph which says whether or not assumptions A1, A2, A3, A4 and A5 are satisfied \fxb{we had a discussion about this in our call a few weeks ago}.}

\subsection{Synthetic Experiment: Bayesian Sensitivity Analysis for Linear Models}\label{appendix:bayes_sensitivity}

\subsubsection{Experimental Setting}

\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=250pt]{figures/legend.pdf}
    \vspace{-5pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_10.pdf}
        \caption{RMSE with fixed $N=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_50.pdf}
        \caption{RMSE with fixed $N=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_100.pdf}
        \caption{RMSE with fixed $N=100$.}
    \end{subfigure}
        \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_10.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_50.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_100.pdf}
        \caption{RMSE with fixed $T=100$.}
    \end{subfigure}
    \caption{\emph{Bayesian sensitivity analysis for linear models}. \textbf{First Row:} Dimension $d=2$ with fixed $N=10, 50, 100$ and increasing $T$. 
    \textbf{Second Row:} Dimension $d=2$ with fixed $T=10, 50, 100$ and increasing $N$. The integrand is $f(x) = x^\top x$.} 
    \label{appfig:bayes_sensitivity_1}
\end{figure}


In this synthetic experiment, we do sensitivity analysis on the hyperparameters in Bayesian linear regression. 
The observational data for the linear regression are $Y \in \R^{m \times d}, Z \in \R^{m}$ with $m$ being the number of observations and $d$ being the dimension.
We use $x$ to denote the regression weight to maintain notational consistency as in the main text.
By placing a $\calN(x ; 0, \theta \Id_d)$ prior 
on the regression weights $x \in \R^{d}$ with $\theta \in \left( 1, 3\right)^d$, and using a Gaussian likelihood $p(Z \mid Y, x)=\calN(Z; Yx, \eta^{-1} \Id_m)$ with noise precision $\eta$, we can obtain (via conjugacy) a multivariate Gaussian posterior $\Pb_\theta$ whose mean and variance have a closed form expression~\citep{bishop:2006:PRML}.
%
\begin{align*}
    \Pb_\theta = \calN(\tilde{m}, \tilde{\Sigma}), \quad \tilde{\Sigma}^{-1} = {\frac{1}{\theta} \Id_d} + \eta Y^\top Y, \quad \tilde{m} = \eta \tilde{\Sigma} Y^\top Z
\end{align*}
We can then analyse sensitivity by computing the conditional expectation $I(\theta)=\int_\calX f(x)\Pb_\theta(dx)$ of some quantity of interest $f$.
For example, if  $f(x)=x^\top x$, then $I(\theta)$ is the second moment of the posterior and the results are already reported in the main text.
If $f(x) = x^\top y^\ast$ for some new observation $y^\ast$, then $I(\theta)$ is the predictive mean. 
In these simple settings, $I(\theta)$ can be
computed analytically, making this a good synthetic example for benchmarking.
We sample parameter values $\theta_{1:T}$ from a uniform distribution $ \Qb = \operatorname{Unif}(\Theta)$ where $\Theta = (1, 3)^d$, and for each such parameter $\theta_t$, we obtain $N$ observations $x_{1:N}^t$ from $\Pb_{\theta_t}$.  
In total, we have $N \times T$ samples.

For conditional Bayesian quadrature (CBQ), we need to carefully choose two kernels $k_\Theta$ and $k_\calX$. Firstly, we choose the kernel $k_\calX$ to be an isotropic Gaussian kernel, $k_\calX(x, x') = A_\calX \exp(-\frac{1}{2 l_\calX^2} (x - x')^\top(x - x'))$, because the Gaussian kernel mean embedding has a closed form under the Gaussian posterior $\Pb_\theta$:
\begin{align}\label{appeq:E14}
    \mu_{\theta}(x) = A_\calX {\left| \Id_d + l_\calX^{-2} \tilde{\Sigma} \right|}^{-1/2} \exp \left(-\frac{1}{2} (x - \tilde{m})^\top (\tilde{\Sigma} + l_\calX^2 \Id_d)^{-1} (x - \tilde{m})\right).
\end{align}
The integral of the kernel mean embedding $\mu_\theta$ (known as the initial error) also has a closed form,
$\int \mu_\theta(x) \Pb_\theta(dx) = \frac{A_\calX l_\calX^d}{\sqrt{\left| l_\calX^2 \Id_d + 2 \tilde{\Sigma} \right|}}$.
Then, we pick $k_\Theta$. 
In this synthetic setting, we know that $I(\theta)$ is infinitely differentiable, but we opt for a Mat\'ern-3/2 kernel $k_\Theta(\theta, \theta') = A_\Theta \left(1+\frac{\sqrt{3} |\theta - \theta'|}{l_\Theta}\right) \exp \left(-\frac{\sqrt{3} |\theta - \theta'|}{l_\Theta}\right)$ to encode more conservative prior information on the smoothness of $I(\theta)$, because in most applications we do not have a closed-form expression for $I(\theta)$.
The hyperparameters for both kernels are selected according to \Cref{appendix:hyperparameter_selection}.

\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=250pt]{figures/legend.pdf}
    \vspace{-5pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_10_g4.pdf}
        \caption{RMSE with fixed $N=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_50_g4.pdf}
        \caption{RMSE with fixed $N=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_100_g4.pdf}
        \caption{RMSE with fixed $N=100$.}
    \end{subfigure}
        \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_10_g4.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_50_g4.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_100_g4.pdf}
        \caption{RMSE with fixed $T=100$.}
    \end{subfigure}
    \caption{\emph{Bayesian sensitivity analysis for linear models}. \textbf{First Row:} Dimension $d=2$ with fixed $N=10, 50, 100$ and increasing $T$. 
    \textbf{Second Row:} Dimension $d=2$ with fixed $T=10, 50, 100$ and increasing $N$. The integrand is $f(x) = x^\top y^\ast$.} \label{appfig:bayes_sensitivity_2}
\end{figure}

\subsubsection{Check the Assumptions in \Cref{thm:convergence}} 
We would like to check whether the assumptions made in \Cref{thm:convergence} hold in this experiment.
\begin{itemize}
    \item A1: 
    Although $\calX=\R^d$ is not a compact domain, $\Pb_{\theta}$ is a Gaussian distribution, so the probability mass outside a large compact subset of $\calX$ decays exponentially. $\Theta = \left( 1, 3 \right)^d$ is a compact domain. A1 is approximately satisfied.
    \item A2: A2 is satisfied due to the sampling mechanism of $\theta_{1:T}$ and $\{x_{1:N}^t\}_{t=1}^T$.
    \item A3: $\Qb$ is a uniform distribution so its density $q$ is constant and hence upper bounded and strictly positive. $\Pb_\theta$ is a Gaussian distribution so its density $p_\theta$ is strictly positive on a compact and large domain with finite second moment. A3 is approximately satisfied.
    \item A4: Both $f(x)$ and $I(\theta)$ are infinitely differentiable, so $s_I=s_f = \infty$. 
    Although $k_\calX$ is Gaussian kernel which does not satisfy the assumption of \Cref{thm:convergence}, we have ablation study in \Cref{appendix:ablation} showing similar performance when $k_\calX$ is Mat\'ern-3/2 kernel so $s_\calX = \frac{3}{2} + \frac{d}{2}$, and $k_\Theta$ is Mat\'ern-3/2 kernel so $s_\Theta = \frac{3}{2} + \frac{d}{2}$, where $d$ is the dimension. A4 is satisfied.
    \item A5: $\lambda_\calX$ is picked to be $0$ and $\lambda_\Theta$ is found via grid search among $\{0.01, 0.1, 1.0\}$. A5 is satisfied.
\end{itemize}
 
\subsubsection{More Experimental Results}
We provide more experimental results for Bayesian sensitivity analysis in linear models here. 
\Cref{appfig:bayes_sensitivity_1} provides the result when the integrand is chosen to be $f(x)=x^\top x$ so $I(\theta)$ represents the posterior second moment, and \Cref{appfig:bayes_sensitivity_2} provides the result when the integrand is chosen to be $f(x)=x^\top y^\ast$ so $I(\theta)$ represents the predictive mean.

In the first row of \Cref{appfig:bayes_sensitivity_1}, we fix $N=10, 50, 100$ showing the performance of RMSE with increasing $T$. 
In the second row of \Cref{appfig:bayes_sensitivity_1}, we fix $T=10, 50, 100$ showing the performance of RMSE with increasing $N$. In the first row of \Cref{appfig:bayes_sensitivity_2}, we fix $N=10, 50, 100$ showing the performance of RMSE with increasing $T$. 
In the second row of \Cref{appfig:bayes_sensitivity_2}, we fix $T=10, 50, 100$ showing the performance of RMSE with increasing $N$. 
We can see that CBQ consistently achieves a smaller RMSE for both tasks under the same number of samples, and a faster convergence rate, compared to all baseline methods. The conclusions that we draw from the main text also hold for different values of $N$ and $T$.
By comparing the performance of CBQ and KLSMC, where the second stage of both methods are identical, and the main difference lies in the first stage, we believe that CBQ shows better performances mainly due to using Bayesian quadrature instead of Monte Carlo in the first stage.
Also by comparing the first and second row in both \Cref{appfig:bayes_sensitivity_1} and \Cref{appfig:bayes_sensitivity_2}, we can confirm the theory we proved in \Cref{appendix:convergence_rate} that CBQ has a faster convergence rate in $N$ than in $T$. 

In general, CBQ is more computationally expensive than baselines (KLSMC, LSMC and IS), so in this simple setting it is more efficient to spend more budget on obtaining more samples. 
Nonetheless, in scenarios where the expense of sample collection constitutes a significant fraction of the computational budget, or when the evaluation of the integrand proves to be highly costly, it becomes more cost-effective to spend a larger share of the budget towards CBQ. For example, sampling can become expensive easily when the prior and likelihood are not conjugate, so Markov chain Monte Carlo methods are needed to sample from unnormalized posterior. 
Also, we show in the next section \Cref{appendix:sir} a real world example when sampling is particularly costly and hence using CBQ is overall more efficient.

\subsection{Bayesian Sensitivity Analysis for Susceptible-Infectious-Recovered (SIR) Model }\label{appendix:sir}
\subsubsection{Experimental Setting}
The SIR model is commonly used to simulate the dynamics of infectious diseases through a population~\cite{kermack1927sir}. 
It divides the population into three sections.
Susceptibles (S) represents people who are not infected but can become infected after coming into contact with an infectious individual.
Infectious (I) represents people who are currently infected and can infect susceptible individuals.
Recovered (R) represents individuals who have been infected and then removed from the disease, either by recovering or dying. The dynamics are governed by a system of ordinary differential equations (ODE) as below.
%
\begin{align*}
    \frac{\mathrm{d} S}{\mathrm{d} r} = -x S I, \quad
    \frac{\mathrm{d} I}{\mathrm{d} r} = x S I-\gamma I, \quad
    \frac{\mathrm{d} R}{\mathrm{d} r} = \gamma I,
\end{align*}
%
with $x$ being the infection rate, $\gamma$ being the recovery rate and $r$ being the time. The solution to the SIR model is a vector $\left(N_I^r, N_S^r, N_R^r \right)$ representing the number of infectious, susceptible and recovered individuals at day $r$.

In this experiment, we assume that the recovery rate $\gamma$ is fixed and the infection rate $x$ follows a gamma prior distribution $x \sim \Pb_\theta = \operatorname{Gamma}(\theta, \xi)$ where $\theta$ represents the initial belief of the infection rate deduced from the study of the virus in the laboratory at the beginning of the outbreak, and $\xi$ represents the amount of uncertainty on the initial belief. In this experiment, we fix the rate parameter $\xi=10$, the total population is set to be $10^6$ and the recovery rate $\gamma = 0.05$. 
The target of interest is the expected peak number of infected individuals under the prior distribution on $x$: 
\begin{align*}
    I(\theta) = \E_{x}\left[\max_r N_I^r(x) \mid \theta \right] = \int_{\calX} \max_r N^r_I(x) \Pb_\theta(dx)
\end{align*}
with the integrand $f(x) = \max_r N_I^r(x)$. We are interested in the sensitivity analysis of the shape parameter $\theta$ to the final estimate of the expected peak number of infected individuals.
The initial beliefs of the infection rate $\theta_{1:T}$ are sampled from the uniform distribution $\Qb = \operatorname{Unif}\left(2,9\right)$, and then $N$ samples $x^t_{1:N}$ are drawn from $\Pb_{\theta_t} = \operatorname{Gamma}(\theta_t, \xi)$. 
In this setting, sampling $x$ is very expensive as it necessarily involves solving the system of SIR ODEs, which can be very slow as the discretization step gets finer.
In the middle panel of \Cref{fig:finance_sir}, we have shown that obtaining one sample from SIR ODEs under discretization time step $\tau = 0.1$ takes around $3.0$s, whereas running the whole CBQ algorithm takes $1.0$s, not to mention that sampling from SIR ODEs needs to be repeated $N \times T$ times. Therefore, using CBQ is ultimately more efficient overall within the same period of time.

For conditional Bayesian quadrature (CBQ), we need to carefully choose two kernels $k_\Theta$ and $k_\calX$.
First we choose $k_\calX$: we use Mat\'ern-3/2 as the base kernel and then apply the Stein operator to both arguments of the base kernel to obtain $k_\calX$. 
The reason we use a Stein kernel is that Stein kernel has one order less smoothness than the base kernel, and since the smoothness of the integrand $f(x) = \max_r N_I^r(x)$ is unknown, using a Stein kernel enforces weaker prior information than Mat\'ern-3/2.
Furthermore, the kernel mean embedding of a Stein kernel $\mu(x)$ is a constant $c$ by construction as per the discussion in \Cref{appendix:practical_considerations}. 
The initial error is also a constant $c$ by construction.
Then we choose $k_\Theta$. Since $I(\theta)$ represents the peak number of infections, $I(\theta)$ is expected to be smooth and continuous, and hence we choose $k_\Theta$ as a Mat\'ern-3/2 kernel. 
All hyperparameters in $k_\calX$ and $k_\Theta$ are selected according to \Cref{appendix:hyperparameter_selection}.
We use a MC estimator with $5000$ samples as the pseudo ground truth and evaluate the RMSE across all methods. 

\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=350pt]{figures/legend_finance.pdf}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/SIR_15.pdf}
        \caption{RMSE with fixed $T=15$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/SIR_25.pdf}
        \caption{RMSE with fixed $T=25$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/SIR_35.pdf}
        \caption{RMSE with fixed $T=35$.}
    \end{subfigure}
    \caption{\emph{Bayesian sensitivity analysis for SIR model.} $T=15, 25, 35$ and increasing $N$.}\label{appfig:sir}
\end{figure}

\subsubsection{Check the Assumptions in \Cref{thm:convergence}} 
We would like to check whether the assumptions made in \Cref{thm:convergence} hold in this experiment.
\begin{itemize}
    \item A1: Although $\calX=\R^+$ is not a compact domain, $\Pb_{\theta}$ is a Gamma distribution so the probability mass outside a large compact subset of $\calX$ decays exponentially. $\Theta = \left[2, 9 \right]$ is a compact domain. A1 is approximately satisfied.
    \item A2: A2 is satisfied due to the sampling mechanism of $\theta_{1:T}$ and $\{x_{1:N}^t\}_{t=1}^T$.
    \item A3: $\Qb$ is a uniform distribution so its density $q$ is constant and hence upper bounded and strictly positive. $\Pb_\theta$ is a Gamma distribution so its density $p_\theta$ is strictly positive within a large compact subset of $\calX$ and has finite second moment. A3 is approximately satisfied.
    \item A4: $f(x) = \max_r N_I^r(x)$ is the maximum number of infections so $f(x)$ is not necessarily smooth. $I(\theta)$ represents the peak number of infections with varying initial estimate of the infection rate, so $I(\theta)$ is smooth and continuous with $s_I \leq 1$. 
    $k_\calX$ is a Stein kernel with a Mat\'ern-3/2 kernel as the base, so the smoothness is around $s_\calX = \frac{3}{2} + \frac{1}{2} - 1 = 1$,
    and $k_\Theta$ is a Mat\'ern-3/2 kernel so $s_\Theta = \frac{3}{2} + \frac{1}{2} = 2$. A4 is satisfied. \hudson{I do not know how to write this here.}
    \item A5: $\lambda_\calX$ is picked to be $0$ and $\lambda_\Theta$ is found via grid search among $\{0.01, 0.1, 1.0\}$. A5 is satisfied.
\end{itemize}


\subsubsection{More Experimental Results}
We report more results in \Cref{appfig:sir} with fixed $T=15, 25, 35$ and increasing $N$, to showcase that CBQ consistently exhibits smaller RMSE than baseline methods. The conclusions that we draw from the main text also hold for different values of $N$ and $T$ for this experiment.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\subsection{Option Pricing in Mathematical Finance}\label{appendix:black_scholes}
\subsubsection{Experimental Setting}
In this experiment, we consider specifically an asset whose price $S({\tau})$ at time $\tau$ follows the Black-Scholes formula $S(\tau) = S_0 \exp \left(\sigma W(\tau) - \sigma^2 \tau/2 \right)$ for $\tau \geq 0$, where $\sigma$ is the underlying volatility, $S_0$ is the initial price and $W$ is the standard Brownian motion.
The financial derivative we are interested in is a butterfly call option whose payoff at time $\tau$ can be expressed as $\psi(S({\tau}))=\max (S(\tau)-K_1, 0) + \max (S(\tau)-K_2, 0) - 2\max (S(\tau) - (K_1+K_2)/2, 0)$.

In addition to the expected payoff, insurance companies are interested in computing the expected loss of their portfolios if a shock would occur in the economy.
We follow the setting in \cite{alfonsi2021multilevel, alfonsi2022many} assuming that a shock occurs at time $\eta$, at which time the option price is $S(\eta)=\theta$, and this shock multiplies the option price by $1 + s$. The option price at maturity time $\zeta$ is denoted as $S(\zeta) = x$. The expected loss caused by the shock can be expressed as 
\begin{align*}
    \mathcal{L} = \E \Bigl[\max \Bigl(I(\theta), 0 \Bigr) \Bigr], \quad I(\theta) = \int_0^\infty \bigl( \psi(x)-\psi \left((1 + s) x \right) \bigr) \Pb_\theta(dx).
\end{align*}
So the integrand is $f(x) = \psi(x)-\psi((1+s)x)$.

Following the setting in \cite{alfonsi2021multilevel, alfonsi2022many}, we consider the initial price $S_0 = 100$, the volatility $\sigma = 0.3$, the strikes $K_1 = 50, K_2 = 150$, the option maturity $\zeta=2$ and the shock happens at $\eta=1$ with strength $s = 0.2$. 
The option prices at which the shock occurs, $\theta_{1:T}$, are sampled from the log normal distribution deduced from the Black-Scholes formula $\theta_{1:T} \sim \Qb = \operatorname{Lognormal}( \log S_0 - \frac{\sigma^2}{2} \eta, \sigma^2 \eta)$. 
Then $x^t_{1:N}$ are sampled from another log normal distribution also deduced from the Black-Scholes formula $x^t_{1:N} \sim \Pb_{\theta_t} = \operatorname{Lognormal}( \log \theta_t - \frac{\sigma^2}{2} (\zeta - \eta), \sigma^2 (\zeta - \eta))$. 

For conditional Bayesian quadrature (CBQ), we need to carefully choose two kernels $k_\calX$ and $k_\Theta$. First we choose the kernel $k_\calX$ to be a log-Gaussian kernel for the purpose that the log-Gaussian kernel mean embedding has a closed form under log-normal distribution $\Pb_\theta = \operatorname{Lognormal}(\bar{m}, \bar{\sigma}^2)$ with $\bar{m} = \log \theta - \frac{\sigma^2}{2}(\zeta - \eta)$ and  $\bar{\sigma}^2 = \sigma^2 (\zeta - \eta)$. 
The log Gaussian kernel is defined as $k_\calX(x, x') = A_\calX \exp\left(-\frac{1}{2 l_\calX^2} (\log x - \log x')^2\right)$
and the kernel mean embedding has the form
\begin{align*}
    \mu_{\theta}(x) = A_\calX \left. \exp \left(-\frac{\bar{m}^2 + (\log x)^2 }{2(\bar{\sigma}^2 + l_\calX^2)}\right) x^{\frac{\bar{m}}{\bar{\sigma}^2 + l_\calX^2}} \middle/ \sqrt{1 + \frac{\bar{\sigma}^2}{l_\calX^2}} \right.
\end{align*}
The initial error, which is the integral of the kernel mean $\mu_{\theta}(x)$, does not have a closed form expression, so we use the empirical average as an approximation. Then, we choose the kernel $k_\Theta$ to be a Mat\'ern-3/2 kernel.

For this experiment, we also implement CBQ with Stein kernel. We use Mat\'ern-3/2 as the base kernel and then apply Stein operator to both arguments of the base kernel to obtain $k_\calX$. 
The reason we use Stein kernel is that Stein kernel has one order less smoothness than the base kernel, and since the integrand $f(x) = \psi(x)-\psi((1+s)x)$ is a combination of piecewise linear functions which is first-order weakly differentiable, but not second-order weakly differentiable, using a Stein kernel encodes more accurate prior information. 
\hudson{I am not confident in this.}
The kernel mean embedding of a Stein kernel is a
constant $c$ by construction as per the discussion in \Cref{appendix:practical_considerations}.
The kernel $k_\Theta$ is selected as Mat\'ern-3/2 kernel.
All hyperparameters in $k_\calX$ and $k_\Theta$ for CBQ and hyperparameters for baseline methods are selected according to \Cref{appendix:hyperparameter_selection}.

\subsubsection{Check the Assumptions in \Cref{thm:convergence}} 
We would like to check whether the assumptions made in \Cref{thm:convergence} hold in this experiment.
\begin{itemize}
    \item A1: Although $\calX=\R^+$ is not a compact domain, $\Pb_{\theta}$ is a lognormal distribution so the probability mass outside a large compact subset of $\calX$ decays faster than any polynomial. A similar argument can be made for $\Theta$ as well. A1 is approximately satisfied.
    \item A2: A2 is satisfied due to the sampling mechanism of $\theta_{1:T}$ and $\{x_{1:N}^t\}_{t=1}^T$.
    \item A3: $\Qb$ is a lognormal distribution so its density $q$ is upper bounded and strictly positive within a large compact subset of $\Theta$. $\Pb_\theta$ is also a lognormal distribution so its density $p_\theta$ is strictly positive within a large compact subset of $\calX$ and has finite second moment. A3 is approximately satisfied.
    \item A4: $f(x)$ is a combination of piecewise linear functions so $s_f = 1$ and $I(\theta)$ is infinitely differentiable so $s_I = \infty$. 
    When $k_\calX$ is a Stein kernel with a Mat\'ern-3/2 kernel as the base, $s_\calX = \frac{3}{2} + \frac{1}{2} - 1 = 1$ \hudson{not sure}. When $k_\calX$ is the log Gaussian kernel, \hudson{How do we know the smoothness?}. $k_\Theta$ is a Mat\'ern-3/2 kernel so $s_\Theta = \frac{3}{2} + \frac{1}{2} = 2$. A4 is satisfied.
    \item A5: $\lambda_\calX$ is picked to be $0$ and $\lambda_\Theta$ is found via grid search among $\{0.01, 0.1, 1.0\}$. A5 is satisfied.
\end{itemize}

\subsubsection{More Experimental Results}
We report more results in \Cref{appfig:finance} with fixed $T=10, 20, 30$ and increasing $N$, to showcase that CBQ consistently exhibits smaller RMSE than baseline methods. The conclusions that we draw from the main text also
hold for different values of $N$ and $T$ for this experiment.
The performance of CBQ is similar between $k_\calX$ being Stein kernel and $k_\calX$ being log Gaussian kernel. \hudson{Maybe there is more to add, after we figure out the smoothness of the lognormal kernel?}

\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=350pt]{figures/legend_finance.pdf}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/finance_T_10.pdf}
        \caption{RMSE with fixed $T=10$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/finance_T_20.pdf}
        \caption{RMSE with fixed $T=20$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/finance_T_30.pdf}
        \caption{RMSE with fixed $T=30$.}
    \end{subfigure}
    \caption{\emph{Option pricing in mathematical finance.} $T=10, 20, 30$ and increasing $N$.}\label{appfig:finance}
\end{figure}


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\subsection{Uncertainty Decision Making in Health Economics}\label{appendix:decision}
\subsubsection{Experimental Settings}
In the medical world, it is important to compare the cost and the relative advantage of conducting an extra medical experiment~\citep{brennan2007calculating}.
In the area of oil and gas reservoirs, a cost analysis is necessary before deciding whether to drill additional wells.
The expected value of partial perfect information (EVPPI) quantifies the expected gain from conducting extra experiments to obtain precise knowledge of some unknown variables \citep{brennan2007calculating}. 
EVPPI can be expressed as 
\begin{align*}
    \text{EVPPI} = \E \Bigl[\max_c I_c(\theta) \Bigr] - \max_c \E \Bigl[f_c(X, \theta) \Bigr], \text{   } I_c(\theta) = \int_{\calX} f_c(x, \theta) \Pb_\theta(dx)
\end{align*}
where $\mathcal{C}$ is a set of potential treatments, $c \in \mathcal{C}$, and $f_c$ measures the potential outcome of treatment $c$. Our method is applicable for estimating the conditional expectation $I_c(\theta)$ in the first term. 

We adopt the same experimental setup as delineated in \cite{Giles2019}, wherein $X$ and $\theta$ have a joint 19-dimensional Gaussian distribution, meaning that $\Pb_{\theta}$ is a Gaussian distribution. 
The specific meanings of all $X$ and $\theta$ are outlined in \Cref{tab:mytable}.
All these variables are independent except that $\theta_1, \theta_2, X_6, X_{14}$ are pairwise correlated with a correlation coefficient $0.6$.
The observations $\theta_{1:T}$ are sampled from the marginal Gaussian distribution $\Qb$ and then $N$ observations of $x^t_{1:N}$ are sampled from $\Pb_{\theta_t}$.

We are interested in a binary decision-making problem ($\calC = \{1, 2\}$) with $f_1(x, \theta)=10^4 (\theta_1 x_5 x_6 + x_7 x_8 x_{9})-(x_1 + x_2 x_3 x_4)$ and $f_2(x, \theta) = 10^4 (\theta_2 x_{13} x_{14} + x_{15} x_{16} x_{17})-(x_{10} + x_{11} x_{12} x_4)$. 
In computing EVPPI, we estimate $I_c(\theta)$ with CBQ and baselines, and then use standard Monte Carlo for the rest of the expectations.
We draw $10^6$ samples from the joint distribution to generate a pseudo ground truth, and evaluate the RMSE across different methods. 
Note that IS is no longer applicable here because $f_c$ now depends on both $x$ and $\theta$, so we only compare CBQ against KLSMC and LSMC.

For conditional Bayesian quadrature (CBQ), we need to carefully choose two kernels. First, we select Mat\'ern-3/2 for $k_\calX$ because the kernel mean embedding under the Gaussian distribution $\Pb_\theta = \calN(\tilde{m}, \tilde{\Sigma})$ has a closed form if we use the ``inverse transform trick'' as outlined in \Cref{appendix:practical_considerations}. 
Specifically, we first sample $u$ from $\calN(0, \Id_d)$, then calculate $x = \tilde{m} + L u$ where $L$ is the lower triangular matrix derived from the Cholesky decomposition $\tilde{\Sigma} = L L^\top$ of the covariance matrix. 
The integral now becomes
\begin{align}\label{appeq:transform}
    I_c(\theta) = \int_{\R^d} f_c(x, \theta)\calN(x; \tilde{m},\tilde{\Sigma}) dx = \int_{\R^d} f_c(\tilde{m} + L u, \theta) \calN(u; 0, \Id_d) du.
\end{align}
The closed form expression of kernel mean embedding for Mat\'ern-3/2 kernel and isotropic Gaussian can be found in the Appendix S.3 of \cite{ming2021linked}.
Then we pick $k_\Theta$. 
We know there is a high chance that $I_c(\theta)$ is infinitely differentiable, but we opt for a Mat\'ern-3/2 kernel to encode more conservative prior information on the smoothness of $I_c(\theta)$.
All hyperparameters in $k_\calX$ and $k_\Theta$ are selected according to \Cref{appendix:hyperparameter_selection}.

\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=250pt]{figures/legend.pdf}
    \vspace{-10pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/decision_T_10_no_legend.pdf}
        \caption{RMSE with fixed $T=10$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/decision_T_30_no_legend.pdf}
        \caption{RMSE with fixed $T=30$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/decision_T_50_no_legend.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}
    \caption{\emph{Uncertainty decision making in health economics.} $T=10, 30, 50$ and increasing $N$.}\label{appfig:decision}
\end{figure}


\begin{table}[t]
\centering
\begin{tabular}{
>{\centering\arraybackslash}p{1.5cm}
>{\centering\arraybackslash}p{1cm}
>{\centering\arraybackslash}p{1cm}
>{\centering\arraybackslash}p{5cm}}
\toprule
Variables & Mean & Std & Meaning \\
\midrule
$X_1$ & 1000 & 1.0 & Cost of treatment \\
$X_2$ & 0.1 & 0.02 & Probability of admissions \\
$X_3$ & 5.2 & 1.0 & Days of hospital \\
$X_4$ & 400 & 200 & Cost per day \\
$X_5$ & 0.3 & 0.1 & Utility change if response \\
$X_6$ & 3.0 & 0.5 & Duration of response \\
$X_7$ & 0.25 & 0.1 & Probability of side effects \\
$X_8$ & -0.1 & 0.02 & Change in utility if side effect \\
$X_{9}$ & 0.5 & 0.2 & Duration of side effects \\
$X_{10}$ & 1500 & 1.0 & Cost of treatment \\
$X_{11}$ & 0.08 & 0.02 & Probability of admissions \\
$X_{12}$ & 6.1 & 1.0 & Days of hospital \\
$X_{13}$ & 0.3 & 0.05 & Utility change if response \\
$X_{14}$ & 3.0 & 1.0 & Duration of response \\
$X_{15}$ & 0.2 & 0.05 & Probability of side effects \\
$X_{16}$ & -0.1 & 0.02 & Change in utility if side effect \\
$X_{17}$ & 0.5 & 0.2 & Duration of side effects \\
$\theta_1$ & 0.7 & 0.1 & Probability of responding \\
$\theta_2$ & 0.8 & 0.1 & Probability of responding \\
\bottomrule

\end{tabular}
\vspace{5pt}
\caption{Variables in the health economics experiment.}
\label{tab:mytable}
\end{table}

\subsubsection{Check the Assumptions in \Cref{thm:convergence}} 
We would like to check whether the assumptions made in \Cref{thm:convergence} hold in this experiment.
\begin{itemize}
    \item A1: 
    Although $\calX=\R^{17}$ is not a compact domain, $\Pb_{\theta}$ is a Gaussian distribution so the probability mass outside a large compact subset of $\calX$ decays exponentially. Similarly, $\Theta = \R^2$ is not a compact domain, but $\Qb$ is a Gaussian distribution so the probability mass outside a large compact subset of $\Theta$ decays exponentially. A1 is approximately satisfied.
    \item A2: A2 is satisfied due to the sampling mechanism of $\theta_{1:T}$ and $\{x_{1:N}^t\}_{t=1}^T$.
    \item A3: $\Qb$ is also a Gaussian distribution so its density $q$ is upper bounded and strictly positive on a compact and large domain. $\Pb_\theta$ is a Gaussian distribution so its density $p_\theta$ is strictly positive on a compact and large domain with finite second moment. A3 is approximately satisfied.
    \item A4: Both the integrand $f(x)$ and the conditional expectation $I_c(\theta)$ are infinitely differentiable, so $s_f = s_I = \infty$.
    \item A5: $\lambda_\calX$ is picked to be $0$ and $\lambda_\Theta$ is found via grid search among $\{0.01, 0.1, 1.0\}$. A5 is satisfied.
\end{itemize}

\subsubsection{More Experimental Results}
We report more results in \Cref{appfig:decision} with fixed $T = 10, 30, 50$ and increasing $N$, to showcase that CBQ consistently exhibits smaller RMSE than baseline methods.
The conclusions that we draw from the main text also hold for different values of $N$ and $T$ for this experiment.


\fxb{Again we need much more interpretation of the results, not just a statement of what is observed. Why are these observations interesting? Why is this happening? etc. etc.}


\subsection{Comparison of CBQ and MOBQ}\label{appendix:cbq_mobq}

In Section \ref{sec:cbq} in the main text, we mentioned the comparison of CBQ and multioutput Bayesian quadrature (MOBQ) in terms of their computational complexity. 
For $T$ parameter values $\theta_1, \cdots, \theta_T$ and $N$ samples from each probability distribution $\mathbb{P}_{\theta_1}, \ldots, \mathbb{P}_{\theta_T}$, the computational cost is $\calO(TN^3 + T^3)$ for CBQ and $\calO(N^3T^3)$ for MOBQ. 
We give a more thorough comparison of CBQ and MOBQ in this section.
% MOBQ is expected to have lower RMSE than CBQ, but the computational cost will get unbearably costly as $N$ or $T$ grows. 

When the integrand $f$ only depends on $x$ (Bayesian sensitivity analysis for linear models, option pricing in mathematical finance), MOBQ estimate only requires one kernel $k_\calX$. 
\begin{align*}
    I_{\mathrm{MOBQ}}(\theta^\ast) = \int_\calX k_\calX(x, x_{1:NT}) \Pb_{\theta^\ast}(dx) \Big(k_\calX(x_{1:NT}, x_{1:NT}) + \lambda_\calX \Id_{NT} \Big)^{-1} f(x_{1:NT})
\end{align*}
where $x_{1:NT}$ is a concatenation of $x_{1:N}^1, \cdots, x_{1:N}^T$.
When the integrand $f$ depends on both $x$ and $\theta$ (uncertainty decision making in health economics), MOBQ requires two kernels $k_\calX$ and $k_\Theta$.
\begin{align*}
\begin{aligned}
    I_{\mathrm{MOBQ}}(\theta^\ast) &= \Big( \int_\calX k_\calX(x, x_{1:NT})   \Pb_{\theta^\ast}(dx)  \odot k_\Theta(\theta^\ast, \theta_{1:NT}) \Big) \Big(k_\calX(x_{1:NT}, x_{1:NT}) \\ &\odot k_\Theta(\theta_{1:NT}, \theta_{1:NT})  + \lambda_\calX \Id_{NT} \Big)^{-1} f(x_{1:NT})
\end{aligned}
\end{align*}
where $\odot$ denotes element-wise product, and $\theta_{1:NT} = \left[\theta_1, \cdots, \theta_1, \cdots, \theta_T, \cdots, \theta_T \right] \in \R^{NT}$.
From the above two equations, we can see that the computation cost of $\calO(N^3T^3)$ mainly comes from the inversion of a $NT \times NT$ kernel matrix.
All the MOBQ hyperparameters in $k_\calX$ and $k_\Theta$ are selected by empirical Bayes in the same way as CBQ outlined in \Cref{appendix:hyperparameter_selection}.
It's crucial to note that MOBQ computational cost is significantly high for Stein kernel during hyperparameter selection, as evaluating the log marginal likelihood at every iteration would require the inversion of a $NT \times NT$ matrix.
Therefore, we do not include the experiment of Bayesian sensitivity analysis for the SIR model in this section.
All the hyperparameters for CBQ are reused as in \Cref{appendix:experiments}.

For Bayesian sensitivity analysis in linear models, the integrand is $f(x) = x^\top x$, the dimension is fixed $d=2$ and $T=50$.
In \Cref{appfig:mobq_bayes_sensitivity}, we can see that MOBQ indeed achieves lower RMSE at the beginning, but CBQ catches up when $N$ grows higher.
For option pricing in mathematical finance, we only compare MOBQ and CBQ when $k_\calX$ is the log Gaussian kernel and $T=20$.
For uncertainty decision making in health economics, we compare MOBQ and CBQ when $T=50$.
In \Cref{appfig:mobq_finance} and \Cref{appfig:mobq_decision}, we can see that CBQ and MOBQ achieve similar performance in terms of RMSE.
Additionally, in the second row of \Cref{appfig:mobq}, we compare the computational cost of MOBQ and CBQ, where we can see that the computational time of MOBQ is much larger than CBQ as $N$ grows across all settings, due to the complexity of $\calO(N^3T^3)$ for MOBQ. 


Additionally, as the main computational bottleneck of MOBQ is the inversion of the kernel matrix, it would be interesting to see if MOBQ combined with scalable GP methods can reduce the computational time while still preserving the same level of accuracy.
We report the performance of MOBQ (Nyström) in both \Cref{appfig:mobq_bayes_sensitivity} and \Cref{appfig:mobq_finance}, and we can see that MOBQ (Nyström) performs worse than CBQ in terms of RMSE.
The reason for the worse performance of MOBQ (Nyström) is that the use of scalable GP methods introduces an extra layer of approximation that slows down the convergence rate.
Additionally, most scalable GP methods are used in the ``regression'' setting, while quadrature methods like BQ or CBQ belong to the ``interpolation'' setting~\cite{kanagawa2018gaussian}, so the quadrature problem will be more sensitive to the approximation error introduced. 

\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=350pt]{figures/mobq_legend.pdf}
    \end{minipage}
    
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_sensitivity_nystrom.pdf}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_finance_nystrom.pdf}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_decision_T_50.pdf}
    \end{subfigure}
    \\
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_bayes_sensitivity_time_T_50.pdf}
        \caption{Bayesian sensitivity analysis for linear models.}
        \label{appfig:mobq_bayes_sensitivity}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_finance_time_T_20.pdf}
        \caption{Option pricing in mathematical finance.}
        \label{appfig:mobq_finance}
    \end{subfigure}
    \hfill
        \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_decision_time_T_50.pdf}
        \caption{Uncertainty decision making in health economics.}
        \label{appfig:mobq_decision}
    \end{subfigure}
    \hfill
    \caption{Comparison of CBQ and MOBQ in terms of RMSE and time (wall clock time). \textbf{Left (a):} Bayesian sensitivity analysis for linear models. \textbf{Middle (b):} Option pricing in mathematical finance. \textbf{Right (c):} Uncertainty decision making in health economics.}
    \label{appfig:mobq}
\end{figure}



\subsection{Quasi Monte Carlo}\label{appendix:QMC}

\fxb{This section lacks a lot of details. Is this only for the Bayesian sensitivity experiment? Its not clear as your previous subsection did a comparison across different experiments. I assume that might be the case, and if so you might want to add a sentence to say something along the lines of "We use an identical setting to the previous subsection, but only change the points $x_{1:N}^t$. }

Quasi Monte Carlo (QMC) is another line of research on improving the precision of approximating intractable integrals. 
While quadrature methods like BQ and CBQ aim at finding a smart way to combine the function values, QMC aims to find samples that can more uniformly cover the integration domain than random sampling~\citep{niu2023discrepancy, hickernell1998generalized, gerber2015sequential}. 
In the development of CBQ, we don't make any assumptions about the sampling of observations; specifically, we don't mandate i.i.d sampling. 
Therefore, it would be interesting to see whether combining quadrature algorithms with QMC could further improve the accuracy for estimating conditional expectation.

For a fair comparison in the experiment of Bayesian sensitivity analysis for linear models, we implement QMC sampling for all methods including CBQ and baseline methods. 
The samples $x_{1:N}^t$ are generated from a Sobol sequence which is a low-discrepancy sequence commonly used in QMC to cover the multidimensional space more uniformly than random sequences.
We are not using QMC to sample $\theta_{1:T}$ because i.i.d. sampling is required in the second stage for CBQ, KLSMC and LSMC. \hudson{I am not sure in this.}
We follow the technique introduced in randomized QMC~\cite{lemieux2004randomized} to shift the Sobol sequence by a random amount.

It can be observed in \Cref{appfig:qmc} that replacing random sampling with QMC significantly enhances the performance of baseline methods, such as LSMC and KLSMC, while only subtly improving the performance of CBQ. The limited degree of improvement seen in CBQ with QMC sampling can be attributed to the fact that CBQ already yields a remarkably low RMSE. Consequently, the margin of improvement offered by QMC sampling is not as evident in CBQ as in the baseline methods. We have only studied the effect of combining QMC and CBQ in the experiment of Bayesian sensitivity analysis in linear models. It would be interesting to see if combining QMC and CBQ would result in higher accuracy in other settings, and we leave it for future work. 

\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=350pt]{figures/legend_qmc.pdf}
    \end{minipage}
    
    \begin{subfigure}{0.33\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/bayes_sensitivity_qmc.pdf}
        \caption{Quasi Monte Carlo}
        \label{appfig:qmc}
    \end{subfigure}
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/ablation_kernel_x.pdf}
        \caption{Ablation on kernel $k_\Theta$}
        \label{appfig:ablation_theta}
    \end{subfigure}
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/ablation_kernel_y.pdf}
        \caption{Ablation on kernel $k_\calX$}
        \label{appfig:ablation_x}
    \end{subfigure}
    \caption{\textbf{Left:} Comparison of all methods with standard uniform sampling and Quasi-Monte Carlo methods. \textbf{Middle and Right:} Ablation study for CBQ with different $k_\Theta$ and $k_\calX$ kernels in Bayesian sensitivity analysis for linear models.}
\end{figure}


\subsection{Ablations on kernels}\label{appendix:ablation}
\hudson{I am now inclined to remove this paragraph, because I feel that the performance should be different under different kernels with different smoothness. I assume the reason that I do not see much difference here, is that the problem is too simple, so using any kernel would give a quite good result.}
We present an ablation study evaluating the impact of distinct kernel choices $k_\calX$ and $k_\Theta$ within the framework of Bayesian sensitivity analysis in linear models. The integrand is $f(x)=x^\top x$, the dimension $d=2$ and $N=T=50$. 
First, we choose $k_\Theta$ to be either a Mat\'ern-3/2 kernel or a Gaussian kernel. \Cref{appfig:ablation_theta} shows that the performance of CBQ remains consistent across different $k_\Theta$ kernels. 

Subsequently, we opt for a Mat\'ern-3/2 kernel, a Gaussian kernel and a Stein kernel (with Mat\'ern-3/2 as the base kernel) as choices for $k_\calX$. When $k_\calX$ is a Gaussian kernel, the formula for the kernel mean embedding $\mu_{\theta}(x)$ is presented in \Cref{appeq:E14}. When $k_\calX$ is a Mat\'ern-3/2 kernel, a closed-form expression for the kernel mean embedding does not exist for the non-isotropic Gaussian distribution $\calN(\tilde{m}, \tilde{\Sigma})$, but the `inverse transform trick' can be employed as in \Cref{appeq:transform}. 
When $k_\calX$ is a Stein kernel, we choose Mat\'ern-3/2 as the base kernel $k_0$ and then apply the Stein operator to both arguments of $k_0$.
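\Cref{appfig:ablation_x} shows that the performance of CBQ likewise varies little across these choices of $k_\calX$.

Although the choice of kernel has little effect in this simple setting, we would expect kernels of different smoothness to perform differently on more challenging problems.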
All kernel hyperparameters are selected according to \Cref{appendix:hyperparameter_selection}.

\fxb{In this section, you describe the results of the experiments (i.e. the plots), but you do not provide an interpretation of those results? Do we think what we observe generalises in any way? Any insights as to why we observe what we have observed here? Any connections with our theory?} 


\subsection{Calibration}\label{appendix:calibration}
CBQ falls in the area of probabilistic numerical algorithms that can provide finite-sample Bayesian quantification of uncertainty, where the uncertainty arises from having access to only a finite number of function values of the integrand.
Since CBQ is by nature a two-stage hierarchical Gaussian process method and the final estimate $I_{\textrm{CBQ}}$ is treated as Gaussian distributed, the variance $\sigma^2_{\textrm{CBQ}}$ is a measure of uncertainty~\cite{kendall2017uncertainties}.
The calibration plots in \Cref{appfig:calibration} are obtained by altering the width of the credible interval and then computing the percentage of times a credible interval contains the true value $I(\theta)$ under repetitions of the experiment.
The black diagonal line represents the ideal case, with any curve lying above the black line indicating underconfidence and any curve lying below indicating overconfidence.
It is generally considered preferable to be underconfident rather than overconfident. 

In \Cref{appfig:calibration_bayes_sensitivity}, we show the calibration of the CBQ posterior for the integrand $f(x)=x^\top x$ when dimension $d=2$. 
We observe that when the number of samples is as small as $10$, CBQ is overconfident, which can be explained by the poor performance of using empirical Bayes to select hyperparameters in the small sample regime. 
On the other hand, when $N$ and $T$ increase, CBQ becomes underconfident, meaning that our posterior variance is more inflated than needed from a frequentist viewpoint.
The calibration plots for other experiments are all shown in \Cref{appfig:calibration}, and the conclusions are consistent across different experiments.


\begin{figure}[t]
    \begin{subfigure}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/calibration_bayes.pdf}
        \caption{Calibration}
    \label{appfig:calibration_bayes_sensitivity}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/calibration_sir.pdf}
        \caption{Calibration}
        \label{appfig:calibration_sir}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/calibration_finance.pdf}
        \caption{Calibration}
        \label{appfig:calibration_finance}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.48\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/calibration_decision.pdf}
        \caption{Calibration}
        \label{appfig:calibration_decision}
    \end{subfigure}
    \caption{Calibration plots. \textbf{Top Left:}  Bayesian sensitivity analysis in linear models. \textbf{Top Right:} Bayesian sensitivity analysis for SIR model. \textbf{Bottom Left:} Option pricing in mathematical finance. \textbf{Bottom Right:} Uncertainty decision making in health economics.}
    \label{appfig:calibration}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%






\end{appendices}