\begin{appendices}
\crefalias{section}{appendix}
\crefalias{subsection}{appendix}
\crefalias{subsubsection}{appendix}

\setcounter{equation}{0}
\renewcommand{\theequation}{\thesection.\arabic{equation}}

\onecolumn

{\hrule height 1mm}
\vspace*{-0pt}
\section*{\LARGE\bf \centering Supplementary Material
}
\vspace{8pt}
{\hrule height 0.1mm}
% {\hrule height 0.3mm}
\vspace{24pt}

\section*{Table of Contents}
\vspace*{-10pt}
\startcontents[sections]
\printcontents[sections]{l}{1}{\setcounter{tocdepth}{2}}

\newpage

\section{Theoretical Results}\label{appendix:convergence_rate}

To validate our methodology, we established a rate at which the CBQ estimator converges to the true value of the conditional expectation $I$ in the $\calL^2(\Theta, \Qb)$ norm, $\|\hat{I}_\mathrm{CBQ} - I\|_{\calL^2(\Theta, \Qb)}=\big(\int_\Theta (\hat{I}_\mathrm{CBQ}(\theta) - I(\theta))^2 \Qb(\mathrm d \theta )\big)^{1/2}$, for $\Qb$ such that $\theta_t \sim \Qb$ for $t \in \{1,\dots,T\}$. This result was presented in the main text in~\Cref{thm:convergence}. In this section, we prove a more general version of that theorem (as well as several intermediate results), and expand on the technical background required.

Our proof can be broken into two main parts: studying the error from stage 1, and then studying the error from stage 2. To do so, we primarily build on two results:~\cite[Theorem 4]{wynne2021convergence} (used for stage 1), and~\cite[Theorem 4]{gogolashvili2023importance} (used for stage 2). Specifically,~\cite[Theorem 4]{gogolashvili2023importance} is used to establish a bound on $\|\hat{I}_\mathrm{CBQ} - I\|_{\calL^2(\Theta, \Qb)}$ in terms of $T$ (the number of samples in $\Theta$), and the largest value for BQ variance, $\max_{t \in \{1,\dots,T\}}\sigma^2_\mathrm{BQ}(\theta_t)$. Then,~\cite[Theorem 4]{wynne2021convergence} is used to bound the variance $\sigma^2_\mathrm{BQ}(\theta_t)$ for any $t \in \{1,\dots,T\}$ in terms of $N$ (the number of samples in $\calX$). 

In~\Cref{sec:connection_to_iwkrr}, we define the weight function $w(\theta)$ that establishes a connection between Stage 2 of the method and the setting of importance-weighted kernel ridge regression in~\cite[Theorem 4]{gogolashvili2023importance}. Then in~\Cref{sec:technical_assumptions} we present technical assumptions under which both~\cite[Theorem 4]{wynne2021convergence} and~\cite[Theorem 4]{gogolashvili2023importance} for the defined $w(\theta)$ hold. Finally, in~\Cref{sec:proof_of_convergence} we prove a more general form of~\Cref{thm:convergence} for $\lambda_\calX \geq 0$ and $\theta_{1:T}$ sampled from a distribution that does not necessarily have a density.

\subsection{Conditional Bayesian Quadrature and Importance-Weighted Kernel Ridge Regression}
\label{sec:connection_to_iwkrr}
In this section, we construct an importance-weighted kernel ridge regression estimator of the true $I(\theta)$ based on the BQ posterior means $\hat{I}_\mathrm{BQ}(\theta_t)$ for $t \in \{1, \dots, T\}$ obtained in the first stage. Then we show that it is equivalent to the CBQ estimator for a certain family of weight functions, and choose a specific weight function to be used in analysis of convergence.

Consider $\{(\theta_t, \hat{I}_\mathrm{BQ}(\theta_t))\}_{t=1}^T$ to be training data for the task of approximating the true function $I: \Theta \to \R$. The importance-weighted kernel ridge regression estimator~\citep{gogolashvili2023importance} with kernel $k_\Theta$ and weight function $w: \Theta \to \R$ is defined as
%
\begin{equation}
\label{eq:iw_krr}
    \hat{I}_\mathrm{IW}^{w,\lambda} = \argmin_{F \in \calH_\Theta} \Big\{  \frac{1}{T}\sum_{t=1}^T w(\theta_t) \big(F(\theta_t) - \hat{I}_\mathrm{BQ}(\theta_t)\big)^2 + \lambda \| F \|_{\calH_\Theta}^2 \Big\},
\end{equation}
%
where $\lambda>0$ is a regularisation parameter and $\mathcal{H}_{\Theta}$ the RKHS associated to $k_{\Theta}$. 
%\fxb{One thing which is a bit weird is that you are essentially implicitly assuming that $\hat{I}_{\text{BQ}}:\Theta \rightarrow \mathbb{R}$ is a smooth function in $\theta$. But I don't think this is necessarily true? e.g. you could have that $\hat{I}_{\text{BQ}}(\theta)$ and $\hat{I}_{\text{BQ}}(\theta+\epsilon)$ are not close to one another because the $x$ samples at $\theta$ led BQ to underestimate the integral whereas at $\theta+\epsilon$ then led BQ to overestimate the integral. } 
Convergence of $\hat{I}_\mathrm{IW}^{w,\lambda}$ to $I(\theta)$ was established in~\cite[Theorem 4]{gogolashvili2023importance} under a list of assumptions on $w$, $\lambda$, $k_\Theta$, and $I$. To apply this result to establish convergence of our method, we first introduce the necessary conditions on $w, \lambda$ under which $\hat{I}_\mathrm{IW}^{w,\lambda} = \hat{I}_\mathrm{CBQ}$, where $\hat{I}_\mathrm{CBQ}$ is the CBQ estimator proposed in Section~\ref{sec:cbq},
%
\begin{align*}
    \hat{I}_\mathrm{CBQ}(\theta) = k_\Theta(\theta, \theta_{1:T})^\top \big(k_\Theta(\theta_{1:T}, \theta_{1:T}) + \mathrm{diag}(\lambda_\Theta + \sigma^2_\mathrm{BQ}(\theta_{1:T})) \big)^{-1} \hat{I}_\mathrm{BQ}(\theta_{1:T}),
\end{align*}
%
where $\lambda_\Theta\geq0$ is the regularisation parameter, and $\hat{I}_\mathrm{BQ}(\theta_t)$ and $\sigma^2_\mathrm{BQ}(\theta_t)$, for $t \in \{1, \dots, T\}$, are BQ posterior mean and variance obtained in the first stage.

\begin{prop}
\label{res:iw_is_cbq}
   Suppose the following hold:
    \begin{enumerate}
        \item $w(\theta)>0$ for all $\theta \in \Theta$
        \item $\lambda T / w(\theta_t) = \lambda_\Theta + \sigma^2_\mathrm{BQ}(\theta_t)$ for all $t \in \{1, \dots, T\}$.
    \end{enumerate}
    Then,  $\hat{I}_\mathrm{IW}^{w,\lambda}=\hat{I}_\mathrm{CBQ}$.
%$w(\theta_t)=\tau/(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_{1:T}))$ for $t \in \{1,\dots,T\}$, and $\lambda = \tau / \lambda_\Theta $.
\end{prop}
\begin{proof}
%
Whenever $w(\theta_t)>0$ for all $t \in \{1,\dots,T\}$, through straightforward differentiation of the objective in~\Cref{eq:iw_krr} 
%\fxb{Maybe we should do this explicitly? Are there any assumptions needed to be allowed to do this.} 
one can show that $\hat{I}_\mathrm{IW}^{w,\lambda}$ is exactly the posterior mean in GP regression with heteroscedastic Gaussian noise,
%
\begin{align*}
    \hat{I}_\mathrm{IW}^{w,\lambda}(\theta) & = k_\Theta(\theta, \theta_{1:T})^\top \big(k_\Theta(\theta_{1:T}, \theta_{1:T}) + \lambda T \diag(1/w(\theta_{1:T})) \big)^{-1} \hat{I}_\mathrm{BQ}(\theta_{1:T}) \\
   & = k_\Theta(\theta, \theta_{1:T})^\top \big(k_\Theta(\theta_{1:T}, \theta_{1:T}) + \mathrm{diag}(\lambda_\Theta + \sigma^2_\mathrm{BQ}(\theta_{1:T})) \big)^{-1} \hat{I}_\mathrm{BQ}(\theta_{1:T}),
\end{align*}
%
where the second equality follows from our second assumption. This completes the proof.
\end{proof}

We are now ready to list further desiderata for $w$ and $\lambda$ that will allow us to apply~\cite[Theorem 4]{gogolashvili2023importance} to bound $\|\hat{I}_\mathrm{CBQ}-I\|_{\calL^2(\Theta, \Qb)}$.
\begin{itemize}
    \item \emph{$w(\theta)$ should be bounded above.} The convergence in the theorem in~\cite{gogolashvili2023importance} is stated in the $\calL^2(\Theta, \Qb_w)$ norm, where $\Qb_w$ is the measure defined as $\Qb_w(A) = \int_A w(\theta)\Qb(\mathrm{d} \theta)$, which must be finite and positive. By~\cite[Proposition 232D]{fremlin2000measure}, for $\Qb_w$ to be a finite positive measure, it is sufficient for $w(\theta)$ to be positive and bounded.
    \item \emph{$w(\theta)$ should be bounded below and away from zero by a value that grows as BQ variance reduces.} Provided $\inf_{\theta \in \Theta} w(\theta)=w_0 > 0$, we may use the bound on the $\calL^2(\Theta, \Qb_w)$ norm to bound the $\calL^2(\Theta, \Qb)$ norm, as $\| \cdot \|_{\calL^2(\Theta, \Qb)} \leq \sup_{\theta \in \Theta}\{\frac{1}{w(\theta)}\} \| \cdot \|_{\calL^2(\Theta, \Qb_w)} = \frac{1}{w_0} \| \cdot \|_{\calL^2(\Theta, \Qb_w)}$. Then, if $w_0$ grows as BQ variance reduces, we have a bound of $\calL^2(\Theta, \Qb)$ that shrinks as BQ in stage 1 gets more precise.
    %\fxb{Maybe we could simply use this version of the assumption in the proposition above?}
    \item \emph{$\lambda= \tau T^{-\beta}$ for $\tau$ sufficiently large and $\beta>0$}. One of the conditions in~\cite[Theorem 4]{gogolashvili2023importance} is that $\tau$ is larger than a value dependent on $\lambda_\Theta$ 
    %\fxb{This statement is a bit vague - what does "dependent on the problem" mean? Maybe you could state which part of the problem you are refering to}
    (but independent of $T$).
    %Therefore, we must parametrise $\lambda$ with a $\tau$ value that can be chosen at will. \fxb{generally i find this condition not very clear.}
\end{itemize}

We shall now propose a specific $w(\theta)$ and $\lambda$ that satisfy all the requirements above and in~\Cref{res:iw_is_cbq}.
%\fxb{Maybe worth providing some intuition; e.g. our construction guarantees that $w$ is a continuous function, and how the value depends on $\sigma^2_{\text{BQ}}$, and why that is desirable?}

\begin{prop}
Suppose $\lambda = \tau \lambda_\Theta / T$, where $\lambda_\Theta \propto T^{1-\beta}$ for some $\beta>0$, and $\tau > 0$. Further, suppose the weight function takes the form
\begin{align}
\begin{split}
\label{eq:weight_function}
    w(\theta) = \begin{cases}
        \tau(1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T})^{-1} & \text{ if } \|\theta - \theta_t\|_\Theta  \geq \varepsilon' \text{ for all } t \in \{1,\dots,T\} \\
        \tau A_t - \tau B_t \frac{\|\theta - \theta_t\|_\Theta }{\varepsilon'}, & \text{ for } t \text{ such that } \|\theta - \theta_t\|_\Theta  < \varepsilon', \\
    \end{cases}
\end{split}
\end{align}
%
for $\| \cdot\|_\Theta$ the Euclidean norm on $\Theta$, some fixed $0<\varepsilon' \leq \min_{i,j \in \{1,\dots,T\},\ i \neq j} \|\theta_i-\theta_j\|_\Theta$, the largest BQ variance $\sigma^2_{\mathrm{BQ}, T} = \max_{t \in \{1,\dots,T\}}\{\sigma^{2}_\mathrm{BQ}(\theta_t)\}>0$, and
%
\begin{align*}
    A_t = (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}  \qquad \text{and} \qquad
    B_t = (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1} - (1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T})^{-1}.
\end{align*}
%
Then, $\hat{I}^{w, \lambda}_\mathrm{IW} = \hat{I}_\mathrm{CBQ}$, and the desiderata above are satisfied.
%\fxb{I think the ordering of your statements are a bit weird (it was the same in the previous proposition). What you are doing is saying "statements XXX and YYY hold whenever ZZZ". I think its much clearer to say "Suppose ZZZ holds. Thenn (i) XXX holds, (ii) YYY holds". Its mostly about the wording being a bit confusing and makes me have to pause to check that I understood what you are saying}
\begin{proof}

It is easy to see that $w(\theta)$ is bounded above by $\tau$, and below by $\tau(1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T})^{-1}>0$ for any $\theta \in \Theta$. Moreover, $w(\theta_t) = \tau A_t = \tau (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$, so that $\lambda T/w(\theta_t) = 
\lambda_\Theta (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t)) = \lambda_\Theta + \sigma^2_\mathrm{BQ}(\theta_t)$ as required by~\Cref{res:iw_is_cbq}. 
\end{proof}
\end{prop}
The weight function is illustrated in~\Cref{fig:wtheta_illustration} for $\Theta \subset \R$. It is by no means a unique way to establish a useful connection between our setting of heteroscedastic GP regression, and importance-weighted kernel ridge regression: as will become more evident in the proofs in~\Cref{sec:proof_of_convergence}, one could use any $w(\theta)$ provided it satisfies the desiderata, and~\Cref{res:iw_is_cbq}. However, our proposed construction is simple and easy to visualise, and the parameter $\varepsilon'$ has no impact on the speed of convergence as we shall see in the results in~\Cref{sec:proof_of_convergence}.
\begin{figure}[H]
\centering \includegraphics[width=0.6\textwidth]{figures/wtheta_example_upd.pdf}
\caption{Illustration of $w(\theta)$ for $\Theta \subset \R$ and $T=2$. The weight is bounded below by $\tau\big(1 + \lambda_\Theta^{-1}\max_{t \in \{1,\dots,T\}}\{\sigma^{2}_\mathrm{BQ}(\theta_t)\}\big)^{-1}$, a value that grows as the variances $\sigma^{2}_\mathrm{BQ}(\theta_t)$ reduce. As the BQ estimates in Stage 1 get more accurate, $1/\min_t w(\theta_t)$ shrinks, tightening the bound.}
%\fxb{Is it worth clarifying where zero is on the y-axis.}
\label{fig:wtheta_illustration}
\end{figure}





% \subsubsection{OLD}

% \fxb{I wonder if we could restructure this appendix as follows: (1) we define importance-weighted kernel ridge regressions, (2) we provide a Proposition which says "the CBQ estimator is equivalent to importance-weighted kernel ridge regression with importance weight $w=...$". This would make the appendix a bit more formal and clear than it is now. }

% Recall the CBQ estimator proposed in Section~\ref{sec:cbq},
% %
% \begin{align*}
%     \hat{I}_\mathrm{CBQ}(\theta) = k_\Theta(\theta, \theta_{1:T})^\top \big(k_\Theta(\theta_{1:T}, \theta_{1:T}) + (\lambda_\Theta + \sigma^2_\mathrm{BQ}(\theta_{1:T})) \Id_T\big)^{-1} \hat{I}_\mathrm{BQ}(\theta_{1:T}),
% \end{align*}
% %
% where $\lambda_\Theta\geq0$ is the regularisation parameter, and $\hat{I}_\mathrm{BQ}(\theta_t)$ and $\sigma^2_\mathrm{BQ}(\theta_t)$, for $t \in \{1, \dots, T\}$, are BQ posterior mean and variance obtained in the first stage,
% %
% \begin{align*}
%     \hat{I}_\mathrm{BQ}(\theta_t) & = \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N}, \theta_t),\\
%     \sigma^2_\mathrm{BQ}(\theta_t) &= \mathbb{E}_{X,X'\sim \mathbb{P}_\theta}[k_{\calX}(X,X')] - \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N}).
% \end{align*}
% %
% It 
% %was pointed out in~\citep[Remark 2]{gogolashvili2023importance} \fxb{This sentence is confusing because it implies that \cite{gogolashvili2023importance} already knew about CBQ}, (and 
% can be seen through straightforward differentiation that the estimator $\hat{I}_\mathrm{CBQ}$ is the minimiser of the importance weighted kernel ridge regression loss over functions in the RKHS $\calH_\Theta$ induced by the kernel $k_\Theta$,
% %
% \begin{align*}
%     \hat{I}_\mathrm{CBQ} = \argmin_{F \in \calH_\Theta} \Big\{  \frac{1}{T}\sum_{t=1}^T \frac{\tau}{1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t)} \big(F(\theta_t) - \hat{I}_\mathrm{BQ}(\theta_t)\big)^2 + \frac{\tau}{T \lambda_\Theta} \| F \|_{\calH_\Theta}^2 \Big\},
% \end{align*}
% %
% for any $\tau>0$ provided $\lambda_\Theta + \sigma^2_\mathrm{BQ}(\theta_t)>0$.\footnote{We will keep $\tau$ as a free parameter for now, and use it in~\Cref{sec:proof_of_convergence} to ensure constraints on $\lambda_\Theta$ imposed by~\cite[Theorem 4]{gogolashvili2023importance} are satisfied} 
% %\fxb{are there any necessary conditions for this step?}
% Suppose $\theta_i$ are sampled from a probability measure $\Qb$ on $\Theta$. Then, 
% %
% \begin{align}
% \label{eq:p_te}
%     \Qb_w(A) = \int_A w(\theta)\Qb(\mathrm{d} \theta)
% \end{align}
% %
% defines a positive measure $\Qb_w$ on $\Theta$ for any positive $w(\theta) > 0$ for which the integral exists~\citep[ Proposition 232D]{fremlin2000measure}; further, if $w(\theta)$ is bounded, the measure is finite. Suppose we construct a $w(\theta)$ that satisfies these requirements, and is such that $w(\theta_t) = \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$. Then, since $\E [\hat{I}_\mathrm{BQ}(\theta_i)] = I(\theta_i)$, the importance-weighted loss can be considered a finite-sample approximation of
% \begin{align*}
%     \int_\Theta ( F(\theta) - I(\theta) )^2 w(\theta) \Qb(\mathrm{d} \theta) + \frac{\tau}{T \lambda_\Theta} \| F \|^2_{\calH_\Theta} = \int_\Theta ( F(\theta) - I(\theta) )^2 \Qb_w(\mathrm{d} \theta) + \frac{\tau}{T \lambda_\Theta} \| F \|^2_{\calH_\Theta}.
% \end{align*}

% \begin{align*}
%     \int_\Theta ( F(\theta) - I(\theta) )^2 \Qb_w(\mathrm{d} \theta) + \frac{1}{n} \| F \|^2_{\calH_\Theta}.
% \end{align*}
% \fxb{Unclear what $n$ is?}

% \fxb{I feel like the sequence of step above is not very clear. Could you break down the steps a bit more for the reader? I think a nice way to do this would be to write something like $\int_\Theta ( F(\theta) - I(\theta) )^2 \Qb_w(\mathrm{d} \theta) = \int_\Theta \frac{w(\theta)}{q(\theta)} ( F(\theta) - I(\theta) )^2 \Qb(\mathrm{d} \theta)$ (in fact where is the $q(\theta)$ term for you?), and then we are just doing a Monte Carlo approximation of the integral. The final step is then to repkace $I(\theta)$ with a finite sample approximation}

% \fxb{you claim that we have an "unbiased" finite sample approximation. But is it really unbiased once the estimate of $I(\theta)$ is squared? i.e. you are looking at the integral of  $(F(\theta)-I(\theta))^2$ and not $I(\theta)$}

% %
% \fxb{Maybe you could provide some intuition for how you decided to construct your choice of $w$?}
% Under a further assumption that the problem is well-specified, meaning $I(\theta) \in \calH_\Theta$, an upper bound on $\|\hat{I}_\mathrm{CBQ} - I\|_{\calL^2(\Theta, \Qb_w)}$ in terms of $T$ and $\sup_{\theta \in \Theta} w(\theta)$ was established in~\citep[Theorem 4]{gogolashvili2023importance}. To apply the result, we define $w(\theta)$ of convenient form that satisfies the requirements mentioned above, specifically $w(\theta) \in (0, W_0]$ for some $W_0<\infty$ and any $\theta \in \Theta$, and $w(\theta_t) = \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$ for some $t \in \{1, \dots,T\}$.\footnote{The integrability requirement is specific to $\Qb$ and will be assumed at a later stage.} Take $\sigma^{2}_\mathrm{BQ}(\theta_{t'}) = \max_{t \in \{1,\dots,T\}}\{\sigma^{2}_\mathrm{BQ}(\theta_t)\}>0$, and define 
% %
% \begin{align}
% \begin{split}
% \label{eq:weight_function}
%     w(\theta) = \begin{cases}
%         \tau(1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T})^{-1} & \text{ if } \|\theta - \theta_t\|_\Theta  \geq \varepsilon' \text{ for all } t \in \{1,\dots,T\} \\
%         \tau A_t - \tau B_t \frac{\|\theta - \theta_t\|_\Theta }{\varepsilon'}, & \text{ for } t \text{ such that } \|\theta - \theta_t\|_\Theta  < \varepsilon' \\
%     \end{cases}
% \end{split}
% \end{align}
% %
% for $\| \cdot\|_\Theta$ the Euclidean norm on $\Theta$, some fixed $0<\varepsilon' \leq \min_{i,j \in \{1,\dots,T\},\ i \neq j} \|\theta_i-\theta_j\|_\Theta$, and
% %
% \begin{align*}
%     A_t = (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}  \qquad \text{and} \qquad
%     B_t = (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1} - (1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T})^{-1}.
% \end{align*}
% %
% For $\Theta \subset \R$, such $w(\theta)$ is easily visualised, as can be seen in~\Cref{fig:wtheta_illustration}.
% \begin{figure}[H]
% \centering \includegraphics[width=0.6\textwidth]{figures/wtheta_example_upd.pdf}
% \caption{Illustration of $w(\theta)$ for $\Theta \subset \R$ \fxb{Is it worth clarifying where zero is on the y-axis.}}
% \label{fig:wtheta_illustration}
% \end{figure}
% It is easy to see that $w(\theta)$ is bounded above by $\tau \max_{t \in \{1,\dots,T\}} (1 + \lambda_\Theta^{-1} \sigma^{2}_\mathrm{BQ}(\theta_{t}))^{-1} < \tau$, and below by $\tau(1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T})^{-1}>0$ for any $\theta \in \Theta$, and $w(\theta_t) = \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$ as required. 

% Note that the weight $w(\theta)$ constructed here is by no means a unique way to establish a useful connection between our setting of heteroscedastic GP regression, and importance-weighted kernel ridge regression. As will become evident in the proofs in~\Cref{sec:proof_of_convergence}, one could use any $w(\theta)$ provided it satisfies $w(\theta_t) = \tau(1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$ for any $t$, is bounded below by some function of $\sigma^2_{\mathrm{BQ}, T}$, and is bounded above by an expression that does not grow in $T$ or $N$. Our proposed construction is simple and easy to visualise, and the parameter $\varepsilon'$ has no impact on the speed of convergence as we shall see in the results in~\Cref{sec:proof_of_convergence}.

\subsection{Technical Assumptions}
\label{sec:technical_assumptions}

Prior to presenting our findings, we present and justify the assumptions we have made. Throughout we use Sobolev spaces to quantify a function's smoothness. A Sobolev space $\calW^{2, s}(\calX, \mu)$, with $s>d/2$ and a measure $\mu$ on $\calX \subseteq \R^d$, consists of functions that satisfy certain conditions: they are square integrable under the measure $\mu$, and all weak derivatives up to and including order $s$ are also square integrable under $\mu$. Weak derivatives are a generalization of ordinary derivatives, allowing for functions that are not necessarily differentiable everywhere. 
%\fxb{Maybe you could write the formal definition? I'm thinking of the one under "multi-dimensional case" in \url{https://en.wikipedia.org/wiki/Sobolev_space} (although there are some small notational issues on the Wikipedia page to be weary of - especially for the bit where they define partial derivatives} \masha{I think this is excessive since we never work with the explicit definition..}
Further, we assume the kernels $k_\Theta, k_\calX$ are Sobolev kernels, meaning they induce Hilbert spaces that are norm-equivalent to Sobolev spaces. 
%\fxb{maybe we can state using mathematics what norm equivalence actually means?}

Mat\'ern kernels are important examples of Sobolev kernels: it is well-known that the RKHS of a Mat\'ern kernel of order $\nu_\Theta$ over an open, convex and bounded $\Theta \subset \R^p$ is norm-equivalent to the Sobolev space $\calW^{2,\nu_\Theta+p/2}(\Theta)$ when $\nu_\Theta+p/2 \in \mathbb Z$; this is proven in~\citep[Corollary 10.48]{Wendland2005}. For $\Theta=\R^p$, the result can be straightforwardly extended to fractional order Sobolev-Slobodeckij spaces, $\nu_\Theta+p/2 \in \mathbb R$: by~\cite[Corollary 10.13]{Wendland2005} the RKHS of a Mat\'ern kernel on $\R^p$ is norm-equivalent to a Bessel potential space, which in turn is norm-equivalent to the Sobolev-Slobodeckij space by~\cite[Section 7.62]{adams2003sobolev}. Finally, one can use an extension operator in~\cite[Theorems 6.1 and 6.7]{devore1993besov} to restrict the norm-equivalence result to open, convex and bounded $\Theta \subset \R^p$. We refer to~\cite{adams2003sobolev} for an in-depth treatment of Sobolev spaces and~\cite{berlinet2011reproducing} for general RKHS theory.

The following is a more general form of the assumptions in~\Cref{thm:convergence}: specifically, we allow for the case when $\theta_{1:T}$ came from a distribution that doesn't necessarily have a density, do not assume $\lambda_\calX=0$, and allow for higher granularity of the relative smoothness of $\calH_\Theta$ and $I$.
\begin{enumerate}[itemsep=0.1pt,topsep=0pt,leftmargin=*]
\item [B0] 
\begin{enumerate}
    \item[(a)] For any $\theta$, $f(x, \theta)$ lies in the Sobolev space $\calW^{2, s_f}(\calX)$. 
    \customlabel{as:app_true_f_smoothness}{B0.(a)} 
    \item[(b)] $I(\theta)$ lies in the Sobolev space $\calW^{2, s_I}(\Theta)$. 
    \customlabel{as:app_true_I_smoothness}{B0.(b)} 
\end{enumerate}
\item [B1] 
\begin{enumerate}
    \item[(a)] $\calX \subset \R^d$ is open, convex, and bounded.  
    \customlabel{as:app_domains_x}{B1.(a)} 
    \item[(b)] $\Theta\subset \R^p$ is open, convex, and bounded. 
    \customlabel{as:app_domains_theta}{B1.(b)} 
\end{enumerate}
\item [B2]
\begin{enumerate}
    \item[(a)] $\theta_t$ were sampled i.i.d. from some $\Qb$, and $\Qb$ is equivalent to the uniform distribution on $\Theta$, meaning $\Qb(A)=0$ for a set $A \subset \Theta$ if and only if $\operatorname{Unif}(A)=0$. 
    \customlabel{as:app_theta_samples}{B2.(a)} 
    \item[(b)] $x_{1:N}^t \sim \Pb_{\theta_t}$ for all $t \in \{1, \dots, T\}$.  
    \customlabel{as:app_x_samples}{B2.(b)}
\end{enumerate}
\item [B3] $\Pb_\theta$ has a density $p_\theta$ for any $\theta \in \Theta$, and the densities are such that $\inf_{\theta \in \Theta, x \in \calX} p_{\theta}(x)=\eta>0$ and $\sup_{\theta \in \Theta}\|p_{\theta}\|_{\calL^2(\calX)}=\eta_0<\infty$.
\customlabel{as:app_densities}{B3}
\item [B4] 
\begin{enumerate}
    \item[(a)] $k_\calX$ is a Sobolev kernel of smoothness $s_\calX \in (d/2, s_f]$. 
    \customlabel{as:app_kernel_x}{B4.(a)}
    \item[(b)] $k_\Theta$ is a Sobolev kernel of smoothness $s_\Theta \in (p/2, s_I]$. 
    \customlabel{as:app_kernel_theta}{B4.(b)}
    \item[(c)] For the integral operator $L'[f](\theta)=\int_\Theta f(\theta') k_\Theta(\theta, \theta') \Qb(\text{d} \theta')$ that maps $f \in \calL^2(\Theta, \Qb)$ to $L'[f] \in \calH_\Theta$, there is a $g \in \calL^2(\Theta, \Qb)$ such that $I={L'}^r g$ for some $r \in [1/2, 1]$. We denote $R_0=\| g \|_{\calL^2(\Theta, \Qb)}$. 
    \customlabel{as:app_src}{B4.(c)}
\end{enumerate}
\item [B5]
\begin{enumerate}
    \item[(a)] $\lambda_\Theta = cT^{1/(2r+1)}$ for some $c>0$, with $r$ as in Assumption~\ref{as:app_src}.
    \customlabel{as:app_regulariser_theta}{B5.(a)}
    \item[(b)] $\lambda_\calX \geq 0$. 
    \customlabel{as:app_regulariser_x}{B5.(b)}
\end{enumerate}
\end{enumerate}

Assumption B0 corresponds to conditions specified in the text of~\Cref{thm:convergence} prefacing the list of assumptions. In the kernel literature, Assumption~\ref{as:app_src} is known as a \emph{source condition}, and is frequently used to quantify the difficulty of the problem: the larger $r$, the easier it is to learn $I(\theta)$ with $k_\Theta$, and consequently we may expect faster convergence. The result in~\Cref{thm:convergence} is given for $r=1/2$: by Mercer's theorem, the source condition is satisfied for $r=1/2$ since the true function $I(\theta)$ lies in the RKHS of $k_\Theta$ by~\ref{as:app_true_I_smoothness} and~\ref{as:app_kernel_theta}. Intuitively, the smoother $I(\theta)$ is in relation to the RKHS of $k_\Theta$, the greater the value of $r \geq 1/2$ for which the source condition is satisfied will be. For further detail on integral operators, their $r$-powers, and the source condition, we refer to the overview in~\cite{gogolashvili2023importance}.

Crucially, in the proofs in the next section we will see that the assumptions imply that the setting of the model in Stage 1 satisfies the assumptions of~\cite[Theorem 4]{wynne2021convergence}, and the setting of the model in Stage 2 satisfies the assumptions of~\cite[Theorem 4]{gogolashvili2023importance}---the two key results we will use to prove the convergence rate of the estimator.

% \subsubsection{Discussion of Technical Assumptions}
% Crucially,~\cref{as:app_domains_x,as:app_domains_theta,as:app_theta_samples,as:app_x_samples,as:app_densities,as:app_kernel_x,as:app_kernel_theta,as:app_true_f_smoothness,as:app_true_I_smoothness,as:app_src,as:app_regulariser_theta,as:app_regulariser_x} imply that the setting of the model in Stage 1 satisfied the assumptions of~\cite[Theorem 4]{wynne2021convergence}, and the setting of the model in Stage 2 satisfied the assumptions of~\cite[Theorem 4]{gogolashvili2023importance}---the two key results we will use to prove the convergence rate of the estimator. We discuss and demonstrate this now.

% \paragraph{Assumptions 1-4 of~\cite[Theorem 4]{gogolashvili2023importance} for Stage 2}
% Assumption 1 states that the model is well-specified, $I(\theta) \in \calH_\Theta$, and is trivial in our case: it is well-known that $\calH_\Theta \simeq \calW^{2,\nu_\Theta+p/2}(\Theta)$: the RKHS of a Mat\'ern kernel of order $\nu_\Theta$ over an open, convex and bounded $\Theta \subset \R^p$ is norm-equivalent to the Sobolev space $W^{2,\nu_\Theta+p/2}(\Theta)$~\citep[Corollary 10.48]{Wendland2005}\footnote{Strictly speaking, the result in~\citep[Corollary 10.48]{Wendland2005} is stated for integer-order spaces only, meaning $\nu_\Theta+p/2 \in \mathbb Z$. 
% However, it can be straightforwardly extended to fractional-order Sobolev-Slobodeckij spaces using~\cite[Corollary 10.13]{Wendland2005} that says the RKHS of a Mat\'ern kernel on $\R^p$ is norm-equivalent to a Bessel potential space, which in turn is norm-equivalent to the Sobolev-Slobodeckij space by~\cite[Section 7.62]{adams2003sobolev}, and finally using an extension operator in~\cite[Theorems 6.1 and 6.7]{devore1993besov} to restrict this to open, convex and bounded subsets of $\R^p$.}. By~\cref{as:app_true_I_smoothness} $I(\theta) \in \calW^{2, s_I}(\Theta)$, and finally by the inclusion of Sobolev spaces $I(\theta) \in \calW^{2,\nu_\Theta+p/2}(\Theta)$.

% Proving that Assumptions 2, 3 and 4 in~\citep{gogolashvili2023importance} hold under~\cref{as:app_domains_x,as:app_domains_theta,as:app_theta_samples,as:app_x_samples,as:app_densities,as:app_kernel_x,as:app_kernel_theta,as:app_true_f_smoothness,as:app_true_I_smoothness,as:app_src,as:app_regulariser_theta,as:app_regulariser_x} for $w(\theta)$ requires more care; we state that they hold in the following lemmas.

% \begin{lemma}[Assumption 2 in~\citep{gogolashvili2023importance}]
% \label{lemma:assumption2}
%     Under the Assumption~\cref{as:app_src} it holds that $I=L^r g$, for the integral operator $L: \calL^2(\Theta, \Qb_w) \to \calH_\Theta$ and $\Qb_w$ as defined in~\Cref{eq:p_te}, and some $g \in \calL^2(\Theta, \Qb_w)$ of norm $R \leq \tau R_0$.
% \end{lemma}
% \begin{proof}
%     We assumed that the statement holds for $\Qb$ in Assumption~\cref{as:app_src}. By definition of $\Qb_w$, for any $\Qb_w$-integrable $g':\Theta \to \R$ it holds that $\int_\Theta g'(\theta) \Qb_w{\mathrm d \theta} = \int_\Theta g'(\theta) w(\theta) \Qb{\mathrm d \theta}$. Since $w(\theta)$ is bounded above and below away from zero, $\calL^2(\Theta, \Qb)$ is norm-equivalent to $\calL^2(\Theta, \Qb_w)$. Therefore, $g \in \calL^2(\Theta, \Qb)$ and the statement holds for $R\leq \tau R_0$ as $w(\theta) \leq \tau$ by construction.
% \end{proof}

% \begin{lemma}[Assumption 3 in~\citep{gogolashvili2023importance}]
% \label{lemma:assumption3}
%     For $w(\theta)$ as defined in~\Cref{eq:weight_function} and $q=1$, $W= \tau$, and $\sigma^2= \| \Theta \| \tau$ it holds for all $m \in \N$, $m \geq 2$, that
%     \begin{align*}
%         \left( \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \Qb(\mathrm{d} \theta) \right)^q \leq \frac{1}{2} m! W^{m-2} \sigma^2
%     \end{align*}
% %
% \end{lemma}
% \begin{proof}
% Since $w(\theta)$ is bounded from above, $\int_\Theta w(\theta)^m \Qb(\mathrm d \theta) < \| \Theta \| \tau^m \max_t (1 + \lambda_\Theta^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-m} < \| \Theta \| \tau$, and the result follows.
% \end{proof}

% \begin{lemma}[Assumption 4 in~\citep{gogolashvili2023importance}]
% \label{lemma:assumption4}
%     Under the Assumption~\cref{as:app_kernel_theta}, for $s'=p/(2\nu_\Theta + p)$ it holds that
%     \begin{align*}
%         E_{s'} = \max \left(1, \sup_{\lambda \in (0,1]} \sqrt{\sum_{i=1}^\infty \frac{\mu_i \lambda^{s'}}{\mu_i + \lambda}}  \right) < \infty.
%     \end{align*}
% %
% \end{lemma}
% \begin{proof}
% It is a standard result (see, for instance,~\citep[Section 3.3.4]{edmunds1996function}) that for $k_\Theta$ being a Mat\'ern kernel of order $\nu_\Theta$, the $i$-th eigenvalue decays at the rate of $i^{-\frac{2\nu_\Theta + p}{p}}$. As pointed out in the discussion after Assumption 4 in~\citep{gogolashvili2023importance}, this implies $E_{s'}< \infty$ holds for $s'=p/(2\nu_\Theta + p)$.
% \end{proof}

% \paragraph{Assumptions 1-5 of~\cite[Theorem 4]{wynne2021convergence} for Stage 1}



\subsection{Proofs of Theoretical Results}
\label{sec:proof_of_convergence}

Recall that our proof is broken down into two steps: a step for stage 1, and a step for stage 2. We are now ready to state the bound on $\| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb)}$ that we use for stage 2, and which is essentially a corollary of~\cite[Theorem 4]{gogolashvili2023importance}. This bound depends on the largest BQ variance $\max_{t \in \{1,\dots,T\}}\sigma^2_\mathrm{BQ}(\theta_t)$; we obtain a bound on BQ variance $\sigma^2_\mathrm{BQ}(\theta_t)$ for any $t$ in~\Cref{thm:bound_on_bq_var}. Combining the two results gives~\Cref{thm:convergence_generalised}, which is the generalised version of~\Cref{thm:convergence}. 

Before proving our corollary, we point out that $\Qb_w(A) = \int_A w(\theta) \Qb(\mathrm{d} \theta)$ is a finite positive measure and not necessarily a probability measure; meanwhile in the statement of~\cite[Theorem 4]{gogolashvili2023importance} $\Qb_w$ is asked to be a probability measure. This is not an issue since $\Qb_w$ being a probability measure is never used in the proof of~\cite[Theorem 4]{gogolashvili2023importance}---instead, the proof only asks that $\Qb_w$ be a finite positive measure. We will therefore make use of Theorem 4 for a finite positive measure $\Qb_w$.

%\fxb{We should probably comment on the fact that our constants are far from tight? In fact since the constants are not tight, should we actually not state them in the corollary? Stating them gives an impression of careful study of these constants, but this is definitely not true?}
\begin{cor}
\label{thm:krr_corollary}
    Suppose Assumptions~\ref{as:app_true_I_smoothness},~\ref{as:app_domains_theta},~\ref{as:app_theta_samples},~\ref{as:app_kernel_theta},~\ref{as:app_src}, and~\ref{as:app_regulariser_theta} hold. Then, for a fixed $\delta$, there is a $T_0(\delta)>0$ such that for all $T \geq T_0(\delta)$ with probability at least $1-\delta/2$ it holds that
    \begin{align*}
        \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb)} \leq K_1\left( \log\frac{12}{\delta} + K_2\right) (1 + c^{-1} T^{-\frac{1}{2r+1}} \sigma^2_{\mathrm{BQ}, T}) T^{-\frac{r}{2r+1}},
    \end{align*}
%
for $\sigma^2_{\mathrm{BQ}, T}=\max_{t \in \{1,\dots,T\}}\sigma^2_\mathrm{BQ}(\theta_t)$, $K_1$ that depends on $c$, $\Theta$, $I$, and $k_\Theta$, and $K_2$ that depends on $c$, $r$, $\Theta$, $I$, and $k_\Theta$.
\end{cor}

\begin{proof}
    First, we show the assumptions in~\cite[Theorem 4]{gogolashvili2023importance} hold for $R\leq \tau R_0$, the $r$ in Assumption~\ref{as:app_src}, $q=1$, $W= \tau$, and $\sigma^2= \tau^2$.
    %TODO \fxb{Maybe it would help to actually state the theorem you are using our notation? This would make it a lot easier to see what you are doing.}
    
    {Assumption 1 (Existence of the target function):} As discussed in~\cite{gogolashvili2023importance}, Assumption 1 holds if the model is well-specified, $I(\theta) \in \calH_\Theta$, and the RKHS induced by $k_\Theta$ is dense in $\calL^2(\Theta, \Qb_w)$. Both of these conditions hold by Assumption~\ref{as:app_true_I_smoothness}: the former holds by inclusion of Sobolev spaces as $s_\Theta \leq s_I$, and the latter holds as the RKHS is a Sobolev space $\calW^{2, s_\Theta}(\Theta)$, which is dense in $\calL^2(\Theta)$ for an open and bounded $\Theta$. Finally, $\calL^2(\Theta)$ is norm-equivalent to $\calL^2(\Theta, \Qb_w)$ as $w(\theta)$ is bounded above and below away from zero, and $\Qb$ is equivalent to the uniform distribution.
    
    {Assumption 2 (The smoothness of the target function):} We assumed that the Assumption holds for $\Qb$ in~\ref{as:app_src}. By definition of $\Qb_w$, for any $\Qb_w$-integrable $g':\Theta \to \R$ it holds that $\int_\Theta g'(\theta) \Qb_w(\mathrm d \theta) = \int_\Theta g'(\theta) w(\theta) \Qb(\mathrm d \theta)$. Since $w(\theta)$ is bounded above and below away from zero, $\calL^2(\Theta, \Qb)$ is norm-equivalent to $\calL^2(\Theta, \Qb_w)$. Therefore, $g \in \calL^2(\Theta, \Qb)$ and the Assumption holds for $R\leq \tau R_0$ as $w(\theta) \leq \tau$ by construction.
    
    {Assumption 3 (Importance-weighting function):} For $w(\theta)$ as defined in~\Cref{eq:weight_function} and $q=1$, $W= \tau$, and $\sigma^2= \tau^2$ it holds for all $m \in \N$, $m \geq 2$, that
    \begin{align*}
        \left( \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \Qb(\mathrm{d} \theta) \right)^q \leq \frac{1}{2} m! W^{m-2} \sigma^2
    \end{align*}
    since $w(\theta)$ is bounded from above, $\int_\Theta w(\theta)^m \Qb(\mathrm d \theta) \leq \tau^m \leq \frac{1}{2} m! \tau^m$.
    
    {Assumption 4 (Effective dimension):} It is a standard result (see, for instance,~\citep[Section 3.3.4]{edmunds1996function}) that for a Sobolev kernel $k_\Theta$ of smoothness $s_\Theta$, the $i$-th eigenvalue decays at the rate of $i^{-2s_\Theta/p}$. As pointed out in the discussion after Assumption 4 in~\citep{gogolashvili2023importance}, this implies Assumption 4 holds for $s'=p/(2s_\Theta)$.

    Therefore by~\citep[Theorem 4]{gogolashvili2023importance}, for $\lambda=\tau c \lambda_\Theta T^{-1} = \tau c T^{-(1-1/(2r+1))}$ and the weight function $w(\theta)$ defined in~\eqref{eq:weight_function}, we have that with probability at least $1-\delta/2$
    \begin{align*}
        \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb_w)} \leq T^{-\frac{r}{2r+1}} \left( 32 (M + \|I\|_{\calH_\Theta}) \tau c^{-\frac{1}{2}} \log\frac{6}{\delta} + \tau c^r R_0 \right)
    \end{align*}
    for $M=\sup_{\theta \in \Theta} k(\theta, \theta) \| I\|_{\calH_\Theta}$ and all $T \geq T_0$, where $T_0$ is the smallest value for which
    \begin{align}
    \label{eq:condition_on_c}
        c \in \left[8(\tau^{-1}+1)^{\frac{1}{2}}   \log\frac{12}{\delta}, \tau^{-1} T_0^{\frac{2r}{2r+1}}\right].
    \end{align}
%
%     \begin{align}
%     \label{eq:l2_in_pte}
%         \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb_w)} \leq T^{-r\beta} \left( 16 (M + \|I\|_{\calH_\Theta}) (W+\sigma E_{s'}^{1-q}) c^{-\frac{A}{2}} \log\frac{12}{\delta} + c^r R \right)
%     \end{align}
% %
%     provided\footnote{We omit the definition of $E_s$ intentionally as, since $q=1$, it is always raised to the power of zero in this work.}
%     \begin{align}
%     \label{eq:cond_on_c}
%         \lambda=\tau c T^{-(1-1/(2r+1))} \leq 1, \qquad
%         \tau c \geq \left( 64 (W+\sigma^2) E_{s'}^{2(1-q)} \log^2(12/\delta) \right)^{\frac{1}{A+1}},
%     \end{align}
% %
%     for the constants $W= \tau$, $\sigma^2= 
%     \tau^2$, $q=1$, $r \in [1/2, 1]$, $R \leq \tau R_0$, and $A=1$, $\beta=1/(2r+1)$. 
%     %\fxb{Why cant we write down these specific values then?} 
%     Then, the conditions on $c$ in~\eqref{eq:cond_on_c} become
%     \begin{align}
%     \label{eq:condition_on_c}
%         c \in \left[8(\tau^{-1}+1)^{\frac{1}{2}}   \log\frac{12}{\delta}, \tau^{-1} T^{\frac{2r}{2r+1}}\right].
%     \end{align}
% %
%     We denote the smallest $T$ for which this holds by $T_0$. The rate in~\eqref{eq:l2_in_pte} becomes
%     \begin{align*}
%         \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb_w)} \leq T^{-\frac{r}{2r+1}} \left( 32 (M + \|I\|_{\calH_\Theta}) \tau c^{-\frac{1}{2}} \log\frac{6}{\delta} + \tau c^r R_0 \right)
%     \end{align*}
    Since $\Qb_w(\mathrm d \theta) = w(\theta)\Qb(\mathrm d \theta)$, and $w(\theta) \geq \tau(1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T})^{-1}>0$ for all $\theta$, it holds that
    \begin{align*}
        \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb_w)} \geq \min_{\theta \in \Theta} w(\theta)\| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb)} \geq \tau(1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T})^{-1} \| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb)},
    \end{align*}
    and therefore
    \begin{align*}
        &\| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb)} \leq (1 + \lambda_\Theta^{-1} \sigma^2_{\mathrm{BQ}, T}) T^{-\frac{r}{2r+1}} \left( 32 (M + \|I\|_{\calH_\Theta}) c^{-\frac{1}{2}} \log\frac{6}{\delta} + c^r R_0 \right),
    \end{align*}
    %
    and we arrive at the statement of the corollary.
\end{proof}

The need to introduce $\tau$ in~\Cref{sec:connection_to_iwkrr} is clear now: without it, the condition on $c$ in~\Cref{eq:condition_on_c} may not hold. Since $\tau>0$ can be selected at will, we may set it to the smallest value for which~\Cref{eq:condition_on_c} holds.

Next, we establish a bound on $\sigma^2_\mathrm{BQ}(\theta_t)$ for any $t \in \{1,\dots,T\}$.

\begin{theorem}
\label{thm:bound_on_bq_var}
    Suppose Assumptions~\ref{as:app_true_f_smoothness},~\ref{as:app_domains_x},~\ref{as:app_x_samples},~\ref{as:app_densities},~\ref{as:app_kernel_x}, and~\ref{as:app_regulariser_x} hold. Then for any $\delta \in (0, 1)$ there is an $N_0>0$ such that for all $N \geq N_0$ with probability at least $1-\delta/2$ it holds that
    \begin{align*}
        \sigma^2_\mathrm{BQ}(\theta_t) \leq  \lambda_\calX + \frac{4}{\delta^2}\eta^2_0 K^2_3 K_4^d N^{-1 + 2\varepsilon} \left( K_4^{s_\calX-\frac{d}{2}} N^{-\frac{s_\calX}{d}+\frac{1}{2} + \varepsilon} + \lambda_\calX \right)^2
    \end{align*}
    for any $t \in \{1,\dots,T\}$, any arbitrarily small $\varepsilon>0$, and $K_3$, $K_4$ independent of $N,t,\varepsilon$.
\end{theorem}
\begin{proof}
Recall
%
\begin{align*}
    \hat{I}_\mathrm{BQ}(\theta_t) & = \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N}, \theta_t),\\
    \sigma^2_\mathrm{BQ}(\theta_t) &= \mathbb{E}_{X,X'\sim \mathbb{P}_\theta}[k_{\calX}(X,X')] - \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N}).
\end{align*}
%
We seek to bound $\sigma^2_\mathrm{BQ}(\theta_t)$.~\citep[Proposition 3.8]{kanagawa2018gaussian} pointed out that the Gaussian noise posterior is the worst-case error in the $\calH_{\calX}^{\lambda_\calX}$, the RKHS induced by the kernel $k_\calX^{\lambda_\calX}(x, x') = k_\calX(x, x') + \lambda_\calX$. Through straightforward algebraic manipulations and using the reproducing property, one can show that
%
\begin{align}
\label{eq:variance_bound_proof_1}
    \sigma^2_\mathrm{BQ}(\theta_t) - \lambda_\calX = \mathrm{MMD}^2(\hat{\Pb}^N_\theta, \Pb_\theta; \calH_{\calX}^{\lambda_\calX})=\sup_{\|f\|_{\calH_{\calX}^{\lambda_\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|^2,
\end{align}
%
for the empirical measure $\hat{\Pb}^N_\theta = w^{\lambda_\calX}_t \delta_{x^t_{1:N}}$, where $\delta_{x^t_i}$ for all $i$ is the Dirac delta distribution, $\delta_{x^t_{1:N}} = [\delta_{x^t_1} \dots \delta_{x^t_N}]^\top$ is our usual vector notation used throughout this work, and the weights are the optimal BQ weights $w^{\lambda_\calX}_t=\left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N})$. 

Since $\calH_\calX^{\lambda_\calX}$ is induced by the sum of kernels, $k_\calX^{\lambda_\calX}(x, x') = k_\calX(x, x') + \lambda_\calX$, it holds that $\calH_\calX \subseteq \calH_\calX^{\lambda_\calX}$, and $\| f  \|_{\calH_\calX^{\lambda_\calX}} \leq \| f  \|_{\calH_\calX}$~\citep[Theorem I.13.IV]{aronszajn1950theory}. Therefore, the class of functions $f$ for which $\| f  \|_{\calH_\calX} \leq 1$ is larger than that for which $\| f  \|_{\calH_\calX^{\lambda_\calX}} \leq 1$, and
%
\begin{align}
\label{eq:variance_bound_proof_2}   \sup_{\|f\|_{\calH_{\calX}^{\lambda_\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right| \leq \sup_{\|f\|_{\calH_{\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|.
\end{align}
%
Next, note that for $\hat{f}_t(x) = k(x, x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N})$,
%
\begin{align}
\begin{split}
\label{eq:variance_bound_proof_3}
   \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right| = \left| \int_\calX \left(\hat{f}_t(x) - f(x) \right) \Pb_\theta(\mathrm d x)\right| &\leq  \int_\calX \left|\hat{f}_t(x) - f(x) \right| \Pb_\theta(\mathrm d x)  \\
    & \leq \|\hat{f}_t - f\|_{\calL^2(\calX)} \|p_\theta \|_{\calL^2(\calX)},
\end{split}
\end{align}
%
where the last inequality is an application of H\"older's inequality. By Assumption~\ref{as:app_densities}, $\|p_\theta \|_{\calL^2(\calX)}$ is bounded above by $\eta_0$. In order to apply~\citep[Theorem 4]{wynne2021convergence} to bound $\|\hat{f}_t - f\|_{\calL^2(\calX)}$, we show the assumptions of that Theorem hold.
    
{Assumption 1 (Assumptions on the Domain):} An open, bounded, and convex $\calX$ satisfies the assumption, as discussed in~\cite{wynne2021convergence}.
    
{Assumption 2 (Assumptions on the Kernel Parameters) and Assumption 3 (Assumptions on the Kernel Smoothness Range):} Our setting is more specific than the one in~\citep[Theorem 4]{wynne2021convergence}: the kernel $k_\calX$ is Mat\'ern, and therefore all smoothness constants mentioned in Assumptions 2 and 3 have the same value, $s_\calX$.

{Assumption 4 (Assumptions on the Target Function and Mean Function):} The target function $f$ was assumed to have higher smoothness than $k_\calX$ in~\ref{as:app_true_f_smoothness}, and~\ref{as:app_kernel_x}; the mean function was taken to be zero.

{Assumption 5 (Additional Assumptions on Kernel Parameters):} By \ref{as:app_kernel_x} and~\ref{as:app_true_f_smoothness} the smoothness of the true function $s_f \geq s_\calX >d/2$, which verifies both statements in the Assumption since all smoothness constants of the kernel are equal to $s_\calX$.

Therefore~\citep[Theorem 4]{wynne2021convergence} holds, and for $\calW_2^0(\calX)=\calL^2(\calX)$
%
\begin{align*}
    \|\hat{f}_t - f\|_{\calL^2(\calX)} \leq K_3 h_{x_{1:N}^t}^{\frac{d}{2}} \left( h_{x_{1:N}^t}^{s_\calX-\frac{d}{2}} + \lambda_\calX \right),
\end{align*}
%
for any $N$ for which the fill distance $h_{x_{1:N}^t} \leq h_0$ for some $h_0$, and $K_3$ and $h_0$ that depend on $\calX, s_f, s_\calX$. 

Since $x_i^t \sim \Pb_{\theta_t}$, we can guarantee that $h_{x_{1:N}^t} \leq h_0$ with high probability using~\citep[Lemma 2]{oates2019convergence}, which says that provided the density satisfies $\inf_{x} p_{\theta_t}(x)>0$, there is a $C_t$ such that $\E h_{x_{1:N}^t} \leq C_t N^{-1/d + \varepsilon}$ for an arbitrarily small $\varepsilon>0$, where $C_t$ depends on $t$ through $\inf_{x} p_{\theta_t}(x)$: the smaller $\inf_{x} p_{\theta_t}(x)$, the larger $C_t$. Since we assumed $\inf_{x, \theta} p_{\theta}(x)=\eta>0$, there is a $K_4$ such that $C_t\leq K_4$ for any $t$. Therefore, we may take $N_0$ to be the smallest $N$ for which $\E h_{x_{1:N}^t} \leq K_4 N^{-1/d + \varepsilon}$ holds, and have for all $N \geq N_0$
%
\begin{align*}
    \E_{x^t_{1:N} \sim \Pb_{\theta_t}}\|\hat{f}_t - f\|_{\calL^2(\calX)} \leq K_3 K_4^{\frac{d}{2}} N^{-\frac{1}{2} + \varepsilon} \left( K_4^{s_\calX-\frac{d}{2}} N^{-\frac{s_\calX}{d} + \frac{1}{2} + \varepsilon} + \lambda_\calX \right)
\end{align*}
%
By Markov's inequality, for any $\delta/2 \in (0,1)$ it holds with probability at least $1-\delta/2$ that
%
\begin{align}
\label{eq:variance_bound_proof_4}
    \|\hat{f}_t - f\|_{\calL^2(\calX)} \leq \frac{2}{\delta}K_3 K_4^{\frac{d}{2}} N^{-\frac{1}{2} + \varepsilon} \left( K_4^{s_\calX-\frac{d}{2}} N^{-\frac{s_\calX}{d} + \frac{1}{2} + \varepsilon} + \lambda_\calX \right)
\end{align}
%
Putting together~\Cref{eq:variance_bound_proof_1,eq:variance_bound_proof_2,eq:variance_bound_proof_3,eq:variance_bound_proof_4} and Assumption~\ref{as:app_densities}, we get the result,
\begin{align*}
    \sigma^2_\mathrm{BQ}(\theta_t) - \lambda_\calX &=\sup_{\|f\|_{\calH_{\calX}^{\lambda_\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|^2 \\
    &\leq \sup_{\|f\|_{\calH_{\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|^2 \\
    &\leq \sup_{\|f\|_{\calH_{\calX}} \leq 1} \|\hat{f}_t - f\|^2_{\calL^2(\calX)} \|p_\theta \|^2_{\calL^2(\calX)} \\
    &\leq \frac{4}{\delta^2}\eta^2_0 K^2_3 K_4^d N^{-1 + 2\varepsilon} \left( K_4^{s_\calX-\frac{d}{2}} N^{-\frac{s_\calX}{d} + \frac{1}{2} + \varepsilon} + \lambda_\calX \right)^2.
\end{align*}
\end{proof}

We are now ready to state our main convergence result, which is a more general version of~\Cref{thm:convergence}.

\begin{theorem}
\label{thm:convergence_generalised}
    Suppose all technical assumptions in~\Cref{sec:technical_assumptions} hold. Then for any $\delta \in (0, 1)$ there is a $T_0(\delta)>0$ and an $N_0>0$ such that for any $N \geq N_0$ and $T \geq T_0$, with probability at least $1-\delta$ it holds that
    \begin{align*}
        &\| \hat I_\mathrm{CBQ}  - I \|_{\calL^2(\Theta, \Qb)} \leq K_1\left( \log\frac{12}{\delta} + K_2\right) \\
        & \qquad \times\left(1 + c^{-1} T^{-\frac{1}{2r+1}} \left( \lambda_\calX + \frac{4}{\delta^2} \eta^2_0 K^2_3 K_4^d N^{-1 + 2\varepsilon} \left( K_4^{s_\calX-\frac{d}{2}} N^{-\frac{s_\calX}{d} + \frac{1}{2} + \varepsilon} + \lambda_\calX \right)^2 \right)\right) T^{-\frac{r}{2r+1}},
    \end{align*}
    for any arbitrarily small $\varepsilon>0$, and constants $K_1, K_2, K_3, K_4$ independent of $N, T, \delta, \varepsilon$.
\end{theorem}
\begin{proof}
Recall that for any two events $A$ and $B$,
\begin{align*}
    \Pb(A \cap B) = 1 - \Pb(\neg A \cup \neg B) \geq 1 - \Pb(\neg A) - \Pb(\neg B) = \Pb(A) + \Pb(B) - 1.
\end{align*}
Taking $A$ to be the event in~\Cref{thm:krr_corollary}, and $B$ to be the event in~\Cref{thm:bound_on_bq_var},
%
\begin{align*}
        A&=\left\{\| \hat I_\mathrm{CBQ} - I \|_{\calL^2(\Theta, \Qb)} \leq K_1\left( \log\frac{12}{\delta} + K_2\right) \left(1 + c^{-1} T^{-\frac{1}{2r+1}} \sigma^2_{\mathrm{BQ}, T}\right) T^{-\frac{r}{2r+1}}\right\}, \\
        B&=\left\{\text{for all $t$, }\sigma^2_\mathrm{BQ}(\theta_t) \leq  \lambda_\calX + \frac{4}{\delta^2}\eta^2_0 K^2_3 K_4^d N^{-1 + 2\varepsilon} \left( K_4^{s_\calX-\frac{d}{2}} N^{-\frac{s_\calX}{d} + \frac{1}{2} + \varepsilon} + \lambda_\calX \right)^2\right\},
\end{align*}
%
we get the result.
\end{proof}

As discussed in the main text, convergence is fastest when the regulariser $\lambda_\calX$ is set to $0$; $\lambda_\calX>0$ ensures greater stability at the cost of a lower speed of convergence. For clarity we show how~\Cref{thm:convergence} in the main text follows from the more general~\Cref{thm:convergence_generalised} by setting $\lambda_\calX=0$. 

\begin{proof}[Proof of~\Cref{thm:convergence}] Take $\lambda_\calX=0$, $C_1(\delta)=K_1( \log(12/\delta) + K_2)=\calO(\log(1/\delta))$, $C_2(\delta)=4c^{-1}K_1( \log(12/\delta) + K_2)\eta^2_0 K^2_3 K_4^{2s_\calX}/\delta^2=\calO((1/\delta^2)\log(1/\delta))$, and $r=1/2$  in~\Cref{thm:convergence_generalised}. The result follows.
\end{proof}

%\textcolor{red}{Lastly, we highlight that our proof strategy readily extends to the case where Stage 1 approximates the integrals through Monte Carlo sampling, instead of Bayesian quadrature. In that case, the variance term $\sigma^2_\mathrm{MC}(\theta_{t'})$ in~\Cref{thm:krr_corollary} may be bounded as $\calO(N^{-1})$, and the final rate takes the form $\calO(T^{-\frac{r}{2 r + 1}}) + \calO(T^{-\frac{r+1}{2r+1}} N^{-1})$. As $2s_\calX/d$ will be greater than $1$ in practice---and considerably greater if $x \mapsto f(x, \theta)$ is smooth---this shows the benefit of using BQ in Stage 1 of the method.}

% \subsection{A convergence rate for KLSMC}

% \masha{This is not finished!}

% Recall the KLSMC estimator,
% %
% \begin{align}
%     \hat I_\mathrm{KLSMC}(\theta) = k_\Theta(\theta, \theta_{1:T}) \left( k_\Theta(\theta_{1:T}, \theta_{1:T}) + \sigma^2 \Id_T \right)^{-1} \hat I_\mathrm{MC}(\theta_{1:T}) 
% \end{align}
% %
% where, for $x_j^{(i)} \sim \Pb_{\theta_i}$, $j \in \{1,\dots ,n\}$, the MC estimator of the integral at $\theta_i$ is
% %
% \begin{align}
%     \hat I_\mathrm{MC}(\theta_i) = \frac{1}{n} \sum_{j=1}^n f(x_j^{(i)}, \theta_i).
% \end{align}
% %
% This is the setting of Gaussian process regression with misspecified likelihood. By law of large numbers, $|\hat I_\mathrm{MC}(\theta_i) - I(\theta_i)|$ has a normal distribution $\calN(0, \mathrm{Var}[I_\mathrm{MC}(\theta_i)])$ with mean $0$ and unknown variance of the MC estimator $\mathrm{Var}[I_\mathrm{MC}(\theta_i)]$. In general, it will not hold that $\mathrm{Var}[I_\mathrm{MC}(\theta_i)]=\sigma^2$, and therefore, we have Gaussian process regression with misspecified likelihood, which is the setting of results in~\cite[Section 4.3.1]{wynne2021convergence}.

% \begin{align}
%     \E_{\varepsilon_1}\dots \E_{\varepsilon_T} \|\hat I_\mathrm{MC}(\theta) - I(\theta)\|_{\calL^2(\Theta)} \leq K_1 h_{\theta_{1:T}}^{p/2} \left( h_{\theta_{1:T}}^{\nu_\Theta} + \sigma^2\right),
% \end{align}

% Therefore, we may take $T_0$ to be the smallest $T$ for which $\E h_{\theta_{1:T}} \leq K_2 T^{-1/p + \varepsilon}$ holds, and have for all $T \geq T_0$

% By Markov's inequality, for any $\delta/2 \in (0,1)$ it holds with probability at least $1-\delta/2$ that
% %
% \begin{align}
% \label{eq:variance_bound_proof_4}
%     \|\hat I_\mathrm{MC}(\theta) - I(\theta)\|_{\calL^2(\Theta)} \leq \frac{2}{\delta} K_1 K_2^{p/2} T^{-1/2 + \varepsilon} \left( K_2^{\nu_\Theta} T^{-\nu_\Theta/p + \varepsilon} + \sigma^2\right)
% \end{align}

% \begin{align}
%     \varepsilon = [\varepsilon_1, \dots, \varepsilon_T]
% \end{align}
% \begin{align}
%     \varepsilon_i \sim \calN(0, \mathrm{Var}[I_\mathrm{MC}(\theta_i) - I(\theta_i)])
% \end{align}

\subsection{Mat\'ern Kernels are Sobolev Kernels}
\label{sec:on_sobolev_kernels}

Throughout this work, we assume the kernels $k_\calX$ and $k_\Theta$ are Sobolev kernels, meaning they induce a Hilbert space norm-equivalent to some Sobolev space. In this section, we expand on important examples of Sobolev kernels: Mat\'ern kernels.

It is well-known that the RKHS of a Mat\'ern kernel of order $\nu_\Theta$ over an open, convex and bounded $\Theta \subset \R^p$ is norm-equivalent to the Sobolev space $W^{2,\nu_\Theta+p/2}(\Theta)$ when $\nu_\Theta+p/2 \in \mathbb Z$; this is proven in~\citep[Corollary 10.48]{Wendland2005}. For $\Theta=\R^p$, the result can be straightforwardly extended to fractional order Sobolev-Slobodeckij spaces, $\nu_\Theta+p/2 \in \mathbb R$: by~\cite[Corollary 10.13]{Wendland2005} the RKHS of a Mat\'ern kernel on $\R^p$ is norm-equivalent to a Bessel potential space, which in turn is norm-equivalent to the Sobolev-Slobodeckij space by~\cite[Section 7.62]{adams2003sobolev}. Finally, one can use an extension operator in~\cite[Theorems 6.1 and 6.7]{devore1993besov} to restrict the norm-equivalence result to open, convex and bounded $\Theta \subset \R^p$.

\section{Practical Considerations for Conditional Bayesian Quadrature}\label{appendix:practical_considerations}

We now discuss important practical considerations which can have significant impact on the performance of CBQ. Firstly, in \Cref{appendix:tractable_kernel_means} we discuss how to ensure a closed-form expression for kernel mean embeddings and initial errors of BQ estimators. Then, we discuss the selection of all kernel hyperparameters in \Cref{appendix:hyperparameter_selection}.

\subsection{Tractable Kernel Means}\label{appendix:tractable_kernel_means}

In the main text, we discussed the requirement for both BQ and CBQ that the kernel mean embedding $\mu$ and its integral (called the initial error) are known in closed form. A list of well-known pairs can be found in Table 1 in \citep{fx_quadrature} or the \texttt{ProbNum} package \citep{Wenger2021}.
However, even when none of these pairs are appropriate for the problem at hand, there are still multiple solutions:

\begin{itemize}
    \item First, for a fixed $k$, when the embedding of $\mathbb{P}$ is intractable but the embedding of some other distribution $\mathbb{Q}$ is known, we can use the `importance sampling trick' which consists of writing the integral as $I=\mathbb{E}_{X \sim \mathbb{P}} [f(X)] = \mathbb{E}_{X \sim \mathbb{Q}} [g(X)]$ where $g(x)=f(x)p(x)/q(x)$ and $p,q$ are the densities of $\mathbb{P},\mathbb{Q}$. This allows us to use BQ on the integral of $g$, which is tractable by construction.

\vspace{2mm}

\item Secondly, again for a fixed $k$ and assuming that we know the quantile function $\Phi^{-1}$ of the distribution $\mathbb{P}$ and that the embedding of the uniform distribution is available, we can use the `inverse transform trick' which consists of writing $I=\mathbb{E}_{X \sim \mathbb{P}} [f(X)] = \mathbb{E}_{U \sim \mathbb{U}} [g(U)]$ where $g(u) = f(\Phi^{-1}(u))$ and $\mathbb{U}$ is a uniform distribution on some hypercube. Once again, BQ can now be applied to the transformed problem.

\vspace{2mm}

\item Finally, for any distribution $\mathbb{P}$ whose density is known up to the normalisation constant (for example most posterior distributions), specialised kernels with closed-form embeddings can be constructed. This is true of Stein reproducing kernels~\cite{anastasiou2023stein}. Suppose we have a distribution $\mathbb{P}$ with density $p:\calX \rightarrow \R^+$ and a function $f:\calX \rightarrow \R$ with the property that $\lim_{\|x\| \to \infty} p(x)f(x) = 0$. The Langevin Stein kernel $k_p: \calX \times \calX \to \R$ \citep{anastasiou2023stein} associated with a base kernel $k$ is given by:
\begin{align*}
    k_p(x,x') & := \nabla_x \log p(x)^\top k(x, x' ) \nabla_{x'} \log p(x') + \nabla_{x} \log p(x) ^\top \nabla_{x'} k(x, x') 
    \\ & \qquad + \nabla_{x'} \log p(x') ^\top \nabla_x k(x, x') + \nabla_x \cdot \nabla_{x'} k(x, x'),
\end{align*}
where $\nabla_x = (\partial/ \partial x_1, \dots, \partial/ \partial x_d)^\top$ and $\nabla_x \cdot \nabla_{x'} k(x, x') = \sum_{i=1}^d \frac{\partial^2 k(x, x')}{\partial x_i \partial x_i'}$.
% We can define the Langevin Stein operator $T_p[f](x) = f(x) \nabla_x \log p(x)+\nabla_x f(x)$ where $\E_p[T_p[f](x)] = 0$ for sufficiently regular $f$.


The main advantage of using a Stein kernel is that the mean embedding $\mu(x') = \int_{\calX} k_p(x, x')p(x)\,\mathrm{d}x = 0$ by construction.
However, this means our GP prior on $f$ encodes the belief that the function has mean zero.
To weaken this, we can add a constant $c \in \R$, i.e., $\tilde{k}_p(x, x') = k_p(x,x') + c$, so that the kernel mean embedding becomes $\mu(x') = c$.
The constant $c$ can then be treated as a kernel hyperparameter and estimated alongside all other parameters. 

% Stein reproducing kernels are also non-stationary, which implies prior beliefs that $f$ has different properties across different parts of $\calX$. Therefore, using a GP prior with Stein kernel as covariance function requires additional caution. Fortunately, our experiments  in \Cref{sec:experiments} and \Cref{appendix:experiments} do not exhibit a huge difference between Stein kernel and traditional kernels.

\end{itemize}

%%%%%%%%%%%%%%%%%%%%%%%%%%%

\subsection{Model and Hyperparameter Selection}\label{appendix:hyperparameter_selection}


We now discuss our approach for model and hyperparameter selection for CBQ and baseline methods. 

\paragraph{Conditional Bayesian quadrature} The hyperparameter selection for CBQ boils down to the choice of GP interpolation hyperparameters at stage 1 and the choice of GP regression hyperparameters at stage 2. To simplify this choice, we renormalise all our function values before performing GP regression and interpolation. This is done by first subtracting the empirical mean and then dividing by the empirical standard deviation. 
All of our experiments then use prior mean functions $m_\Theta$ and $m_\calX$ which are zero functions, a reasonable choice given the function was renormalised using the empirical mean. This choice is made for simplicity, and we might expect further improvements in accuracy if more information is available.


The choice of covariance functions $k_\calX$ and $k_\Theta$ is made on a case-by-case basis in order both to encode properties we expect the target functions to have and to ensure that the corresponding kernel mean is available in closed form (as per the previous section). Once this is done, we typically still need to make a choice of hyperparameters for both kernels: lengthscales $l_\calX$, $l_\Theta$ and amplitudes $A_\calX, A_\Theta$.
We also need to select the regularisers $\lambda_\calX, \lambda_\Theta$.
$\lambda_\calX$ is fixed to be $0$ as suggested by \Cref{thm:convergence}.
The rest of the hyperparameters are selected through empirical Bayes, which consists of maximising the log-marginal likelihood.
For stage 1, the log-marginal likelihood can be written as~\citep{GPML}:
%
\begin{align*}
& L(l_\calX, A_\calX) =  -\frac{1}{2} \log \left| k_{\calX}(x_{1:N},x_{1:N}; l_\calX, A_\calX) + \lambda_\calX \Id_N \right| - \frac{N}{2} \log(2 \pi) \\
& \quad -\frac{1}{2}(f(x_{1:N})-m_{\calX}(x_{1:N}))^\top \left(k_{\calX}(x_{1:N},x_{1:N};l_\calX, A_\calX) + \lambda_\calX \Id_N \right)^{-1} (f(x_{1:N})-m_{\calX}(x_{1:N})),
\end{align*}
where $|\cdot|$ denotes the determinant of the matrix, and we write $k_{\calX}(x_{1:N},x_{1:N}; l_\calX, A_\calX)$ to emphasise the hyperparameters used to compute the Gram matrix.
The optimisation is implemented through a grid search over $\left[1.0, 10.0, 100.0, 1000.0 \right]$ for the amplitude $A_\calX$ and a grid search over $\left[0.1, 0.3, 1.0, 3.0, 10.0 \right]$ for the lengthscale $l_\calX$. 

If $k_\calX$ is a Stein reproducing kernel, we have an extra hyperparameter $c_\calX$. 
In this case, we use stochastic gradient descent on the log-marginal likelihood to find the optimal values for $c_\calX, l_\calX, A_\calX$, which is implemented with the \texttt{JAX} autodiff library~\citep{jax2018github}.
The reason we use gradient-based optimisation instead of grid search for the Stein kernel is that the Stein kernel requires an accurate estimate of $c_\calX$ to work well.
In order to return accurate results, grid search would require a finer grid, which is very expensive, while gradient-based methods require a good initialisation to avoid getting stuck in local minima. Fortunately, since $c_\calX$ indicates the mean of functions in the RKHS, we know that $c_\calX = 0$ is a good initialisation point since we have subtracted the empirical mean when normalising.

Additionally, it is important to note that we could technically use $T$ different kernels $k_\calX^1, \cdots, k_\calX^T$ for each integral in stage 1. However, the hyperparameters of each kernel $k_\calX^t$ would need to be selected using empirical Bayes under the observations $x_{1:N}^t$, which means we would need to repeat the above optimization $T$ times. In practice, when performing initial experiments, we observed that the estimated hyperparameters were very similar. Our strategy is therefore to select the hyperparameters of $k_\calX^1$ and subsequently reuse them across all $T$ integrals in stage 1. This is done for computational reasons, and we expect CBQ to show better performances if hyperparameters are optimised separately.


For the kernel $k_\Theta$, we also select the hyperparameters by maximising the log-marginal likelihood: 
\begin{align*}
   & L(l_\Theta, A_\Theta) =  -\frac{1}{2} \log \left| k_{\Theta}(\theta_{1:T},\theta_{1:T}; l_\Theta, A_\Theta) + \left( \lambda_\Theta + \sigma_{\mathrm{BQ}}^2(\theta_{1:T}) \right) \Id_T \right| - \frac{T}{2} \log(2 \pi)
     \\
& \quad  -\frac{1}{2} (\hat{I}_\mathrm{BQ} (\theta_{1:T})- m_{\Theta}(\theta_{1:T}))^\top \left(k_{\Theta}(\theta_{1:T}, \theta_{1:T};l_\Theta, A_\Theta) + \left( \lambda_\Theta + \sigma_{\mathrm{BQ}}^2(\theta_{1:T}) \right) \Id_T \right)^{-1} (\hat{I}_\mathrm{BQ} (\theta_{1:T})- m_{\Theta}(\theta_{1:T})).
\end{align*}
As above, we perform a grid search over $\left[1.0, 10.0, 100.0, 1000.0 \right]$ for the amplitude $A_\Theta$, a grid search over $\left[0.1, 0.3, 1.0, 3.0, 10.0 \right]$ for the lengthscale $l_\Theta$, and a grid search over $\left[0.01, 0.1, 1.0 \right]$ for $\lambda_\Theta$, and select the combination of values that gives the largest log-marginal likelihood.

\paragraph{Least-squares Monte Carlo} LSMC implements Monte Carlo in the first stage and polynomial regression in the second stage. In the second stage, the hyperparameters include the regularisation coefficient $\lambda_\Theta$ and the order of the polynomial $p \in \{1,2,3,4\}$.
These hyperparameters are also selected with grid search to give the lowest RMSE on a separate held-out validation set.

\paragraph{Kernel least-squares Monte Carlo}
KLSMC implements Monte Carlo in the first stage and kernel ridge regression in the second stage. In the second stage, the hyperparameters are analogous to the hyperparameters in the second stage of CBQ, namely $A_\Theta, l_\Theta, \lambda_\Theta$.
These hyperparameters are selected with grid search to give the lowest RMSE on a separate held-out validation set.

\paragraph{Importance sampling} For IS, there are no hyperparameters to select. 


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\section{Additional Experiments}\label{appendix:experiments}
We now provide a detailed description of all experiments in the main text, as well as further results and ablation studies. All figures reported in the paper are created using the median values obtained from $20$ separate runs with different random seeds. Standard error is shown as a shaded area around the median. 


\subsection{Synthetic Experiment: Bayesian Sensitivity Analysis for Linear Models}\label{appendix:bayes_sensitivity}

\subsubsection{Experimental Setting}

\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=250pt]{figures/legend.pdf}
    \vspace{-5pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_10.pdf}
        \caption{RMSE with fixed $N=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_50.pdf}
        \caption{RMSE with fixed $N=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_100.pdf}
        \caption{RMSE with fixed $N=100$.}
    \end{subfigure}
        \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_10.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_50.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_100.pdf}
        \caption{RMSE with fixed $T=100$.}
    \end{subfigure}
    \caption{\emph{Bayesian sensitivity analysis for linear models}. \textbf{First Row:} Dimension $d=2$ with fixed $N=10, 50, 100$ and increasing $T$. 
    \textbf{Second Row:} Dimension $d=2$ with fixed $T=10, 50, 100$ and increasing $N$. The integrand is $f(x) = x^\top x$.} 
    \label{appfig:bayes_sensitivity_1}
\end{figure}


In this synthetic experiment, we do sensitivity analysis on the hyperparameters in Bayesian linear regression. 
The observational data for the linear regression are $Y \in \R^{m \times d}, Z \in \R^{m}$ with $m$ being the number of observations and $d$ being the dimension.
We use $x$ to denote the regression weight; this is unusual but is done so as to keep the notation consistent with the main text.
By placing a $\calN(x ; 0, \theta \Id_d)$ prior 
on the regression weights $x \in \R^{d}$ with $\theta \in \left( 1, 3\right)^d$, and assuming independent $\calN(0, \eta^{-1})$ observation noise for some known noise precision $\eta>0$, we can obtain (via conjugacy) a multivariate Gaussian posterior $\Pb_\theta$ whose mean and covariance have closed-form expressions~\citep{bishop:2006:PRML}:
%
\begin{align*}
    \Pb_\theta = \calN(\tilde{m}, \tilde{\Sigma}), \quad \tilde{\Sigma}^{-1} = {\frac{1}{\theta} \Id_d} + \eta Y^\top Y, \quad \tilde{m} = \eta \tilde{\Sigma} Y^\top Z.
\end{align*}
We can then analyse sensitivity by computing the conditional expectation $I(\theta)=\int_\calX f(x)\Pb_\theta(dx)$ of some quantity of interest $f$.
For example, if  $f(x)=x^\top x$, then $I(\theta)$ is the second moment of the posterior and the results are already reported in the main text.
If $f(x) = x^\top y^\ast$ for some new observation $y^\ast$, then $I(\theta)$ is the predictive mean. 
In these simple settings, $I(\theta)$ can be
computed analytically, making this a good synthetic example for benchmarking.
We sample parameter values $\theta_{1:T}$ from a uniform distribution $ \Qb = \operatorname{Unif}(\Theta)$ where $\Theta = (1, 3)^d$, and for each such parameter $\theta_t$, we obtain $N$ observations $x_{1:N}^t$ from $\Pb_{\theta_t}$.  
In total, we have $N \times T$ samples.

For conditional Bayesian quadrature (CBQ), we need to carefully choose two kernels $k_\Theta$ and $k_\calX$. Firstly, we choose the kernel $k_\calX$ to be an isotropic Gaussian kernel, $k_\calX(x, x') = A_\calX \exp(-\frac{1}{2 l_\calX^2} (x - x')^\top(x - x'))$, because its kernel mean embedding has a closed form under the Gaussian posterior $\Pb_\theta$:
\begin{align}\label{appeq:E14}
    \mu_{\theta}(x) = A_\calX {\left| \Id_d + l_\calX^{-2} \tilde{\Sigma} \right|}^{-1/2} \exp \left(-\frac{1}{2} (x - \tilde{m})^\top (\tilde{\Sigma} + l_\calX^2 \Id_d)^{-1} (x - \tilde{m})\right).
\end{align}
In addition, the integral of the kernel mean embedding $\mu_\theta$ (known as the initial error) also has a closed form:
$\int_{\calX} \mu_\theta(x) \Pb_\theta(dx) = A_\calX l_\calX^d / \sqrt{| l_\calX^2 \Id_d + 2 \tilde{\Sigma}|}$.


This leaves us with a choice for $k_\Theta$. 
In this synthetic setting, we know that $I(\theta)$ is infinitely differentiable, but we opt for a Mat\'ern-3/2 kernel $k_\Theta(\theta, \theta') = A_\Theta (1+\sqrt{3} |\theta - \theta'|/l_\Theta) \exp (-\sqrt{3} |\theta - \theta'|/l_\Theta)$ to encode more conservative prior information on the smoothness of $I(\theta)$.

\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=250pt]{figures/legend.pdf}
    \vspace{-5pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_10_g4.pdf}
        \caption{RMSE with fixed $N=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_50_g4.pdf}
        \caption{RMSE with fixed $N=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_N_100_g4.pdf}
        \caption{RMSE with fixed $N=100$.}
    \end{subfigure}
        \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_10_g4.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_50_g4.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/bayes_sensitivity_T_100_g4.pdf}
        \caption{RMSE with fixed $T=100$.}
    \end{subfigure}
    \caption{\emph{Bayesian sensitivity analysis for linear models}. \textbf{First Row:} Dimension $d=2$ with fixed $N=10, 50, 100$ and increasing $T$. 
    \textbf{Second Row:} Dimension $d=2$ with fixed $T=10, 50, 100$ and increasing $N$. The integrand is $f(x) = x^\top y^\ast$. } \label{appfig:bayes_sensitivity_2}
\end{figure}

\subsubsection{Assumptions from \Cref{thm:convergence}} 
We would like to check whether the assumptions made in \Cref{thm:convergence} hold in this experiment.
\begin{itemize}
    \item A1: 
    Although $\calX=\R^d$ is not a compact domain, $\Pb_{\theta}$ is a Gaussian distribution so the probability mass outside a large compact subset of $\calX$ decays exponentially. $\Theta = \left( 1, 3 \right)^d$ is a compact domain. A1 is therefore approximately satisfied.
    \item A2: A2 is satisfied due to the sampling mechanism of $\theta_{1:T}$ and $\{x_{1:N}^t\}_{t=1}^T$.
    \item A3: $\Qb$ is a uniform distribution so its density $q$ is constant and hence upper bounded and strictly positive. $\Pb_\theta$ is a Gaussian distribution so its density $p_\theta$ is strictly positive on a compact and large domain with finite second moment. A3 is approximately satisfied.
    \item A4: Both $f(x)$ and $I(\theta)$ are infinitely times differentiable, so $s_I=s_f = \infty$. 
    Although $k_\calX$ is a Gaussian kernel, which does not satisfy the assumption of \Cref{thm:convergence}, we provide an ablation study in \Cref{appendix:ablation} showing similar performance when $k_\calX$ is a Mat\'ern-3/2 kernel so $s_\calX = \frac{3}{2} + \frac{d}{2}$, and $k_\Theta$ is a Mat\'ern-3/2 kernel so $s_\Theta = \frac{3}{2} + \frac{d}{2}$, where $d$ is the dimension. A4 is satisfied.
    \item A5: $\lambda_\calX$ is picked to be $0$ and $\lambda_\Theta$ is found via grid search among $\{0.01, 0.1, 1.0\}$. A5 is satisfied.
\end{itemize}
 
\subsubsection{Additional Experimental Results}
We now provide additional experimental results for Bayesian sensitivity analysis in linear models. 
\Cref{appfig:bayes_sensitivity_1} provides the result when the integrand is chosen to be $f(x)=x^\top x$ so $I(\theta)$ represents the posterior second moment, and \Cref{appfig:bayes_sensitivity_2} provides the result when the integrand is chosen to be $f(x)=x^\top y^\ast$ so $I(\theta)$ represents the predictive mean.
% In the first row of \Cref{appfig:bayes_sensitivity_1}, we fix $N=10, 50, 100$ showing the performance of RMSE with increasing $T$. 
% In the second row of \Cref{appfig:bayes_sensitivity_1}, we fix $T=10, 50, 100$ showing the performance of RMSE with increasing $N$. In the first row of \Cref{appfig:bayes_sensitivity_2}, we fix $N=10, 50, 100$ showing the performance of RMSE with increasing $T$. 
% In the second row of \Cref{appfig:bayes_sensitivity_2}, we fix $T=10, 50, 100$ showing the performance of RMSE with increasing $N$. 
We can see that CBQ demonstrates consistently smaller RMSE for both tasks under the same number of samples and a faster convergence rate compared to all other baseline methods. The conclusions that we draw from the main text also hold for different values of $N$ and $T$.
By comparing the performance of CBQ and KLSMC, where the second stage of both methods is identical and the main difference lies in the first stage, we believe that CBQ shows better performance mainly due to using Bayesian quadrature instead of Monte Carlo in the first stage.
Also by comparing the first and second row in both \Cref{appfig:bayes_sensitivity_1} and \Cref{appfig:bayes_sensitivity_2}, we can confirm the theory we proved in \Cref{appendix:convergence_rate} that CBQ has a faster convergence rate in $N$ than in $T$. 

In general, CBQ is more computationally expensive than baselines (KLSMC, LSMC and IS), so in this simple setting it is more efficient to spend more budget on obtaining more samples. 
Nonetheless, in scenarios where the expense of sample collection constitutes a significant fraction of the computational budget, or when the evaluation of the integrand proves to be highly costly, it becomes more cost-effective to spend a larger share of the budget towards CBQ. For example, sampling can become expensive easily when the prior and likelihood are not conjugate, so Markov chain Monte Carlo methods are needed to sample from unnormalized posterior. 
Also, we show in \Cref{appendix:sir} a real-world example where sampling is particularly costly and hence using CBQ is overall more efficient.

\subsection{Bayesian Sensitivity Analysis for Susceptible-Infectious-Recovered (SIR) Model }\label{appendix:sir}
\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=380pt]{figures/legend_finance.pdf}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/SIR_15.pdf}
        \caption{RMSE with fixed $T=15$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/SIR_25.pdf}
        \caption{RMSE with fixed $T=25$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/SIR_35.pdf}
        \caption{RMSE with fixed $T=35$.}
    \end{subfigure}
    \caption{\emph{Bayesian sensitivity analysis for SIR model.} $T=15, 25, 35$ and increasing $N$.}\label{appfig:sir}
\end{figure}
\subsubsection{Experimental Setting}
The SIR model is commonly used to simulate the dynamics of infectious diseases through a population~\cite{kermack1927sir}. 
It divides the population into three sections.
Susceptibles (S) represents people who are not infected but can be infected after coming into contact with an infectious individual.
Infectious (I) represents people who are currently infected and can infect susceptible individuals.
Recovered (R) represents individuals who have been infected and then removed from the disease, either by recovering or dying. The dynamics are governed by a system of ordinary differential equations (ODE) as below.
%
\begin{align*}
\frac{\mathrm{d} S}{\mathrm{d} r} = -x S I, \quad
\frac{\mathrm{d} I}{\mathrm{d} r} = x S I-\gamma I, \quad
\frac{\mathrm{d} R}{\mathrm{d} r} = \gamma I,
\end{align*}
%
with $x$ being the infection rate, $\gamma$ being the recovery rate and $r$ being the time. The solution to the SIR model is a vector $\left(N_I^r, N_S^r, N_R^r \right)$ representing the numbers of infectious, susceptible and recovered individuals at day $r$.

In this experiment, we assume that the recovery rate $\gamma$ is fixed and we place a Gamma prior distribution on $x$; i.e. $\Pb_\theta = \operatorname{Gamma}(\theta, \xi)$ where $\theta$ represents the initial belief of the infection rate deduced from the study of the virus in the laboratory at the beginning of the outbreak, and $\xi$ represents the amount of uncertainty on the initial belief. 
We fix the parameter $\xi=10$, the total population is set to be $10^6$ and the recovery rate $\gamma = 0.05$. 
The target of interest is the expected peak number of infected individuals under the prior distribution on $x$: 
\begin{align*}
    I(\theta) = \E_{x}\left[\max_r N_I^r(x) \mid \theta \right] = \int_{\calX} \max_r N^r_I(x) \Pb_\theta(dx)
\end{align*}
with the integrand $f(x) = \max_r N_I^r(x)$. We are interested in the sensitivity analysis of the shape parameter $\theta$ to the final estimate of the expected peak number of infected individuals.
The initial beliefs of the infection rate $\theta_{1:T}$ are sampled from the uniform distribution $\Qb = \operatorname{Unif}\left(2,9\right)$ and then $N$ samples $x^t_{1:N}$ are drawn from $\Pb_{\theta_t} = \operatorname{Gamma}(\theta_t, \xi)$. 
In this setting, sampling $x$ is very expensive as it necessarily involves solving the system of SIR ODEs, which can be very slow as the discretization step gets finer.
In the middle panel of \Cref{fig:finance_sir}, we have shown that obtaining one sample from SIR ODEs under discretization time step $\tau = 0.1$ takes around $3.0$s, whereas running the whole CBQ algorithm takes $1.0$s, not to mention that sampling from SIR ODEs needs to be repeated $N \times T$ times. Therefore, using CBQ is ultimately more efficient overall within the same period of time.

For CBQ, we need to carefully choose two kernels $k_\Theta$ and $k_\calX$.
First we choose $k_\calX$: we use a Mat\'ern-3/2 kernel as the base kernel and then apply a Langevin Stein operator to both arguments of the base kernel to obtain $k_\calX$. 
The reason we use a Langevin Stein kernel is that a Stein kernel gives an RKHS which is a subset of the Sobolev space with one order less smoothness than the base kernel, and since the smoothness of the integrand $f(x) = \max_r N_I^r(x)$ is unknown, using a Stein kernel enforces weaker prior information than Mat\'ern-3/2.
Furthermore, the kernel mean embedding of a Stein kernel $\mu(x)$ is a constant $c$ by construction as per the discussion in \Cref{appendix:practical_considerations}. 
The initial error is also a constant $c$ by construction.
Then we choose $k_\Theta$. Since $I(\theta)$ represents the expected peak number of infections, $I(\theta)$ is expected to be smooth and continuous, and hence we choose $k_\Theta$ as a Mat\'ern-3/2 kernel. 
All hyperparameters in $k_\calX$ and $k_\Theta$ are selected according to \Cref{appendix:hyperparameter_selection}.
We use a MC estimator with $5000$ samples as the pseudo ground truth and evaluate the RMSE across all methods. 



\subsubsection{Assumptions from \Cref{thm:convergence}} 
We would like to check whether the assumptions made in \Cref{thm:convergence} hold in this experiment.
\begin{itemize}
    \item A1: Although $\calX=\R^+$ is not a compact domain, $\Pb_{\theta}$ is a Gamma distribution so the probability mass outside a large compact subset of $\calX$ around the origin decays exponentially. $\Theta = \left(2, 9 \right)$ is a compact domain. A1 is approximately satisfied.
    \item A2: A2 is satisfied due to the sampling mechanism of $\theta_{1:T}$ and $\{x_{1:N}^t\}_{t=1}^T$.
    \item A3: $\Qb$ is a uniform distribution so its density $q$ is constant and hence upper bounded and strictly positive. $\Pb_\theta$ is a Gamma distribution so its density $p_\theta$ is strictly positive within a large compact subset of $\calX$ and has finite second moment. A3 is approximately satisfied.
    \item A4: $f(x) = \max_r N_I^r(x)$ is the maximum number of infections so $f(x)$ is not necessarily smooth. $I(\theta)$ represents the peak number of infections with varying initial estimate of the infection rate, so $I(\theta)$ is smooth and continuous with $s_I \leq 1$. 
    $k_\calX$ is a Stein kernel with a Mat\'ern-3/2 kernel as the base, so the corresponding RKHS will have functions which are rough (i.e. of smoothness $1/2$) but is only a subset of a Sobolev space. In addition, $k_\Theta$ is a Mat\'ern-3/2 kernel so $s_\Theta = \frac{3}{2} + \frac{1}{2} = 2$. It is therefore unclear if A4 is satisfied.
    \item A5: $\lambda_\calX$ is picked to be $0$ and $\lambda_\Theta$ is found via grid search among $\{0.01, 0.1, 1.0\}$. A5 is satisfied.
\end{itemize}


\subsubsection{Additional Experimental Results}
We report more results in \Cref{appfig:sir} with fixed $T=15, 25, 35$ and increasing $N$, to showcase that CBQ consistently exhibits smaller RMSE than baseline methods. The conclusions that we draw from the main text also hold for different values of $N$ and $T$ for this experiment.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\subsection{Option Pricing in Mathematical Finance}\label{appendix:black_scholes}

\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=380pt]{figures/legend_finance.pdf}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/finance_T_10.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/finance_T_20.pdf}
        \caption{RMSE with fixed $T=20$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/finance_T_30.pdf}
        \caption{RMSE with fixed $T=30$.}
    \end{subfigure}
    \caption{\emph{Option pricing in mathematical finance.} $T=10, 20, 30$ and increasing $N$.}\label{appfig:finance}
\end{figure}

\subsubsection{Experimental Setting}
In this experiment, we consider specifically an asset whose price $S({\tau})$ at time $\tau$ follows the Black-Scholes formula $S(\tau) = S_0 \exp \left(\sigma W(\tau) - \sigma^2 \tau/2 \right)$ for $\tau \geq 0$, where $\sigma$ is the underlying volatility, $S_0$ is the initial price and $W$ is the standard Brownian motion.
The financial derivative we are interested in is a butterfly call option whose payoff at time $\tau$ can be expressed as $\psi(S({\tau}))=\max (S(\tau)-K_1, 0) + \max (S(\tau)-K_2, 0) - 2\max (S(\tau) - (K_1+K_2)/2, 0)$.

In addition to the expected payoff, insurance companies are interested in computing the expected loss of their portfolios if a shock would occur in the economy.
We follow the setting in \cite{alfonsi2021multilevel, alfonsi2022many} assuming that a shock occurs at time $\eta$, at which time the option price is $S(\eta)=\theta$, and this shock multiplies the option price by $1 + s$. The option price at maturity time $\zeta$ is denoted as $S(\zeta) = x$. The expected loss caused by the shock can be expressed as 
\begin{align*}
    \mathcal{L} = \E [\max (I(\theta), 0 )], \quad I(\theta) = \int_0^\infty \left( \psi(x)-\psi \left((1 + s) x \right) \right) \Pb_\theta(dx).
\end{align*}
So the integrand is $f(x) = \psi(x)-\psi((1+s)x)$.

Following the setting in \cite{alfonsi2021multilevel, alfonsi2022many}, we consider the initial price $S_0 = 100$, the volatility $\sigma = 0.3$, the strikes $K_1 = 50, K_2 = 150$, the option maturity $\zeta=2$ and the shock happens at $\eta=1$ with strength $s = 0.2$. 
The option prices $\theta_{1:T}$ at which the shock occurs are sampled from the log-normal distribution deduced from the Black-Scholes formula, $\theta_{1:T} \sim \Qb = \operatorname{Lognormal}( \log S_0 - \frac{\sigma^2}{2} \eta, \sigma^2 \eta)$. 
Then $x^t_{1:N}$ are sampled from another log normal distribution also deduced from the Black-Scholes formula $x^t_{1:N} \sim \Pb_{\theta_t} = \operatorname{Lognormal}( \log \theta_t - \frac{\sigma^2}{2} (\zeta - \eta), \sigma^2 (\zeta - \eta))$. 

For CBQ, we need to carefully choose two kernels $k_\calX$ and $k_\Theta$. First we choose the kernel $k_\calX$ to be a log-Gaussian kernel for the purpose that the log-Gaussian kernel mean embedding has a closed form under log-normal distribution $\Pb_\theta = \operatorname{Lognormal}(\bar{m}, \bar{\sigma}^2)$ with $\bar{m} = \log \theta - \frac{\sigma^2}{2}(\zeta - \eta)$ and  $\bar{\sigma}^2 = \sigma^2 (\zeta - \eta)$. 
The log Gaussian kernel is defined as $k_\calX(x, x') = A_\calX \exp(-\frac{1}{2 l_\calX^2} (\log x - \log x')^2)$
and the kernel mean embedding has the form
\begin{align*}
    \mu_{\theta}(x) = \frac{A_\calX}{\sqrt{1 + \frac{\bar{\sigma}^2}{l_\calX^2}}} \left. \exp \left(-\frac{\bar{m}^2 + (\log x)^2 }{2(\bar{\sigma}^2 + l_\calX^2)}\right) x^{\frac{\bar{m}}{\bar{\sigma}^2 + l_\calX^2}}  \right.
\end{align*}
The initial error, which is the integral of kernel mean $\mu_{\theta}(x)$ does not have a closed form expression, so we use the empirical average as an approximation. Then, we choose the kernel $k_\Theta$ to be a Mat\'ern-3/2 kernel.

For this experiment, we also implement CBQ with Langevin Stein reproducing kernel. We use Mat\'ern-3/2 as the base kernel and then apply the Langevin Stein operator to both arguments of the base kernel to obtain $k_\calX$. 
The reason we use a Stein kernel is that Stein kernels have an RKHS whose functions have one order less smoothness than the base kernel, and since the integrand has very low smoothness (due to the maximum function), we do not want to use an overly smooth kernel. 
The kernel mean embedding of a Stein kernel is a
constant $c$ by construction as per the discussion in \Cref{appendix:practical_considerations}.
The kernel $k_\Theta$ is selected as Mat\'ern-3/2 kernel.
All hyperparameters in $k_\calX$ and $k_\Theta$ for CBQ and hyperparameters for baseline methods are selected according to \Cref{appendix:hyperparameter_selection}.

\subsubsection{Assumptions from \Cref{thm:convergence}} 
We would like to check whether the assumptions made in \Cref{thm:convergence} hold in this experiment.
\begin{itemize}
    \item A1: Although $\calX=\R^+$ is not a compact domain, $\Pb_{\theta}$ is a lognormal distribution so the probability mass outside a large compact subset of $\calX$ decays super exponentially. A similar argument can be made for $\Theta$ as well. A1 is therefore approximately satisfied.
    \item A2: A2 is satisfied due to the sampling mechanism of $\theta_{1:T}$ and $\{x_{1:N}^t\}_{t=1}^T$.
    \item A3: $\Qb$ is a lognormal distribution so its density $q$ is upper bounded and strictly positive within a large compact subset of $\Theta$. $\Pb_\theta$ is also a lognormal distribution so its density $p_\theta$ is strictly positive within a large compact subset of $\calX$ and has finite second moment. A3 is approximately satisfied.
    \item A4: $f(x)$ is a combination of piecewise linear functions so $s_f = 1$ and $I(\theta)$ is infinitely differentiable so $s_I = \infty$. 
    When $k_\calX$ is a Stein kernel with a Mat\'ern-3/2 kernel as the base, the functions in the corresponding RKHS have smoothness $1/2$, whereas when $k_\calX$ is the log Gaussian kernel, the functions are infinitely differentiable. Neither of these choices satisfies the assumption, although the RKHS of the Stein kernel contains many (but not necessarily all) functions of smoothness $1/2$. $k_\Theta$ is a Mat\'ern-3/2 kernel so $s_\Theta = \frac{3}{2} + \frac{1}{2} = 2$. It is therefore unclear if A4 is satisfied.
    \item A5: $\lambda_\calX$ is picked to be $0$ and $\lambda_\Theta$ is found via grid search among $\{0.01, 0.1, 1.0\}$. A5 is satisfied.
\end{itemize}

\subsubsection{More Experimental Results}
We report more results in \Cref{appfig:finance} with fixed $T=10, 20, 30$ and increasing $N$, to showcase that CBQ consistently exhibits smaller RMSE than baseline methods. The conclusions that we draw from the main text also
hold for different values of $N$ and $T$ for this experiment.
The performance of CBQ is similar between $k_\calX$ being Stein kernel and $k_\calX$ being log Gaussian kernel. It would be interesting to further investigate the performance of CBQ in estimating the future price of other financial derivatives, and we leave it for future work.



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\subsection{Uncertainty Decision Making in Health Economics}\label{appendix:decision}
\subsubsection{Experimental Settings}
In the medical world, it is important to compare the cost and the relative advantages of conducting extra medical experiments. 
% In the area of oil and gas reservoir, an cost analysis is necessary before deciding whether to drill additional wells.
The expected value of partial perfect information (EVPPI) quantifies the expected gain from conducting extra experiments to obtain precise knowledge of some unknown variables \citep{brennan2007calculating}:
\begin{align*}
    \text{EVPPI} = \E \Bigl[\max_c I_c(\theta) \Bigr] - \max_c \E \Bigl[f_c(X, \theta) \Bigr], \text{   } I_c(\theta) = \int_{\calX} f_c(x, \theta) \Pb_\theta(dx)
\end{align*}
where $\mathcal{C}$ is a set of potential treatments, $c \in \mathcal{C}$, and $f_c$ measures the potential outcome of treatment $c$. Our method is applicable for estimating the conditional expectation $I_c(\theta)$ of the first term. 

We adopt the same experimental setup as delineated in \cite{Giles2019}, wherein $X$ and $\theta$ have a joint 19-dimensional Gaussian distribution, meaning that $\Pb_{\theta}$ is a Gaussian distribution. 
The specific meanings of all $X$ and $\theta$ are outlined in \Cref{tab:mytable}.
All these variables are independent except that $\theta_1, \theta_2, X_6, X_{14}$ are pairwise correlated with a correlation coefficient $0.6$.
The observations $\theta_{1:T}$ are sampled from the marginal Gaussian distribution $\Qb$ and then $N$ observations of $x^t_{1:N}$ are sampled from $\Pb_{\theta_t}$.

We are interested in a binary decision-making problem ($\calC = \{1, 2\}$) with $f_1(x, \theta)=10^4 (\theta_1 x_5 x_6 + x_7 x_8 x_{9})-(x_1 + x_2 x_3 x_4)$ and $f_2(x, \theta) = 10^4 (\theta_2 x_{13} x_{14} + x_{15} x_{16} x_{17})-(x_{10} + x_{11} x_{12} x_4)$. 
In computing EVPPI, we estimate $I_c(\theta)$ with CBQ and baselines, and then use standard MC for the rest of the expectations.
We draw $10^6$ samples from the joint distribution to generate a pseudo ground truth, and evaluate the RMSE across different methods. 
Note that IS is no longer applicable here because $f_c$ now depends on both $x$ and $\theta$, so we only compare CBQ against KLSMC and LSMC.

For CBQ, we need to carefully choose two kernels. First, we take $k_\calX$ to be a Mat\'ern-3/2 kernel to ensure that the kernel mean embedding under a Gaussian distribution $\Pb_\theta = \calN(\tilde{m}, \tilde{\Sigma})$ has a closed form if we use the ``inverse transform trick'' as outlined in \Cref{appendix:practical_considerations}. 
Specifically, we initially sample $u$ from $\calN(0, \Id_d)$, then calculate $x = \tilde{m} + L u$ where $L$ is the lower triangular matrix from the Cholesky decomposition $\tilde{\Sigma} = L L^\top$ of the covariance matrix. 
The integral now becomes
\begin{align}\label{appeq:transform}
    I_c(\theta) = \int_{\R^d} f_c(x, \theta)\calN(x; \tilde{m},\tilde{\Sigma}) dx = \int_{\R^d} f_c(\tilde{m} + L u, \theta) \calN(u; 0, \Id_d) du.
\end{align}
The closed-form expression of the kernel mean embedding for a Mat\'ern-3/2 kernel and an isotropic Gaussian can be found in Appendix S.3 of \cite{ming2021linked}.
Then we pick $k_\Theta$. 
We know there is a high chance that $I_c(\theta)$ is infinitely differentiable, but we opt for a Mat\'ern-3/2 kernel to encode more conservative prior information on the smoothness of $I_c(\theta)$ because we do not have a closed form of it.
All hyperparameters in $k_\calX$ and $k_\Theta$ are selected according to \Cref{appendix:hyperparameter_selection}.

\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=250pt]{figures/legend.pdf}
    \vspace{-10pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/decision_T_10_no_legend.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/decision_T_30_no_legend.pdf}
        \caption{RMSE with fixed $T=30$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{figures/decision_T_50_no_legend.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}
    \caption{\emph{Uncertainty decision making in health economics.} $T=10, 30, 50$ and increasing $N$. }\label{appfig:decision}
\end{figure}


\begin{table}[t]
\centering
\begin{tabular}{
>{\centering\arraybackslash}p{1.5cm}
>{\centering\arraybackslash}p{1cm}
>{\centering\arraybackslash}p{1cm}
>{\centering\arraybackslash}p{5cm}}
\toprule
Variables & Mean & Std & Meaning \\
\midrule
$X_1$ & 1000 & 1.0 & Cost of treatment \\
$X_2$ & 0.1 & 0.02 & Probability of admissions \\
$X_3$ & 5.2 & 1.0 & Days of hospital \\
$X_4$ & 400 & 200 & Cost per day \\
$X_5$ & 0.3 & 0.1 & Utility change if response \\
$X_6$ & 3.0 & 0.5 & Duration of response \\
$X_7$ & 0.25 & 0.1 & Probability of side effects \\
$X_8$ & -0.1 & 0.02 & Change in utility if side effect \\
$X_{9}$ & 0.5 & 0.2 & Duration of side effects \\
$X_{10}$ & 1500 & 1.0 & Cost of treatment \\
$X_{11}$ & 0.08 & 0.02 & Probability of admissions \\
$X_{12}$ & 6.1 & 1.0 & Days of hospital \\
$X_{13}$ & 0.3 & 0.05 & Utility change if response \\
$X_{14}$ & 3.0 & 1.0 & Duration of response \\
$X_{15}$ & 0.2 & 0.05 & Probability of side effects \\
$X_{16}$ & -0.1 & 0.02 & Change in utility if side effect \\
$X_{17}$ & 0.5 & 0.2 & Duration of side effects \\
$\theta_1$ & 0.7 & 0.1 & Probability of responding \\
$\theta_2$ & 0.8 & 0.1 & Probability of responding \\
\bottomrule

\end{tabular}
\vspace{5pt}
\caption{Variables in the health economics experiment.}
\label{tab:mytable}
\end{table}

\subsubsection{Assumptions from \Cref{thm:convergence}} 
We would like to check whether the assumptions made in \Cref{thm:convergence} hold in this experiment.
\begin{itemize}
    \item A1: 
    Although $\calX=\R$ is not a compact domain, $\Pb_{\theta}$ is a Gaussian distribution so the probability mass outside a large compact subset of $\calX$ decays exponentially. Similarly, $\Theta = \R$ is not a compact domain, but $\Qb$ is a Gaussian distribution so the probability mass outside a large compact subset of $\Theta$ decays exponentially. A1 is approximately satisfied.
    \item A2: A2 is satisfied due to the sampling mechanism of $\theta_{1:T}$ and $\{x_{1:N}^t\}_{t=1}^T$.
    \item A3: $\Qb$ is also a Gaussian distribution so its density $q$ is upper bounded and strictly positive on a compact and large domain. $\Pb_\theta$ is a Gaussian distribution so its density $p_\theta$ is strictly positive on a compact and large domain with finite second moment. A3 is approximately satisfied.
    \item A4: Both the integrand $f$ and the conditional expectation $I_c(\theta)$ are infinitely differentiable, so $s_f = s_I = \infty$. On the other hand, due to the choice of Mat\'ern-3/2 kernels, $s_{\Theta}=3/2+1/2=2$ and $s_{\calX}=3/2+9/2=6$. A4 is therefore satisfied.
    \item A5: $\lambda_\calX$ is picked to be $0$ and $\lambda_\Theta$ is found via grid search among $\{0.01, 0.1, 1.0\}$. A5 is satisfied.
\end{itemize}

\subsubsection{Additional Experimental Results}
We report more results in \Cref{appfig:decision} with fixed $T = 10, 30, 50$ and increasing $N$, to showcase that CBQ consistently exhibits smaller RMSE than baseline methods.
The conclusions that we draw from the main text also hold for different values of $N$ and $T$ for this experiment.


\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=350pt]{figures/mobq_legend.pdf}
    \end{minipage}
    \vspace{-10pt}
    
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_sensitivity_nystrom.pdf}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_finance_nystrom.pdf}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_decision_T_50.pdf}
    \end{subfigure}
    \\
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_bayes_sensitivity_time_T_50.pdf}
        \caption{Bayesian sensitivity analysis for linear models.}
        \label{appfig:mobq_bayes_sensitivity}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_finance_time_T_20.pdf}
        \caption{Option pricing in mathematical finance.}
        \label{appfig:mobq_finance}
    \end{subfigure}
    \hfill
        \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/mobq_decision_time_T_50.pdf}
        \caption{Uncertainty decision making in health economics.}
        \label{appfig:mobq_decision}
    \end{subfigure}
    \hfill
    \caption{Comparison of CBQ and MOBQ in terms of RMSE (first row) and computational time (second row). \textbf{Left (a):} Bayesian sensitivity analysis for linear models. \textbf{Middle (b):} Option pricing in mathematical finance. \textbf{Right (c):} Uncertainty decision making in health economics.}
    \label{appfig:mobq}
\end{figure}

\subsection{Comparison of Conditional Bayesian Quadrature and Multi-Output Bayesian Quadrature}\label{appendix:cbq_mobq}

In Section \ref{sec:cbq} in the main text, we mentioned a comparison of CBQ and multi-output Bayesian quadrature~\cite{xi2018bayesian} (MOBQ) in terms of their computational complexity. 
For $T$ parameter values $\theta_1, \cdots, \theta_T$ and $N$ samples from each probability distribution $\mathbb{P}_{\theta_1}, \ldots, \mathbb{P}_{\theta_T}$, the computational cost is $\calO(TN^3 + T^3)$ for CBQ and $\calO(N^3T^3)$ for MOBQ. 
We now give a more thorough comparison of CBQ and MOBQ in this section. 
% MOBQ is expected to have lower RMSE than CBQ, but the computational cost will get unbearably costly as $N$ or $T$ grows. 

When the integrand $f$ only depends on $x$ (Bayesian sensitivity analysis for linear models, option pricing in mathematical finance), MOBQ only requires one kernel $k_\calX$. 
\begin{align*}
    I_{\mathrm{MOBQ}}(\theta^\ast) = \left(\int_\calX k_\calX(x, x_{1:NT}) \Pb_{\theta^\ast}(dx) \right) \Big(k_\calX(x_{1:NT}, x_{1:NT}) + \lambda_\calX \Id_{NT} \Big)^{-1} f(x_{1:NT})
\end{align*}
where $x_{1:NT} \in \R^{NT}$ is a concatenation of $x_{1:N}^1, \cdots, x_{1:N}^T$.
When the integrand $f$ depends on both $x$ and $\theta$ (uncertainty decision making in health economics), MOBQ requires two kernels $k_\calX$ and $k_\Theta$.
\begin{align*}
\begin{aligned}
    I_{\mathrm{MOBQ}}(\theta^\ast) &= \Big( \int_\calX k_\calX(x, x_{1:NT})    \odot k_\Theta(\theta^\ast, \theta_{1:NT}) \Pb_{\theta^\ast}(dx)  \Big) \\ &
    \Big(k_\calX(x_{1:NT}, x_{1:NT}) \odot k_\Theta(\theta_{1:NT}, \theta_{1:NT})  + \lambda_\calX \Id_{NT} \Big)^{-1} f(x_{1:NT})
\end{aligned}
\end{align*}
where $\odot$ denotes element-wise product, and $\theta_{1:NT} = \left[\theta_1, \cdots, \theta_1, \cdots, \theta_T, \cdots, \theta_T \right] \in \R^{NT}$.
From the above two equations, we can see that the computational cost of $\calO(N^3T^3)$ mainly comes from the inversion of an $NT \times NT$ kernel matrix.
All the MOBQ hyperparameters in $k_\calX$ and $k_\Theta$ are selected by empirical Bayes in the same way as CBQ outlined in \Cref{appendix:hyperparameter_selection}.
It is crucial to note that the MOBQ computational cost is significantly higher for the Stein reproducing kernel during hyperparameter selection (an approach analogous to the ``vector-valued control variates'' of \cite{Sun2021}), as evaluating the log marginal likelihood at every iteration would require the inversion of an $NT \times NT$ matrix.
Therefore, we do not include the experiment of Bayesian sensitivity analysis for the SIR model in this section.
All the hyperparameters for CBQ are reused as in \Cref{appendix:experiments}.

For Bayesian sensitivity analysis in linear models, the integrand is $f(x) = x^\top x$, the dimension is fixed $d=2$ and $T=50$.
In \Cref{appfig:mobq_bayes_sensitivity}, we can see that MOBQ indeed achieves lower RMSE at the beginning, but CBQ catches up as $N$ grows.
For option pricing in mathematical finance, we only compare MOBQ and CBQ when $k_\calX$ is the log Gaussian kernel and $T=20$.
For uncertainty decision making in health economics, we compare MOBQ and CBQ when $T=50$.
In \Cref{appfig:mobq_finance} and \Cref{appfig:mobq_decision}, we can see that CBQ and MOBQ achieve similar performance in terms of RMSE.
Additionally, in the second row of \Cref{appfig:mobq}, we compare the computational cost of MOBQ and CBQ, where we can see that the computational time of MOBQ is much larger than CBQ as $N$ grows across all settings, due to the complexity of $\calO(N^3T^3)$ for MOBQ. 


Additionally, as the main computational bottleneck of MOBQ is the inversion of the kernel matrix, it would be interesting to see if MOBQ combined with scalable GP methods can reduce the computational time while still preserving the same level of accuracy. 
The scalable approximation method used here is Nyström approximation~\cite{williams2000using}.
We report the performance of MOBQ (Nyström) in both \Cref{appfig:mobq_bayes_sensitivity} and \Cref{appfig:mobq_finance}, and we can see that MOBQ (Nyström) performs worse than CBQ in terms of RMSE.
The reason for the worse performance of MOBQ (Nyström) is that the use of scalable GP methods introduces an extra layer of approximation that slows down the convergence rate.
Additionally, most scalable GP methods are used in the ``regression'' setting, while quadrature methods like BQ or CBQ belong to the ``interpolation'' setting~\cite{kanagawa2018gaussian}, so the quadrature problem will be more sensitive to the approximation error introduced. 





\subsection{Quasi Monte Carlo}\label{appendix:QMC}


Quasi Monte Carlo (QMC) is another line of research on improving the precision of approximating intractable integrals. 
While quadrature methods like BQ and CBQ aim at finding a smart way to combine the function values, QMC aims to find samples that can more uniformly cover the integration domain than random sampling~\citep{niu2023discrepancy, hickernell1998generalized, gerber2015sequential}. 
In the development of CBQ, we do not make any assumptions about the sampling of observations; specifically, we do not mandate i.i.d.\ sampling. 
Therefore, it would be interesting to see whether combining quadrature algorithms with QMC could further improve the accuracy for estimating conditional expectation.

For a fair comparison in the experiment of Bayesian sensitivity analysis for linear models, we implement QMC sampling for all methods including CBQ and baseline methods. 
The samples $x_{1:N}^t$ are generated from a Sobol sequence which is a low-discrepancy sequence commonly used in QMC to cover the multidimensional space more uniformly than random sequences.
% We are not using QMC to sample $\theta_{1:T}$ because i.i.d. sampling is required in the second stage for CBQ, KLSMC and LSMC. \hudson{I am not sure in this.}
We follow the technique introduced in randomized QMC~\cite{lemieux2004randomized} to shift the Sobol sequence by a random amount.

It can be observed in \Cref{appfig:qmc} that replacing random sampling with QMC significantly enhances the performance of baseline methods, such as LSMC and KLSMC, while only subtly improving the performance of CBQ. The limited degree of improvement seen in CBQ with QMC sampling can be attributed to the fact that CBQ already yields a remarkably low RMSE. Consequently, the margin of improvement offered by QMC sampling is not as evident in CBQ as in the baseline methods. We have only studied the effect of combining QMC and CBQ in the experiment of Bayesian sensitivity analysis in linear models. It would be interesting to see if combining QMC and CBQ would result in higher accuracy in other settings, and we leave it for future work. 

\begin{figure}[t]
    \begin{minipage}{1.0\textwidth}
    \includegraphics[width=250pt]{figures/legend_qmc.pdf}
    \end{minipage}
    
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/bayes_sensitivity_qmc.pdf}
        \caption{Quasi Monte Carlo}
        \label{appfig:qmc}
    \end{subfigure}
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/ablation_kernel_x.pdf}
        \caption{Ablation on kernel $k_\Theta$}
        \label{appfig:ablation_theta}
    \end{subfigure}
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/ablation_kernel_y.pdf}
        \caption{Ablation on kernel $k_\calX$}
        \label{appfig:ablation_x}
    \end{subfigure}
    \caption{\textbf{Left:} Comparison of all methods with standard i.i.d. sampling and Quasi-Monte Carlo samples. \textbf{Middle and Right:} Ablation study for CBQ with different $k_\Theta$ and $k_\calX$ kernels in Bayesian sensitivity analysis for linear models.}
\end{figure}


\subsection{Ablations on kernels}\label{appendix:ablation}

We present an ablation study evaluating the impact of distinct kernel choices $k_\calX$ and $k_\Theta$ within the framework of Bayesian sensitivity analysis in linear models. The integrand is $f(x)=x^\top x$, the dimension $d=2$ and $N=T=50$. 
First, we choose $k_\Theta$ to be either a Mat\'ern-3/2 kernel or a Gaussian kernel. \Cref{appfig:ablation_theta} shows that the performance of CBQ remains consistent across different $k_\Theta$ kernels. 

Subsequently, we opt for a Mat\'ern-3/2 kernel, a Gaussian kernel and a Stein kernel (with Mat\'ern-3/2 as the base kernel) as choices for $k_\calX$. When $k_\calX$ is a Gaussian kernel, the formula for the kernel mean embedding $\mu_{\theta}(x)$ is presented in \Cref{appeq:E14}. When $k_\calX$ is a Mat\'ern-3/2 kernel, a closed form expression for the kernel mean embedding does not exist for the non-isotropic Gaussian distribution $\calN(\tilde{m}, \tilde{\Sigma})$, but the `inverse transform trick' can be employed as in \Cref{appeq:transform}. 
When $k_\calX$ is Stein kernel, we choose Mat\'ern-3/2 as the base kernel and then apply Stein operator on both arguments of kernel $k_0$. All hyperparameters are selected according to \Cref{appendix:hyperparameter_selection}.
From \Cref{appfig:ablation_x}, we can see that CBQ performs best when $k_\calX$ is a Mat\'ern-3/2 kernel, and we know that $k_\calX$ being a Mat\'ern-3/2 kernel satisfies the assumptions of \Cref{thm:convergence}. When $k_\calX$ is a Gaussian RBF kernel or a Stein kernel, whether the assumptions of \Cref{thm:convergence} still hold is unknown, but in this ablation study, CBQ under both kernels has shown good performance in terms of RMSE.
The ablation study is only implemented in this very simple setting, so we encourage practitioners to be careful in the selection of kernels in real world applications.


\subsection{Calibration}\label{appendix:calibration}
CBQ falls in the area of probabilistic numeric algorithms that can provide finite-sample Bayesian quantification of uncertainty, where the uncertainty arises from having access to only a finite number of function values of the integrand.
Since CBQ is a two-stage hierarchical Gaussian process method in nature, and the final estimate $I_{\textrm{CBQ}}$ is treated as Gaussian distributed, the variance $\sigma^2_{\textrm{CBQ}}$ is a measure of uncertainty~\cite{kendall2017uncertainties}.
The calibration plots in \Cref{appfig:calibration} are obtained by altering the width of the credible interval and then computing the percentage of times a credible interval contains the true value $I(\theta)$ under repetitions of the experiment.
The black diagonal line represents the ideal case, with any curve lying above the black line indicating underconfidence and any curve lying below indicating overconfidence.
It is generally regarded as preferable to be underconfident rather than overconfident. 

In \Cref{appfig:calibration_bayes_sensitivity}, we show the calibration of the CBQ posterior for the integrand $f(x)=x^\top x$ when dimension $d=2$. 
We observe that when the number of samples is as small as $10$, CBQ is overconfident, which can be explained by the poor performance of using empirical Bayes to select hyperparameters in the small sample regime. 
On the other hand, when $N$ and $T$ increase, CBQ becomes underconfident, meaning that our posterior variance is more inflated than needed from a frequentist viewpoint.
The calibration plots for other experiments are all demonstrated in \Cref{appfig:calibration}, and the conclusions are consistent across different experiments.


\begin{figure}[t]
\centering
    \begin{subfigure}{0.23\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/calibration_bayes.pdf}
        \caption{Calibration}
    \label{appfig:calibration_bayes_sensitivity}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/calibration_sir.pdf}
        \caption{Calibration}
        \label{appfig:calibration_sir}
    \end{subfigure}
    % \hfill
    \begin{subfigure}{0.23\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/calibration_finance.pdf}
        \caption{Calibration}
        \label{appfig:calibration_finance}
    \end{subfigure}
    \begin{subfigure}{0.23\textwidth}
        \centering
        \includegraphics[width=\linewidth]{figures/calibration_decision.pdf}
        \caption{Calibration}
        \label{appfig:calibration_decision}
    \end{subfigure}
    \caption{Calibration plots. \textbf{(a):}  Bayesian sensitivity analysis in linear models. \textbf{(b):} Bayesian sensitivity analysis for SIR model. \textbf{(c):} Option pricing in mathematical finance. \textbf{(d):} Uncertainty decision making in health economics.}
    \label{appfig:calibration}
\end{figure}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\end{appendices}