\begin{appendices}
\setcounter{equation}{0}
\renewcommand{\theequation}{\thesection.\arabic{equation}}

\onecolumn

% {\hrule height 1mm}
\vspace*{-28pt}
\section*{\LARGE\bfseries \centering Supplementary Material
}
\vspace{8pt}
{\hrule height 0.1mm}
% {\hrule height 0.3mm}
\vspace{24pt}

\section*{Table of Contents}
\vspace*{-10pt}
\startcontents[sections]
\printcontents[sections]{l}{1}{\setcounter{tocdepth}{2}}

\newpage

\section{Convergence rate}\label{appendix:convergence_rate}

Recall the CBQ estimator proposed in~\eqref{eq:CBQ_estimator},
%
\begin{equation*}
    \hat{I}_\mathrm{CBQ}(\theta) = k_\Theta(\theta, \theta_{1:T})^\top \big(k_\Theta(\theta_{1:T}, \theta_{1:T}) + (\gamma_T + \sigma^2_\mathrm{BQ}(\theta_{1:T})) \Id_T\big)^{-1} \hat{I}_\mathrm{BQ}(\theta_{1:T}),
\end{equation*}
%
where $\gamma_T>0$, and $\hat{I}_\mathrm{BQ}(\theta_t)$ and $\sigma^2_\mathrm{BQ}(\theta_t)$, for $t \in \{1, \dots, T\}$, are the BQ posterior mean and variance obtained in the first stage, as given in~\eqref{eq:BQ_mean_and_var},
%
\begin{align*}
    \hat{I}_\mathrm{BQ}(\theta_t) & = \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N}, \theta_t),\\
    \sigma^2_\mathrm{BQ}(\theta_t) &= \mathbb{E}_{X,X'\sim \mathbb{P}_\theta}[k_{\calX}(X,X')] - \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N}).
\end{align*}
%
It was pointed out by~\citet[Remark 2]{gogolashvili2023importance} (and can be seen through straightforward differentiation) that the estimator $\hat{I}_\mathrm{CBQ}(\theta)$ is the minimiser of the importance weighted kernel ridge regression loss over functions in the RKHS $\calH_\Theta$,
%
\begin{equation*}
    \hat{I}_\mathrm{CBQ}(\theta) = \argmin_{F \in \calH_\Theta} \Big\{ \sum_{t=1}^T \frac{\tau}{1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t)} \big(F(\theta_t) - \hat{I}_\mathrm{BQ}(\theta_t)\big)^2 + \tau \gamma_T^{-1}\| F \|_{\calH_\Theta}^2 \Big\},
\end{equation*}
%
for any $\tau>0$. Suppose $\theta_1, \dots, \theta_T$ were sampled from a probability measure $\Pb_\mathrm{tr}$ on $\Theta$. Then,
%
\begin{equation*}
    \Pb_\mathrm{te}(A) = \int_A w(\theta)\Pb_\mathrm{tr}(\mathrm{d} \theta)
\end{equation*}
%
defines a positive measure on $\Theta$ for any $w(\theta) > 0$ for which the integral exists~\citep[Proposition 232D]{fremlin2000measure}; further, if $w(\theta)$ is bounded, the measure is finite. Suppose we construct a $w(\theta)$ that satisfies these requirements, and is such that $w(\theta_t) = \tau(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$. Then, since $\E [\hat{I}_\mathrm{BQ}(\theta_t)] = I(\theta_t)$, this (TODO proper reference) loss can be considered an unbiased finite-sample approximation of
%
\begin{equation*}
    \int_\Theta ( F(\theta) - I(\theta) )^2 \Pb_\mathrm{te}(\mathrm{d} \theta) + \frac{1}{n} \| F \|^2_{\calH_\Theta}.
\end{equation*}
%
Under a further assumption that the problem is well-specified, meaning $I(\theta) \in \calH_\Theta$, an upper bound on the rate of convergence of $\hat{I}_\mathrm{CBQ}(\theta)$ to $I(\theta)$ as $n \to \infty$ was established in~\citet[Theorem 4]{gogolashvili2023importance}. Specifically, [TODO summarise once it's more clear.]

To apply the result, we define a $w(\theta)$ of convenient form that satisfies the requirements mentioned above, specifically $w(\theta) \in (0, A]$ for some $A<\infty$ and any $\theta \in \Theta$, and $w(\theta_t) = \tau(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$ for all $t \in \{0, \dots,T\}$.\footnote{The integrability requirement is specific to $\Pb_\mathrm{tr}$ and will be assumed at a later stage.} Take $t' = \argmin_{t \in \{0,\dots,T\}}\{\sigma^{-2}_\mathrm{BQ}(\theta_t)\}>0$, and define
%
\begin{equation}
\begin{split}
\label{eq:weight_function}
    w(\theta) = \begin{cases}
        \tau(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1} & \text{ if } \|\theta - \theta_t\|_\Theta  \geq \varepsilon \text{ for all } t \in \{0,\dots,T\} \\
        \tau A_t - \tau B_t \frac{\|\theta - \theta_t\|_\Theta }{\varepsilon}, & \text{ for } t \text{ such that } \|\theta - \theta_t\|_\Theta  < \varepsilon \\
    \end{cases}
\end{split}
\end{equation}
%
for
%
\begin{align*}
    A_t = (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}  \qquad \text{and} \qquad
    B_t = (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1} - (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}.
\end{align*}
%
For $\Theta \subset \R$, such $w(\theta)$ is easily visualised, as can be seen in~\Cref{fig:wtheta_illustration}.
\begin{figure}[H]
\centering \includegraphics[width=0.5\textwidth]{figures/wtheta_illustration.pdf}
\caption{Illustration of $w(\theta)$ for $\Theta \subset \R$}
\label{fig:wtheta_illustration}
\end{figure}
\textcolor{red}{TODO change figure. or remove}
It is easy to see that $w(\theta)$ is bounded above by $\tau \max_{t \in \{0,\dots,T\}} (1 + \gamma^{-1}_T \sigma^{2}_\mathrm{BQ}(\theta_{t}))^{-1} < \tau$, and below by $\tau(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}>0$ for any $\theta \in \Theta$, and $w(\theta_t) = \tau(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}$ as required.

\subsection{Technical Assumptions}

\begin{itemize}
    \item $I(\theta)$ lies in the Sobolev space $\calW^{2, s}(\Theta)$.
    \item $k_\Theta$ is a Mat\'ern kernel of order $\nu$ such that $s \geq \nu+p/2$.
    \item $\gamma_T = cT^\alpha$, for $c>0$ and $\alpha \in (0, 1)$.
    \item $|I(\theta)| \leq M$, for all $\theta \in \Theta$.
    \item $\theta_t$ were sampled i.i.d.\ from some $\Pb_\mathrm{tr}$.
    \item For the integral operator $L: \calL^2(\Theta, \Pb_\mathrm{tr}) \to \calH_\Theta$, it holds that $I(\theta)=L^r g$ for some $r \in [1/2, 1]$ and some $g \in \calL^2(\Theta, \Pb_\mathrm{tr})$. We denote $R_0=\| g \|_{\calL^2(\Theta, \Pb_\mathrm{tr})}$.
    \item $\Pb_\theta$ has a density $p_\theta$, and $p_\theta(x)>0$ for all $x \in \calX$.
    \item $k_\calX$ is a Mat\'ern kernel of order $\nu_0$.
    \item For any $\theta$, $f(x, \theta)$ lies in the Sobolev space $\calW^{2, s_0}(\calX)$, for $s_0 \geq \nu_0+d/2$. 
\end{itemize}

\subsection{Convergence}
\begin{lemma}[Assumption 2 in~\citet{gogolashvili2023importance}]
\label{lemma:assumption2}
    Under technical assumptions in TODO, $I(\theta)=L^r g$ for some $g \in \calL^2(\Theta, \Pb_\mathrm{te})$ of norm $R \leq \tau R_0$.
\end{lemma}
\begin{proof}
    We assumed that the statement holds for $\Pb_\mathrm{tr}$. By definition of $\Pb_\mathrm{te}$, for any $g'(\theta)$ it holds that $\int_\Theta g'(\theta) \Pb_\mathrm{te}(\mathrm{d} \theta) = \int_\Theta g'(\theta) w(\theta) \Pb_\mathrm{tr}(\mathrm{d} \theta)$. Since $w(\theta)$ is bounded above and below, $\calL^2(\Theta, \Pb_\mathrm{tr})$ is norm-equivalent to $\calL^2(\Theta, \Pb_\mathrm{te})$. Therefore, $g \in \calL^2(\Theta, \Pb_\mathrm{te})$ and the statement holds for $R\leq \tau R_0$ as $w(\theta) \leq \tau$ by construction.
\end{proof}

\begin{lemma}[Assumption 3 in~\citet{gogolashvili2023importance}]
\label{lemma:assumption3}
    Under technical assumptions in TODO, for $q=1$, $W= \tau$, and $\sigma^2= \| \Theta \| \tau$ it holds for all $m \in \N$, $m \geq 2$, that
    \begin{equation*}
        \left( \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^q \leq \frac{1}{2} m! W^{m-2} \sigma^2
    \end{equation*}
%
\end{lemma}
\begin{proof}
By definition of $w(\theta)$, 
%
\begin{equation}
    \int_\Theta w(\theta)^m \mathrm d \theta < \| \Theta \| \tau \max_t (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-m} < \| \Theta \| \tau,
\end{equation}
%
and the result follows.
\end{proof}

\begin{lemma}[Assumption 4 in~\citet{gogolashvili2023importance}]
\label{lemma:assumption4}
    Under technical assumptions in TODO, for $s'=p/(2\nu + p)$ it holds that
    \begin{equation*}
        E_{s'} = \max \left(1, \sup_{\lambda \in (0,1]} \sqrt{\sum_{i=1}^\infty \frac{\mu_i \lambda^{s'}}{\mu_i + \lambda}}  \right) < \infty.
    \end{equation*}
%
\end{lemma}
\begin{proof}
It is a standard result (see, for instance, \citet[Section 3.3.4]{edmunds1996function}) that for $k_\Theta$ being a Mat\'ern kernel of order $\nu$, the $i$-th eigenvalue decays at the rate of $i^{-\frac{2\nu + p}{p}}$. As pointed out in the discussion after Assumption 4 in~\citet{gogolashvili2023importance}, this implies that $E_{s'}< \infty$ holds for $s'=p/(2\nu + p)$.
\end{proof}

\begin{theorem}
    Suppose technical assumptions in TODO hold, $\alpha=1/(2r+1)$, and $T$ is large enough so that $c \tau T^{-(1-\alpha)} \leq 1$, and $c \geq 8 \tau^{-1/2} |\log (6/\delta)|(\|\Theta\| + 1)^{1/2}$. Then, with probability at least $1-\delta$,
    \begin{equation*}
        \| \hat I_\mathrm{CBQ}(\theta) - I(\theta) \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \leq C_0 (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'})) T^{-r/(2r+1)},
    \end{equation*}
% NOTE (draft scratch, kept out of the theorem statement): $\gamma_T^{-1} = T^{-\alpha}$, $\alpha>0$
%    \textcolor{red}{$\delta =  6\exp^{-1} \left[(16 (M + \|I\|_{\calH_\Theta}) (1+\tau^{-1/2} \|\Theta\|^{1/2}) )^{-1} c^{1/2}(j(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1} T^{r/(2r+1)} - c^r R_0 ) \right] $}
%
    for $C_0 = 16 (M + \|I\|_{\calH_\Theta}) (1+\tau^{-1/2} \|\Theta\|^{1/2}) c^{-1/2} \log(6/\delta) + c^r R_0 $.
\end{theorem}
\begin{proof}
    \textcolor{red}{TODO fix the $R$ situation}
    By~\citet[Theorem 4]{gogolashvili2023importance}, for $\lambda=\tau c T^{-(1-\alpha)}$ and the weight function $w(\theta)$ defined in~\eqref{eq:weight_function}, we have that
    \begin{equation}
    \label{eq:l2_in_pte}
        \| \hat I_\mathrm{CBQ}(\theta) - I(\theta) \|_{\calL^2(\Theta, \Pb_\mathrm{te})} \leq T^{-r\beta} \left( 16 (M + \|I\|_{\calH_\Theta}) (W+\sigma E_{s'}^{1-q}) c^{-A/2} \log(6/\delta) + c^r R \right)
    \end{equation}
%
    provided\footnote{We omit the definition of $E_s$ intentionally as, since $q=1$, it is always raised to the power of zero in this work.}
    \begin{equation}
    \label{eq:cond_on_c}
        \lambda=\tau c T^{-(1-\alpha)} \leq 1, \qquad
        \tau c \geq \left( 64 (W+\sigma^2) E_{s'}^{2(1-q)} \log^2(6/\delta) \right)^{1/(1+A)},
    \end{equation}
%
    for the constants $A = 1/({s'}(1-q)+q)$, $\beta=(2r+A)^{-1}$ and $W, \sigma^2, q, r, R$ specified in~\Cref{lemma:assumption2,lemma:assumption3,lemma:assumption4}, meaning $W= \tau$, $\sigma^2= \| \Theta \| \tau$, $q=1$, $r \in [1/2, 1]$, $R \leq \tau R_0$, $A=1$, $\beta=1/(2r+1)$.
    Then, the conditions on $c$ in~\eqref{eq:cond_on_c} become
    \begin{equation*}
        c \in [8\tau^{-1/2} \log(6/\delta) \left( 1+\| \Theta \| \right)^{1/2}, \tau^{-1} T^{1-\alpha}],
    \end{equation*}
%
    and the rate in~\eqref{eq:l2_in_pte} becomes
    \begin{equation*}
        \| \hat I_\mathrm{CBQ}(\theta) - I(\theta) \|_{\calL^2(\Theta, \Pb_\mathrm{te})} \leq T^{-\frac{r}{2r+1}} \left( 16 (M + \|I\|_{\calH_\Theta}) (\tau+\tau^{1/2} \|\Theta\|^{1/2}) c^{-1/2} \log(6/\delta) + \tau c^r R_0 \right)
    \end{equation*}
%
    Since $\Pb_\mathrm{te}(\mathrm d \theta) = w(\theta)\Pb_\mathrm{tr}(\mathrm d \theta)$, and $w(\theta) \geq \tau(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}>0$, it follows that
    \begin{align*}
        &\| \hat I_\mathrm{CBQ}(\theta) - I(\theta) \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \leq (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'})) T^{-r/(2r+1)} \times \\
        &\qquad \qquad \qquad \times \left( 16 (M + \|I\|_{\calH_\Theta}) (1+\tau^{-1/2} \|\Theta\|^{1/2}) c^{-1/2} \log(6/\delta) + c^r R_0 \right),
    \end{align*}
%
and we arrive at the statement of the theorem.
\end{proof}

Recall
%
\begin{align*}
    \hat{I}_\mathrm{BQ}(\theta_t) & = \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N}, \theta_t),\\
    \sigma^2_\mathrm{BQ}(\theta_t) &= \mathbb{E}_{X,X'\sim \mathbb{P}_\theta}[k_{\calX}(X,X')] - \mu_\theta(x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N}).
\end{align*}
%
We seek to bound $\sigma^2_\mathrm{BQ}(\theta_t)$. \citet[Proposition 3.8]{kanagawa2018gaussian} pointed out that the Gaussian noise posterior variance is the worst-case error in $\calH_{\calX}^{\lambda_\calX}$, the RKHS induced by the kernel $k_\calX^{\lambda_\calX}(x, x') = k_\calX(x, x') + \lambda_\calX$. Through straightforward algebraic manipulations and using the reproducing property, one can show that
%
\begin{equation*}
    \sigma^2_\mathrm{BQ}(\theta_t) - \lambda_\calX = \mathrm{MMD}^2(\hat{\Pb}^N_\theta, \Pb_\theta; \calH_{\calX}^{\lambda_\calX})=\sup_{\|f\|_{\calH_{\calX}^{\lambda_\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|,
\end{equation*}
%
for the empirical measure $\hat{\Pb}^N_\theta = w^{\lambda_\calX}_t \delta_{x^t_{1:N}}$, where the weights are the optimal BQ weights $w^{\lambda_\calX}_t=\left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} \mu_\theta(x^t_{1:N})$ and the vector notation means $\delta_{x^t_{1:N}} = [\delta_{x^t_1} \dots \delta_{x^t_N}]^\top$.

Since $\calH_\calX^{\lambda_\calX}$ is induced by the sum of kernels, $k_\calX^{\lambda_\calX}(x, x') = k_\calX(x, x') + \lambda_\calX$, it holds that $\calH_\calX \subseteq \calH_\calX^{\lambda_\calX}$, and $\| f  \|_{\calH_\calX^{\lambda_\calX}} \leq \| f  \|_{\calH_\calX}$~\citep[Theorem I.13.IV]{aronszajn1950theory}. Therefore, the class of functions $f$ for which $\| f  \|_{\calH_\calX} \leq 1$ is larger than that for which $\| f  \|_{\calH_\calX^{\lambda_\calX}} \leq 1$, and
%
\begin{equation*}
    \sup_{\|f\|_{\calH_{\calX}^{\lambda_\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right| \leq \sup_{\|f\|_{\calH_{\calX}} \leq 1} \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right|.
\end{equation*}
%
Next, note that for $\hat{f}_t(x) = k_{\calX}(x, x^t_{1:N})^\top \left(k_{\calX}(x^t_{1:N}, x^t_{1:N})+ \lambda_\calX \Id_N\right)^{-1} f(x^t_{1:N})$,
%
\begin{align*}
   \left| w^{\lambda_\calX}_t f(x^t_{1:N}) - \int_\calX f(x) \Pb_\theta(\mathrm d x)\right| = \left| \int_\calX \left(\hat{f}_t(x) - f(x) \right) \Pb_\theta(\mathrm d x)\right| &\leq  \int_\calX \left|\hat{f}_t(x) - f(x) \right| \Pb_\theta(\mathrm d x)  \\
    & \leq \|\hat{f}_t - f\|_{\calL^2(\calX)} \|p_\theta \|_{\calL^2(\calX)},
\end{align*}
%
where the last inequality is an application of H\"older's inequality. Finally, we apply~\citet[Theorem 4]{wynne2021convergence} for $\calW_2^0(\calX)=\calL^2(\calX)$ to get
%
\begin{equation*}
    \|\hat{f}_t - f\|_{\calL^2(\calX)} \leq C_1 h_{x_{1:N}^t}^{d/2} \left( h_{x_{1:N}^t}^{\nu_0} + \lambda_\calX \right)
\end{equation*}
%
\textcolor{red}{TODO under what assumptions}

By~\citet[Lemma 2]{Oates2019-ix}, provided the density satisfies $\inf_{\theta, x} p_{\theta}(x)=\eta>0$, there is a $C_t$ such that $\E h_{x_{1:N}^t} \leq C_t N^{-1/d + \varepsilon_0}$ for an arbitrarily small $\varepsilon_0>0$. Therefore,
%
\begin{equation*}
    \E_{x^t_i \sim \Pb_\theta}\|\hat{f}_t - f\|_{\calL^2(\calX)} \leq C_1 C_t^{d/2} N^{-1/2 + \varepsilon_0} \left( C_t^{\nu_0} N^{-\nu_0/d + \varepsilon_0} + \lambda_\calX \right)
\end{equation*}
%
By Markov's inequality, for any $\delta \in (0,1)$ it holds with probability at least $1-\delta$ that
%
\begin{equation*}
    \|\hat{f}_t - f\|_{\calL^2(\calX)} \leq \frac{C_1 C_t^{d/2}}{\delta} N^{-1/2 + \varepsilon_0} \left( C_t^{\nu_0} N^{-\nu_0/d + \varepsilon_0} + \lambda_\calX \right).
\end{equation*}
%
Putting it all together, we get
%
\begin{equation*}
    \Pb\left(\sigma^2_\mathrm{BQ}(\theta_t) \leq  \lambda_\calX + \frac{C_1 C_t^{d/2}}{\delta} N^{-1/2 + \varepsilon_0} \left( C_t^{\nu_0} N^{-\nu_0/d + \varepsilon_0} + \lambda_\calX \right) \right) \geq 1-\delta.
\end{equation*}
%
Recall that
\begin{equation*}
        \Pb\left(\| \hat I_\mathrm{CBQ}(\theta) - I(\theta) \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \leq C_0 (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'})) T^{-r/(2r+1)} \right) \geq 1-\delta.
    \end{equation*}
Then, by using that for any two events $A$ and $B$
\begin{equation*}
    \Pb(A \cap B) = 1 - \Pb(\neg A \cup \neg B) \geq 1 - \Pb(\neg A) - \Pb(\neg B) = \Pb(A) + \Pb(B) - 1,
\end{equation*}
we have that, with probability at least $1-2\delta$,
\begin{equation*}
        \| \hat I_\mathrm{CBQ}(\theta) - I(\theta) \|_{\calL^2(\Theta, \Pb_\mathrm{tr})} \leq C_0 \left(1 + \gamma_T^{-1} \left( \lambda_\calX + \frac{C_1 C_t^{d/2}}{\delta} N^{-1/2 + \varepsilon_0} \left( C_t^{\nu_0} N^{-\nu_0/d + \varepsilon_0} + \lambda_\calX \right)\right)\right) T^{-r/(2r+1)}.
    \end{equation*}


\section{\textcolor{red}{Hudson's bit}}


Assumptions are 
\begin{itemize}
    \item $\calX$ is bounded and satisfies an $(R, \delta)$ interior cone condition with a Lipschitz boundary, also known as $\mathcal{L}(R, \delta)$ domain.  
    \item $f(\cdot, \theta_t) \in W_2^{\tau_f}(\calX)$ for some $\tau_f > d/2$.
    \item The kernel $k_\calX$ is $\tau_k$ smooth in that the RKHS $\calH_\calX$ is norm equivalent to $W_2^{\tau_k}$.
    \item $\tau_f = \tau_k$
    \item fill distance $h_X$. Given a collection of points $X \subset \calX$, the fill distance $h_X$ is defined as $h_X:=\sup _{x \in \mathcal{X}} \inf _{y \in X}\|x-y\|_2$.
    \item The distribution $\mathbb{P}_\theta(x)$ has density $p_\theta(x)$ for any $\theta \in \Theta$.
\end{itemize}

\begin{lemma}[Corollary 5 in \cite{wynne2021convergence}]\label{lemma:wynne}
    For Gaussian process regression on observations $\{x_{1:N}, \hat{f}(x_{1:N}, \theta)\}$ with corruption $\varepsilon_{1:N}$. 
    Let $s\in [0, \tau_f]$, under some assumptions above and $h_{X_N} \leq C_1 N^{-\frac{1}{d}}$ for some $C_1 > 0$. Then, $\exists C, h_0>0$ such that with $h_{X_N} < h_0$, we have 
    \begin{equation*}
    \begin{aligned}
    & \mathbb{E}\left[\left\| 
 f(\cdot, \theta) - k_\calX(\cdot, x_{1:N}) \left(k_\calX(x_{1:N}, x_{1:N}) + \gamma_N \right)^{-1} \hat{f}(x_{1:N}, \theta)
 \right\|_{W_q^s(\mathcal{X})}\right]
 \\
& \quad \leq C n^{-\frac{1}{2}+\frac{s}{d}}\left(\mathbb{E}\left[\|\varepsilon\|_2\right]+n^{-\frac{\tau_f}{d}+\frac{1}{2}}\left(\|f\|_{W_2^\tau(\mathcal{X})}+ \left\|m \right\|_{W_2^{\tau_f}(\mathcal{X})}\right)\right),
    \end{aligned}
    \end{equation*}
%
    where the constant $C$ depends on $\calX, d, q, \tau_f$ and $h_0$ depends on $R, d, \tau_f$.
\end{lemma}
\begin{proof}
    It is a slight modification of the original Corollary 5 in \cite{wynne2021convergence} by taking $q=2$ and taking the GP prior mean to have fixed hyperparameters. 
\end{proof}

\begin{theorem}
    Under the assumptions listed above, for any $\theta_t$, $t \in \{0,\dots,T\}$, we have that
    \begin{equation*}
        \sigma^2_\mathrm{BQ}(\theta_t) \leq C N^{-\frac{\tau_f}{d}},
    \end{equation*}
%
    where the constant $C=C_0 \|f\|_{W_2^{\tau_f}(\mathcal{X})} \norm{p_{\theta_t}(x)}{L^2(\calX)}$ and $C_0$ depends on $\calX, d, \tau_f$.
\end{theorem}

\begin{proof}
Recall from \Cref{sec:cbq} that BQ variance has the form
%
\begin{align}\label{appeq:bq_var}
    \sigma^2_\mathrm{BQ}(\theta_t) = \mathbb{E}_{X,X'\sim \mathbb{P}} \left[k_{\calX}(X,X')\right] - \mu(x_{1:N})^\top \left(k_{\calX}(x_{1:N}, x_{1:N}) + \gamma_N \right)^{-1} \mu(x_{1:N})
\end{align} 
where $\gamma_N$ is a small jitter added to the Gram matrix to ensure invertibility. 

In the literature of BQ, it is commonly assumed that the function evaluations are noise-free. For example, the function evaluations are the result of very expensive computer simulations. 
However, in Equation \eqref{appeq:bq_var}, adding a small jitter $\gamma_N$ to the Gram matrix indicates that the likelihood is assumed to be Gaussian with very small variance. 
Therefore, we fall in the misspecified case, as in \Cref{lemma:wynne}, so we have
%
\begin{align*}
\begin{aligned}
& \mathbb{E}\left[\left\| 
f(\cdot, \theta_t) - k_\calX(\cdot, x_{1:N}^t) \left(k_\calX(x_{1:N}^t, x_{1:N}^t) + \gamma_N \right)^{-1} \hat{f}(x_{1:N}^t, \theta_t)
\right\|_{W_q^s(\mathcal{X})}\right]
\\
& \quad \leq C n^{-\frac{1}{2}+\frac{s}{d}}\left(\mathbb{E}\left[\|\varepsilon\|_2\right]+n^{-\frac{\tau_f}{d}+\frac{1}{2}}\left(\|f\|_{W_2^\tau(\mathcal{X})}+ \left\|m \right\|_{W_2^{\tau_f}(\mathcal{X})}\right)\right),
\end{aligned}
\end{align*}
%
Since we are in the noiseless case, $\hat{f} = f$ and $\varepsilon = 0$, and the expectation on the left-hand side can be removed.
The GP prior mean is taken to be the zero function, so $m=0$. Then, letting $s=0$, we arrive at
%
\begin{equation}
    \norm{f(\cdot, \theta_t) - k_\calX(\cdot, x_{1:N}^t) \left(k_\calX(x_{1:N}^t, x_{1:N}^t) + \gamma_N \right)^{-1} f(x_{1:N}^t, \theta_t) }{L^2(\calX)}
    \leq C N^{-\frac{\tau_f}{d}} \|f\|_{W_2^{\tau_f}(\mathcal{X})}
\end{equation}
%
By H\"older's inequality, we have
%
\begin{equation}\label{appeq:bq_bound}
\begin{aligned}
    &\int \Bigm|
    f(x, \theta_t) - k_\calX(x, x_{1:N}^t) \left(k_\calX(x_{1:N}^t, x_{1:N}^t) + \gamma_N \right)^{-1} f(x_{1:N}^t, \theta_t) \Bigm|  \Bigm| p_{\theta_t}(x)\Bigm| dx \\
    &\leq \norm{f(\cdot, \theta_t) - k_\calX(\cdot, x_{1:N}^t) \left(k_\calX(x_{1:N}^t, x_{1:N}^t) + \gamma_N \right)^{-1} f(x_{1:N}^t, \theta_t) }{L^2(\calX)} \norm{p_{\theta_t}(x)}{L^2(\calX)} \\
    &\leq C N^{-\frac{\tau_f}{d}} \|f\|_{W_2^{\tau_f}(\mathcal{X})} \norm{p_{\theta_t}(x)}{L^2(\calX)}
\end{aligned}
\end{equation}
%

From Section 2.3 in \cite{fx_quadrature}, we know that the BQ posterior variance is equivalently the worst case error in the reproducing kernel Hilbert space $\calH_\calX$
%
\begin{align*}
\begin{aligned}
    \sigma^2_\mathrm{BQ}(\theta_t) = \sup_{\substack{g(\cdot, \theta_t) \in \calH_\calX \\ \norm{g(\cdot, \theta_t)}{\calH_\calX} \leq 1}}
 \int \Bigm| g(x, \theta_t) - k_\calX(x, x_{1:N}^t) \left(k_\calX(x_{1:N}^t, x_{1:N}^t) + \gamma_N \right)^{-1} g(x_{1:N}^t, \theta_t) \Bigm|  \Bigm| p_{\theta_t}(x)\Bigm| dx  
\end{aligned}
\end{align*}
%

Since Equation \eqref{appeq:bq_bound} holds for any $f(\cdot, \theta_t)$, we have
%
\begin{align*}
\begin{aligned}
    \sigma^2_\mathrm{BQ}(\theta_t) \leq C N^{-\frac{\tau_f}{d}} \|f\|_{W_2^{\tau_f}(\mathcal{X})} \norm{p_{\theta_t}(x)}{L^2(\calX)}.
\end{aligned}
\end{align*}
%
And hence the theorem is proved. \hudson{not sure about the final step.}
\end{proof}





\section{Attempt at a better $q$ \textcolor{red}{TODO delete pre submission}}


\begin{lemma}[Assumption 3 in~\citet{gogolashvili2023importance}]
    Under technical assumptions in TODO, there exist constants $q \in [0,1]$, $W>0$, and $\sigma>0$ such that, for all $m \in \N$, $m \geq 2$, it holds that
    \begin{equation*}
        \left( \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^q \leq \frac{1}{2} m! W^{m-2} \sigma^2
    \end{equation*}
%
\end{lemma}
\begin{proof}

%
\begin{equation*}
    w(\theta) = \begin{cases}
        A_t - B_t \frac{\|\theta - \theta_t\|_\Theta }{\varepsilon}, & t \text{ such that } \|\theta - \theta_t\|_\Theta  < \varepsilon \\
        A_t - B_t & \text{otherwise}
    \end{cases}
\end{equation*}
%
\begin{align*}
    A_t &= (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1}  \\
    B_t &= (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_t))^{-1} - (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}
\end{align*}
%
\begin{equation*}
    w(\theta)^{\frac{q+m-1}{q}} = \begin{cases}
        \left(A_t - B_t \frac{\|\theta - \theta_t\|_\Theta }{\varepsilon} \right)^{\frac{q+m-1}{q}}, & t \text{ such that } \|\theta - \theta_t\|_\Theta  < \varepsilon \\
        (A_t - B_t)^{\frac{q+m-1}{q}} & \text{otherwise}
    \end{cases}
\end{equation*}
%
Then, 
%
\begin{equation}
\label{eq:eq1}
    \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \mathrm d \theta = \left(\| \Theta \| - \varepsilon T V_d\right) (A_t - B_t)^{\frac{q+m-1}{q}} + \sum_{t=1}^T \int_{B_\varepsilon(\theta_t)} \left(A_t - B_t \frac{\|\theta - \theta_t\|_\Theta }{\varepsilon} \right)^{\frac{q+m-1}{q}} \mathrm d \theta
\end{equation}
%
where $V_d$ is the volume of a $d$-dimensional unit ball, $V_d = \pi^{d/2} / \Gamma(1+d/2)$.
%
\begin{equation*}
    \int_{B_\varepsilon(\theta_t)} \left(A_t - B_t \frac{\|\theta - \theta_t\|_\Theta }{\varepsilon} \right)^{\frac{q+m-1}{q}} \mathrm d \theta =2 \pi \int_0^\varepsilon \left(A_t - B_t r / \varepsilon \right)^{\frac{q+m-1}{q}} r^{d-1} \mathrm d r \prod_{i=1}^{d-2} \int_0^\pi \sin^{d-2-i+1} (\phi_i) \mathrm d \phi_i
\end{equation*}
%
\begin{equation*}
    2 \pi \int_0^1 r^{d-1} \mathrm d r \prod_{i=1}^{d-2} \int_0^\pi \sin^{d-2-i+1} (\phi_i) \mathrm d \phi_i = V_d = \pi^{d/2} / \Gamma(1+d/2)
\end{equation*}
%
\begin{equation*}
    2 \pi \prod_{i=1}^{d-2} \int_0^\pi \sin^{d-2-i+1} (\phi_i) \mathrm d \phi_i = d \pi^{d/2} / \Gamma(1+d/2)
\end{equation*}
%

%
\begin{align*}
    &\frac{d \pi^{d/2}}{\Gamma(1+d/2) } \int_0^\varepsilon \left(A_t - B_t r / \varepsilon \right)^{\frac{q+m-1}{q}} r^{d-1} \mathrm d r 
    = \frac{\pi^{d/2}}{\Gamma(1+d/2) } \Bigg[ \frac{A_t^{\frac{q+m-1}{q}+d} - (A_t- B_t)^{\frac{q+m-1}{q}+d}}{B_t(\frac{q+m-1}{q}+d)} \prod_{j=1}^{d-1} \frac{d-j+1}{\frac{q+m-1}{q}+j}
    \\
    &\qquad- \sum_{i=1}^{d-1} \frac{1}{B_t^i} \varepsilon^{d-i} (A_t - B_t)^{\frac{q+m-1}{q}+i} \prod_{j=1}^i \frac{d-j+1}{\frac{q+m-1}{q}+j} \Bigg]
\end{align*}
%
The latter equality is
%
\begin{align*}
    \int_0^\varepsilon \left(a-b r \right)^c r^{d-1} \mathrm d r &= \frac{1}{d} \frac{a^{c+d} - (a-b \varepsilon)^{c+d}}{b(c+d)} \prod_{j=1}^{d-1} \frac{d-j+1}{c+j}
    -\frac{1}{d} \sum_{i=1}^{d-1} \frac{1}{b^i} \varepsilon^{d-i} (a-b \varepsilon)^{c+i} \prod_{j=1}^i \frac{d-j+1}{c+j}
\end{align*}
%
which comes from integration by parts,
%
\begin{align*}
    \int_0^\varepsilon \left(a-b r \right)^c r^{d-1} \mathrm d r &= -\frac{1}{b(c+1)} \int_0^\varepsilon r^{d-1} \mathrm d \left(a-b r \right)^{c+1} \\
    &= -\frac{1}{b(c+1)} \varepsilon^{d-1}\left(a-b \varepsilon \right)^{c+1} + \frac{1}{b(c+1)} \int_0^\varepsilon  \left(a-b r \right)^{c+1} \mathrm d r^{d-1} \\
    &= -\frac{1}{b(c+1)} \varepsilon^{d-1}\left(a-b \varepsilon \right)^{c+1} \\
    &\quad- \frac{d-1}{b^2(c+1)(c+2)} \varepsilon^{d-2} \left(a-b \varepsilon \right)^{c+2} \\
    &\quad+ \frac{d-1}{b^2(c+1)(c+2)} \int_0^\varepsilon \left(a-b r \right)^{c+2} \mathrm d  r^{d-2}=\dots
\end{align*}
%
and
%
\begin{equation*}
    \int_0^\varepsilon \left(a-b r \right)^{c+d-1} \mathrm d r = \frac{a^{c+d} - (a-b \varepsilon)^{c+d}}{b(c+d)}.
\end{equation*}
%
So~\eqref{eq:eq1} becomes 
%
\begin{align*}
    &\int_\Theta w(\theta)^{\frac{q+m-1}{q}} \mathrm d \theta = \left(\| \Theta \| - \varepsilon T V_d\right) (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-\frac{q+m-1}{q}} \\
    &\qquad+ \sum_{t=1}^T \frac{\pi^{d/2}}{\Gamma(1+d/2) } \Bigg[ \frac{A_t^{\frac{q+m-1}{q}+d} - (A_t- B_t)^{\frac{q+m-1}{q}+d}}{B_t(\frac{q+m-1}{q}+d)} \prod_{j=1}^{d-1} \frac{d-j+1}{\frac{q+m-1}{q}+j}
    \\
    &\qquad- \sum_{i=1}^{d-1} \frac{1}{B_t^i} \varepsilon^{d-i} (A_t - B_t)^{\frac{q+m-1}{q}+i} \prod_{j=1}^i \frac{d-j+1}{\frac{q+m-1}{q}+j} \Bigg]
\end{align*}
%


blah
%
\begin{equation*}
    w(\theta) = (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1} + \max\left\{ \left(1 - \frac{\|\theta - h(\theta)\|_\Theta }{\varepsilon}\right)\left((1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(h(\theta)))^{-1} - (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}\right), 0\right\}
\end{equation*}
%

%
\begin{equation*}
    w(\theta) = \max\left\{ \left(1 - \frac{\|\theta - h(\theta)\|_\Theta }{\varepsilon}\right)(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(h(\theta)))^{-1} + \frac{\|\theta - h(\theta)\|_\Theta }{\varepsilon}(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}, (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}\right\}
\end{equation*}
%
\begin{equation*}
    w(\theta) = \max\left\{ \left(1 - \frac{\|\theta - h(\theta)\|_\Theta }{\varepsilon}\right)(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(h(\theta)))^{-1} + \frac{\|\theta - h(\theta)\|_\Theta }{\varepsilon}(1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}, (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}\right\}
\end{equation*}
%
\begin{equation*}
    \left( \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^\frac{q}{q+m-1} \leq (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}+ \left( \int_{\Theta_{T, \varepsilon}} B(\theta)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^\frac{q}{q+m-1}
\end{equation*}
%
for 
%
\begin{equation*}
    B(\theta) = \left(1 - \frac{\|\theta - h(\theta)\|_\Theta }{\varepsilon}\right)\left((1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(h(\theta)))^{-1} - (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}\right)
\end{equation*}
%
So
%
\begin{equation*}
    \left( \int_\Theta B(\theta)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^\frac{q}{q+m-1} = \left( \int_{\Theta_{T, \varepsilon}} \left((1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(h(\theta)))^{-1} - (1 + \gamma_T^{-1} \sigma^2_\mathrm{BQ}(\theta_{t'}))^{-1}\right)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^\frac{q}{q+m-1}
\end{equation*}
%

By the construction of $w(\theta)$,
%
\begin{equation*}
     \int_\Theta w(\theta)^m \mathrm{d} \theta  \leq \| \Theta \| \sigma^{-2m}_\mathrm{BQ}(\theta_{t'}) + \varepsilon_0
\end{equation*}
%
%
\end{proof}

\section{Convergence rate---OLD \textcolor{red}{TODO delete pre submission}}

Recall the CBQ estimator proposed in~\eqref{eq:CBQ_estimator},
%
\begin{equation*}
    \hat{I}_\mathrm{CBQ}(\theta) = k_\Theta(\theta, \theta_{1:T})^\top \big(k_\Theta(\theta_{1:T}, \theta_{1:T}) + T^\alpha \sigma^2_\mathrm{BQ}(\theta_{1:T}) \Id_T\big)^{-1} \hat{I}_\mathrm{BQ}(\theta_{1:T}),
\end{equation*}
%
where $\alpha>0$, $\hat{I}_\mathrm{BQ}(\theta_t)$ and $\sigma^2_\mathrm{BQ}(\theta_t)$, for $t \in \{1, \dots, T\}$, are BQ posterior mean and variance obtained in the first stage as given in~\eqref{eq:BQ_mean_and_var}
%
\begin{align*}
    \hat{I}_\mathrm{BQ}(\theta_t) & = \mu_\theta(x^t_{1:m})^\top k_{\Theta}(x^t_{1:m}, x^t_{1:m})^{-1} f(x^t_{1:m}, \theta_t),\\
    \sigma^2_\mathrm{BQ}(\theta_t) &= \mathbb{E}_{X,X'\sim \mathbb{P}_\theta}[k_{\Theta}(X,X')] - \mu_\theta(x^t_{1:m})^\top k_{\Theta}(x^t_{1:m}, x^t_{1:m})^{-1} \mu_\theta(x^t_{1:m}).
\end{align*}
%
It was pointed out in~\citet[Remark 2]{gogolashvili2023importance}, (and can be seen through straightforward differentiation) that the estimator $\hat{I}_\mathrm{CBQ}(\theta)$ is the minimiser of the importance weighted kernel ridge regression loss over functions in the RKHS $\calH_\Theta$,
%
\begin{equation*}
    \hat{I}_\mathrm{CBQ}(\theta) = \argmin_{F \in \calH_\Theta} \Big\{ \sum_{t=1}^T \frac{\big(F(\theta_t) - \hat{I}_\mathrm{BQ}(\theta_t)\big)^2}{\sigma^2_\mathrm{BQ}(\theta_t)} + T^{-\alpha}\| F \|_{\calH_\Theta}^2 \Big\}.
\end{equation*}
%
Suppose $\theta_i$ were sampled from a probability measure $\Pb_\mathrm{tr}$ on $\Theta$. Then, 
%
\begin{equation*}
    \Pb_\mathrm{te}(A) = \int_A w(\theta)\Pb_\mathrm{tr}(\mathrm{d} \theta)
\end{equation*}
%
defines a positive measure on $\Theta$ for any positive $w(\theta) > 0$ for which the integral exists~\citep[ Proposition 232D]{fremlin2000measure}; further, if $w(\theta)$ is bounded, the measure is finite. Suppose we construct a $w(\theta)$ that satisfies these requirements, and is such that $w(\theta_t) = 1 / \sigma^2_\mathrm{BQ}(\theta_t) = \sigma^{-2}_\mathrm{BQ}(\theta_t)$. Then, since $\E [\hat{I}_\mathrm{BQ}(\theta_i)] = I(\theta_i)$, this (TODO proper reference) loss can be considered an unbiased finite-sample approximation of
%
\begin{equation*}
    \int_\Theta ( F(\theta) - I(\theta) )^2 \Pb_\mathrm{te}(\mathrm{d} \theta) + \frac{1}{n} \| F \|^2_{\calH_\Theta}.
\end{equation*}
%
Under a further assumption that the problem is well-specified, meaning $I(\theta) \in \calH_\Theta$, an upper bound on the rate of convergence of $\hat{I}_\mathrm{CBQ}(\theta)$ to $I(\theta)$ as $n \to \infty$ was established in~\citet[Theorem 4]{gogolashvili2023importance}. Specifically, [TODO summarise once it's more clear.]

To apply the result, we define a $w(\theta)$ of a convenient form that satisfies the requirements mentioned above, specifically $w(\theta) \in (0, A]$ for some $A<\infty$ and any $\theta \in \Theta$, and $w(\theta_t) = \sigma^{-2}_\mathrm{BQ}(\theta_t)$ for all $t \in \{0, \dots,T\}$.\footnote{The integrability requirement is specific to $\Pb_\mathrm{tr}$ and will be assumed at a later stage.} Take $t' = \argmin_{t \in \{0,\dots,T\}}\{\sigma^{-2}_\mathrm{BQ}(\theta_t)\}$, so that $\sigma^{-2}_\mathrm{BQ}(\theta_{t'})>0$, and define
%
\begin{equation*}
    w(\theta) = \sigma^{-2}_\mathrm{BQ}(\theta_{t'}) + \max\left\{ \left(1 - \frac{\|\theta - h(\theta)\|_\Theta }{\varepsilon}\right)\left(\sigma^{-2}_\mathrm{BQ}(h(\theta)) - \sigma^{-2}_\mathrm{BQ}(\theta_{t'})\right), 0\right\}
\end{equation*}
%
where $h(\theta) = \argmin_{\theta' \in \{\theta_0,\dots,\theta_T\}}\{\|\theta-\theta'\|_\Theta\}$ is the point in the set $\{\theta_0,\dots,\theta_T\}$ that is closest to $\theta$. For $\Theta \subset \R$, such $w(\theta)$ is easily visualised, as can be seen in~\Cref{fig:wtheta_illustration}. Crucially, for a bounded $\Theta$, the volume $\int_\Theta w(\theta) \mathrm{d} \theta$ can be made arbitrarily close to $\| \Theta \| \sigma^{-2}_\mathrm{BQ}(\theta_{t'})$ for a small enough $\varepsilon$.
\begin{figure}[H]
\centering \includegraphics[width=0.5\textwidth]{figures/wtheta_illustration.pdf}
\caption{Illustration of $w(\theta)$ for $\Theta \subset \R$}
\label{fig:wtheta_illustration}
\end{figure}

It is easy to see that $w(\theta)$ is bounded above by $\max_{t \in \{0,\dots,T\}} \sigma^{-2}_\mathrm{BQ}(\theta_{t})<\infty$, and below by $\sigma^{-2}_\mathrm{BQ}(\theta_{t'})>0$ for any $\theta \in \Theta$, and $w(\theta_t) = \sigma^{-2}_\mathrm{BQ}(\theta_t)$ as required.

\subsection{Technical Assumptions}

\begin{itemize}
    \item $I(\theta)$ lies in the Sobolev space $\calW^{2, s}(\Theta)$.
    \item $k_\Theta$ is a Mat\'ern kernel of order $\nu$ such that $s \geq \nu+d/2$.
\end{itemize}

\subsection{Convergence}



% $r = \min\{s/(2\nu-d), 1\}$

% $r = (-1-A)/2 $

% $A = s(1-q)+q$

% $s$ is the rate of decay of eigenvalues. $s=d/(2\nu+d)$

% $q=1$

% $r = -1$

\begin{lemma}[Assumption 2 in~\citet{gogolashvili2023importance}]
\label{lemma:assumption2}
    Under technical assumptions in TODO, $I(\theta)=L^r g$ for $r=\min\{s/(2\nu+d), 1\}$ and some $g \in \calL^2(\Theta, \Pb_\mathrm{te})$ of norm $R = \|g\|_{\calL^2(\Theta, \Pb_\mathrm{te})}$ for some $R>0$.
\end{lemma}
\begin{proof}
% ~\citet[Theorem 4.6]{steinwart2012mercer}
    By definition of the integral operator, the range $\mathrm{ran} L^r$ coincides with the $2r$-interpolation space between $\calH_\Theta$ and $\calL^2(\Theta, \Pb_\mathrm{te})$. Therefore, we ought to show that $I(\theta)$ lies in this $2r$-interpolation space. First, we note that $\calH_\Theta \simeq \calW^{2, \nu+d/2}(\Theta)$, and given condition TODO and the boundedness of $w(\theta)$, $\calL^2(\Theta, \Pb_\mathrm{te})$ is norm-equivalent to $\calL^2(\Theta)$. This implies norm-equivalence of the interpolation spaces; therefore, it suffices to establish that $I(\theta)$ lies in the $2r$-interpolation space between $\calW^{2, \nu+d/2}(\Theta)$ and $\calL^2(\Theta)$.

    If $\calH_\Theta^{2r}$ is the $2r$-interpolation space between $\calW^{2, \nu+d/2}(\Theta)$ and $\calL^2(\Theta)$, $\calW^{2, \nu+d/2}(\Theta)$ is the $r$-interpolation space between  $\calH_\Theta^{2r}$ and $\calL^2(\Theta)$. Since $r \in (0,1)$, by~\citet[Theorem 4.6]{steinwart2012mercer}, the $r$-interpolation space is norm-equivalent to the $r$-interpolation space of the real kind. We know that $\calW^{2, \nu+d/2}(\Theta)$ is the $r$-interpolation space of the real kind between $\calW^{2, 2(\nu+d/2)/r}(\Theta)$ and $\calL^2(\Theta)$ (TODO ref?).
    
    The $2r$-interpolation of the real kind between $\calW^{2, \nu+d/2}(\Theta)$ and $\calL^2(\Theta)$ is the space $\calW^{2, 2r(\nu+d/2)}(\Theta)$ (TODO ref?). Since $I(\theta)$ lies in $\calW^{2, s}$, by inclusion of Sobolev spaces we have that it lies in $\calW^{2, s_0}$ for any $s_0 \leq s$. Therefore, $I(\theta)$ lies in the aforementioned interpolation space whenever $2r(\nu+d/2) \leq s$, meaning $r \leq s/(2\nu+d)$. The result follows.
\end{proof}

Note that, under Assumption TODO, $s/(2\nu+d) \geq 1/2$, and we have $r \in [1/2, 1]$, as is required by Assumption 2 in~\citet{gogolashvili2023importance}. 

\begin{lemma}[Assumption 3 in~\citet{gogolashvili2023importance}]
    Under technical assumptions in TODO, there exist constants $q \in [0,1]$, $W>0$, and $\sigma>0$ such that, for all $m \in \N$, $m \geq 2$, it holds that
    \begin{equation*}
        \left( \int_\Theta w(\theta)^{\frac{q+m-1}{q}} \Pb_\mathrm{tr}(\mathrm{d} \theta) \right)^q \leq \frac{1}{2} m! W^{m-2} \sigma^2
    \end{equation*}
%
\end{lemma}
\begin{proof}
%
\begin{equation*}
    w(\theta) = \sigma^{-2}_\mathrm{BQ}(\theta_{t'}) + \max\left\{ \left(1 - \frac{\|\theta - h(\theta)\|_\Theta }{\varepsilon}\right)\left(\sigma^{-2}_\mathrm{BQ}(h(\theta)) - \sigma^{-2}_\mathrm{BQ}(\theta_{t'})\right), 0\right\}
\end{equation*}
%
By the construction of $w(\theta)$,
%
\begin{equation*}
     \int_\Theta w(\theta)^m \mathrm{d} \theta  \leq \| \Theta \| \sigma^{-2m}_\mathrm{BQ}(\theta_{t'}) + \varepsilon_0
\end{equation*}
%
%
\end{proof}

$n^{-1/2}$, $n^{-(1-A)/2}$, $n^{-r}$

As $0 \leq A \leq 1$, we have $-1/2 \leq -(1-A)/2 \leq 0$, so the second term always dominates the first term. Therefore, the rate is $n^{-\min\{r, (1-A)/2\}}$. Recall $r=\min\{s/(2\nu+d), 1\} \in [1/2, 1]$, and if we take $q=1$ (easiest), then $A = d/(2\nu+d)$. That gives the rate of 
$n^{-\min\{s/(2\nu+d), \nu/(2\nu+d), 1\}}$. Since we assumed $s \geq \nu+d/2$, the rate is $n^{-\min\{\nu/(2\nu+d), 1\}}$

so long as
%
\begin{equation*}
    c^{1+q} n^{-q} \geq 64(V+\gamma^2) N(\lambda)^{1-q} \log^2 (6/\delta)
\end{equation*}
%
for $q=1$,
%
\begin{equation*}
    c^2 n^{-1} \geq 64(W+\sigma^2) \log^2 (6/\delta)
\end{equation*}
%

$  c^{1+q} n^{-q} \geq 64 (V+\gamma^2) c^{s(q-1)} n^{s(1-q)} (E_s)^{2(1-q)} \log$

$  c^{1+q - s(q-1)} n^{-q - s(1-q)} \geq 64 (V+\gamma^2) (E_s)^{2(1-q)} \log$
% \section{Kernel mean embedding for Matern kernel under Gaussian measure}
% This section provides a derivation for the mean embedding of Matern 1.5 kernel under Gaussian measure. Note that in \cite{ming2021linked} only the one dimensional case is provided.

% \begin{align}
%     p(x_{1:T}) = \calN \left(x_{1:T} \mid \bmu, \bSigma \right), \quad k(x_{1:T}, \theta_{1:T}) = \left(1 + \frac{\sqrt{3} |x_{1:T} - \theta_{1:T}|}{l}\right) \exp{\left(-\frac{\sqrt{3}| x_{1:T} - \theta_{1:T}|}{l}\right)}
% \end{align}
% We are interested in the the kernel mean embedding of the form
% \begin{align}
% \begin{aligned}
%     &\int k(x_{1:T}, \theta_{1:T}) \calN \left(x_{1:T} \mid \bmu, \bSigma \right) d x_{1:T} \\
%     &= \int {(2\pi)^{-\frac{d}{2}}} |\Sigma|^{-\frac{1}{2}} \exp \left(-\frac{1}{2} \left(x_{1:T} - \bmu)^\top \bSigma^{-1} (x_{1:T} - \bmu \right) \right) \left(1 + \frac{\sqrt{3} |x_{1:T} - \theta_{1:T}|}{l}\right) \exp{\left(-\frac{\sqrt{3}|x_{1:T} - \theta_{1:T}|}{l}\right)} dx_{1:T} \\
%     &= \int_{\theta_{1:T}} {(2\pi)^{-\frac{d}{2}}} |\Sigma|^{-\frac{1}{2}} \exp \left(-\frac{1}{2} \left(x_{1:T} - \bmu)^\top \bSigma^{-1} (x_{1:T} - \bmu \right) \right) \left(1 + \frac{\sqrt{3} (x_{1:T} - \theta_{1:T})}{l}\right) \exp{\left(-\frac{\sqrt{3}(x_{1:T} - \theta_{1:T})}{l}\right)} dx_{1:T} \\
%     &+ \int^{\theta_{1:T}} {(2\pi)^{-\frac{d}{2}}} |\Sigma|^{-\frac{1}{2}} \exp \left( -\frac{1}{2} \left(x_{1:T} - \bmu)^\top \bSigma^{-1} (x_{1:T} - \bmu \right) \right) \left(1 + \frac{\sqrt{3} (\theta_{1:T} - x_{1:T})}{l}\right) \exp{\left(-\frac{\sqrt{3}(\theta_{1:T} - x_{1:T})}{l}\right)} dx_{1:T}
% \end{aligned}
% \end{align}
% We deal with the first term
% \begin{align}
% \begin{aligned}
    
% \end{aligned}
% \end{align}
%

\section{Notation \textcolor{red}{(TODO delete pre submission)}}

%
\begin{align*}
& \theta_{1:T}  = [\theta_1 \cdots \theta_T]^\top \in \Theta^T, \Theta \subseteq \R^p\\
& x^t_{1:N}  = [x^t_1 \cdots x^t_N]^\top \in \calX^N \text{ for all } t \in \{1,\ldots,T\}, \calX \subseteq \R^d \\
&m_\calX, k_\calX, \calH_{\calX},  m_{\Theta}, k_{\Theta}, \calH_{\Theta}\\
& \hat{I}_{\text{MC}}(\theta),  \hat{I}_{\text{IS}}(\theta), \hat{I}_{\text{LSMC}}(\theta),
\hat{I}_{\text{KMS}}(\theta), \\
& (\hat{I}_{\text{BQ}}, \sigma^2_{\text{BQ}}) \text{ or later }
(\hat{I}_{\text{BQ}}(\theta),\sigma^2_{\text{BQ}}(\theta)),
\\
& \mu_\theta(x) = \mathbb{E}_{X\sim \mathbb{P}_\theta}[k_{\calX}(X,x)],  \mathbb{E}_{X,X'\sim \mathbb{P}_\theta}[k_{\calX}(X,X')]\\
& \sigma^2(\theta), \qquad |I(\theta)-\hat{I}_{\text{BQ}}(\theta)| < \|f(\cdot,\theta) \|_{\mathcal{H}_{\calX}} \sigma^2_{\text{BQ}}(\theta)
\end{align*}
%

%
\begin{align*}
    g(x,\theta,\omega), I_g(\theta,\omega) = \mathbb{E}_{X \sim \mathbb{P}_\theta}[g(X,\theta,\omega)], k_{\Theta,\Theta} = k_\Theta \times k_\Theta
    \\
    k_\Theta (\theta, \theta') 
  = \int \int  k_\calX(x, x') p(x' \mid \theta') p(x \mid \theta) dx dx' 
\end{align*}
%


\section{Practical Considerations}

\subsection{Tractable Kernel Means}\label{appsec:tractable_kernel_means}


One of the limitations of CBQ, and of all BQ-related methods, is that the kernel mean embedding $\mu$ is assumed to be known in closed form; see Table 1 in \citet{fx_quadrature} or the \texttt{ProbNum} package \citep{Wenger2021} for pairs of kernels and distributions. 
When the pair of kernel and distribution does not produce a closed-form embedding, several workarounds are available.

First, when the embedding of $\mathbb{P}$ is intractable but the embedding of $\mathbb{Q}$ is known, we can use the `importance sampling trick' which consists of writing the integral as $I=\mathbb{E}_{X \sim \mathbb{P}} [f(X)] = \mathbb{E}_{X \sim \mathbb{Q}} [g(X)]$ where $g(x)=f(x)p(x)/q(x)$ and $p,q$ are the densities of $\mathbb{P},\mathbb{Q}$. Alternatively, assuming we know the quantile function $\Phi^{-1}$ of $\mathbb{P}$, we can use the `inverse transform trick' which consists of writing $I=\mathbb{E}_{X \sim \mathbb{P}} [f(X)] = \mathbb{E}_{U \sim \mathbb{U}} [g(U)]$ where $g(u) = f(\Phi^{-1}(u))$ and $\mathbb{U}$ is a uniform distribution on some hypercube.
Additionally, if the distribution $\mathbb{P}$ is only known up to the normalization constant, for example the posterior distribution of Bayesian neural networks, then we can use Stein kernels, which provide more flexible closed-form kernel mean embeddings.

\paragraph{Stein Reproducing Kernels} Suppose we have a distribution with density $p(x)$ and a function $f(x)$ with the property that $\lim_{\|x\| \to \infty} p(x)f(x) = 0$.
We can define the Stein operator $T_p$ acting on function $f$ and obtain the Stein identity:
%
\begin{align*}
    T_p[f](x) = f(x) \nabla_x \log p(x)+\nabla_x f(x), \quad \E_p[T_p[f](x)] = 0
\end{align*}
%
As a result, for any positive definite kernel $k_0: \calX \times \calX \to \R$ as the base kernel, we can obtain a Stein kernel by applying the Stein operator on both arguments of the kernel $k_0$.
%
\begin{align*}
\begin{aligned}
    k_p(x, x') = T_p^x[T_p^{x'}[k_0(x, x')]] = &\nabla_x \log p(x) k_0(x, x' ) \nabla_{x'} \log p(x') + \nabla_x \log p(x) \nabla_{x'} k_0(x, x')\\ 
    &+ \nabla_{x'} \log p(x') \nabla_x k_0(x, x') + \nabla_x \nabla_{x'} k_0(x, x')
\end{aligned}
\end{align*}
%
It is noteworthy to mention that when taking the derivative of the logarithm, the requirement for the normalization constant of $p(x)$ is effectively eliminated. 

The Stein identity implies that the kernel mean embedding of the Stein kernel is equal to $0$, i.e.\ $\mu(x') = \int k_p(x, x')p(x)dx = 0$. However, it is unreasonable to have $\mu(x_{1:N})=0$ as that would suggest the BQ estimate in Equation~\eqref{eq:BQ_mean_and_var} is always equal to $0$, and hence the CBQ estimator is also $0$. Therefore, we add a small learnable constant $c$ to the Stein kernel $k_p$, i.e.\ $\tilde{k}_p(x, x') = k_p(x,x') + c$, so that the kernel mean embedding becomes $\mu(x') = \int \tilde{k}_p(x, x')p(x)dx = c$. The constant $c$ is selected jointly with the other hyperparameters by maximizing the marginal log-likelihood. A similar technique has been used in \citet{oates2017control} to select control functionals.

Unlike traditional kernels such as the RBF or Mat\'ern kernels, the Stein kernel is non-stationary, which implies that the properties of the samples drawn from the GP, such as smoothness or differentiability, may vary across the input space.
Therefore, practitioners should take extra care when using a GP prior with a Stein kernel as covariance.
In our experiments, shown in \Cref{sec:experiments} and \Cref{appsec:experiments}, using a Stein kernel did not lead to a large difference in performance compared with traditional kernels.

\subsection{Hyperparameter Selection}\label{appsec:hyperparameter_selection}
In the entirety of the experiments presented within this paper, the Gaussian Process (GP) prior mean functions, denoted as $m_\Theta(\theta)$ and $m_\calX(x)$, are consistently considered to be zero functions.
In accordance with this, the regression target is correspondingly normalized.

Covariance functions determine the properties of samples from a Gaussian process, so the hyperparameters of both kernels $k_\calX$ and $k_\Theta$ need to be carefully selected. Normally for CBQ, that would include four hyperparameters: the lengthscales $l_\calX$, $l_\Theta$ and the amplitudes $A_\calX$, $A_\Theta$.
In principle, all hyperparameters are selected via maximising the log-marginal likelihood.
For $k_\calX$, suppose the GP mean $m(x_{1:N})$ is 0, the log-marginal likelihood can be written as~\citep{GPML}:
%
\begin{align*}
L(l_\calX, A_\calX) &=  -\frac{1}{2}f(x_{1:N})^\top k_{\calX}(x_{1:N},x_{1:N};l_\calX, A_\calX)^{-1} f(x_{1:N}) \\
& \qquad -\frac{1}{2} \log \left| k_{\calX}(x_{1:N},x_{1:N}; l_\calX, A_\calX) \right| - \frac{N}{2} \log(2 \pi).
\end{align*}
%
Fortunately, the optimal amplitude parameter $A_\calX$ is known in closed-form by taking the derivative:
%
\begin{align*}
    A_\calX^* := \sqrt{\frac{f(x_{1:N})^\top \tilde{k}_{\calX}(x_{1:N},x_{1:N}; l_\calX)^{-1} f(x_{1:N})}{N}},
\end{align*} 
where $\tilde{k}_{\calX}$ denotes $k_\calX$ with the amplitude parameter equal to $1$.
Therefore, for $k_\calX$ we only need to select the optimal lengthscale $l_\calX$, and we use a grid search over $\left[0.1, 0.3, 1.0, 3.0, 10.0 \right]$ and select the value that gives the largest log-marginal likelihood.

If $k_\calX$ is a Stein kernel, it has another hyperparameter $c$ along with lengthscale $l_\calX$ and amplitude $A_\calX$. 
For the Stein kernel, we use gradient-based optimization, such as stochastic gradient descent, on the log-marginal likelihood to find the optimal values of $c, l_\calX, A_\calX$. The optimization is implemented with the JAX autodiff library~\citep{jax2018github}.

For kernel $k_\Theta$, we also optimize the hyperparameters via maximizing log-marginal likelihood. 
%
\begin{align*}
    L(l_\Theta, A_\Theta) &=  -\frac{1}{2} \hat{I}_\mathrm{BQ} (\theta_{1:T})^\top \Bigl(k_{\Theta}(\theta_{1:T}, \theta_{1:T};l_\Theta, A_\Theta) + \sigma_{\text{BQ}}^2(\theta_{1:T})\Id_T \Bigr)^{-1} \hat{I}_\mathrm{BQ}(\theta_{1:T}) \\
& \qquad -\frac{1}{2} \log \Bigl| k_{\Theta}(\theta_{1:T},\theta_{1:T}; l_\Theta, A_\Theta) + \sigma_{\text{BQ}}^2(\theta_{1:T})\Id_T \Bigr| - \frac{T}{2} \log(2 \pi).
\end{align*}
%
For $k_\Theta$, neither the amplitude $A_\Theta$ nor the lengthscale $l_\Theta$ has a closed-form expression. We do a grid search over $\left[1.0, 10.0, 100.0, 1000.0 \right]$ for the amplitude $A_\Theta$ and a grid search over $\left[0.1, 0.3, 1.0, 3.0, 10.0 \right]$ for the lengthscale $l_\Theta$, and we select the values that give the largest log-marginal likelihood.




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%



\section{Experiments}\label{appsec:experiments}
In this section, we provide a more detailed description of the settings of all experiments in the main text, and we provide further results and ablation studies. All figures reported in the paper are created using the median values obtained from 20 separate runs with different random seeds. The standard error is shown as a shaded area around the median. 

\subsection{Synthetic Experiment: Bayesian Sensitivity Analysis for Linear Models}\label{appsec:bayes_sensitivity}
\subsubsection{Experimental Setting}

\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=250pt]{neurips_2023/figures/legend.pdf}
    \vspace{-5pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_N_10.pdf}
        \caption{RMSE with fixed $N=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_N_50.pdf}
        \caption{RMSE with fixed $N=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_N_100.pdf}
        \caption{RMSE with fixed $N=100$.}
    \end{subfigure}
        \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_T_10.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_T_50.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_T_100.pdf}
        \caption{RMSE with fixed $T=100$.}
    \end{subfigure}
    \caption{More experimental results for Bayesian sensitivity analysis. The integrand is $f(w)=w^\top w$ and the dimension $d=2$.}\label{appfig:bayes_sensitivity_1}
\end{figure}

In the toy experiment, we do sensitivity analysis on the hyperparameters in Bayesian linear regression. 
The observations are $\calD = (Y, Z)$ with $Y \in \R^{M \times d}$ and $Z \in \R^{M}$, where $M$ is the number of observations and $d$ is the dimension including the intercept. 
The prior is chosen as a multivariate Gaussian $p(w) = \calN(0, \bar{\Sigma})$ and the likelihood is also Gaussian $p(\calD \mid w) = \calN(z; w^\top y, \eta^{-1})$. The posterior is known to have a closed form~\citep{bishop:2006:PRML}
%
\begin{align*}
    p(w \mid \calD) = \calN(\tilde{m}, \tilde{\Sigma}), \quad \tilde{\Sigma}^{-1} = \bar{\Sigma}^{-1} + \eta Y^\top Y, \quad \tilde{m} = \eta \tilde{\Sigma} Y^\top Z
\end{align*}
%
We are interested in the integral of two functions against the posterior distribution $I(\bar{\Sigma}) = \int f(w) p(w \mid \calD; \bar{\Sigma}) dw $. 
The first integrand is $f(w)=w^\top w$ and the integral describes the sum of posterior variance. The second integrand is $f(w)=w^\top y^\ast$ for a new observation $y^\ast$ and the integral describes the predictive mean.
Both integrals are known to have closed form expression in Bayesian linear regression~\citep{bishop:2006:PRML}, so it is easier for us to compare the performances of different methods.
We are interested in the analysis of the sensitivity of $I(\bar{\Sigma})$ towards $\bar{\Sigma}$. The prior covariance $\bar{\Sigma}$ is chosen to be a diagonal matrix for simplification. The observations $\bar{\Sigma}_{1:T}$ are sampled from a uniform distribution on each entry, i.e.\ $\bar{\Sigma}_{1:T} \sim \mathrm{Unif}\left(\left[1, 3\right]^d\right)$. For each $\bar{\Sigma}_t$, we have $N$ samples from the posterior $w^t_{1:N} \sim p(w \mid \calD, \bar{\Sigma}_t)$. In total, we have $N \times T$ samples.

For conditional Bayesian quadrature (CBQ), we need to specify two kernels. First, we choose the kernel on the space of parameter $w \in \R^d$ (corresponding to $k_\calX$ in \Cref{sec:cbq}) to be a Gaussian kernel with lengthscale $l$ and amplitude $A$,
%
\begin{align}
    k(w, w') = A \exp\left(-\frac{1}{2 l^2} (w - w')^\top(w - w')\right).
\end{align}
%
As a result, we have a closed-form kernel mean embedding under the Gaussian posterior distribution:
%
\begin{align}\label{appeq:E14}
    \mu_{\bar{\Sigma}}(w) = A \left| \mathrm{I} + l^{-2} \tilde{\Sigma} \right|^{-1/2} \exp \left(-\frac{1}{2} (w - \tilde{m})^\top (\tilde{\Sigma} + l^2 \mathrm{I})^{-1} (w - \tilde{m})\right)
\end{align}
%
Then we choose the kernel on the space of $\bar{\Sigma}$ (corresponding to $k_\Theta$ in \Cref{sec:cbq}). Since we assume $\bar{\Sigma}$ to be diagonal, we use a product Mat\'ern kernel, where the kernel on the space of each entry of $\bar{\Sigma}$ is chosen to be a Mat\'ern-3/2 kernel.
The hyperparameters for both kernels are selected according to \Cref{appsec:hyperparameter_selection}.

Some of the baseline methods have hyperparameters as well. The importance sampling (IS) estimator has no hyperparameters. For the kernel mean shrinkage (KMS) estimator, we also use a product Mat\'ern-3/2 kernel on the space of $\bar{\Sigma}$ and select hyperparameters according to \Cref{appsec:hyperparameter_selection}. 
For least squares Monte Carlo (LSMC), the hyperparameter is the order of the polynomials. We choose the order among the set $\{1,2,3,4\}$ that returns the best performance.

\subsubsection{More Experimental Results}
We provide more experimental results for Bayesian sensitivity analysis here. 
In \Cref{appfig:bayes_sensitivity_1}, the integrand is chosen to be $f(w)=w^\top w$ and the dimension $d$ is 2. In the first row of \Cref{appfig:bayes_sensitivity_1}, we fix $N=10, 50, 100$ showing the performance of RMSE with increasing $T$. 
In the second row of \Cref{appfig:bayes_sensitivity_1}, we fix $T=10, 50, 100$ showing the performance of RMSE with increasing $N$. 
In \Cref{appfig:bayes_sensitivity_2}, the integrand is chosen to be $f(w)=w^\top y^\ast$ and the dimension $d$ is 2. 
In the first row of \Cref{appfig:bayes_sensitivity_2}, we fix $N=10, 50, 100$ showing the performance of RMSE with increasing $T$. 
In the second row of \Cref{appfig:bayes_sensitivity_2}, we fix $T=10, 50, 100$ showing the performance of RMSE with increasing $N$. 
We can see that CBQ demonstrates a consistently lower RMSE for both integrands under the same number of samples, and a faster convergence rate, compared to all other baseline methods. 
Also, we can confirm the theory that CBQ has a faster convergence rate in $N$ than in $T$.

In \Cref{appfig:time_all}, we show the computational cost of different methods in Bayesian sensitivity analysis for fixed $T=50$.
It is clear from the figure that CBQ is more computationally expensive, so in this simple setting it is more efficient to spend more budget on obtaining more samples. 
Nonetheless, in scenarios where the expense of sample collection constitutes a significant fraction of the computational budget, or when the evaluation of the integrand proves to be highly costly, it becomes more cost-effective to spend a larger share of the budget towards the application of the CBQ method.


\begin{figure}[t]
    \centering
    \begin{minipage}{1.0\textwidth}
    \centering
    \includegraphics[width=250pt]{neurips_2023/figures/legend.pdf}
    \vspace{-5pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_N_10_g4.pdf}
        \caption{RMSE with fixed $N=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_N_50_g4.pdf}
        \caption{RMSE with fixed $N=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_N_100_g4.pdf}
        \caption{RMSE with fixed $N=100$.}
    \end{subfigure}
        \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_T_10_g4.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_T_50_g4.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/bayes_sensitivity_T_100_g4.pdf}
        \caption{RMSE with fixed $T=100$.}
    \end{subfigure}
    \caption{More experimental results for Bayesian sensitivity analysis. The integrand is $f(w)=w^\top y^\ast$ and the dimension $d=2$.}\label{appfig:bayes_sensitivity_2}
\end{figure}


\begin{figure}[t]
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/BQ_CBQ_1.pdf}
        \caption{BQ and CBQ RMSE}
        \label{appfig:bq_cbq_rmse}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/BQ_CBQ_2.pdf}
        \caption{BQ and CBQ time}
        \label{appfig:bq_cbq_time}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.32\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/bayes_sensitivity_time_T_50.pdf}
        \caption{Computational time}
        \label{appfig:time_all}
    \end{subfigure}
    \caption{\textbf{Left and Middle:} Comparison of BQ and CBQ in terms of time (wall clock time) and RMSE in Bayesian sensitivity analysis. \textbf{Right:} Computational time (wall clock time) for different methods in Bayesian sensitivity analysis with increasing $N$ under dimension $d=2$ and fixed $T=50$. \fxb{I don't know how easy it is to try this or not, so this might be an annoying idea (in which case we don't need to do it). But what about trying a curve with $N=2T$ or $N=5T$ or perhaps even $N=T^{3/2}$. From previous experiments, it seems like CBQ's performance improves faster when $N$ grows than when $T$ grows. For BQ, I think it wouldn't change too much, but for CBQ things might look much better. Just a thought}}
\end{figure}

\subsubsection{Comparison of BQ and CBQ}\label{appsec:BQ_CBQ}
In \Cref{sec:cbq}, we mentioned the comparison of BQ and CBQ in terms of their computational complexity and convergence rate. For $T$ parameter values $\theta_1, \cdots, \theta_T$ and $N$ samples from each probability distribution $\mathbb{P}_{\theta_1}, \ldots, \mathbb{P}_{\theta_T}$, the computational cost is $\calO(N^3T^3)$ for BQ and $\calO(TN^3)$ for CBQ and the convergence rate is \hudson{add convergence rate}. In \Cref{appfig:bq_cbq_rmse} and \Cref{appfig:bq_cbq_time}, we fix $N=T$ and demonstrate that BQ has a much faster convergence rate but the computational time gets unbearable quickly as the number of samples increases.


\begin{figure}
    \begin{subfigure}{0.33\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/bayes_sensitivity_qmc.pdf}
        \caption{Quasi Monte Carlo}
        \label{appfig:qmc}
    \end{subfigure}
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/ablation_kernel_x.pdf}
        \caption{Ablation on kernel $k_\Theta$}
        \label{appfig:ablation_theta}
    \end{subfigure}
    \begin{subfigure}{0.30\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/ablation_kernel_y.pdf}
        \caption{Ablation on kernel $k_\calX$}
        \label{appfig:ablation_x}
    \end{subfigure}
    \caption{\textbf{Left:} Comparison of all methods under standard random sampling and Quasi-Monte Carlo sampling. \textbf{Middle:} Ablation study for CBQ with different $k_\Theta$ kernels in Bayesian sensitivity analysis. \textbf{Right:} Ablation study for CBQ with different $k_\calX$ kernels in Bayesian sensitivity analysis.}
\end{figure}


\subsubsection{Quasi Monte Carlo}\label{appsec:QMC}
Quasi Monte Carlo (QMC) is another line of research on improving the precision of approximating intractable integrals. QMC aims to cover the integration domain more uniformly than random sampling used in standard Monte Carlo methods~\citep{niu2023discrepancy, hickernell1998generalized, gerber2015sequential}. 
Sobol sequences are a type of low-discrepancy sequence commonly used in Quasi-Monte Carlo (QMC) methods, and Sobol sequences are able to cover the multidimensional space more uniformly than random sequences, resulting in a faster convergence rate.
However, since Sobol sequences are deterministic, we follow the technique introduced in randomized QMC~\citep{lemieux2004randomized} to shift the Sobol sequence by a random amount so that we can combine the advantages of deterministic sampling from QMC and the robustness of randomness from standard Monte Carlo methods.

For our method CBQ, we put no restrictions on how the data observations are generated---we do not require i.i.d.\ sampling. 
Therefore, we implement QMC sampling and compare the performances of all methods with random sampling in \Cref{appfig:qmc}. 
It can be observed that Quasi-Monte Carlo (QMC) significantly enhances the performance of baseline methods, such as Kernel Mean Shrinkage (KMS) and Least Squares Monte Carlo (LSMC), while only slightly improving the performance of Conditional Bayesian Quadrature (CBQ). The limited degree of improvement seen in CBQ with QMC sampling can be attributed to the fact that CBQ already yields a remarkably low RMSE. Consequently, the margin of improvement offered by QMC sampling is not as evident in CBQ as in the baseline methods.

\subsubsection{Ablations}\label{appsec:ablation}
We present an ablation study evaluating the impact of distinct kernel choices $k_\calX$ and $k_\Theta$ within the framework of Bayesian sensitivity analysis. The Mat\'ern-3/2 kernel and Gaussian Radial Basis Function (RBF) kernel are selected for $k_\Theta$. As illustrated in \Cref{appfig:ablation_theta}, the performance of CBQ remains consistent across these different $k_\Theta$ kernels.

Subsequently, we opt for Matern-3/2, Gaussian RBF, and Stein kernel (with Matern-3/2 as the base kernel) as choices for $k_\calX$. When $k_\calX$ is the RBF kernel, the formula for kernel mean embedding $\mu_{\bar{\Sigma}}(w)$ is presented in \eqref{appeq:E14}. In the scenario where $k_\calX$ is the Matern-3/2 kernel, a closed form expression for the kernel mean embedding does not exist for the non-isotropic Gaussian distribution $\calN(\tilde{m}, \tilde{\Sigma})$. Consequently, we employ the reparameterization trick, initially sampling $u$ from $\calN(0, \mathrm{I})$, then calculating $w = \tilde{m} + L^\top u$ where $L$ is the lower triangular matrix derived from the Cholesky decomposition of the covariance matrix $\tilde{\Sigma}$. 
In essence, $I(\bar{\Sigma}) = \int f(w)\calN(w; \tilde{m},\tilde{\Sigma}) dw = \int f(\tilde{m} + L^\top u) \calN(u; 0, I) du$.
When $k_\calX$ is Stein kernel, we choose Matern-3/2 kernel $k_0$ as the base kernel and then apply Stein operator on both arguments of kernel $k_0$.
In \Cref{appfig:ablation_x}, we can see that CBQ performance is consistent under different types of kernels $k_\calX$. All kernel hyperparameters are chosen according to \Cref{appsec:hyperparameter_selection}.


\begin{figure}
    \begin{subfigure}{0.24\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/calibration_bayes.pdf}
        \caption{Calibration}
    \label{appfig:calibration_bayes_sensitivity}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.24\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/calibration_finance.pdf}
        \caption{Calibration}
        \label{appfig:calibration_finance}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.24\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/calibration_sir.pdf}
        \caption{Calibration}
        \label{appfig:calibration_sir}
    \end{subfigure}
    \hfill
    \begin{subfigure}{0.24\textwidth}
        \centering
        \includegraphics[width=\linewidth]{neurips_2023/figures/calibration_decision.pdf}
        \caption{Calibration}
        \label{appfig:calibration_decision}
    \end{subfigure}
    \caption{\textbf{Left:} Calibration plot for CBQ in Bayesian sensitivity analysis. \textbf{Middle Left:} Calibration plot for CBQ in Black-Scholes model. \textbf{Middle Right:} Calibration plot for CBQ in SIR sensitivity analysis. \textbf{Right:} Calibration plot for CBQ in uncertainty decision making.}
\end{figure}


\subsubsection{Calibration}\label{appsec:calibration}
In \Cref{sec:cbq}, we mentioned that CBQ provides uncertainty quantification for the integral of interest, but in the main text we only use root mean squared error (RMSE) as the performance metric where the uncertainty information is not fully utilized. In \Cref{appfig:calibration_bayes_sensitivity}, we show the calibration of CBQ in Bayesian sensitivity analysis with integrand $f(w)=w^\top w$ and dimension $d=2$.
The black diagonal line represents the ideal case; a curve lying above the black line indicates underconfidence and a curve lying below the black line indicates overconfidence.
It is generally preferable to be underconfident rather than overconfident.
In \Cref{appfig:calibration_bayes_sensitivity}, we can see that when the number of samples is as small as $10$, CBQ is overconfident, probably because we use empirical Bayes to select the hyperparameters, which tends to give overly confident estimates. When the number of samples increases to $50$, CBQ tends to become well calibrated, and when the number increases to $100$, CBQ tends to become underconfident.

\subsection{Butterfly Call Option with the Black-Scholes Model}\label{appsec:black_scholes}

\subsubsection{Experimental Setting}
In this experiment, we consider specifically an asset whose price $S_{\eta}$ at time $\eta$ follows the Black-Scholes formula 
%
\begin{align*}
    S_{\eta} = S_0 \exp(\sigma W_{\eta} - \sigma^2 \eta/2), \quad \text{for} \quad \eta \geq 0
\end{align*}
%
with $\sigma$ being the underlying volatility and $W$ being the standard Brownian motion.
The financial derivative we are interested in is a butterfly call option whose payoff at time $\zeta$ can be expressed as
%
\begin{align*}
    \psi\left(S_{\zeta}\right)=\max \left(S_{\zeta}-K_1, 0\right) + \max \left(S_{\zeta}-K_2, 0\right) - 2\max \left(S_{\zeta} - \frac{K_1+K_2}{2}, 0 \right)
\end{align*}
%
In addition to the expected payoff, insurance companies are interested in computing the expected loss of their portfolios if a shock would occur in the economy.
We follow the setting in \cite{alfonsi2021multilevel, alfonsi2022many}, assuming that a shock occurs at time $\eta$ that multiplies the price of the butterfly option by $1 + s$, so the expected loss caused by the shock can be expressed as
%
\begin{align*}
    \calL = \E\left[ \max\left( \E\left[ \psi(S_{\zeta}) - \psi((1+s)S_{\zeta}) \mid S_{\eta} \right], 0 \right) \right].
\end{align*}
%

We consider the initial price $S_0 = 100$, the volatility $\sigma = 0.3$, the strikes $K_1 = 50, K_2 = 150$, the option maturity $\zeta=2$ and the shock happens at $\eta=1$ with strength $s = 0.2$. 
The observations $\{S_{\eta}\}_{1:T}$ are sampled from the log normal distribution deduced from the Black-Scholes formula $\{S_{\eta}\}_{1:T} \sim \operatorname{Lognormal}\left( \log S_0 - \frac{\sigma^2}{2} \eta, \sigma^2 \eta \right)$. 
Then $N$ observations of $\{S_{\zeta}\}^t_{1:N}$ are sampled from the log normal distribution deduced from the Black-Scholes formula $\{S_{\zeta}\}^t_{1:N} \sim \operatorname{Lognormal}\left( \log S_\eta - \frac{\sigma^2}{2} (\zeta - \eta), \sigma^2 (\zeta - \eta) \right)$. 

For conditional Bayesian quadrature (CBQ), we need to specify two kernels. First we choose the kernel on the space of $S_\zeta$ (corresponding to $k_\calX$ in \Cref{sec:cbq}). Since in the Black-Scholes model, $p(S_\zeta \mid S_\eta)$ is a log normal distribution, we use a log RBF kernel so that we can have a closed form mean embedding $\mu$. Specifically, $p(S_\zeta \mid S_\eta)$ is the log normal distribution derived from the Black-Scholes model, $S_\zeta \sim \operatorname{Lognormal}(\bar{m}, \bar{\sigma}^2)$ with $\bar{m} = \log S_\eta - \frac{\sigma^2}{2}(\zeta - \eta)$ and $\bar{\sigma}^2 = \sigma^2 (\zeta - \eta)$. The log RBF kernel is defined as
%
\begin{align*}
    k(S_\zeta, S_\zeta') = \lambda \exp(-\frac{1}{2l^2} (\log S_\zeta - \log S_\zeta')^2)
\end{align*}
%
and the kernel mean embedding has a closed form
%
\begin{align*}
    \mu_{S_\eta}(S_\zeta) = \left. \exp \left(-\frac{\bar{m}^2 + (\log S_\zeta)^2 }{2(\bar{\sigma}^2 + l^2)}\right) S_\zeta^{\frac{\bar{m}}{\bar{\sigma}^2 + l^2}} \middle/ 
    \bar{\sigma} \sqrt{\frac{1}{\bar{\sigma}^2} + \frac{1}{l^2}} \right.
\end{align*}
%
For CBQ with Stein kernel, we use Matern-3/2 as the base kernel and then apply Stein operator to both arguments of the base kernel to obtain $k_\calX$. 
Then we choose the kernel on the space of $S_\eta$ (corresponding to $k_\Theta$ in \Cref{sec:cbq}) as Matern-3/2 kernel.
All hyperparameters in $k_\calX$ and $k_\Theta$ are selected according to  \Cref{appsec:hyperparameter_selection}.

Some of the baseline methods have hyperparameters as well. The importance sampling (IS) estimator has no hyperparameters. For the kernel mean shrinkage (KMS) estimator, we also use a Matern-3/2 kernel on the space of $S_\eta$ and select hyperparameters according to \Cref{appsec:hyperparameter_selection}.
For least squares Monte Carlo (LSMC), the hyperparameter is the order of the polynomial. We choose the order among the set $\{1,2,3,4\}$ that returns the best performance.

\subsubsection{More Experimental Results}
We report more experimental results for computing the expected loss in butterfly call option with the Black-Scholes model in \Cref{appfig:finance} with fixed $T=10, 20, 30$. We can see that the outstanding performance of CBQ is consistent. 

In \Cref{appfig:calibration_finance}, we also show the calibration of CBQ uncertainty using the log RBF kernel. The conclusion is largely consistent with the conclusion from Bayesian sensitivity analysis in that CBQ is overconfident when the number of observations is small and tends to become underconfident as the number of observations increases.

\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=250pt]{neurips_2023/figures/legend_finance.pdf}
    \vspace{-10pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/finance_T_10.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/finance_T_20.pdf}
        \caption{RMSE with fixed $T=20$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/finance_T_30.pdf}
        \caption{RMSE with fixed $T=30$.}
    \end{subfigure}
    \caption{Butterfly call option with the Black-Scholes model}\label{appfig:finance}
\end{figure}

\subsection{Bayesian Sensitivity for a Susceptible-Infectious-Recovered (SIR) Model }\label{appsec:sir}
\subsubsection{Experimental Settings}
The SIR model is commonly used to simulate the dynamics of infectious diseases through a population. It divides the population into three sections.
Susceptibles (S) represent people who are not infected but can be infected after getting contact with an infectious individual.
Infectious (I) represent people who are currently infected and can infect susceptible individuals.
Recovered (R) represent individuals who have been infected and then removed from the disease, either by recovering or dying. The dynamics are governed by a system of ordinary differential equations (ODE) as below.
%
\begin{align*}
    \frac{\mathrm{d} S}{\mathrm{d} t} = -\beta S I, \quad
    \frac{\mathrm{d} I}{\mathrm{d} t} = \beta S I-\gamma I, \quad
    \frac{\mathrm{d} R}{\mathrm{d} t} = \gamma I.
\end{align*}
%
$\beta$ is the infection rate and $\gamma$ is the recovery rate. The solution to the SIR model is a vector $\left(S_d, I_d, R_d\right)$ indexed by time step $d$. In this experiment, we use the SciPy \texttt{odeint} function~\citep{2020SciPy-NMeth} to solve the ODEs.


In this experiment, we assume that the recovery rate $\gamma$ is fixed and $\beta$ follows a gamma prior distribution $\beta \sim \operatorname{Gamma}(\beta ; \bar{\beta}, \xi)$ where $\bar{\beta}$ represents the initial belief of the infection rate deduced from the study of the virus in the laboratory at the beginning of the outbreak, and $\xi$ represents the amount of uncertainty. The target of interest is the expected peak number of infected individuals under the prior distribution on $\beta$: $I_{\text{max}}(\bar{\beta}) = \E_{\beta}\left[\max_d I_d(\beta) \mid \bar{\beta} \right]$. It is important to know how different initial estimates of the infection rate (different $\bar{\beta}$) lead to different final estimates of $I_{\text{max}}$.

For conditional Bayesian quadrature (CBQ), we need to specify two kernels. First we choose the kernel on the space of $\beta$ (corresponding to $k_\calX$ in \Cref{sec:cbq}). We use Matern-3/2 as the base kernel and then apply Stein operator to both arguments of the base kernel to obtain $k_\calX$. Then we choose the kernel on the space of $\bar{\beta}$ (corresponding to $k_\Theta$ in \Cref{sec:cbq}) as Matern-3/2 kernel. All hyperparameters in $k_\calX$ and $k_\Theta$ are selected according to \Cref{appsec:hyperparameter_selection}.

Some of the baseline methods have hyperparameters as well. The importance sampling (IS) estimator has no hyperparameters. For the kernel mean shrinkage (KMS) estimator, we also use a Matern-3/2 kernel on the space of $\bar{\beta}$ and select hyperparameters according to \Cref{appsec:hyperparameter_selection}.
For least squares Monte Carlo (LSMC), the hyperparameter is the order of the polynomial. We choose the order among the set $\{1,2,3,4\}$ that returns the best performance.

\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=250pt]{neurips_2023/figures/legend_finance.pdf}
    \vspace{-10pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/SIR_15.pdf}
        \caption{RMSE with fixed $T=15$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/SIR_25.pdf}
        \caption{RMSE with fixed $T=25$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/SIR_35.pdf}
        \caption{RMSE with fixed $T=35$.}
    \end{subfigure}
    \caption{Bayesian Sensitivity analysis for Susceptible-Infectious-Recovered (SIR) Model}\label{appfig:sir}
\end{figure}

\subsubsection{More Experimental Results}
We report more experimental results for computing the expected peak number of infections from SIR model in \Cref{appfig:sir} with fixed $T=15, 25, 35$. We can see that the outstanding performance of CBQ is consistent.

In \Cref{appfig:calibration_sir}, we also show the calibration of CBQ uncertainty. The conclusion is consistent with the conclusion from Bayesian sensitivity analysis in that CBQ is overconfident when the number of observations is small and tends to become underconfident as the number of observations increases.

\subsection{Uncertainty Decision Making}\label{appsec:decision}
\subsubsection{Experimental Settings}
In the medical world, it is important to compare the cost and the relative advantage of conducting an extra medical experiment~\citep{brennan2007calculating}.
In the area of oil and gas reservoirs, a cost analysis is necessary before deciding whether to drill additional wells.
The expected value of partial perfect information (EVPPI) quantifies the average advantage gained from conducting an extra experiment to obtain precise knowledge of the uncertain variables $W$~\citep{brennan2007calculating}.
EVPPI can be expressed as $\E_W[\max_c \E_X [f_c(X, W) \mid W ] ] - \max_c \E_{X, W}[f_c(X, W)]$, where $\mathcal{C}$ is a set of potential treatments and $c \in \mathcal{C}$. In the first term, the outer expectation is taken over $W$ and the inner expectation is taken over $X$ conditional on $W$; in the second term, the expectation is taken over the joint distribution of $(X, W)$.

\subsubsection{More Experimental Results}
\begin{figure}[t]
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=250pt]{neurips_2023/figures/legend.pdf}
    \vspace{-10pt}
    \end{minipage}
    
    \centering
    % First plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/decision_T_10_no_legend.pdf}
        \caption{RMSE with fixed $T=10$}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/decision_T_30_no_legend.pdf}
        \caption{RMSE with fixed $T=30$.}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=\textwidth]{neurips_2023/figures/decision_T_50_no_legend.pdf}
        \caption{RMSE with fixed $T=50$.}
    \end{subfigure}
    \caption{Uncertainty decision making in health economics}\label{appfig:decision}
\end{figure}


\section{CBQ from the Perspective of Conditional Kernel Mean Embedding}\label{appsec:ckme}
In the main text, CBQ is mainly derived from the perspective of probabilistic numerics and Gaussian process regression is the main probabilistic tool used in the derivation. 
In this section, we offer a distinct perspective, viewing CBQ through a frequentist approach. 
This alternative viewpoint reveals an intriguing connection to the concept of conditional kernel mean embedding, thus expanding our understanding of the CBQ method.

Recall that we have assumed two positive definite kernels $k_\Theta: \Theta \times \Theta \to \R$ and $k_\calX: \calX \times \calX \to \R$ with corresponding reproducing kernel Hilbert spaces $\calH_\Theta$ and $\calH_\calX$. 
The notation $\Theta$ and $\calX$ is quite rare in the literature of kernel mean embedding, but we stick to this notation to keep consistency with the main text.
$\calU_{X \mid \theta} \in \calH_\calX$ is the conditional mean embedding of conditional distribution $P(X \mid \Theta=\theta)$ such that the conditional expectation can be written in the form of inner product:
%
\begin{align}\label{appeq:A1}
    \E[f(X) \mid \Theta=\theta] = \int_\calX f(x) p(x \mid \theta) \mathrm{d}x = \PSi{f, \calU_{X \mid \theta}}{\calH_\calX}
\end{align}
%
Note that in the main text, a more general form of function $f$ is allowed that depends on both $\theta$ and $x$ and the distribution $\mathbb{P}_\theta$ has a more flexible parametric form. In this section, we only focus on the setting that the function $f$ depends solely on $x$ and $\mathbb{P}_\theta$ is a conditional distribution with density $p(x \mid \theta)$.

Since there exists an embedding $\calU_{X \mid \theta}$ for every $\theta \in \Theta$, we are interested in finding an operator $\calU_{X \mid \Theta}: \calH_\Theta \to \calH_\calX$ such that
%
\begin{align*}
    \calU_{X \mid \theta} = \calU_{X \mid \Theta} k_\Theta(\theta, \cdot)
\end{align*}
%
$\calU_{X \mid \Theta}$ is the conditional mean embedding of conditional distribution $P(X \mid \Theta)$ and also belongs to the product reproducing kernel Hilbert space: $\calU_{X \mid \Theta} \in \calH_\calX \otimes \calH_\Theta$. If we find a good approximation of $\calU_{X \mid \Theta}$, then the conditional expectation can be written as
%
\begin{align*}
    \E[f(X) \mid \Theta=\theta] = \PSi{f, \calU_{X \mid \theta}}{\calH_\calX} = \PSi{f, \calU_{X \mid \Theta }k_\Theta(\theta, \cdot)}{\calH_\calX}
\end{align*}
%
A more detailed discussion can be found in \cite{muandet2016kernelmeanshrinkage}.

The optimal approximating operator $F\in \calH_\calX \otimes \calH_\Theta$ can be found as the minimizer of the following objective~\citep{grunewalder2012conditional}.
%
\begin{align}
    \calE[F] = \sup_{\norm{f}{\calH_\calX} \leq 1} \E_{\Theta} \left[\left(\E_X[f(X) \mid \Theta] - \PSi{f, F(\Theta)}{\calH_\calX}\right)^2 \right]
\end{align}
%
which can be further upper bounded by 
%
\begin{align}\label{appeq:A4}
\begin{aligned}
    \calE[F] &= \sup_{\norm{f}{\calH_\calX} \leq 1}  \E_{\Theta} \left[\PSi{f, \E_X\left[k_\calX(X, \cdot) \mid \Theta\right] - F(\Theta)}{\calH_\calX}^2 \right] \\
    &\leq \sup_{\norm{f}{\calH_\calX} \leq 1} \norm{f}{\calH_\calX}^2 \E_{\Theta} \norm{\E_X\left[k_\calX(X, \cdot) \mid \Theta\right] - F(\Theta)}{\calH_\calX}^2 \\
    &= \E_{\Theta} \norm{\E_X\left[k_\calX(X, \cdot) \mid \Theta \right] - F(\Theta)}{\calH_\calX}^2 \\
    & \leq \E_{\Theta, X}\left[\norm{k_\calX(X, \cdot)-F(\Theta)}{\calH_\calX}^2 \right]
\end{aligned}
\end{align}
%
Normally, in the literature of conditional kernel mean embedding, only sample pairs $\{x_{1:T}, \theta_{1:T}\}$ are observed and the objective above is replaced by the empirical estimate with an extra regularization~\citep{grunewalder2012conditional, li2022optimal}.
%
\begin{align}\label{appeq:A5}
    \calE_{\text{original}}[F] = \sum_{t=1}^T \norm{k_\calX(x_t, \cdot) - F(\theta_t)}{\calH_\calX}^2 + \lambda \norm{F}{\calH_\calX \otimes \calH_\Theta}^2
\end{align}
% It is necessary to point out that the first term in Equation \eqref{appeq:A5} is essentially a nested Monte Carlo estimate for the third line in Equation \eqref{appeq:A4}. 
% This insight will be useful for later discussions.
We use the term `original' to indicate that this is the objective that is commonly used in the literature. Later in this section, we introduce other objectives $\calE$, distinguished by different subscripts.

With the aid of the Riesz representer theorem, the minimizer to \eqref{appeq:A5} has a closed form expression. The derivations can be found in~\cite{grunewalder2012conditional, li2022optimal}.
%
\begin{align}\label{eq:app_first_cond_exp}
    F_{\text{original}}(\theta)(\cdot) = k_\Theta(\theta, \theta_{1:T})(k_\Theta(\theta_{1:T}, \theta_{1:T}) + T \lambda I)^{-1} k_\calX(x_{1:T}, \cdot)
\end{align}
%
And therefore, the conditional expectation in Equation \eqref{appeq:A1} can be expressed as
%
\begin{align}
\begin{aligned}
\widehat{\E\left[f(X)\mid \Theta=\theta \right]}_{\text{original}} &= \PSi{f, F_{\text{original}}(\theta)}{\calH_\calX} \\
&= k_\Theta(\theta, \theta_{1:T})(k_\Theta(\theta_{1:T}, \theta_{1:T}) + T\lambda I)^{-1} f(x_{1:T})
\end{aligned}
\end{align}
%

However, we have mentioned in the main text that in our setting we observe $N$ samples of $x$ from the conditional distribution $P(X \mid \Theta=\theta_t)$ for any given $t$. 
Recall from \Cref{sec:background} that our observations are:
%
\begin{align*}
\theta_{1:T} := [\theta_1 \cdots \theta_T]^\top \in \Theta^T, \quad  x^t_{1:N} := [x^t_1 \cdots x^t_N]^\top \in \calX^N \quad \text{ and } \quad\\
 f(x^t_{1:N}) = [f(x^t_1) \cdots f(x^t_N)]^\top \in \R^N \quad \text{ for all } t \in \{1,\ldots,T\},
\end{align*}
%
As a result, the original objective $\calE_{\text{original}}$ requires some minor modifications to adapt to our setting, so we come up with a new objective
%
\begin{align}\label{appeq:A8}
    \calE_{\text{multiple}}[F] = \sum_{t=1}^T \norm{\E_X\left[k_\calX(X, \cdot) \mid \Theta=\theta_t\right] - F(\theta_t)}{\calH_\calX}^2 + \lambda \norm{F}{\calH_\calX \otimes \calH_\Theta}^2
\end{align}
%
The new objective $\calE_{\text{multiple}}[F]$ differs from $\calE_{\text{original}}[F]$ only by the inner expectation $\E_X\left[k_\calX(X, \cdot) \mid \Theta=\theta_t\right]$. In the new objective $\calE_{\text{multiple}}[F]$, the inner expectation is kept till later.
With the same derivation as above, the minimizer to \eqref{appeq:A8} has a closed form expression
%
\begin{align}\label{appeq:A9}
    F_{\text{multiple}}(\theta)(\cdot) = k_\Theta(\theta, \theta_{1:T})(k_\Theta(\theta_{1:T}, \theta_{1:T}) + T\lambda I)^{-1} \left[\begin{array}{c}
    \E_X\left[k_\calX(X, \cdot) \mid \Theta=\theta_1\right] \\
    \vdots \\
    \E_X\left[k_\calX(X, \cdot) \mid \Theta=\theta_T \right]
    \end{array}\right]
\end{align}
%
And then we take the inner product to have
%
\begin{align}\label{eq:app_second_cond_exp}
\begin{aligned}
    \widehat{\E\left[f(X)\mid \Theta=\theta \right]}_{\text{multiple}} &= \PSi{f, F_{\text{multiple}}(\theta)}{\calH_\calX} \\ &= k_\Theta(\theta, \theta_{1:T})(k_\Theta(\theta_{1:T}, \theta_{1:T}) + T\lambda I)^{-1} \left[\begin{array}{c}
    \E_X\left[f(X) \mid \Theta=\theta_1\right] \\
    \vdots \\
    \E_X\left[f(X) \mid \Theta=\theta_T \right]
    \end{array}\right]
\end{aligned}
\end{align}
%

If we compare Equation \eqref{eq:app_second_cond_exp} with Equation \eqref{eq:app_first_cond_exp}, we can see that in Equation \eqref{eq:app_first_cond_exp} a one-sample Monte Carlo estimate is applied to approximate the conditional expectations $\E_X\left[f(X) \mid \Theta=\theta_t \right]$ due to the limitation that only one sample is observed from the conditional distribution $P(X \mid \Theta=\theta_t)$. Since now we have multiple samples $x_{1:N}^t$ available, the first thing that comes to mind is to again use Monte Carlo and take an empirical average over function evaluations, which gives us
%
\begin{align}
    \widehat{\E\left[f(X)\mid \Theta=\theta \right]}_{\text{shrinkage}} = k_\Theta(\theta, \theta_{1:T})(k_\Theta(\theta_{1:T}, \theta_{1:T}) + T\lambda I)^{-1}
    \left[\begin{array}{c}
    \frac{1}{N}\bone^\top f(x_{1:N}^{1}) \\
    \vdots \\
    \frac{1}{N}\bone^\top f(x_{1:N}^{T})
    \end{array}\right]
\end{align}
%
where $\bone$ is a vector of ones.

This approach is known as the kernel mean shrinkage estimator~\citep{muandet2016kernelmeanshrinkage} and its application has been considered in~\cite{chau2021deconditional}. It also has a Bayesian interpretation in~\cite{flaxman2016bayesian}.

Our approach CBQ proposed in the main text uses Bayesian quadrature to estimate $\E_X\left[f(X) \mid \Theta=\theta_t\right]$ instead of simply averaging the function evaluations. In \Cref{sec:bayesian_quadrature}, if the GP prior on $f$ is chosen to have zero mean, then the BQ estimate is $\widehat{\E_X\left[f(X) \mid \Theta=\theta_t\right]}_{\text{BQ}}=\mu_{\theta_t}(x^t_{1:N})^\top k_\calX(x_{1:N}^t, x_{1:N}^t)^{-1}f(x^t_{1:N})$.

After plugging the BQ estimate of $\E_X\left[f(X) \mid \Theta=\theta_t\right]$ into Equation \eqref{eq:app_second_cond_exp}, we obtain
%
\begin{align}\label{eq:app_kernel_perspective}
    \widehat{\E\left[f(X)\mid \Theta=\theta \right]}_{\text{CBQ}} = k_\Theta(\theta, \theta_{1:T})(k_\Theta(\theta_{1:T}, \theta_{1:T}) + T\lambda I)^{-1} \left[\begin{array}{c}
    \mu_{\theta_1}(x^1_{1:N})^\top k_\calX(x_{1:N}^1, x_{1:N}^1)^{-1}f(x^1_{1:N}) \\
    \vdots \\
    \mu_{\theta_T}(x^T_{1:N})^\top k_\calX(x_{1:N}^T, x_{1:N}^T)^{-1}f(x^T_{1:N})
    \end{array}\right]
\end{align}
%

Now Equation \eqref{eq:app_kernel_perspective} has the same form as the mean estimate in Equation \eqref{eq:CBQ_estimator} in the main text; the only difference is that the regularization constant $\lambda$ here is replaced by the diagonal heteroskedastic noise $\sigma^2_\mathrm{BQ}(\theta_{1:T}) \Id_T$ in the main text. We have thus derived the conditional Bayesian quadrature estimator from a second perspective. However, since this is a frequentist perspective, it does not provide uncertainty quantification.


\end{appendices}