\begin{figure*}[t]
\vspace{-30pt}
    \centering
    \begin{minipage}{\textwidth}
    \centering
    \includegraphics[width=240pt]{figures/legend.pdf}
    \vspace{-7pt}
    \end{minipage}
    
    \centering
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=1.0\linewidth]{figures/bayes_sensitivity_N_50.pdf}
        % \caption{RMSE with fixed $N$}
        \label{fig:bayes_sensitivity_1}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Second plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=1.0\linewidth]{figures/bayes_sensitivity_T_50.pdf}
        % \caption{RMSE with fixed $T$}
        \label{fig:bayes_sensitivity_2}
    \end{subfigure}%
    \hfill % Add horizontal space between the subfigures
    % Third plot
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=1.0\linewidth]{figures/bayes_sensitivity_dimensions.pdf}
        % \caption{RMSE with increasing $D$}
        \label{fig:bayes_sensitivity_3}
    \end{subfigure}
    \vspace{-3pt}
    \caption{\emph{Bayesian sensitivity analysis for linear models.} \textbf{Left:} RMSE of all methods when $d=2$ and $N=50$. \textbf{Middle:} RMSE of all methods when $d=2$ and $T=50$. \textbf{Right:} RMSE of all methods when $N=T=100$.}
    \label{fig:bayes_sensitivity}
    \vspace{-20pt}
\end{figure*}

\section{Experiments}\label{sec:experiments}
We will now evaluate the empirical performance of CBQ against baselines including IS, LSMC and KLSMC. 
% For the first three experiments, we focus on the case where $f$ does not depend on $\theta$ (i.e. $f(x, \theta) = f(x)$), and for the fourth experiment we focus on the case where $f$ depends on both $x$ and $\theta$. 
All methods use $\theta_{1:T} \sim \mathbb{Q}$ ($\mathbb{Q}$ is specified individually for each experiment) and $x_{1:N}^t \sim \mathbb{P}_{\theta_t}$ to ensure a fair comparison, and we therefore use $\mathbb{P}_{\theta_1}, \ldots, \mathbb{P}_{\theta_T}$ as our importance distributions in IS. For experiments on nested expectations, we use MC for the outer expectation and compare the different conditional expectation methods on the inner expectation. Detailed descriptions of hyperparameter selection can be found in \Cref{appendix:practical_considerations}. 
Detailed experimental settings can be found in \Cref{appendix:bayes_sensitivity} to \Cref{appendix:black_scholes}.
% We also provide additional experiments in \Cref{appendix:experiments}. 
% This includes an ablation study on different kernels $k_{\calX}$ and $k_{\Theta}$ in \Cref{appendix:ablation} which demonstrates that fast rates are available beyond the setting of our theorem. 
% \Cref{appendix:cbq_mobq} includes experiments which show MOBQ obtains similar performance to CBQ, but with a computational cost which is between $10$ and $100$ times larger.
% \Cref{appendix:QMC} includes experiments with quasi-Monte Carlo points~\citep{ hickernell1998generalized}, which demonstrates that CBQ is not limited to independent samples.  
% \Cref{appendix:ablation} includes ablation studies on various kernels $k_\calX$ and $k_\Theta$.
\Cref{appendix:calibration} demonstrates the calibration of CBQ uncertainty.
The code to reproduce the results is available at 
\href{https://github.com/Anonymous65536/cbq}{\texttt{https://github.com/Anonymous65536/cbq}}.







\paragraph{Synthetic Experiment: Bayesian Sensitivity Analysis for Linear Models.}
The prior and likelihood in a Bayesian analysis often depend on hyperparameters, and determining the sensitivity of the posterior to these is critical for assessing robustness~\citep{oakley2004probabilistic,Kallioinen2021}. One way to do this is to study how posterior expectations of interest depend on these hyperparameters, a task usually requiring the computation of conditional expectations. We consider this problem in the context of Bayesian linear regression with a zero-mean Gaussian prior with covariance $\theta \Id_d$ where $\Id_d$ is the $d$-dimensional identity matrix and $\theta \in (1, 3)^d$. Using a Gaussian likelihood, we can obtain a conjugate Gaussian posterior $\mathbb{P}_{\theta}$ on the regression weights. We can then analyse sensitivity by computing the conditional expectation $I(\theta)$ of some quantity of interest $f$. For example, if $f(x)=x^\top x$, then $I(\theta)$ is the second moment of the posterior, whereas if $f(x) = x^\top y^\ast$ for some new observation $y^\ast$, then $I(\theta)$ is the predictive mean. In these simple settings, $I(\theta)$ can be computed analytically, making this a good synthetic example for benchmarking.



Our results in \Cref{fig:bayes_sensitivity} are for the second moment, whilst the results for the predictive mean are in \Cref{appendix:bayes_sensitivity}.  
We measure performance in terms of root mean squared error (RMSE) and use $ \mathbb{Q} = \operatorname{Unif}(1, 3)^d$.  For CBQ, $k_\calX$ is chosen to be a Gaussian kernel so that the kernel mean embedding $\mu$ has a closed form, and $k_\Theta$ is a Mat\'ern-3/2 kernel.
\Cref{fig:bayes_sensitivity} shows the performance of CBQ against baselines with varying $N$, $T$ and $d$. LSMC performs well for this problem, and this can be explained by the fact that $I(\theta)$ is a polynomial in $\theta$.
Despite this, the left and middle plots show that CBQ consistently outperforms all competitors. Specifically, its rate of convergence is initially much faster in $N$ than in $T$, which confirms the intuition from \Cref{thm:convergence}. The dotted lines also give the performance of baselines under a very large number of samples $N=T=1000$, and we see that CBQ is either comparable to or better than these even when it only has access to much smaller $N$ and $T$. In the right-most panel, we see that the baselines gradually catch up with CBQ as $d$ grows, which is again expected since the rate in \Cref{thm:convergence} is $O(N^{-2 s_{\calX}/d+\varepsilon})$ in $N$. 
% Additional experimental results demonstrating these are consistent conclusions for different values of $N, T$ can be found in \Cref{appendix:bayes_sensitivity}.







\begin{wrapfigure}{r}{0.40\textwidth} % 'r' for right side, and width of the figure
    \vspace{-10pt}
    \centering
    \includegraphics[width=0.33\textwidth]{figures/calibration_bayes.pdf}
    \caption{Calibration: \emph{Bayesian sensitivity analysis for linear models $(d=2)$}.}
    \label{fig:calib_bayes}
    \vspace{-10pt}
\end{wrapfigure}

Our last plot is in \Cref{fig:calib_bayes} and studies the calibration of the CBQ posterior. The coverage is the percentage of times a credible interval contains $I(\theta)$ under repetitions of the experiment. The black diagonal line represents perfect calibration, whilst any curve lying above or below the black line indicates underconfidence or overconfidence, respectively. 
We observe that when $N$ and $T$ are as small as $10$, CBQ is overconfident. When $N$ and $T$ increase, CBQ becomes underconfident, meaning that our posterior variance is more inflated than needed from a frequentist viewpoint.
Calibration plots for the rest of the experiments can be found in \Cref{appendix:experiments} and demonstrate similar results. We note that it is generally preferable to be underconfident rather than overconfident, and CBQ does a good job at this most of the time. 
We expect that overconfidence in small $N$ and $T$ can be explained by a poor performance of empirical Bayes, and therefore caution users to not overly rely on the reported uncertainty in this regime. 




%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


\begin{figure*}[t]
\vspace{-30pt}
    \begin{minipage}{0.99\textwidth}
    \centering
    \includegraphics[width=380pt]{figures/legend_finance.pdf}
    \vspace{-10pt}
    \end{minipage}
    
    \centering
    
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=1.0\linewidth]{figures/SIR_15.pdf}
        \label{fig:sir}
    \end{subfigure}%
    \hfill 
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=1.0\linewidth]{figures/SIR_time.pdf}
        \label{fig:sir_time}
    \end{subfigure}%
    \hfill
    \begin{subfigure}{0.33\textwidth}
        \centering
        \hspace{-10pt}
        \includegraphics[width=1.0\linewidth]{figures/finance_T_20.pdf}
        \label{fig:finance}
    \end{subfigure}%
    \vspace{-3pt}
    \caption{\emph{Bayesian sensitivity analysis for the SIR model \& option pricing in mathematical finance.} \textbf{Left:} RMSE of all methods for the SIR example with $T=15$. \textbf{Middle:} The computational cost (in wall-clock time) for CBQ ($T=15, N=40$) and for obtaining a single numerical solution of the SIR equations under different discretization step sizes. In practice, the process of obtaining samples from the SIR equations is repeated $NT$ times.
    \textbf{Right:} RMSE of all methods for the finance example with $T=20$.
    }
    \label{fig:finance_sir}
    \vspace{-10pt}
\end{figure*}



\paragraph{Bayesian Sensitivity Analysis for the Susceptible-Infectious-Recovered (SIR) Model.}
The SIR model is commonly used to simulate the dynamics of infectious diseases through a population~\citep{kermack1927sir}. In this model, the dynamics are governed by a system of differential equations parametrised by a positive infection rate and a recovery rate (see \Cref{appendix:sir}). 
The accuracy of the numerical solution to this system typically hinges on a step size parameter.
While smaller step sizes yield more accurate solutions, they are also associated with a much higher computational cost. 
For example, using a step size of $0.1$ days for simulating a $150$-day period would require a computation time of $3$ seconds for generating a single sample. 
The cost would become even larger as the step size gets smaller, as depicted in the middle panel of \Cref{fig:finance_sir}. Consequently, when performing Bayesian sensitivity analysis for this model, there is a clear need for more data-efficient algorithms such as CBQ.

We perform a sensitivity analysis for the parameter $\theta$ of our $\operatorname{Gamma}(\theta, 10)$ prior on the infection rate $x$. The parameter $\theta$ represents the initial belief about the infection rate deduced from the study of the virus in the laboratory at the beginning of the outbreak.
% , and $\xi$ represents the amount of uncertainty (we fix $\xi=10$ for simplicity). 
We are interested in the expected peak number of infected individuals, and therefore take $f(x)= \max_r N^r_I(x)$, where $N^r_I(x)$ is the solution to the SIR equations and represents the number of infections at day $r$. 
It is important to study the sensitivity of $I(\theta)$ to the shape parameter $\theta$. 
The total population is set to $10^6$, and we take 
$\mathbb{Q} = \operatorname{Unif}\left(2,9\right)$ and $\mathbb{P}_{\theta_t} = \operatorname{Gamma}(\theta_t, 10)$. 
We use an MC estimator with $5000$ samples as the pseudo ground truth and evaluate the RMSE across all methods. 
For CBQ, we employ a Stein kernel for $k_\calX$, with the Mat\'ern-3/2 as the base kernel, and $k_\Theta$ is selected to be a Mat\'ern-3/2 kernel.


We can see in the left panel of \Cref{fig:finance_sir} that CBQ clearly outperforms  baselines including IS, LSMC and KLSMC.
Although the CBQ estimator exhibits a higher computational cost compared to baselines, we have demonstrated in the middle panel of \Cref{fig:finance_sir} that, due to the increased computational expense of obtaining samples with smaller step size, using CBQ is ultimately more efficient overall within the same period of time.
% Additional experimental results demonstrating these are consistent conclusions for different values of $T$ can be found in \Cref{appendix:sir}.


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

\paragraph{Option Pricing in Mathematical Finance.}
Financial institutions are often interested in computing the expected loss of their portfolios if a shock were to occur in the economy, which itself requires the computation of conditional expectations (it is in fact in this context that LSMC was first proposed). This is typically a challenging computational problem since simulating from the stock of interest often requires the numerical solution of stochastic differential equations over a long time horizon \citep[see][]{achdou2005computational}, making data-efficient methods such as CBQ particularly desirable.

Our next experiment is representative of this class of problems, but has been chosen to have a closed-form expected loss and to be amenable to cheap simulation of the stock to enable extensive benchmarking. We consider a butterfly call option whose price $S(\tau)$ at time $\tau \in [0,\infty)$ follows the Black-Scholes formula; see \Cref{appendix:black_scholes} for full details. The payoff at time $\tau$ can be expressed as
$\psi(S({\tau}))=\max (S(\tau)-K_1, 0) + \max (S(\tau)-K_2, 0) - 2\max (S(\tau) - (K_1+K_2)/2, 0)$ for two fixed constants $K_1,K_2\geq 0$.
We follow the set-up of \citet{alfonsi2021multilevel, alfonsi2022many}, assuming that a shock occurs at time $\eta$ when the price is $S(\eta)=\theta \in (0,\infty)$, and that this shock multiplies the price by $1 + s$ for some $s\geq 0$. As a result, the expected loss of the option is $\calL = \E_{\theta \sim \mathbb{Q}}
[ \max ( I(\theta), 0)]$, where $I(\theta) =  \int_{0}^\infty f(x) \mathbb{P}_\theta(dx)$, $x=S(\zeta)$ is the price at the time $\zeta$ at which the option matures, $f(x) = \psi(x)-\psi((1+s)x)$, and $\mathbb{P}_\theta$ and $\mathbb{Q}$ are two log-normal distributions induced from the Black-Scholes model. 


Results are presented in the right-most panel of \Cref{fig:finance_sir}. We take $K_1 = 50, K_2 = 150, \eta=1, s = 0.2$ and $\zeta=2$. 
For CBQ, $k_\Theta$ is selected to be a Mat\'ern-3/2 kernel and $k_\calX$ is either a Stein kernel with Mat\'ern-3/2 as base kernel or a logarithmic Gaussian kernel (see \Cref{appendix:black_scholes}), in which case $k_{\calX}$ is too smooth to satisfy the assumption of our theorem. 

As expected, CBQ exhibits much faster convergence in $N$ than IS, LSMC or KLSMC, and outperforms these baselines even when they are given a substantial sample size of $N=T=1000$ (see dotted lines). We can also see that CBQ with the log-Gaussian kernel and CBQ with the Stein kernel have similar performance, despite the log-Gaussian kernel not satisfying the smoothness assumptions of our theory. 
% Additional experiments in \Cref{appendix:black_scholes} show that these results are consistent for different values of $T$. 


% \begin{wrapfigure}{r}{0.40\textwidth}
%   \vspace{-19pt}
%   \centering
%   \hspace{-10pt}
%   \includegraphics[width=0.9\linewidth]{figures/decision_T_10.pdf}
%   \vspace{-10pt}
%   \caption{\emph{Uncertainty decision making in health economics.} We study RMSE for different estimators of EVPPI.}
%   \label{fig:decision}
%   \vspace{-5pt}
% \end{wrapfigure}

% \paragraph{Uncertainty Decision Making in Health Economics.} 
% In the medical world, it is important to trade-off the costs and benefits of conducting additional experiments on patients.
% One important measure in this context is the expected value of partial perfect information (EVPPI), which quantifies the expected gain from conducting experiments to obtain precise knowledge of some unknown variables \citep{brennan2007calculating}. 
% The EVPPI can be expressed as $\E_{\theta \sim \mathbb{Q}}[\max_c I_c(\theta) ] - \max_c \E_{\theta \sim \mathbb{Q}}[f_c(X, \theta)]$ where $f_c$ represents a measure of patient outcome (such as quality-adjusted life-years) under treatment $c$ among a set of potential treatments $\calC$, $\theta$ denotes the additional variables we could measure, and $I_c(\theta) = \int_{\calX} f_c(x, \theta) \mathbb{P}_\theta(dx)$ denotes the expected patient outcome given our measurement of $\theta$. We highlight that for these applications $N$ and $T$ are often small due to the very high monetary cost and complexity of collecting patient data.


% We study the potential use of CBQ for this problem using the synthetic problem of \cite{Giles2019}, where $\mathbb{P}_{\theta}$ and $\mathbb{Q}$ are Gaussians (see \Cref{appendix:decision}). We compute EVPPI with $f_1(x, \theta)=10^4 (\theta_1 x_5 x_6 + x_7 x_8 x_{9})-(x_1 + x_2 x_3 x_4)$ and $f_2(x, \theta) = 10^4 (\theta_2 x_{13} x_{14} + x_{15} x_{16} x_{17})-(x_{10} + x_{11} x_{12} x_4)$. 
% The exact practical meanings of $x$ and $\theta$ can be found in \Cref{appendix:decision}.
% Note that IS is no longer applicable here because $f$ depends on both $x$ and $\theta$, so we only compare against KLSMC and LSMC. 
% For CBQ, $k_\calX$ is a Mat\'ern-3/2 kernel and $k_\Theta$ is also a Mat\'ern-3/2 kernel. 
% In \Cref{fig:decision}, we can see that CBQ consistently outperforms baselines with much smaller RMSE. The results are also consistent with different values of $T$; see \Cref{appendix:decision}.

