% \documentclass{uai2022} % for initial submission
\documentclass[accepted]{uai2022} % after acceptance, for a revised
                                    % version; also before submission to
                                    % see how the non-anonymous paper
                                    % would look like
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2022} % ptmx math instead of Computer
                                         % Modern (has noticable issues)
% \documentclass[mathfont=newtx]{uai2022} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{algorithm2e}
\usepackage{amsfonts,amsthm}
\newtheorem{theorem}{Theorem}
\usepackage{algorithmic}
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)
\usepackage{cleveref}

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example
%\newcommand{\EIG}{EIG}
\newcommand{\EIG}{I}
\newcommand{\VNMC}{\text{VNMC}}
\newcommand{\ACE}{\text{ACE}}
\newcommand{\MINE}{\text{MINE}}
%\newcommand{\Eaff}{EIG_{\mathrm{aff}}}
\newcommand{\Eaff}{I_{\mathrm{aff}}}
%\newcommand{\Ereige}{REIG_{\epsilon}}
\newcommand{\Ereige}{I_{\epsilon}}
\newcommand{\Ereigemax}{I_{\epsilon,\max}}
\newcommand{\hatreig}{\hat I_{\epsilon}}
\newcommand{\hatreigmax}{\hat I_{\epsilon,\max}}
\newcommand{\reigvnmc}{\hatreig^{\VNMC}}
\newcommand{\reigace}{\hatreigmax^{\ACE}}
\newcommand{\reigmine}{\hatreigmax^{\MINE}}
\newcommand{\Ereigejoint}{I_{\epsilon,\text{joint}}}
\newcommand{\EE}{E}
\newcommand{\dlp}{\delta\!p}
\newcommand{\dlq}{\hat{q}}
\newcommand{\dld}{\delta\!d}
\newcommand{\dlxi}{\hat\xi}
\newcommand{\Amb}[2]{\mathcal{A}(#1,#2)}
\newcommand{\DKL}[2]{D_{\mathrm{KL}}(#1\|#2)}
\newcommand{\theHalgorithm}{\arabic{algorithm}}
\title{Robust Expected Information Gain for Optimal Bayesian Experimental Design \\
Using Ambiguity Sets}

% The standard author block has changed for UAI 2022 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is authomatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{\href{mailto:<jgo31@gatech.edu>?Subject=Your UAI 2022 paper}{Jinwoo Go}{}}
\author[1]{Tobin Isaac}
% \author[1,2]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[1]{Further~Coauthor}
% \author[3]{Further~Coauthor}
% \author[3,1]{Further~Coauthor}
% Add affiliations after the authors
\affil[1]{%
    Computational Science and Engineering Dept.\\
    Georgia Institute of Technology\\
    Atlanta, Georgia, USA
}

  \begin{document}
\maketitle
% \todo{1. Abstract}
% \todo{2. Minebed}
\begin{abstract}
  The ranking of experiments by expected information gain (EIG) in Bayesian experimental design is sensitive to changes in the model's prior distribution, and the approximation of EIG yielded by sampling will have errors similar to the use of a perturbed prior.
% 2
We define and analyze \emph{robust expected information gain} (REIG), a modification of the objective in EIG maximization by minimizing an affine relaxation of EIG over an ambiguity set of distributions that are close to the original prior in KL-divergence.
% 3
We show that, when combined with a sampling-based approach to estimating EIG, REIG corresponds to a `log-sum-exp' stabilization
of the samples used to estimate EIG, meaning that it can be efficiently implemented in practice.
% 4
Numerical tests combining REIG with variational nested Monte Carlo (VNMC), adaptive contrastive estimation (ACE) and mutual information neural estimation (MINE) suggest that in practice REIG also compensates for the variability of under-sampled estimators.

\end{abstract}

\section{Introduction}

%Researchers decide the next experiment that maximize the information gain while reducing the cost of the experiment. 
Bayesian Experimental Design (BED) is a probabilistic framework for selecting experiments to learn about one or more uncertain variables. Within BED, the most popular criterion for ranking experiments is by Expected Information Gain (EIG), which estimates from current knowledge, encoded in a prior distribution, how informative a particular experiment is likely to be.  This framework is used in diverse applications across many disciplines \citep{ryan2016fully}, in natural sciences \citep{huan2010accelerated}, social sciences \citep{embretson2013item}, and in machine learning and data analysis \citep{foster2020unified}.
%To be specific, chemistry researchers often use experimental design to set up the reactor, which can take several days for one set up in reality\cite{huan2010accelerated}. On the other hand, Psychology researchers also have tried to set up the experiment to get the most informative result from every questions\cite{foster2020unified, embretson2013item}. 

The sensitivity of experimental design to misspecification of the prior distribution has been described in \citep{dasgupta1991robust,ryan2016review}, even in settings where the information gain is computable in closed form.  For more complex models, EIG can only be estimated numerically, which may further affect the reliability of the computed rankings of experiments (or, in the case of continuously parameterized experiments, the gradient of EIG).
EIG is by definition an expectation of an expectation, so general purpose estimates, such as Nested Monte Carlo (NMC) estimation \citep{ryan2003estimating}, can be expensive, slow to converge, and sensitive to underconverged sample estimates.

%is a general method for estimating EIG that requires only samples
%and density evaluation of the prior distribution and likelihood of a given experiment, but its convergence is slow when posterior distribution and prior distribution are very different.

%However, \citep{ryan2016fully} pointed out that the Bayesian Experimental Design is sensitive to the prior distribution before the enhanced EIG estimation was possible. The EIG estimation methods mentioned before using neural network assume that the prior distribution is Gaussian distribution. Researchers often ignore the diverse prior distributions for the sensitivity evaluation because of its intractable computation time. 
To address the issues above that affect the reliability of 
EIG estimates in BED,
we introduce a quantity we call \emph{robust expected information gain} (REIG) as a probability-theoretic way of
ranking experiments by their expected information gain for some worst-case small perturbation of the prior.  We also show through convex analysis that the estimation of REIG is a simple post-processing of the samples generated by an NMC-like estimator.  As a result, our methodology is applicable with many existing methods, which we demonstrate in \cref{sec:experiments} by applying REIG to samples generated by three recent popular EIG estimators \citep{foster2020unified,NEURIPS2019_d55cbf21,kleinegesse2020bayesian}.


% \subsection{Bayesian Inference can be brittle}
% \citep{owhadi2015brittleness} showed the Brittleness of Bayesian Inference with specific example. One prior, $\epsilon$ difference far from the reference prior, can make the different answer in decision making problem.
% % https://www.stat.berkeley.edu/~stark/Preprints/constraintsPriors15.pdf
% \todo{%
% I think we only need to talk about Bayesian brittleness in the context of
% sequential design, when the prior has already been conditioned by a prior
% experiment
% }%

\section{Background and Notation}

We use $\theta \in \Theta$ to indicate a choice of parameters for a model from a set of possible parameters, and we assume a reference prior probability distribution of $\theta$ which has a measurable density function $p(\theta)$, so that we can write $\EE_{p(\theta)}[f] = \int_{\Theta} f(\theta) p(\theta)\ d\theta.$
We let $\xi \in \Xi$ be a potential experiment from a class of experiments, which has an outcome variable $y(\xi)$.
The experiment $\xi$ is modeled by the likelihood function $p(y|\theta,\xi)$, which for each choice of
$\theta$ defines a measurable probability density function of $y$.  Our interest is in models where the densities $p(\theta)$ and $p(y|\theta,\xi)$ can be efficiently computed, and where samples can be drawn from $p(\theta)$ and from $p(y|\theta,\xi)$ for each $(\theta,\xi)$, so that the joint prior distribution
$p(\theta,y|\xi) = p(\theta)p(y|\theta,\xi)$ also has a computable density function and can be sampled.
We use the notation $p(y|\xi) = \EE_{p(\theta)}[p(y|\theta,\xi)]$ for the marginal distribution of
the outcome $y$.

\subsection{Prior Uncertainty}

While it seems recursive to consider uncertainty in the prior distribution used in Bayesian inference,
the prior distribution is in many settings not determined from first principles or an existing population of data.  In these cases the choice of prior is often dictated by what is required to make a computation tractable or simple, by invariance principles, or by an attempt to be noninformative \citep{stark2015constraints}. But notions of noninformative priors do not scale to high dimensions \citep{yang1996catalog}, and even in low dimensions priors that are close under a weak topology like total variation can have diverging posteriors for the same observations \citep{owhadi2015brittleness}.

So in this work we will consider sets of prior distributions $q(\theta)$ other than the reference $p(\theta)$, but we only consider $q(\theta)$ that are absolutely continuous with respect to $p(\theta)$.  Although methods similar to ours are used to handle model uncertainty \citep{shapiro2021bayesian}, we treat $p(y|\theta,\xi)$ as certain: the only uncertainty we consider is in the prior distribution of $\theta$.  When we extend the notation of derived distributions from $p(\theta)$ to another $q(\theta)$, then, it is always with the same likelihood: the joint prior $q(\theta,y|\xi) = q(\theta)p(y|\theta,\xi)$, the marginal $q(y|\xi) = \EE_{q(\theta)}[p(y|\theta,\xi)]$, etc.

\subsection{Expected Information Gain}

We use $\DKL{p(\theta)}{q(\theta)}$ to denote the Kullback-Leibler divergence, $\DKL{p(\theta)}{q(\theta)}= \int_\Theta p(\theta) \log(p(\theta) / q(\theta))\ d\theta$.
The expected information gain of experiment $\xi$ is defined to be the expectation over the marginal distribution of outcomes $p(y|\xi)$ of the KL-divergence from the Bayesian posterior distribution $p(\theta|y,\xi)$ to the prior $p(\theta)$.  Because the prior is not fixed in this work, we consider EIG to be a function of both the prior $p(\theta)$ and the experiment $\xi$,
%
\begin{equation}
\EIG(p,\xi) = \EE_{p(y|\xi)}[\DKL{p(\theta|y,\xi)}{p(\theta)}].\label{eq:klform}
\end{equation}
%
Bayesian optimal experimental design seeks the experiment $\xi^*$ that maximizes this quantity,
%
$$
\xi^* = \arg\max_{\xi\in\Xi}\ \EIG(p,\xi).
$$
%
Although the form of EIG in \eqref{eq:klform} is the most intuitive for Bayesian experimental design, 
other equivalent definitions map more directly on the robust variant we introduce in \cref{sec:reig}
and the sampling-based estimators in \cref{sec:sampling}.  The EIG of an experiment is also the mutual information between $\theta$ and $y$,
%
\begin{align}
\EIG(p,\xi) &= \DKL{p(\theta,y|\xi)}{p(\theta)p(y|\xi)} \\
&= \EE_{p(\theta)}[\DKL{p(y|\theta,\xi)}{p(y|\xi)}] \label{eq:margklform} \\
&= \EE_{p(\theta,y|\xi)}\big[\log\frac{p(y|\theta,\xi)}{p(y|\xi)}\big].
\label{eq:miform}
\end{align}
%
The last form in \eqref{eq:miform} is the preferred form for many
methods that estimate EIG when the likelihood $p(y|\theta,\xi)$ and
prior $p(\theta)$ can be evaluated directly.
We discuss methods for estimating $\EIG(p,\xi)$ and other related quantities in \cref{sec:sampling}.

% \todo{Our notation for Bayesian experimental design.  Define EIG, and
% show EIG in the mutual information form that we will use throughout the paper.}

%\subsection{Prior distribution in Bayesian Inference}
%\subsubsection{Uncertainty in the parameterized distribution}
%When using Bayesian Inference, researchers use parameters to represent the probability distribution. For example, Gaussian distribution is usually used for the prior distribution and the likelihood distribution. Since the assumption about parameterized probability distributions can have errors, we define the distribution "ambiguously".
%
%\subsubsection{Reason why people set the prior as Gaussian}
%Before discussing "ambiguous" distribution, we try to figure out how researchers choose prior distribution.
%\cite{stark2015constraints} introduced the following reasons that researchers choose the prior distribution: 
%1. Make the calculation simple
%2. Particular prior is conventional
%3. Prior satisfies some in-variance principle
%4. Prior is uninformative
%5. Prior roughly matches the relative frequencies of values in some population.
%\subsubsection{ambiguity set in the prior}
%Since the assumption about prior distribution can be wrong, We are going to apply the "ambiguity" on the prior and make experimental design robust with minimax optimization.

%\subsection{Expected Information Gain}
%When we design the experiment, we use the $\mathrm{EIG}$ as a criteria. $\mathrm{EIG}$ is the expectation of KL divergence between the posterior distribution with given experimental set up($\xi$) and the prior distribution with respect to the marginal likelihood distribution.
%\begin{align}
%\mathrm{EIG}(\xi) = \mathbb{E}_{p(y|\xi)}\big[KL(p(\theta|y,\xi)||p(\theta))\big],    
%\end{align}
%%
%where $p(y|\xi)$ is marginal likelihood, $p(\theta|y,\xi)$ is posterior distribution, and $p(\theta)$ is the prior distribution.
%In this work the prior distribution is under consideration, so we consider $\mathrm{EIG}$ to be a function of both the prior and the design,
%$\mathrm{EIG} = \mathrm{EIG}(p(\theta),\xi)$.
%Bayesian Experimental Design finds the most informative experimental set up, $\xi^* = \arg\max_{\xi}\mathrm{EIG}(p(\theta),\xi)$.
%
%Expected Information Gain can be refactored into multiple equivalent forms:
%%\begin{align}
%%\mathrm{EIG}(\xi) &= \mathbb{E}_{p(\theta,y|\xi)}\big[\log\frac{p(\theta|y,\xi)}{p(\theta)}\big]\\
%%&= KL(p(\theta,y|\xi) || p(\theta)p(y|\xi))\\
%%&= \mathbb{E}_{p(\theta)}\mathbb{E}_{p(y|\theta,\xi)}\big[\log\frac{p(y|\xi,\theta)}{p(y|\xi)}\big] \label{eq:miform}
%%\end{align}
%
%The last form in \cref{eq:miform}, which we focus on for the remainder of this work, is the preferred form for many
%methods that estimate $\mathrm{EIG}$ when the likelihood $p(y|\theta,\xi)$ and
%prior $p(\theta)$ can be evaluated directly.
%We discuss methods for estimating $\mathrm{EIG}(p(\theta),d)$ and other related quantities in \cref{sec:sampling}.

%\subsection{Sampling Average Approximation (SAA)}
%SAA applied to the stochastic programming is investigated and developed by various researchers including \cite{king1991epi, plambeck1996sample}, and is well organized to be used by a few researchers including \cite{kim2015guide, shapiro2021lectures}. Bayesian Experimental Design is applicable to SAA because the $\mathrm{EIG}(\xi)_n$ can be optimized with gradient method and approximated $\mathrm{EIG}(\xi)$ has the similar structure like location of local minima.   

%One way to estimate the EIG is using Sample Average Approximation (SAA). SAA gives the approximation of EIG.

%\subsection{Risk Averse}
%
%\todo{Risk averse section should go after the example, or even after the logsumexp}
%
%Given that the prior distribution can make the difference in the decision making process, this paper assume that the prior distribution can be misspecified and check the Expected Information Gain Estimation as a "bearing"\todo{what do you mean by ``bearing''?} to cover the misspecificed prior assumption and to represent the real world. We minimize the Expected Information Gain with respect to the distribution of which we apply ambiguity set to make estimation robust.
%
% Prior probability distributions—
% and hence posterior distributions—are quantifications of the analyst’s degree of belief. As
% such, they change the subject from the experiment and the external world to the analyst’s
% state of mind. Suppose I claim that my prior probability distribution for the load on a
% structure as a fraction of its breaking strength is the uniform distribution on [0, 1]. I am right
% if that accurately reflects what I believe. I am wrong if it does not accurately reflect what I
% believe. The relationship between the prior and the world has no bearing on whether I am
% right or wrong. Experiments that could show I am wrong involve checking what I actually
% believe—for instance, psychological testing or determining what bets I would take at what
% odds—rather than measurements of the structure or similar structures.

\section{A Simple Example with Two Experiments}\label{sec:example}

%This example shows why, when discriminating between experiments using EIG, one may want to consider distributions in the vicinity of a given prior.

Suppose a doctor has two blood tests for Condition X: test A has a $10^{-14}\%$
chance of a false negative but a $50\%$ chance of a false positive, and test B
has an $\approx18.4\%$ chance of a false negative and the same chance of a false positive.  If the doctor estimates the prior probability that a patient
has condition X is $50\%$, it turns out that both tests have the same expected information gain of $\approx0.22$ nats.  If that prior probability could be mistaken, however, the two tests have different EIGs for prior probabilities in the vicinity of $50\%$, shown in \cref{fig:simple}.  If the prior probability the patient has Condition X is actually $>50\%$, then test A has a greater EIG than test B, and vice versa if it is $<50\%$.

\begin{figure}
    \centering
    \includegraphics[width=0.75\columnwidth]{simple_example_small.png}
    \caption{The expected information gain of two tests for condition X depends on the condition's prior probability.}
    \label{fig:simple}
\end{figure}

We interpret these results as follows: For test A, if the prior probability of Condition X is $>50\%$, then a negative test result is surprising because there are essentially no false negatives, but if the prior probability is $<50\%$, then a positive result is less surprising because it has a high probability of being a false positive.  So in comparison to test B, the EIG of test A is more sensitive to the choice of prior.  In any neighborhood of $p = 50\%$, there are priors where test A is expected to be less informative than test B.  So one could argue that a risk-averse doctor, who would maximize how informative the test would be in the worst case, should select test B.

\section{Ambiguity Sets}\label{sec:ambiguity}

In the simple example above, we used a range of prior probabilities for the model parameter to argue that some tests are less locally sensitive to perturbations of the prior distribution.  To generalize this idea from a simple discrete example to other probability distributions, we rely on the notion of an \emph{ambiguity set} \citep{bayraksan2015data, watson2016approximate}, which is a set of distributions that are not far from a reference prior $p(\theta)$ in some statistical distance.  We use KL-divergence as distance, so our ambiguity set with radius $\epsilon$ centered at reference distribution $p(\theta)$ is
%
$$
\Amb{\epsilon}{p} = \{q: \DKL{q(\theta)}{p(\theta)} \leq \epsilon\}.
$$

KL-divergence as a distance works well with Bayesian optimal experimental design, because the KL-divergence appears in the definition of EIG, and because the set $\Amb{\epsilon}{p}$ is defined as a convex subset of positive measurable functions $q(\theta)$ with just two conditions: $\int_{\Theta} q(\theta)\ d\theta = 1$ and $\DKL{q(\theta)}{p(\theta)} \leq \epsilon$.
Thus the minimization over $q\in\Amb{\epsilon}{p}$ of a well-behaved convex function $f(q)$, which appears difficult because $\Amb{\epsilon}{p}$ of the infinite dimensionality of space of measurable functions, transforms into an equivalent dual convex program with only two variables.

We direct the interested reader to \citep{shapiro2017distributionally} for additional details: here we summarize the results that are important for this work.  If the objective function of interest $f(q)$ is the expectation under $q(\theta)$ of a measurable quantity of interest $Z(\theta)$, then it is an affine function of $q$ and we may use duality to simplify the maximization or minimization of $\EE_{q(\theta)}[Z(\theta)]$ over $\Amb{\epsilon}{p(\theta)}$ into a dual problem with only one variable $\lambda\geq 0$. 
%
The maximization problem 
%%%% TODOJ: E_q?
\begin{equation}\label{eq:maxorig}
R_\epsilon = \sup_{q \in \Amb{\epsilon}{p}}
[Z(\theta)]
\end{equation} 
%
can be solved in dual form as
%
\begin{equation}\label{eq:maxdual}
R_\epsilon = \inf_{\lambda \geq 0}
\lambda \epsilon + \lambda \log \EE_{p(\theta)}[\exp(\lambda^{-1} Z(\theta))],
\end{equation}
%
and the minimization problem
%
\begin{equation}\label{eq:minorig}
M_\epsilon = \inf_{q \in \Amb{\epsilon}{p}}
[Z(\theta)]
\end{equation}
%
can be solved in dual form as
%
\begin{equation}\label{eq:mindual}
M_\epsilon = - \inf_{\lambda \geq 0}
\lambda \epsilon + \lambda \log \EE_{p(\theta)}[\exp(- \lambda^{-1}Z(\theta))].
\end{equation}
%
It is important to note that these are non-parametric results.  Given a parameterized family of priors $\{p(\theta;\psi)\}_\psi$, the gradient $\nabla_\psi \EE_{p(\theta;\psi)}[Z(\theta)]$ is sufficient to compute the optimizer in $\Amb{\epsilon}{p}$ of \eqref{eq:maxorig} or \eqref{eq:minorig} within the parametric family because the objective is affine.  But \eqref{eq:maxdual} and \eqref{eq:mindual} allow us to compute the optimal objective value over the entire ambiguity set without explicitly computing an optimal distribution in the set.

%When we do not know the exact distribution, we define the distribution in the ambiguous set. One way of defining ambiguous set is using the KL divergence. If we assume $p(\theta)$ with parameters (Gaussian distribution), we can define the ambiguity set near $p(\theta)$ and maximize or minimize the measurable objective function $Z$. %: \Omega 	\rightarrow \matbhbb{R}$.
%\begin{align}
%R_\epsilon(Z) :&= \sup_{q(\theta) \in %\Amb{\epsilon}{p(\theta)}
%\mathcal{A}_{\epsilon, p(\theta)}} \mathbb{E}_{q(\theta)} Z(\theta) \\
% \mathcal{A}_{\epsilon,p(\theta)} :&= \big\{KL(q(\theta)||p(\theta))\leq \epsilon, \int_\theta q(\theta)d\theta = 1\big\},
%\end{align}
%where $R_\epsilon(Z)$ is distributionally robust functional, $\mathcal{A}_{\epsilon,p(\theta)}$ is the probability distribution set whose distance from the reference distribution 
%$p(\theta)$ is smaller than $\epsilon$, and $q(\theta)$ is the distribution in the ambiguity set.

%\cite{shapiro2017distributionally} suggests the optimization method by adding the Lagrangian multiplier using the $\phi_{KL}$-divergence conjugate,
%\begin{align}
%R_\epsilon(Z) = \inf_{\lambda\geq0} \lambda\epsilon + \lambda\log \mathbb{E}_{p(\theta)}\exp \frac{Z(\theta)}{\lambda}.
%\end{align}
%Note that when the ambiguity constrains a convex minimization problem,
%$$
%M_\epsilon(Z) := \inf_{q(\theta)\in\Amb{\epsilon}{p(\theta)}} \mathbb{E}_{q(\theta)} [Z(\theta)],
%$$
%then the dual formulation remains the same, but with a negative sign,
%$$
%M_\epsilon(Z) = - \inf_{\lambda\geq0} \lambda\epsilon + \lambda\log \mathbb{E}_{p(\theta)}\exp \frac{-Z(\theta)}{\lambda}.
%$$
%
%By adding one Lagrangian variable $\lambda$, we can estimate the expectation of $Z(\theta)$ with respect to the distribution in the ambiguity set. Because it doesn't require considering all the possible case, it is tractable.

\section{Robust Bayesian Experimental Design} \label{sec:reig}

The insight of the example of \cref{sec:example} was that a risk-averse approach to experimental design that allows for some uncertainty in the prior distribution would select the experiment that maximizes the worst-case EIG in the vicinity of the reference prior $p(\theta)$.  Using the ambiguity set $\Amb{\epsilon}{p(\theta)}$ from \cref{sec:ambiguity}
to define the vicinity of $p(\theta)$, we first formalize the worst-case EIG as $\Ereige^{\mathrm{true}}(p,\xi)$, the \emph{true robust expected information gain with radius $\epsilon$},
%
\begin{equation}\label{eq:ereigetrue}
\Ereige^{\mathrm{true}}(p,\xi) = \inf_{q\in\Amb{\epsilon}{p}}
\EIG(q,\xi).
\end{equation}
%
The experiment that maximizes this quantity is
%
$$
\begin{aligned}
\xi^*_{\mathrm{REIG},\epsilon,\mathrm{true}} &=
\arg\max_{\xi \in \Xi}\ \Ereige^{\mathrm{true}}(p,\xi) \\
&= \arg\max_{\xi \in \Xi} \inf_{q\in\Amb{\epsilon}{p}} \EIG(q,\xi).
\end{aligned}
$$
%
This optimization problem in this definition has a clear meaning, but
we note that $\EIG(q,\xi)$ is a quantity that is concave in $q$,
%\cite{},
%\todo{This appears to be a fact about mutual information theory that is in standard textbooks (https://www.cs.princeton.edu/courses/archive/fall11/cos597D/L04.pdf), just need to find one}
so it can have multiple local minima in the convex ambiguity set $\Amb{\epsilon}{p}$ and the duality framework of \cref{sec:ambiguity} cannot be applied directly.

\subsection{Affine Expected Information Gain Approximation}\label{sec:affineassumption}

To define a relaxation to a tractable problem, we split $\EIG(q,\xi)$ into two contributions, one with the marginal distribution of $y$ fixed by the reference prior, $y\sim p(y|\xi)$, and the other a correction that is the divergence between $p(y|\xi)$ and $q(y|\xi)$,
%
\begin{equation}\label{eq:eigsplit}
\begin{aligned}
\EIG(q,\xi) &=
\EE_{q(\theta,y|\xi)}\big[\log \big( \frac{p(y|\theta,\xi)}{p(y|\xi)}\frac{p(y|\xi)}{q(y|\xi)}\big)\big]
\\
&= \EE_{q(\theta,y|\xi)}\big[\log \frac{p(y|\theta,\xi)}{p(y|\xi)}\big] \\
&\phantom{=} {} - \DKL{q(y|\xi)}{p(y|\xi)}.
\end{aligned}
\end{equation}
%
We denote the first term in this difference
%
\begin{equation}\label{eq:eaff}
\Eaff(q,\xi;p) =
\EE_{q(\theta,y|\xi)}\big[\log \frac{p(y|\theta,\xi)}{p(y|\xi)}\big],
\end{equation}
%
because it is an approximation to $\EIG(q,\xi)$ that
is affine and exact when $q=p$.  By the concavity of EIG with respect to its first argument q,
%
\begin{equation}\label{eq:ubound}
\Eaff(q,\xi;p) \geq \EIG(q,\xi) \quad \text{for all }q.
\end{equation}
%
Due to the data processing inequality that the mutual information between two random variables cannot increase by a deterministic or random transformation of the arguments, the error in $\EIG(q,\xi;p)$ is bounded by
%
\begin{equation}\label{eq:errbound}
\begin{aligned}
|\EIG(q,\xi) - \Eaff(q,\xi;p)| &\leq \DKL{q(y|\xi)}{p(y|\xi)}
\\
&\leq \DKL{q(\theta)}{p(\theta)}.
\end{aligned}
\end{equation}

In fact $\Eaff(q,\xi;p)$ is the affine approximation to $\EIG(q,\xi)$ that is tangent at $q=p$.

\begin{theorem}\label{thm:tangent}
The function $\Eaff(q,\xi;p)$ from \eqref{eq:eaff} is tangent to
$\EIG(q,\xi)$ at $q=p$ for every design $\xi$.
\end{theorem}

\begin{proof}
It is sufficient to show that the difference between the two functions, which by \eqref{eq:eigsplit} is $\DKL{q(y|\xi)}{p(y|\xi)}$, is gradient free at $q = p$.

We first calculate the gradient with respect to $\xi$:  by the chain rule applied to \eqref{eq:miform}, the derivative in the direction $\dlxi$ is
%
$$
\begin{aligned}
&\nabla_{\xi} \DKL{q(y|\xi)}{p(y|\xi)}[\dlxi] \\
={}
&\nabla_{q(y|\xi)} \DKL{q(y|\xi)}{p(y|\xi)}[\nabla_\xi q(y|\xi)[\dlxi]]
\\
&{} -
\EE_{q(y|\xi)}\big[
\frac{\nabla_{\xi} p(y|\xi)[\dlxi]}{p(y|\xi)}
\big].
\end{aligned}
$$
For general distributions $Q$ and $P$ the KL divergence satisfies $\nabla|_{Q=P} \DKL{Q}{P} = 0$,
so the first term vanishes when $q = p$.  In the second term, when $q = p$
the denominator cancels with the measure and we have
%
$$
\begin{aligned}
\EE_{q(y|\xi)}\big[
\frac{\nabla_{\xi} p(y|\xi)[\dlxi]}{p(y|\xi)}
\big] |_{q = p}
&= {\textstyle \int} \nabla_\xi p(y|\xi)[\dlxi]\ dy \\
&= \nabla_\xi (\EE_{p(y|\xi)}[1])[\dlxi] = 0,
\end{aligned}
$$
%
where we use the fact that $E_{p(y|\xi)}[1] = 1$ for all $\xi$.

Finally, we can see that the gradient with respect to $q(\theta)$ in the direction 
$\dlq(\theta)$ is
%
$$
\begin{aligned}
&\nabla_{q(\theta)} \DKL{q(y|\xi)}{p(y|\xi)}[\dlq(\theta)] \\
={}
&\nabla_{q(y|\xi)} \DKL{q(y|\xi)}{p(y|\xi)}[\nabla_{q(\theta)} q(y|\xi)[\dlq(\theta)]],
\end{aligned}
$$
%
which vanishes at $q=p$ for the same reasons as above.
\end{proof}

 %To apply the ambiguity set near the reference distribution and to use the dual form of the objective function, we check the convexity of the objective function. When calculating the marginal likelihood distribution $p(y|\xi)$, we use the prior distribution or the posterior distribution \cite{oladyshkin2019connection}. 
%
%If we consider the the ambiguity set distribution when marginalizing the likelihood, the optimization can be intractable. Because of this, we assume that the marginal likelihood distribution is independent to the prior or the joint prior distribution. The objective function in \cref{eq:econv} is convex, we can use the dual form optimization. We can make the optimization tractable and calculate the distributionally robust Expected Information Gain.

%We have defined two objective function for the minimax problem for $\mathrm{EIG}$, where
%the objective is maximized over designs but minimized over priors
%in the ambiguity set.
%
%The first is the true minimax objective, where changes in the
%prior affect both the joint probability of $(\theta,y)$ used in the outer expectation
%and the marginal probability of $y$ that appears inside the expectation:
%%
%\begin{equation}\label{eq:etrue}
%\mathrm{EIG}(q,\xi) = \mathbb{E}_{q(\theta)p(y|\theta,\xi)}
%\big[
%\log \frac{p(y|\theta,\xi)}{\mathbb{E}_{q(\hat\theta)}[p(y|\hat\theta,\xi)]}
%\big].
%\end{equation}
%%
%The second is the affine relaxation $\Eaff$,
%where the marginal probability of $y$ is
%frozen as the marginal probability in the
%reference prior $p(\theta)$,
%%
%\begin{equation}\label{eq:econv}
%\Eaff(q,\xi)
%=
%\mathbb{E}_{q(\theta)p(y|\theta,\xi)}
%\big[
%\log \frac{p(y|\theta,\xi)}{\mathbb{E}_{p(\hat\theta)}[p(y|\hat\theta,\xi)]}
%\big].
%\end{equation}
%%
%Large difference between $p(\theta)$ and $q(\theta)$ can lead to large differences between the two objectives, but the following theorem suggests that $\Eaff$ may be a good approximation to $\EIG$ near the reference prior.
%
%\begin{theorem}
%The functions $\EIG(q,\xi)$ and $\Eaff(q,\xi;p)$ as defined
%by \cref{eq:miform,eq:econv}, are tangent at $(p,\xi)$ for every design $\xi$.
%\end{theorem}
%
%\begin{proof}
%It is clear that $\Eaff(p,\xi) = \EIG(p,\xi)$: we must show that $D\Eaff(p,\xi)[\dlp,\dlxi] =
%D_p\EIG(p,\xi)[\dlp,\dlxi]$ whenever $\dlp$
%is an admissible variation, that is when
%$\int p(\theta) + \dlp(\theta)\ d\theta = 1.$
%By inspection we see that because $\Eaff$ and
%$\EIG$ only depend on $\xi$ through $p(y|\theta,\xi)$, it must be that
%$D_\xi \Eaff(p,\xi)[\dlxi] = D_\xi \EIG(p,\xi)[\dlxi]$,
%so we only need to consider variations $\dlp$
%in the prior.
%In the notation below we will use $p(y|\xi)$ for the marginal under the reference prior,
%$p(y|\xi) = \mathbb{E}_{p(\theta)}[p(y|\theta,\xi)]$,
%and we will use the expectation notation
%$\mathbb{E}_{\dlp}[f] = \int \dlp(\theta) f(\theta)\ d\theta$, even though $\dlp$ is not itself
%a probability distribution.
%Because $\Eaff$ is only affine in $p$,
%it is straightforward that
%%
%$$
%D_p\Eaff(p,\xi)[\dlp] =
%\mathbb{E}_{\dlp(\theta)p(y|\theta,\xi)}
%[
%\log p(y|\theta,\xi) - \log p(y|\xi)
%].
%$$
%%
%Applying the calculus of variations,
%we see that $D_p\EIG$ appears to have
%an additional term,
%%
%$$
%\begin{aligned}
%D_p&\EIG(p,\xi)[\dlp] =
%\mathbb{E}_{\dlp(\theta)p(y|\theta,\xi)}
%[
%\log
%p(y|\theta,\xi)
%-
%\log
%p(y|\xi)
%]\\
%&-
%\mathbb{E}_{p(\theta)p(y|\theta,\xi)}
%\left[
%\frac{\mathbb{E}_{\dlp(\hat\theta)}[p(y|\hat\theta,\xi)]}{p(y|\xi)}
%\right] \\
%&= D_p\Eaff(p,\xi)[\dlp] -
%\mathbb{E}_{p(\theta)p(y|\theta,\xi)}
%\left[
%\frac{\mathbb{E}_{\dlp(\hat\theta)}[p(y|\hat\theta,\xi)]}{p(y|\xi)}
%\right].
%\end{aligned}
%$$
%%
%Expanding the second term as an integral, however,
%we see that it vanishes:
%%
%$$
%\begin{aligned}
%\mathbb{E}_{p(\theta)p(y|\theta,\xi)}
%\left[
%\frac{\mathbb{E}_{\dlp(\hat\theta)}[p(y|\hat\theta,\xi)]}{p(y|\xi)}
%\right]
%&=
%\mathbb{E}_{p(y|\xi)}
%\left[
%\frac{\mathbb{E}_{\dlp(\hat\theta)}[p(y|\hat\theta,\xi)]}{p(y|\xi)}
%\right]
%\\
%= \int \int \dlp(\hat\theta)\ p(y|\hat\theta,\xi)\ d\hat\theta\ dy &=\int \dlp(\hat\theta) \int p(y|\hat\theta,\xi)\ dy\ d\hat\theta\\
%= \mathbb{E}_{\dlp(\hat\theta)}\big[\mathbb{E}_{p(y|\hat\theta,\xi)}[1]\big] &= 0,
%\end{aligned}
%$$
%%
%where we have used that $\mathbb{E}_{\dlp(\hat\theta)}[1] = 0.$
%
%\end{proof}
\subsection{Robust Expected Information Gain (REIG)}\label{sec:subreig}

Having shown that $\Eaff(q,\xi;p)$ is a good approximation to $\EIG(q,\xi)$ near the reference prior $p(\theta)$, we now use it to define a robust quantity that approximates $\Ereige^{\mathrm{true}}$, which we refer to simply as $\Ereige$,
%
\begin{equation}\label{eq:reig}
\Ereige(p, \xi) =  \inf_{q \in \Amb{\epsilon}{p}} \Eaff(q,\xi;p).
\end{equation}
%
By the properties established in \eqref{eq:ubound}, \eqref{eq:errbound}, and
\cref{thm:tangent}, we have the following relationships between $\Ereige^{\mathrm{true}}$ and $\Ereige$:
%
\begin{gather}
    \EIG(p,\xi) \geq \Ereige(p,\xi) \geq \Ereige^{\mathrm{true}} (p,\xi); \\
    |\Ereige(p,\xi) - \Ereige^{\mathrm{true}} (p,\xi)| \leq \epsilon; \\
    |\Ereige(p,\xi) - \Ereige^{\mathrm{true}} (p,\xi)| \in O(\epsilon^2).
\end{gather}
%
These facts suggest an experiment that maximizes
$\Ereige(p,\xi)$,
%
$$
\xi^*_{\mathrm{REIG},\epsilon} = \arg\max_{\xi\in\Xi} \Ereige(p,\xi),
$$
%
has similar robustness to $\xi^*_{\mathrm{REIG},\epsilon,\mathrm{true}}$ over perturbations of the the prior $p(\theta)$, as long as the radius $\epsilon$ of the ambiguity set is not too large.
%%
%$$
%\max_{\xi} \Ereige(p,\xi) \approx \max_\xi \inf_{q\in \mathcal{A}(p,\epsilon)} \mathrm{EIG}(q,\xi;p).
%$$

\subsection{Computation of $\Ereige(p,\xi)$ via duality}
\label{sec:minimization}
We have selected $\Ereige(p,\xi)$ as our robust quantity to optimize because \eqref{eq:reig}
can be optimized by the dual transformation described in \cref{sec:ambiguity}.
Applying \eqref{eq:mindual} to EIG in the form \eqref{eq:margklform}, we have $\Ereige(p,\xi) = $
%
\begin{equation}\label{eq:reigdual}
%={} &\inf_{\lambda\geq 0} \lambda\epsilon + \lambda\log 
%\EE_{p(\theta)}\left[\exp (-\lambda^{-1} \EE_{p(y|\theta,\xi)}\big[\log \big(
%\frac{p(y|\theta,\xi)}{p(y|\xi)}
%\big) \big])\right] \\
-\inf_{\lambda\geq 0} \lambda\epsilon + \lambda\log 
\EE_{p(\theta)}\big[\exp
\big(
-\frac{\DKL{p(y|\theta,\xi)}{p(y|\xi)}}{\lambda}
\big)
%(
%-\lambda^{-1}\DKL{p(y|\theta,\xi)}{p(y|\xi)}
%)
\big].
\end{equation}
%
We will show in \cref{sec:sampling} that this 1D convex optimization problem
can be solved efficiently by a small adaptation of existing
EIG estimators.

%Let $J(\lambda)$ be the objective function of \eqref{eq:reigdual}.  If $\epsilon = 0$, the objective in \eqref{eq:reigdual} is decreasing for every $\lambda \geq 0$, so the infinimum is $\lim_{\lambda \to \infty}
%J(\lambda) = -\EIG(p,\xi)$, which can be seen by applying L'H\^{o}pital's rule to the second term of \eqref{eq:reigdual}.  This is the expected behavior, because $\epsilon \to 0$ implies the ambiguity set is shrinking to contain just the reference prior $p(\theta)$.
%
%In the limit as $\epsilon \to \infty$, the optimal $\lambda^*$ will converge to $0$, in which case
%
%The first order optimality is
%$$
%\begin{gathered}
%0 = \epsilon + 
%\log 
%\EE_{p(\theta)}\big[\exp
%\big(
%-\frac{\DKL{p(y|\theta,\xi)}{p(y|\xi)}}{\lambda}
%\big)
%\big]
%\\
%+
%\lambda
%\frac{
%\EE_{p(\theta)}\big[\lambda^{-2}
%\DKL{p(y|\theta,\xi)}{p(y|\xi)}
%\exp
%\big(
%-\frac{\DKL{p(y|\theta,\xi)}{p(y|\xi)}}{\lambda}
%\big)
%\big]
%}{
%\EE_{p(\theta)}\big[\exp
%\big(
%-\frac{\DKL{p(y|\theta,\xi)}{p(y|\xi)}}{\lambda}
%\big)
%\big]
%}
%\end{gathered}
%$$

%
%
%We apply ambiguity set near the prior distribution $p(\theta)$ and the joint distribution $p(\theta,y|\xi)$. If we apply the ambiguity set near $p(\theta)$, it minimizes the EIG with respect to $q(\theta) \in \mathcal{A}_{\epsilon, p(\theta)}$. We can change the problem into dual form by adding one variable($\lambda$). We can use the existing method to generate samples, and then update the sampling weight with optimization. Since REIG uses "log-sum-exp", it can be beneficial in numerical stability. 
%\begin{align*}
%I(\theta,\xi) &= \mathbb{E}_{p(\theta)}\mathbb{E}_{p(y|\theta,\xi)}\big[ \log\frac{p(y|\theta,\xi)}{p(y|\xi)}\big]\\
%\Ereige(p,\xi) &= \inf_{\lambda\geq0} \lambda\epsilon + \lambda\log \mathbb{E}_{p(\theta)}\exp(\frac{1}{\lambda}\mathbb{E}_{p(y|\theta,\xi)}\big[T(\theta,\xi)\big]),
%\end{align*}
%where $T(\theta,\xi)=\log\frac{p(y|\theta,\xi)}{p(y|\xi)}$.
%% \subsubsection{VNMC}
%
%% When calculating the marginal likelihood distribution, it uses the VNMC.
%
%% TODO: We need to consider how we train the posterior distribution.
%\todo{Should I include this?}
%If we apply the ambiguity set near the joint distribution $p(\theta,y|\xi)$ and fix the likelihood distribution, it is just the prior distribution perturbation near the reference prior distribution. It makes the the optimization easier. When optimizing, it uses generated samples from existing method, and update the weight of samples with given optimization equation.

%\subsection{$\lambda$ Analysis}
%If $\lambda \rightarrow \infty$, then $\epsilon \rightarrow 0$. It means that existing methods to estimate the $\EIG$ like VNMC, ACE and PCE are used. If $\lambda \rightarrow 0$, then $\epsilon \rightarrow \infty$. It means that the ambiguity set covers all the possible distribution, and $\log\frac{p(y| \theta,\xi)}{p(y|\theta)}$ is considered to be same for all the samples. In other words, the EIG loses the information.

\subsection{Related Design Criteria}
\label{sec:maximization}
Our definition of $\Ereige$ was motivated by a risk-aversion argument in favor of the design with the best worst-case EIG in a neighborhood.
Because the approximation $\Eaff(q,\xi;p)$ is affine, however,
maximization over the ambiguity set can also be solved by duality.
This means that the same methodology can be used to define a risk-loving
strategy for experimental design, which selects the experiment
that has the highest EIG for some prior in the ambiguity set.
We call this criterion $\Ereigemax(p,\xi) = $
%
\begin{equation}\label{eq:reigmax}
%={} &\inf_{\lambda\geq 0} \lambda\epsilon + \lambda\log 
%\EE_{p(\theta)}\left[\exp (-\lambda^{-1} \EE_{p(y|\theta,\xi)}\big[\log \big(
%\frac{p(y|\theta,\xi)}{p(y|\xi)}
%\big) \big])\right] \\
\inf_{\lambda\geq 0} \lambda\epsilon + \lambda\log 
\EE_{p(\theta)}\big[\exp
\big(
\frac{\DKL{p(y|\theta,\xi)}{p(y|\xi)}}{\lambda}
\big)
%(
%-\lambda^{-1}\DKL{p(y|\theta,\xi)}{p(y|\xi)}
%)
\big].
\end{equation}
%
This criterion is used in \cref{sec:experiments} to counteract biased underestimation of EIG by some estimators.

Last, we note that our decision to limit the uncertainty in the models of the experiments to just the prior $p(\theta)$ and not the likelihood
$p(y|\theta,\xi)$ is arbitrary, at least from the perspective of the methods we have developed.  An ambiguity set $\Amb{\epsilon}{p(\theta,y|\xi)}$ can be centered around
the joint prior of the model $p(\theta,y|\xi)$, and an affine
approximation can be taken that would allow for optimization over that ambiguity set via duality.  The result would be an even more conservative quantity, $\Ereigejoint(p,\xi) =$
%
\begin{equation}
-
\inf_{\lambda\geq 0} \lambda\epsilon + \lambda\log 
\EE_{p(\theta,y|\xi)}\big[\exp
\big(
\lambda^{-1}
\log
\frac{p(y|\xi)}{p(y|\theta,\xi)}
\big)
%(
%-\lambda^{-1}\DKL{p(y|\theta,\xi)}{p(y|\xi)}
%)
\big].
\end{equation}
We will not explore this criterion more in this work.

\section{REIG Estimation via Sampling}
\label{sec:sampling}

The design of efficient estimators for EIG has been the subject of much research, in part because their use in experimental design is computationally demanding.
The combination of nested iterations
to estimate the densities of implicitly defined distributions, to
evaluate the expectation of the EIG, and finally to optimize that quantity
lead to many passes over the problem data as well as many model evaluations.

When introducing an implicitly defined quantity like $\Ereige$,
we should be leery of adding another nested loop to the calculation.
This is why we immediately discounted the design criterion
$\Ereige^{\mathrm{true}}$ from \eqref{eq:ereigetrue}, which would require optimization in the original variables parameterizing $p(\theta)$, which could be numerous.

\subsection{Constructing a REIG Estimator}

When both the prior $p(\theta)$ and the likelihood $p(y|\theta,\xi)$ can be sampled directly, sampling-based approaches to estimating EIG often have a two-level structure: an inner estimator is defined for a fixed $\theta$ and/or $y$ in the integrated quantity --- either $\DKL{p(\theta|y,\xi)}{p(\theta)}$ in \eqref{eq:klform},
$\DKL{p(y|\theta,\xi)}{p(y|\xi)}$ in \eqref{eq:margklform}, or
$\log(p(y|\theta,\xi) / p(y|\xi)$ in \eqref{eq:miform} ---
and an outer Monte Carlo estimator over either $p(\theta)$ or $p(\theta,y|\xi)$ calls the inner estimator for each generated
$\theta$ or $(\theta,y)$. 
%It is also typical to freeze the samples
%of the outer Monte Carlo sampler and treat them as an empirical distribution, rather than resampling at each optimization step
%of the experimental design.

This basic paradigm maps closely onto the dual formulation of $\Ereige$ in \eqref{eq:reigdual}, in a method we sketch in \cref{alg:reigalgo} that defines
a REIG estimator $\hat\Ereige$.

\begin{algorithm}[ht]
   \caption{$\Ereige$ Estimation via Sampling}
   \label{alg:reigalgo}
   \begin{enumerate}[noitemsep]
       \item
       Draw $N_1$ i.i.d. samples $\{\theta_i\}_{i=1}^{N_1}$ from $p(\theta)$.
       \item
       For each $\theta_i$, use estimator $\tilde{D}(\theta,\xi)$ to compute an estimate $d_i \gets \tilde{D}(\theta_i, \xi)$ of
       $\DKL{p(y|\theta,\xi)}{p(y|\xi)}$.
       \item
       Solve the 1D convex optimization problem
       %
       \begin{equation}\label{eq:reigdisc}
       M_\epsilon = \inf_{\lambda \geq 0}
       \lambda \epsilon + \lambda \log \frac{1}{N_1} \sum_{i=1}^{N_1} \exp ( - \lambda^{-1} d_i)
       \end{equation}
       %
       and return $-M_\epsilon$.
   \end{enumerate}
\end{algorithm}

In this approach the inner estimator is called $N_1$ times in step 2, which is the same number of times it would have been called to compute the
EIG estimator $\frac{1}{N_1} \sum_{i=1}^{N_1} d_i$, but those estimates are saved and treated as an empirical distribution, so that
the optimization problem in step 3 solves \eqref{eq:reigdual} by sample average approximation (SAA) instead of stochastic approximation (SA).
The assumption is that this 1D convex problem will be solved quickly and the dominant cost in \cref{alg:reigalgo} is the cost of computing the KL-divergence estimators $d_i \gets \tilde{D}(\theta_i, \xi)$.
%TODOJ: alg:example should be reigalgo?

Although the inner optimization in step 3 is solved by SAA, we note that \cref{alg:reigalgo} can be used within either SAA or SA for the optimization over $\xi$, depending on whether the samples in step 1 are reused or not.
The derivative $\nabla_{\xi} \hat\Ereige(p,\xi)$ can be computed as $-\nabla_d M_{\epsilon} \cdot \nabla_\xi d$, where
$d$ is the vector of $d_i$ estimates from step 2.  The partial derivatives $\nabla_\xi d$ are also present in computing the gradients of EIG estimators, so existing methods for this term can be reused: $\nabla_d M_{\epsilon}$ are the only additional derivatives needed for REIG. Letting $\lambda^*$ be the optimal value and letting $L(d) = \log \frac{1}{N_1}\sum_{i=1}^{N_1} \exp(-d_i)$, if $\lambda^* \neq 0$
then $\nabla_d M_{\epsilon} = \nabla_d L(d / \lambda^*)$.  If $\lambda^i = 0$, there is some $i^* = \arg\min_i d_i$ and $M_\epsilon = - d_i$, so that either $-e_i = \nabla_d M_\epsilon$ or $-e_i \in \partial_d M_\epsilon$ if $i^*$ is not unique.

\subsection{EIG Estimators}\label{sec:estimators}


There are many possible choices for the estimator $\tilde{D}(\theta, \xi)$ of $\DKL{p(y|\theta,\xi)}{p(y|\xi)}$. In fact, any EIG estimator $\hat{D}(p(\theta),p(y|\theta,\xi))$ that accepts general prior distributions can be used by running a separate instance of $\hat{D}$ for each $\theta_i$ with
the prior distribution $\theta \sim \delta_{\theta_i}$.
In practice, this approach would have poor performance because sequestering the samples $\theta_i$ into separate estimator instances would not allow for vectorization across samples.
Vectorization and batching are best exploited in a nested
EIG estimator if all instances of the inner estimator have the same hyperparameters to maximize throughput.

In the experiments in \cref{sec:experiments}, we have primarily used the estimators developed in 
\citep{NEURIPS2019_d55cbf21,foster2020unified,kleinegesse2020bayesian}.  The first is the \emph{Variational Nested Monte Carlo} (VNMC) estimator, which is based on the form of EIG in \eqref{eq:miform}.
It draws $N$ samples $(\theta_i, y_i) \sim p(\theta,y|\xi)$ from the joint prior, evaluates $\log p(y_i|\theta_i,\xi)$ directly, and then uses an inner estimator for $\log p(y_i|\xi)$.  That estimator is based on the identity
%
$$
\log p(y|\xi) = \log \EE_{q(\theta)} \big[\frac{p(\theta)}{q(\theta|y,\xi)}p(y|\theta,\xi)\big],
$$
where $q(\theta|y,\xi)$ can be any distribution that is absolutely continuous with respect to $p(\theta)$, but the variance of that expectation is lower the closer $q(\theta|y,\xi)$ is to the posterior distribution $p(\theta|y,\xi)$.  $M$ samples $\{\theta_i^j\}_{j=1}^M$ are drawn from $q(\theta|y,\xi)$ for each $y_i$, resulting in the EIG estimator
%
\begin{equation}
\hat\EIG^{\VNMC} = 
\sum_{i=1}^N \log \frac{p(y_i|\theta_i, \xi)}{
\frac{1}{M} \sum_{j=1}^M  \frac{p(\theta_i^j)}{q(\theta_i^j | y_i, \xi)} p(y_i | \theta_i^j, \xi).
}
\end{equation}
%
This estimator is consistent in the limit as $M \to \infty$, but for finite $M$ is in expectation an upper bound for EIG.  For details see \cite{NEURIPS2019_d55cbf21}.

The second estimator that we use is the \emph{Adaptive Contrastive Estimator} (ACE) from \citep{foster2020unified}.  In description it is almost identical to VNMC, except that in estimating $p(y_i|\xi)$ we add
to the samples $\{\theta_i^j\}_{j=1}^M$ the original sample $\theta_i^0 = \theta_i$ drawn from $p(\theta)$ that generated $y_i$.  The result is
%
\begin{equation}
\hat\EIG^{\ACE} = 
\sum_{i=1}^N \log \frac{p(y_i|\theta_i, \xi)}{
\frac{1}{M+1} \sum_{j=0}^M  \frac{p(\theta_i^j)}{q(\theta_i^j | y_i, \xi)} p(y_i | \theta_i^j, \xi).
}
\end{equation}

This is also a consistent estimator of EIG, but the addition of the prior-samples $\theta_i^0$ to the estimator for $p(y_i|\xi)$ makes it in expectation a lower bound for EIG for finite $M$: see \citep{foster2020unified} for more details.

The last estimator that we use is the \emph{Mutual Information Neural Estimation}  (MINE), which trains the ratio, $\frac{p(y|\theta,\xi)}{p(y|\xi)}$, with samples and estimate the EIG with SAA method \citep{kleinegesse2020bayesian},
%
\begin{equation}\label{eq:mine}
\hat\EIG^{\MINE} = 
\sum_i^N [ T_\psi(\theta_i,y_i) - e^{T_\psi(\theta_i,y_{i}^*)-1} ],
\end{equation}
%
where $y_i^*$ represents the shuffled $y$ and $T$ is a neural network with parameters $\psi$.

The outer loop for ACE, VNMC and MINE methods draw from $p(\theta,y|\xi)$ or $p(\theta)p(y|\xi)$, and in the implementations provided by the authors one sample from $p(y|\theta_i,\xi)$ is drawn for each of $N$ samples $\theta_i$ drawn from $p(\theta)$. To adapt these methods to the needs of our \cref{alg:reigalgo}, we split $N$ into $N = N_1 N_2$, and draw $N_2$ samples from $p(y|\theta_i,\xi)$ for each of $N_1$ samples drawn from $p(\theta)$. We then take the mean over the $N_2$ samples for our estimate of
$\DKL{p(y|\theta_i,\xi)}{p(y|\xi)}$ on line 2 of our algorithm.

%Using SAA, \cite{NEURIPS2019_d55cbf21} uses the variational posterior distribution to estimate the marginal likelihood. \cite{kleinegesse2020bayesian, zhang2021scalable}, on the other hand, train the Neural network to estimate the $p(y|\theta,\xi)/p(y|\xi)$ and use SAA to estimate the EIG. In this paper, we focus on the train the variational posterior distribution and apply the ambiguity when estimating the EIG. %
%% \subsection{Nested Monte Carlo method}
%% Nested Monte Carlo method uses the double loop to estimate the NMC. This method calculates the marginal likelihood distribution using one more loop with respect to the prior distribution.
%% It converges $O(T^{-1/3})$, when $T=O(NM)$ and $M \propto \sqrt{N}$.
%% \begin{gather*}
%% \hat{\mu}_{NMC}(\xi) = \frac{1}{N}\sum_{n=1}^N\log \frac{p(y_n|\theta_{n,0}, \xi)}{\frac{1}{M}\sum_{m=1}^Mp(y_n|\theta_{n,m},\xi)},
%% \end{gather*}
%% where $\theta_{n,m} \overset{\text{i.i.d.}}{\sim}p(\theta)$, $y_n$ $\sim$ $p(y|\theta = \theta_{n,0},\xi) .$ 
%\subsection{Variational Inference}
%
%VNMC \cite{NEURIPS2019_d55cbf21} calculates variational posterior distribution and uses it to estimate the marginal likelihood distribution. It calculates the upper bound of the EIG. 
%\begin{align}
%\EIG(\xi) &= \mathbb{E}_{p(\theta,y|\xi)}\big[\log \frac{p(y|\xi,\theta)}{\mathbb{E}_{p(\theta|y,\xi)}\big[\frac{p(\theta,y|\xi)}{p(\theta|y,\xi)}\big]}\big] \\
%&\leq \frac{1}{N}\sum_{n=1}^N\log \frac{p(y_n|\theta_{n,0}, \xi)}{\frac{1}{M}\sum_{m=1}^M\frac{p(y_n,\theta_{n,m}|\xi)}{q_v(\theta_{n,m}|y_n,\xi)}},
%\end{align}
%where $\theta_{n,0} \overset{\text{i.i.d.}}{\sim}p(\theta)$, $y_n$ $\sim p(y|\theta = \theta_{n,0},\xi)$ , and $\theta_{n,m} \sim q_v(\theta|y = y_n, \xi).$ 
%If $\theta_{n,m}$ are generated from the prior distribution, it is NMC method \cite{rainforth2018nesting}.
%
%\cite{foster2020unified} also further suggests using the Adaptive Contrastive Estimation (ACE) and Prior Contrastive Estimation (PCE) to find the lower bound of the EIG.
%\begin{align}
%ACE &= \mathbb{E}\big[\log \frac{p(y|\theta_{0}, \xi)}{\frac{1}{M+1}\sum_{l=0}^M\frac{p(y|\theta_l,\xi)p(\theta_l)}{q_v(\theta_{l}|y)}}\big],
%\end{align}
%where the expectation is with respect to $p(\theta_0)p(y|\theta_0,\xi)q(\theta_{1:l}|y)$, and $\theta_0$ are original samples from prior distribution, and $\theta_{1:L}$ are the samples from the trained variational posterior distribution.
%\begin{align}
%PCE &= \mathbb{E} \big[\log \frac{p(y|\theta_{0}, \xi)}{\frac{1}{M+1}\sum_{l=0}^Mp(y|\theta_{l}, \xi)}\big],
%\end{align}
%where the expectation is with respect to $p(\theta_0)p(y|\theta_0,\xi)p(\theta_{1:l})$, and $\theta_0$ are original samples from prior distribution, and $\theta_{1:L}$ are samples from prior distribution.
%% Foster[?] further uses neural network to train the relationship between $\xi_i$ and $y_i$ in sequential Bayesian Experimental Design.
%
%\subsection{Train the ratio between the likelihood and the marginal likelihood}
%
%\cite{kleinegesse2020bayesian} transforms the problem from calculating EIG to calculating KL divergence between $p(\theta,y|\xi)
%$ and $p(\theta)p(y|\xi)$ using Mutual Information Neural Estimation (MINE) \cite{belghazi2018mutual}. It calculates the lower bound of the expected information gain. 
%\begin{align}
%EIG(\xi) = \mathbb{E}_{p(\theta,y|\xi)}\big[T_\psi(\theta,y)\big] - \mathbb{E}_{p(\theta)p(y|\xi)}\big[\exp(T_\psi(\theta,y)-1)\big] \end{align}
%
%MINEBED trains the ratio, $\frac{p(y|\theta,\xi}{p(y|\xi)}$, with samples and estimate the EIG with SAA method. 
%
%\cite{kleinegesse2021gradient} further compares the result with other implicit model methods among MINE-f \cite{belghazi2018mutual}, InfoNCE \cite{oord2018representation}, and Jensen–Shannon divergence (JSD). \cite{hjelm2018learning} 
%
%\subsection{Applying ambiguity set at MINEBED method}
%MINEBED can use $\Ereige$ by applying the ambiguity set near the prior distribution as we apply the ambiguity set at the VNMC or ACE. 
%
%MINEBED estimate the lower bound of EIG. Both expectations are calculated with respect to the prior distribution. By substituting the prior distribution with $q(\theta)$ in the ambiguity set, we can get the following equations. 
%\begin{align*}
%\Ereige (p,\xi) &\geq \min_{q\in \mathcal{A}_{\epsilon,p(\theta)}}\mathbb{E}_{q(\theta)}\big[\mathbb{E}_{p(y|\theta,\xi)}\big[T_\psi(\theta,y)\big] \\
%&- \mathbb{E}_{p(y|\xi)}\big[\exp(T_\psi(\theta,y)-1)\big]\big] \\
%&= \inf_{\lambda\geq0} \lambda\epsilon + \lambda\log \mathbb{E}_{p(\theta)}\exp(\frac{1}{\lambda}Z(\theta,\xi)),
%\end{align*}
%where $Z(\theta,\xi)$ is
%$\mathbb{E}_{p(y|\theta,\xi)}\big[T_\psi(\theta,y)\big] - \mathbb{E}_{p(y|\xi)}\big[\exp(T_\psi(\theta,y)-1)\big]$.
%
%% \begin{align*}
%% NCE &= E \big[ \frac{1}{K} \sum_{i=1}^K \log \frac{e^{T_\psi(\theta_i,y_i)}}{\frac{1}{K}\sum_{j=1}^Ke^{T_\psi(\theta_j,y_i)}} \big] \\
%% JSD &= E_{p(v,y|\xi)}\big[-sp(-T_\psi(\theta,y))\big] \\ 
%% &- E_{p(\theta)p(y|\xi)}\big[sp(T_\psi(\theta,y))\big],
%% \end{align*}
%
%% where $sp(z) = \log(1+e^z)$. 

\subsection{REIG as Log-Sum-Exp Stabilization}
\label{sec:logsumexp}

Step 3 of \cref{alg:reigalgo} shows that the convex dual objective function for the $\Ereige$ design criterion manifests as a
$\lambda \epsilon$-biased and 
$\lambda^{-1}$-weighted log-sum-exp combination of the
$-\DKL{p(y|\theta,\xi)}{p(y|\xi)}$ estimates.

From the bounds for log-sum-exp operators we have
%%
$$
\begin{aligned}
\lambda \epsilon -\min_i \{d_i\}
&\geq \lambda\epsilon + \lambda \log \frac{1}{N} \sum_{i=1}^N \exp(-\lambda^{-1} d_i) \\
&> \lambda \epsilon - \min_i \{d_i\} - \lambda \log N.
\end{aligned}
$$
%%
When the ambiguity set radius $\epsilon$ is smaller, the optimal $\lambda$ is larger, and $M_\epsilon$ becomes more like the sample mean; when $\epsilon$ is larger, the optimal $\lambda$ is smaller until eventually the $\lambda \geq 0$ constraint becomes binding.  In that case the limit as $\lambda \to 0$ is achieved and the value is squeezed to become $M_{\epsilon} = -\min_i\ d_i$.

We interpret these facts in the following way: $\Ereige$ tends to bias the samples in the EIG estimate more towards the smaller values of $\DKL{p(y|\theta_i,\xi)}{p(y|\xi)}$ the larger $\epsilon$ is.  A well-recognized problem in EIG estimation \citep{foster2020unified} is the presence of samples where $p(y|\xi)$ is under-estimated, leading to artificially inflated EIG estimates. 

We have until this point consider $\Ereige$ a design criterion in its own right, but this biasing behavior suggests that the use of \cref{alg:reigalgo} with an appropriate choice of $\epsilon$ can also be useful as an estimator for the original EIG criterion whose bias
protects the estimate from sampling error.  This may be a viable approach for stabilizing the computed EIG value when there are insufficient samples to converge the estimate of $p(y|\xi)$ well in the nested sampling approaches.
%We explore this approach in the experiments in the following section.

Since $\hat\EIG^{\VNMC}$ is an upper bound while $\hat\EIG^{\ACE}$ and $\hat\EIG^{\MINE}$ are lower bounds for $\EIG$, we define an estimator $\reigvnmc$ that uses VNMC as the estimator for \eqref{eq:reigdual} and the estimators $\reigace$ and $\reigmine$ that use ACE and MINE, respectively.
The minimization / maximization in each case acts counter to the bias of the sampling process, which we measure in the following section.

%We use four example to show that how ambiguity set stabilizer works on the real algorithm.

\section{Experiments} \label{sec:experiments}

We perform two types of numerical experiments.  In the first type, we test to see how well $\Ereige$ performs as a ``worst-case'' EIG estimate, as discussed in \cref{sec:reig}.  In the second type, we compare the previously developed EIG estimators from \cref{sec:estimators} with a large number of samples to $\Ereige$ with a small number of samples to see if the bias against large summands described in \cref{sec:logsumexp} is as effective as additional samples in stabilizing the computation.

\subsection{REIG as a Worst-Case Estimate}

\begin{figure}%[t]
%\vskip 0.2in
\begin{center}
\centerline{\includegraphics[width=\columnwidth]{first_plot.png}}
\caption{Preference test: EIG change with prior perturbation and the performance of $\Ereige$}
\label{fig:worst_case}
%\label{diff_10} TODO: different labels for each figure
\end{center}
%\vskip -0.2in
\end{figure}

This test uses the Preference model test case from \citep{NEURIPS2019_d55cbf21}.  The parameter $\theta$ is a location parameter with a reference prior $p(\theta)$ that is normally distributed, and the experiments are indexed by locations relative to $\theta$.

To test how well $\Ereige$ serves at computing worst-case estimate,
we select as second prior $q(\theta)$ such that $\EIG(p)$ and $\EIG(q)$ look visibly distinct over the range of possible experiments in \cref{fig:worst_case}, and then measure the KL-divergence between $p(\theta)$ and $q(\theta)$ and take this number to be $\epsilon$ (in this case, $\epsilon = 0.2$).  We then compute the $\Ereige$ criterion for all experiments as well, and plot them in \cref{fig:worst_case}.
(In all of these calculations we use a sampling estimator with a large number of samples so we can be reasonably sure that the values are converged.)
%TODOJ{converged?}
What we see is that $\Ereige(p,\xi)$ succeeds at being a lower bound for both $\EIG(p,\xi)$ and $\EIG(q,\xi)$ in this case.  We also see that this is not 
the result of a uniform or log-uniform scale reduction: $\Ereige(p,\xi)$ has more drastically reduced the gain of some experiments than others, meaning that $\Ereige(p,\xi)$ determines a different optimal $\xi^*$ than $\EIG(p,\xi)$.

%We first calculate the $\EIG$ with given prior distribution($p(\theta)$). We then perturbed the prior distribution ($\hat{p}(\theta)$) with 0.2 KL divergence from the original prior distribution and calculate the $\EIG$. With this simple experiment, we can first evaluate how the $\EIG$ changes from the prior distribution perturbation and then check the ${\Ereige}_{=0.2}$ compute the worst-case $\EIG$. 


% \begin{figure}[ht]
% \vskip 0.2in
% \begin{center}
% \centerline{\includegraphics[width=\columnwidth]{ab_test_prior_diff.png}}
% \caption{A/B test EIG change with prior in ambiguity set}
% %\label{diff_10} TODO: different labels for each figure
% \end{center}
% \vskip -0.2in
% \end{figure}

\subsection{EIG Stabilization via REIG} \label{sec:eigest}

We now test the effectiveness of $\hat\Ereige(p,\xi)$'s log-sum-exp stabilization at producing a stabilized estimation of $\EIG(p,\xi)$. We use the benchmarks designed by \citep{NEURIPS2019_d55cbf21, kleinegesse2020bayesian}, especially three experimental designs which have explicit models for the likelihood distribution: A/B test, Preference, and Pharmacokinetic model. 
% We use the A/B test benchmarks designed by \cite{NEURIPS2019_d55cbf21},
% which has an explicit likelihood model and makes it a good test for our framework.%especially three experimental designs which have explicit models for the likelihood distribution: A/B test, Preference, and Circular design. 

%We evaluate the $\Ereige$ against the baselines with perturbed prior perturbations $\EIG$ and use $\Ereige^{VNMC}$ for the enhanced upper bound and $\Ereige^{ACE}$ for the enhanced lower bound. 

\subsubsection{A/B test}
An A/B test \citep{kohavi2009controlled, box1978statistics} can be used to determine which experiment in a set of two or more results in the largest information gain.
\cite{NEURIPS2019_d55cbf21} introduces the group size selection problem between groups A and B. 
We have $n$ experiment participants, and we can select $n_A$ participants for group A and $n-n_A$ participants for group B.  Each participant is represented as two random variables: the first is measured for participants in group A, and the second for group B.  This A/B test models each participant's random variables using a Bayesian linear model, $y = X\theta + \epsilon$; the prior and likelihood distributions are Gaussian distributions.

Each of the estimators under consideration uses a neural network to generate samples that musts be trained.
The proposal distribution used by the VNMC and ACE estimators is trained with $A$ and $\Sigma_p$
as in \citep{NEURIPS2019_d55cbf21}: more details about the proposal distribution can be found there. Likewise, the parameters $\psi$ of the neural network in $\hat\EIG^{\MINE}$ are trained for each $\theta$ with samples of $y$ from the given distributions.

% \cite{NEURIPS2019_d55cbf21, foster2020unified} show that with enough samples VNMC and ACE are consistent estimators and are in expectation upper and lower bounds, respectively, for EIG. Specifically, using 10 posterior distribution samples for the marginal likelihood estimation and 500 joint distribution samples results in the estimations' biases and variances in the A/B test to be smaller than 0.01.
In our work, we start from a different reference prior distribution, with its mean taken to be $[4.46,0]$ instead of $[0,0]$. We find that the VNMC estimator shows considerably more error for some designs with a small number of samples $M=30$ (\cref{fig:abtest}, top left), and requires more samples to converge ($M\geq 100$). MINE, on the other hand, doesn't have big difference when we have enough samples (\cref{fig:abtest}, bottom left). 

%To check the prior perturbation result and the performance of $\Ereige$, we change the prior distribution $0.1$ KL divergence far from the original prior distribution $\mathcal{N}(0,\Sigma)$, especially the mean.
%In contrast to that larger number of samples, we estimate $\Ereige$ with the VNMC and ACE estimators when the number of posterior samples, M, is just 30. The estimated $\EIG$s are far from the true $\EIG$ when we compare the VNMC performance from  \cite{NEURIPS2019_d55cbf21, foster2020unified}, and this is the perturbed prior distribution $\theta \sim \mathcal{N}([4.47,0],\begin{pmatrix}
%10^2 & 0 \\
%0 & 1.82^2 
%\end{pmatrix}).$

In contrast to that larger number of samples, we compute $\reigvnmc$ and $\reigace$ estimators using only $M=30$ posterior samples and $\reigmine$ using $1000*10$ samples, with an increasing series of ambiguity set radii $\epsilon$ (\cref{fig:abtest}, right).  The estimate is almost unaffected by the ambiguity set if $\epsilon=0.001$, but increasing $\epsilon$ to $0.1$ seems to improve the estimates for the designs that were highly overestimated without negatively affecting the other experiments that were already properly estimated.
 
\begin{figure}%[ht]
%\vskip 0.2in
\begin{center}
\centerline{\includegraphics[width=\columnwidth]{ab_test.png}}
\caption{A/B test convergence for EIG estimators (left column): 100*10 samples from $p(\theta,y)$ and 30/100/1000 samples from $q_\phi(\theta|y,\xi)$ (10,000, 30,000, 50,000 samples for the MINE estimator); $\Ereige$ estimators (right column): 100 $\theta$ samples from $p(\theta)$ and 10 y samples from each $\theta_i$ and 30 posterior samples for the marginal likelihood distribution (1000 $\theta$ and $10$ y samples for the MINE estimator). }
\label{fig:abtest}
\end{center}
%\vskip -0.2in
\end{figure}

\subsubsection{Preference}
Another benchmark that we use to test the effectiveness of $\Ereige(p,\xi)$'s log-sum-exp stabilization is Preference test. The Preference experiment is designed to understand consumer behavior with a utility function \citep{samuelson1948consumption, NEURIPS2019_d55cbf21}. The experiment provides the proposal to subjects and checks their preference as an output to use in the utility function. 

We use the experiment from \citep{NEURIPS2019_d55cbf21}. % The proposal distribution used by the VNMC and ACE estimators is trained with $A$ and $\Sigma_p$ as in \citep{NEURIPS2019_d55cbf21}: more details about the proposal distribution can be found there. 
We start from different reference prior distribution using $-7.35$ as the mean instead of $0$. The KL divergence from the original prior distribution is $\epsilon = 0.2$. VNMC estimator in our test shows more error with a small number of samples M = 30 (\cref{fig:preference}, top left), while ACE estimator shows accurate estimation for the EIG with small number of marginalization samples (\cref{fig:preference}, middle left). MINE estimator, on the other hand, estimate the EIG lower than the true EIG (\cref{fig:preference}, bottom left). But the optimal experimental set-up are same with the true EIG.

We further estimate $\reigvnmc$ and $\reigace$ estimator using only M = 30 samples and MINE, with an increasing series of ambiguity set radii $\epsilon$ (\cref{fig:preference}, right). By applying ambiguity set 0.01, the $\EIG$ estimation boundary becomes tight. If we apply ambiguity set $\epsilon = 0.1$, the lower bound (ACE) becomes higher than the true $\EIG$ while the upper bound (VNMC) becomes lower than the True $\EIG$ (\cref{fig:preference}, right). The $\reigace$ and $\reigvnmc$ are no longer the corresponding upper bound and lower bound. $\reigmine$ with $\epsilon = 0.1$ increase the EIG estimation and change the optimal experimental set up (\cref{fig:preference}, bottom right). 
 
\begin{figure}%[ht]
%\vskip 0.2in
\begin{center}
\centerline{\includegraphics[width=\columnwidth]{preference.png}}
\caption{Preference test convergence for EIG estimators (left column): 100*10 samples from $p(\theta,y)$ and 30/100/1000 samples from $q_\phi(\theta|y,\xi)$ (10,000, 30,000, 50,000 samples for the MINE estimator); $\Ereige$ estimators (right column): 100 $\theta$ samples from $p(\theta)$ and 10 y samples from each $\theta_i$ and 30 posterior samples for the marginal likelihood distribution (1000 $\theta$ samples and 10 y samples for MINE estimator). }
\label{fig:preference}
\end{center}
%\vskip -0.2in
\end{figure}

%\begin{table}[t]
%\caption{Preference Mean Difference between True EIG and $\Ereige$}
%\label{sample-table}
%\vskip 0.15in
%\begin{center}
%\begin{small}
%\begin{sc}
%\begin{tabular}{lcccr}
%\toprule
%$\Ereige$& $\epsilon$ & MD \\
%\midrule
%ACE & 0.001  & 1.4650\\
%ACE& 0.01  & 1.5016\\
%ACE & 0.1  & 5.1921\\
%VNMC& 0.001 & 2.4941\\
%VNMC&0.01 & 2.2393\\
%VNMC&0.1 & 5.4278\\
%\bottomrule
%\end{tabular}
%\end{sc}
%\end{small}
%\end{center}
%\vskip -0.1in
%\end{table}

%\begin{table}[t]
%\caption{A/B Test Mean Difference between True EIG and $\Ereige$}
%\label{sample-table}
%\vskip 0.15in
%\begin{center}
%\begin{small}
%\begin{sc}
%\begin{tabular}{lcccr}
%\toprule
%$\Ereige$& $\epsilon$ & MD \\
%\midrule
%ACE & 0.001  & 2.3014\\
% ACE& 0.01  & 1.5594\\
%ACE & 0.1  & 2.0955\\
%VNMC& 0.001 & 9.6103\\
%VNMC&0.01 & 6.6774\\
%VNMC&0.1 & 3.0193\\
%\bottomrule
%\end{tabular}
%\end{sc}
%\end{small}
%\end{center}
%\vskip -0.1in
%\end{table}

%\subsubsection{Preference}
%The Preference experiment is designed to understand consumer behavior with a utility function \cite{NEURIPS2019_d55cbf21, samuelson1948consumption}. The experiment provides the proposal (e.g. price) to subject and check their preference as an output.
%
%This experiment assumes that the responses are "censored" and "corrupted". \cite{NEURIPS2019_d55cbf21} defines the utility function as follows: let $\xi \in \mathrm{R}$ and 
%\begin{gather}
%    \theta \sim \mathcal{N}(-20, 20^2), \eta|\theta,\xi \sim \mathcal{N}(\xi-\theta,1 + |\xi|^2)\\ y = f(\eta), 
%    f : \mathrm{R} \rightarrow [\epsilon, 1-\epsilon] \\
%    x \rightarrow \begin{cases}
%    \epsilon & \text{if } x\leq \text{logit}(\epsilon)\\
%    1-\epsilon,              & \text{if} x \geq \text{logit}(1 - \epsilon) \\ 
%    \frac{1}{1-e^{-x}} & \text{otherwise},
%    \end{cases}
%\end{gather}
%
%For variational posterior distribution training, $q_p$ is defined as follows:
%\begin{gather}
%    q_p(\theta|y,\xi,\phi) \sim \mathcal{N}(\mu_p,\sigma_p^2)  \\
%    \hat{\eta} = (\xi - \text{logit}(y)), \phi = (w,\sigma, \mu_0, \sigma_0, \mu_1, \sigma_1)\\
%    \mu_p = w\hat{\eta} + (1-w)\mu_\theta + \mu_0\mathbf{1}_{\{y=\epsilon\}} + \mu_1\mathbf{1}_{\{y=1-\epsilon\}} \\
%    \sigma_p^2 = \sigma^2 + \sigma_0^2\mathbf{1}_{\{y=\epsilon\}} + \sigma_1^2\mathbf{1}_{\{y=1-\epsilon\}}
%\end{gather}
%
%In \cite{NEURIPS2019_d55cbf21}, VNMC method calculates the $\EIG$ in a scenario, where $M=50$. The bias and the variance of VNMC calculation compared to the true $\EIG$ are smaller than 0.01.
%
%We change the prior distribution 0.2 KL divergence from the original prior distribution and estimate $\Ereige$ with VNMC and ACE. The perturbed prior distribution for checking the performance of $\Ereige$ is $\theta \sim \mathcal{N}(-7.35,20^2)$.
%
%$\EIG$ estimations with $M=30$ with using perturbed prior distribution are far from the true $\EIG$ when we compare  the VNMC performance from \cite{NEURIPS2019_d55cbf21}. By applying ambiguity set 0.01, the $\EIG$ estimation boundary becomes tight. If we apply the ambiguity set with 0.1, the $\Ereige^{VNMC}$ loses the function of upper bound and $\Ereige^{ACE}$ loses the function of lower bound.
%
%\begin{figure}[ht]
%\vskip 0.2in
%\begin{center}
%\centerline{\includegraphics[width=\columnwidth]{preference.png}}
%\caption{Preference test convergence(left column): 100*10 samples from $p(\theta,y)$ and 30/100/1000 samples from $q_\phi(\theta|y,\xi)$, and Improving estimation with $\Ereige$(right column): 100 $\theta$ samples from $p(\theta)$ and 10 y samples from each $\theta_i$ and 30 posterior samples for the marginal likelihood distribution. }
%\label{preference}
%\end{center}
%\vskip -0.2in
%\end{figure}
%
%%\begin{table}[t]
%%\caption{Preference Mean Difference between True EIG and $\Ereige$}
%%\label{sample-table}
%%\vskip 0.15in
%%\begin{center}
%%\begin{small}
%%\begin{sc}
%%\begin{tabular}{lcccr}
%%\toprule
%%$\Ereige$& $\epsilon$ & MD \\
%%\midrule
%%ACE & 0.001  & 1.4650\\
%%ACE& 0.01  & 1.5016\\
%%ACE & 0.1  & 5.1921\\
%%VNMC& 0.001 & 2.4941\\
%%VNMC&0.01 & 2.2393\\
%%VNMC&0.1 & 5.4278\\
%%\bottomrule
%%\end{tabular}
%%\end{sc}
%%\end{small}
%%\end{center}
%%\vskip -0.1in
%%\end{table}
%
%\subsubsection{Linear model with circular designs}
%Similar to A/B test, Circular design is applicable to experimental design set up as well. \cite{NEURIPS2019_d55cbf21} fixes the 10 prefixed design and calculates the $\EIG$ for each set up.
%
%This model also uses the Bayesian linear model $y = X\theta + \epsilon$. prior distribution for $\theta$ and likelihood distribution for $y$ are defined as follow:
%\begin{gather}
%\theta \sim \mathcal{N}([0,0],\begin{pmatrix}
%10^2 & 0 \\
%0 & 2^2 
%\end{pmatrix}), y|\theta,\xi \sim \mathcal{N}(X_\xi\theta,I)\\ 
%q_p(\theta|y,\xi,\phi) \sim \mathcal{N}(Ay, \Sigma_p), \phi = (A, \Sigma_p)    
%\end{gather}
%We change the prior distribution 0.1 KL divergence from the original prior distribution and analyze the performance of $\Ereige^{VNMC}$ and $\Ereige^{ACE}$. The perturbed prior distribution is 
%$\theta \sim \mathcal{N}([0,0],\begin{pmatrix}
%10^2 & 0 \\
%0 & 2^2 
%\end{pmatrix})$
%
%The results continuously show that the $\Ereige$ can improve the VNMC and ACE result.
%
%\begin{figure}[ht]
%\vskip 0.2in
%\begin{center}
%\centerline{\includegraphics[width=\columnwidth]{circle.png}}
%\caption{Circular design test convergence(left column): 100*10 samples from $p(\theta,y)$ and 30/100/1000 samples from $q_\phi(\theta|y,\xi)$, and Improving estimation with $\Ereige$(right column): 100 $\theta$ samples from $p(\theta)$ and 10 y saples from each $\theta_i$ and 30 posterior samples for the marginal likelihood distribution. }
%\label{diff_10}
%\end{center}
%\vskip -0.2in
%\end{figure}
%
%%\begin{table}[t]
%%\caption{Circular Design Mean Difference between True EIG and $\Ereige$}
%%\label{sample-table}
%%\vskip 0.15in
%%\begin{center}
%%\begin{small}
%%\begin{sc}
%%\begin{tabular}{lcccr}
%%\toprule
%%$\Ereige$& $\epsilon$ & MD \\
%%\midrule
%%ACE & 0.001  & 1.0951\\
%%ACE& 0.01  & 0.8875\\
%%ACE & 0.1  & 1.8627\\
%%VNMC& 0.001 & 2.7821\\
%%VNMC&0.01 & 1.6637\\
%%VNMC&0.1 & 1.6924\\
%%\bottomrule
%%\end{tabular}
%%\end{sc}
%%\end{small}
%%\end{center}
%%\vskip -0.1in
%%\end{table}
\subsubsection{Pharmacokinetic model}
The last benchmark that we use to test the effectiveness of $\Ereige(p,\xi)$ is Pharmacokinetic study. The Pharamcokinetic study is designed to understand how the medicine is absorbed, distributed and eliminated in the subject's body, which we can understand with the compartmental model. We can estimate the compartment model's parameter by blood sampling and we want to calculate the optimal blood sampling time. \citep{ryan2014towards, kleinegesse2020bayesian} 

We start from a different reference prior distribution with its mean taken to be $[0.1, \log 0.1 \log 20]$ whose KL divergence from the original prior distribution is 0.1. 
%
The accuracy of the estimators $\hat\EIG^{\VNMC}$ and $\hat\EIG^{\ACE}$ in our tests have increases with M (\cref{fig:pkmodel}, top left, middle left) while $\hat\EIG^{\MINE}$, on the other hand, does converge to a substantial underestimate (\cref{fig:pkmodel}, bottom left), yet identifies the same optimal experiment.

We further compute $\reigvnmc$ and $\reigace$ estimators using only M = 30 samples and $\reigmine$, with an increasing series of ambiguity set radii $\epsilon$ (\cref{fig:pkmodel}, right). By applying ambiguity set 0.1, the lower bound (ACE) becomes higher than the true $\EIG$ while the upper bound (VNMC) becomes lower than the True $\EIG$ (\cref{fig:pkmodel}, right). The $\reigace$ and $\reigvnmc$ are no longer the corresponding upper bound and lower bound. MINE estimator with ambiguity set $\epsilon = 0.1$, on the other hand, improve the EIG estimation so that the estimation is close to the true EIG (\cref{fig:pkmodel}, bottom right). 

\begin{figure}%[ht]
%\vskip 0.2in
\begin{center}
\centerline{\includegraphics[width=\columnwidth]{pk.png}}
\caption{Pharmacokinetic model convergence for EIG estimators (left column): 100*10 samples from $p(\theta,y)$ and 30/100/1000 samples from $q_\phi(\theta|y,\xi)$ (10,000, 30,000, 50,000 samples for the MINE estimator); $\Ereige$ estimators (right column): 100 $\theta$ samples from $p(\theta)$ and 10 y samples from each $\theta_i$ and 30 posterior samples for the marginal likelihood distribution (1000 $\theta$ samples and 10 y samples for MINE estimator). }
\label{fig:pkmodel}
\end{center}
%\vskip -0.2in
\end{figure}

\section{Discussion}

This work presents an introduction and initial numerical experiments testing the use of a robust modification of expected information gain as a criterion for Bayesian model selection.
The $\Ereige$ estimator is designed to both have rigorously defined approximation properties (as the minimization of a tangent approximation to the EIG over a convex ambiguity set), and to yield a practical algorithm in practice (\cref{alg:reigalgo}, which post-processes the samples of previously defined estimators by the solution of a 1D convex optimization problem). 
%Although we did not explore taking derivatives of $\Ereige$ criterion in this work, the fact that the optimal value of $\lambda$ is defined through a low-dimensional convex program suggests that the implicit gradient of $\Ereige$ with respect to the parameter $\xi$ may be available through differential convex programming \cite{diamond2016cvxpy}.

While our initial results are promising, our understanding of how to apply this method is not complete at this time.  Most of the remaining questions relate to the choice of the radius $\epsilon$ of the ambiguity set over which the approximated EIG is taken to be robust.  A definite a priori estimate for $\epsilon$ seems unlikely in most cases.

In an approach analogous to Morozov's discrepancy principle, a logical choice of $\epsilon$ would be one such that the implied radius of the ambiguity set is on the same order as the error that the sampling based estimator (such as VNMC or ACE) has introduced into the problem.  The fact that these two estimators provide (in expectation) an upper and lower bound for the true EIG of a design suggests that there may be a way to combine the two estimators into an a posteriori estimate of the proper choice of $\epsilon$.  Further investigation into this topic is the subject of future research.

% MINE can use $\Ereige$ by applying the ambiguity set near the prior distribution as we apply the ambiguity set at the VNMC or ACE. 
% MINE estimate the lower bound of EIG. Both expectations are calculated with respect to the prior distribution. By substituting the prior distribution with $q(\theta)$ in the ambiguity set, we can get the following equations. 
% \[
% \begin{aligned}
% \Ereige (p,\xi) &\geq \min_{q\in \mathcal{A}_{\epsilon,p(\theta)}}\mathbb{E}_{q(\theta)}\big[\mathbb{E}_{p(y|\theta,\xi)}\big[T_\psi(\theta,y)\big] \\
% &- \mathbb{E}_{p(y|\xi)}\big[\exp(T_\psi(\theta,y)-1)\big]\big] \\
% &= - \inf_{\lambda\geq0} \lambda\epsilon + \lambda\log \mathbb{E}_{p(\theta)}\exp(-\frac{1}{\lambda}Z(\theta,\xi)),
% \end{aligned}
% \]
% where $Z(\theta,\xi)$ is
% $\mathbb{E}_{p(y|\theta,\xi)}\big[T_\psi(\theta,y)\big] - \mathbb{E}_{p(y|\xi)}\big[\exp(T_\psi(\theta,y)-1)\big]$.

%When we use the MINE, If we don't use the enough samples for training the data, it becomes no longer the tight lower bound. If we add enough samples we can get the tight lower bound. So we use enough samples for the MINE.


% keywords: Bayesian Experimental Design, Ambiguity set, Robustness, Uncertainty Quantification, Convex optimization, Risk averse, Bayesian Inference,

% TL;DR
% "Too Long; Didn't Read": a short sentence describing your paper
% We implement a robust variant of expected information gain (minimized in a small KL-divergence neighborhood) as a simple post-processing of existing estimators and test it on standard benchmarks in Bayesian experimental design.
% We use ambiguity set to make the Bayesian experimental design robust as well as stable with 1-d convex optimization.



% \subsubsection{Linear model with Circular designs}
% Similar to the A/B test, Circular design is applicable to experimental design set up as well. For example, \citep{NEURIPS2019_d55cbf21} fixes the 10 prefixed design and calculates the $\EIG$ for each set up to show how the VNMC works. This model also uses the Bayesian linear model $y = X\theta + \epsilon$, and they define prior distribution for $\theta$ and likelihood distribution for $y$ as follow:
% \begin{gather}
% \theta \sim \mathcal{N}([4.47,0],\begin{pmatrix}
% 10^2 & 0 \\
% 0 & 2^2 
% \end{pmatrix}), y|\theta,\xi \sim \mathcal{N}(X_\xi\theta,I)
% \end{gather}

% Likewise, we change the prior distribution by 0.1 KL divergence from the original prior distribution and analyze the performance of $\Ereige^{VNMC}$ and $\Ereige^{ACE}$. We change the reference prior distribution from
% $\theta \sim \mathcal{N}([0,0],\begin{pmatrix}
% 10^2 & 0 \\
% 0 & 2^2 
% \end{pmatrix})$

% In this case, $\Ereige^{VNMC}$ converges to the true $\EIG$ efficiently with limited amount of samples (\cref{fig:circle}, top right). $\Ereige^{ACE}$, on the other hand, is higher than the true $\EIG$ (\cref{fig:circle}, bottom right), which is because ACE estimate the $\EIG$ accurately (\cref{fig:circle}, bottom left).

% \begin{figure}[ht]
% \vskip 0.2in
% \begin{center}
% \centerline{\includegraphics[width=\columnwidth]{circle.png}}
% \caption{Circular design test convergence(left column): 100*10 samples from $p(\theta,y)$ and 30/100/1000 samples from $q_\phi(\theta|y,\xi)$, and Improving estimation with $\Ereige$(right column): 100 $\theta$ samples from $p(\theta)$ and 10 y saples from each $\theta_i$ and 30 posterior samples for the marginal likelihood distribution. }
% \label{fig:circle}
% \end{center}
% \vskip -0.2in
% \end{figure}





%\subsection{How to choose ambiguity set size:$\epsilon$}
%Finding adequate $\epsilon$ is the future work. We check that the adequate $\epsilon$ improve the lower bound and the upper bound. But if the $\epsilon$ is too big, it make the lower bound bigger than True $\EIG$ and the upper bound smaller than Ture $\EIG$. One promising way to estimate the adequate $\epsilon$ is VNMC - ACE. From the difference, we somehow estimate the distance between the prior distributions and use it to update $\epsilon$.
%
%\subsection{Two stages in Bayesian Experimental Design}
%BED is computationally expensive because it requires two stage optimization. The first stage is to find the parameter($\phi$) to estimate $\EIG$ with the current $\xi$. The second stage calculates the optimal $\xi$ with current $\phi$. Because of that, optimization takes long to find the optimal $\xi$.
%One possible way to handle this two stage problem is CVXPY\cite{diamond2016cvxpy}. CVXPY helps us to tape from finding $\Ereige$ to optimizing $\xi$ and changes the BED optimization problem to one stage problem. How to apply the CVXPY at $\Ereige$ is another future work for $\Ereige$. 
%
%\subsection{MINEBED}
%We formularize how to set up the ambiguity set on the MINEBED. Since $\Ereige$ works as a damping sample, regularizer for the smaples, it can improve the limitations of MINE \cite{song2019understanding, belghazi2018mutual} such as self-consistency issue(overflow). Implementing the MINEBED with $\Ereige$ is another future work.
%



% \subsection{Convexity}

% If we consider the prior distribution on the marginal likelihood distribution, the problem loses the convexity, and we can no longer use the convex optimization method.
% We can parameterize the prior distribution or the joint distribution and update the parameter and use it to calculate the marginal likelihood distribution like EM algorithm.

% \subsection{Affine operator}
% We can change the ACE to upper bound and VNMC to lower bound with adequate $\epsilon$. 


% \section{Proof}
% \subsection{q}
% \begin{gather*}
% \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta})d\hat{\theta}}p(y|\theta,\xi) dy (q(\theta) + \delta q (\theta)) d\theta \\
% - \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta})d\hat{\theta}}p(y|\theta,\xi) dy q(\theta)d\theta \\
% = \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta})d\hat{\theta}}p(y|\theta,\xi) dy \delta q(\theta)d\theta = A
% \end{gather*}

% \begin{gather*}
% \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) (q(\hat{\theta})+\delta q(\hat{\theta}))d\hat{\theta}}p(y|\theta,\xi) dy (q(\theta) + \delta q (\theta)) d\theta \\
% - \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) q(\hat{\theta})d\hat{\theta}}p(y|\theta,\xi) dy q(\theta)d\theta \\
% = \int_\theta\int_y p(y|\theta,\xi) \log p(y|\theta,\xi)   dy \delta q(\theta)d\theta + \\
% \int_\theta\int_y \log\big[\frac{\int_{\hat{\theta}}p(y|\xi,\hat{\theta})q(\hat{\theta})d\hat{\theta}}{ \int_{\hat{\theta}} p(y|\hat{\theta},\xi) (q(\hat{\theta})+\delta q(\hat{\theta}))d\hat{\theta}}\big]p(y|\theta,\xi) dy q (\theta) d\theta + \\
% \int_\theta\int_y -\log\big[ \int_{\hat{\theta}} p(y|\hat{\theta},\xi) (q(\hat{\theta}) + \delta q(\hat{\theta}))d\hat{\theta}\big]p(y|\theta,\xi) dy \delta q(\theta) d\theta = B
% \end{gather*}

% \begin{gather*}
% A-B = \int_\theta\int_y \log\big[\frac{\int_{\hat{\theta}}p(y|\xi,\hat{\theta})q(\hat{\theta})d\hat{\theta}}{ \int_{\hat{\theta}} p(y|\hat{\theta},\xi) (q(\hat{\theta})+\delta q(\hat{\theta}))d\hat{\theta}}\big]p(y|\theta,\xi) dy q (\theta) d\theta  \\
% + \int_\theta\int_y \log\big[ \frac{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) (q(\hat{\theta}) + \delta q(\hat{\theta}))d\hat{\theta}}{ \int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta})d\hat{\theta}}\big]p(y|\theta,\xi) dy \delta q(\theta) d\theta \\
% \end{gather*}
% \subsection{p}
% \begin{gather*}
% \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta})d\hat{\theta}}p(y|\theta,\xi) dy (p(\theta) + \delta p (\theta)) d\theta \\
% - \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta})d\hat{\theta}}p(y|\theta,\xi) dy p(\theta)d\theta \\
% = \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta})d\hat{\theta}}p(y|\theta,\xi) dy \delta p(\theta)d\theta = A
% \end{gather*}

% \begin{gather*}
% \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) (p(\hat{\theta})+\delta p(\hat{\theta}))d\hat{\theta}}p(y|\theta,\xi) dy (p(\theta) + \delta p (\theta)) d\theta \\
% - \int_\theta\int_y \log \frac{p(y|\theta,\xi)}{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta})d\hat{\theta}}p(y|\theta,\xi) dy p(\theta)d\theta \\
% = \int_\theta\int_y p(y|\theta,\xi) \log p(y|\theta,\xi)   dy \delta p(\theta)d\theta + \\
% \int_\theta\int_y \log\big[\frac{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta}))d\hat{\theta}}{ \int_{\hat{\theta}} p(y|\hat{\theta},\xi) (p(\hat{\theta})+\delta p(\hat{\theta}))d\hat{\theta}}\big]p(y|\theta,\xi) dy p(\theta) d\theta + \\
% \int_\theta\int_y -\log\big[ \int_{\hat{\theta}} p(y|\hat{\theta},\xi) (p(\hat{\theta}) + \delta p(\hat{\theta}))d\hat{\theta}\big]p(y|\theta,\xi) dy \delta p(\theta) d\theta = B
% \end{gather*}

% \begin{gather*}
% A-B = \int_\theta\int_y \log\big[\frac{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta}))d\hat{\theta}}{ \int_{\hat{\theta}} p(y|\hat{\theta},\xi) (p(\hat{\theta})+\delta p(\hat{\theta}))d\hat{\theta}}\big]p(y|\theta,\xi) dy p(\theta) d\theta \\
% + \int_\theta\int_y \log\big[ \frac{\int_{\hat{\theta}} p(y|\hat{\theta},\xi) (p(\hat{\theta}) + \delta p(\hat{\theta}))d\hat{\theta}}{ \int_{\hat{\theta}} p(y|\hat{\theta},\xi) p(\hat{\theta}) d\hat{\theta}}\big]p(y|\theta,\xi) dy \delta q(\theta) d\theta \\
% \end{gather*}

% \section{VNMC-ACE}

%\begin{gather*}
%VNMC - ACE &= \mathbb{E}_{p(y|\xi)}KL(P(\theta_{0:L}|y)||\Pi_{l=0}^Lq_\phi(\theta_l|y)) \\ &+ \mathbb{E}_{p(y|\xi)}KL(\Pi_{l=1}^Lq_\phi(\theta_l|y)||P(\theta_{1:L}|y))
%\end{gather*}

\begin{acknowledgements} % will be removed in pdf for initial submission,
                         % so you can already fill it to test with the
                         % ‘accepted’ class option
    We would like to thank Alexander Shapiro for several stimulating discussions on ambiguity set.
\end{acknowledgements}

\bibliography{go_602}

\appendix
% NOTE: necessary when ptmx or no mathfont class option is given
\section{Experiments Details}
\subsection{Implementation}
All experiments were implemented in PyTorch
1.4.0 \citep{paszke2019pytorch} and Pyro 0.3.4 \citep{bingham2018pyro}.

\subsection{A/B test}
The reference prior and likelihood for the A/B test were as follows:
%
\begin{gather}
\theta \sim \mathcal{N}(\begin{pmatrix} 0 \\ 0
\end{pmatrix}
,\begin{pmatrix}
10^2 & 0 \\
0 & 1.82^2 
\end{pmatrix}), y|\theta,\xi \sim \mathcal{N}(X_\xi\theta,I)%\\ 
%q_p(\theta|y,\xi,\phi) \sim \mathcal{N}(Ay, \Sigma_p), \ \phi = (A, \Sigma_p).
\end{gather}

\subsection{Preference}
We use the utility function from \citep{NEURIPS2019_d55cbf21}.
\begin{gather}
    \text{let \ }\xi \in \mathrm{R} \\
    \theta \sim \mathcal{N}(-7.35, 20^2) \\ \eta|\theta,\xi \sim \mathcal{N}(\xi-\theta,1 + |\xi|^2)\\ y = f(\eta) \\ 
    f : \mathrm{R} \rightarrow [\epsilon, 1-\epsilon] \\
    x \rightarrow \begin{cases}
    \epsilon & \text{if } x\leq \text{logit}(\epsilon)\\
    1-\epsilon,              & \text{if} x \geq \text{logit}(1 - \epsilon) \\ 
    \frac{1}{1-e^{-x}} & \text{otherwise},
    \end{cases}
\end{gather}

\subsection{Pharmacokinetic model}
The prior distribution and noise distributions for the pharmacokinetic model are defined as follows,
% %https://stats.stackexchange.com/questions/289323/kl-divergence-of-multivariate-lognormal-distributions 
% % (s$[\log 1, \log 0.1 \log 20]$

% VNMC, and ACE use the posterior guide from $y_i$ to $\theta_i$ and $\sigma_i$. 

% We use given set-up from MINE repository.(0.001 for the learning rate, 10 hidden layers)
\begin{gather}
    \begin{pmatrix} k_a \\ k_e \\ V
    \end{pmatrix}
     \sim 
    \log \mathcal{N} \begin{bmatrix}\begin{pmatrix}
    0.1 \\ \log0.1 \\ \log20
    \end{pmatrix}, \begin{pmatrix}
0.05 & 0 & 0 \\
0 & 0.05 & 0 \\
0 & 0 & 0.05
\end{pmatrix}\end{bmatrix} \\ 
y_t = \frac{400}{V}\frac{k_a}{k_a - k_e}(\exp{-k_et}-\exp{-k_at})(1 + \epsilon_{1t}) + \epsilon_{2t},
\end{gather}
where $\epsilon_{1t} \sim \mathcal{N}(0,0.01), \epsilon_{2t} \sim \mathcal{N}(0,0.1)$, and $k_a > k_e$.

\clearpage
% \section{Introduction}\label{sec:intro}
% UAI 2022 papers have to be prepared using \LaTeX.
% To start writing your paper, copy \texttt{uai2022-template.tex} and replace title, authorship, and content with your own.

% The UAI 2022 paper style is based on a custom \textsf{uai2022} class.
% The class file sets the page geometry and visual style.\footnote{%
%     The class uses the packages \textsf{adjustbox}, \textsf{environ}, \textsf{letltxmacro}, \textsf{geometry}, \textsf{footmisc}, \textsf{caption}, \textsf{textcase}, \textsf{titlesec}, \textsf{titling}, \textsf{authblk}, \textsf{enumitem}, \textsf{microtype}, \textsf{lastpage}, and \textsf{kvoptions}.
% }
% The class file also loads basic text fonts.\footnote{%
%     Fonts loaded are \textsf{times} (roman), \textsf{helvet} (sanserif), \textsf{courier} (fixed-width), and \textsf{textcomp} (common symbols).
% }
% \emph{You may not modify the geometry or style in any way, for example, to squeeze out a little bit of extra space.}
% (Also do not use \verb|\vspace| for this.)
% Feel free to use convenience functionality of loaded packages such as \textsf{enumitem}.
% The class enables hyperlinking by loading the \textsf{hyperref} package.

% You are free to load any packages available in \TeX{Live}~2020 that are compatible with the UAI class.\footnote{In case this template or your submission does not compile, always first make sure your \TeX\ installation is up-to-date.}
% (Mik\TeX{} and Mac\TeX{} generally contain the same packages.)
% Do not load conflicting packages—you will get an error message—, as this complicates creating the proceedings.
% Please avoid using obsolete commands, such as \verb|\rm|, and obsolete packages, such as \textsf{epsfig}.\footnote{%
%     See \url{https://ctan.org/pkg/l2tabu}.
% }

% \swap[ ]{in the header of your source file.}{Feel free to include your own macros}

% \section{General Formatting Instructions}
% As a general rule: \emph{follow the template}.

% \subsection{Authorship}
% Reviewing is double-blind.
% However, you can already fill in your author names and affiliations in the \verb|\author| block in the preamble following the example of the template because the class will remove it as long as the option \textsf{accepted} is not passed to the class.
% Nevertheless, make sure any other information in the paper does not disclose your identity, for example URLs to supplementary material.

% \subsection{Sectioning}
% Three numbered sectioning commands are provided: \verb|\section|, \verb|\subsection|, and \verb|\subsubsection|.
% Please respect their order, so do not put a \verb|\subsubsection| directly beneath a \verb|\section|.
% One unnumbered sectioning command is provided, \verb|\paragraph|.
% It can be used directly below any numbered section level.
% Do not use any other sectioning commands.

% \subsubsection{Typing the Section Titles}
% The \verb|\section| and \verb|\subsection| titles are uppercased by the class.
% Please type them in title case.
% (This is used in the PDF bookmarks.)
% Please also write the \verb|\subsubsection| titles in title case.

% \paragraph{What is title case?}
% \href{https://en.wikipedia.org/wiki/Title_case}{Wikipedia} explains:
% \begin{quote}
%     Title case or headline case is a style of capitalization used for rendering the titles of published works or works of art in English.
%     When using title case, all words are capitalized except for ‘minor’ words (typically articles, short prepositions, and some conjunctions) unless they are the first or last word of the title.
% \end{quote}

% \subsection{References, Citations, Footnotes}\label{sec:etc}
% \subsubsection{Cross-Referencing}
% Always use \verb|\label| and \verb|\ref|—or a command with a similar effect—when cross-referencing.
% For example, this subsection is Section~\ref{sec:etc}.

% \subsubsection{Citations}
% Citations should include the author's last name and year.
% They should be part of the sentence.
% An example parenthetical citation: “Good introductions to the topic are available \citep{latexcompanion}.”
% An example textual citation: “\citet{einstein} discusses electrodynamics of moving bodies.”
% Do not use a parenthetical citation where a textual one is appropriate.
% An example of what \emph{not} to do: “\citep{einstein} discusses electrodynamics of moving bodies.”

% We strongly advise to use reference list software such as Bib\TeX{} and a citation package such as \textsf{natbib}.
% The reference style you use should be compatible with the author-year citations.
% Both the citation style and reference style used should be consistent.

% For the original submission, take care not to reveal the authors' identity through the manner in which one's own previous work is cited.
% For example, writing
% “I discussed electrodynamics of moving bodies before \citep{einstein}.” would be inappropriate, as it reveals the author's identity.
% Instead, write “\citet{einstein} discussed electrodynamics of moving bodies.”

% \subsubsection{Footnotes}
% You can include footnotes in your text.\footnote{
%     Use footnotes sparingly, as they can be distracting, having readers skip back and forth between the main text and the foot of the page.
% }
% The footnote mark should follow the fragment to which it refers, so a footnote\footnote{
%     A footnote is material put at the foot of a page.
% }
% for a word has a footnote mark attached to that word and a footnote for a phrase or sentence has a footnote mark attached to the closing punctuation.

% \section{Math}\label{sec:math}
% The class file does not load any math support package like \textsf{amsmath}\footnote{%
%   See the \textsf{amsmath} documentation at \url{https://ctan.org/pkg/amsmath} for further details.
% }.
% We advise using the \textsf{mathtools}\footnote{%
%   See the \textsf{mathtools} documentation at \url{https://ctan.org/pkg/mathtools} for further details.
% }
% package, which extends \textsf{amsmath} with fixes and even more useful commands.
% Feel free to load other support packages for symbols, theorems, etc.

% Use the \textsf{amsmath} environments for displayed equations.
% So, specifically, use the \texttt{equation} environment instead of \verb|$$...$$| and the \texttt{align} environment instead of \texttt{eqnarray}.\footnote{For reasons why you should not use the obsolete \texttt{eqnarray} environment, see Lars Madsen, \textit{Avoid eqnarray!} TUGboat 33(1):21--25, 2012.}
% An \texttt{equation}:
% \begin{equation}\label{eq:example}
%   0 = 1 - 1.
% \end{equation}
% Two \texttt{align}'ed equations:
% \begin{align*} % no numbers with starred version
%   1 + 2 &= 3,\\
%   1 - 2 &= -1.
% \end{align*}
% Equations can also be put inline, of course.
% For example, Equation~\eqref{eq:example}: \(0=1+1\). % $0=1+1$ also works
% (Notice that both inline and displayed math are part of the sentence, so punctuation should be added to displayed math.)

% The \textsf{amsmath} and \textsf{mathtools} packages provide a lot of nice functionality, such as many common math operators, e.g., \(\sin\) and \(\max\), and also commands for defining new ones.

% \section{Floats}\label{sec:floats}
% Floats, such as figures, tables and algorithms, are moving objects and are supposed to float to the nearest convenient location.
% Please do not force them to go in the middle of a paragraph.
% They must respect the column width.

% Two-column floats are possible.
% They appear at the top of the next page, so strategic placement may be necessary.
% For an example, see Figure~\ref{fig:tikz}.
% They may not enter the margins.
% \begin{figure*}
%     \centering
%     \begin{tikzpicture}[xscale=1.5]
%         \coordinate (origin);
%         \draw[->] (origin) -- +(1cm,0) node[below] {$x$};
%         \draw[->] (origin) -- +(0,1cm) node[left] {$y$};
%         \fill[gray] (45:1cm) circle[radius=.2cm];
%     \end{tikzpicture}
%     \caption{A Nice Filled Ellipse with a Pair of Coordinate Axes.}\label{fig:tikz}
% \end{figure*}

% All material in floats should be legible and of good quality.
% So avoid very small or large text and pixelated or fuzzy lines.

% \subsection{Figures}\label{sec:figures}
% Figures should go in the \texttt{figure} environment and be centered therein.
% The caption should go below the figure.
% Use \verb|\includegraphics| for external graphics files but omit the file extension.
% Supported formats are \textsf{pdf} (preferred for vector drawings and diagrams), \textsf{png} (preferred for screenshots), and \textsf{jpeg} (preferred for photographs).
% Do not use \verb|\epsfig| or \verb|\psfig|.
% If you want to scale the image, it is better to use a fraction of the line width rather than an explicit length.
% For example, see Figure~\ref{fig:Eindhoven}.
% \begin{figure}
%   \centering
%   \includegraphics[width=0.7\linewidth,page=3]{Eindhoven}
%   \caption{A View of a Nice City.}\label{fig:Eindhoven}
% \end{figure}

% Do not use \verb|\graphicspath|.
% If the images are contained in a subdirectory, specify this when you include the image, for example \verb|\includegraphics{figures/mypic}|.

% \subsection{Tables}\label{sec:tables}
% Tables should go in the \texttt{table} environment and be centered therein.
% The caption should go above the table and be in title caps.
% For an example, see Table~\ref{tab:data}.
% \begin{table}
%     \centering
%     \caption{An Interesting Table.}\label{tab:data}
%     \begin{tabular}{rl}
%       \toprule % from booktabs package
%       \bfseries Dataset & \bfseries Result\\
%       \midrule % from booktabs package
%       Data1 & 0.12345\\
%       Data2 & 0.67890\\
%       Data3 & 0.54321\\
%       Data4 & 0.09876\\
%       \bottomrule % from booktabs package
%     \end{tabular}
% \end{table}

% \subsection{Algorithms}\label{sec:algorithms}
% You can load your favorite algorithm package, such as \textsf{algorithm2e}\footnote{See the \textsf{algorithm2e} documentation at \url{https://ctan.org/pkg/algorithm2e}.}.
% Use the environment defined in the package to create a centered float with an algorithm inside.

% \section{Back Matter}
% There are a some final, special sections that come at the back of the paper, in the following order:
% \begin{itemize}
%   \item Author Contributions
%   \item Acknowledgements
%   \item References
% \end{itemize}
% They all use an unnumbered \verb|\subsubsection|.

% For the first two special environments are provided.
% (These sections are automatically removed for the anonymous submission version of your paper.)
% The third is the ‘References’ section.
% (See below.)

% (This ‘Back Matter’ section itself should not be included in your paper.)

% \begin{contributions} % will be removed in pdf for initial submission,
%                       % so you can already fill it to test with the
%                       % ‘accepted’ class option
%     Briefly list author contributions.
%     This is a nice way of making clear who did what and to give proper credit.

%     H.~Q.~Bovik conceived the idea and wrote the paper.
%     Coauthor One created the code.
%     Coauthor Two created the figures.
% \end{contributions}

% \begin{acknowledgements} % will be removed in pdf for initial submission,
%                          % so you can already fill it to test with the
%                          % ‘accepted’ class option
%     Briefly acknowledge people and organizations here.

%     \emph{All} acknowledgements go in this section.
% \end{acknowledgements}

% %\bibliography{uai2022-template}

% \appendix
% % NOTE: necessary when ptmx or no mathfont class option is given
% \providecommand{\upGamma}{\Gamma}
% \providecommand{\uppi}{\pi}
% \section{Math font exposition}
% How math looks in equations is important:
% \begin{equation*}
%   F_{\alpha,\beta}^\eta(z) = \upGamma(\tfrac{3}{2}) \prod_{\ell=1}^\infty\eta \frac{z^\ell}{\ell} + \frac{1}{2\uppi}\int_{-\infty}^z\alpha \sum_{k=1}^\infty x^{\beta k}\mathrm{d}x.
% \end{equation*}
% However, one should not ignore how well math mixes with text:
% The frobble function \(f\) transforms zabbies \(z\) into yannies \(y\).
% It is a polynomial \(f(z)=\alpha z + \beta z^2\), where \(-n<\alpha<\beta/n\leq\gamma\), with \(\gamma\) a positive real number.

\end{document}
