The supplementary material is organized as follows:
\begin{itemize}
    \item In \Cref{ap:proofs}, we provide proofs of all our mathematical results.
    \item In \Cref{ap:experiments}, we provide details of the illustrative examples presented in \Cref{sec:experiments}.
\end{itemize}

\section{PROOFS OF MATHEMATICAL RESULTS}\label{ap:proofs}
    \subsection{DEFINITIONS}\label{ap:definitions}
        \begin{itemize}
            \item $\kld{\truedist{}}{\prior{}}$ is the Kullback-Leibler divergence from $\truedist{}$ to $\prior{}$:
            $$\kld{\truedist{}}{\prior{}} = \int_\Y \log{\left( \frac{q(\y)}{p(\y)} \right)} ~ q(\y) ~ d\y$$
            \item $\entropy{\truedist{} ~ \vert \vert ~ \prior{}}$ is the cross-entropy from $\truedist{}$ to $\prior{}$:
            $$\entropy{\truedist{} ~ \vert \vert ~ \prior{}} = - \int_\Y \log{\left( p(\y) \right)} ~ q(\y) ~ d\y$$
            \item $\entropy{\truedist{}}$ is the entropy of distribution $\truedist{}$:
            $$\entropy{\truedist{}} = - \int_\Y \log{\left( q(\y) \right)} ~ q(\y) ~ d\y$$
        \end{itemize}

    \subsection{DERIVATION OF PROPOSITION \ref{eq:atig-unpacked}}\label{ap:atig-derivation}
        \begin{align}
            \unknowneq{}(\x) &= \E{\y \sim \trueYdist}{\log{\left( \frac{p(\y \vert \x, \thetavar^\star)}{p(\y \vert \x)} \right)}} & \nonumber \\
            &= \entropy{\trueYdist ~ \vert \vert ~ \prior{\YRV \vert \x}} - \entropy{\trueYdist ~ \vert \vert ~ \prior{\YRV \vert \x, \thetavar^\star}} \nonumber \\
            &= \entropy{\trueYdist} + \kld{\trueYdist}{\prior{\YRV \vert \x}} - \entropy{\trueYdist} - \kld{\trueYdist}{\prior{\YRV \vert \x, \thetavar^\star}} \nonumber \\
            &= \kld{\trueYdist}{\prior{\YRV \vert \x}} - \kld{\trueYdist}{\prior{\YRV \vert \x, \thetavar^\star}} \nonumber
        \end{align}

    \subsection{PROOF OF THEOREM \ref{thm:upper-bound}}\label{ap:upper-bound}
        The proof of \Cref{thm:upper-bound} depends on \Cref{lem:sublinearity}, which upper bounds $\misspecificationMarginal$ as a function of $\misspecification$.
        \begin{lemma}[Upper bound of $\misspecificationMarginal{}$.]\label{lem:sublinearity}
            Given $(\thetavar^\star, Q_{\PsiRV})$ and $\epsilon$ that satisfy \Cref{as:continuity}, $\misspecificationMarginal$ is upper-bounded as
            $$\misspecificationMarginal{} \leq \left( \int_{\neighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \misspecification + \left( \int_{\complementNeighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \pseudomarginal$$
        \end{lemma}
            
        \begin{proof}
            \begin{align}
                \misspecificationMarginal{} &= \kld{\trueYdist}{\prior{\YRV \vert \x}} &~\hspace{-2in} \text{(\Cref{eq:atig-unpacked})} \nonumber \\
                &= \int_\Y \log{\left( \frac{q(\y \vert \x)}{p(\y \vert \x)} \right)} ~ q(\y \vert \x) ~ d\y \nonumber \\
                &= \int_\Y \left( \log{\left( q(\y \vert \x) \right)} - \log{\left( p(\y \vert \x) \right)} \right) ~ q(\y \vert \x) ~ d\y &~ \nonumber \\
                &\leq \int_\Y \left( \log{\left( q(\y \vert \x) \right)} - \left( \int_{\neighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \log{\left( p(\y \vert \x, \thetavar^\star) \right)} - \left( \int_{\complementNeighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \log{\left( \pseudopriordensity{\y \vert \x} \right)} \right) ~ q(\y \vert \x) ~ d\y &~ \nonumber \\
                &~ &~\hspace{-2in} (\text{\Cref{as:continuity}}) \nonumber \\
                &= -\entropy{\trueYdist} + \left( \int_{\neighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \entropy{\trueYdist ~ \vert \vert ~ \prior{\YRV \vert \x, \thetavar^\star}} + \left( \int_{\complementNeighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \entropy{\trueYdist ~ \vert \vert ~ \pseudoprior{\YRV \vert \x}} &~ \nonumber \\
                &= -\entropy{\trueYdist} + \left( \int_{\neighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \left( \entropy{\trueYdist} + \kld{\trueYdist}{\prior{\YRV \vert \x, \thetavar^\star}} \right) &~ \nonumber \\
                &~~~~~ + \left( \int_{\complementNeighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \left( \entropy{\trueYdist} + \kld{\trueYdist}{\pseudoprior{\YRV \vert \x}} \right) &~ \nonumber \\
                &= \left( \int_{\neighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \kld{\trueYdist}{\prior{\YRV \vert \x, \thetavar^\star}} + \left( \int_{\complementNeighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \kld{\trueYdist}{\pseudoprior{\YRV \vert \x}} &~ \nonumber \\
                &= \left( \int_{\neighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \misspecification + \left( \int_{\complementNeighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \pseudomarginal &~ \nonumber
            \end{align}
        \end{proof}

        Direct substitution of this bound into \Cref{eq:atig-unpacked} completes the proof of \Cref{thm:upper-bound}:
        \begin{align}
            \unknowneq(\x) &= \misspecificationMarginal - \misspecification &\text{(\Cref{eq:atig-unpacked})} \nonumber \\
            &\leq \left( \int_{\neighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \misspecification + \left( \int_{\complementNeighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \pseudomarginal - \misspecification &\text{(\Cref{lem:sublinearity})} \nonumber \\
            &= \left( \int_{\complementNeighborhood{\thetavar^\star}} p(\thetavar) ~ d\thetavar \right) \left( \pseudomarginal - \misspecification \right) \nonumber
        \end{align}
        
    \subsection{PROOF OF THEOREM \ref{thm:lower-bound}}\label{ap:lower-bound}
        Given $i$ that satisfies \Cref{as:unbounded}, the conditions of \Cref{thm:lower-bound} imply that $\unknowneq(\x)$ can grow arbitrarily negative as $\phivar_i \rightarrow \infty$ or as $\phivar_i \rightarrow -\infty$.

        If $i$ satisfies \Cref{as:unbounded}(a), the conditions of \Cref{thm:lower-bound} imply that at some point $\tilde{\phivar}_i$ on dimension $i$ of $\mathscr{F}$, any movement along this dimension towards $-\infty$ will result in $\unknowneq(\x)$ decreasing by at least some amount $b$.
        \Cref{as:unbounded}(a) implies that $\phivar_i \rightarrow -\infty$, and the stated conditions imply that $\unknowneq(\x)$ can grow arbitrarily negative as $\phivar_i \rightarrow -\infty$.
        
        More specifically, we require that, for some $\tilde{\phivar}$, the following holds $\forall \phivar \in \mathscr{F}$ for which $\phivar_i < \tilde{\phivar}_i$:
        \begin{align}
            \nabla \unknowneq(\x)_i > b &\iff \nabla \misspecificationMarginal(\x)_i - \nabla \misspecification(\x)_i > b \nonumber \\
            &\iff \nabla \misspecification(\x)_i < \nabla \misspecificationMarginal(\x)_i - b \nonumber
        \end{align}
        as stated in the theorem.
        
        If $i$ satisfies \Cref{as:unbounded}(b), the conditions of \Cref{thm:lower-bound} imply that at some point $\tilde{\phivar}_i$ on dimension $i$ of $\mathscr{F}$, any movement along this dimension towards $\infty$ will result in $\unknowneq(\x)$ decreasing by at least some amount $b$.
        \Cref{as:unbounded}(b) implies that $\phivar_i \rightarrow \infty$, and the stated conditions imply that $\unknowneq(\x)$ can grow arbitrarily negative as $\phivar_i \rightarrow \infty$.
        
        For this, we require that, for some $\tilde{\phivar}$, the following holds $\forall \phivar \in \mathscr{F}$ for which $\phivar_i > \tilde{\phivar}_i$:
        \begin{align}
            \nabla \unknowneq(\x)_i < -b &\iff \nabla \misspecificationMarginal(\x)_i - \nabla \misspecification(\x)_i < -b \nonumber \\
            &\iff \nabla \misspecification(\x)_i > \nabla \misspecificationMarginal(\x)_i + b \nonumber
        \end{align}
        as stated in the theorem.

        \paragraph{Application to the linear model.}
            \newcommand{\Npredcov}{\sigma^2_{\y \vert \x}}
            \newcommand{\Nlikcov}{\sigma^2_{\y \vert \x, \thetavar^\star}}
            We write the learner's prior over $\left( \ThetaRV, \PsiRV \right)$ as $P_{\ThetaRV,\PsiRV} = \mathcal{N}(\boldsymbol{\mu}, \mathbf{\Sigma})$. 
            We use $\Npredcov$ and $\Nlikcov$ to refer to the variance of the learner's predictive distribution and the variance of the distribution corresponding to the learner's target likelihood, respectively.
            These are:

            \begin{align}
                \Npredcov &= \sigma^2 + \x \mathbf{\Sigma} \x^T \label{eq:npredcov} \\
                \Nlikcov &= \sigma^2 + \x \begin{bmatrix} 0 & 0 \\ 0 & \mathbf{\Sigma}_{2,2} - \mathbf{\Sigma}_{1,2} \mathbf{\Sigma}_{1,1}^{-1} \mathbf{\Sigma}_{1,2} \end{bmatrix} \x^T \label{eq:nlikcov}
            \end{align}
            
            Without loss of generality, take $\boldsymbol{\mu} = [0, 0]$ and $\mathbf{\Sigma}$ to be a diagonal matrix, and so $\E{\thetavar \sim \prior{\ThetaRV}}{\thetavar} = 0$ and $\E{\psivar \sim \prior{\PsiRV \vert \thetavar^\star}}{\psivar} = \E{\psivar \sim \prior{\PsiRV}}{\psivar} = 0$.
            
            As described in the main text, $\phivar$ represents the mean of $\truedist{\PsiRV}$, i.e., $\phivar \coloneqq \E{\psivar \sim \truedist{\PsiRV}}{\psivar} \in \mathbb{R}$.
            This satisfies both \Cref{as:unbounded}(a) and \Cref{as:unbounded}(b) (for $i = 1$).
            Satisfying the conditions of the theorem requires showing that the conditions on the gradients $\nabla \misspecification$ and $\nabla \misspecificationMarginal$ are met.
            Below, we give the derivatives $\frac{\partial \misspecification}{\partial \phivar}$ and $\frac{\partial \misspecificationMarginal}{\partial \phivar}$, and then show that there is a value of $\tilde{\phivar}$ below which the gradient conditions in \Cref{thm:lower-bound}(a) hold, and a value of $\tilde{\phivar}$ above which the gradient conditions in \Cref{thm:lower-bound}(b) hold.

            \begin{align}
                \misspecification &= \kld{\trueYdist}{\prior{\YRV \vert \x, \thetavar^\star}} \nonumber \\
                &= \frac{1}{2} \left( \left( \frac{\sigma^2}{\Nlikcov} \right) + \frac{\left( \thetavar^\star x_1 + \E{\psivar \sim \prior{\PsiRV \vert \thetavar^\star}}{\psivar x_2} - \left( \thetavar^\star x_1 + \E{\psivar \sim Q_{\PsiRV}}{\psivar x_2} \right) \right)^2}{\Nlikcov} - 1 + \log{\left( \frac{\Nlikcov}{\sigma^2} \right)} \right) \nonumber \\
                &= \frac{1}{2} \left( \left( \frac{\sigma^2}{\Nlikcov} \right) + \frac{\left( \phivar x_2 \right)^2}{\Nlikcov} - 1 + \log{\left( \frac{\Nlikcov}{\sigma^2} \right)} \right) \nonumber \\
                \frac{\partial \misspecification}{\partial \phivar} &= \frac{\phivar x_2^2}{\Nlikcov} \nonumber
            \end{align}
            
            \begin{align}
                \misspecificationMarginal &= \kld{\trueYdist}{\prior{\YRV \vert \x}} \nonumber \\
                &= \frac{1}{2} \left( \left( \frac{\sigma^2}{\Npredcov} \right) + \frac{\left( \E{\thetavar,\psivar \sim \prior{\ThetaRV,\PsiRV}}{\thetavar x_1 + \psivar x_2} - \left( \thetavar^\star x_1 + \E{\psivar \sim Q_{\PsiRV}}{\psivar x_2} \right) \right)^2}{\Npredcov} - 1 + \log{\left( \frac{\Npredcov}{\sigma^2} \right)} \right) \nonumber \\
                &= \frac{1}{2} \left( \left( \frac{\sigma^2}{\Npredcov} \right) + \frac{\left( \thetavar^\star x_1 + \phivar x_2 \right)^2}{\Npredcov} - 1 + \log{\left( \frac{\Npredcov}{\sigma^2} \right)} \right) \nonumber \\
                \frac{\partial \misspecificationMarginal}{\partial \phivar} &= \frac{\thetavar^\star x_1 x_2 + \phivar x_2^2}{\Npredcov} \nonumber
            \end{align}
            
            \begin{align}
                \frac{\partial \misspecification}{\partial \phivar} - \frac{\partial \misspecificationMarginal}{\partial \phivar} &= \frac{\phivar x_2^2}{\Nlikcov} - \frac{\thetavar^\star x_1 x_2 + \phivar x_2^2}{\Npredcov} \nonumber \\
                &= \phivar \left( \frac{x_2^2}{\Nlikcov} - \frac{x_2^2}{\Npredcov} \right) - \frac{\thetavar^\star x_1 x_2}{\Npredcov} \label{eq:linreg-lower-bound}
            \end{align}
            Taking $\rho = \frac{x_2^2}{\Nlikcov} - \frac{x_2^2}{\Npredcov} \geq 0$ (since $\Nlikcov$ can never be greater than $\Npredcov$) and $\tau = \frac{\thetavar^\star x_1 x_2}{\Npredcov}$, and assuming $\rho > 0$, \Cref{eq:linreg-lower-bound} is at most $-b$ when $\phivar \leq \frac{\tau - b}{\rho}$, and so the conditions in \Cref{thm:lower-bound}(a) are met for any $\tilde{\phivar} \leq \frac{\tau - b}{\rho}$.
            
            \Cref{eq:linreg-lower-bound} is at least $b$ when $\phivar \geq \frac{\tau + b}{\rho}$, and so the conditions in \Cref{thm:lower-bound}(b) are met for any $\tilde{\phivar} \geq \frac{\tau + b}{\rho}$.

    \subsection{PROOF OF THEOREM \ref{thm:task-id}}\label{ap:task-id}
        We first give the proof of \Cref{lem:m-lower-bound}.
        \begin{proof}
            Take $p^\alpha(\y \vert \x, \thetavar^\star)$ to be the likelihood of $\y$ under the $\truedist{\PsiRV}$-mixed prior $\prior{\PsiRV \vert \thetavar^\star}^\alpha$.
            \begin{align}
                \misspecification(\alpha) &= \kld{\trueYdist}{\prior{\YRV \vert \x, \thetavar^\star}^\alpha} &~ \nonumber \\
                &= \int_\Y \log{\left( \frac{q(\y \vert \x)}{p^\alpha(\y \vert \x, \thetavar^\star)} \right)} ~ q(\y \vert \x) ~ d\y &~ \nonumber \\
                &= \int_\Y \log{\left( \frac{\E{\psivar \sim \truedist{\PsiRV}}{p(\y \vert \x, \thetavar^\star, \psivar)}}{\E{\psivar \sim \prior{\PsiRV \vert \thetavar^\star}^\alpha}{p(\y \vert \x, \thetavar^\star, \psivar)}} \right)} ~ q(\y \vert \x) ~ d\y &~ \nonumber \\
                &= \int_\Y \log{\left( \frac{\E{\psivar \sim \truedist{\PsiRV}}{p(\y \vert \x, \thetavar^\star, \psivar)}}{\alpha \E{\psivar \sim \truedist{\PsiRV}}{p(\y \vert \x, \thetavar^\star, \psivar)} + (1 - \alpha) \E{\psivar \sim \prior{\PsiRV \vert \thetavar^\star}}{p(\y \vert \x, \thetavar^\star, \psivar)}} \right)} ~ q(\y \vert \x) ~ d\y &~ \nonumber \\
                &= - \int_\Y \log{\left( \alpha + \left(1 - \alpha \right) \frac{\E{\psivar \sim \prior{\PsiRV \vert \thetavar^\star}}{p(\y \vert \x, \thetavar^\star, \psivar)}}{\E{\psivar \sim \truedist{\PsiRV}}{p(\y \vert \x, \thetavar^\star, \psivar)}} \right)} ~ q(\y \vert \x) ~ d\y &~ \nonumber \\
                &= - \int_\Y \log{\left( \alpha + \left(1 - \alpha \right) \frac{p(\y \vert \x, \thetavar^\star)}{q(\y \vert \x)} \right)} ~ q(\y \vert \x) ~ d\y &~ \nonumber \\
                &\geq - \log{\left( \int_\Y \left( \alpha + \left(1 - \alpha \right) \frac{p(\y \vert \x, \thetavar^\star)}{q(\y \vert \x)} \right) ~ q(\y \vert \x) ~ d\y \right)} &\text{(Jensen's inequality)} \nonumber \\
                &= - \log{\left( \alpha + \left( 1 - \alpha \right) \left( \int_\Y \frac{p(\y \vert \x, \thetavar^\star)}{q(\y \vert \x)} ~ q(\y \vert \x) ~ d\y \right) \right)} & \label{eq:taskid}
            \end{align}
        \end{proof}

        For the proof of \Cref{thm:task-id}, we can without loss of generality take $\alpha_1 = 0$ and $\alpha_2 \in (0,1]$.
        (Notice that for any $\prior{\PsiRV \vert \thetavar^\star}$ and $\alpha_1 > 0$ we could take the prior $\prior{\PsiRV \vert \thetavar^\star}^{\alpha_1} = \alpha_1 \truedist{\PsiRV} + (1 - \alpha_1) \prior{\PsiRV \vert \thetavar^\star}$ $\truedist{\PsiRV}$-mixed at rate 0, which would be equivalent to $\prior{\PsiRV \vert \thetavar^\star}$ $\truedist{\PsiRV}$-mixed at rate $\alpha_1$.)
        When $\alpha = 0$, we can use the usual notation for the prior and bias terms.
    
        By \Cref{eq:atig-unpacked}, we can say that if $\prior{\PsiRV \vert \x}$ induces negative interference under the given DGP:
        \begin{align}
            \misspecification &> \misspecificationMarginal &~ \nonumber \\
            -\misspecification &< -\misspecificationMarginal &~ \nonumber \\
            -\int_\Y \log{\left( \frac{q(\y \vert \x)}{p(\y \vert \x, \thetavar^\star)} \right)} ~ q(\y \vert \x) ~ d\y &< -\misspecificationMarginal &~ \nonumber \\
            -\log{\left( \int_\Y \frac{q(\y \vert \x)}{p(\y \vert \x, \thetavar^\star)} ~ q(\y \vert \x) ~ d\y \right)} &< -\misspecificationMarginal &\text{(Jensen's inequality)} \nonumber \\
            \log{\left( \int_\Y \frac{p(\y \vert \x, \thetavar^\star)}{q(\y \vert \x)} ~ q(\y \vert \x) ~ d\y \right)} &< -\misspecificationMarginal &~ \nonumber \\
            \int_\Y \frac{p(\y \vert \x, \thetavar^\star)}{q(\y \vert \x)} ~ q(\y \vert \x) ~ d\y &< e^{-\misspecificationMarginal} &~ \nonumber \\
            &\leq 1 &~ \nonumber
        \end{align}
        The last line follows since $e^{-\misspecificationMarginal} > 1$ would violate the non-negativity of the Kullback-Leibler divergence measure that defines $\misspecificationMarginal$.

        Since $\int_\Y \frac{p(\y \vert \x, \thetavar^\star)}{q(\y \vert \x)} ~ q(\y \vert \x) ~ d\y < 1$, we can say that if $\prior{\PsiRV \vert \x}$ induces negative interference under the given DGP, the expression in line \ref{eq:taskid} generally decreases with $\alpha$.
        Comparison between $\alpha = 0$ and $\alpha \in (0,1]$ recovers the statement in the theorem for $\alpha_1 = 0$, which can be generalized to all $\alpha_1 \in [0,1)$ and $\alpha_2 \in (\alpha_1, 1]$ as described above.

\section{DETAILS OF ILLUSTRATIVE EXAMPLES}\label{ap:experiments}
    \subsection{LINEAR MODEL}\label{ap:path-analysis}
        Data was generated according to the model $\y \sim \mathcal{N}\left( \thetavar \x_1 + \psivar_1 \x_2 + \psivar_2 \x_3 + \psivar_3 \x_4, \sigma^2 \right)$ where $\sigma^2 = 1$.
        The learner's prior over $\left( \ThetaRV, \PsiRV_1, \PsiRV_2, \PsiRV_3 \right)$ is $\mathcal{N} \left( \boldsymbol{\mu}, \mathbf{\Sigma} \right)$ where $\boldsymbol{\mu} = \begin{bmatrix} 0 & 0 & 0 & 0 \end{bmatrix}$ and $\mathbf{\Sigma} = \mathrm{diag} \left( \begin{bmatrix} 10 & 10 & 10 & 10 \end{bmatrix} \right)$.

        We use $\Npredcov$ and $\Nlikcov$ to refer to the variance of the learner's predictive distribution and the variance of the distribution corresponding to the learner's target likelihood, respectively.
        These are:

        \begin{align}
            \Npredcov &= \sigma^2 + \x \mathbf{\Sigma} \x^T \label{eq:npredcov-experiments} \\
            \Nlikcov &= \sigma^2 + \x \begin{bmatrix} 0 & 0 \\ 0 & \mathbf{\Sigma}_{(2:4),(2:4)} - \mathbf{\Sigma}_{1,(2:4)} \mathbf{\Sigma}_{1,1}^{-1}\mathbf{\Sigma}_{1,(2:4)} \end{bmatrix} \x^T \label{eq:nlikcov-experiments}
        \end{align}

        We used the following formulas for the ETIG and ELIG:
        \begin{align}
            \mathrm{ETIG}(\x) &= \frac{1}{2} \log{\left( \frac{\Npredcov}{\Nlikcov} \right)}
        \end{align}
        \begin{align}
            \mathrm{ELIG}(\x) &= \frac{1}{2} \left( \log{\left( \frac{\Npredcov}{\sigma^2} \right)} - \log{\left( \frac{\Npredcov}{\Nlikcov} \right)} \right)
        \end{align}
    
        To generate the set of possible actions, we sampled 10,000 values $\boldsymbol{z} \sim \mathcal{N}(10, .25)$.
        For each value of $\boldsymbol{z}$, we then sampled one value of each $\x_2$, $\x_3$ and $\x_4$ from $\mathcal{N}(\boldsymbol{z}, .25)$, and one value of $\x_1$ from $\mathcal{N}(-1 / \boldsymbol{z}, .25)$.
        Each point in \Cref{fig:acqf-pathanalysis} corresponds to one of 10,000 values of $\x$ generated as above.
        With reference to \Cref{fig:transfer-ML-pathanalysis}, when we calculate $\unknowneq(\x)$, $\x$ is always $\argmax_{\x \in \mathscr{X}} \mathrm{ETIG}(\x)$ where $\mathscr{X}$ contains all 10,000 possible actions.

    \subsection{PREFERENCE MODELING}\label{ap:preference}
        We modified the preference example from \citet{foster_variational_2019}, who use a censored sigmoid normal as the output distribution.
        Instead, we used the Bernoulli distribution $\y \sim \mathrm{Bernoulli}\left( \frac{1}{1 + e^{\psivar \x - \thetavar}} \right)$.

        \Cref{fig:transfer-ML-preference} shows the $\unknowneq(\x)$ for $\x = \argmax_{\x \in \mathscr{X}} \mathrm{ETIG}(\x)$ where $\mathscr{X}$ contains values evenly spaced between $-79$ and $81$ (this differs slightly from the example in \citet{foster_variational_2019}, in which values of $\x$ were evenly spaced between $-80$ and $80$).
    
        Unlike in the linear model setting, for which there are closed-form expressions for the ETIG and ELIG, the ETIG and ELIG for this example are not known in closed form.
        We approximate them using the following nested Monte Carlo estimators:
        \begin{align}\label{eq:etig-hat}
            \widehat{\text{ETIG}}(\x) = \frac{1}{N} \sum_{i = 1}^N \left( \sum_{\y \in \Y} \left( p(\y \vert \x, \thetavar^i, \psivar^i) \log{\left( \frac{\frac{1}{M} \sum_{j = 1}^M p(\y \vert \x, \thetavar^i, \psivar^j)}{\frac{1}{N} \sum_{l = 1}^N p(\y \vert \x, \thetavar^l, \psivar^l)} \right)} \right) \right)
        \end{align}

        \begin{align}\label{eq:etsig-hat}
            \widehat{\text{ELIG}}(\x) &= \frac{1}{N} \sum_{i = 1}^N \left( \sum_{\y \in \Y} \left( p(\y \vert \x, \thetavar^i, \psivar^i) \log{\left( \frac{p(\y \vert \x, \thetavar^i, \psivar^i)}{\frac{1}{N} \sum_{l = 1}^N p(\y \vert \x, \thetavar^l, \psivar^l)} \right)} \right) \right) - \widehat{\text{ETIG}}(\x)
        \end{align}
        Samples of $\thetavar$ and $\psivar$ are drawn from the prior given above.
        Although not shown explicitly in \Cref{eq:etig-hat}, each set of $M$ inner samples is constrained to include the corresponding sample $\left( \thetavar^i, \psivar^i \right)$ to avoid pathological behavior when a value $\y$ has positive probability in only a very small region of $\prior{\PsiRV \vert \thetavar^i}$ \citep{foster_unified_2020}.
        We set $N$ to 10,000 and $M$ to 100 (reflecting results from \citet{rainforth_nesting_2018} that $M$ is optimally $\propto \sqrt{N}$).

        \vspace{2mm}

        \noindent \emph{Remark on \Cref{fig:acqf-preference}.}
            The trade-off between ETIG and ELIG visualized in \Cref{fig:acqf-preference} can be explained by noticing that the magnitude of $\x$ has opposite effects on the ease of identification of $\thetavar$ and $\psivar$.
            When $\x = 0$, $\psivar \x = 0$ and so $\psivar$ can't be identified at all.
            This is reflected by the fact that the ETIG peaks at 0.
            Conversely, the size of the effect of $\psivar$ on outcomes depends on the magnitude of $\x$, which is reflected by the fact that the ELIG is maximized by values of $\x$ with large magnitudes.

    \subsection{GAUSSIAN PROCESS REGRESSION}\label{ap:experiments-gp}
        We constructed the kernel $k(\cdot , \cdot)$ as the additive composition of $k_{\thetavar}(\cdot , \cdot)$ and $k_{\psivar_1}(\cdot , \cdot)$, where both $k_{\thetavar}(\cdot , \cdot)$ and $k_{\psivar_1}(\cdot , \cdot)$ were radial basis function kernels with shared amplitude and lengthscale determined by the values of $\thetavar$ and $\psivar_1$, respectively.
        
        To generate \Cref{fig:transfer-ML-gp}, we set $\x = [25,26]$, and sampled 10,000 values from the learner's joint distribution over $(\thetavar,\psivar_1,\psivar_2)$, where $\psivar_2$ is a sample from the GP at $\x$.
        We set $\sigma^2$, the variance of $\YRV \vert \x, \thetavar, \psivar_1, \psivar_2$, to .01.

        We approximated $\unknowneq(\x)$ using a nested Monte Carlo estimator:
        \begin{align*}
            \unknowneq(\x) &= \E{\y \sim \prior{\YRV \vert \thetavar^\star, \psivar^\star}}{\log{\left(p(\y \vert \x, \thetavar^\star) \right)} - \log{\left(p(\y \vert \x) \right)}} \nonumber \\
            &\approx \frac{1}{M} \sum_{i=1}^M \left( \log{\left( \frac{1}{M} \sum_{j=1}^M p(\y^i \vert \x, \thetavar^\star, \psivar^j) \right)} - \log{\left( \frac{1}{N} \sum_{j=1}^N p(\y^i \vert \x, \thetavar^j, \psivar^j) \right)} \right)
        \end{align*}
        with $N = 10{,}000$ and $M = 100$.
        Samples were drawn from the prior given above.

        To generate \Cref{fig:acqf-gp}, we used a Hamiltonian Monte Carlo (HMC) sampler~\citep{gpytorch} to first train the model, initialized with the priors given above, on the five randomly-sampled points shown in the figure.
        The training data was generated from a function sampled from a kernel with $\thetavar^\star = 5$ and $\psivar_1^\star = 2.5$.
        In this case, $\sigma^2 = 0$, i.e., outcomes were treated as deterministic.
        We again used nested Monte Carlo estimators for the acquisition functions, given below.
        Rather than sampling from the prior, we used the HMC samples of $\thetavar$ and $\psivar_1$ to compute the expectations.

        \begin{align*}
            \mathrm{ETIG}(\x) &= \E{\thetavar,\psivar,\y \sim \prior{\ThetaRV,\PsiRV,\YRV \vert \x}}{\log{\left( p(\y \vert \x, \thetavar) \right)} - \log{\left( p(\y \vert \x) \right)}} \nonumber \\
            &\approx \frac{1}{N} \sum_{i=1}^N \left( \frac{1}{L} \sum_{l=1}^L \log{\left( \frac{1}{M} \sum_{j=1}^M p(\y^l \vert \x, \thetavar^i, \psivar^j) \right)} - \frac{1}{L} \sum_{l=1}^L \log{\left( \frac{1}{N} \sum_{j=1}^N p(\y^l \vert \x, \thetavar^j, \psivar^j) \right)} \right)
        \end{align*}
        \begin{align*}
            \mathrm{ELIG}(\x) &= \E{\thetavar,\psivar,\y \sim \prior{\ThetaRV,\PsiRV,\YRV \vert \x}}{\log{\left( p(\y \vert \x, \thetavar, \psivar) \right)} - \log{\left( p(\y \vert \x) \right)}} - \E{\thetavar,\psivar,\y \sim \prior{\ThetaRV,\PsiRV,\YRV \vert \x}}{\log{\left( p(\y \vert \x, \thetavar) \right)} - \log{\left( p(\y \vert \x) \right)}} \nonumber \\
            &= \E{\thetavar,\psivar,\y \sim \prior{\ThetaRV,\PsiRV,\YRV \vert \x}}{\log{\left( p(\y \vert \x, \thetavar, \psivar) \right)}} - \E{\thetavar,\psivar,\y \sim \prior{\ThetaRV,\PsiRV,\YRV \vert \x}}{\log{\left( p(\y \vert \x, \thetavar) \right)}} \nonumber \\
            &= - \E{\thetavar,\psivar,\y \sim \prior{\ThetaRV,\PsiRV,\YRV \vert \x}}{\log{\left( p(\y \vert \x, \thetavar) \right)}} \nonumber \\
            &\approx - \frac{1}{N} \sum_{i=1}^N \left( \frac{1}{L} \sum_{l=1}^L \log{\left( \frac{1}{M} \sum_{j=1}^M p(\y^l \vert \x, \thetavar^i, \psivar^j) \right)} \right)
        \end{align*}
        with $N = M = 500$ and $L = 10{,}000$.
        (Since observations are treated as deterministic, the term in the ELIG corresponding to the entropy of $\YRV \vert \thetavar, \psivar$ is 0.)
        