\bibliographystyle{plainnat}

\maketitle

The appendix is organized as follows:
\begin{itemize}
    \item In \Cref{sec:related-work}, we discuss related works in more detail.
    \item In \Cref{ap:math}, we provide proofs of all our mathematical results.
    \item In \Cref{ap:experiments}, we provide further details of the examples described in \Cref{sec:experiments}.
        \Cref{ap:gp} additionally provides the results of the GP regression example.
\end{itemize}

\section{RELATED WORK}\label{sec:related-work}
    \paragraph{Likelihood weighting}
        has been applied for purposes that include potential model misspecification \citep{grunwald_safe_2011,miller_robust_2019,dewaskar_robustifying_2023}, potential conflation of transferable and task-specific effects \citep{ibrahim_power_2000,ibrahim_optimality_2011,ibrahim_power_2014,suder_bayesian_2023}, model selection \citep{ibrahim_power_2014}, and increased efficiency of MCMC samplers~\citep{schuster_markov_2021}.

    \paragraph{Probabilistic meta-learning}
        \citep{gordon_meta-learning_2019} is a paradigm in which a meta-learner simultaneously learns a transferable parameter value and a distribution over task parameter values.
        Unlike \algAbbrev{}, this framework assumes the data sources are known in the sense that each data point can be indexed by its task.
        This distinction also sets us apart from other Bayesian meta-learning approaches \citep{grant_recasting_2018,yoon_bayesian_2018,patacchiola_bayesian_2020}.
        Moreover, the aim of probabilistic meta-learning is to learn a distribution over task parameters.
        When the target task will arise from the same distribution as the source tasks, probabilistic meta-learning facilitates good performance on average across tasks.
        However, the goal of \algAbbrev{} is to provide a posterior predictive distribution tailored to a target task that may not arise from the same distribution as the source tasks.

    \paragraph{Using domain similarity for domain adaptation.}
        Many existing theoretical bounds for domain adaptation rely on the similarity between source and target tasks \citep{redko_survey_2019}.
        Some approaches to domain adaptation use similarity of covariates (inputs) in the target and source tasks to weight source data during training \citep{plank_effective_2011,ponomareva_biographies_2012,remus_domain_2012,ruder_learning_2017} or importance sampling techniques \citep{quinonero-candela_dataset_2009}.
        While this can be effective in cases of pure covariate shift (a change in the distribution of inputs), our formulation allows for differences in the map between covariates and outcomes that cannot be detected on the basis of covariate information alone.

    \paragraph{Proximal causal learning}
        is a paradigm that uses proxy information to learn causal effects \citep{kuroki_measurement_2014,tchetgen_introduction_2020,alabdulmohsin_adapting_2023,tsai2024proxy}.
        Our setting is similar to the multi-domain adaptation setting of \citet{tsai2024proxy}.
        We differ in that (i) we assume data sources are unknown, while they assume data can be indexed by its task, and (ii) we assume the presence of both shared and task parameters, while they do not distinguish between these.
        While our method for estimating the task parameter also leverages proxy methods, we differ in our usage of reweighting methods to estimate the shared parameter, which facilitates robust estimation without requiring additional proxy information.

        As shown in \Cref{fig:setting}, our formulation is stated in terms of the dependencies between shared parameters, task parameters, and observations, and so our work shares conceptual connections with the more general paradigm of causal inference.
        For instance, conceptualizing r-weighting as a pseudo-intervention requires conceptualizing the task parameters as a cause of the observations.
        We do not, however, require that either the shared or task parameters parameterize the causal effect of one observable variable on another; these parameters can represent any unobservable factor influencing the data.

    \paragraph{Human-in-the-loop learning.}
        In many applications, domain experts are a viable source of proxy information, and so our work can be tied to human-in-the-loop machine learning \citep{wu_survey_2022}.
        Like us, some human-in-the-loop methods leverage expert feedback in a Bayesian framework. 
        For example, \citet{nahal_human-in-the-loop_2024} use expert feedback for learning in out-of-distribution settings, while
        \citet{sundin_improving_2018} query experts about the relevance of a given feature for outcome prediction.

\section{MATHEMATICAL DETAILS}\label{ap:math}
    \subsection{Definitions}
        \begin{itemize}
            \item $\ent{P}$ is the entropy of distribution $P$ with density $p$:
                $$\ent{P} = - \E{\sourcex \sim P}{\log{\left( p(\sourcex) \right)}}$$
            \item $\crossent{P}{Q}$ is the cross-entropy from distribution $P$ to distribution $Q$ with density $q$:
                $$\crossent{P}{Q} = - \E{\sourcex \sim P}{\log{\left( q(\sourcex) \right)}}$$
            \item $\kld{P}{Q}$ is the Kullback-Leibler divergence from distribution $P$ with density $p$, to distribution $Q$ with density $q$:
                $$\kld{P}{Q} = \E{\sourcex \sim P}{\log{\frac{p(\sourcex)}{q(\sourcex)}}}$$
        \end{itemize}

    \subsection{Proof of Theorem 2.4}\label{ap:classic-ig}
        The information gain achieved by the classic Bayesian learner (\Cref{def:pi-tig}) can be written as:
        \begin{align}
            \cIG\left( \targetconcept \right) &= \E{\sourcedata \sim P_{\truedataRV}}{\log{\left( \frac{p(\targetconcept \vert \sourcedata)}{p(\targetconcept)} \right)}} \nonumber \\
            &= \E{\sourcedata \sim P_{\truedataRV}}{\log{\left( \frac{\frac{L\left( \sourcedata, \targetconcept \right) ~ p\left( \targetconcept \right)}{\E{\param \sim P_{\paramRV}}{L\left( \sourcedata, \param \right)}}}{p\left( \targetconcept \right)} \right)}} \nonumber \\
            &= \E{\sourcedata \sim P_{\truedataRV}}{\log{\left( \frac{L\left( \sourcedata, \targetconcept \right)}{\E{\param \sim P_{\paramRV}}{L\left( \sourcedata, \param \right)}} \right)}} \nonumber
        \end{align}
    
        The proof follows the proof of Proposition 4.1 and Theorem 4.5 of \citet{sloman_bayesian_2024}.
        It depends on the following definitions:

        \newcommand{\neighborhood}[1]{N_{\epsilon} \left( #1 \right)}
        \newcommand{\restrictPrior}[1]{P^{#1}_{\paramRV}}
        \begin{definition}[$\epsilon$-neighborhood of $\param$ $\neighborhood{\param}$ (Definition 4.2 of \citealt{sloman_bayesian_2024})]
            $\neighborhood{\param} \equiv \{ \param^{\prime} \in \paramvalues ~ \vert ~ d(\param, \param^{\prime}) < \epsilon \}$, where $d$ is a suitable distance measure, is the $\epsilon$-neighborhood of $\param$.
        \end{definition}

        \begin{definition}[$\restrictPrior{\mathscr{A}}$ (modification of Definition 4.3 of \citealt{sloman_bayesian_2024})]
            $\restrictPrior{\mathscr{A}}$ refers to the distribution of $\paramRV$ obtained by restricting the support of the learner's prior to the set $\mathscr{A}$, under which
            $$p^{\mathscr{A}}(\param) \equiv \frac{p(\param)}{\int_{\mathscr{A}} p(\param^{\prime}) ~ d\param^{\prime}}$$ for any $\param \in \mathscr{A}$.
        \end{definition}
    
        \begin{assumption}[Smoothness in parameter space (Assumption 4.4 of \citealt{sloman_bayesian_2024})]\label{as:smoothness}
            There exists some $\epsilon > 0$ such that
            \begin{align}
                &\E{\sourcedata \sim P_{\truedataRV}}{\log{\left( \E{\param \sim P_{\paramRV}}{L(\sourcedata, \param)} \right)}} \nonumber \\ 
                &\geq \E{\sourcedata \sim P_{\truedataRV}}{\left( \int_{N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( L(\sourcedata, \targetconcept) \right)} + \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( \E{\param \sim P^{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{L(\sourcedata, \param)} \right)}} \nonumber
            \end{align}
            where $\left( \int_{N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right)$ and $\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right)$ are the probabilities that a value $\param$ is inside and outside the $\epsilon$-neighborhood of $\targetconcept$, respectively.
        \end{assumption}
        \begin{remark}
            \Cref{as:smoothness} holds when $\paramRV$ is a discrete random variable (in which case the $\epsilon$-neighborhood of $\targetconcept$ can be defined as the singleton $\{ \targetconcept \}$, which excludes all other parameter values).
            When $\paramRV$ is a continuous random variable, \Cref{as:smoothness} is essentially a smoothness condition: For likelihoods that are sufficiently smooth around $\targetconcept$, we can expect it to hold for $\epsilon \rightarrow 0$.
            To see this, notice that Jensen's inequality implies that
            \begin{align}
                &\E{\sourcedata \sim P_{\truedataRV}}{\log{\left( \E{\param \sim P_{\paramRV}}{L(\sourcedata, \param)} \right)}} \nonumber \\
                &\geq \E{\sourcedata \sim P_{\truedataRV}}{\left( \int_{N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( \E{\param \sim P^{N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{L(\sourcedata, \param)} \right)} + \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( \E{\param \sim P^{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{L(\sourcedata, \param)} \right)}}. \nonumber
            \end{align}
            \Cref{as:smoothness} holds when $\E{\param \sim P^{N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{p(\sourcedata \vert \param)} \approx p(\sourcedata \vert \targetconcept)$ and the approximation is tight enough that it does not close the Jensen gap.
        \end{remark}

        Taking $P_{\sourcedataRV \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}}$ to be the source data distribution conditioned on the event that the shared parameter is not in the $\epsilon$-neighborhood of $\targetconcept$, we obtain
        \begin{align}
            \cIG\left( \targetconcept \right) &= \E{\sourcedata \sim P_{\truedataRV}}{\log{\left( L\left( \sourcedata, \targetconcept \right) \right)} - \log{\left( \E{\param \sim P_{\paramRV}}{L\left( \sourcedata, \param \right)} \right)}} \nonumber \\
            &\leq \mathbb{E}_{\sourcedata \sim P_{\truedataRV}} \left[ \log{\left( L(\sourcedata, \targetconcept) \right)} - \left( \int_{N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( L(\sourcedata, \targetconcept) \right)} - \right. \nonumber \\
            &\left. \phantom{\mathbb{E}_{\sourcedata \sim P_{\sourcedataRV \vert \targetconcept, \truetsparam}}} \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( \E{\param \sim P^{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{L(\sourcedata, \param)} \right)} \right] ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ \text{(\Cref{as:smoothness})} \nonumber \\
            &= \E{\sourcedata \sim P_{\truedataRV}}{\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \left( \log{\left( L(\sourcedata, \targetconcept) \right)} - \log{\left( \E{\param \sim P^{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{L(\sourcedata, \param)} \right)} \right)} \nonumber \\
            &= \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \left( \crossent{P_{\truedataRV}}{P_{\sourcedataRV \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}}} - \crossent{P_{\truedataRV}}{P_{\sourcedataRV \vert \targetconcept}} \right) \nonumber \\
            &= \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \left( \ent{P_{\truedataRV}} + \kld{P_{\truedataRV}}{P_{\sourcedataRV \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}}} - \ent{P_{\truedataRV}} - \kld{P_{\truedataRV}}{P_{\sourcedataRV \vert \targetconcept}} \right) \nonumber \\
            &= \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \left( \kld{P_{\truedataRV}}{P_{\sourcedataRV \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}}} - \kld{P_{\truedataRV}}{P_{\sourcedataRV \vert \targetconcept}} \right)
        \end{align}
        as stated in the theorem for $A = \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right)$ and $B = \kld{P_{\truedataRV}}{P_{\sourcedataRV \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}}}$.

    \subsection{Proof of Theorem 4.4}\label{ap:reweighted-ig}
       The r-weighted information gain (\Cref{def:pir-tig}) can be written as:
        \begin{align}
            \rIG\left( \targetconcept \right) &= \E{\sourcedata,\auxInfo \sim P_{\truedataRV,\auxInfoRV}}{\log{\left( \frac{p^{\mathcal{R}}(\targetconcept \vert \sourcedata, \auxInfo)}{p(\targetconcept)} \right)}} \nonumber \\
            &= \E{\sourcedata,\auxInfo \sim P_{\truedataRV,\auxInfoRV}}{\log{\left( \frac{\frac{\E{\tsparam_{n+1} \sim P_{\targettsparamRV \vert \auxInfo}}{\robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right)} ~ p\left( \targetconcept \right)}{\E{\param, \tsparam_{n+1}^{\prime} \sim P_{\paramRV, \targettsparamRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1}^{\prime} \right)}}}{p\left( \targetconcept \right)} \right)}} \nonumber \\
            &= \E{\sourcedata,\auxInfo \sim P_{\truedataRV,\auxInfoRV}}{\log{\left( \frac{\E{\tsparam_{n+1} \sim P_{\targettsparamRV \vert \auxInfo}}{\robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right)}}{\E{\param,\tsparam_{n+1}^{\prime} \sim P_{\paramRV,\targettsparamRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1}^{\prime} \right)}} \right)}}
        \end{align}

        \begin{remark}
            Notice that $\rIG\left( \targetconcept \right)$ is defined as an expectation over $P_{\auxInfoRV}$ as well as $P_{\truedataRV}$.
            This, and all other quantities in our analysis which include expectations over $\auxInfoRV$, can be interpreted as marginalizing across the learner's subjective uncertainty about the proxy information they will receive.
            We could have defined $\rIG\left( \targetconcept \right)$ as an expectation across a ``true'' distribution of proxy information, with a corresponding interpretation as the extent to which the learner can expect to gain information upon encountering a given distribution generating \emph{both} source data and proxy information.
            Although such an extension of the current analysis would in some sense be technically more complete, we opt to simplify our analysis and define the expectation over proxy information with respect to the learner's subjective uncertainty.
            Both the learner using a classic likelihood and the learner using an r-weighted likelihood use the same prior over $\auxInfoRV$ in estimation of $\targettsparam$, and so the incorrectness of the prior over proxy information is less important than the incorrectness of the prior over source task parameters in understanding the relative advantage of r-weighting.
        \end{remark}
    
        The proof of \Cref{prop:ig-reweighted} uses the following lemma:
        \begin{lemma}\label{lem:jensen-gap}
            Define $\mathcal{J}\left( \log{}; P_{\targettsparamRV \vert \auxInfo} \right) \equiv \log{\left( \E{\tsparam_{n+1} \sim P_{\targettsparamRV \vert \auxInfo}}{\robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right)} \right)} - \E{\tsparam_{n+1} \sim P_{\targettsparamRV \vert \auxInfo}}{\log{\left( \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right)}}$ and $\mathcal{J}\left( \log{}; P_{\targettsparamRV} \right) \equiv \log{\left( \E{\param,\tsparam_{n+1} \sim P_{\paramRV,\targettsparamRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)} \right)} - \E{\tsparam_{n+1} \sim P_{\targettsparamRV}}{\log{\left( \E{\param \sim P_{\paramRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)} \right)}}$.
            Under \Cref{as:jensen-gap} and \Cref{as:convergence} (stated formally in the proof of the lemma), $\E{\sourcedata \sim P_{\truedataRV}}{\mathcal{J}\left( \log{}; P_{\targettsparamRV} \right)} \geq \E{\sourcedata, \auxInfo \sim P_{\truedataRV,\auxInfoRV}}{\mathcal{J}\left( \log{}; P_{\targettsparamRV \vert \auxInfo} \right)}$.
        \end{lemma}
        \begin{proof}[Proof of \Cref{lem:jensen-gap}]
            The lemma leverages a result known as H\"{o}lder's defect \citep{steele_cauchy-schwarz_2004,becker_variance_2012}:
            \begin{theorem}[H\"{o}lder's defect (restated from \citealt{steele_cauchy-schwarz_2004}\footnote{\citet{steele_cauchy-schwarz_2004} states the result in terms of discrete sums; we have modified the statement here so that it can be interpreted for continuous random variables.})
            ]
                If $f ~ : ~ \left[ a, b \right] \rightarrow \mathbb{R}$ is twice differentiable and if we have the bounds
                $$0 \leq m \leq f^{\prime \prime}(x) \leq M ~ \text{for all} ~ x \in \left[ a, b \right],$$
                then for a distribution $P$ over $\left[ a, b \right]$, there exists a real value $\mu \in \left[ m, M \right]$ for which one has the formula
                $$\underbrace{\E{x \sim P}{f(x)} - f\left( \E{x \sim P}{x} \right)}_{\mathcal{J}\left( -f; P \right)} = \frac{1}{2} \mu \mathrm{Var}_{x \sim P}\left[ x \right]$$
                for $\mathrm{Var}_{x \sim P}\left[ x \right] \equiv \E{x \sim P}{\left( x - \E{x \sim P}{x} \right)^2}$.
            \end{theorem}

            Our goal is to use H\"{o}lder's defect to relate $\mathcal{J}\left( \log{}; P_{\targettsparamRV \vert \auxInfo} \right)$ and $\mathcal{J}\left( \log{}; P_{\targettsparamRV} \right)$ to $\mathrm{Var}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1} \vert \auxInfo}} \left[ \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right]$ and $\mathrm{Var}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \E{\param \sim P_{\paramRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)} \right]$, respectively.
            We first verify that the conditions required for H\"{o}lder's defect formula to apply are met.
            For both applications of the result, $f$ is the negative of the log function.
            In application to $\mathcal{J}\left( \log{}; P_{\targettsparamRV \vert \auxInfo} \right)$, $f$ takes as input values of $\robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right)$.
            In application to $\mathcal{J}\left( \log{}; P_{\targettsparamRV} \right)$, $f$ takes as input values of $\E{\param \sim P_{\paramRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)}$.
            \begin{itemize}
                \item $f ~ : ~ \left[ a, b \right] \rightarrow \mathbb{R}$: \Cref{as:jensen-gap} ensures that inputs in both cases are bounded from both below and above.
                \item $f$ is twice differentiable: The second derivative of $f$ evaluated at $x$ is $f^{\prime \prime}(x) = \frac{1}{x^{2}}$.
                \item $0 \leq m \leq f^{\prime \prime}(x) \leq M$: \Cref{as:jensen-gap} ensures this for $m = \frac{1}{b^2}$ and $M = \frac{1}{a^2}$.
            \end{itemize}

            H\"{o}lder's defect then implies the following:
            \begin{align}\label{eq:jensen-psi_z}
                \mathcal{J}\left( \log{}; P_{\targettsparamRV \vert \auxInfo} \right) &= \frac{1}{2} \mu_1\left( \sourcedata, \auxInfo \right) \mathrm{Var}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1} \vert \auxInfo}} \left[ \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right]
            \end{align}
            for a scalar $\mu_1$ that depends on $\sourcedata$ and $\auxInfo$, and
            \begin{align}\label{eq:jensen-psi}
                \mathcal{J}\left( \log{}; P_{\targettsparamRV} \right) &= \frac{1}{2} \mu_2\left( \sourcedata \right) \mathrm{Var}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \E{\param \sim P_{\paramRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)} \right]
            \end{align}
        for a scalar $\mu_2$ that depends on $\sourcedata$.

        We can now formally state \Cref{as:convergence}:
        \begin{assumption}[Sufficiently informative proxy]\label{as:convergence}
            The proxy is sufficiently informative in the sense that the following condition holds on the relative variances of $\targettsparamRV \vert \auxInfo$ and $\targettsparamRV$:
            \begin{align}
                &\E{\sourcedata \sim P_{\truedataRV}}{\mu_2\left( \sourcedata \right) \mathrm{Var}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \E{\param \sim P_{\paramRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)} \right]} \geq \nonumber \\
                &\E{\sourcedata, \auxInfo \sim P_{\truedataRV, \auxInfoRV}}{\mu_1\left( \sourcedata, \auxInfo \right) \mathrm{Var}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1} \vert \auxInfo}} \left[ \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right]}. \nonumber
            \end{align}
        \end{assumption}

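        Taking the expectation of \Cref{eq:jensen-psi_z} over $P_{\truedataRV,\auxInfoRV}$ and of \Cref{eq:jensen-psi} over $P_{\truedataRV}$, and then comparing the resulting right-hand sides using the condition in \Cref{as:convergence}, completes the proof.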
    \end{proof}

    In addition to \Cref{lem:jensen-gap}, the proof of \Cref{prop:ig-reweighted} uses the following assumption, which is a variant of \Cref{as:smoothness} for the r-weighted case:
    \begin{assumption}[Smoothness in parameter space]\label{as:smoothness-reweighted}
        There exists some $\epsilon > 0$ such that
        \begin{align}
            &\E{\sourcedata, \tsparam_{n+1} \sim P_{\truedataRV, \targettsparamRV}}{\log{\left( \E{\param \sim P_{\paramRV}}{\robustL(\sourcedata, \param, \tsparam = \tsparam_{n+1})} \right)}} \geq \nonumber \\
            &\mathbb{E}_{\sourcedata, \tsparam_{n+1} \sim P_{\truedataRV, \targettsparamRV}} \left[ \left( \int_{N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( \robustL(\sourcedata, \targetconcept, \tsparam = \tsparam_{n+1}) \right)} \right. \nonumber \\
            &\phantom{\mathbb{E}_{\sourcedata, \tsparam_{n+1} \sim P_{\truedataRV, \targettsparamRV}}} + \left. \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( \E{\param \sim P^{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{\robustL(\sourcedata, \param, \tsparam = \tsparam_{n+1})} \right)} \right] \nonumber
        \end{align}
        where $\left( \int_{N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right)$ and $\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right)$ are the probabilities that a value $\param$ is inside and outside the $\epsilon$-neighborhood of $\targetconcept$, respectively.
    \end{assumption}
    \begin{remark}
        In addition to the smoothness condition on the likelihood imposed by \Cref{as:smoothness}, \Cref{as:smoothness-reweighted} imposes what is essentially a ceiling on the outputs of the relevance function.
        Weights $< 1$ ``flatten'', or smooth out, the likelihood function; weights $> 1$ ``sharpen'' it, and may cause violation of \Cref{as:smoothness-reweighted} even in cases where \Cref{as:smoothness} is met.
        The relevance functions used in our examples (\Cref{sec:experiments}) output weights $\leq 1$.
    \end{remark}

    \allowdisplaybreaks
    We obtain
    \begin{align}
        \rIG\left( \targetconcept \right) = &\E{\sourcedata, \auxInfo \sim P_{\truedataRV, \auxInfoRV}}{\log{\left( \frac{\E{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1} \vert \auxInfo}}{\robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right)}}{\E{\param, \tsparam^{\prime}_{n+1} \sim P_{\paramRV, \tsparamRV_{n+1}}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1}^{\prime} \right)}} \right)}} \nonumber \\
        \text{(\Cref{lem:jensen-gap})} ~~~~~~ \leq &\E{\sourcedata, \auxInfo \sim P_{\truedataRV, \auxInfoRV}}{\E{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1} \vert \auxInfo}}{\log{\left( \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right)}}} \nonumber \\
        &- \E{\sourcedata \sim P_{\truedataRV}}{\E{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}}{\log{\left( \E{\param \sim P_{\paramRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)} \right)}}} \nonumber \\
        = &\E{\sourcedata \sim P_{\truedataRV}}{\E{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}}{\log{\left( \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right)}}} \nonumber \\
        &- \E{\sourcedata \sim P_{\truedataRV}}{\E{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}}{\log{\left( \E{\param \sim P_{\paramRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)} \right)}}} \nonumber \\
        = &\E{\sourcedata, \tsparam_{n+1} \sim P_{\truedataRV, \tsparamRV_{n+1}}}{\log{\left( \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right)} - \log{\left( \E{\param \sim P_{\paramRV}}{\robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right)} \right)}} \nonumber \\
        \text{(\Cref{as:smoothness-reweighted})} \leq &\mathbb{E}_{\sourcedata, \tsparam_{n+1} \sim P_{\truedataRV, \tsparamRV_{n+1}}}\left[\log{\left( \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right)} - \left( \int_{N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( \robustL(\sourcedata, \targetconcept, \tsparam = \tsparam_{n+1}) \right)} - \right. \nonumber \\
        &\phantom{\mathbb{E}_{\sourcedata,\tsparam_{n+1} \sim P_{\truedataRV,\tsparamRV_{n+1}}}\left[\right. ~~~~~} \left. \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \log{\left( \E{\param \sim P^{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{\robustL(\sourcedata, \param, \tsparam = \tsparam_{n+1})} \right)} \right] \nonumber \\
        = &\mathbb{E}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \mathbb{E}_{\sourcedata \sim P_{\truedataRV}} \left[ \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \left( \log{\left( \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right)} \right. \right. \right. \nonumber \\
        &\left. \left. \left. \phantom{\mathbb{E}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \mathbb{E}_{\sourcedata \sim P_{\truedataRV}} \left[ \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \right. \right.} - \log{\left( \E{\param \sim P^{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)}_{\paramRV}}{\robustL(\sourcedata, \param, \tsparam = \tsparam_{n+1})} \right)} \right) \right] \right] \nonumber \\
        = &\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \mathbb{E}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \crossent{P_{\truedataRV}}{P_{\sourcedataRV^{\mathcal{R}(\tsparam_{n+1})} \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}, \tsparam = \tsparam_{n+1}}} \right. \nonumber \\
        &\phantom{\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \mathbb{E}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}}} - \left. \crossent{P_{\truedataRV}}{P_{\sourcedataRV^{\mathcal{R}(\tsparam_{n+1})} \vert \targetconcept, \tsparam = \tsparam_{n+1}}} \right] \nonumber \\
        = &\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \mathbb{E}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \ent{P_{\truedataRV}} + \kld{P_{\truedataRV}}{P_{\sourcedataRV^{\mathcal{R}(\tsparam_{n+1})} \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}, \tsparam = \tsparam_{n+1}}} \right. \nonumber \\
        &\phantom{\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \mathbb{E}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \right.} \left.- \ent{P_{\truedataRV}} - \kld{P_{\truedataRV}}{P_{\sourcedataRV^{\mathcal{R}(\tsparam_{n+1})} \vert \targetconcept, \tsparam = \tsparam_{n+1}}} \right] \nonumber \\
        = &\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \mathbb{E}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}} \left[ \kld{P_{\truedataRV}}{P_{\sourcedataRV^{\mathcal{R}(\tsparam_{n+1})} \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}, \tsparam = \tsparam_{n+1}}} \right. \nonumber \\
        &\phantom{\left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right) \mathbb{E}_{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}}} \left. - \kld{P_{\truedataRV}}{P_{\sourcedataRV^{\mathcal{R}(\tsparam_{n+1})} \vert \targetconcept, \tsparam = \tsparam_{n+1}}} \right]
    \end{align}
    as stated in the theorem for $A = \left( \int_{\paramvalues \backslash N_{\epsilon} \left( \targetconcept \right)} p(\param) ~ d\param \right)$ and $C = \E{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}}{\kld{P_{\truedataRV}}{P_{\sourcedataRV^{\mathcal{R}(\tsparam_{n+1})} \vert \param \in \paramvalues \backslash \neighborhood{\targetconcept}, \tsparam = \tsparam_{n+1}}}}$.

    \subsection{Proof of Proposition 5.5}\label{ap:reweighted-rho}
        The proof depends on the following definition:
        \begin{definition}[Fidelity of the relevance function $\rho^{\mathcal{R}}$]\label{def:rho}
            $\rho^{\mathcal{R}}$ is a measure of the fidelity of the relevance function.
            More specifically, it is:
            \begin{align}
                \rho^{\mathcal{R}} \equiv \mathbb{E}_{\sourcedata, \tsparam_{n+1} \sim P_{\truedataRV,\tsparamRV_{n+1}}}
                &\left[ \frac{1}{n} \sum_{i=1}^n \left( \csim{i}{\tsparam_{n+1}} - \frac{1}{n} \sum_{j=1}^n \csim{j}{\tsparam_{n+1}} \right) \right. \nonumber \\
                &\phantom{\frac{1}{n} \sum_{i=1}^n} \left.
                \left( \log{\left( p\left( \sourcedata_i \vert \targetconcept, \tsparam_i = \tsparam_{n+1} \right) \right)} -  \frac{1}{n} \sum_{j=1}^n \log{\left( p\left( \sourcedata_j \vert \targetconcept, \tsparam_j = \tsparam_{n+1} \right) \right)} \right)
                \right], \nonumber
            \end{align}
            i.e., $\rho^{\mathcal{R}}$ is the covariance of $\csim{i}{\tsparam_{n+1}}$ and $\log{\left( p\left( \sourcedata_i \vert \targetconcept, \tsparam_i = \tsparam_{n+1} \right) \right)}$ with respect to a uniform distribution over the source data, in expectation over $P_{\truedataRV,\tsparamRV_{n+1}}$.
        \end{definition}

        $\rDelta$ can be rewritten as
        \begin{align}\label{eq:cor-reweighting}
            \rDelta &= \E{\tsparam_{n+1} \sim P_{\tsparamRV_{n+1}}}{\E{\sourcedata \sim P_{\truedataRV}}{\log{\left( \frac{L\left( \sourcedata, \targetconcept, \truetsparam \right)}{\robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right)} \right)}}} \nonumber \\
            &= - \E{\sourcedata, \tsparam_{n+1} \sim P_{\truedataRV,\tsparamRV_{n+1}}}{\log{\left( \robustL\left( \sourcedata, \targetconcept, \tsparam = \tsparam_{n+1} \right) \right)}} - \ent{P_{\truedataRV}} \nonumber \\
            &= - \E{\sourcedata,\tsparam_{n+1} \sim P_{\truedataRV,\tsparamRV_{n+1}}}{\sum_{i=1}^n \csim{i}{\tsparam_{n+1}} \log{\left( p\left( \sourcedata_i \vert \targetconcept, \tsparam_i = \tsparam_{n+1} \right) \right)}} - \ent{P_{\truedataRV}} \nonumber \\
            &= \E{\sourcedata,\tsparam_{n+1} \sim P_{\truedataRV,\tsparamRV_{n+1}}}{\left( \sum_{i=1}^n \csim{i}{\tsparam_{n+1}} \right) \left( - \sum_{i=1}^n \log{\left( p\left( \sourcedata_i \vert \targetconcept, \tsparam_i = \tsparam_{n+1} \right) \right)} \right)} - n \rho^{\mathcal{R}} - \ent{P_{\truedataRV}} \nonumber \\
            &= \E{\sourcedata,\tsparam_{n+1} \sim P_{\truedataRV,\tsparamRV_{n+1}}}{\left( \sum_{i=1}^n \csim{i}{\tsparam_{n+1}} \right) \left( - \log{\left( p\left( \sourcedata \vert \targetconcept, \tsparam = \tsparam_{n+1} \right) \right)} \right)} - n \rho^{\mathcal{R}} - \ent{P_{\truedataRV}} ~~~~~~~~~ \text{(\Cref{as:theta-psi-ind})} \nonumber
        \end{align}
        as stated in the proposition for $D = - \ent{P_{\truedataRV}}$.

        \begin{remark}
            As discussed in \Cref{sec:step2}, in practice the learner can often specify a sufficiently high-fidelity relevance function even in the absence of knowledge of $\targetconcept$, i.e., a relevance function for which $\rho^{\mathcal{R}}$ is sufficiently large.
            An example relevance function is given in \Cref{eq:iterR}.
            However, this relevance function is not guaranteed to positively correlate with $p\left( \sourcedata_i \vert \targetconcept, \tsparam_i = \tsparam_{n+1} \right)$.
            If the learner is particularly unlucky, this relevance function could have a negative corresponding value of $\rho^{\mathcal{R}}$, i.e., increase the relevance of source data points \textnormal{least} likely under a particular pseudo-intervention.
            This may occur if $\param$ and $\tsparam$ interact such that the direction of the gradient of predictions with respect to $\tsparam$ depends on $\param$.
            For example, consider a case in which for all except very few values of $\param$, outcomes increase as a function of $\tsparam$.
            In the context of our motivating example of treatment effect estimation, this might correspond to a situation where hospital quality generally increases the relative effectiveness of a treatment, unless the treatment effect is very extreme (in which case hospital quality has a larger impact on the effect of a placebo).
            If the true treatment effect is in fact very extreme, the relevance function shown in \Cref{eq:iterR} would likely negatively correlate with $p\left( \sourcedata_i \vert \targetconcept, \tsparam_i = \tsparam_{n+1} \right)$.
        \end{remark}

\section{DETAILS OF EXAMPLES}\label{ap:experiments}
    We here report the details of the examples described in \Cref{sec:experiments}.
    \Cref{ap:linreg} gives details of the linear regression example, \Cref{ap:smoking} gives details of the example predicting smoking behavior, and \Cref{ap:gp} gives details of the GP regression example and results showing the relative performance of \algAbbrev{} as a function of the values of each of several simulation parameters.

    \subsection{Linear regression}\label{ap:linreg}
        All simulations were run using only a CPU.
        In all simulations, the value of the shared parameter $\targetconcept = -1$.
        The prior $P_{\paramRV,\tsparamRV_i} = \mathcal{N}\left( [ 0, 0 ]^{\top}, \mathrm{diag}\left( [ 1, 1 ] \right) \right)$ for all $i \in 1:n+1$.
        
        To generate source data, we first specified a particular level of multicollinearity $\rho$.
        A higher degree of multicollinearity makes $\targetconcept$ and $\targettsparam$ harder to separately identify, so we interpret this as a higher risk of negative transfer.
        We varied $\rho$ among 0 (no multicollinearity), 1 (mild multicollinearity), and 2 (extreme multicollinearity).
        
        For a given value $\rho$ we sampled values $\sourcex^{\prime} \sim \mathcal{N}\left( \rho, .25 \right)$, and then constructed values $\sourcex_{(\cdot,1)} \sim \mathcal{N}\left( \sourcex^{\prime}, .25 \right)$ and values $\sourcex_{(\cdot,2)} \sim \mathcal{N}\left( -\frac{\rho^2}{\sourcex^{\prime}}, .25 \right)$.
        We created 100 such data points.
        Twenty-five of these data points were used to create proxy information (i.e., used to generate values $\auxInfo_i$ as described below), and 75 were used as outcome information on the basis of which to estimate the shared parameter.

        In the simulations shown in \Cref{fig:betabinom}, all proxy values are uncontaminated.
        In the simulations shown in \Cref{fig:linear-contamination}, $\rho = 2$ always, i.e., all simulations are run in the presence of extreme multicollinearity.

        \paragraph{Relevance function.}
            We first computed the relevances as $\csim{i}{\tsparam_{n+1}} \propto p\left( \sourcedata_i \vert \tsparam_i = \tsparam_{n+1} \right) = \E{\param \sim P_{\paramRV}}{p\left( \sourcedata_i \vert \param, \tsparam_i = \tsparam_{n+1} \right)}$ where the constant of proportionality was the probability a distribution with the same variance would assign to its mode.
            Using the calculated relevances, we computed $P^{\mathcal{R}}_{\paramRV, \targettsparamRV \vert \sourcedata, \auxInfo}$.
            We then defined $\widehat{P}^{\mathcal{R}}_{\paramRV}$ as a Gaussian approximation to samples from the r-weighted posterior $P^{\mathcal{R}}_{\paramRV \vert \sourcedata, \auxInfo}$, recomputed each $\csim{i}{\tsparam_{n+1}} \propto \E{\param \sim \widehat{P}^{\mathcal{R}}_{\paramRV}}{p\left( \sourcedata_i \vert \param, \tsparam_i = \tsparam_{n+1} \right)}$, and recomputed the r-weighted posterior.
            In each simulation, we repeated this three times before ultimately defining the relevance function as an expectation across the distribution $\widehat{P}^{\mathcal{R}}_{\paramRV}$ obtained at the final iteration.

        \paragraph{Proxy information.}
            When $q\%$ of proxy values are contaminated, the remaining $(100 - q)\%$ of proxy values are generated as $\auxInfo \sim \mathrm{Binomial}\left( 7, \tilde{p}(\sourcedata^{\prime} \vert \tsparam^{\prime} = \targettsparam) \right)$ where $\sourcedata^{\prime}$ are observations used to prompt the synthetic expert for feedback, $\tsparam^{\prime}$ are the corresponding task parameters, and $\tilde{p}$ indicates that the probability has been normalized to not exceed 1.
            The remaining $q\%$ of proxy values are generated as $\auxInfo \sim \mathrm{Binomial}\left( 7, 1 - \tilde{p}(\sourcedata^{\prime} \vert \tsparam^{\prime} = \targettsparam) \right)$.

    \subsection{Predicting smoking behavior}\label{ap:smoking}
        All computations were run using only a CPU.
        The prior for all effects in both the classic and r-weighted models was $\mathcal{N}\left( 0, 3 \right)$.

        The classic Bayesian learner estimated the fixed effects model
        \[
            \sourcey_i \vert \sourcex_i, \param, \tsparam \sim \mathrm{Binomial}\left( N_i, \mathrm{sigmoid}\left( \param \sourcex_{i,(1:4)}^{\top} + \tsparam \sourcex_{i,(5:28)}^{\top} \right) \right)
        \]
        where $\sourcex_{i,(1:4)}$ are indicators of the treatment received, $\sourcex_{i,(5:28)}$ are study indicators, and $N_i$ is the number of patients who received the indicated treatment in the indicated study.
        The classic learner's estimate of the study indicator for the target task was conditioned on the proxy information (generated as described in the main text), and their estimate of $\left( \param, \tsparam \right)$ used standard Bayesian updating to condition on the source data.
        
        The r-weighted Bayesian learner estimated the model
        \[
            \sourcey_i \vert \sourcex_i, \param, \tsparam_{n+1} \sim \mathrm{Binomial}\left( N_i, \mathrm{sigmoid}\left( \param \sourcex_{i,(1:4)}^{\top} + \tsparam_{n+1} \right) \right).
        \]
        The r-weighted learner's estimate of $\left( \param, \tsparam_{n+1} \right)$ used the following proxy-informed r-weighted likelihood of the 23 source data points $p^{\mathcal{R}}\left( \sourcedata, \auxInfo \vert \param, \tsparam_{n+1} \right)$:
        \begin{align}
            p^{\mathcal{R}}\left( \sourcedata, \auxInfo \vert \param, \tsparam_{n+1} \right) &= \robustL\left( \sourcedata, \param, \tsparam = \tsparam_{n+1} \right) ~ p\left( \auxInfo \vert \tsparam_{n+1} \right) \nonumber \\
            &= p\left( \auxInfo \vert \tsparam_{n+1} \right) \prod_{i=1}^{n} p\left( \sourcey_i \vert \sourcex_i, \param, \tsparam_i = \tsparam_{n+1} \right)^{\csim{i}{\tsparam_{n+1}}}. \nonumber
        \end{align}

        \paragraph{Proxy information.}
            To simulate proxy information, we sampled $\auxInfo \sim \mathcal{N}\left( \targettsparam, \sigma \right) + \mathbbm{1}_{\mathrm{noisy}} \epsilon$ where $\mathbbm{1}_{\mathrm{noisy}}$ indicates whether proxy contamination is present and $\epsilon \sim \mathcal{N}\left( 0, 3 \right)$ is the bias added to contaminate the proxy information.
            Since we do not know the true value $\targettsparam$, we approximated $\targettsparam$ by the mean of the corresponding fixed effect distribution estimated in a model that incorporated data from all 24 studies.
            
            When proxy information is \textit{weakly informative}, $\sigma = 3$ and $\mathbbm{1}_{\mathrm{noisy}} = 0$.
            When proxy information is \textit{highly informative}, $\sigma = .1$ and $\mathbbm{1}_{\mathrm{noisy}} = 0$.
            When proxy information is \textit{misleading}, $\sigma = 3$ and $\mathbbm{1}_{\mathrm{noisy}} = 1$.
            The value of $\mathbbm{1}_{\mathrm{noisy}}$ is unknown to the learner, who always models the proxy information as completely uncontaminated (i.e., as if $\mathbbm{1}_{\mathrm{noisy}} = 0$).

    \subsection{Gaussian Process regression}\label{ap:gp}
        \begin{figure}[t!]
            \begin{subfigure}{.32\linewidth}
                \includegraphics[width=\linewidth]{figs/gp_0.pdf}
                \caption{Amount of source data.
                    The advantage is more pronounced when more source data and proxy information are available.
                    See interpretation in the text.
                }
                \label{fig:gp-data-proxy}
            \end{subfigure}\hfill\begin{subfigure}{.32\linewidth}
                \includegraphics[width=\linewidth]{figs/gp_1.pdf}
                \caption{Covariate resolution.
                    The advantage is more pronounced for higher covariate resolutions.
                    See interpretation in the text. \\ ~
                }
                \label{fig:gp-res}
            \end{subfigure}\hfill\begin{subfigure}{.32\linewidth}
                \includegraphics[width=\linewidth]{figs/gp_2.pdf}
                \caption{Number of iterations for refinement of the relevance function $T$. \\ ~ \\ ~}
                \label{fig:gp-T}
            \end{subfigure}
            \begin{subfigure}{.32\linewidth}
                \includegraphics[width=\linewidth]{figs/gp_3.pdf}
                \caption{Amount of target information. \\ ~ \\ ~}
                \label{fig:gp-source-dist}
            \end{subfigure}\hfill\begin{subfigure}{.32\linewidth}
                \includegraphics[width=\linewidth]{figs/gp_4.pdf}
                \caption{Value of $\targetconcept$.
                    The advantage is more pronounced for lower values of $\targetconcept$.
                    See interpretation in the text.
                }
                \label{fig:gp-target}
            \end{subfigure}\hfill\begin{subfigure}{.32\linewidth}
                \includegraphics[width=\linewidth]{figs/gp_5.pdf}
                \caption{Amount of proxy contamination.
                See interpretation in the text of \Cref{sec:gp}. \\ ~ }
                \label{fig:gp-proxy-noise}
            \end{subfigure}
            \caption{Advantage of learning with an r-weighted likelihood in the GP regression setting as a function of the simulation parameter indicated in the subfigure caption.
                Each plot shows the interquartile range (boxes) and outliers (points), across all values of all other simulation parameters, of the mean of $\rIG\left( \targetconcept \right) - \cIG\left( \targetconcept \right)$ across 50 simulations.
            }
            \label{fig:gp-supp}
        \end{figure}
        Each simulation was run on a single Nvidia A100 GPU.\footnote{The set of simulations run under 36 sets of simulation parameters (.5\% of all sets of simulation parameters) did not complete successfully.
            For an additional 22 sets of simulation parameters (.3\% of the total number of all sets of simulation parameters), all simulations encountered runtime errors.
        }
        The priors were $P_{\paramRV} = \mathrm{Lognormal}\left( 1, 1 \right)$ and $P_{\tsparamRV_i} = \mathrm{Gamma}\left( 3, .8 \right)$ for all $i \in 1:n+1$.
        
        For each simulation, we generated 80 trajectories drawn from a GP of the form given in the main text.
        From these 80 trajectories, trajectories $1:m_t$ were generated from the target task, where the \textit{amount of target information} $m_t$ was a variable simulation parameter (see below).
        Trajectories $(m_t+1):80$ were generated under a task parameter sampled at random from the learner's prior.
        Since these trajectories comprise most of the source data (see discussion of the effect of $m_t$ below), the learner's prior is in most cases relatively well-specified.
        In this sense, the results in \Cref{fig:gp-supp} are a somewhat conservative test of \algAbbrev{}.
        
        Trajectories $1:m_s$ were then used to create synthetic proxy information, while trajectories $(81-m_s):80$ (i.e., the last $m_s$ trajectories) were used for estimation of the target parameter (i.e., as source data), where the \textit{amount of source data} $m_s$ was a variable simulation parameter (see below).
        Proxy information was generated in the same way as for the linear regression example (see \Cref{ap:linreg}).
        
        The relevance function was computed using the same method described in \Cref{ap:linreg}, with the exception that the number of iterations $T$ used for refinement of the relevance function was a variable simulation parameter.

        After observing that results were affected by the value of some simulation parameters, we varied these parameters across simulations.
        We varied the following simulation parameters:
        \begin{itemize}
            \item \textbf{Amount of source data $m_s$:}
                This parameter, which took values in $\{ 8, 16, 24, 32, 40 \}$, controlled the number of trajectories in the source data.
                A distinct set of the same number of trajectories was used to create synthetic proxy information.
                Notice that the number of trajectories in the source data always equals the number of trajectories used to create proxy information, i.e., when more source data is available more proxy information is also available.
            
            \item \textbf{Covariate resolution:}
                Trajectories were evaluated on a grid of evenly-spaced values $\sourcex$ ranging from 0 to 1.
                This parameter, which took values in $\{ 5, 10, 20, 30 \}$, controlled the resolution and size of that grid.

            \item \textbf{Number of iterations for refinement of the relevance function $T$:}
                This parameter, which took values in $\{ 0, 1, 2, 3 \}$, controlled the number of iterations used for refinement of the relevance function.

            \item \textbf{Amount of target information $m_t$:}
                This parameter, which took values in $\{ 10, 20, 30, 40, 50, 60, 70 \}$, controlled the number of trajectories generated by the target task.
                Notice that observations from the target task are almost exclusively used to create synthetic proxy information (the exception is when $m_t > 80-m_s$, in which case the source data contains $m_t + m_s - 80$ trajectories from the target task).
                This reflects that the learner uses proxy information \textit{instead of} direct observations from the target task (i.e., instead of fine-tuning in the target task).
                The amount of target information $m_t$ to a certain extent admits a parallel interpretation as the number of trajectories a learner with both \textit{knowledge of the data sources} and \textit{the ability to fine-tune} (neither of which is available to the learners in our setting) would have to adapt to the target task, i.e., as the cost of operating in the setting of unknown data sources.

            \item \textbf{Value of $\targetconcept$:}
                We set $\targetconcept$ to either 1 (left tail of $P_{\paramRV}$), $e$ (median of $P_{\paramRV}$), or 6 (right tail of $P_{\paramRV}$).

            \item \textbf{Amount of proxy contamination:}
                We generated and contaminated synthetic proxy information in the same way as described for the linear regression example in \Cref{sec:treatment-effect}.
                This parameter, which took values in $\{ 0, .25, .5, .75 \}$, controlled the fraction of proxy values which were contaminated.
        \end{itemize}

        \Cref{fig:gp-supp} shows how the relative performance of \algAbbrev{} depends on the value of each of the simulation parameters listed above.

        \Cref{fig:gp-data-proxy} shows that \algAbbrev{}'s advantage is more pronounced when more source data and proxy information are available.
        In other words, when the source data is sparse, the classic learner performs on par with the r-weighted learner.
        We speculate that this result reflects that in cases of source data sparsity both methods gain equally little information about $\targetconcept$.
        \Cref{fig:gp-res} shows a similar effect of the informativeness of the source data: \algAbbrev{}'s advantage is more pronounced for higher covariate resolutions.
        When the covariate resolution is low, the covariates are relatively far apart and so all observations will be relatively uncorrelated regardless of the value of the shared and task parameters.
        In these cases, observations provide little information about the smoothness of the underlying function.
        As in cases of source data sparsity, we speculate that this result reflects that in cases of disparate observations both methods gain equally little information about $\targetconcept$.

        \Cref{fig:gp-target} shows that \algAbbrev{}'s advantage is more pronounced for smaller values of $\targetconcept$.
        This may be because of the effect of the value of $\targetconcept$ on the threat of negative transfer: 
        Values of $\tsparam^{\star}_i$ tend to be large (the distribution from which source task parameters are drawn is right-skewed), and the learner partially attributes the effect of a larger bandwidth in the task-specific component of the kernel to the shared component of the kernel.
        When the bandwidth of the shared component of the kernel is small, the result is negative transfer.
        In this sense, \Cref{fig:gp-target} corroborates the result shown in \Cref{fig:betabinom} that r-weighting is especially effective in the presence of the threat of negative transfer.

\bibliography{bibliography}