\section{Proof of Lemma \ref{lem:main1}}\label{sec:lemmadiffbd}
Using the definitions in Section \ref{sec:prelim} define $\bu_j := (1/k)\sum_{i \in I_j} \phi(\bx_i)$ so that  $\frac{1}{k}\sum_{i \in I_j}h(\bx_i) = \br_h^{\sf T}\bu_j$. 
\begin{align}
    &\bar{\eps}(\mc{B}, h) 
    = \frac{1}{m}\sum_{j=1}^m\left[\left(\left(\frac{1}{k}\sum_{i \in I_j}h(\bx_i)\right) - y_{B_j}\right)^2\right] \nonumber \\
    = & \frac{1}{m}\sum_{j=1}^m\left[\left(\frac{1}{k}\sum_{i \in I_j}h(\bx_i)\right)^2 + y_{B_j}^2 - 2y_{B_j}\br_h^{\sf T}\bu_j \right] \nonumber \\
    \leq & \frac{1}{m}\sum_{j=1}^m\left[\frac{1}{k}\sum_{i \in I_j}(h(\bx_i)^2 + y_i^2) - 2y_{B_j}\br_h^{\sf T}\bu_j \right] \label{eqn:barepsbd}
\end{align}
where the last upper bound uses Cauchy-Schwarz inequality. On the other hand,
\begin{align}
    & \hat{\eps}(\mc{S}, h) = \frac{1}{mk}\sum_{i=1}^{mk}\left[\left(h(\bz_i) - \ell_i\right)^2\right] \nonumber \\
    = &  \frac{1}{mk}\sum_{i=1}^{mk}\left[h(\bz_i)^2 + \ell_i^2 - 2\ell_i\br_h^{\sf T}\phi(\bz_i)\right] 
\end{align}
Using the above along with \eqref{eqn:barepsbd} we obtain,
\begin{align*}
    &\bar{\eps}(\mc{B}, h) - \hat{\eps}(\mc{S}, h) \nonumber \\
    \leq & \frac{1}{mk}\sum_{i=1}^{mk}\left(h(\bx_i)^2 - h(\bz_i)^2\right) \nonumber \\ 
    & \quad + 2\br_h^{\sf T}\left(\frac{1}{mk}\sum_{i=1}^{mk}\ell_i\phi(\bz_i) - \frac{1}{m}\sum_{j=1}^{m}y_{B_j}\bu_j\right) \nonumber \\
    & \qquad + \frac{1}{mk}\sum_{i=1}^{mk}\left(y_i^2 - \ell_i^2\right)
\end{align*}
Notice that the second term on the RHS of the above is $\leq \xi(\mc{S}, \mc{B})\left\|\br_h\right\|_2$. Taking $\lambda'(\mc{S}, \mc{T})= \left|1/(mk)\sum_{i=1}^{mk}\left(y_i^2 - \ell_i^2\right)\right|$ and $R(h,\mc{S}, \mc{T}) = \left|1/(mk)\sum_{i=1}^{mk}\left(h(\bx_i)^2 - h(\bz_i)^2\right)\right|$ completes the proof of Lemma \ref{lem:main1}.

\section{Proof of Theorem \ref{thm:main1}} \label{sec:proofthmmain1}
The proof proceeds by first reformulating the process of sampling the $m$ training bags as: (i) sample $2mk$ examples from $\mc{D}_T$, (ii) partition them into $m$ disjoint $(2k)$-sized subsets, and (iii) from each subset randomly choose $k$ points to include in a bag, to obtain $m$ $k$-sized bags. First, for a fixed sample of $2mk$ examples and regressor $h \in \mc{F}$, we use the randomness in step (iii) along with concentration bounds to show that with high probability the bag-level MSE loss of $h$ on the bags is at least an $\Omega(1/k)$-fraction of its loss on the sampled instances. A union bound over a fine-grained $\ell_\infty$ cover of $\mc{F}$ essentially allows us to restrict ourselves to regressors in the cover. The randomness in step (i) is used along with standard generalization error bounds to show that the instance-level sample loss of every $h \in \mc{F}$ can be replaced with the distributional loss. The parameter $m$ is chosen to make the error probability arbitrarily small. The rest of this section contains the formal proof.

We first describe the following equivalent way of sampling the target training bags $\mc{B} = \mc{B}(m,k) = \{(B_j,y_{B_j})\,\mid\,j=1,\dots, m\}$.
\begin{enumerate}[nolistsep,noitemsep]
    \item Let $\mc{Z} :=  \{(\bx_i, y_i)\,\mid\, i = 1,\dots, 2mk\}$ be $2mk$ iid examples from $\mc{D}_T$.
    \item Define $\ol{I}_j = \{2k(j-1)+1, \dots, 2kj\}$, $j=1, \dots, m$ be a partition of $[2mk]$ into $m$ disjoint subsets.
    \item Independently for each $j = 1,\dots, m$, let $I_j$ be a random subset of $\ol{I}_j$ of exactly $k$ indices.
    \item For each $j = 1, \dots, m$, let $B_j = \{\bx_i\,\mid\, i \in I_j\}$ with bag-labels $y_{B_j} = (1/k)\sum_{i \in I_j}y_i$.
\end{enumerate}
Let us first fix $h \in \mc{F}$ and $\mc{Z}$ and prove a lower bound on the bag-level loss.

\medskip
\noindent
\textbf{Analysis for fixed $h$ and $\mc{Z}$.}
Let us assume that $\hat{\eps}(\mc{Z}, h) = \zeta$, for some $\zeta \geq 0$. For convenience let $z_i = h(\bx_i) - y_i$, $i = 1, \dots, 2mk$. Note that since $y_i, h(\bx_i) \in [0,1]$, $|z_i| \leq 1$. Let $\ol{\mc{Z}}^{(j)} = \{(\bx_i, y_i)\,\mid\, i \in \ol{I}_j\}$ be the restriction of $\mc{Z}$ to the indices in $\ol{I}_j$, so that $\frac{1}{m}\sum_{j=1}^m \hat{\eps}(\ol{\mc{Z}}^{(j)}, h) = \hat{\eps}(\mc{Z}, h)$.
Over the choice of $\{I_j\}_{j=1}^m$ define the random variable $L_j := \left[\left(\frac{1}{k}\sum_{i\in I_j}h(\bx_i)\right) - y_{B_j}\right]^2$. Since $y_{B_j} = (1/k)\sum_{i\in I_j}y_i$, $L_j = \left(\frac{1}{k}\sum_{i\in I_j}z_i\right)^2 \leq \left(\frac{1}{k}\sum_{i\in I_j}|z_i|\right)^2$.
Since $I_j \subseteq \ol{I}_j$ and $|z_i| \leq 1$ for all $i$, this implies
\begin{eqnarray}
 L_j & \leq & \tn{min}\left\{1, \left(\frac{1}{k}\sum_{i\in \ol{I}_j}|z_i|\right)^2\right\} \nonumber \\
& \leq &  \tn{min}\left\{1, \frac{2}{k}\sum_{i\in \ol{I}_j}|z_i|^2\right\} =: \gamma_j  \label{eqn:upperbd}
\end{eqnarray}
since $\sum_{i\in \ol{I}_j}|z_i| \leq \sqrt{2k}\sqrt{\sum_{i\in \ol{I}_j}|z_i|^2}$ by the Cauchy-Schwarz inequality.
 Note that after fixing $\mc{Z}$, the choices of $I_1, \dots, I_m$ are independent of each other, and each $L_j$ only depends on the choice of $I_j$.
\begin{eqnarray}
& & \E\left[L_j\right]  =  \E\left[\left(\frac{1}{k}\sum_{i\in I_j}z_i\right)^2\right] \nonumber \\
& = & \frac{1}{k^2}\left(\sum_{r\in \ol{I}_j}z_r^2\Pr[r \in I_j] + \sum_{\substack{r,s\in \ol{I}_j\\ r\neq s}}z_rz_s\Pr[r,s\in I_j]\right) \nonumber 
\end{eqnarray}
Since $I_j$ is a random subset of $\ol{I}_j$ of $k$ out of $2k$ indices, $\Pr[r \in I_j\,\mid\, r \in \ol{I}_j] = 1/2$ and $\Pr[r, s \in I_j\,\mid\, r,s \in \ol{I}_j, r\neq s] = (k-1)/(2(2k-1))$ which simplifies the RHS of the above to:
\begin{eqnarray}
 & & \frac{1}{2k^2}\left[\left(1 - \frac{k-1}{2k-1}\right)\sum_{r\in \ol{I}_j}z_r^2 +   \frac{k-1}{2k-1}\sum_{r,s\in \ol{I}_j}z_rz_s\right] \nonumber \\
 & \geq & \frac{1}{2k^2}\left[\frac{1}{2}\sum_{r\in \ol{I}_j}z_r^2 + \frac{k-1}{2k-1}\left(\sum_{r\in \ol{I}_j}z_r\right)^2\right] \nonumber \\
 & \geq & \frac{1}{4k^2}\sum_{r\in \ol{I}_j}z_r^2 \label{eqn:explowerbd}
\end{eqnarray}
Using \eqref{eqn:upperbd} one can apply Hoeffding's inequality to obtain for any $t \geq 0$ (see Appendix \ref{app:hoeffdings}),
\begin{eqnarray}
& & \Pr\left[\sum_{j=1}^mL_j \leq \E\left[\sum_{j=1}^mL_j\right] - t\right]\nonumber \\
&\leq& 2\tn{exp}\left(\frac{-2t^2}{\sum_{j=1}^m\gamma_j^2}\right)  \nonumber \\
& \leq & 2\tn{exp}\left(\frac{-2t^2}{\left(\max\{\gamma_j\}_{j=1}^m\right)\sum_{j=1}^m\gamma_j}\right) \nonumber \\
& \leq & 2\tn{exp}\left(\frac{-t^2k}{\sum_{j=1}^m\sum_{i\in \ol{I}_j}z_i^2}\right) \nonumber
\end{eqnarray}
By definition we have $\sum_{j=1}^m\sum_{i\in \ol{I}_j}z_i^2 = \sum_{i=1}^{2mk}z_i^2 = 2\zeta mk$. Thus, the above along with \eqref{eqn:explowerbd} yields
$\Pr\left[\sum_{j=1}^mL_j \leq \zeta m/(2k) - t\right] \leq  2\tn{exp}\left(\frac{-t^2}{2\zeta m}\right)$. Recalling that $\zeta = \hat{\eps}(\mc{Z}, h)$, and noting that $\sum_{j=1}^m L_j = m\ol{\eps}(\mc{B}, h)$ while taking $t = \zeta m /(4k)$  we obtain
\begin{equation}
    \Pr\left[\ol{\eps}(\mc{B}, h) \leq \frac{\hat{\eps}(\mc{Z}, h)}{4k}\right] \leq 2\tn{exp}\left(\frac{-\hat{\eps}(\mc{Z}, h) m}{32k^2}\right) \label{eqn:failureprob}
\end{equation}

\setlength{\tabcolsep}{3pt}


\begin{table*}[!htbp]
\captionsetup{font=small,labelfont=small}
\begin{minipage}{0.48\textwidth}
\caption{MSE scores for different methods and bag sizes on the IPUMS dataset (averaged over 10 runs). The source instance loss is $1.8714 \pm 0.08$ and target instance loss is $1.1237 \pm 0.00$. Lower is better.}
\centering
\scriptsize
\begin{tabular}{c|c|c|c|c}
\diagbox{\textbfne{Method}}{\textbfne{Bag Size}} & \textbfne{8} & \textbfne{32} & \textbfne{128} & \textbfne{256} \\ \hline
Bagged-Target & 1.14 $\pm$ 0.00 & 1.16 $\pm$ 0.00 & 1.22 $\pm$ 0.00 & 1.31 $\pm$ 0.01 \\ 
AF & 1.23 $\pm$ 0.01 & 1.31 $\pm$ 0.01 & 1.41 $\pm$ 0.02 & 1.43 $\pm$ 0.02 \\ 
LR & 1.15 $\pm$ 0.00 & 1.18 $\pm$ 0.00 & 1.24 $\pm$ 0.01 & 1.29 $\pm$ 0.01 \\ 
AF-DANN & 1.25 $\pm$ 0.02 & 1.33 $\pm$ 0.07 & 1.39 $\pm$ 0.07 & 1.39 $\pm$ 0.02 \\
LR-DANN & 1.16 $\pm$ 0.00 & 1.23 $\pm$ 0.02 & 1.51 $\pm$ 0.07 & 1.61 $\pm$ 0.13 \\ 
DMFA & 1.15 $\pm$ 0.00 & 1.18 $\pm$ 0.00 & 1.26 $\pm$ 0.01 & 1.30 $\pm$ 0.01 \\ 
PL-WFA (our) & 1.15 $\pm$ 0.00 & 1.18 $\pm$ 0.00 & 1.25 $\pm$ 0.01 & 1.29 $\pm$ 0.01 \\ 
BL-WFA (our) & \textbfne{1.14 $\pm$ 0.00} & \textbfne{1.16 $\pm$ 0.00} & \textbfne{1.22 $\pm$ 0.00} & \textbfne{1.25 $\pm$ 0.01} \\ 
\end{tabular}
\label{tab:usc}
\end{minipage}
\hfill
\begin{minipage}{0.48\textwidth}
\caption{MSE scores for different methods and bag sizes on the Wine dataset (averaged over 20 runs). The source instance loss is $195.5 \pm 1.2$ and target instance loss is $170.5 \pm 0.1$. Lower is better.}
\centering
\scriptsize
\begin{tabular}{c|c|c|c|c}
\diagbox{\textbfne{Method}}{\textbfne{Bag Size}}& \textbfne{8} & \textbfne{32} & \textbfne{128} & \textbfne{256} \\ \hline
Bagged-Target & \textbfne{173.5 $\pm$ 0.4} & \textbfne{177.7 $\pm$ 1.2} & 191.0 $\pm$ 2.5 & 206.9 $\pm$ 3.5  \\ 
AF& 186.8 $\pm$ 2.1 & 190.3 $\pm$ 2.8 & 191.0 $\pm$ 2.4 & 192.4 $\pm$ 1.8  \\ 
LR& 185.9 $\pm$ 2.0 & 191.6 $\pm$ 1.6 & 193.8 $\pm$ 0.8 & 194.5 $\pm$ 1.0  \\ 
AF-DANN& 187.6 $\pm$ 1.7 & 190.5 $\pm$ 1.7 & 191.2 $\pm$ 2.5 & 191.9 $\pm$ 2.1  \\ 
LR-DANN& 186.2 $\pm$ 1.5 & 192.1 $\pm$ 2.0 & 193.7 $\pm$ 2.4 & 193.8 $\pm$ 2.5  \\ 
DMFA& 186.1 $\pm$ 1.7 & 191.8 $\pm$ 2.1 & 193.5 $\pm$ 2.4 & 194.5 $\pm$ 0.9  \\ 
PL-WFA (our)& 183.0 $\pm$ 0.6 & 186.6 $\pm$ 1.0 & 189.0 $\pm$ 0.8 & 188.9 $\pm$ 1.2  \\ 
BL-WFA (our)& 180.9 $\pm$ 0.5 & 184.6 $\pm$ 0.7 & \textbfne{186.0 $\pm$ 0.8} & \textbfne{186.4 $\pm$ 0.5}  \\ 
\end{tabular}
\label{tab:wine}
\end{minipage}
\end{table*}

\medskip
\textbf{High probability bound for $\mc{F}$ and $\mc{Z}$.}
Let us fix the parameter $\nu$ in the statement of Theorem \ref{thm:main1}. We fix $\mc{Z}$ for now and consider the cover $\mc{C}_\infty(\xi, \mc{F}, \mc{Z})$ for some $\xi$ which we will choose later, and let $q_\infty = N_\infty(\xi, \mc{F}, 2mk)$ be the upper bound on its size. Let $\mc{C}_{\tn{err}} := \{\hat{h} \in \mc{C}_\infty(\xi, \mc{F}, \mc{Z})\,\mid\, \hat{\eps}(\mc{Z}, \hat{h}) \geq \nu/2\}$.
Taking a union bound of the error in \eqref{eqn:failureprob} over the at most $q_\infty$ elements of $\mc{C}_{\tn{err}}$, each of which satisfies $\hat{\eps}(\mc{Z}, \hat{h}) \geq \nu/2$, we obtain that:
\begin{eqnarray}
    & & \Pr\left[\forall \hat{h} \in \mc{C}_{\tn{err}}: \ol{\eps}(\mc{B}, \hat{h}) \geq \frac{\hat{\eps}(\mc{Z}, \hat{h})}{4k}\right] \nonumber \\
    & \geq & 1 - 2q_\infty\tn{exp}\left(\frac{-\nu m}{64k^2}\right) \label{eqn:unionbd}
\end{eqnarray}
Define $\hat{\mc{F}}_{\tn{err}} := \{h \in \mc{F}\,\mid\, \hat{\eps}(\mc{Z}, h) \geq 3\nu/4\}$. For any $h \in \hat{\mc{F}}_{\tn{err}}$ there is $\hat{h} \in \mc{C}_\infty(\xi, \mc{F}, \mc{Z})$ s.t. $|\hat{h}(\bx) - h(\bx)| \leq \xi$ for all $(\bx, y) \in \mc{Z}$. Now, $(\hat{h}(\bx) - y)^2 = (h(\bx) - y + \hat{h}(\bx) - h(\bx))^2 \geq (h(\bx) - y)^2 - 2|\hat{h}(\bx) - h(\bx)||h(\bx) - y| + (\hat{h}(\bx) - h(\bx))^2 \geq (h(\bx) - y)^2 - 2\xi$ since $h(\bx), y \in [0,1]$.
Similarly, consider any bag $B \in \mc{B}$. Using arguments analogous to above we obtain $\left(\E\left[h(\bx)\right] - y_B\right)^2 \geq \left(\E\left[\hat{h}(\bx)\right] - y_B\right)^2 - 2\left|\E[\hat{h}(\bx) - h(\bx)]\right|\left|\E\left[\hat{h}(\bx)\right] - y_B\right| \geq \left(\E\left[\hat{h}(\bx)\right] - y_B\right)^2 - 2\xi$, implying
\begin{equation}
    \hat{\eps}(\mc{Z}, \hat{h}) \geq \hat{\eps}(\mc{Z}, h) - 2\xi, \quad \ol{\eps}(\mc{B}, h) \geq \ol{\eps}(\mc{B}, \hat{h}) - 2\xi. \label{eqn:hatbd}
\end{equation}
Therefore, taking $\xi = \nu/(32k)$ we obtain from the first bound above that $\hat{h} \in \mc{C}_{\tn{err}}$ and further that $\hat{\eps}(\mc{Z}, \hat{h}) \geq 2\hat{\eps}(\mc{Z}, h)/3 \geq \nu/2 = 16k\xi$. Observe that $\ol{\eps}(\mc{B}, \hat{h}) \geq \hat{\eps}(\mc{Z}, \hat{h})/(4k)$ implies $\ol{\eps}(\mc{B}, \hat{h}) \geq 4\xi$, which in turn implies  $\ol{\eps}(\mc{B}, h) \geq \ol{\eps}(\mc{B}, \hat{h}) - 2\xi \geq \ol{\eps}(\mc{B}, \hat{h})/2$. Combining this with \eqref{eqn:unionbd} and \eqref{eqn:hatbd} we obtain,
\begin{align}
    \Pr&\left[\forall h \in \hat{\mc{F}}_{\tn{err}}: \ol{\eps}(\mc{B}, h) \geq \frac{\hat{\eps}(\mc{Z}, h)}{12k}\right] \nonumber \\
    &\geq 1 - 2q_\infty\tn{exp}\left(\frac{-\nu m}{64k^2}\right) \label{eqn:unionbd-2}
\end{align}
We now unfix $\mc{Z}$, and define $\mc{F}_{\tn{err}} = \{h \in \mc{F}\,\mid\, {\eps}(\mc{D}_T, h) \geq \nu\}$. By Theorem 17.1 of \cite{Anthony-Bartlett}, we obtain with probability at least $1 - 4q_1\tn{exp}\left(-2\nu^2mk/512\right)$ over the choice of $\mc{Z}$, $h \in \mc{F}_{\tn{err}} \Rightarrow h \in \hat{\mc{F}}_{\tn{err}}$ where $q_1 = N_1(\nu/64, \mc{F}, 4mk)$. Using this along with \eqref{eqn:unionbd-2}, we obtain that with probability at least $1 - 2q_\infty\tn{exp}\left(-\nu m/(64k^2)\right) - 4q_1\tn{exp}\left(-2\nu^2mk/512\right)$, $\forall h  \in \mc{F}_{\tn{err}}, \ol{\eps}(\mc{B}, h) \geq \frac{\hat{\eps}(\mc{Z}, h)}{12k} \geq \frac{3\nu}{48 k} = \frac{\nu}{16 k}$. Using the upper bounds in \eqref{eqn:coversize} we see that the probability is at least $1 - \delta$ if we choose $m \geq
O\left(\left(p\left(\log\left(\frac{k}{\nu}\right) + \log\log\left(\frac{1}{\delta}\right)\right) + \log\frac{1}{\delta}\right)\max\left\{\frac{1}{k\nu^2}, \frac{k^2}{\nu}\right\}\right)$. See Appendix \ref{app:sample_complexity_analysis} for more details.
This completes the proof of Theorem \ref{thm:main1}. %

