\section{Theoretical Analysis} \label{sec:theory}
Let us denote the source distribution with instance level labels by $D_S$; $D_S=\{(x,y) | x \in \mathbb{R}^d, y \in \mathbb{R}\}$ 
and the target distribution with bag level labels by $D_T$; $D_T=\{(X,Y,\bar{y}) | X \in \mathbb{R}^{b\times d}, Y \in \mathbb{R}^b, \bar{y} \in \mathbb{R}\}$. 
Here, $Y$ is the vector of instance level labels for all instances in a bag. $Y$ is assumed to be unknown. 
Note that $d$ is the dimension of the input feature vector and $b$ is the number of instances in a bag. 
Also, let the hypothesis class under consideration be denoted by $\mathcal{H}$; $\mathcal{H}=\{h: x\rightarrow y | x \in \mathbb{R}^d, y \in \mathbb{R}, h(x)=W_h^T\phi_h(x)+b_h\}$.
The goal is to learn an optimal hypothesis which performs the best on instance level target data. Let us denote the optimal hypothesis by $h^*$; $h^* = \argmin\limits_{h\in \mathcal{H}} \left(\sum\limits_{(X, Y)\in D_T} \sum\limits_{i\in [b]}\left[Y_i-h(X_i)\right]^2\right)$.
Next, we define a few error terms:
\begin{align*}
    \epsilon^I_T(h) &= \frac{\sum\limits_{X, Y \in D_T}\frac{1}{b}\sum\limits_{i\in [b]}\left(Y_i-h(X_i)\right)^2}{|D_T|}\\
    \epsilon^B_T(h) &= \frac{\sum\limits_{X, \bar{y} \in D_T}\left(\bar{y}-\frac{1}{b}\sum\limits_{i\in [b]}h(X_i)\right)^2}{|D_T|}\\
    \epsilon^I_S(h) &= \frac{\sum\limits_{x, y \in D_S}\left(y-h(x)\right)^2}{|D_S|}
\end{align*}

Although we would like to find an optimal hypothesis by minimizing $\epsilon^I_T(h)$, doing so requires instance-level labels from the target domain, which are unknown. Hence, we derive an upper bound for $\epsilon^I_T(h)$ that does not rely on knowledge of instance-level labels from the target domain. In the derivation below, $\epsilon_{IB} = \epsilon^I_T(h) - \epsilon^B_T(h)$ denotes the gap between the instance-level and bag-level target errors.

\begin{equation*}
\begin{split}
    \epsilon^I_T(h) &\leq \epsilon^I_S(h) + |\epsilon^I_T(h) - \epsilon^I_S(h)|\\
    &\leq \epsilon^I_S(h) + |\epsilon_{IB}| + |\epsilon^B_T(h) - \epsilon^I_S(h)|\\
    &\leq \epsilon^I_S(h) + |\epsilon_{IB}| \\
    &+\left|\frac{\sum\limits_{X, \bar{y} \in D_T}\left(\bar{y}-\frac{1}{b}\sum\limits_{i\in [b]}h(X_i)\right)^2}{|D_T|} \right.\\
    &-\left.\frac{\sum\limits_{x, y \in D_S}\left(y-h(x)\right)^2}{|D_S|}\right|\\
    &\leq \epsilon^I_S(h) + |\epsilon_{IB}| + \left(\frac{\sum\limits_{X,\bar{y}\in D_T}\bar{y}^2}{|D_T|} + \frac{\sum\limits_{x,y\in D_S}y^2}{|D_S|}\right)\\
    &+ \left(\frac{\sum\limits_{X,\bar{y}\in D_T}\left(\sum\limits_{i\in [b]}h(X_i)\right)^2}{b^2|D_T|} + \frac{\sum\limits_{x,y\in D_S}h^2(x)}{|D_S|}\right)\\
    &+2\left|\frac{\sum\limits_{X,\bar{y}\in D_T}\bar{y}\sum\limits_{i\in [b]}h(X_i)}{b|D_T|} - \frac{\sum\limits_{x,y\in D_S}yh(x)}{|D_S|}\right|\\
    &\leq \epsilon^I_S(h) + |\epsilon_{IB}| + c_1\\
    &+ \left(\frac{\sum\limits_{X,\bar{y}\in D_T}\left(\sum\limits_{i\in [b]}h(X_i)\right)^2}{b^2|D_T|} + \frac{\sum\limits_{x,y\in D_S}h^2(x)}{|D_S|}\right)\\
    &+2\left|\frac{\sum\limits_{X,\bar{y}\in D_T}\bar{y}\sum\limits_{i\in [b]}h(X_i)}{b|D_T|} - \frac{\sum\limits_{x,y\in D_S}yh(x)}{|D_S|}\right|\\
    &\leq \epsilon^I_S(h) + |\epsilon_{IB}| + c_1\\
    &+ \left(\frac{\sum\limits_{X,\bar{y}\in D_T}\left(\sum\limits_{i\in [b]}h(X_i)\right)^2}{b^2|D_T|} + \frac{\sum\limits_{x,y\in D_S}h^2(x)}{|D_S|}\right)\\
    &+2\left|W_h^T\left(\frac{\sum\limits_{X,\bar{y}\in D_T}\bar{y}\sum\limits_{i\in [b]}\phi_h(X_i)}{b|D_T|} - \frac{\sum\limits_{x,y\in D_S}y\phi_h(x)}{|D_S|}\right)\right.\\
    &\left.+b_h\left(\frac{\sum\limits_{X,\bar{y}\in D_T}\bar{y}}{|D_T|}-\frac{\sum\limits_{x,y\in D_S}y}{|D_S|}\right)\right|\\
    &\leq \epsilon^I_S(h) + |\epsilon_{IB}| + c_1 + \|W_h\|_2^2 + b_h^2\\
    &+ \left(\frac{\sum\limits_{X,\bar{y}\in D_T}\left(\sum\limits_{i\in [b]}h(X_i)\right)^2}{b^2|D_T|} + \frac{\sum\limits_{x,y\in D_S}h^2(x)}{|D_S|}\right)
\end{split}
\end{equation*}
