\section{Preliminaries for Appendix}\label{sec:app_prelims}
\subsection{Hoeffding's Inequality}
We state the well known Hoeffding's inequality.
\begin{theorem} \label{thm:Hoeffding}
    Let $X_1, X_2,\dots, X_m$ be independent random variables such that $a_i \leq X_i \leq b_i$ for each $i \in \{1,\dots,m\}$. Consider the sum of these random variables $S_m = X_1 + X_2+ \dots + X_m$. Then we have for all $t>0$,
        $\Pr(|S_m - \E[S_m]| \geq t) \leq 2 \exp\left( -\frac{2t^2}{\sum_{i=1}^m (b_i-a_i)^2}\right)$.
\end{theorem}

\subsection{Differentiation w.r.t. a vector} \label{app:diffvec}
We state basic identities for differentiation with respect to vectors. For $\bx, \by \in \mathbb{R}^d$ and symmetric $A \in \mathbb{R}^{d \times d}$, we have:
\begin{equation}
    \frac{\partial}{\partial \bx^{\sf T}} (\bx^{\sf T}A\by) = A\by,  \qquad 
    \frac{\partial}{\partial \bx^{\sf T}} (\bx^{\sf T}A\bx) = 2A\bx
    \nonumber
\end{equation}
For reference, see Appendix~C of [W. Yang, W. Cao, T. Chung, J. Morris: Applied Numerical Methods Using MATLAB, 2007].
\subsection{Case of singular covariance matrix}
If $\bm{\mu}\bm{\mu}^{\sf T} + \bm{\Sigma}$ is not invertible,
observe that any $\bx \sim N(\bm{\mu}, \bm{\Sigma})$ is in the linear space spanned by the eigenvectors of $\bm{\mu}\bm{\mu}^{\sf T} + \bm{\Sigma}$ corresponding to non-zero eigenvalues. Thus, one can consider this reduced space, in which case the minimum non-zero eigenvalue is given by the reciprocal of the operator norm of the pseudo-inverse of $\bm{\mu}\bm{\mu}^{\sf T} + \bm{\Sigma}$ (see Section A.5.4 of [S. P. Boyd and L. Vandenberghe. Convex Optimization, 2014]). The projection of $\bm{\mu}$ into that space yields the new mean vector.

\subsection{Gaussian Random Vectors and their  Concentration}


We also state the equivalent of Theorem~\ref{thm:prelim-random} for Gaussian distributions as given in \cite{Wainwright-HDP}.

\begin{lemma}\label{lem:gaussconc}
    \label{lem:subgauss_conc}
    Consider $\bX_1, \bX_2, \dots, \bX_m$ in $\mathbb{R}^d$ iid from $N(\bm{\mu}, \bm{\Sigma})$. Then we have with probability $1-\delta$,
    \begin{equation}
    \left\| \frac{1}{m} \sum_{i=1}^{m} \left(\bX_i \bX_i^{\sf T} - \mathbb{E}[\bX_i \bX_i^{\sf T}]\right) \right\|
    \leq O\left( \|\bm{\Sigma}\| \sqrt{\log\left(\frac{1}{\delta}\right)}\sqrt{\frac{d}{m}}\right).
    \nonumber
    \end{equation}
\end{lemma}

\input{ProofofgeneralGaussian}

\section{Closure of neural networks under transformation}\label{sec:transform}
We consider a concept class $\mc{F}$ of regressors with bounded outputs in $[0,1]$ which is closed under the following transformation: for any $f \in \mc{F}$, $f_b = bf + (1-b)\E[f] \in \mc{F}$ for any $b \in [0,1]$. Common regression neural networks whose final activation is a ReLU are closed under this transformation. Their output can be scaled multiplicatively by uniformly scaling the input weights of the final layer. A scalar translation can be achieved by adding a constant to the output.



\section{Estimation for Theorem~\ref{thm:main3}}\label{sec:estimation}
The estimation of $\E[f]$ can be done by averaging the bag labels of $m'$ bag samples. As $f(\bx) \in [0,1]$, we can use Hoeffding's inequality (Theorem~\ref{thm:Hoeffding}) to bound the error in the estimate $E_{m'}$. Applying it to the sum $m' E_{m'}$ of the $m'$ bag labels with deviation $m't$, we get $\Pr(|E_{m'} - \E[f] | \geq t ) \leq 2 \exp(-2m't^2)$. To get an absolute error of at most $t$ with probability at least $1-\delta$, we would need $m' = \log(2/\delta)/(2t^2)$ many bag samples. We can estimate $\E[f]$ very accurately with a high probability with a relatively small number of samples. Hence, we exclude the error in the estimation of $\E[f]$ in the analysis of Theorem~\ref{thm:main3} and assume that this constant is known exactly for simplicity.

\section{Experiments over Noisy Synthetic Data}
We conduct experiments on data generated by adding Gaussian noise $N(0,\sigma^2)$ to linear synthetic labels generated using the same methodology as before for bag size $q=5$, dimension $d=10$. We compare our algorithm's robustness to Gaussian noise against the {\sf Instance-MIR} and {\sf Aggregation-MIR} baselines in Table~\ref{tab:noiselinear}. We conduct further experiments by adding $N(0,\sigma^2)$ Gaussian noise to $2$-layer neural network synthetic labels, generated as before, for bag size $q = 5$, dimension $d=5$, and report the results in Table~\ref{tab:noisenn}. In Tables~\ref{tab:noiselinear} and~\ref{tab:noisenn}, we observe that our algorithm performs favorably under Gaussian noise and is robust.

\begin{table}[ht]
\centering
\caption{Linear Regression MIR over noisy synthetic data}
\label{tab:noiselinear}
\begin{tabular}{cccc}
\toprule
$\sigma^2$ & Instance-MIR & $\mathcal{A}$ & Agg-MIR \\
\midrule
$0.0$ & $7.771 \pm 0.109$ & $0.166 \pm 0.068$ & $0.184 \pm 0.101$ \\
$0.1$ & $7.787 \pm 0.208$ & $0.159 \pm 0.060$ & $0.125 \pm 0.046$ \\
$0.5$ & $7.728 \pm 0.225$ & $0.170 \pm 0.053$ & $0.105 \pm 0.054$ \\
$1.0$ & $7.711 \pm 0.196$ & $0.193 \pm 0.067$ & $0.159 \pm 0.071$ \\
$2.0$ & $7.757 \pm 0.231$ & $0.165 \pm 0.088$ & $0.175 \pm 0.058$ \\
$5.0$ & $7.698 \pm 0.184$ & $0.364 \pm 0.153$ & $0.346 \pm 0.107$ \\
$10.0$ & $8.349 \pm 0.467$ & $1.574 \pm 0.580$ & $1.469 \pm 0.815$ \\
\bottomrule
\end{tabular}
\end{table}

\begin{table}[ht]
\centering
\caption{Neural Network MIR over noisy synthetic data}
\label{tab:noisenn}
\begin{tabular}{cccc}
\toprule
$\sigma^2$ & Instance-MIR & $\mathcal{A}$ & Agg-MIR \\
\midrule
$0.0$ & $0.446 \pm 0.025$ & $0.168 \pm 0.040$ & $0.427 \pm 0.076$ \\
$0.01$ & $0.432 \pm 0.021$ & $0.206 \pm 0.051$ & $0.369 \pm 0.088$ \\
$0.05$ & $0.465 \pm 0.039$ & $0.193 \pm 0.078$ & $0.345 \pm 0.092$ \\
$0.1$ & $0.442 \pm 0.019$ & $0.160 \pm 0.027$ & $0.363 \pm 0.025$ \\
$0.5$ & $0.467 \pm 0.026$ & $0.235 \pm 0.045$ & $0.399 \pm 0.101$ \\
$1.0$ & $0.472 \pm 0.029$ & $0.441 \pm 0.042$ & $0.558 \pm 0.080$ \\
\bottomrule
\end{tabular}
\end{table}