\section{Proof of Theorem \ref{thm:main3}}\label{sec:thm3proof}
For convenience, let us define 
\begin{equation}
    \Delta(h) := \E_{(B, y_B) \leftarrow \mc{D}_{\tn{bag}}(\mc{D}, f, q)} L_{\tn{bag}}\left(B, y_B, h\right). \label{eq:Delta}
\end{equation}
Take $B = \{\bx_{B1}, \dots, \bx_{Bq}\}$ to be a random bag from $\mc{D}_{\tn{bag}}(\mc{D}, f, q)$ where $y_B = f(\bx_{B1})$ as each feature-vector in $B$ is independent and u.a.r. from $\mc{D}$.  Thus, 
    \begin{eqnarray}
    \Delta(h) & = & \E_{\{\bx_{Bj} \leftarrow \mc{D}\,\mid\, j = 1, \dots, q\}}\left[\sum_{j=1}^q\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2\right] \nonumber \\ & = & \E_{\bx_{B1}, \bx_{B2} \leftarrow \mc{D}}\bigl[\left(h(\bx_{B1}) - f(\bx_{B1})\right)^2 \nonumber \\ &{}& + (q-1)\left(h(\bx_{B2}) - f(\bx_{B1})\right)^2\bigr] \label{eq:Delta-simple}
    \end{eqnarray}
    where $\bx_{B1}, \bx_{B2}$ are iid from $\mc{D}$.

We will first do the analysis for an unbiased target concept $f$, i.e., one satisfying $\E_{\bx \leftarrow \mc{D}}\left[f(\bx)\right] = 0$. 
The following lemma shows that any regressor $h$ for which $\Delta(h)$ is close to its optimal value must have low error w.r.t.\ a scaled version of $f$.

\begin{lemma}
    Consider any $f \in \mc{F}$ s.t. $\E_{\bx \leftarrow \mc{D}}\left[f(\bx)\right] = 0$. Then, letting $\tilde{f} := f/q$, for any $h : \mbc{X} \to \mathbb{R}$,
    \begin{equation}
        \Delta(h) = \Delta(\tilde{f}) + \tn{err}_2(\mc{D}, \tilde{f}, h). \label{eq:neuralerrbd-tilde}
    \end{equation}
    In particular, $\tilde{f}$ minimizes $\Delta(h)$ over all regressors $h$.
\end{lemma}
\begin{proof}
    From \eqref{eq:Delta-simple} we have
    \begin{align}
        & \Delta(h) \nonumber \\ 
        & = \E\left[\left(h(\bx_{B1}) - f(\bx_{B1})\right)^2 + (q-1)\left(h(\bx_{B2}) - f(\bx_{B1})\right)^2\right] \nonumber \\
        & = \E\left[\left((h(\bx_{B1}) - \tilde{f}(\bx_{B1})) + (\tilde{f}(\bx_{B1}) - f(\bx_{B1}))\right)^2 \right. \nonumber \\
        & \left. + (q-1)\left((h(\bx_{B2}) - \tilde{f}(\bx_{B2})) + (\tilde{f}(\bx_{B2}) - f(\bx_{B1}))\right)^2\right] \nonumber \\
        & = \E\left[\left(h(\bx_{B1}) - \tilde{f}(\bx_{B1})\right)^2 + \left(\tilde{f}(\bx_{B1}) - f(\bx_{B1})\right)^2 \nonumber \right.\nonumber \\ 
                    &\ \ \ \left. + 2\left(h(\bx_{B1}) - \tilde{f}(\bx_{B1})\right)\left(\tilde{f}(\bx_{B1}) - f(\bx_{B1})\right)\right. \nonumber \\
        &\ + \left. (q-1) \left[\left(h(\bx_{B2}) - \tilde{f}(\bx_{B2})\right)^2 + \left(\tilde{f}(\bx_{B2}) - f(\bx_{B1})\right)^2 \right. \right. \nonumber \\
        & + \left. \left.  2\left(h(\bx_{B2}) - \tilde{f}(\bx_{B2})\right)\left(\tilde{f}(\bx_{B2}) - f(\bx_{B1})\right)\right]\right]. \nonumber
    \end{align}
    Using the fact that $\bx_{B1}$ and $\bx_{B2}$ are iid, $\E[f(\bx)] = 0$, and $\tn{err}_2(\mc{D}, \tilde{f}, h) = \E\left[\left(h(\bx) - \tilde{f}(\bx)\right)^2\right]$, the above simplifies to 
    \begin{eqnarray}
        \Delta(h) & = & \Delta(\tilde{f}) + \tn{err}_2(\mc{D}, \tilde{f}, h)\nonumber \\ 
                    & & + 2\E\left[(1/q - 1)\left(h(\bx_{B1}) - \tilde{f}(\bx_{B1})\right)f(\bx_{B1})\right] \nonumber \\ 
                    & &  + 2(q-1)\E\left[(1/q)\left(h(\bx_{B2}) - \tilde{f}(\bx_{B2})\right)f(\bx_{B2})\right] \nonumber \\
                  & = & \Delta(\tilde{f}) + \tn{err}_2(\mc{D}, \tilde{f}, h)\nonumber \\ 
                    & & + 2\E_{\bx\leftarrow \mc{D}}\left[(1/q - 1)\left(h(\bx) - \tilde{f}(\bx)\right)f(\bx)\right. \nonumber \\
                   & &  + \left.(1 - 1/q)\left(h(\bx) - \tilde{f}(\bx)\right)f(\bx)\right] \nonumber \\
                & = & \Delta(\tilde{f}) + \tn{err}_2(\mc{D}, \tilde{f}, h),
    \end{eqnarray}
    completing the proof.
\end{proof}
We now move to a general target $f$ which may have non-zero expectation. Observing that $\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2 = \left((h(\bx_{Bj}) - \E[f]) - (f(\bx_{B1}) - \E[f])\right)^2$ and applying the previous lemma, we obtain
\begin{equation}
        \Delta(h) = \Delta(\hat{f}) + \tn{err}_2(\mc{D}, \hat{f}, h), \label{eq:neuralerrbd-hat}
    \end{equation}
where $\hat{f} = f/q + (1 - 1/q)\E[f]$.
We will now show that the optimizer of the loss on the sampled bags, w.h.p., yields an approximation to $\hat{f}$.  As per our assumptions, $\mc{F}$ is a class of functions defined over $\mbc{X}$ with range $[0,1]$ and ${\sf Pdim}(\mc{F}) = r$, and it contains $f$ as well as $\hat{f}$.
For the rest of the proofs we shall fix $\mc{B}$ to be a collection of $m$ bags sampled from  $\mc{D}_{\tn{bag}}(\mc{D}, f, q)$. The loss corresponding to $\Delta(h)$ on $\mc{B}$ is given by:
\begin{equation}
    \Delta(\mc{B}, h) := \frac{1}{m}\sum_{B = \{\bx_{Bi}\,\mid\, i\in [q]\}\in \mc{B}}\sum_{j=1}^q\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2 \label{eq:Delta-sample} 
\end{equation}
\begin{lemma}\label{lem:neural3}
    With probability at least $1 - 4q\left(\frac{32emq}{\eps r}\right)^r\tn{exp}\left(-\frac{(\eps/q)^2m}{32}\right)$ over the choice of $\mc{B}$, for any $h \in \mc{F}$,  $\left| \Delta(\mc{B}, h) - \Delta(h)\right| \leq \eps$. 
\end{lemma}
\begin{proof}
    Consider a random bag $B = \{\bx_{B1}, \dots, \bx_{Bq}\}$. For each $j \in [q]$, applying Theorem~17.1 of \cite{Anthony-Bartlett} to the marginal distribution of $(\bx_{Bj}, f(\bx_{B1}))$, we obtain that with probability at least $1 - 4\left(\frac{32emq}{\eps r}\right)^r\tn{exp}\left(-\frac{(\eps/q)^2m}{32}\right)$ over $\mc{B}$, for all $h \in \mc{F}$,
    \begin{equation}
    \begin{aligned}
        & &\biggl|\E_{B = \{\bx_{B1},\dots, \bx_{Bq}\}}\left[\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2\right] \biggr. \\ \biggl. & & - \frac{1}{m}\sum_{B = \{\bx_{Bi}\,\mid\, i\in [q]\}\in \mc{B}}\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2\biggr| \leq \eps/q
    \end{aligned}    
    \end{equation}
    where the expectation on the LHS is over a random bag $B$ from $\mc{D}_{\tn{bag}}(\mc{D}, f, q)$.  Thus, in the following we use a union bound to obtain
    \begin{eqnarray}
        & & \left| \Delta(\mc{B}, h) - \Delta(h)\right| \nonumber \\
        & = & \biggl|\sum_{j=1}^q\biggl[\E_{B = \{\bx_{B1},\dots, \bx_{Bq}\}}\left[\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2\right] \biggr. \biggr. \nonumber \\
        & & \biggl.\biggl.  -  \frac{1}{m}\sum_{B = \{\bx_{Bi}\,\mid\, i\in [q]\}\in \mc{B}}\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2\biggr]\biggr| \nonumber \\
        & \leq & \sum_{j=1}^q\biggl|\E_{B = \{\bx_{B1},\dots, \bx_{Bq}\}}\left[\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2\right] \biggr. \nonumber \\
        & & \biggl. - \frac{1}{m}\sum_{B = \{\bx_{Bi}\,\mid\, i\in [q]\}\in \mc{B}}\left(h(\bx_{Bj}) - f(\bx_{B1})\right)^2\biggr| \nonumber \\
        & \leq & q \left(\frac{\eps}{q}\right) = \eps \nonumber
    \end{eqnarray}
    with probability $1 - 4q\left(\frac{32emq}{\eps r}\right)^r\tn{exp}\left(-\frac{(\eps/q)^2m}{32}\right)$.
    \end{proof}
For convenience we define $\zeta := 4q\left(\frac{32emq}{\eps r}\right)^r\tn{exp}\left(-\frac{(\eps/q)^2m}{32}\right)$. Using the above we prove the following lemma.
\begin{lemma}\label{lem:neural2}
    With probability $1 - \zeta$,  any $h \in \mc{F}$ s.t. $\Delta(\mc{B}, h) \leq \Delta(\mc{B}, \hat{f})$ satisfies, $\Delta(h) \leq \Delta(\hat{f}) + 3\eps$. 
\end{lemma}
\begin{proof}
    From Lemma \ref{lem:neural3} we have that with probability $1 - \zeta$,
    \begin{equation}
        \left| \Delta(\mc{B}, h) - \Delta(h)\right| \leq \eps , \left| \Delta(\mc{B}, \hat{f}) - \Delta(\hat{f})\right| \leq \eps, \forall h \in \mc{F}. \label{eq:lemneural23}
    \end{equation}
    Suppose for a contradiction that there is some $h' \in \mc{F}$ s.t. 
    \begin{equation}
        \Delta(\mc{B}, h') \leq \Delta(\mc{B}, \hat{f}) \label{eq:lemneural21}
    \end{equation}
    and
    \begin{equation}
        \Delta(h') > \Delta(\hat{f}) + 3\eps. \label{eq:lemneural22}
    \end{equation}
    Using \eqref{eq:lemneural22} along with \eqref{eq:lemneural23} yields $\Delta(\mc{B}, h') \geq \Delta(h') - \eps > \Delta(\hat{f}) + 2\eps \geq \Delta(\mc{B}, \hat{f}) + \eps$, which is a contradiction to \eqref{eq:lemneural21}.
\end{proof}
\begin{proof} (of Theorem \ref{thm:main3}). Observe that if $h$ minimizes $\Delta(\mc{B}, h')$ over all choices of $h' \in \mc{F}$, then $\Delta(\mc{B}, h) \leq \Delta(\mc{B}, \hat{f})$ since by our assumption, $\hat{f} \in \mc{F}$. Thus, applying Lemma \ref{lem:neural2} we obtain $\Delta(h) \leq \Delta(\hat{f}) + 3\eps$ which by \eqref{eq:neuralerrbd-hat} implies that $\tn{err}_2(\mc{D}, \hat{f}, h) \leq 3\eps$. The theorem statement is obtained by replacing $\eps$ with $\eps/3$ in the above proof, and substituting the value of $m$ as in the statement of the theorem so that $\zeta \leq \delta$. 
The estimation of $\E[f]$ can be done using additional bag samples and we defer the details to Appendix \ref{sec:estimation}.
\end{proof}