\section{Linear regressors over $N(\mb{0}, \mb{I})$}\label{sec:linear-special}
\begin{algorithm}
%\dontprintsemicolon
\caption{PAC Learner for $f(\bx) := \br^{\sf T}\bx$  over $N(\mb{0}, \mb{I})$}\label{alg:one}
% \SetKwInOut{Input}{Input}
% \SetKwInOut{Output}{Output}
{\bf Input:} $\mathcal{D}_{\mathsf{bag}}(\mathcal{D} = N(\mb{0},\mb{I}), f = \mathsf{Lin}, q), m, q, \text{where } f(\bx) := \br^{\sf T}\bx$. \\
1. Sample a collection $\mc{B}$ of $m$ iid bags from $\mathcal{D}_{\tn{bag}}(\mathcal{D} , f , q  )$.\\ %Let $\mc{B} = \{B_i\}_{i=1}^{m}$ be the sampled bags with $\{ y_{B_i}\}_{i=1}^{m}$ be the corresponding labels.\\
2. Define $\hat{L}(\mc{B}, \bv) = \frac{1}{m}\sum_{B \in \mc{B}} \sum_{\bx \in B} (y_B - \bv^{\sf T}\bx)^2$, use convex optimisation to find $\hat{\bv}_\tn{min} = \tn{argmin}_{\bv} \hat{L}(\mc{B}, \bv)$.\\
3. Output $\hat \br = \left(\frac{1}{m}\sum_{B \in \mc{B}}\sum_{\bx \in B} \bx \bx^{\sf T} \right)\hat{\bv}_\tn{min}$.
\end{algorithm}
For the setting of homogeneous linear regressors over $N(\mb{0}, \mb{I})$, we provide Algorithm \ref{alg:one}. Note that in Step 2 of Algorithm \ref{alg:one}, $\hat{L}(\mc{B}, \bv) = \sum_{B \in \mc{B}}L_{\tn{bag}}(B, y_B, h)$ where $h(\bx) := \bv^{\sf T}\bx$. 

\begin{lemma}
    \label{lem:lin-1}
    For any $\eps, \delta \in (0,1)$, if $m \geq O\left(d q^2 \log{(\frac{q}{\delta})} \|\br\|_2^2/\eps\right)$, then $\hat \br$ returned in Algorithm \ref{alg:one} satisfies $\|\hat\br -\br\|_2 \leq \sqrt{\eps}$ with probability $1-\delta$.
\end{lemma}
We defer the proof of Lemma~\ref{lem:lin-1} to the next subsection.
\begin{lemma}
    \label{lem:lin-2}
    Let $\eps \in (0,1)$ and suppose that $\hat\br$ returned by Algorithm~\ref{alg:one} satisfies $\|\hat\br -\br\|_2 \leq \sqrt{\eps}$. Then $h(\bx) = \hat\br^{\sf T} \bx$ satisfies $\tn{err}_2(\mc{D}, f, h) \leq \eps$.
\end{lemma}
\begin{proof}(of Lemma \ref{lem:lin-2})
   $\tn{err}_2(\mc{D}, f, h) = \E_{\bx \sim \mc{D}}\left[(f(\bx) - h(\bx))^2\right] = \E_{\bx \sim N(\mb{0}, \mb{I})}[((\br -\hat\br)^{\sf T} \bx)^2] = \Var[(\br -\hat\br)^{\sf T} \bx] + \left(\E[(\br -\hat\br)^{\sf T} \bx]\right)^2$. Now, note that $(\br -\hat\br)^{\sf T} \bx \sim N(0, \|\br -\hat\br\|_2^2)$, so the variance term equals $\|\br -\hat\br\|_2^2$ and the mean term vanishes. Hence $\tn{err}_2(\mc{D}, f, h) = \|\br -\hat\br\|_2^2 \leq \eps$.
\end{proof}
Since $\|f\|_2 \geq \|\br\|_2$ and $q \geq \log q$, Lemmas~\ref{lem:lin-1} and~\ref{lem:lin-2} together show that for $m \geq O\left(d q^2 \log{(\frac{q}{\delta})} \|f\|_2^2/\eps\right)$, Algorithm~\ref{alg:one} outputs $h$ such that $\tn{err}_2(\mc{D}, f, h) \leq \eps$ with probability at least $1-\delta$. The convex optimisation subroutine called inside Algorithm~\ref{alg:one} runs in time $\tn{poly}[d,q,(1/\eps),\log(1/\delta)]$, so the overall running time of Algorithm~\ref{alg:one} is $\tn{poly}[d, q, (1/\eps), \log(1/\delta), \|f\|_2]$. This completes the proof of Theorem~\ref{thm:main1} for the setting of homogeneous linear regressors and $\mc{D} = N(\mb{0}, \mb{I})$.

\subsection{Proof of Lemma \ref{lem:lin-1}}


Taking $B = \{\bx_{B1}, \dots, \bx_{Bq}\}$ to be a random bag from $\mc{D}_{\tn{bag}}(\mc{D}, f, q)$, one can assume $y_B = f(\bx_{B1}) = \br^{\sf T}\bx_{B1}$ as each feature-vector in $B$ is iid from $N(\mb{0}, \mb{I})$. Using this:
\begin{eqnarray}
    \hat{L}(\mc{B}, \bv) & = & \frac{1}{m}\sum_{B \in \mc{B}}[(\br^{\sf T} \bx_{B1} - \bv^{\sf T} \bx_{B1})^2\nonumber\\
    && +\sum_{j=2}^q (\br^{\sf T} \bx_{B1} - \bv^{\sf T}\bx_{Bj})^2] \nonumber \\
    & = & (\br-\bv)^{\sf T} \mb{A} (\br - \bv) + (q-1) \br^{\sf T} \mb{A} \br \nonumber\\
    &&+ \sum_{j=2}^q(\bv^{\sf T} \mb{C}_j \bv - \br^{\sf T} \mb{D}_j^{\sf T} \bv - \bv^{\sf T} \mb{D}_j \br)  
\end{eqnarray}

where $\mb{A} = \frac{1}{m} \sum_{B \in \mc{B}} \bx_{B1}\bx_{B1}^{\sf T}$, $\mb{C}_j =\frac{1}{m}\sum_{B \in \mc{B}} \bx_{Bj}\bx_{Bj}^{\sf T}$, and $\mb{D}_j = \frac{1}{m}\sum_{B \in \mc{B}} \bx_{Bj}\bx_{B1}^{\sf T}$.

We define $\hat{\bv}_\tn{min} = \mathrm{argmin}_{\bv} \hat{L}(\mc{B}, \bv)$ as used in Algorithm~\ref{alg:one}. $\hat{L}(\mc{B}, \bv)$ is convex in $\bv$, hence $\hat{\bv}_\tn{min}$ can be found by solving $\displaystyle \frac{\partial \hat{L}(\mc{B}, \bv)} {\partial \bv} = 0$, which yields (see Appendix \ref{app:diffvec}),
\[
    0 = \left.\frac{\partial \hat{L}(\mc{B}, \bv)}{\partial \bv}\right|_{\bv = \hat\bv_{\tn{min}}} = 2\mb{A} \left(\hat\bv_\tn{min} - \br\right) + \sum_{j=2}^{q}\left(2\mb{C}_j\hat\bv_\tn{min} - 2\mb{D}_j\br\right),
\]
so that
\begin{eqnarray}
    \hat\bv_\tn{min} &=& \left(\mb{A} + \sum_{j=2}^{q}\mb{C}_j\right)^{-1} \left(\mb{A} + \sum_{j=2}^{q}\mb{D}_j\right)\br\nonumber\\
    &=& \left(\frac{1}{m}\sum_{B \in \mc{B}} \sum_{j=1}^q \bx_{Bj}\bx_{Bj}^{\sf T} \right)^{-1}\left(\mb{A} + \sum_{j=2}^{q}\mb{D}_j\right)\br\nonumber
\end{eqnarray}
As defined in Algorithm~\ref{alg:one}, $\hat \br$ is 
\begin{equation}
    \hat \br =  \left( \frac{1}{m}\sum_{B \in \mc{B}} \sum_{j=1}^q \bx_{Bj}\bx_{Bj}^{\sf T} \right) \hat\bv_\tn{min} = \left(\mb{A} + \sum_{j=2}^{q}\mb{D}_j\right)\br.
\end{equation}
Note that $\E_{\mc{B}}[\mb{A}] = \E_{\mc{B}}[\mb{C}_j] = \mb{I}$ and $\E_{\mc{B}}[\mb{D}_j] = \mb{0}$, since $\bx_{Bj}$ ($B\in \mc{B}, j \in [q]$) are iid $N(\mb{0}, \mb{I})$. Thus, we have
\begin{eqnarray}
    \left\| \hat \br -\br \right\| & \leq & \left\|\mb{A} - \mb{I} +  \sum_{j=2}^{q}\mb{D}_j\right\| \|\br \| \nonumber \\ & \leq & \left\|\mb{A} - \mb{I}\right\| \|\br\| + \sum_{j=2}^{q} \left\| \mb{D}_j\right\| \|\br\|
    \label{eq:bound}
\end{eqnarray}
    by the triangle inequality. As $m \geq O\left(d \log{(\frac{q}{\delta})}\|\br\|_2^2 q^2/\eps\right)$, using Theorem~\ref{thm:prelim-random} we obtain
    \begin{equation}
         \Pr\left[\left\|\mb{A}-\mb{I}\right\| \leq \frac{\sqrt{\eps}}{2q\|\br\|}\right]\geq 1-\frac{\delta}{2q}. \label{eq:boundonAminusI}
    \end{equation}
    % NOTE(review): the earlier polarization step claimed
    % (x_{B1}+x_{Bj})(x_{B1}+x_{Bj})^T - (x_{B1}-x_{Bj})(x_{B1}-x_{Bj})^T = 4 x_{B1} x_{Bj}^T,
    % but the difference is 2(x_{B1} x_{Bj}^T + x_{Bj} x_{B1}^T), which only controls D_j + D_j^T.
    % The argument below bounds D_j itself via the stacked 2d-dimensional vectors.
    % It assumes Theorem thm:prelim-random applies to N(0, I_{2d}) with the same sample bound up to constants -- confirm.
    Further, fix any $j \in \{2,\dots, q\}$ and, for each $B \in \mc{B}$, let $\mb{z}_{Bj} \in \mathbb{R}^{2d}$ denote the concatenation of $\bx_{B1}$ and $\bx_{Bj}$. Since $\bx_{B1}$ and $\bx_{Bj}$ are independent $N(\mb{0}, \mb{I})$ vectors, $\{\mb{z}_{Bj}\}_{B\in \mc{B}}$ are iid $N(\mb{0}, \mb{I}_{2d})$, and
    \begin{equation}
        \mb{M}_j := \frac{1}{m}\sum_{B \in \mc{B}} \mb{z}_{Bj}\mb{z}_{Bj}^{\sf T} = \begin{pmatrix} \mb{A} & \mb{D}_j^{\sf T} \\ \mb{D}_j & \mb{C}_j \end{pmatrix}. \nonumber
    \end{equation}
    Applying Theorem~\ref{thm:prelim-random} in dimension $2d$, we have
    \begin{equation}
        \Pr\left[\left\|\mb{M}_j - \mb{I}_{2d}\right\| \leq \frac{\sqrt{\eps}}{2q\|\br\|}\right] \geq 1-\frac{\delta}{2q}.\label{eqn:2outerbounds}
    \end{equation}
    Observe that $\mb{D}_j$ is an off-diagonal block of $\mb{M}_j - \mb{I}_{2d}$, and the operator norm of a block of a matrix is at most the operator norm of the whole matrix. Thus $\|\mb{D}_j\| \leq \|\mb{M}_j - \mb{I}_{2d}\|$, which together with \eqref{eqn:2outerbounds} gives us
    \begin{equation}
        \Pr\left[\|\mb{D}_j\| \leq \frac{\sqrt{\eps}}{2q\|\br\|}\right]\geq 1-\frac{\delta}{2q}. \nonumber
    \end{equation}
    Combining \eqref{eq:boundonAminusI}, \eqref{eqn:2outerbounds} along with \eqref{eq:bound} and a union bound over $j$, we obtain
    \begin{equation}
         \Pr[\|\hat\br -\br\| \leq \sqrt{\eps}] \geq 1-\delta. \label{eq:matrixperturb}
    \end{equation}