\section{Proof of Theorem~\ref{thm:main1}}\label{sec:thm1-app}

For the rest of the proof we assume that $f(\bx) := \br^{\sf T}\bx$ and $\mc{D} := N(\bm{\mu}, \bm{\Sigma})$. 

In addition, we assume $\bm{\mu}, \bm{\Sigma}$ are unknown and estimate them in one step of the algorithm. Let $\bm{\Gamma} = \bm{\mu}\bm{\mu}^{\sf T} + \bm{\Sigma}$ be the second moment matrix of $\mc{D}$.

\begin{algorithm}
\caption{PAC Learner for $f(\bx) := \br^{\sf T}\bx$  over $N(\bm{\mu}, \bm{\Sigma})$}\label{alg:two}
{\bf Input:} $\mathcal{D}_{\mathsf{bag}}(\mathcal{D} = N(\bm{\mu},\bm{\Sigma}), f = \mathsf{Lin}, q ), m, q, \text{where } f(\bx) := \br^{\sf T}\bx$. \\
1. Sample a collection $\mc{B}$ of $m$ iid bags from $\mathcal{D}_{\tn{bag}}(\mathcal{D} , f , q)$.\\ %Let $\mc{B} = \{B_i\}_{i=1}^{m}$ be the sampled bags with $\{ y_{B_i}\}_{i=1}^{m}$ be the corresponding labels.\\
2. Define $\hat{L}(\mc{B}, \bv) = \frac{1}{m}\sum_{B \in \mc{B}} \sum_{\bx \in B} (y_B - \bv^{\sf T}\bx)^2$, use convex optimisation to find $\hat{\bv}_\tn{min} = \tn{argmin}_{\bv} \hat{L}(\mc{B}, \bv)$.\\
3. Estimate the sample mean $\hat{\bm{\mu}} := \frac{1}{mq}\sum_{B \in \mc{B}}\sum_{\bx \in B} \bx$, and sample second moment $\hat{\bm{\Gamma}} := \frac{1}{mq}\sum_{B \in \mc{B}}\sum_{\bx \in B} \bx\bx^{\sf T}$.\\
4. Output $\hat \br = ((q-1)\hat{\bm{\mu}} \hat{\bm{\mu}}^{\sf T} + \hat{\bm{\Gamma}})^{\sf {-1}}\left(\frac{1}{m}\sum_{B \in \mc{B}}\sum_{\bx \in B} \bx \bx^{\sf T} \right)\hat{\bv}_\tn{min}$.
\end{algorithm}

\begin{lemma}
    \label{lem:genlin-1}
    For any $\eps, \delta \in (0,1)$, if $m \geq O\left(\frac{d q^2 \|\br\|_2^2 \log{(\frac{q}{\delta})}(\| \bm{\mu}\|+1) (\| \bm{\mu}\|^2 + \lambda_{\tn{max}}(\bm{\Sigma}))^3}{\lambda_{\tn{min}}^2(\bm{\Gamma})\eps}\right)$, then $\hat \br$ returned in Algorithm~\ref{alg:two} satisfies $\|\hat\br -\br\|_2 \leq \sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{\tn{max}}(\bm{\Sigma})}}$ with probability at least $1-\delta$.
\end{lemma}
We defer the proof of Lemma~\ref{lem:genlin-1} to the next subsection.
\begin{lemma}
    \label{lem:genlin-2}
    Let $\eps, \delta \in (0,1)$ and suppose that $\hat\br$ returned in Algorithm~\ref{alg:two} satisfies $\|\hat\br -\br\|_2 \leq \sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{\tn{max}}(\bm{\Sigma})}}$, then $h(\bx) = \hat\br^{\sf T} \bx$ satisfies $\tn{err}_2(\mc{D}, f, h) \leq \eps$ with probability $1-\delta$.
\end{lemma}
\begin{proof}(of Lemma \ref{lem:genlin-2})
   $\tn{err}_2(\mc{D}, f, h) = \E_{\bx \sim \mc{D}}\left[(f(\bx) - h(\bx))^2\right] = \E_{\bx \sim N(\bm{\mu}, \bm{\Sigma})}[((\br -\hat\br)^{\sf T} \bx)^2] = \Var[(\br -\hat\br)^{\sf T} \bx] + \mathbb{E}[(\br -\hat\br)^{\sf T} \bx]^2$. Now, note that $(\br -\hat\br)^{\sf T} \bx \sim N((\br -\hat \br)^{\sf T} \bm{\mu}, (\br -\hat\br)^{\sf T} \bm{\Sigma} (\br -\hat\br))$. So we get
   \begin{eqnarray}
       \tn{err}_2(\mc{D}, f, h) &=& (\br -\hat\br)^{\sf T} \bm{\Sigma} (\br -\hat\br) + ((\br - \hat \br)^{\sf T}\bm{\mu})^2\nonumber\\
        &=&(\br - \hat\br)^{\sf T}(\bm{\Sigma} + \bm{\mu}\bm{\mu}^{\sf T})(\br - \hat\br)  \nonumber\\
        &\leq& (\lambda_{\tn{max}}(\bm{\Sigma})+ \|\bm{\mu}\|^2)\|\br - \hat \br\|^2 \leq \eps.\nonumber
   \end{eqnarray}
\end{proof}

\subsection{Proof of Lemma \ref{lem:genlin-1}}

Taking $B = \{\bx_{B1}, \dots, \bx_{Bq}\}$ to be a random bag from $\mc{D}_{\tn{bag}}(\mc{D}, f, q)$, one can assume $y_B = f(\bx_{B1}) = \br^{\sf T}\bx_{B1}$ as each feature-vector in $B$ is iid from $N(\bm{\mu}, \bm{\Sigma})$. Using this:
\begin{eqnarray}
    \hat{L}(\mc{B}, \bv) & = & \frac{1}{m}\sum_{B = \{\bx_i\,\mid\, i \in [q]\} \in \mc{B}}\left[(\br^{\sf T} \bx_{B1} - \bv^{\sf T} \bx_{B1})^2 + \sum_{j=2}^q (\br^{\sf T} \bx_{B1} - \bv^{\sf T}\bx_{Bj})^2\right] \nonumber \\
    & = & (\br-\bv)^{\sf T} \mb{A} (\br - \bv) + (q-1) \br^{\sf T} \mb{A} \br + \sum_{j=2}^q(\bv^{\sf T} \mb{C}_j \bv - \br^{\sf T} \mb{D}_j^{\sf T} \bv - \bv^{\sf T} \mb{D}_j \br)  
\end{eqnarray}

where $\mb{A} = \frac{1}{m} \sum_{B = \{\bx_i\,\mid\, i \in [q]\} \in \mc{B}} \bx_{B1}\bx_{B1}^{\sf T}$, $\mb{C}_j =\frac{1}{m}\sum_{B = \{\bx_i\,\mid\, i \in [q]\} \in \mc{B}} \bx_{Bj}\bx_{Bj}^{\sf T} $, and $\mb{D}_j = \frac{1}{m}\sum_{B = \{\bx_i\,\mid\, i \in [q]\} \in \mc{B}} \bx_{Bj}\bx_{B1}^{\sf T}$.
We define $\hat{\bv}_\tn{min} = \mathrm{argmin}_{\bv} \hat{L}(\mc{B}, \bv)$ as used in Algorithm~\ref{alg:two}. $\hat{L}(\mc{B}, \bv)$ is convex in $\bv$, hence $\hat{\bv}_\tn{min}$ can be found by solving $\displaystyle \frac{\partial \hat{L}(\mc{B}, \bv)} {\partial \bv} = 0$.

\[
    0 = \displaystyle \frac{\partial \hat{L}(\hat\bv)} {\partial \bv} = 2\mb{A} \left(\hat\bv_\tn{min} - \br\right) + \sum_{j=2}^{q}\left(2\mb{C}_j\hat\bv_\tn{min} - 2\mb{D}_j\br\right)
\]
\[
    \hat\bv_\tn{min} = \left(\mb{A} + \sum_{j=2}^{q}\mb{C}_j\right)^{-1} \left(\mb{A} + \sum_{j=2}^{q}\mb{D}_j\right)\br = \left( \frac{1}{m}\sum_{B \in \mc{B}} \sum_{j=1}^q \bx_{Bj}\bx_{Bj}^{\sf T} \right)^{-1}\left(\mb{A} + \sum_{j=2}^{q}\mb{D}_j\right)\br
\]
Note that $\E_{\mc{B}}[\mb{A}] = \E_{\mc{B}}[\mb{C}_j] = \bm{\mu}\bm{\mu}^{\sf T} + \bm{\Sigma}$ and $\E_{\mc{B}}[\mb{D}_j] = \bm{\mu}\bm{\mu}^{\sf T}$. As defined in Algorithm~\ref{alg:two}, $\hat \br$ is 
\begin{equation}
    \hat \br =  ((q-1)\hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T} + \hat{\bm{\Gamma}})^{\sf -1}\left( \frac{1}{m}\sum_{B \in \mc{B}} \sum_{j=1}^q \bx_{Bj}\bx_{Bj}^{\sf T} \right) \hat{\bv}_\tn{min} = ((q-1)\hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T} + \hat{\bm{\Gamma}})^{\sf -1}\left(\mb{A} + \sum_{j=2}^{q}\mb{D}_j\right)\br.
\end{equation}
So we have
\begin{align}
    \| \hat \br -\br \| \leq& \left\|((q-1)\hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T} + \hat{\bm{\Gamma}})^{\sf -1}(\mb{A} +  \sum_{j=2}^{q}\mb{D}_j) - \mb{I}\right\| \|\br \| \nonumber \\ \leq& \left\|((q-1)\hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T} + \hat{\bm{\Gamma}})^{\sf -1}\right\|\left\|\mb{A} +  \sum_{j=2}^{q}\mb{D}_j - (q-1)\hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T} - \hat{\bm{\Gamma}}\right\| \|\br \| \nonumber
\end{align}
    Clearly, $\left\|((q-1)\hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T} + \hat{\bm{\Gamma}})^{\sf -1}\right\| \leq \frac{1}{\lambda_{\tn{min}}(\hat{\bm{\Gamma}})}$. Using this we obtain,
\begin{eqnarray}
    & & \| \hat \br -\br \| \nonumber \\  &\leq& \frac{1}{\lambda_{\tn{min}}(\hat{\bm{\Gamma}})} \left\|\mb{A} +  \sum_{j=2}^{q}\mb{D}_j - (q-1)\hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T} - \hat{\bm{\Gamma}}\right\| \|\br \| \nonumber\\
    &\leq& \frac{1}{\lambda_{\tn{min}}(\hat{\bm{\Gamma}})} \left\|\mb{A} - {\bm{\mu}}{\bm{\mu}}^{\sf T} -  {\bm{\Sigma}}+  \sum_{j=2}^{q}(\mb{D}_j - {\bm{\mu}}{\bm{\mu}}^{\sf T}) \right\| \|\br \|  + \frac{\|(q-1)\hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T} + \hat{\bm{\Gamma}} - (q{\bm{\mu}}{\bm{\mu}}^{\sf T} + \bm{\Sigma})\|\|\br\|}{\lambda_{\tn{min}}(\hat{\bm{\Gamma}})}\nonumber\\
    &\leq& \frac{\|\mb{A} - {\bm{\mu}}{\bm{\mu}}^{\sf T} -  {\bm{\Sigma}}\|\|\br\|}{\lambda_{\tn{min}}(\hat{\bm{\Gamma}})} + \sum_{j=2}^{q}\frac{\|\mb{D}_j - {\bm{\mu}}{\bm{\mu}}^{\sf T} \|\|\br\|}{\lambda_{\tn{min}}(\hat{\bm{\Gamma}})}+ \frac{\|\hat{\bm{\Gamma}} - {\bm{\mu}}{\bm{\mu}}^{\sf T} - \bm{\Sigma}\|\|\br\|}{\lambda_{\tn{min}}(\hat{\bm{\Gamma}})} + \frac{(q-1)\|\bm{\mu}\bm{\mu}^{\sf T} - \hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T}\|\|\br\|}{\lambda_{\tn{min}}(\hat{\bm{\Gamma}})}\label{term0}
\end{eqnarray}

From the lower bound on $m$ in the statement of Lemma \ref{lem:genlin-1} and  
    using Theorem~\ref{thm:prelim-random} we bound the first term on the RHS of \eqref{term0} as follows.
    \begin{equation}
    \label{term1}
         \Pr\left[\|\mb{A}-{\bm{\mu}}{\bm{\mu}}^{\sf T} -  {\bm{\Sigma}}\| \leq \frac{\lambda_{\tn{min}}(\bm{\Gamma})}{8q\|\br\|_2} \sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{\tn{max}}(\bm{\Sigma})}} \right]\geq 1-\frac{\delta}{4q}
    \end{equation}
    Further, since for any fixed $j \in \{2,\dots, q\}$, $\{(\bx_{B1} - \bx_{Bj})\}_{B\in \mc{B}} \sim N(\mb{0}, 2\bm{\Sigma})$ iid, and $\{(\bx_{B1} + \bx_{Bj})\}_{B\in \mc{B}} \sim N(2\bm{\mu}, 2\bm{\Sigma})$ iid, we have, using Theorem~\ref{thm:prelim-random},
    \begin{eqnarray}
        &&\Pr\left[\left\|\frac{1}{m}\sum_{B \in \mc{B}} (\bx_{B1}+\bx_{Bj})(\bx_{B1} +\bx_{Bj})^{\sf T} - 2\bm{\Sigma}- 4{\bm{\mu}}{\bm{\mu}}^{\sf T} \right\| \leq \frac{\lambda_{\tn{min}}(\bm{\Gamma})}{2q\|\br\|_2} \sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{\tn{max}}(\bm{\Sigma})}}\right]\geq 1-\frac{\delta}{8q}\nonumber\\
        &&
        \Pr\left[\left\|\frac{1}{m}\sum_{B \in \mc{B}} (\bx_{B1}-\bx_{Bj})(\bx_{B1} -\bx_{Bj})^{\sf T} - 2\bm{\Sigma} \right\| \leq \frac{\lambda_{\tn{min}}(\bm{\Gamma})}{2q\|\br\|_2} \sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{\tn{max}}(\bm{\Sigma})}}\right]\geq 1-\frac{\delta}{8q}\nonumber
    \end{eqnarray}
     Observe that $(\bx_{B1}+\bx_{Bj})(\bx_{B1} +\bx_{Bj})^{\sf T} - (\bx_{B1}-\bx_{Bj})(\bx_{B1} -\bx_{Bj})^{\sf T} = 4\bx_{B1}\bx_{Bj}^{\sf T}$. Thus,
    \begin{align}
        4\mb{D}_j - 4\bm{\mu}\bm{\mu}^{\sf T} = \frac{1}{m}\sum_{B \in \mc{B}} (\bx_{B1}+\bx_{Bj})(\bx_{B1} +\bx_{Bj})^{\sf T} - 2\bm{\Sigma}- 4{\bm{\mu}}{\bm{\mu}}^{\sf T} 
        - \left[\frac{1}{m}\sum_{B \in \mc{B}} (\bx_{B1}-\bx_{Bj})(\bx_{B1} -\bx_{Bj})^{\sf T} -  2\bm{\Sigma}\right] \nonumber 
    \end{align}
    Using the above along with the triangle inequality of the operator norm on matrices, and a union bound gives us
    \begin{equation}
        \label{term2}
        \Pr\left[\|\mb{D}_j - {\bm{\mu}}{\bm{\mu}}^{\sf T}\| \leq \frac{\lambda_{\tn{min}}(\bm{\Gamma})}{8q\|\br\|_2} \sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{max}(\bm{\Sigma})}}\right]\geq 1-\frac{\delta}{4q}.
    \end{equation}

    We again use Theorem~\ref{thm:prelim-random}, leveraging the lower bound on $m$, to bound $\|\hat{ \bm{\Gamma}}- \bm{\mu}\bm{\mu}^{\sf T} - \bm{\Sigma}\|$, thus obtaining
    \begin{equation}
        \label{term3}
        \Pr\left[{\|\hat{ \bm{\Gamma}}- \bm{\mu}\bm{\mu}^{\sf T} - \bm{\Sigma}\|} \leq \frac{\lambda_{\tn{min}}(\bm{\Gamma})}{8\|\br\|_2}\sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{\tn{max}}(\bm{\Sigma})}}\right] \geq  1-\frac{\delta}{8}
    \end{equation}
    
    Now, to bound the last term $\| \hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T}- \bm{\mu}\bm{\mu}^{\sf T} \|$, we first bound $\|\hat{\bm{\mu}} - \bm{\mu}\|$. Since $\hat{\bm{\mu}}$ is the average of $mq$ iid samples from $N(\bm{\mu}, \bm{\Sigma})$, we have $\hat{\bm{\mu}} - \bm{\mu} \sim N(\bm{0}, \frac{\bm{\Sigma}}{mq})$. We use Gaussian concentration (\ref{lem:gaussconc})  to obtain for $m\geq O\left(\frac{dq\|\br\|_2 (\|\bm{\mu}\|+1)\log(\frac{1}{\delta})\lambda_{\tn{max}}(\bm{\Sigma})}{\lambda_{\tn{min}}(\bm{\Gamma})}\sqrt{\frac{\|\bm{\mu}\|^2 + \lambda_{\tn{max}}(\bm{\Sigma})}{\eps}}\right)$,
    \begin{equation}
        \label{mubound}
        \Pr\left[\|\hat{\bm{\mu}} -\bm{\mu}\| \leq \frac{\lambda_{\tn{min}}(\bm{\Gamma})}{16q\|\br\|_2 (\|\bm{\mu}\|_2 + 1)} \sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{\tn{max}}(\bm{\Sigma})}} \right]\geq 1-\frac{\delta}{8}
    \end{equation}
    Now, we use this to upper-bound the last term as follows,
    \begin{eqnarray}
        \| \hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T}-\bm{\mu}\bm{\mu}^{\sf T} \| &\leq& \|(\hat{\bm{\mu}} -\bm{\mu})(\hat{\bm{\mu}} -\bm{\mu})^{\sf T} + \bm{\mu}(\hat{\bm{\mu}} -\bm{\mu})^{\sf T} + (\hat{\bm{\mu}} -\bm{\mu})\bm{ \mu}^{\sf T}\| \nonumber \\
        &\leq& \|\hat{\bm{\mu}} - \bm{\mu}\|^2 + 2\|\bm{\mu}\|\|\bm{\hat \mu} - \bm{\mu}\|
    \end{eqnarray}
    Substituting the bound in equation~\eqref{mubound}, we get that for $m\geq O\left(\frac{dq\|\br\|_2 (\|\bm{\mu}\|+1)\log(\frac{1}{\delta})\lambda_{\tn{max}}(\bm{\Sigma})}{\lambda_{\tn{min}}(\bm{\Gamma})}\sqrt{\frac{\|\bm{\mu}\|^2 + \lambda_{\tn{max}}(\bm{\Sigma})}{\eps}}\right)$,
    \begin{equation}
    \label{term4}
        \Pr\left[\| \hat{\bm{\mu}}\hat{\bm{\mu}}^{\sf T}-\bm{\mu}\bm{\mu}^{\sf T} \| \leq \frac{\lambda_{\tn{min}}(\bm{\Gamma})}{8q\|\br\|_2}\sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{max}(\bm{\Sigma})}} \right] \geq 1 - \frac{\delta}{8}
    \end{equation}
    Substituting the bounds in \eqref{term1}, \eqref{term2} (for each $j \in \{2,\dots,q\}$), \eqref{term3}, \eqref{term4} into \eqref{term0} and taking a union bound, we obtain that
    \begin{equation}
         \Pr\left[\|\hat\br -\br\| \leq \frac{\lambda_{\tn{min}}(\bm{\Gamma})}{2\lambda_{\tn{min}}(\hat{\bm{\Gamma}})}\sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{max}(\bm{\Sigma})}}\right] \geq 1-\frac{3\delta}{4}. \label{eq:matrixperturb-pre}
    \end{equation}
     We use Weyl's inequality on perturbation of eigenvalues as mentioned in Equation (6.7) of \citep{Wainwright-HDP} along with Theorem~\ref{thm:prelim-random} applied to iid samples from $N(\bm{\mu}, \bm{\Sigma})$, to conclude that for $m \geq O\left(\frac{d \log(\frac{1}{\delta}) (\|\bm{\mu}\|^2 + \lambda_{\tn{max}}(\bm{\Sigma}))^2}{\lambda_{\tn{min}}^2(\bm{\Gamma})}\right)$, we have $\lambda_{\tn{min}}(\hat{\bm{\Gamma}}) \geq \lambda_{\tn{min}}(\bm{\Gamma})/2$ with probability at least $1-\frac{\delta}{4}$. 
    Combining this with equation~\eqref{eq:matrixperturb-pre}, we get that for $m$ as lower bounded in the statement of Lemma \ref{lem:genlin-1}
    \begin{equation}
         \Pr\left[\|\hat\br -\br\| \leq \sqrt{\frac{\eps}{\|\bm{\mu}\|_2^2 + \lambda_{\tn{max}}(\bm{\Sigma})}}\right] \geq 1-\delta. %\label{eq:matrixperturb}
    \end{equation}
