\section{Generalization Error Bound} \label{sec:genbound}
%Fix $\mbc{X}$ be a subset of real-vectors and $f^* : \mbc{X} \to [0,R]$ be any labeling. Let $\mc{F}$ be a class of functions mapping $\mbc{X}$ to $[0, R]$, with ${\sf Pdim}(\mc{F}) = d$.
With the setup in Sec. \ref{sec:prelims}, let $f^*: \mbc{X} \to [0,R]$ be any labeling.
This section is devoted to proving the following theorem.
\begin{theorem}\label{thm:main-gen-1}
There is a constant $K_0 > 0$ s.t. for parameters $\eps \in [0, R^2]$ and $\delta \in (0,1)$, if $m/(\log m) \geq d\left(\frac{K_0R^4}{\eps^2}\right)^{2k}\left(\log\left(\frac{1}{\delta}\right) + \log\left(\frac{Rk}{\eps}\right)\right)$, then with probability at least $1 - \delta$ over instance $\mc{I}$ of \iidpmir$[f^*, k, m]$: any $h \in \mc{F}$ s.t. $\E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|^2\right] \geq \eps$ satisfies $\val_2\left(\mc{I}, h\right) \geq \eps^{2k + 1}/(16R^{8k+1})$.
\end{theorem}
The contrapositive of the above theorem is obtained by letting $\eps_{\tn{MIR}} = \eps^{2k + 1}/(16R^{8k+1})$ (along with some simplifications) and is stated below.
\begin{corollary} \label{cor:main-gen-1}
    For the lower bound on $m$ above, with probability at least $1 - \delta$, $\val_2\left(\mc{I}, h\right) \leq \eps_{\tn{MIR}}$ implies that $\E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|^2\right] \leq \left(16\eps_{\tn{MIR}}\right)^{1/(2k+1)}R^{(8k+1)/(2k+1)} \leq \left(16\eps_{\tn{MIR}}\right)^{1/(2k+1)}R^4$, since $R \geq 1$.
\end{corollary}

Let $\zeta > 0$ be a small enough parameter to be fixed later. 
For convenience we shall prove the generalization error bounds for the $\ell_1$-loss (mae) and then translate them to $\ell_2^2$-loss (mse).


\subsection{Fixing the instances}
The process of sampling a random instance $\mc{I}$ of \iidpmir$[f^*, k, m]$ can be equivalently defined as follows:
\begin{itemize}[nolistsep,noitemsep]
    \item Sample a collection $\mbc{Z}$ of $mk$ i.i.d. points from $\mc{D}$.
    \item Randomly partition $\mbc{Z}$ into $k$-sized bags $B_1, \dots, B_m$.
    \item For each $j \in [m]$ choose a random feature-vector from $B_j$ and let its label under $f^*$ be the bag-label of $B_j$.
\end{itemize}
In this subsection, we shall prove bounds after fixing the underlying instances $\mbc{Z}$ sampled in the above process.

\subsubsection{Bag error lower bound for fixed $f$} \label{sec:bagerrfixedf}
Let $f \in \mc{F}$ be s.t. 
\begin{equation}
    \E_{\bx \in \mbc{Z}}\left[\left|f^*(\bx) - f(\bx)\right|\right] > \zeta \label{eqn:fstarminusf} 
\end{equation}
We have the following lemma.
\begin{lemma} \label{lem:mbS}
    There is $\mbc{S} \subseteq \mbc{Z}$ s.t. $|\mbc{S}| \geq \zeta^2mk/(10R^2)$ and for any $\bx, \bz \in \mbc{S}$, $\left|f^*(\bx) - f(\bz)\right| > \zeta/4$.
\end{lemma}
\begin{proof}
Note that $\max_{\bx\in \mbc{X}}\left|f^*(\bx) - f(\bx)\right| \leq R$. Using this upper bound along with \eqref{eqn:fstarminusf} and an averaging argument we obtain that $\exists \mbc{S}_0  \subseteq \mbc{Z}$ s.t. $|\mbc{S}_0| \geq \zeta mk/(2R)$ and
for any $\bx \in \mbc{S}_0$, $\left|f^*(\bx) - f(\bx)\right| > \zeta/2$; if not, then the LHS of \eqref{eqn:fstarminusf} is at most $R\zeta/(2R) + (1 - \zeta/(2R))(\zeta/2) < \zeta$, which is a contradiction. For $i \in \{1, \dots, \lceil4R/\zeta\rceil\}$ define $\mbc{S}_i = \{\bx \in \mbc{S}_0\,\mid\, f^*(\bx) \in [(i-1)\zeta/4, i\zeta/4]\}$. Note that for each $i \in \{1, \dots, \lceil4R/\zeta\rceil\}$ and any $\bx, \bz \in \mbc{S}_i$, we have $\left|f^*(\bx) - f^*(\bz)\right| \leq \zeta/4$ by the definition of $\mbc{S}_i$ and $\left|f^*(\bz) - f(\bz)\right| > \zeta/2$ by the construction of $\mbc{S}_0$, so the triangle inequality yields $\left|f^*(\bx) - f(\bz)\right| > \zeta/4$. Choose $\mbc{S}$ to be the $\mbc{S}_i$ with the largest size -- which is at least $mk\left(\zeta/(2R)\right)/\left(\lceil4R/\zeta\rceil\right)$. Note that $\zeta \leq R$ and therefore $\lceil4R/\zeta\rceil \leq 4R/\zeta + 1 \leq 4R/\zeta + R/\zeta \leq 5R/\zeta$. Thus, we obtain that $|\mbc{S}| \geq \zeta^2 mk/(10R^2)$, completing the proof.
\end{proof}

Define $\upsilon := \zeta^2/(20R^2)$ so that $\zeta^2mk/(10R^2) = 2\upsilon mk$. Let $p := |\mbc{S}|/(2|\mbc{Z}|) = |\mbc{S}|/(2mk) \geq \upsilon$. We now show that (in the random partitioning step), a significant number of bags have all the elements from $\mbc{S}$. 
\begin{lemma}\label{lem:fullerrorbags}
    With probability at least $1 - 2\tn{exp}\left(-m\upsilon^k/8\right)$  the number of bags having all $k$ elements from $\mbc{S}$ is at least $m\upsilon^k/2$.
\end{lemma}
\begin{proof}
 After the randomized partitioning step each bag gets a certain number of elements from $\mbc{S}$ and the rest from $\mbc{Z}\setminus \mbc{S}$, with exactly $k$ elements per bag. We model this process of generating these counts as follows:
\begin{enumerate}[nolistsep,noitemsep]
    \item Initially each of the $m$ bags has $k$ uncolored balls, for a total of $mk$ balls.
    \item Each uncolored ball is independently colored \emph{red} with probability $p$ and \emph{blue} with probability $1 - p$. Depending on the total number of red balls, go to either Step 3 or Step 4.
    \item If the total number of red balls exceeds $|\mbc{S}| = 2pmk$ by $r$, then a random choice of $r$ red balls are colored blue.
    \item If the total number of red balls is less than $|\mbc{S}| = 2pmk$ by $r$, then a random choice of $r$ blue balls are colored red.
\end{enumerate}
In the end, a random set of exactly $|\mbc{S}|$ balls are colored red, and therefore the distribution of red-ball counts in the bags is the same as that of the number of elements of $\mbc{S}$ in the partitioning process. Thus, all we need to estimate is the number of bags with $k$ red balls. Since the ball coloring step in Step 2 is i.i.d. random, each bag independently gets $k$ red balls with probability $p^k$. Letting $s$ be the number of bags with $k$ red balls, by the lower tail Chernoff bound (see Theorem 4 in \citep{Chernoff}), we obtain at Step 2 that $\Pr[s \geq mp^k/2] \geq 1 - \tn{exp}\left(-mp^k/8\right)$. Now, if we are in Step 4 then the number of bags with all red balls does not decrease, while in Step 3 this number can decrease. So we only need to subtract off the probability that Step 3 happens, which by the Chernoff bound (upper tail) is at most $\tn{exp}\left(-mp^k/3\right) \leq \tn{exp}\left(-mp^k/8\right)$. Thus, at the end of the process, $\Pr[s \geq mp^k/2] \geq 1 - 2\tn{exp}\left(-mp^k/8\right)$, completing the proof.
\end{proof}


\subsubsection{Union bound over cover} As described in Sec. \ref{sec:usefulconcepts}, let $\mc{C}_\infty(\xi, \mc{F}, \mbc{Z})$ be an $\ell_\infty$-metric $\xi$-cover whose size we shall denote for convenience by $q_\infty$, for some parameter $\xi> 0$ we shall set later. Then, from Lemma \ref{lem:fullerrorbags} and by union bound we obtain that with probability at least $1 - 2q_\infty\tn{exp}\left(-m\upsilon^k/8\right)$ the following event $E_0$ holds: for each $f \in \mc{C}_\infty(\xi, \mc{F}, \mbc{Z})$ satisfying \eqref{eqn:fstarminusf}
\begin{itemize}[noitemsep,nolistsep]
    \item the number of bags having all $k$ elements from the corresponding $\mbc{S}$ (see  Lemma \ref{lem:mbS}) is at least $mp^k/2 \geq m\upsilon^k/2$. Call these the $\mbc{S}$-covered bags with respect to $f$.
    %\item the above along with the definition of $\mbc{S}$ in Lemma \ref{lem:mbS} also implies that $\val_1\left(\mc{I}, f\right) \geq \zeta \upsilon^k/8$. 
\end{itemize}


\subsection{Error bounds for $\mbc{Z}$}
Consider the subset $\mc{F}_{\tn{err}} \subseteq \mc{F}$ of all $h \in \mc{F}$ such that $\E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|\right] \geq \hat{\zeta}$ for $\hat{\zeta} := 4\zeta$. %From Theorem 17.1 of \citep{Anthony-Bartlett} we obtain that
We have the following lemma.
\begin{lemma}\label{lem:UnibdFerr}
    With probability at least $1 - 4 q_1\tn{exp}\left(-\hat{\zeta}^2mk/(128 R^2)\right)$ over the choice of $\mbc{Z}$,
\begin{equation}
    \forall h \in \mc{F}_{\tn{err}} \quad  \E_{\bx \in \mbc{Z}}\left[\left|h(\bx) - f^*(\bx)\right|\right] \geq \hat{\zeta}/2 \label{eqn:UnibdFerr}
\end{equation}
where $q_1 = N_1(\hat{\zeta}/32, \mc{F}, 2mk)$.
\end{lemma}
\begin{proof}
    The labeling is given by $f^*$ and therefore the true error of $h$ is $\E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|\right]$. The empirical error is $\E_{\mathbf{x} \in \mbc{Z}}\left[\left|h(\bx) - f^*(\bx)\right|\right]$ where the expectation is over $\bx$ sampled uniformly at random from $\mbc{Z}$. Therefore, given that $\E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|\right] \geq \hat{\zeta}$ for all $h$ in $\mc{F}_{\tn{err}}$, the condition of \eqref{eqn:UnibdFerr} follows by an upper bound of $\hat{\zeta}/2$ on the difference between true and empirical errors for the class $\mc{F}_{\tn{err}}$ given by Theorem 17.1 of \citep{Anthony-Bartlett}. Since the mappings in the latter are to $[0,1]$ instead of $[0, R]$ in our case, we apply Theorem 17.1 of \cite{Anthony-Bartlett} with $f^*/R$ as the labeling and $\ol{\mc{F}}_{\tn{err}} := \{h/R : h \in \mc{F}_{\tn{err}}\}$ as the function class. This amounts to taking $\hat\zeta/(2R)$ as the error $\eps$ in Theorem 17.1 of \cite{Anthony-Bartlett}. Observing that $N_1(\hat\zeta/(32R), \ol{\mc{F}}_{\tn{err}}, 2mk) = N_1(\hat\zeta/(32), \mc{F}_{\tn{err}}, 2mk)$ and that $|\mbc{Z}| = mk$ completes the argument.
\end{proof}
Suppose the random choices of $\mbc{Z}$ and $\mc{B}$ ensure that \eqref{eqn:UnibdFerr} holds and, letting $\xi := \zeta/8$, the event $E_0$ in the previous subsection also holds. This happens with probability at least $1 - 2q_\infty\tn{exp}\left(-m\upsilon^k/8\right) - 4 q_1\tn{exp}\left(-\hat{\zeta}^2mk/(128 R^2)\right)$.

Consider any $h \in \mc{F}_{\tn{err}}$ and let $f \in \mc{C}_\infty(\xi, \mc{F}, \mbc{Z})$ be the nearest to it in $\ell_\infty$-distance i.e., $\max_{\bx \in \mbc{Z}}|h(\bx) - f(\bx)| \leq \xi$. Noting that $\xi = \zeta/8$, from \eqref{eqn:UnibdFerr} and the triangle inequality, $f$ satisfies \eqref{eqn:fstarminusf}. Since the event $E_0$ holds, any $\mbc{S}$-covered bag with respect to $f$ incurs a bag-loss of at least $\zeta/4$ in $\val_1\left(\mc{I}, f\right)$. By the nearness of $h$ and $f$, such a bag incurs a bag-loss of at least $\zeta/4 - \xi \geq \zeta/8$ in $\val_1\left(\mc{I}, h\right)$. By the lower bound on the number of such bags implied by $E_0$ we obtain $\val_1\left(\mc{I}, h\right) \geq \zeta \upsilon^k/16$.

Summarizing the above we have that with probability at least:
\begin{align}
    1 - 2q_\infty\tn{exp}\left(-\frac{m\upsilon^k}{8}\right) - 4 q_1\tn{exp}\left(-\frac{\zeta^2mk}{8 R^2}\right) \label{eqn:totprob1minus}
\end{align}
for all $h \in \mc{F}$ such that $\E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|\right] \geq 4\zeta$, $\val_1\left(\mc{I}, h\right) \geq \zeta \upsilon^k/16$.

\subsection{Bounds using pseudo-dimension} \label{sec:pseudo-dim}
From Sec. \ref{sec:usefulconcepts} we have that,
\begin{equation}
    q_\infty \leq \left(\frac{2emk}{d\xi}\right)^d, \ \ q_1 \leq \left(\frac{8emk}{d\zeta}\right)^d
\end{equation}
Using the above, there is some absolute constant $K_0 > 0$ s.t. choosing 
\begin{equation}
    \frac{m}{\log m} \geq d\left(\frac{K_0 R^2}{\zeta^2}\right)^{2k}\left(\log\left(\frac{1}{\delta}\right) + \log\left(\frac{k}{\zeta}\right)\right)
\end{equation}
yields that \eqref{eqn:totprob1minus} is at least $1 - \delta$, for $\delta \in (0,1]$. 

{\bf MSE Error Bound.} Suppose that $h \in \mc{F}$ satisfies $\E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|^2\right] \geq \eps$. Then, since $h$ and $f^*$ have range $[0,R]$, we obtain that $\E_{\mc{D}}\left[R\left|h(\bx) - f^*(\bx)\right|\right] \geq \E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|^2\right] \geq \eps$ i.e., $\E_{\mc{D}}\left[\left|h(\bx) - f^*(\bx)\right|\right] \geq \eps/R$. Moreover, the optimal primary instance assignment for $\val_1\left(\mc{I}, h\right)$ is also optimal for $\val_2\left(\mc{I}, h\right)$ since the closest instance-prediction in a bag to the bag-label remains the same. Using this, along with the above analysis and substituting $\eps/R$ for $\zeta$ along with the values of the other parameters we obtain the statement of Theorem \ref{thm:main-gen-1}. 

\section{Hardness of Linear \pmir} \label{sec:main_hardness}
\begin{theorem} \label{them:main-hardness}
Let $\mc{F}$ be the class of all linear regressors over $\R^n$ for some $n \in \mathbb{Z}^+$. There is an absolute constant $C_2 > 0$ s.t. given an instance $\mc{I}$ of \pmir whose bags are of size $\leq 2$ with bag-labels in $[-1,1]$ such that there exists an $f^* \in \mc{F}$ with ${\sf val}_2(\mc{I}, f^*) = 0$, it is NP-hard to find $f \in \mc{F}$ s.t. ${\sf val}_2(\mc{I}, f) \leq C_2 - \eps$, for any constant $\eps >0$. In fact, one can take $C_2 = \frac{2}{100}\left(1 - \frac{1}{\sqrt{\pi}}\right)$. The result also holds when the bags are disjoint and therefore for injective \pmir.
\end{theorem}

The rest of this section describes the dictatorship test which is a key component of the proof, the rest of which is included in Appendix \ref{sec:hardness_redn}. While the hardness reduction creates \pmir instances with overlapping bags, in Appendix \ref{sec:nonoverlapping} we show how to make the bags disjoint while retaining the hardness factor. 

 \subsection{Dictatorship Test}\label{sec:dict}
 For any positive integer $K$, let $\mc{J}_K$ be an instance of \pmir on $2$-sized bags defined as follows. The underlying set of feature-vectors is $\{-1,1\}^K$, and we define $\mc{J}_K$ as a distribution which samples a random $2$-sized bag along with its label as follows:
 \begin{enumerate}[noitemsep,nolistsep]
     \item Choose $\bx^{(1)}$ uniformly at random from $\{-1,1\}^K$ and define $\bx^{(2)} = -\bx^{(1)}$.
     \item Sample $\sigma \leftarrow \{1,-1\}$ uniformly at random.
     \item Output bag $B = \{\bx^{(1)}, \bx^{(2)}\}$ along with $\sigma$ as its label.
 \end{enumerate}
 While we define $\mc{J}_K$ for convenience as a distribution over bags and their labels, the distribution is uniform over all possible $2^K$ sets $\{\bx^{(1)}, \bx^{(2)} = -\bx^{(1)}\}$ and labels $\{-1,1\}$ for each of them, in total we have $2^{K+1}$ bags.
 Note that in the above we treat a set of two feature-vectors with label $1$ and with label $-1$ as two distinct bags. 
 %In Sec. \ref{sec:disjointbags} we shall that our reduction can be tweaked to output distinct (in fact disjoint) bags.
 We prove the following two properties of $\mc{J}_K$.
 
 \begin{lemma}[(Completeness of $\mc{J}_K$)] \label{lem:dict-comp}For any $i \in [K]$, the linear regressor $f^{(i)}(\bx) = x_i$ admits a primary instance assignment $\Gamma$ such that ${\sf val}_2(\mc{J}_K, f^{(i)}, \Gamma) = 0$.
 \end{lemma}
 \begin{proof}
    %Clearly it suffices to prove the lemma for $\tn{val}_2$ since $\ell_2$ distance being zero is same as $\ell_p$ distance being zero, for all $p \geq 1$. 
    For any fixed $i \in [K]$, and any bag $B = \{\bx^{(1)}, \bx^{(2)}\}$ sampled by $\mc{J}_K$, we have that $f^{(i)}(\bx^{(1)}) = x^{(1)}_i = - x^{(2)}_i = -  f^{(i)}(\bx^{(2)}) \in \{-1,1\}$. Thus, $\{f^{(i)}(\bx^{(1)}), f^{(i)}(\bx^{(2)})\} = \{-1,1\}$ which are the two possible values of $\sigma$. Therefore, the choice of $\Gamma(B)$ to be $\bx^{(a)}$ s.t. $f^{(i)}(\bx^{(a)}) = \sigma$ yields that ${\sf val}_2(\mc{J}_K, f^{(i)}, \Gamma) = 0$.
\end{proof}

\subsubsection{Soundness of $\mc{J}_K$}
Let $f(\bx) = \langle\bc, \bx\rangle + c_0 =  \sum_{i=1}^K c_ix_i + c_0$ denote a linear regressor over $\R^K$. We say that $f$ is \emph{$\tau$-regular} if $\max_{i\in [K]} |c_i| \leq \tau \|\bc\|_2$. 
This subsection is devoted to proving the following soundness property of $\mc{J}_K$ for the above linear regressor.
\begin{lemma}[(Soundness of $\mc{J}_K$)] \label{lem:dict-soundness}
    There is an absolute constant $C_2 > 0$ such that for any small enough constant $\tau > 0$, if $f$ is $\tau$-regular then for any primary instance assignment $\Gamma$, $\tn{val}_2(\mc{J}_K, f, \Gamma) \geq C_2 - \tau$. In fact, one can take $C_2 = \frac{2}{100}\left(1 - \frac{1}{\sqrt{\pi}}\right)$.
\end{lemma}
We assume the $\tau$-regularity condition of the above lemma and prove the lemma for the case $c_0 \leq 0$, with the proof for $c_0 \geq 0$ being analogous. 

Observe that the bag gets label either $1$ or $-1$ with equal probability. Therefore, it suffices to lower bound the probability that both $f(\bx^{(1)})$ as well as $f(\bx^{(2)})$ are far enough from $1$ (for $c_0 \geq 0$ the deviation from $-1$ is lower bounded), as the following lemma shows.
\begin{lemma} \label{lem:disct-soundness-2}
    There are absolute constants $t_0 \in (0, 1/2], p_0 \in (0,1]$  s.t.
    \begin{equation}
        \Pr\left[\left|f(\bx^{(1)}) - 1\right|, \left|f(\bx^{(2)}) - 1\right| > t_0\right] \geq p_0 - 2.5\tau.
    \end{equation}
    In fact, the above is satisfied with $t_0 = 0.2$ and $p_0 = 1 - 1/\sqrt{\pi}$.
\end{lemma}
\begin{proof} For convenience let $g(\bx) := \langle\bc, \bx\rangle$ i.e., $f(\bx) = g(\bx) + c_0$. Consider a random bag $B$ sampled by $\mc{J}_K$. Define the random variable $X := g(\bx^{(1)})$; by construction we have that $g(\bx^{(2)}) = -X$. Thus, we have
\begin{eqnarray}
   & & \min\left\{\left|f(\bx^{(1)}) - 1\right|, \left|f(\bx^{(2)}) - 1\right|\right\} > t_0 \nonumber \\
   & \Leftrightarrow & \min\left\{\left|X + c_0 - 1\right|, \left|-X + c_0 - 1\right|\right\} > t_0 \nonumber \\
    & \Leftrightarrow & \min\left\{\left|X - (1 - c_0)\right|, \left|-X -  (1 - c_0)\right|\right\} > t_0 \nonumber \\
    & \Leftrightarrow & |X| \not\in \left[\kappa - t_0, \kappa + t_0\right]
\end{eqnarray}
where $\kappa = 1 - c_0 \geq 1$ since $c_0 \leq 0$.
Now, $X = \sum_{i=1}^K c_i x_i$ where $x_i$ are i.i.d. uniform $\{-1,1\}$ random variables, $i\in [K]$. Applying the Berry-Esseen theorem~\citep{Shevtosa} and using the regularity of $f$ we obtain that for any $t \in \R$
\begin{equation}
    \left|\Pr[X > t] - \Pr[Z > t]\right| \leq 0.57\tau 
\end{equation}
where $Z \sim N(0,\sigma^2)$ is a mean zero Gaussian with variance $\sigma^2 = \|\bc\|_2^2$. Thus, 
\begin{align*}
    &\left|\Pr\left[|X| \not\in \left[\kappa - t_0, \kappa + t_0\right]\right]\right.  \\
    & \ \ \ \ \ \left. - \Pr\left[|Z| \not\in \left[\kappa - t_0, \kappa + t_0\right]\right]\right| \leq 2.5\tau 
\end{align*}
To complete the proof, we need to lower bound the above probability term with $Z$. Since $Z$ is a mean-zero Gaussian and $\kappa - t_0 \geq \kappa - 1/2 > 0$, $\Pr\left[|Z| \not\in \left[\kappa - t_0, \kappa + t_0\right]\right] = 1 - 2\Pr\left[Z \in \left[\kappa - t_0, \kappa + t_0\right]\right]$. Therefore, we need to upper bound $\Pr\left[Z \in \left[\kappa - t_0, \kappa + t_0\right]\right]$ by a constant strictly less than $1/2$. To do this we shall take $t_0 = 0.2 \leq 0.2\kappa$ and consider two cases:
\begin{itemize}
    \item $\sigma  > 4\sqrt{2}\kappa/10$: In this case a straightforward integration over the segment $\left[\kappa - t_0, \kappa + t_0\right]$ of length $2t_0$ yields
    \begin{align*}
        \Pr&\left[Z \in \left[\kappa - t_0, \kappa + t_0\right]\right] \\
        & \leq \frac{2t_0}{\sigma\sqrt{2\pi}} \leq \frac{2(0.2)(10)\kappa}{8\kappa \sqrt{\pi}} = \frac{1}{2\sqrt{\pi}} 
    \end{align*}
    \item $\sigma  \leq 4\sqrt{2}\kappa/10$: In this case we have,
    \begin{align*}
        \Pr\left[Z \in \left[\kappa - t_0, \kappa + t_0\right]\right] &\leq \Pr\left[\frac{Z}{\sigma} \geq \frac{\kappa - t_0}{\sigma}\right] \\
        &\leq \Pr\left[\frac{Z}{\sigma} \geq \sqrt{2}\right]
    \end{align*}
    where we use the upper bound of $4\sqrt{2}\kappa/10$ on $\sigma$ and that $\kappa - t_0 \geq \kappa - 0.2\kappa = 0.8\kappa$. Now since $Z/\sigma \sim N(0,1)$, Prop 2.1.2 of \citep{Vershynin} yields an upper bound for the RHS of the above given by $\left(1/(\sqrt{2}\sqrt{2\pi})\right)\tn{exp}\left(-(\sqrt{2})^2/2\right) \leq 1/(2\sqrt{\pi})$.
\end{itemize}
Combining everything we complete the proof with $t_0 = 0.2$ and $p_0 = 1 - 1/\sqrt{\pi}$. 
\end{proof}
\begin{proof} (of Lemma \ref{lem:dict-soundness}) 
    Observe that a bag with the same two feature-vectors occurs with bag-label $1$ as well as $-1$. Lemma \ref{lem:disct-soundness-2} shows that the expected contribution to $\tn{val}_2$ from bags with bag-label $1$ is at least $(p_0 - 2.5\tau)t_0^2$. Since bag-label $1$ occurs half the time, we obtain the following lower bound on $\tn{val}_2(\mc{J}_K, f, \Gamma)$ for any $\Gamma$:
    \begin{equation}
        \frac{1}{2}\left(1 - \frac{1}{\sqrt{\pi}} - 2.5\tau\right)(0.2)^2 \geq \frac{2}{100}\left(1 - \frac{1}{\sqrt{\pi}}\right) - \tau
    \end{equation}
    which completes the proof.
\end{proof}





