\section{Hardness Reduction for Linear \pmir}\label{sec:hardness_redn}

\subsection{Preliminaries}\label{sec:prelims_hardness}

Our hardness result is via a reduction from the \slc problem defined below.
\begin{definition}\label{def-SLC}
An instance of \slc
$\mc{L}(G(V,E),N,M,\{\pi^{e,v}\,\mid\,e\in E, v\in e\})$ consists of a regular
connected (undirected)
graph $G(V,E)$
with vertex set $V$ and edge set $E$.  Every edge
$e = (v_1,v_2)$ is associated with  projection functions
$\{\pi^{e,v_i}\}_{i=1}^{2}$ where $\pi^{e,v_i}: [M] \to [N]$.
A vertex labeling is a mapping defined on $L : V\to [M]$. A
labeling $L$ satisfies edge $e = (v_1, v_2)$ if
$\pi^{e,v_1}(L(v_1)) = \pi^{e,v_2}(L(v_2))$. The goal is to
find a labeling which satisfies the maximum number of edges.
\end{definition}
The following theorem states the hardness of \slc and is proved in Appendix A of \citep{GRSW}. 
\begin{theorem}\label{thm:slc-hardness}
There exists a constant $c_0 > 0$ such that for any constant
integer parameters $Q,R\geq 1$,
it is {\rm NP}-hard to distinguish between the following
two cases for a \textnormal{Smooth Label Cover} instance
$\mc{L}(G(V,E),N,M,\{\pi^{e,v}\,\mid\,
e\in E, v\in e\})$ with $M = 7^{(Q+1)R}$ and $N= 2^{R}7^{QR}$:
\begin{itemize}[noitemsep]
	\item \textnormal{(YES Case)} There is a labeling that satisfies every
edge.
\item \textnormal{(NO Case)}  Every labeling satisfies less than a fraction $2^{-c_0 R}$ of the edges.
\end{itemize}
In addition, the instance
$\mc{L}$ satisfies the following properties:
\begin{itemize}[noitemsep]
	\item \textnormal{(Smoothness)} For any vertex $w \in V$, $ \forall
i,j\in [M],\ i\neq j, \ \ \Pr_{e\sim w} \left[\pi^{e,w}(i) = \pi^{e,w}(j)\right] \leq
1/Q,$ where the probability is over a randomly chosen edge incident
on $w$.
%\item The degree of the (regular) graph $G$, which we denote by $d$, is
%a constant depending only on $R$ and $J$.
\item For any vertex $v$, edge $e$ incident on $v$, and any
element $i \in [N]$, we have $|(\pi^{e,v})^{-1}(i)|\leq d := 4^R$;
i.e., there are at most $d = 4^R$ elements in $[M]$ that are mapped to
the same element in $[N]$.
\item \textnormal{(Weak Expansion)} For any $\delta > 0$,  let $V'\subseteq V$ and
$|V'| = \delta\cdot |V|$, then the number of edges among the vertices
in $V'$ is at least $\delta^2|E|$.
\end{itemize}
\end{theorem}
Theorem \ref{them:main-hardness} follows from the following hardness reduction and Theorem \ref{thm:slc-hardness}.
\begin{theorem}\label{thm:hardness-redn} 
    There exists a universal constant $C_2 \in (0, 1]$ s.t. for any $\eps > 0$ there exists a polynomial time reduction from an \slc instance $\mc{L}$ with some parameters $Q$ and $R$ depending on $\eps$ to an instance $\mc{I}$ of \pmir with all bags of size $\leq 2$ and labels in $[-1,1]$ s.t.
    \begin{itemize}[noitemsep]
        \item \tn{(YES Case)} If $\mc{L}$ is a YES instance then there exists a linear regressor $h^*$ and a primary instance assignment $\Gamma^*$ satisfying $\tn{val}_2(\mc{I}, h^*, \Gamma^*) = 0$.
        \item \tn{(NO Case)} If $\mc{L}$ is a NO instance then for all linear regressors $h$ and primary instance assignments $\Gamma$, $\tn{val}_2(\mc{I}, h, \Gamma) > C_2 - \eps$.
    \end{itemize}
    The above holds with $C_2 = \frac{2}{100}\left(1 - \frac{1}{\sqrt{\pi}}\right)$.
 \end{theorem}
 
 The rest of this section is devoted to proving Theorem \ref{thm:hardness-redn}.


\subsection{Hardness Reduction from \slc}
Let $Q, R$ be parameters of an \slc instance $\mc{L}$ from Theorem \ref{thm:slc-hardness}, to be set later depending on $\eps$ in Theorem \ref{thm:hardness-redn}. We first create an intermediate instance $\tilde{\mc{I}}$ of \pmir as follows. For each vertex in $V$ we have a block of $M$ coordinates i.e., $\mbc{X} = \R^{V\times [M]}$. For a vector $\bx \in \mbc{X}$, let $\bx_v \in \R^M$ denote its restriction to the $M$ coordinates corresponding to $v \in V$. The instance $\tilde{\mc{I}}$ is defined by a distribution $D_{\tilde{\mc{I}}}$ which samples a random bag as follows:
\begin{enumerate}[noitemsep,nolistsep]
    \item Sample a vertex $v \in V$ uniformly at random.
    \item Sample a bag-label pair $\left(\{\bx^{(1)}, \bx^{(2)}\}, \sigma\right)$ from $\mc{J}_M$ (see Sec. \ref{sec:dict}).
    \item Define vectors $\tilde{\bx}^{(1)}$ and $\tilde{\bx}^{(2)}$ as follows:
    \begin{equation}
        \forall u \in V, \quad\quad \tilde{\bx}^{(1)}_{u} = \begin{cases} \bx^{(1)} & \tn{ if } u = v \\
                             \mb{0} & \tn{ otherwise.}
                             \end{cases}
        \quad \quad
        \tilde{\bx}^{(2)}_{u} = \begin{cases} \bx^{(2)} & \tn{ if } u = v \\
                             \mb{0} & \tn{ otherwise.}
                             \end{cases} \label{eqn:barx1x2}
    \end{equation}
    \item Output the bag $\left(\{\tilde{\bx}^{(1)}, \tilde{\bx}^{(2)}\}, \sigma\right)$
\end{enumerate}
We now apply the folding transformation to obtain the final instance.
\subsubsection{Folding and Final Instance $\mc{I}$} \label{sec:folding}
For any edge $e = (u, v) \in E$ and element $j \in [N]$, define the vector
${\bf h}^{(e,j)} \in \R^{V\times [M]}$ as follows,
\begin{equation}
h^{(e,j)}_{w, i}\ =\ \begin{cases}
                1 \textnormal{\ \ \ \ \ \ if } w = u \textnormal{ and } i \in
(\pi^{e, u})^{-1}(j) \\
                -1 \textnormal{\ \ \  if } w = v \textnormal{ and } i \in
(\pi^{e, v})^{-1}(j) \\
                0 \textnormal{\ \ \ \ \ \ otherwise.}
                \end{cases}\nonumber
\end{equation}
Therefore, for any vector $\tilde{\bx} \in \R^{V\times [M]}$,
\begin{equation}
\displaystyle \forall e = \{u, v\} \in E,\ j \in [N], \ \ \ \ \tilde{\bx} \perp {\bf h}^{(e,j)} \ \Leftrightarrow\ \sum_{i \in  (\pi^{e, u})^{-1}(j)} \tilde{x}_{u,i} \ =\  \sum_{i' \in  (\pi^{e, v})^{-1}(j)} \tilde{x}_{v,i'}
\label{eqn-orth-1}
\end{equation}
Define two subspaces $H$ and $F$ of $\R^{V\times [M]}$ as:
\begin{equation}
    H := \textnormal{span}(\{{\bf h}^{(e,j)}\ \mid\ e \in E,\ j \in [N]\}) \quad \tn{ and } F = H^\perp
\end{equation}
i.e., $F$ is the orthogonal complement of $H$ in $\R^{V\times [M]}$.

The final instance $\mc{I}$ is obtained by replacing each bag $\left(\{\tilde{\bx}^{(1)}, \tilde{\bx}^{(2)}\}, \sigma\right)$ with a bag $\left(\{\ol{\bx}^{(1)}, \ol{\bx}^{(2)}\}, \sigma\right)$, where $\ol{\bx}^{(s)}$ is the projection of the vector $\tilde{\bx}^{(s)}$ onto $F$ and represented using an orthonormal basis for $F$ ($s =1,2$). Thus, the entire instance $\mc{I}$ along with the expected linear regressor solutions reside in $F$.

\subsection{Proof of  YES case}
Suppose $\rho : V \to [M]$ is a labeling that satisfies all edges $E$ of $\mc{L}$. We shall first construct a solution for $\tilde{\mc{I}}$ with objective $0$. Consider the vector $\tilde{\bc} \in \R^{V\times [M]}$ where $\tilde{c}_{v,i} = 1$ if $i = \rho(v)$ and $0$ otherwise, for all $v \in V$ and $i \in [M]$. Observe that for any $v \in V$: (i) $\tilde{\bc}$ is an indicator vector in the $M$ coordinates corresponding to $v$, and (ii) the bags of $\tilde{\mc{I}}$ after sampling $v$ are exactly those of $\mc{J}_M$ in those coordinates (with coordinates corresponding to $v' \neq v$ being set to zero). Thus, by the completeness of $\mc{J}_M$ (Lemma \ref{lem:dict-comp}) $f^*(\bx) := \langle \tilde{\bc},\bx\rangle$ has zero objective on the bags of $\tilde{\mc{I}}$.

Now, since $\rho$ is a satisfying assignment, $\tilde{\bc}$ satisfies the condition on the RHS of \eqref{eqn-orth-1} for every $e \in E$ and $j \in [N]$, using which we obtain that $\tilde{\bc} \perp H$. Therefore, for any $\tilde{\bx} \in \R^{V\times [M]}$, $\langle \tilde{\bc}, \tilde{\bx}\rangle = \langle \ol{\bc}, \ol{\bx}\rangle$, where $\ol{\bc}$ and $\ol{\bx}$ are the projections of $\tilde{\bc}$ and $\tilde{\bx}$ onto $H^{\perp} = F$. Thus, the objective of $\ol{\bc}$ on $\mc{I}$ is the same as that of $\tilde{\bc}$ on $\tilde{\mc{I}}$ which is $0$.

\subsection{Proof of NO case}
Suppose for a contradiction that there is a regressor $\ol{f}(\ol{\bx}) = \langle \ol{\bc}, \ol{\bx}\rangle + c_0$ where $\ol{\bc} \in F$, for which there is a primary instance assignment $\Gamma$ s.t. $\tn{val}_2(\mc{I}, \ol{f} , \Gamma) < C_2 - \eps$. Here we shall choose $C_2$ to be the constant from Lemma \ref{lem:dict-soundness}. Since it suffices to prove the soundness for small enough values of $\eps$, we shall take $\eps \leq C_2/2$. For $v\in V$, let $\tn{val}_2(\mc{I}, \ol{f}, \Gamma, v)$ be the objective restricted to only those bags obtained after sampling $v$ i.e., $D_{\tilde{\mc{I}}}$ conditioned on $v$. Therefore, $\tn{val}_2(\mc{I}, \ol{f}, \Gamma) = \E_{v\in V}\left[\tn{val}_2(\mc{I}, \ol{f}, \Gamma, v)\right]$.
It is easy to see that there must be $(\eps/(2C_2))$-fraction of the vertices $V' \subseteq V$ s.t.  $\tn{val}_2(\mc{I}, \ol{f} , \Gamma, v) \leq C_2 - \eps/2$ for each $v \in V'$, if not then by averaging $\tn{val}_2(\mc{I}, \ol{f} , \Gamma) > (1 - \eps/(2C_2))(C_2 - \eps/2) > C_2 - \eps$ which is a contradiction. 

We now \emph{unfold} $\ol{\bc}$, rewriting it as $\bc \in \R^{V\times [M]}$ which satisfies the folding constraints \eqref{eqn-orth-1}, and letting the corresponding regressor over $\R^{V\times [M]}$ be $f(\bx) = \langle \bc, \bx\rangle + c_0$ for the intermediate instance $\tilde{\mc{I}}$. From the above we have that for each $v \in V'$, $\tn{val}_2(\tilde{\mc{I}}, f , \Gamma, v) \leq C_2 - \eps/2$. Using our setting of $C_2$ as the constant from Lemma \ref{lem:dict-soundness}, we obtain that for each $v \in V'$, $\bc_v \in \R^M$ is \textit{not} $(\eps/2)$-regular where $\bc_v$ is the restriction of $\bc$ to only those coordinates corresponding to $v$. Thus, the subsets $S_v := \{i \in [M]\,\mid\, |c_{v,i}| \geq (\eps/2)\|\bc_v\|_2\}$ and $R_v := \{i \in [M]\,\mid\, |c_{v,i}| \geq (\eps/4)\|\bc_v\|_2\}$ are non empty for each $v \in V'$. Furthermore by definition, $S_v \subseteq R_v$ and $|S_v| \leq (4/\eps^2)$ and $|R_v| \leq 16/\eps^2$ for each $v \in V'$.

Let us also define the subset $T_v := \{i \in [M]\,\mid\, |c_{v,i}| \geq (\eps/(16d))\|\bc_v\|_2\}$ where $d := 4^R$ is the parameter from Theorem \ref{thm:slc-hardness}, so that $S_v \subseteq R_v \subseteq T_v$ and $|T_v| \leq (16d/\eps)^2$, for all $v \in V'$. Letting $E'$ be the edges of $\mc{L}$ induced by $V'$, we obtain from Theorem \ref{thm:slc-hardness} that $|E'| \geq (\eps/(2C_2))^2 |E|$. Call an edge $e = \{u, v\} \in E'$ \textit{good} if $|\pi^{e,u}(T_u)| = |T_u|$ and $|\pi^{e,v}(T_v)| = |T_v|$. For any vertex $v \in V'$, the fraction of edges $e \in E$ incident on $v$ and violating $|\pi^{e,v}(T_v)| = |T_v|$ is at most $|T_v|^2/Q \leq (16d/\eps)^4/Q$ from Theorem \ref{thm:slc-hardness}. We can count these for each of the vertices and remove them, thus, yielding the number of good edges to be at least $\Delta |E|$ where
\begin{equation}
    \Delta \geq \left(\frac{\eps}{2C_2}\right)^2 - \frac{2}{Q}\left(\frac{16d}{\eps}\right)^4 \label{eqn:Delta}
\end{equation}

We now prove the following structural lemma for good edges.
\begin{lemma} \label{lem:goodedge}
    For any good edge $e = \{u, v\}$, $\pi^{e,u}(R_u) \cap \pi^{e,v}(R_v) \neq \emptyset$.
\end{lemma}
\begin{proof}
    Without loss of generality assume that $\|\bc_v\|_2 \geq \|\bc_u\|_2$. Since $S_v \neq \emptyset$, let $i_0 \in S_v$ and $j_0 = \pi^{e,v}(i_0)$. Furthermore, since $e$ is good we know that $|(\pi^{e,v})^{-1}(j_0) \cap T_v| = 1$. Thus,
    \begin{equation}
        \left|\sum_{i \in (\pi^{e,v})^{-1}(j_0)} c_{v,i} \right| \geq \left(\frac{\eps}{2} - d\frac{\eps}{16d}\right)\|\bc_v\|_2 \geq \frac{7\eps}{16}\|\bc_v\|_2 \label{eqn:vsum}
    \end{equation}
    where $\frac{\eps}{2}\|\bc_v\|_2$ is the lower bound on $|c_{v,i_0}|$ since $i_0 \in S_v$, and $\frac{\eps}{16d}\|\bc_v\|_2$ is an upper bound on $|c_{v,i}|$ for $i \in (\pi^{e,v})^{-1}(j_0)\setminus\{i_0\}$ from $|(\pi^{e,v})^{-1}(j_0) \cap T_v| = 1$.
    Now, for a contradiction assume that $(\pi^{e,u})^{-1}(j_0) \cap R_u = \emptyset$. By the goodness of $e$ we already have $|(\pi^{e,u})^{-1}(j_0) \cap T_u| \leq 1$. Thus,
    \begin{equation}
        \left|\sum_{i \in (\pi^{e,u})^{-1}(j_0)} c_{u,i} \right| \leq \left(\frac{\eps}{4} + d\frac{\eps}{16d}\right)\|\bc_u\|_2 \leq \frac{5\eps}{16}\|\bc_u\|_2 \label{eqn:usum}
    \end{equation}
    However, \eqref{eqn:vsum} and \eqref{eqn:usum} violate the folding constraint \eqref{eqn-orth-1} for $\bc$, thus completing the proof.
\end{proof}

{\bf Randomized Labeling.} Consider the following randomized labeling of $V'$: for each $v \in V'$ assign a label uniformly at random from $R_v$. From \eqref{eqn:Delta},  Lemma \ref{lem:goodedge} and the upper bound of $16/\eps^2$ for any $R_v$, $v\in V'$, we obtain that this randomized labeling satisfies in expectation at least $\Delta^* |E|$ edges, where $\Delta^* = (\eps/4)^4\Delta$. We can choose the parameter $R$ in Theorem \ref{thm:slc-hardness} to be large enough and $Q \gg d$ to be large enough so that $\Delta^* > 2^{-c_0 R}$ which is a contradiction to the NO case of Theorem \ref{thm:slc-hardness}. This completes the proof of the NO case.

\subsection{Non-overlapping bags} \label{sec:nonoverlapping}
The bags in the instance $\mc{I}$ are overlapping, particularly since the dictatorship test $\mc{J}_M$ and therefore $\tilde{\mc{I}}$ creates multiple copies of the same bag with different bag labels, and because the folding step may identify feature-vectors. To make the bags of $\mc{I}$ disjoint, we do the following: independently for each bag (including copies) $B$ sample $\gamma \in (0, \eps/2)$ u.a.r. and scale the bag-label as well as both the feature-vectors in that bag by $(1 - \gamma)$.

First, note that since the original bag-labels were in $\{-1,1\}$, and each feature-vector is primary for at least one bag in the YES case, none of the feature-vectors can be $\mb{0}$. This also holds in the NO case, otherwise one can easily distinguish the YES and NO cases, leading to P = NP. Thus, one may assume that none of the feature-vectors in $\mc{I}$ are $\mb{0}$. Now, observe that since the scaling factor is independently sampled for each bag from a continuous range and the number of bags is finite, with probability $1$ over the choice of the scaling factors any two feature-vectors from two different bags will have different lengths, and therefore the bags are pairwise disjoint.
Clearly, a perfect linear regressor (i.e., with zero loss) remains one since the bag-label is scaled with the same factor as the bag feature-vectors, so the YES case is preserved. For the NO case, observe that this can reduce the loss by a factor of at most $(1 - \eps/2)^2$, therefore the lower bound on the loss remains $C_2 - O(\eps)$.

\section{Synthetic data experiments with $k=2$} \label{sec:experimental-appendix}
We observe that for tiny bag sizes ($k=2$), {\sf PIR} is able to better identify primary instances correctly, resulting in better performance than {\sf wtd-Assign}. For larger bag sizes though, our {\sf wtd-Assign} method performs the best (Table \ref{tab:synthetic}).

\begin{table}[bhtp]
\centering

\begin{tabular}{lrrrrrr}
\toprule
$ \text{Overlap \%}\rightarrow$ &           10 &          15 &     20 &       25 \\
\midrule
\multicolumn{5}{c}{$k=2$} \\

{\sf InsMIR} &    $2.90$ &  $3.12$ &  $3.43$ &  $3.76$ \\
{\sf AggMIR} &   $4.49$ &    $4.28$ &   $4.15$ &  $4.22$ \\
{\sf PIR} &  $\mb{1.21}$ &  $\mb{1.21}$ &    $\mb{1.21}$ &  $\mb{1.21}$ \\
{\sf BPMIR} &   $1.95$ &   $1.91$ &  $2.13$ &    $2.02$ \\
\wtdAssign &   $1.38$ &   $1.33$ &   $1.34$ &  $1.36$ \\

\bottomrule
\end{tabular}
\caption{Synthetic data ($k=2$): Test MSE }\label{tab:synthetic_k2}
\end{table}


\begin{table}[bhtp]
\centering
\begin{tabular}{lrrrrrr}
\toprule
$ \text{Overlap \%}\rightarrow$ &           10 &          15 &     20 &       25 \\

\midrule
\multicolumn{5}{c}{$k=2$} \\
{\sf PIR} & $\mb{84.32}$  &  $\mb{84.46}$ &    $\mb{84.48}$ &  $\mb{84.60}$ \\
{\sf BPMIR} & $50.30$  & $51.08$ & $50.18$ &    $50.20$ \\
\wtdAssign &   $76.51$ &   $76.39$ &   $79.73$ &  $77.55$ \\

\bottomrule
\end{tabular}
\caption{Synthetic data ($k=2$): Train Attribution Accuracy}\label{tab:synthetic2_k2}
\end{table}