


\subsection{Norm bounds on simple graph matrices}

In this section, we will prove \cref{lem: graphmatrixnormbound_nomiddlevertices}. First, we recall the following scalar concentration result from \cite{schudy2011bernstein}.

\subsubsection{Schudy-Sviridenko moment bound}

The definitions and main bound in this section are from \cite{schudy2011bernstein}.

\begin{definition}
	A random variable $Z$ is central moment bounded with real parameter $L > 0$ if for any integer $i \ge 1$,
	\[\mathbb{E}[|Z - \mathbb{E}[Z]|^i] \le i\cdot L\cdot \mathbb{E}[|Z - \mathbb{E}[Z]|^{i - 1}]\]
\end{definition}

\begin{propn}
	The $p$-biased Bernoulli random variable $Z$ is central moment bounded with real parameter $L =\sqrt{\frac{1 - p}{p}}$.
\end{propn}

\begin{proof}
	We have $\mathbb{E}[Z] = 0$ and for $p \le \frac{1}{2}$, $|Z| \le \sqrt{\frac{1 - p}{p}}$, therefore,
	\begin{align*}
		\mathbb{E}[|Z - \mathbb{E}[Z]|^i] &= p\sqrt{\frac{1 - p}{p}}^i + (1 - p)\sqrt{\frac{p}{1 - p}}^i\\
		&\le \sqrt{\frac{1 - p}{p}}\bigg(p\sqrt{\frac{1 - p}{p}}^{i - 1} + (1 - p)\sqrt{\frac{p}{1 - p}}^{i- 1}\bigg)\\
		&= \sqrt{\frac{1 - p}{p}}\mathbb{E}[|Z - \mathbb{E}[Z]|^{i - 1}]
	\end{align*}
	therefore, we can take $L = \sqrt{\frac{1 - p}{p}}$.
\end{proof}

For a given multilinear polynomial $f(x)$ on variables $x_1, \ldots, x_n$, we can naturally associate with it a hypergraph $H$ on vertices $[n]$ and weighted hyperedges $E(H)$ where each $h \in E(H)$ corresponds to a distinct term of $f(x)$. Each hyperedge $h$ is a subset $V(h)$ of vertices and has a real valued weight $w_h$ which is the coefficient of that monomial in $f$. Therefore,
\[f(x) = \sum_{h \in E(H)} w_h \prod_{v \in V(h)} x_v\]

Assume $f$ has degree $d_p$, then each hyperedge of $H$ has at most $d_p$ vertices.

Now, for a given collection of independent random variables $Y_1, \ldots, Y_n$, a multilinear polynomial $f$ with associated hypergraph $H$ and weights $w$, and an integer $r \ge 0$, define
\[\mu_r(f, Y) = \max_{S \subseteq [n], |S| = r} \bigg(\sum_{h \in E(H), S \subseteq V(h)} |w_h|\prod_{v \in V(h) \setminus S} \mathbb{E}[|Y_v|]\bigg)\]

\begin{lemma}[\cite{schudy2011bernstein}, Lemma 5.1]\label{lem: schudy_sviridenko}
	Let $Y_1, \ldots, Y_n$ be $n$ independent central moment bounded random variables with the same parameter $L > 0$, and let $f(x)$ be a degree $d_p$ multilinear polynomial. If $t \ge 2$ is an even integer, then
	\[\mathbb{E}[|f(Y) - \mathbb{E} [f(Y)]|^t] \le \max\bigg\{\bigg(\sqrt{tR_4^{d_p}\var{f(Y)}}\bigg)^t, \max_{r \in [d_p]}(t^rR_4^{d_p}L^r\mu_r(f, Y))^t\bigg\}\]
	where $R_4 \ge 1$ is some absolute constant.
\end{lemma}

In our setting, we can also bound the variance in terms of the $\mu_r$ as was shown in \cite{schudy2011bernstein}, which will simplify our calculations.

\begin{lemma}[\cite{schudy2011bernstein}, Lemma 1.5]\label{lem: var_bound}
	For the same setting as in \cref{lem: schudy_sviridenko},
	\[\var{f(Y)} \le 2d_p 4^{d_p}\max_{r \in [d_p]} (\mu_0(f, Y) \mu_r(f, Y)4^rL^r)\]
\end{lemma}

\subsubsection{Proof of \cref{lem: graphmatrixnormbound_nomiddlevertices}}

We are ready to prove \cref{lem: graphmatrixnormbound_nomiddlevertices} which we restate for convenience.

\simplegraphmatrixnormbounds*

We will prove it the same way as \cref{lem: empty_shape}, by bounding the Schatten norm of each diagonal block by an appropriate power of its Frobenius norm. In this case, to bound the expected power of the Frobenius norm, we use the scalar concentration inequality from the previous section.

\begin{proof}[Proof of \cref{lem: graphmatrixnormbound_nomiddlevertices}]
	First, we note that ${\mat{M}}_{\tau}$ has a block diagonal structure indexed by the realizations of the set of common vertices $S_0 = U_{\tau} \cap V_{\tau}$. For $T \in [n]^{S_0}$, let ${\mat{M}}_{\tau, T}$ be the block of ${\mat{M}}_{\tau}$ with $\varphi(S_0) = T$. Then, ${\mat{M}}_{\tau, T}{\mat{M}}_{\tau, T'}^\intercal = {\mat{M}}_{\tau, T}^\intercal{\mat{M}}_{\tau, T'} = 0$ for $T \neq T'$ and so,
	\begin{align*}
		\Esch{{\mat{M}}_{\tau}}{2t} = \sum_{T \in [n]^{S_0}} \Esch{{\mat{M}}_{\tau, T}}{2t} \le \sum_{T \in [n]^{S_0}}\mathbb{E}(\sch{{\mat{M}}_{\tau, T}}{2})^t
	\end{align*}
	where we bounded the Schatten norm by a power of the Frobenius norm.

	Fix $T \in [n]^{S_0}$ and consider $\Esch{{\mat{M}}_{\tau, T}}{2}$. Let ${\mathcal{R}}$ be the set of realizations $\varphi$ of $\tau$ such that $\varphi(S_0) = T$. Then, for $\varphi \in {\mathcal{R}}$ and $e \in E(S_0)$, the value of $\varphi(e)$ is fixed. Using this,
	\begin{align*}
		\norm{{\mat{M}}_{\tau, T}}_2^2 &= \sum_{\varphi \in {\mathcal{R}}} \prod_{e \in E(\tau)} G_{\varphi(e)}^2\\
		&= \prod_{e \in E(S_0)} G_{\varphi(e)}^2 \sum_{\varphi \in {\mathcal{R}}} \prod_{e \in E(\tau) \setminus E(S_0)} G_{\varphi(e)}^2\\
		&\le L^{|E(S_0)|} \sum_{\varphi \in {\mathcal{R}}} \prod_{e \in E(\tau) \setminus E(S_0)} G_{\varphi(e)}^2
	\end{align*}
	where $L = \frac{1 - p}{p}$ is an upper bound on $G_{ij}^2$ for $p \le \frac{1}{2}$. Define the quantity
	\[A = \max_{S_0 \subseteq S \subseteq V(\tau)}L^{|E(S)|}n^{|V(\tau)| - |S|}\]

	\begin{claim}\label{claim: lil_claim}
		$\mathbb{E}(\norm{{\mat{M}}_{\tau, T}}_2)^t \le (Ct)^{t|E(\tau)|} |V(\tau)|^{t|V(\tau)|}A^t$ for an absolute constant $C > 0$.
	\end{claim}

	Using this claim, we have
	\begin{align*}
		\Esch{{\mat{M}}_{\tau}}{2t} &\le \sum_{T \in [n]^{S_0}} \mathbb{E}(\norm{{\mat{M}}_{\tau, T}}_2)^t\\
		&\le n^{|S_0|}(Ct)^{t|E(\tau)|} |V(\tau)|^{t|V(\tau)|}A^t\\
		&\le n^{|V(\tau)|} (Ct)^{t|E(\tau)|} |V(\tau)|^{t|V(\tau)|}\max_{U_{\tau} \cap V_{\tau} \subseteq S \subseteq V(\tau)}\left(\frac{1 - p}{p}\right)^{t|E(S)|}n^{t(|V(\tau)| - |S|)}
	\end{align*}
	as required.
\end{proof}

It remains to prove the claim.

\begin{proof}[Proof of \cref{claim: lil_claim}]
	For $1\le i, j \le n$, define the variables $Y_{ij} = G_{ij}^2$ with $\mathbb{E}[|Y_{ij}|] = 1$. Let $f(Y)$ be the polynomial $L^{|E(S_0)|} \sum_{\varphi \in {\mathcal{R}}} \prod_{e \in E(\tau) \setminus E(S_0)} Y_{\varphi(e)}$. It suffices to prove that $\mathbb{E}[f(Y)^t] \le (Ct)^{t|E(\tau)|}|V(\tau)|^{t|V(\tau)|}A^t$.

	We will first prove that $\mathbb{E}[(f(Y) - \mathbb{E}[f(Y)])^t] \le (C't)^{t|E(\tau)|}|V(\tau)|^{t|V(\tau)|}A^t$ for a sufficiently large constant $C' > 0$.

	$f$ is a homogeneous multilinear polynomial of degree $|E(\tau) \setminus E(S_0)|$. If we had $E(\tau) \setminus E(S_0) = \emptyset$, then $f$ is a constant and so, the inequality is obvious because $f(Y) = \mathbb{E}[f(Y)]$. Now, assume $E(\tau) \setminus E(S_0) \neq \emptyset$. We invoke \cref{lem: schudy_sviridenko}. Let $f$ have associated hypergraph $H$ and weights $w$. Then,
    {\footnotesize
	\[\mathbb{E}[|f(Y) - \mathbb{E} [f(Y)]|^t] \le \max\bigg\{\bigg(\sqrt{tR_4^{|E(\tau) \setminus E(S_0)|}\var{f(Y)}}\bigg)^t, \max_{r \in [|E(\tau) \setminus E(S_0)|]}(t^rR_4^{|E(\tau) \setminus E(S_0)|}L^r\mu_r(f, Y))^t\bigg\}\]
}
	For all $r \ge 0$, we will prove that $L^r\mu_r(f, Y) \le |V(\tau)|^{|V(\tau)|}A$. By definition,
	\begin{align*}
		\mu_r(f, Y) &= \max_{F \subseteq \binom{[n]}{2}, |F| = r} \sum_{h \in E(H), F \subseteq V(h)} |w_h|
	\end{align*}
	Consider any set of edge labels $F \subseteq \binom{[n]}{2}, |F| = r$.
	Then, $\sum_{h \in E(H), F \subseteq V(h)} |w_h|$ is at most $L^{|E(S_0)|}c$ where $c$ is the number of realizations $\varphi \in {\mathcal{R}}$ such that $\varphi(E(\tau))$ contains $F$.
	Suppose $F$ contains $v$ new labels apart from $\varphi(S_0) = T$.
	Then $c \le |V(\tau)|^v n^{|V(\tau)| - |S_0| - v}$ because we can first choose and label the set of vertices that get these $v$ labels and then label the remaining vertices freely, each of which has at most $n$ choices.

	Observe that $L^{|E(S_0)|} L^r n^{|V(\tau)| - |S_0| - v} \le A$ because in the definition of $A$, we can take $S$ to be the union of $S_0$ and any valid choice of these $v$ vertices. Putting this together, we get
	\begin{align*}
		L^r\mu_r(f, Y) &\le L^r\max_{F \subseteq \binom{[n]}{2}, |F| = r} \sum_{h \in E(H), F \subseteq V(h)} |w_h|\\
		&\le |V(\tau)|^{|V(\tau)|} A
	\end{align*}
	which implies
	\[\max_{r \in [|E(\tau) \setminus E(S_0)|]}(t^rR_4^{|E(\tau) \setminus E(S_0)|}L^r\mu_r(f, Y))^t \le |V(\tau)|^{t|V(\tau)|}(R_4t)^{t|E(\tau)|}A^t\]
	and using \cref{lem: var_bound},
	\begin{align*}
		\var{f(Y)} &\le 2|E(\tau)|4^{|E(\tau)|}\max_{r \in [|E(\tau) \setminus E(S_0)|]} (\mu_0(f, Y) \mu_r(f, Y)4^rL^r)\\
		&\le 2|E(\tau)|16^{|E(\tau)|} |V(\tau)|^{2|V(\tau)|}A^2
	\end{align*}
	Putting them together, we get
    {\footnotesize
	\begin{align*}
		\mathbb{E}[(f(Y) - \mathbb{E}[f(Y)])^t] &\le \max\bigg\{\bigg(\sqrt{2tR_4^{|E(\tau)|}|E(\tau)|16^{|E(\tau)|} |V(\tau)|^{2|V(\tau)|}A^2}\bigg)^t, |V(\tau)|^{t|V(\tau)|}(R_4t)^{t|E(\tau)|}A^t\bigg\}\\
		&\le (C't)^{t|E(\tau)|}|V(\tau)|^{t|V(\tau)|}A^t
	\end{align*}
}
	for an absolute constant $C' > 0$.

	Finally, $\mathbb{E}[f(Y)] \le L^{|E(S_0)|} |{\mathcal{R}}| \le L^{|E(S_0)|}n^{|V(\tau) \setminus S_0|} \le A$ which gives
	\begin{align*}
		\mathbb{E}[f(Y)^t] &\le 2^t(\mathbb{E}[(f(Y) - \mathbb{E}[f(Y)])^t] + \mathbb{E}[f(Y)]^t)\\
		&\le 2^t((C't)^{t|E(\tau)|}|V(\tau)|^{t|V(\tau)|}A^t + A^t)\\
		&\le (Ct)^{t|E(\tau)|}|V(\tau)|^{t|V(\tau)|}A^t
	\end{align*}
	for an absolute constant $C > 0$.
\end{proof}



\subsubsection{Definitions}

Define by ${\mathcal{G}}_{n, p}$ the Erd\H{o}s\xspace-R\'enyi\xspace random graph on the vertex set $[n]$ with $n$ vertices, where each edge is present independently with probability $p$. Let the graph be encoded by variables $G_{i, j} \in \Omega = \{-\sqrt{\frac{1 - p}{p}}, \sqrt{\frac{p}{1 - p}}\}$ where $-\sqrt{\frac{1 - p}{p}}$ indicates the presence of the edge $\{i, j\}$ and $\sqrt{\frac{p}{1 - p}}$ indicates absence, for all $1 \le i, j \le n$.

So, each $G_{i, j}$ for $i < j$ is sampled from $\Omega$ where $G_{i, j}$ takes the value $-\sqrt{\frac{1 - p}{p}}$ with probability $p$ and takes the value $\sqrt{\frac{p}{1 - p}}$ otherwise. Here, $\Omega$ has been normalized so that $\mathbb{E}_{x \sim \Omega}[x] = 0, \mathbb{E}_{x \sim \Omega}[x^2] = 1$, as is standard in $p$-biased Fourier analysis.

When $p = \nicefrac{1}{2}$, we are in the setting of \textit{dense graph matrices}. Then, ${\mathcal{G}}_{n, 1/2}$ can be thought of as a sampling of the $G_{i, j}, i < j$ independently and uniformly from $\Omega = \{-1, 1\}$.

For a set of edges $E \subseteq \binom{[n]}{2}$, define $G_E := \prod_{e \in E} G_e$. When $p = \nicefrac{1}{2}$, the $G_E$ correspond to the Fourier basis for functions of the graph.

Define ${\mathcal{I}}$ to be the set of sub-tuples of $[n]$, including the empty tuple. Graph matrices will have rows and columns indexed by ${\mathcal{I}}$. Each graph matrix has a succinct representation as a graph with some extra information, that is called a \textit{shape}.

\begin{definition}[Shape]
	A shape is a tuple $\tau = (V(\tau), E(\tau), U_{\tau}, V_{\tau})$ where $(V(\tau), E(\tau))$ is a graph and $U_{\tau}, V_{\tau}$ are ordered subsets of the vertices.
\end{definition}

\begin{definition}[Realization]
	Given a shape $\tau$, a realization of $\tau$ is an injective map $\varphi: V(\tau) \rightarrow [n].$
\end{definition}

\begin{definition}[Graph matrices]
	Let $\tau$ be a shape.
	Corresponding to $\tau$, the graph matrix $\graphmat{\tau}  : \{ \pm 1\}^{n \choose 2} \rightarrow {\mathbb R}^{{\mathcal{I}}\times {\mathcal{I}}}$ is defined to be the matrix-valued function with $I, J$-th entry defined as follows.
	\[
	{\mat{M}}_{\tau}[I, J] := \sum_{\substack{\text{Realization }\varphi\\ \varphi(U_{\tau}) = I, \varphi(V_{\tau}) = J}}{G_{\varphi(E(\tau))}} = \sum_{\substack{\text{Realization }\varphi\\ \varphi(U_{\tau}) = I, \varphi(V_{\tau}) = J}}\prod_{(u, v) \in E(\tau)} G_{\varphi(u), \varphi(v)}
	\]
	In other words, we sum over all realizations of $\tau$ that map $U_{\tau}, V_{\tau}$ to $I, J$ respectively and for each such realization, we have a term corresponding to the Fourier character that the realization gives.

\end{definition}

\begin{figure}[!h]
	\centering
	\includegraphics[trim={3cm 21cm 3cm 1cm}, clip, scale=1]{efron-stein-sos/images/2shapes.pdf}
	\caption{Left: Shape corresponding to adjacency matrix, Right: Example of a more complicated shape}
	\label{fig: shape}
\end{figure}

The following examples illustrate some simple graph matrices.

\begin{example}[Adjacency matrix]
	Let $\tau$ be the shape on the left in \cref{fig: shape}, with two vertices $V(\tau) = \{u,v\}$ and a single edge $E(\tau) = \{\{ u,v\}\}$. $U_\tau, V_\tau$ are $(u), (v)$ respectively where we use tuples to indicate ordering.
	Then ${\mat{M}}_\tau$ has nonzero entries ${\mat{M}}_\tau[(i), (j)](G) = G_{i, j}$ for all $i \neq j$.
	If $G \in \{ \pm 1\}^{n \choose 2}$ is thought of as a graph, then ${\mat{M}}_\tau$ has as principal submatrix the $\pm 1$ adjacency matrix of $G$ with zeros on the diagonal, and the other entries are $0$.
\end{example}

\begin{example}
	In \cref{fig: shape}, consider the shape $\tau$ on the right. We have $U_{\tau} = (u_1, u_2), V_{\tau} = (v_1), V(\tau) = \{u_1, u_2, v_1, w_1\}$ and $E(\tau) = \{\{u_1, w_1\}, \{u_2, w_1\}, \{w_1, v_1\}\}$. ${\mat{M}}_{\tau}$ is a matrix with rows and columns indexed by sub-tuples of $[n]$. Its nonzero entries are in rows $I$ and columns $J$ with $|I| = |U_{\tau}| = 2$ and $|J| = |V_{\tau}| = 1$ respectively. More specifically, for all distinct $a_1, a_2, b_1$, the entry corresponding to row $(a_1, a_2)$ and column $(b_1)$ is $\sum_{c_1 \in [n] \setminus \{a_1, a_2, b_1\}} G_{a_1, c_1}G_{a_2, c_1}G_{c_1, b_1}$.
	Here, each term is obtained via the realization $\varphi$ that maps $u_1, u_2, w_1, v_1$ to $a_1, a_2, c_1, b_1$ respectively. Succinctly, \[{\mat{M}}_{\tau} =
	\begin{blockarray}{rl@{}c@{}r}
		& & \makebox[0pt]{column $(b_1)$} \\[-0.5ex]
		& & \,\downarrow \\[-0.5ex]
		\begin{block}{r(l@{}c@{}r)}
			&  & \vdots & \\[-0.2ex]
			\text{row }(a_1, a_2) \rightarrow \mkern-9mu & \raisebox{0.5ex}{\makebox[3.2em][l]{\dotfill}} & \sum_{c_1 \in [n] \setminus \{a_1, a_2, b_1\}} G_{a_1, c_1}G_{a_2, c_1}G_{c_1, b_1} & \raisebox{0.5ex}{\makebox[4.2em][r]{\dotfill}} \\[+.5ex]
			&  & \vdots & \\
		\end{block}
	\end{blockarray}\]
\end{example}


Intuitively, graph matrices are symmetrizations of the Fourier basis, where the symmetry is incorporated by summing over all realizations of ``free'' vertices $V(\tau) \setminus U_{\tau} \setminus V_{\tau}$ of the shape $\tau$.
For more examples of graph matrices and why they can be a useful tool to work with, see \cite{ahn2016graph}.

\subsubsection{Norm bounds for dense graph matrices}\label{sec: norm_bounds_for_dense_graph_matrices}

In this section, we study the concentration of the so-called ``dense graph matrices'' which is a term that refers to graph matrices ${\mat{M}}_{\tau}$ in the setting $p = \nicefrac{1}{2}$.
Since the edges of a random graph sampled from ${\mathcal{G}}_{n,1/2}$ can be viewed as independent Rademacher random variables, we can apply our framework in this setting.


In particular, we will obtain bounds on $\Esch{{\mat{M}}_{\tau} - \mathbb{E}{\mat{M}}_{\tau}}{2t}$.
The $G_{i, j} \in \{-1, 1\}$ correspond to the $Z_i$s in \cref{sec: basic_recursion} and for a fixed shape $\tau$, ${\mat{M}}_{\tau}$ will be the matrix ${\mat{F}}$ we are interested in analyzing. For $I, J \in {\mathcal{I}}$, ${\mat{M}}_{\tau}[I, J]$ is a nonzero polynomial only when there exists at least one realization of $\tau$ that maps $U_{\tau}, V_{\tau}$ to $I, J$ respectively. In particular, we must have $|I| = |U_{\tau}|$ and $|J| = |V_{\tau}|$. In this case, ${\mat{M}}_{\tau}[I, J]$ is a homogeneous polynomial of degree $|E(\tau)|$.

By \cref{thm: main_rademacher}, we have
\[\Esch{{\mat{M}}_{\tau} - \mathbb{E}{\mat{M}}_{\tau}}{2t} ~\le~ \sum_{a + b \ge 1\atop a, b \ge 0}(16t|E(\tau)|)^{(a + b)t}\sch{\mathbb{E}{\mat{M}}_{\tau, a, b}}{2t}\]
where for integers $a, b \ge 0$, ${\mat{M}}_{\tau, a, b}$ is defined to be the matrix with rows and columns each indexed by ${\mathcal{I}} \times \{0, 1\}^{\binom{n}{2}}$ such that for all $I, J \in {\mathcal{I}}$, we have
\[{\mat{M}}_{\tau, a, b}[(I, \alpha), (J, \beta)] ~=~ \begin{dcases}
	\nabla_{\alpha + \beta} {\mat{M}}_{\tau}[I, J] & \text{ if $|\alpha|_0 = a, |\beta|_0 = b, \alpha \cdot \beta = 0$}\\
	0 & \text{o.w.}
\end{dcases}
\]

For any multilinear homogeneous polynomial $f$ of degree $d$, since $\mathbb{E}[G_{i, j}] = 0$ for all $i, j$, we have $\mathbb{E}[\nabla_{\alpha}f] = 0$ whenever $|\alpha|_0 < d$. Therefore, $\mathbb{E}{\mat{M}}_{\tau, a, b} = 0$ for all $a + b < |E(\tau)|$. Moreover, $\mathbb{E}{\mat{M}}_{\tau, a, b} = 0$ whenever $a + b \neq |E(\tau)|$, and otherwise $\mathbb{E}{\mat{M}}_{\tau, a, b} = {\mat{M}}_{\tau, a, b}$. So, we can further simplify the above expression to
\[\Esch{{\mat{M}}_{\tau} - \mathbb{E}{\mat{M}}_{\tau}}{2t} ~\le~ \sum_{a + b = |E(\tau)|\atop a, b \ge 0}(16t|E(\tau)|)^{|E(\tau)|t}\sch{{\mat{M}}_{\tau, a, b}}{2t}\]

It remains to analyze $\sch{{\mat{M}}_{\tau, a, b}}{2t}$ for $a + b = |E(\tau)|$. We will see that analyzing these matrices is much simpler since they are deterministic matrices and simple computations using the Frobenius norm bound will work well. To state our final bounds, we need to define the notion of vertex separators of shapes.

\begin{remark}
	As we will see, when analyzing the Frobenius norms for these deterministic matrices, the notion of the minimum vertex separator arises naturally. In prior trace method calculations (e.g. \cite{medarametla2016bounds}, \cite{ahn2016graph}), this required ingenious combinatorial observations.
\end{remark}

\begin{restatable}[Vertex separator]{definition}{vertexseparator}
	For a shape $\tau$, define a vertex separator to be a subset of vertices $S \subseteq V(\tau)$ such that there is no path from $U_{\tau}$ to $V_{\tau}$ in $\tau \setminus S$, which is the shape obtained by deleting all the vertices of $S$ (including all edges they're incident on).
\end{restatable}

For a shape $\tau$, denote by $S_{\tau}$ a vertex separator of the smallest size. Also, let $I_{\tau}$ be the set of isolated vertices (vertices with degree $0$) in $V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, so the presence of these vertices essentially scales the matrix by a scalar factor.

\begin{theorem}\label{thm: dense_graph_matrix_norm_bounds}
	For a shape $\tau$ and any integer $t \ge 1$,
	\[\mathbb{E}\sch{{\mat{M}}_{\tau} - \mathbb{E} {\mat{M}}_{\tau}}{2t} \le \bigg(C^{t|E(\tau)|}n^{|V(\tau)|} t^{t|E(\tau)|}|E(\tau)|^{2t|E(\tau)|}\bigg)n^{t(|V(\tau)| - |S_{\tau}| + |I_{\tau}|)}\]
	for an absolute constant $C > 0$.
\end{theorem}

Up to lower order terms, the same result has been shown before in \cite{medarametla2016bounds, ahn2016graph}. To interpret this bound, assume that $\tau$ has a constant number of vertices. By setting $t \approx {\mathrm{polylog}}(n)$, we get \[\norm{{\mat{M}}_{\tau}} = \widetilde{\operatorname{O}}\left(\sqrt{n}^{|V(\tau)| - |S_{\tau}| + |I_{\tau}|}\right)\] with high probability, where $\widetilde{\operatorname{O}}$ hides logarithmic factors.
This is obtained by applying Markov's inequality on the bound on $\Esch{{\mat{M}}_{\tau}}{2t}$. If $\tau$ has at least one edge, then $\mathbb{E} {\mat{M}}_{\tau} = 0$ and \cref{thm: dense_graph_matrix_norm_bounds} yields such bounds. If $\tau$ has no edges, then it's quite simple to obtain such a bound and we include it in \cref{lem: empty_shape} for the sake of completeness. \cref{cor: dense_graph_matrix_norm_bounds} makes precise the high probability bound above. Therefore, this power of $n$ is essentially what controls the norm bound and this is utilized heavily in applications (e.g. \cite{BHKKMP16, ghosh2020sum, potechin2020machinery}).

\begin{proof}[Proof of \cref{thm: dense_graph_matrix_norm_bounds}]
    We first argue that we can assume $I_{\tau} = \emptyset$. This is because of the following reason. Each distinct vertex in $\tau$ of degree $0$ essentially scales the matrix by a factor of at most $n$. And in the right hand side of the inequality, each vertex in $I_{\tau}$ contributes a factor of $n^{2t}$ accordingly, from $n^{t|V(\tau)|}$ and from $n^{t|I_{\tau}|}$, and the other changes only weaken the inequality.

	Now, fix $a, b \ge 0$ such that $a + b = |E(\tau)|$ and consider ${\mat{M}}_{\tau, a, b}$. For $I, J \in {\mathcal{I}}, \alpha, \beta \in \{0, 1\}^{\binom{n}{2}}$ such that $|\alpha|_0 = a, |\beta|_0 = b, \alpha \cdot \beta = 0$, by definition,
    \begin{align*}
        {\mat{M}}_{\tau, a, b}[(I, \alpha), (J, \beta)] &~=~ \nabla_{\alpha + \beta} \left(\sum_{\varphi: \varphi(U_{\tau}) = I, \varphi(V_{\tau})= J} \prod_{(u, v) \in E(\tau)} G_{\varphi(u), \varphi(v)}\right)\\
        &~=~ |\{\varphi ~|~ \varphi(U_{\tau}) = I, \varphi(V_{\tau})= J, \varphi(E(\tau)) = \supp(\alpha + \beta)\}|
    \end{align*}
    where $\supp(.)$ denotes the support. We will now obtain norm bounds on these deterministic matrices by reinterpreting them as graph matrices for different shapes.



    Let $P = (E_1, E_2)$ denote the partition of $E(\tau) = E_1 \sqcup E_2$ into two ordered sets $E_1, E_2$, where $\sqcup$ denotes disjoint union. Then, we can write ${\mat{M}}_{\tau, a, b} = \sum_{P \in {\mathcal{P}}} {\mat{M}}_{\tau, a, b, P}$ where
    \[{\mat{M}}_{\tau, a, b, P}[(I, \alpha), (J, \beta)] ~=~ |\{\varphi ~|~ \varphi(U_{\tau}) = I, \varphi(V_{\tau})= J, \varphi(E_1) = \supp(\alpha), \varphi(E_2) = \supp(\beta)\}|\]

    Let the set of ordered partitions $P$ be ${\mathcal{P}}$. Then, $|{\mathcal{P}}| \le (4|E(\tau)|)^{|E(\tau)|}$ and so, by \cref{fact: holder},
    \[\sch{{\mat{M}}_{\tau, a, b}}{2t} \le (4|E(\tau)|)^{t|E(\tau)|} \sum_{P \in {\mathcal{P}}}\sch{{\mat{M}}_{\tau, a, b, P}}{2t}\]


    Each ${\mat{M}}_{\tau, a, b, P}$ can be interpreted as a graph matrix for a different shape $\tau_P$, with the same vertex set and no edges. Let $V(\tau_P) = V(\tau), E(\tau_P) = \emptyset$ and set $U_{\tau_P} = U_{\tau} \cup V(E_1), V_{\tau_P} = V_{\tau} \cup V(E_2)$ using a canonical ordering. Then, ${\mat{M}}_{\tau, a, b, P}$ is equal to ${\mat{M}}_{\tau_P}$ up to renaming of the rows and columns. For an illustration, see \cref{fig: evolution_new}.


	\begin{figure}[!h]
		\centering
		\includegraphics[trim={2cm 20cm 2cm 2cm}, clip, scale=0.9]{efron-stein-sos/images/evolution_new.pdf}
		\caption{An example illustrating how $\tau_P$ is defined. In this example, $P$ constrains the blue and red edges to go to $\alpha$ and $\beta$ respectively. $U_{\tau_P}, V_{\tau_P}$ have an ordering on the vertices (not shown here).}
		\label{fig: evolution_new}
	\end{figure}

	This graph matrix has a block diagonal structure indexed by the realizations of the set of common vertices $S = U_{\tau_P} \cap V_{\tau_P}$. Indeed, for $K \in [n]^S$, let ${\mat{M}}_{\tau_P, K}$ be the block of ${\mat{M}}_{\tau_P}$ with $\varphi(S) = K$. Then, ${\mat{M}}_{\tau_P, K}{\mat{M}}_{\tau_P, K'}^\intercal = {\mat{M}}_{\tau_P, K}^\intercal{\mat{M}}_{\tau_P, K'} = 0$ for $K \neq K'$ and so,
	\begin{align*}
		\sch{{\mat{M}}_{\tau, a, b}}{2t} &\le (4|E(\tau)|)^{t|E(\tau)|} \sum_{P \in {\mathcal{P}}}\sch{{\mat{M}}_{\tau_P}}{2t}\\
		&= (4|E(\tau)|)^{t|E(\tau)|} \sum_{P \in {\mathcal{P}}} \sum_{K \in [n]^S}\sch{{\mat{M}}_{\tau_P, K}}{2t}\\
		&\le (4|E(\tau)|)^{t|E(\tau)|} \sum_{P \in {\mathcal{P}}} \sum_{K \in [n]^S}\left(\sch{{\mat{M}}_{\tau_P, K}}{2}\right)^t
	\end{align*}
	where we bounded the Schatten norm by the appropriate power of the Frobenius norm.

	For any fixed $K \in [n]^S$, the entries of ${\mat{M}}_{\tau_P, K}$ take values in $\{0, 1\}$ and the number of nonzero entries is at most $n^{|V(\tau)| - |S|}$ because the realizations of vertices in $S$ are fixed and the other vertices have at most $n$ choices each. Therefore, $\sch{{\mat{M}}_{\tau_P, K}}{2} \le n^{|V(\tau)| - |S|}$.

	Finally, we bound $|S|$ to estimate how large this term can be over all possibilities of $P$.
    We argue that $S$ blocks all paths from $U_{\tau}$ to $V_{\tau}$. To see this, consider any path from $U_{\tau}$ to $V_{\tau}$, it must contain an edge $(u, v) \in E(\tau)$ such that $u \in U_{\tau_P}, v \in V_{\tau_P}$. We must either have $(u, v) \in E_1$, in which case $u,  v \in U_{\tau_P}$ and $v \in S$, or $(u, v) \in E_2$, in which case $u, v \in V_{\tau_P}$ and $u \in S$. In either case, $S$ must contain either $u$ or $v$. This argument implies $S$ must be a vertex separator of $\tau$, giving $|S| \ge |S_{\tau}|$.
   
    For a proof by picture, see \cref{fig: proof_by_picture}.

	\begin{figure}[!h]
		\centering
		\includegraphics[trim={2cm 20cm 2cm 2cm}, clip, scale=0.9]{efron-stein-sos/images/proof_by_picture.pdf}
		\caption{Proof by picture that $|S| \ge |S_{\tau}|$. Green edges can occur in $\tau$, orange edges cannot, so $S$ blocks all paths from $U_{\tau}$ to $V_{\tau}$.}
		\label{fig: proof_by_picture}
	\end{figure}


	We also have the trivial upper bound $|S| \le |V(\tau)|$. Ultimately, this gives
	\begin{align*}
		\sch{{\mat{M}}_{\tau, a, b}}{2t} &\le (4|E(\tau)|)^{t|E(\tau)|} \sum_{P \in {\mathcal{P}}} \sum_{T \in [n]^S}n^{t(|V(\tau)| - |S_{\tau}|)}\\
		&\le (4|E(\tau)|)^{t|E(\tau)|}(4|E(\tau)|)^{|E(\tau)|}n^{|V(\tau)|}n^{t(|V(\tau)| - |S_{\tau}|)}
	\end{align*}
	Along with our prior discussion, we get
    {\footnotesize
	\begin{align*}
		\Esch{{\mat{M}}_{\tau} - \mathbb{E}{\mat{M}}_{\tau}}{2t} &\le 	\sum_{a + b = |E(\tau)|}(16t|E(\tau)|)^{|E(\tau)|t}\sch{{\mat{M}}_{\tau, a, b}}{2t}\\
		&\le \sum_{a + b = 	|E(\tau)|}(16t|E(\tau)|)^{|E(\tau)|t}(4|E(\tau)|)^{t|E(\tau)|}(4|E(\tau)|)^{|E(\tau)|}n^{|V(\tau)|}n^{t(|V(\tau)| - |S_{\tau}|)}\\
		&\le \bigg(C^{t|E(\tau)|}n^{|V(\tau)|} t^{t|E(\tau)|}|E(\tau)|^{2t|E(\tau)|}\bigg)n^{t(|V(\tau)| - |S_{\tau}|)}
	\end{align*}}
	for an absolute constant $C > 0$.
\end{proof}


In the proof above, our analysis of the shape $\tau_P$, which has no edges, applies in general to any shape $\tau$ with no edges. For the sake of completeness, we state it explicitly in the following lemma.

\begin{lemma}\label{lem: empty_shape}
	For a shape $\tau$ with no edges and any integer $t \ge 1$,
	\[\Esch{{\mat{M}}_{\tau}}{2t} \le n^{|U_{\tau} \cap V_{\tau}|}n^{t(|V(\tau)| - |U_{\tau} \cap V_{\tau}| + |I_{\tau}|)}\]
\end{lemma}

Note that this has the same form as \cref{thm: dense_graph_matrix_norm_bounds} because for a shape $\tau$ with no edges, the minimum vertex separator $S_{\tau}$ is just $U_{\tau} \cap V_{\tau}$.

The following corollary obtains high probability norm bounds for norms of graph matrices via Markov's inequality.

\begin{corollary}\label{cor: dense_graph_matrix_norm_bounds}
	For a shape $\tau$ and any constant $\varepsilon > 0$, with probability at least $1 - \varepsilon$,
	\[\norm{{\mat{M}}_{\tau}} \le (C|E(\tau)| \log(n^{|V(\tau)|}/\varepsilon))^{|E(\tau)|}\cdot\sqrt{n}^{|V(\tau)| - |S_{\tau}| + |I_{\tau}|}\]
	for an absolute constant $C > 0$.
\end{corollary}

\begin{proof}
	If $E(\tau) = \emptyset$, we invoke \cref{lem: empty_shape}. Otherwise, $\mathbb{E}{\mat{M}}_{\tau} = 0$ and we invoke \cref{thm: dense_graph_matrix_norm_bounds}. By an application of Markov's inequality,
	\begin{align*}
		\Pr[\norm{{\mat{M}}_{\tau}} \ge \theta] &\le \Pr[\sch{{\mat{M}}_{\tau}}{2t} \ge \theta^{2t}]\\
		&\le \theta^{-2t} \mathbb{E}\sch{{\mat{M}}_{\tau}}{2t}\\
		&\le \theta^{-2t}\bigg((C')^{t|E(\tau)|}n^{|V(\tau)|} t^{t|E(\tau)|}|E(\tau)|^{2t|E(\tau)|}\bigg)n^{t(|V(\tau)| - |S_{\tau}| + |I_{\tau}|)}
	\end{align*}
	for an absolute constant $C' > 0$. We now set
	\[\theta = \bigg(\varepsilon^{-1/(2t)} (C'')^{|E(\tau)|} n^{|V(\tau)|/(2t)} t^{|E(\tau)|/2}|E(\tau)|^{|E(\tau)|}\bigg)\sqrt{n}^{|V(\tau)| - |S_{\tau}| + |I_{\tau}|}\] for an absolute constant $C'' > 0$, to make this expression at most $\varepsilon$. Set $t = \frac{1}{2} \log(n^{|V(\tau)|}/\varepsilon)$ to complete the proof.
\end{proof}

\section{Introduction}\label{sec: intro}
\input{intro}

\section{Preliminaries}\label{sec: prelims}
\input{prelims}

\section{The basic framework for Rademacher random variables} \label{sec: basic_recursion}

\input{rademacher_recursion}

\section{Applications}\label{sec:rademacher-applications}

To illustrate our framework, we apply it to obtain concentration bounds for nonlinear random matrices that have been considered in the literature before. The first one is a simple tensor network that arose in the analysis of spectral algorithms for a variant of principal components analysis (PCA) \cite{hopkins2015tensor, hopkins2018statistical}.
The second application is to obtain norm bounds on dense graph matrices \cite{medarametla2016bounds, ahn2016graph}. In the second application, the norm bounds are governed by a combinatorial structure called \textit{the minimum vertex separator of a shape}. We will see how this notion arises naturally under our framework, while prior works that derived such bounds used the trace power method and required nontrivial combinatorial insights.

\subsection{A simple tensor network}

\input{tensor_network_norm_bound}

\subsection{Graph matrices}\label{sec: dense_graph_matrices}

\input{dense_graph_matrices}

\section{Why a na\"ive application of \cite{paulin2016} may fail for general product distributions} \label{sec: failure_of_basic}

\input{failure_of_basic}

\section{The general recursion framework}\label{sec: general_recursion}

\input{general_recursion}

\section{A generalization of \cite{paulin2016} and proof of \cref{lem: main_general}}\label{sec: proof_of_general}

\input{proof_of_general}

\section{Application: Sparse graph matrices} \label{sec: sparse_graph_matrices}

\input{sparse_graph_matrices}



\bibliographystyle{alpha}


\subsection{Generalizing \cite{paulin2016} via explicit inner kernels}\label{sec: explicit_inner_kernels}

In our setting, observe that $(Z, Z')$ has the same distribution as $(Z', Z)$. This is what is known as an \textit{exchangeable pair} of variables, that will be extremely useful for our analysis. In particular, $Z, Z'$ have the same distribution and $\mathbb{E} f(Z, Z') = \mathbb{E} f(Z', Z)$ for every integrable function $f$.

\begin{definition}[Laplacian operator ${\mathcal{L}}$]
	Define the operator ${\mathcal{L}}$ on the space ${\mathcal{S}}$ as
	\[{\mathcal{L}}(f)(Z) = \mathbb{E}[f(Z) - f(Z') | Z]\]
	for all polynomials $f \in {\mathcal{S}}$.
\end{definition}

Note that this operator is well-defined since for any $f \in {\mathcal{S}}$, $\mathbb{E}[{\mathcal{L}}(f)] = \mathbb{E}[\mathbb{E}[f(Z) - f(Z') | Z]] = \mathbb{E}[f(Z) - f(Z')] = 0$ and hence, ${\mathcal{L}}(f) \in {\mathcal{S}}$.

\begin{lemma}\label{lem: eigenvector}
	For all $\alpha \in \mathbb{N}^n$, $\chi_{\alpha}$ is an eigenvector of ${\mathcal{L}}$ with eigenvalue $\frac{|\alpha|_0}{n}$.
\end{lemma}

\begin{proof}
	Recall that $Z'$ is obtained by choosing $i \in [n]$ uniformly at random and then setting $Z' = Z^{(i)}$. Therefore,
	\begin{align*}
		{\mathcal{L}}(\chi_{\alpha})(Z) &= \mathbb{E}[\chi_{\alpha}(Z) - \chi_{\alpha}(Z') | Z]\\
		&= \frac{1}{n}\sum_{i \le n} \mathbb{E}[\chi_{\alpha}(Z) - \chi_{\alpha}(Z^{(i)}) | Z]
	\end{align*}
	When $\alpha_i = 0$, $\chi_{\alpha}(Z) - \chi_{\alpha}(Z^{(i)}) = 0$. Otherwise, $\mathbb{E}[\chi_{\alpha}(Z) - \chi_{\alpha}(Z^{(i)})|Z] = \chi_{\alpha}(Z)$. Therefore, the above expression simplifies to $\frac{|\alpha|_0}{n} \chi_{\alpha}(Z)$.
\end{proof}

\begin{theorem}[Explicit Kernel]\label{thm: explicit_kernel_for_poly}
	For any mean-centered polynomial $f \in {\mathcal{S}}$, there exists a polynomial $K_f$ on $2n$ variables $z_1, \ldots, z_n, z_1', \ldots, z_n'$, denoted collectively as $(z, z')$, with the following properties
	\begin{enumerate}
		\item $K_f(z', z) = -K_f(z, z')$
		\item $\mathbb{E}[K_f(Z, Z') | Z] = f(Z)$ where $(Z, Z')$ is the exchangeable pair we consider above.
	\end{enumerate}
\end{theorem}

\begin{proof}


	Using \cref{propn: basis} and \cref{lem: eigenvector}, under the basis of polynomials $\chi_{\alpha}$, the operator ${\mathcal{L}}$ is a diagonal matrix with nonzero diagonal entries and therefore, ${\mathcal{L}}^{-1}$ exists and is explicitly given by
	\[{\mathcal{L}}^{-1}(f)(Z) = \sum_{\alpha} \frac{n}{|\alpha|_0}\coef{f}{\alpha} \chi_{\alpha}(Z)\]
	We then take $K_f(z, z') = {\mathcal{L}}^{-1}(f)(z) - {\mathcal{L}}^{-1}(f)(z')$. The first condition is obvious and for the second condition, we have
	\[\mathbb{E}[K_f(Z, Z')|Z] = \mathbb{E}[{\mathcal{L}}^{-1}(f)(Z) - {\mathcal{L}}^{-1}(f)(Z') | Z] = {\mathcal{L}}({\mathcal{L}}^{-1}(f)) = f\]
\end{proof}

As seen in the proof of \cref{thm: explicit_kernel_for_poly}, ${\mathcal{L}}$ has a well-defined inverse ${\mathcal{L}}^{-1}$. We now define the matrix ${\mat{K}}_{k, a, b}$ that we call the \textit{inner kernel}.

\begin{definition}[The inner kernel matrix ${\mat{K}}_{k, a, b}$]
	For integers $k \ge 1, a, b \ge 0$ such that $a + b < k$, define the matrix ${\mat{K}}_{k, a, b} \in {\mathbb R}[Z]^{{\mathcal{I}} \times {\mathcal{K}}} \times {\mathbb R}[Z]^{{\mathcal{J}} \times {\mathcal{K}}}$ taking $2n$ variables $(z, z') = (z_1, \ldots, z_n, z_1', \ldots, z_n')$ as input as follows
	\[{\mat{K}}_{k, a, b}(z, z') = {\mathcal{L}}^{-1}({\mat{G}}_{k, a, b})(z) - {\mathcal{L}}^{-1}({\mat{G}}_{k, a, b})(z')\]
\end{definition}

In the rest of this section except where explicitly stated, fix integers $k \ge 1, a, b \ge 0$ such that $a + b < k$. Then, the inner kernel ${\mat{K}}_{k, a, b}$ is well-defined.

\begin{lemma}\label{lem: explicit_kernel_for_matrices}
	${\mat{K}}_{k, a, b}(Z, Z') = \frac{n}{k - a - b}({\mat{G}}_{k, a, b}(Z) - {\mat{G}}_{k, a, b}(Z'))$
\end{lemma}

\begin{proof}
	\begin{align*}
		{\mat{K}}_{k, a, b}(Z, Z') &= {\mathcal{L}}^{-1}({\mat{G}}_{k, a, b})(Z) - {\mathcal{L}}^{-1}({\mat{G}}_{k, a, b})(Z')\\
		&= \sum_{|\alpha|_0 = k - a - b} \coef{{\mat{G}}_{k, a, b}}{\alpha} ({\mathcal{L}}^{-1}(\chi_{\alpha})(Z) - {\mathcal{L}}^{-1}(\chi_{\alpha})(Z'))\\
		&= \frac{n}{k - a - b}\sum_{|\alpha|_0 = k - a - b} \coef{{\mat{G}}_{k, a, b}}{\alpha} (\chi_{\alpha}(Z) - \chi_{\alpha}(Z'))\\
		&= \frac{n}{k - a - b}({\mat{G}}_{k, a, b}(Z) - {\mat{G}}_{k, a, b}(Z'))
	\end{align*}
\end{proof}

The following lemma establishes important properties of the inner kernel, including how it interacts with ${\mat{D}}_1$ and ${\mat{D}}_2$.

\begin{lemma}\label{lem: props_of_exp_kernel_mat}
	${\mat{K}}_{k, a, b}$ satisfies the following properties
	\begin{enumerate}
		\item ${\mat{K}}_{k, a, b}(z', z) = -{\mat{K}}_{k, a, b}(z, z')$
		\item $\mathbb{E}[{\mat{K}}_{k, a, b}(Z, Z') | Z] = {\mat{G}}_{k, a, b}(Z)$
		\item $({\mat{D}}_1(Z) - {\mat{D}}_1(Z')){\mat{K}}_{k, a, b}(Z, Z') = {\mat{K}}_{k, a, b}(Z, Z')({\mat{D}}_2(Z) - {\mat{D}}_2(Z')) = 0$.
	\end{enumerate}
\end{lemma}

\begin{proof}
	The first equality is obvious from the definition. For the second equality, note that $\mathbb{E}[{\mat{G}}_{k, a, b}] = 0$ and ${\mat{K}}_{k, a, b}$ is defined by replacing each entry $f$ of ${\mat{G}}_{k, a, b}$ by the kernel polynomial $K_f$ as exhibited in \cref{thm: explicit_kernel_for_poly}. Now, we prove the third equality.

	Consider the matrix $({\mat{D}}_1(Z) - {\mat{D}}_1(Z')){\mat{K}}_{k, a, b}(Z, Z')$ whose $[(I, \alpha_1, \gamma_1), (J, \alpha_2, \gamma_2)]$ entry is given by
	\[\frac{n}{k - a - b}\sqrt{\mathbb{E}[Z^{2\alpha_1\cdot (1 - \gamma_1)}]}(Z^{\alpha_1 \cdot \gamma_1} - (Z')^{\alpha_1\cdot \gamma_1})(\nabla_{\alpha_1 + \alpha_2}{\mat{X}}_k[I, J](Z) - \nabla_{\alpha_1 + \alpha_2}{\mat{X}}_k[I, J](Z'))\]
	where we have used \cref{lem: explicit_kernel_for_matrices}.
	We will argue that this term is identically $0$.
	We must have $Z' = Z^{(i)}$ for some $i \le n$. If $(\alpha_1 \cdot \gamma_1)_i = 0$, then $Z^{\alpha_1 \cdot \gamma_1} = (Z')^{\alpha_1\cdot \gamma_1}$ and the above term is $0$.
	Otherwise, $(\alpha_1 + \alpha_2)_i \neq 0$ and so $\nabla_{\alpha_1 +\alpha_2}$ on any polynomial $f$ will only contain the terms independent of $Z_i$, in which case $\nabla_{\alpha_1 + \alpha_2}{\mat{X}}_k[I, J](Z) = \nabla_{\alpha_1 + \alpha_2}{\mat{X}}_k[I, J](Z')$. In this case as well, the above term is $0$. The proof of the other equality is analogous.
\end{proof}

The reason we call ${\mat{K}}_{k, a, b}$ the inner kernel is because, as seen above, it serves as a kernel for the inner matrix ${\mat{G}}$ in the decomposition ${\mat{F}} = {\mat{D}}{\mat{G}}{\mat{D}}$.

Since we will need to work with Hermitian dilations, we define \[{\mat{D}} = \begin{bmatrix}
	{\mat{D}}_1 & 0\\
	0 & {\mat{D}}_2
\end{bmatrix}\]

We will use the following basic fact extensively in our manipulations.

\begin{fact}
	For any matrix ${\mat{A}} \in {\mathbb R}[Z]^{{\mathcal{I}}\times {\mathcal{K}}} \times {\mathbb R}[Z]^{{\mathcal{J}}\times {\mathcal{K}}}$, ${\mat{D}} \herm{{\mat{A}}}{\mat{D}} = \herm{{\mat{D}}_1{\mat{A}}{\mat{D}}_2}$.
\end{fact}

\begin{proof}
	We have
	\begin{align*}
		{\mat{D}} \herm{{\mat{A}}}{\mat{D}} =
		\begin{bmatrix}
			{\mat{D}}_1 & 0\\
			0 & {\mat{D}}_2
		\end{bmatrix}
		\begin{bmatrix}
			0 & {\mat{A}} \\
			{\mat{A}}^\intercal & 0
		\end{bmatrix}
		\begin{bmatrix}
			{\mat{D}}_1 & 0\\
			0 & {\mat{D}}_2
		\end{bmatrix}
		&=
		\begin{bmatrix}
			0 & {\mat{D}}_1{\mat{A}}\\
			{\mat{D}}_2{\mat{A}}^\intercal  & 0
		\end{bmatrix}
		\begin{bmatrix}
			{\mat{D}}_1 & 0\\
			0 & {\mat{D}}_2
		\end{bmatrix}\\
		&=
		\begin{bmatrix}
			0 & {\mat{D}}_1{\mat{A}}{\mat{D}}_2\\
			{\mat{D}}_2{\mat{A}}^\intercal{\mat{D}}_1  & 0
		\end{bmatrix}\\
		&= \herm{{\mat{D}}_1{\mat{A}}{\mat{D}}_2}
	\end{align*}
\end{proof}

We start with a generalized version of a result from \cite{paulin2016}.

\begin{lemma}\label{lem: deviation_bound}
	Let ${\mat{K}} = \herm{{\mat{K}}}_{k, a, b}$. For any symmetric matrix valued function ${\mat{R}}$ on the variables $Z$ of the same dimensions as ${\mat{K}}$, such that $\mathbb{E}\norm{{\mat{K}}(Z, Z'){\mat{R}}(Z)}  < \infty$, we have
	\[\mathbb{E}[\herm{{\mat{F}}}_{k, a, b}(Z){\mat{R}}(Z)] = \frac{1}{2}\mathbb{E}[{\mat{D}}(Z){\mat{K}}(Z, Z'){\mat{D}}(Z) ({\mat{R}}(Z) - {\mat{R}}(Z'))]\]
\end{lemma}

\begin{proof}
	By \cref{lem: props_of_exp_kernel_mat}, we have
	\begin{align*}
		\mathbb{E}[\herm{{\mat{F}}}_{k, a, b}(Z){\mat{R}}(Z)] &= \mathbb{E}[{\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z) {\mat{R}}(Z)]\\
		&= \mathbb{E}[{\mat{D}}(Z)\mathbb{E}[{\mat{K}}(Z, Z') | Z]{\mat{D}}(Z) {\mat{R}}(Z)]\\
		&= \mathbb{E}[{\mat{D}}(Z){\mat{K}}(Z, Z'){\mat{D}}(Z) {\mat{R}}(Z)]
	\end{align*}
	where the second equality follows from condition $2$ of \cref{lem: props_of_exp_kernel_mat} and the third follows from the pull-through property of expectations. Continuing,
	\begin{align*}
		\mathbb{E}[\herm{{\mat{F}}}_{k, a, b}(Z){\mat{R}}(Z)] &= \mathbb{E}[{\mat{D}}(Z){\mat{K}}(Z, Z'){\mat{D}}(Z){\mat{R}}(Z)]\\
		&= \mathbb{E}[{\mat{D}}(Z'){\mat{K}}(Z', Z){\mat{D}}(Z') {\mat{R}}(Z')]\\
		&= -\mathbb{E}[{\mat{D}}(Z'){\mat{K}}(Z, Z'){\mat{D}}(Z') {\mat{R}}(Z')]\\
		&= -\mathbb{E}[{\mat{D}}(Z){\mat{K}}(Z, Z'){\mat{D}}(Z') {\mat{R}}(Z')]\\
		&= -\mathbb{E}[{\mat{D}}(Z){\mat{K}}(Z, Z'){\mat{D}}(Z) {\mat{R}}(Z')]
	\end{align*}
	Here, the second equality follows from the fact that $(Z, Z')$ has the same distribution as $(Z', Z)$, so we can exchange them. The third, fourth and fifth equalities follow from conditions $1, 3, 3$ of \cref{lem: props_of_exp_kernel_mat} respectively. Adding the two displays, we get the result.
\end{proof}

\begin{definition}[Matrices ${\mat{U}}_{k, a, b}, {\mat{V}}_{k, a, b}$]
	We define the following matrices
	\[{\mat{U}}_{k, a, b} = \mathbb{E}[(\herm{{\mat{F}}}_{k, a, b}(Z) - \herm{{\mat{F}}}_{k, a, b}(Z'))^2|Z]\]
	\[{\mat{V}}_{k, a, b} = \mathbb{E}[({\mat{D}}(Z)\herm{{\mat{K}}}_{k, a, b}(Z, Z'){\mat{D}}(Z))^2|Z]\]
\end{definition}

The definition of ${\mat{U}}_{k, a, b}$ is essentially unchanged from \cite{paulin2016}, where it is called the \textit{conditional variance}. The definition of ${\mat{V}}_{k, a, b}$ is slightly different in our setting. This lets us exploit the specific product structure exhibited by $\herm{{\mat{F}}}_{k, a, b}$ and the special properties of the inner kernel from \cref{lem: props_of_exp_kernel_mat}.

We will now prove a lemma which is similar to a lemma shown in \cite{paulin2016}.

\begin{lemma}\label{lem: main_pmt_bound}
	For any $s > 0$ and for any integer $t \ge 1$,
	\begin{align*}
		\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}
		&\le \left(\frac{2t - 1}{4}\right)^t\Esch{s{\mat{U}}_{k, a, b} + s^{-1}{\mat{V}}_{k, a, b}}{t}
	\end{align*}
\end{lemma}

To prove this, we will need the following inequality.

\begin{lemma}[Polynomial mean value trace inequality, \cite{paulin2016}]\label{lem: mean_value_trace_inequality}
	For all matrices ${\mat{A}}, {\mat{B}}, {\mat{C}} \in \mathbb{H}^d$, all integers $q \ge 1$ and all $s > 0$,
	\begin{align*}
		|\tr [{\mat{C}}({\mat{A}}^q - {\mat{B}}^q)]| \le \frac{q}{4} \tr[(s({\mat{A}} - {\mat{B}})^2 + s^{-1}{\mat{C}}^2)({\mat{A}}^{q - 1} + {\mat{B}}^{q - 1})]
	\end{align*}
\end{lemma}

\begin{proof}[Proof of \cref{lem: main_pmt_bound}]
	We start by invoking \cref{lem: deviation_bound} by setting ${\mat{R}}(Z) = \herm{{\mat{F}}}_{k, a, b}^{2t - 1}(Z)$.
	\begin{align*}
		\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t} &= \mathbb{E}\tr [\herm{{\mat{F}}}_{k, a, b}\cdot\herm{{\mat{F}}}_{k, a, b}^{2t - 1}]\\
		&= \frac{1}{2}\mathbb{E}\tr[{\mat{D}}(Z)\herm{{\mat{K}}}_{k, a, b}(Z, Z'){\mat{D}}(Z) (\herm{{\mat{F}}}_{k, a, b}^{2t - 1}(Z) - \herm{{\mat{F}}}_{k, a, b}^{2t - 1}(Z'))]
	\end{align*}

	Applying \cref{lem: mean_value_trace_inequality},
    {\footnotesize
	\begin{align*}
		&\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}\\
		&\le (\frac{2t - 1}{8})\mathbb{E}\tr[(s(\herm{{\mat{F}}}_{k, a, b}(Z) - \herm{{\mat{F}}}_{k, a, b}(Z'))^2 + s^{-1}({\mat{D}}(Z)\herm{{\mat{K}}}_{k, a, b}(Z, Z'){\mat{D}}(Z))^2)(\herm{{\mat{F}}}_{k, a, b}^{2t - 2}(Z) + \herm{{\mat{F}}}_{k, a, b}^{2t - 2}(Z'))]\\
		&= (\frac{2t - 1}{4})\mathbb{E}\tr[(s(\herm{{\mat{F}}}_{k, a, b}(Z) - \herm{{\mat{F}}}_{k, a, b}(Z'))^2 + s^{-1}({\mat{D}}(Z)\herm{{\mat{K}}}_{k, a, b}(Z, Z'){\mat{D}}(Z))^2)\herm{{\mat{F}}}_{k, a, b}^{2t - 2}(Z)]
	\end{align*}
}
	where the last line used the fact that $(Z, Z')$ has the same distribution as $(Z', Z)$ and applied condition $3$ of \cref{lem: props_of_exp_kernel_mat}. Using the definitions of ${\mat{U}}_{k, a, b}$ and ${\mat{V}}_{k, a, b}$, we get
	\begin{align*}
		\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t} &\le \frac{2t - 1}{4}\mathbb{E}\tr[(s{\mat{U}}_{k, a, b} + s^{-1}{\mat{V}}_{k, a, b})\herm{{\mat{F}}}_{k, a, b}^{2t - 2}]\\
		&\le \frac{2t - 1}{4}\left(\Esch{s{\mat{U}}_{k, a, b} + s^{-1}{\mat{V}}_{k, a, b}}{t}\right)^{1/t}(\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t})^{(t - 1)/t}
	\end{align*}
	where we used H\"{o}lder's inequality for the trace and H\"{o}lder's inequality for the expectation. Rearranging gives the result.
\end{proof}

\subsection{Proof of \cref{lem: main_general}}

\cref{lem: main_pmt_bound} suggests that in order to bound $\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}$, it suffices to bound $\Esch{{\mat{U}}_{k, a, b}}{t}$ and $\Esch{{\mat{V}}_{k, a, b}}{t}$. Indeed, this will be our strategy. To bound $\Esch{{\mat{U}}_{k, a, b}}{t}$, we will bound it via the matrices that we define below.

\begin{definition}[Matrices ${\mat{\Del}}_1^{k, a, b}, {\mat{\Del}}_2^{k, a, b}, {\mat{\Del}}_3^{k, a, b}$]
	Define the matrices
	\[{\mat{\Del}}_1^{k, a, b} = \mathbb{E}[(({\mat{D}}(Z) - {\mat{D}}(Z'))\herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z))^2|Z]\]
	\[{\mat{\Del}}_2^{k, a, b} = \mathbb{E}[({\mat{D}}(Z)(\herm{{\mat{G}}}_{k, a, b}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z')){\mat{D}}(Z))^2|Z]\]
	\[{\mat{\Del}}_3^{k, a, b} = \mathbb{E}[({\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z)({\mat{D}}(Z) - {\mat{D}}(Z')))^2|Z]\]
\end{definition}

\begin{lemma}\label{lem: bound_U_by_Deltas}
	${\mat{U}}_{k, a, b} \preceq 3({\mat{\Del}}_1^{k, a, b} + {\mat{\Del}}_2^{k, a, b} + {\mat{\Del}}_3^{k, a, b})$.
\end{lemma}

To prove this lemma, we will use the following lemma.

\begin{lemma}\label{lem: orthogonality}
	We have the relations
	\[({\mat{D}}(Z) - {\mat{D}}(Z'))(\herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z'){\mat{D}}(Z')) = 0\]
	\[(\herm{{\mat{G}}}_{k, a, b}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z'))({\mat{D}}(Z) - {\mat{D}}(Z')) = 0\]
\end{lemma}

\begin{proof}[Proof sketch]
	The proof is similar to the proof of the third equality in \cref{lem: props_of_exp_kernel_mat}. Suppose $Z' = Z^{(i)}$ for some $i \le n$. Whenever a diagonal entry of ${\mat{D}}(Z) - {\mat{D}}(Z')$ is nonzero, the corresponding row of $\herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z'){\mat{D}}(Z')$ is $0$. The second equality is analogous.
\end{proof}

\begin{proof}[Proof of \cref{lem: bound_U_by_Deltas}]
	We have
	\begin{align*}
		&(\herm{{\mat{F}}}_{k, a, b}(Z) - \herm{{\mat{F}}}_{k, a, b}(Z'))^2\\
		&= ({\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z) - {\mat{D}}(Z')\herm{{\mat{G}}}_{k, a, b}(Z'){\mat{D}}(Z'))^2\\
		&= \bigg({\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z)({\mat{D}}(Z) - {\mat{D}}(Z')) + {\mat{D}}(Z)(\herm{{\mat{G}}}_{k, a, b}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z')){\mat{D}}(Z')\\
        &\qquad + ({\mat{D}}(Z) - {\mat{D}}(Z'))\herm{{\mat{G}}}_{k, a, b}(Z'){\mat{D}}(Z')\bigg)^2\\
		&= \bigg({\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z)({\mat{D}}(Z) - {\mat{D}}(Z')) + {\mat{D}}(Z)(\herm{{\mat{G}}}_{k, a, b}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z')){\mat{D}}(Z)\\
        & \qquad + ({\mat{D}}(Z) - {\mat{D}}(Z'))\herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z)\bigg)^2
	\end{align*}
	where the last equality follows from \cref{lem: orthogonality}. Taking expectations conditioned on $Z$ and applying \cref{fact: cs}, we immediately get ${\mat{U}}_{k, a, b} \preceq 3({\mat{\Del}}_1^{k, a, b} + {\mat{\Del}}_2^{k, a, b} + {\mat{\Del}}_3^{k, a, b})$.
\end{proof}

In subsequent sections, we will prove the following technical bounds on the matrices we have considered so far.

\begin{restatable}{lemma}{boundDelTwo}\label{lem: bound_on_Del2}
	For all integers $t \ge 1$, \[\Esch{{\mat{\Del}}_2^{k, a, b}}{t} \le \frac{(2d_p)^t}{n^t} (\Esch{\herm{{\mat{F}}}_{k, a, b + 1}}{2t} + \Esch{\herm{{\mat{F}}}_{k, a + 1, b}}{2t})\]
\end{restatable}

\begin{restatable}{lemma}{boundV}\label{lem: bound_on_V}
	For all integers $t \ge 1$, \[\Esch{{\mat{V}}_{k, a, b}}{t} \le (2d_p)^tn^t (\Esch{\herm{{\mat{F}}}_{k, a, b + 1}}{2t} + \Esch{\herm{{\mat{F}}}_{k, a + 1, b}}{2t})\]
\end{restatable}

\begin{restatable}{lemma}{boundDelOne}\label{lem: bound_on_Del1}
	For all integers $t \ge 1$, \[\Esch{{\mat{\Del}}_1^{k, a, b}}{t} \le \frac{(8dd_p)^t}{n^t}\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}\]
\end{restatable}

\begin{restatable}{lemma}{boundDelThree}\label{lem: bound_on_Del3}
	For all integers $t \ge 1$, \[\Esch{{\mat{\Del}}_3^{k, a, b}}{t} \le \frac{(4d_p)^t}{n^t}\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}\]
\end{restatable}

Assuming the above lemmas, we can complete the proof of \cref{lem: main_general}, which we restate for convenience.

\maingeneral*

\begin{proof}[Proof of \cref{lem: main_general}]
	Using \cref{lem: main_pmt_bound}, \cref{lem: bound_U_by_Deltas}, we get that for any $s > 0$,
	\begin{align*}
		\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}
		&\le (\frac{2t - 1}{4})^t\Esch{s{\mat{U}}_{k, a, b} + s^{-1}{\mat{V}}_{k, a, b}}{t}\\
		&\le t^t(s^t\Esch{{\mat{U}}_{k, a, b}}{t} + s^{-t}\Esch{{\mat{V}}_{k, a, b}}{t})\\
		&\le (9st)^t(\Esch{{\mat{\Del}}_1^{k, a, b}}{t} + \Esch{{\mat{\Del}}_2^{k, a, b}}{t} + \Esch{{\mat{\Del}}_3^{k, a, b}}{t}) + t^ts^{-t}\Esch{{\mat{V}}_{k, a, b}}{t}
	\end{align*}
	Let $\rho = s / n$. Since the inequality is true for any choice of $s > 0$, it is true for any choice of $\rho > 0$.
	Now, using \cref{lem: bound_on_Del1}, \cref{lem: bound_on_Del3},
	\begin{align*}
		(9st)^t(\Esch{{\mat{\Del}}_1^{k, a, b}}{t} + \Esch{{\mat{\Del}}_3^{k, a, b}}{t}) &\le (9st)^t\bigg(\frac{(8dd_p)^t}{n^t} + \frac{(4d_p)^t}{n^t}\bigg)\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}\\
		&\le \rho^t (C_1tdd_p)^t\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}
	\end{align*}
	for an absolute constant $C_1 > 0$. Using \cref{lem: bound_on_Del2}, \cref{lem: bound_on_V},
    {\footnotesize
	\begin{align*}
		(9st)^t\Esch{{\mat{\Del}}_2^{k, a, b}}{t} + t^ts^{-t}\Esch{{\mat{V}}_{k, a, b}}{t} & \le\bigg((9st)^t\frac{(2d_p)^t}{n^t} + t^ts^{-t}(2d_p)^tn^t\bigg)(\Esch{\herm{{\mat{F}}}_{k, a, b + 1}}{2t} + \Esch{\herm{{\mat{F}}}_{k, a + 1, b}}{2t})\\
		&\le (\rho^tC_2^t + \rho^{-t}C_3^t) (td_p)^t (\Esch{\herm{{\mat{F}}}_{k, a, b + 1}}{2t} + \Esch{\herm{{\mat{F}}}_{k, a + 1, b}}{2t})
	\end{align*}
}
	for absolute constants $C_2, C_3 > 0$.
	Therefore,
	\begin{align*}
		\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t} &\le \rho^t (C_1tdd_p)^t\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t} \\&\qquad+ (\rho^tC_2^t + \rho^{-t}C_3^t) (td_p)^t(\Esch{\herm{{\mat{F}}}_{k, a, b + 1}}{2t} + \Esch{\herm{{\mat{F}}}_{k, a + 1, b}}{2t})
	\end{align*}
	We choose $\rho > 0$ so that $\rho^t (C_1tdd_p)^t = \frac{1}{2}$ to get
	\begin{align*}
		\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t} &\le \frac{1}{2}\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t} + \frac{1}{2}(Ct^2dd_p^2)^t (\Esch{\herm{{\mat{F}}}_{k, a, b + 1}}{2t} + \Esch{\herm{{\mat{F}}}_{k, a + 1, b}}{2t})
	\end{align*}
	for an absolute constant $C > 0$.
	Rearranging yields the result.
\end{proof}

\subsection{Bounding ${\mat{\Del}}_2^{k, a, b}$ and ${\mat{V}}_{k, a, b}$}

The next lemma relates ${\mat{V}}_{k, a, b}$ to ${\mat{\Del}}_2^{k, a, b}$ up to a factor of $n^2$, which will be enough for us. We can then focus on bounding ${\mat{\Del}}_2^{k, a, b}$.

\begin{lemma}\label{lem: bounding_V_loewner}
	${\mat{V}}_{k, a, b} \preceq n^2 {\mat{\Del}}_2^{k, a, b}$
\end{lemma}

\begin{proof}
	Using \cref{lem: explicit_kernel_for_matrices},
	\begin{align*}
		{\mat{V}}_{k, a, b} &= \mathbb{E}[({\mat{D}}(Z)\herm{{\mat{K}}}_{k, a, b}(Z, Z'){\mat{D}}(Z))^2|Z]\\
		&= \mathbb{E}[({\mat{D}}(Z)\bigg(\frac{n}{k - a - b}(\herm{{\mat{G}}}_{k, a, b}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z'))\bigg){\mat{D}}(Z))^2|Z]\\
		&\preceq n^2\mathbb{E}[({\mat{D}}(Z)(\herm{{\mat{G}}}_{k, a, b}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z')){\mat{D}}(Z))^2|Z]\\
		&= n^2 {\mat{\Del}}_2^{k, a, b}
	\end{align*}
\end{proof}

For $1 \le i \le n$ and $1 \le l \le d$, let $\mat{e}_{i, l} \in \mathbb{N}^n$ denote the vector $\alpha$ with $\alpha_i = l$ and $\alpha_j = 0$ for $j \neq i$.
We note the following simple proposition.

\begin{propn}\label{propn: difference_equality}
	For any polynomial $f$ such that the degree of $Z_i$ is at most $d$, \[f(Z) - f(Z^{(i)}) = \sum_{1 \le l \le d} (Z_i^l - \resamp{Z_i}^l)\nabla_{\mat{e}_{i, l}}(f)\]
\end{propn}

We now restate and prove \cref{lem: bound_on_Del2}.

\boundDelTwo*

\begin{proof}
	Consider
	\begin{align*}
		{\mat{\Del}}_2^{k, a, b} &= \mathbb{E}[({\mat{D}}(Z)(\herm{{\mat{G}}}_{k, a, b}(Z) - \herm{{\mat{G}}}_{k, a, b}(Z')){\mat{D}}(Z))^2|Z]\\
		&= \mathbb{E}\bigg[ \begin{bmatrix}
			{\mat{M}}\mM^\intercal & 0\\
			0 & {\mat{M}}^\intercal{\mat{M}}
		\end{bmatrix} | Z\bigg]\\
		&= \begin{bmatrix}
			\mathbb{E}[{\mat{M}}\mM^\intercal|Z] & 0\\
			0 & \mathbb{E}[{\mat{M}}^\intercal{\mat{M}}|Z]
		\end{bmatrix}
	\end{align*}
	where ${\mat{M}} = {\mat{D}}_1(Z)({\mat{G}}_{k, a, b}(Z) - {\mat{G}}_{k, a, b}(Z')){\mat{D}}_2(Z)$. Using \cref{propn: difference_equality},
    {\footnotesize
    \begin{align*}
		\mathbb{E}[{\mat{M}}\mM^\intercal | Z] &= \mathbb{E}[{\mat{D}}_1(Z)({\mat{G}}_{k, a, b}(Z) - {\mat{G}}_{k, a, b}(Z')){\mat{D}}_2(Z)\cdot {\mat{D}}_2(Z) ({\mat{G}}_{k, a, b}(Z) - {\mat{G}}_{k, a, b}(Z'))^\intercal{\mat{D}}_1(Z)|Z]\\
		&= \frac{1}{n} \sum_{i = 1}^n\mathbb{E}[{\mat{D}}_1(Z)({\mat{G}}_{k, a, b}(Z) - {\mat{G}}_{k, a, b}(Z^{(i)})){\mat{D}}_2(Z)\cdot {\mat{D}}_2(Z) ({\mat{G}}_{k, a, b}(Z) - {\mat{G}}_{k, a, b}(Z^{(i)}))^\intercal{\mat{D}}_1(Z)|Z]\\
		&= \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^d\mathbb{E}[(Z_i^l - \resamp{Z_i}^l)^2|Z]\cdot {\mat{D}}_1(Z)(\nabla_{\mat{e}_{i, l}} {\mat{G}}_{k, a, b})(Z){\mat{D}}_2(Z)\cdot {\mat{D}}_2(Z) (\nabla_{\mat{e}_{i, l}} {\mat{G}}_{k, a, b})(Z)^\intercal{\mat{D}}_1(Z)
	\end{align*}
}
	Define ${\mat{N}}_{i, l}(Z) := {\mat{D}}_1(Z)(\nabla_{\mat{e}_{i, l}} {\mat{G}}_{k, a, b})(Z){\mat{D}}_2(Z)$. Then,
	\begin{align*}
		\mathbb{E}[{\mat{M}}\mM^\intercal | Z] &= \frac{1}{n}\sum_{i = 1}^n \sum_{l = 1}^d\mathbb{E}[(Z_i^l - \resamp{Z_i}^l)^2|Z]\cdot {\mat{N}}_{i, l}(Z){\mat{N}}_{i, l}(Z)^\intercal\\
		&\preceq \frac{2}{n}\sum_{i = 1}^n \sum_{l = 1}^d(Z_i^{2l} + \mathbb{E}[Z_i^{2l}])\cdot {\mat{N}}_{i, l}(Z){\mat{N}}_{i, l}(Z)^\intercal
	\end{align*}
	Similarly,
	\begin{align*}
		\mathbb{E}[{\mat{M}}^\intercal{\mat{M}} | Z] &\preceq \frac{2}{n}\sum_{i = 1}^n \sum_{l = 1}^d(Z_i^{2l} + \mathbb{E}[Z_i^{2l}])\cdot {\mat{N}}_{i, l}(Z)^\intercal{\mat{N}}_{i, l}(Z)
	\end{align*}

	\begin{claim}\label{claim: reduction_general}
		We have the relations
		\[\sum_{i = 1}^n\sum_{l = 1}^d (Z_i^{2l} + \mathbb{E}[Z_i^{2l}]) \cdot {\mat{N}}_{i, l}(Z){\mat{N}}_{i, l}(Z)^\intercal = (b + 1){\mat{F}}_{k, a, b + 1}{\mat{F}}_{k, a, b + 1}^\intercal\]
		\[\sum_{i = 1}^n \sum_{l = 1}^d(Z_i^{2l} + \mathbb{E}[Z_i^{2l}])\cdot {\mat{N}}_{i, l}(Z)^\intercal{\mat{N}}_{i, l}(Z) = (a + 1){\mat{F}}_{k, a + 1, b}^\intercal{\mat{F}}_{k, a + 1, b}\]
	\end{claim}

	Using this claim, we have
	\[\mathbb{E}[{\mat{M}}\mM^\intercal | Z] \preceq \frac{2(b + 1)}{n}{\mat{F}}_{k, a, b + 1}{\mat{F}}_{k, a, b + 1}^\intercal \preceq \frac{2d_p}{n}{\mat{F}}_{k, a, b + 1}{\mat{F}}_{k, a, b + 1}^\intercal\]
	\[\mathbb{E}[{\mat{M}}^\intercal{\mat{M}} | Z] \preceq \frac{2(a + 1)}{n}{\mat{F}}_{k, a + 1, b}^\intercal{\mat{F}}_{k, a + 1, b} \preceq \frac{2d_p}{n}{\mat{F}}_{k, a + 1, b}^\intercal{\mat{F}}_{k, a + 1, b}\]
	Therefore,
	\begin{align*}
		\Esch{{\mat{\Del}}_2^{k, a, b}}{t} &= \Esch{\mathbb{E}[{\mat{M}}\mM^\intercal|Z]}{t} + \Esch{\mathbb{E}[{\mat{M}}^\intercal{\mat{M}}|Z]}{t}\\
		&\le \frac{(2d_p)^t}{n^t} (\Esch{{\mat{F}}_{k, a, b + 1}}{2t} + \Esch{{\mat{F}}_{k, a + 1, b}}{2t})\\
		&\le \frac{(2d_p)^t}{n^t} (\Esch{\herm{{\mat{F}}}_{k, a, b + 1}}{2t} + \Esch{\herm{{\mat{F}}}_{k, a + 1, b}}{2t})
	\end{align*}
\end{proof}

It remains to prove the claim.
\begin{proof}[Proof of~\cref{claim: reduction_general}]
	We will prove the first relation, the second is analogous.
	For a fixed $i \le n, l \le d$, consider any nonzero entry $[(I_1, \alpha_1, \gamma_1), (I_2, \alpha_2, \gamma_2)]$ of $(Z_i^{2l} + \mathbb{E}[Z_i^{2l}]) {\mat{N}}_{i, l}(Z) {\mat{N}}_{i, l}(Z)^\intercal$, where $I_1, I_2 \in {\mathcal{I}}, (\alpha_1, \gamma_1), (\alpha_2, \gamma_2) \in {\mathcal{K}}$. We must have $|\alpha_1|_0 = |\alpha_2|_0 = a$, in which case the entry is equal to
    {\footnotesize
	\begin{align*}
		\sum_{\substack{(J, \alpha_3, \gamma_3) \in {\mathcal{J}}\times {\mathcal{K}}\\ |\alpha_3|_0 = b \\ \alpha_1\alpha_3 = \alpha_2\alpha_3 = 0}} &(Z_i^{2l} + \mathbb{E}[Z_i^{2l}]) \cdot (\sqrt{\mathbb{E}[Z^{2\alpha_1\cdot (1 - \gamma_1) + 2\alpha_3\cdot (1 - \gamma_3)}]}Z^{\alpha_1\cdot \gamma_1 + \alpha_3\cdot\gamma_3}\nabla_{\mat{e}_{i, l}} \nabla_{\alpha_1 + \alpha_3} {\mat{X}}_k[I_1, J])\\
		&\cdot (\sqrt{\mathbb{E}[Z^{2\alpha_2\cdot (1 - \gamma_2) + 2\alpha_3\cdot (1 - \gamma_3)}]}Z^{\alpha_2\cdot \gamma_2 + \alpha_3\cdot\gamma_3}\nabla_{\mat{e}_{i, l}} \nabla_{\alpha_2 + \alpha_3} {\mat{X}}_k[I_2, J])
	\end{align*}}
	Note that the term inside the summation is nonzero only when $\mat{e}_{i, l}\cdot (\alpha_1 + \alpha_3) = \mat{e}_{i, l} \cdot (\alpha_2 + \alpha_3) = 0$. Hence, this sum can be written as
	\begin{align*}
		\sum_{\substack{(J, \alpha_3, \gamma_3) \in {\mathcal{J}}\times {\mathcal{K}}\\ |\alpha_3|_0 = b + 1 \\ \mat{e}_{i, l} \unlhd \alpha_3, \alpha_1\alpha_3 = \alpha_2\alpha_3 = 0}} &(\sqrt{\mathbb{E}[Z^{2\alpha_1\cdot (1 - \gamma_1) + 2\alpha_3\cdot (1 - \gamma_3)}]}Z^{\alpha_1\cdot \gamma_1 + \alpha_3\cdot\gamma_3}\nabla_{\alpha_1 + \alpha_3} {\mat{X}}_k[I_1, J])\\
		&\cdot (\sqrt{\mathbb{E}[Z^{2\alpha_2\cdot (1 - \gamma_2) + 2\alpha_3\cdot (1 - \gamma_3)}]}Z^{\alpha_2\cdot \gamma_2 + \alpha_3\cdot\gamma_3}\nabla_{\alpha_2 + \alpha_3} {\mat{X}}_k[I_2, J])
	\end{align*}
	When we add this entry over all $i \le n, l \le d$, this simplifies to
	\begin{align*}
		(b + 1) \cdot \sum_{\substack{(J, \alpha_3, \gamma_3) \in {\mathcal{J}}\times {\mathcal{K}}\\ |\alpha_3|_0 = b + 1 \\ \alpha_1\alpha_3 = \alpha_2\alpha_3 = 0}} &(\sqrt{\mathbb{E}[Z^{2\alpha_1\cdot (1 - \gamma_1) + 2\alpha_3\cdot (1 - \gamma_3)}]}Z^{\alpha_1\cdot \gamma_1 + \alpha_3\cdot\gamma_3}\nabla_{\alpha_1 + \alpha_3} {\mat{X}}_k[I_1, J])\\
		&\cdot (\sqrt{\mathbb{E}[Z^{2\alpha_2\cdot (1 - \gamma_2) + 2\alpha_3\cdot (1 - \gamma_3)}]}Z^{\alpha_2\cdot \gamma_2 + \alpha_3\cdot\gamma_3}\nabla_{\alpha_2 + \alpha_3} {\mat{X}}_k[I_2, J])
	\end{align*}
	The factor of $(b + 1)$ came because the index $i$ could have been chosen from among all the active indices in $\alpha_3$. But this is precisely the $[(I_1, \alpha_1, \gamma_1), (I_2, \alpha_2, \gamma_2)]$ entry of $(b + 1){\mat{F}}_{k, a, b + 1}{\mat{F}}_{k, a, b + 1}^\intercal$, proving the claim.
\end{proof}

We restate and prove \cref{lem: bound_on_V}.

\boundV*

\begin{proof}
	Using \cref{lem: bounding_V_loewner} and \cref{lem: bound_on_Del2}, we get
	\begin{align*}
		\Esch{{\mat{V}}_{k, a, b}}{t} &\le n^{2t}\Esch{{\mat{\Del}}_2^{k, a, b}}{t}\\
		&\le (2d_p)^tn^t (\Esch{\herm{{\mat{F}}}_{k, a, b + 1}}{2t} + \Esch{\herm{{\mat{F}}}_{k, a + 1, b}}{2t})
	\end{align*}
\end{proof}

\subsection{Bounding ${\mat{\Del}}_1^{k, a, b}$ and ${\mat{\Del}}_3^{k, a, b}$}

Define $\sqcup$ to be the disjoint union of sets. For $1 \le i \le n$ and $1 \le l \le d$, define the diagonal matrices ${\mat{\Pi}}_{i, l}, {\mat{\Pi}}_{i, l}', {\mat{\Pi}}_i, {\mat{\Pi}}_i' \in {\mathbb R}^{({\mathcal{I}} \times {\mathcal{K}}) \sqcup ({\mathcal{J}} \times {\mathcal{K}})} \times {\mathbb R}^{({\mathcal{I}} \times {\mathcal{K}}) \sqcup ({\mathcal{J}} \times {\mathcal{K}})}$ (the same dimensions as ${\mat{D}}$) as
{\footnotesize
\[{\mat{\Pi}}_{i, l}[(I, \alpha, \gamma), (I, \alpha, \gamma)] = \begin{dcases}
	1 & \text{ if $(\alpha \cdot \gamma)_i \neq 0$ and $\alpha_i = l$}\\
	0 & \text{ o.w.}
\end{dcases}\qquad {\mat{\Pi}}_i[(I, \alpha, \gamma), (I, \alpha, \gamma)] = \begin{dcases}
	1 & \text{ if $(\alpha \cdot \gamma)_i \neq 0$}\\
	0 & \text{ o.w.}
\end{dcases}\]
\[{\mat{\Pi}}'_{i, l}[(I, \alpha, \gamma), (I, \alpha, \gamma)] = \begin{dcases}
	1 & \text{ if $\alpha_i \neq 0$ and $\alpha_i = l$}\\
	0 & \text{ o.w.}
\end{dcases}\qquad {\mat{\Pi}}_i'[(I, \alpha, \gamma), (I, \alpha, \gamma)] = \begin{dcases}
	1 & \text{ if $\alpha_i \neq 0$}\\
	0 & \text{ o.w.}
\end{dcases}\]
}
for all $I \in {\mathcal{I}} \sqcup {\mathcal{J}}$.
Note that for all $i \le n$, ${\mat{\Pi}}_i = \sum_{l = 1}^d {\mat{\Pi}}_{i, l}$.

Also, for all $1 \le i \le n$, we define the permutation matrices ${\mat{\Sig}}_i \in{\mathbb R}^{({\mathcal{I}} \times {\mathcal{K}}) \sqcup ({\mathcal{J}} \times {\mathcal{K}})} \times {\mathbb R}^{({\mathcal{I}} \times {\mathcal{K}}) \sqcup ({\mathcal{J}} \times {\mathcal{K}})}$ as follows. Consider the permutation $\sigma_1$ on ${\mathcal{I}}\times {\mathcal{K}}$ that transposes $(I, \alpha, \gamma)$ and $(I, \alpha, \gamma + \mat{e}_i)$ for all $(I, \alpha, \gamma) \in {\mathcal{I}}\times {\mathcal{K}}$ such that $\alpha_i \neq 0$. Here, $\mat{e}_i \in \{0, 1\}^n$ has exactly one nonzero entry, which is in the $i$th position, and $\gamma + \mat{e}_i$ is the usual addition over $\mathbb{F}_2$. $\sigma_1$ leaves other positions fixed. Let ${\mat{\Sig}}^{(1)}_i$ be the permutation matrix for $\sigma_1$. Similarly, let ${\mat{\Sig}}^{(2)}_i$ be the permutation matrix of the permutation $\sigma_2$ on ${\mathcal{J}} \times {\mathcal{K}}$ that transposes $(J, \alpha, \gamma)$ and $(J, \alpha, \gamma + \mat{e}_i)$ for all $(J, \alpha, \gamma) \in {\mathcal{J}}\times {\mathcal{K}}$ such that $\alpha_i \neq 0$, and leaves all other positions fixed. Then, we define ${\mat{\Sig}}_i = \begin{bmatrix}
	{\mat{\Sig}}^{(1)}_i & 0\\
	0 & {\mat{\Sig}}^{(2)}_i
\end{bmatrix}$. The following fact is easy to verify.

\begin{fact}\label{fact: commutativity}
	${\mat{\Pi}}'_{i, l}{\mat{\Sig}}_i = {\mat{\Sig}}_i{\mat{\Pi}}'_{i, l}$ and ${\mat{\Pi}}_i' {\mat{\Sig}}_i = {\mat{\Sig}}_i {\mat{\Pi}}_i'$.
\end{fact}

We are now ready to prove \cref{lem: bound_on_Del1} which we restate for convenience.

\boundDelOne*

\begin{proof}
	Firstly,
	\begin{align*}
		{\mat{\Del}}_1^{k, a, b} &= \mathbb{E}[(({\mat{D}}(Z) - {\mat{D}}(Z'))\herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z))^2|Z]\\
		&= \mathbb{E}[({\mat{D}}(Z) - {\mat{D}}(Z'))\herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z)\cdot {\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z)({\mat{D}}(Z) - {\mat{D}}(Z'))|Z]\\
		&= \mathbb{E}[({\mat{D}}(Z) - {\mat{D}}(Z')) {\mat{M}}(Z) ({\mat{D}}(Z) - {\mat{D}}(Z'))|Z]
	\end{align*}
	where we define ${\mat{M}}(Z) = \herm{{\mat{G}}}_{k, a, b}(Z){\mat{D}}(Z)\cdot {\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z)$.
	Recall that $Z' = Z^{(i)}$ for some $i$ randomly chosen from $[n]$ uniformly. Observing that ${\mat{D}}(Z) - {\mat{D}}(Z^{(i)}) = {\mat{\Pi}}_i({\mat{D}}(Z) - {\mat{D}}(Z^{(i)}))$ for all $i$, we get
	\begin{align*}
		{\mat{\Del}}_1^{k, a, b}
		&= \mathbb{E}[ \mathbb{E}_{i \in [n]} [({\mat{D}}(Z) - {\mat{D}}(Z^{(i)})) {\mat{M}}(Z) ({\mat{D}}(Z) - {\mat{D}}(Z^{(i)}))]|Z]\\
		&= \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i({\mat{D}}(Z) - {\mat{D}}(Z^{(i)})) {\mat{M}}(Z) ({\mat{D}}(Z) - {\mat{D}}(Z^{(i)})){\mat{\Pi}}_i]|Z]\\
		&\preceq 2\bigg(\mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i{\mat{D}}(Z){\mat{M}}(Z){\mat{D}}(Z){\mat{\Pi}}_i]|Z] + \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i{\mat{D}}(Z^{(i)}){\mat{M}}(Z){\mat{D}}(Z^{(i)}){\mat{\Pi}}_i]|Z]\bigg)\\
		&\preceq 2\bigg(\mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i\herm{{\mat{F}}}_{k, a, b}^2{\mat{\Pi}}_i] + \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i{\mat{D}}(Z^{(i)}){\mat{M}}(Z){\mat{D}}(Z^{(i)}){\mat{\Pi}}_i]|Z]\bigg)\\
		& \preceq 2({\mat{\Del}}_{10} + {\mat{\Del}}_{11})
	\end{align*}
	where we define
	\[{\mat{\Del}}_{10} = \mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i\herm{{\mat{F}}}_{k, a, b}^2{\mat{\Pi}}_i], \qquad {\mat{\Del}}_{11} = \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i{\mat{D}}(Z^{(i)}){\mat{M}}(Z){\mat{D}}(Z^{(i)}){\mat{\Pi}}_i]|Z]\]
	Invoking \cref{lem: jensen_trace} over the interval $[0, \infty)$ with the convex continuous function $f(x) = x^t$, ${\mat{B}}_i = \herm{{\mat{F}}}_{k, a, b}^2, {\mat{A}}_i = \frac{1}{\sqrt{d_p}}{\mat{\Pi}}_i$ where we observe that $\sum_{i = 1}^n {\mat{A}}_i {\mat{A}}_i^T = \frac{1}{d_p}\sum_{i = 1}^n {\mat{\Pi}}_i^2\preceq {\mat{I}}$, we get

	\begin{align*}
		\Esch{{\mat{\Del}}_{10}}{t} = \mathbb{E}\tr[{\mat{\Del}}_{10}^t] = \mathbb{E}\tr[\bigg(\mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i\herm{{\mat{F}}}_{k, a, b}^2{\mat{\Pi}}_i]\bigg)^t] &= \frac{1}{n^t}\mathbb{E}\tr[\bigg(\sum_{i = 1}^n{\mat{\Pi}}_i\herm{{\mat{F}}}_{k, a, b}^2{\mat{\Pi}}_i\bigg)^t]\\
		&\le \frac{d_p^{t - 1}}{n^t}\mathbb{E}\tr[\bigg(\sum_{i = 1}^n{\mat{\Pi}}_i\herm{{\mat{F}}}_{k, a, b}^{2t}{\mat{\Pi}}_i\bigg)]\\
		&\le \frac{d_p^{t - 1}}{n^t}\mathbb{E}\tr[\bigg(\sum_{i = 1}^n{\mat{\Pi}}_i^2\bigg)\herm{{\mat{F}}}_{k, a, b}^{2t}]\\
		&\le \frac{d_p^t}{n^t}\mathbb{E}\tr[\herm{{\mat{F}}}_{k, a, b}^{2t}]\\
		&= \frac{d_p^t}{n^t}\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}
	\end{align*}

	Now, consider
	\begin{align*}
		{\mat{\Del}}_{11} &= \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{\Pi}}_i{\mat{D}}(Z^{(i)}){\mat{M}}(Z){\mat{D}}(Z^{(i)}){\mat{\Pi}}_i]|Z]\\
		&= \mathbb{E}[\mathbb{E}_{i \in [n]} 	[(\sum_{l = 1}^d{\mat{\Pi}}_{i, l}){\mat{D}}(Z^{(i)}){\mat{M}}(Z){\mat{D}}(Z^{(i)})(\sum_{l = 1}^d{\mat{\Pi}}_{i, l})]|Z]\\
		&\preceq d\cdot \mathbb{E}[\mathbb{E}_{i \in [n]} [\sum_{l = 1}^d{\mat{\Pi}}_{i, l}{\mat{D}}(Z^{(i)}){\mat{M}}(Z){\mat{D}}(Z^{(i)}){\mat{\Pi}}_{i, l}]|Z]\\
		&= d\cdot \mathbb{E}_{i \in [n]} [\sum_{l = 1}^d \frac{\mathbb{E}[Z_i^{2l}]}{Z_i^{2l}}{\mat{\Pi}}_{i, l}{\mat{D}}(Z){\mat{M}}(Z){\mat{D}}(Z){\mat{\Pi}}_{i, l}]\\
		&= \frac{d}{n} \sum_{i = 1}^n\sum_{l = 1}^d \frac{\mathbb{E}[Z_i^{2l}]}{Z_i^{2l}}{\mat{\Pi}}_{i, l}{\mat{D}}(Z){\mat{M}}(Z){\mat{D}}(Z){\mat{\Pi}}_{i, l}\\
		&= \frac{d}{n} \sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Pi}}_{i, l}{\mat{\Sig}}_i{\mat{D}}(Z){\mat{M}}(Z){\mat{D}}(Z){\mat{\Sig}}_i^\intercal{\mat{\Pi}}_{i, l}\\
		&= \frac{d}{n} \sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Pi}}_{i, l}{\mat{\Sig}}_i\herm{{\mat{F}}}_{k, a, b}^2{\mat{\Sig}}_i^\intercal{\mat{\Pi}}_{i, l}
	\end{align*}
	We now invoke \cref{lem: jensen_trace} on $nd$ terms with ${\mat{B}}_{i, l} = \herm{{\mat{F}}}_{k, a, b}^2$ and ${\mat{A}}_{i, l} = \frac{1}{\sqrt{d_p}} {\mat{\Pi}}_{i, l}{\mat{\Sig}}_i$ where we observe that
	\[\sum_{i = 1}^n\sum_{l = 1}^d {\mat{A}}_{i, l} {\mat{A}}_{i, l}^T = \frac{1}{d_p}\sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Pi}}_{i, l}{\mat{\Sig}}_i{\mat{\Sig}}_i^\intercal{\mat{\Pi}}_{i, l}^\intercal = \frac{1}{d_p}\sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Pi}}_{i, l}^2 \preceq {\mat{I}}\]
	to get
	\begin{align*}
		\Esch{{\mat{\Del}}_{11}}{t} = \mathbb{E} \tr[{\mat{\Del}}_{11}^t] &\le \frac{d^t}{n^t} \mathbb{E}\tr[(\sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Pi}}_{i, l}{\mat{\Sig}}_i\herm{{\mat{F}}}_{k, a, b}^2{\mat{\Sig}}_i^\intercal{\mat{\Pi}}_{i, l})^t]\\
		&\le \frac{(dd_p)^t}{n^t}\mathbb{E}\tr[\bigg(\frac{1}{d_p}\sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Pi}}_{i, l}{\mat{\Sig}}_i\herm{{\mat{F}}}_{k, a, b}^{2t}{\mat{\Sig}}_i^\intercal{\mat{\Pi}}_{i, l}\bigg)]\\
		&= \frac{(dd_p)^t}{n^t}\mathbb{E}\tr[\bigg(\frac{1}{d_p}\sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Sig}}_i^\intercal{\mat{\Pi}}_{i, l}{\mat{\Pi}}_{i, l}{\mat{\Sig}}_i\herm{{\mat{F}}}_{k, a, b}^{2t}\bigg)]
	\end{align*}
	To simplify this, we use \cref{fact: commutativity} to get
    \begin{align*}
    	\sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Sig}}_i^\intercal({\mat{\Pi}}_{i, l})^2{\mat{\Sig}}_i \preceq \sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Sig}}_i^\intercal({\mat{\Pi}}'_{i, l})^2{\mat{\Sig}}_i = \sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Pi}}'_{i, l}{\mat{\Sig}}_i^\intercal{\mat{\Sig}}_i{\mat{\Pi}}'_{i, l} &= \sum_{i = 1}^n\sum_{l = 1}^d {\mat{\Pi}}'_{i, l}{\mat{\Pi}}'_{i, l}\\ &\preceq d_p {\mat{I}}
    \end{align*}
	Therefore,
	\[\Esch{{\mat{\Del}}_{11}}{t} \le  \frac{(dd_p)^t}{n^t}\mathbb{E}\tr[\herm{{\mat{F}}}_{k, a, b}^{2t}] = \frac{(dd_p)^t}{n^t}\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}\]
	Putting them together, using \cref{fact: holder},
	\begin{align*}
		\Esch{{\mat{\Del}}_1^{k, a, b}}{t} &\le 4^t(\Esch{{\mat{\Del}}_{10}}{t} + \Esch{{\mat{\Del}}_{11}}{t})\\
		&\le \frac{(8dd_p)^t}{n^t}\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}
	\end{align*}
\end{proof}

We now restate and prove \cref{lem: bound_on_Del3}.

\boundDelThree*

\begin{proof}
	Recall that $Z' = Z^{(i)}$ for $i$ sampled uniformly from $[n]$. Then,
	\begin{align*}
		{\mat{\Del}}_3^{k, a, b} &= \mathbb{E}[({\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z)({\mat{D}}(Z) - {\mat{D}}(Z')))^2|Z]\\
		&= \mathbb{E}[\mathbb{E}_{i \in [n]} [({\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z)({\mat{D}}(Z) - {\mat{D}}(Z^{(i)})))^2] | Z]\\
		&= \mathbb{E}[\mathbb{E}_{i \in [n]} [({\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}(Z){\mat{\Pi}}_i({\mat{D}}(Z) - {\mat{D}}(Z^{(i)})))^2] | Z]
	\end{align*}
	where we use the fact that ${\mat{D}}(Z) - {\mat{D}}(Z^{(i)}) = {\mat{\Pi}}_i({\mat{D}}(Z) - {\mat{D}}(Z^{(i)}))$ for all $i$. Define ${\mat{M}}(Z) = {\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}$ to get
	\begin{align*}
		{\mat{\Del}}_3^{k, a, b} &= \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i({\mat{D}}(Z) - {\mat{D}}(Z^{(i)}))^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal] | Z]\\
		&\preceq 2(\mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{D}}(Z)^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal] | Z] + \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{D}}(Z^{(i)})^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal] | Z])\\
		&= 2(\mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{D}}(Z)^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal] + \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{D}}(Z^{(i)})^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal] | Z])\\
		&= 2({\mat{\Del}}_{30} + {\mat{\Del}}_{31})
	\end{align*}
	where we define
	\[{\mat{\Del}}_{30} = \mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{D}}(Z)^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal], \qquad {\mat{\Del}}_{31} = \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{D}}(Z^{(i)})^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal] | Z]\]
	We have
	\begin{align*}
		{\mat{\Del}}_{30} = \mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{D}}(Z)^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal] &= \mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{D}}(Z){\mat{\Pi}}_i{\mat{\Pi}}_i{\mat{D}}(Z){\mat{M}}(Z)^\intercal]\\
		&= {\mat{M}}(Z) {\mat{D}}(Z)(\frac{1}{n}\sum_{i = 1}^n{\mat{\Pi}}_i^2){\mat{D}}(Z){\mat{M}}(Z)^\intercal\\
		&\preceq \frac{d_p}{n} {\mat{M}}(Z) {\mat{D}}(Z){\mat{D}}(Z){\mat{M}}(Z)^\intercal\\
		&= \frac{d_p}{n} \herm{{\mat{F}}}_{k, a, b}^2
	\end{align*}
	For the other term, using \cref{fact: commutativity},
	\begin{align*}
		{\mat{\Del}}_{31} &= \mathbb{E}[\mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{D}}(Z^{(i)})^2{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal] | Z]\\
		&= \mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}_i{\mat{\Sig}}_i{\mat{D}}(Z)^2{\mat{\Sig}}_i{\mat{\Pi}}_i{\mat{M}}(Z)^\intercal]\\
		&\preceq \mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Pi}}'_i{\mat{\Sig}}_i{\mat{D}}(Z)^2{\mat{\Sig}}_i{\mat{\Pi}}'_i{\mat{M}}(Z)^\intercal]\\
		&= \mathbb{E}_{i \in [n]} [{\mat{M}}(Z) {\mat{\Sig}}_i{\mat{\Pi}}'_i{\mat{D}}(Z)^2{\mat{\Pi}}'_i{\mat{\Sig}}_i{\mat{M}}(Z)^\intercal]\\&= \mathbb{E}_{i \in [n]} [{\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b} {\mat{\Sig}}_i{\mat{\Pi}}'_i{\mat{D}}(Z)^2{\mat{\Pi}}'_i{\mat{\Sig}}_i\herm{{\mat{G}}}_{k, a, b}{\mat{D}}(Z)]
	\end{align*}
	Observe that $\herm{{\mat{G}}}_{k, a, b} {\mat{\Sig}}_i = \herm{{\mat{G}}}_{k, a, b}$ because the entries of $\herm{{\mat{G}}}$ only depend on $\alpha$ and not on $\gamma$, so permuting the $\gamma$s will not have any effect on the matrix. Therefore,

	\begin{align*}
		{\mat{\Del}}_{31} &\preceq \mathbb{E}_{i \in [n]} [{\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}{\mat{\Pi}}'_i{\mat{D}}(Z)^2{\mat{\Pi}}'_i\herm{{\mat{G}}}_{k, a, b}{\mat{D}}(Z)]\\
		&\preceq \mathbb{E}_{i \in [n]} [{\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}{\mat{D}}(Z){\mat{\Pi}}'_i{\mat{\Pi}}'_i{\mat{D}}(Z)\herm{{\mat{G}}}_{k, a, b}{\mat{D}}(Z)]\\
		&= \mathbb{E}_{i \in [n]} \herm{{\mat{F}}}_{k, a, b}{\mat{\Pi}}'_i{\mat{\Pi}}'_i\herm{{\mat{F}}}_{k, a, b}\\
		&= \frac{1}{n} \sum_{i = 1}^n \herm{{\mat{F}}}_{k, a, b}{\mat{\Pi}}'_i{\mat{\Pi}}'_i\herm{{\mat{F}}}_{k, a, b}\\
		&\preceq \frac{d_p}{n}\herm{{\mat{F}}}_{k, a, b}^2
	\end{align*}
	where we used the fact that $\sum_{i = 1}^n{\mat{\Pi}}'_i{\mat{\Pi}}'_i \preceq d_p {\mat{I}}$. Putting them together,
	\begin{align*}
		\Esch{{\mat{\Del}}_3^{k, a, b}}{t} \le 2^{2t - 1}(\Esch{{\mat{\Del}}_{30}}{t} + \Esch{{\mat{\Del}}_{31}}{t}) \le 2^{2t - 1} \cdot 2 \frac{d_p^t}{n^t}\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t} = \frac{(4d_p)^t}{n^t}\Esch{\herm{{\mat{F}}}_{k, a, b}}{2t}
	\end{align*}
\end{proof}



\subsubsection*{Random Variables}

\subsubsection*{Multi-index notation}

For any pair of vectors $\alpha, \beta \in \mathbb{N}^n$ and scalar $c \in \mathbb{N}$, we define $\alpha + \beta, \alpha \cdot \beta, c\alpha$ entrywise. We also define the orderings $\alpha \le \beta$ and $\alpha \unlhd \beta$ where we say $\alpha \le \beta$ if for each $i$, $\alpha_i \le \beta_i$, and $\alpha \unlhd \beta$ if for each $i$, $\alpha_i$ is either $0$ or $\beta_i$. We denote by $|\alpha|_0$ the number of nonzero entries of $\alpha$ and by $|\alpha|_1$, the sum of entries of $\alpha$. For a boolean vector $\gamma \in \{0, 1\}^n$, we define $1 - \gamma$ to be the vector with all its bits flipped.




\subsubsection*{Derivatives}

For variables $Z_1, \ldots, Z_n$ and $\alpha \in \mathbb{N}^n$, define the monomial $Z^{\alpha} := \prod_{i = 1}^n Z_i^{\alpha_i}$. This forms a standard basis for polynomials.

For $\alpha \in \mathbb{N}^n$, we define the linear operator $\nabla_{\alpha}$ that acts on polynomials by defining its action on the basis elements $Z^{\beta}$ as follows and then extending linearly to all polynomials.
\[\nabla_{\alpha}(Z^{\beta}) = \begin{dcases}
	Z^{\beta - \alpha} & \text{ if $\alpha \unlhd \beta$}\\
	0 & \text{ o.w.}
\end{dcases}\]

Informally, for a polynomial $f$ written as a linear combination of the standard basis polynomials $Z^{\beta}$, $\nabla_{\alpha}(f)$ isolates the terms that precisely contain the powers $Z_i^{\alpha_i}$ for all $i$ such that $\alpha_i \neq 0$ and then truncates these powers. In other words, it's the coefficient of $Z^{\alpha}$ in $f$. In particular, observe that $\nabla_{\alpha}(f)$ does not depend on $Z_i$ for any $i$ such that $\alpha_i \neq 0$.

Suppose $f$ is multilinear, as we can assume in the Rademacher case when we are working with $Z_i \in \{-1,1\}\xspace$. For $\alpha \in \{0, 1\}^n$ with nonzero indices $i_1, \ldots, i_k \in [n]$, we have $\nabla_{\alpha}(f) = \frac{\partial}{\partial Z_{i_1}}\ldots \frac{\partial}{\partial Z_{i_k}}f$. So this linear operator generalizes the partial derivative operator. But note that in general, $\nabla$ is not simply the standard partial derivative operator.



\subsubsection*{Matrix Analysis}

Linear operators that act on polynomials can also be naturally defined to act on matrices by acting on each entry.

We define ${\mat{I}}_m$ to be the $m \times m$ identity matrix. We drop the subscript when it's clear.
For matrices ${\mat{F}}, {\mat{G}}$, define ${\mat{F}} \oplus {\mat{G}}$ to be the matrix $\begin{bmatrix}
	0 & {\mat{F}}\\
	{\mat{G}} & 0
\end{bmatrix}$. For a matrix ${\mat{F}}$, define its Hermitian dilation $\herm{{\mat{F}}}$ as ${\mat{F}} \oplus {\mat{F}}^T$. Denote by $\preceq$ the Loewner order, that is, ${\mat{A}} \preceq {\mat{B}}$ for ${\mat{A}}, {\mat{B}} \in \mathbb{H}^n$ if and only if ${\mat{B}} - {\mat{A}}$ is positive semi-definite.

\begin{definition}
	For a matrix ${\mat{F}}$ and an integer $t \ge 0$, define the Schatten $2t$-norm as
	\[\norm{{\mat{F}}}_{2t}^{2t} = \tr[{({\mat{F}}\mF^T)^t}]\]
\end{definition}


\begin{fact}\label{fact: cs}
	For real symmetric matrices ${\mat{X}}_1, \ldots, {\mat{X}}_n$, we have
	\begin{align*}
		({\mat{X}}_1 + \ldots + {\mat{X}}_n)^2 \preceq n({\mat{X}}_1^2 + \ldots + {\mat{X}}_n^2)
	\end{align*}
\end{fact}

\begin{fact}\label{fact: holder}
	For positive semidefinite matrices ${\mat{X}}, {\mat{X}}_1, \ldots, {\mat{X}}_n$ such that ${\mat{X}} \preceq {\mat{X}}_1 + \ldots + {\mat{X}}_n$ and for any integer $t \ge 1$,
	\begin{align*}
		\tr [{\mat{X}}^t] \le n^{t - 1}(\tr[{\mat{X}}_1^t] + \ldots + \tr[{\mat{X}}_n^t])
	\end{align*}
\end{fact}

\begin{proof}
    By H\"{o}lder's inequality, $n^{t - 1}(\tr[{\mat{X}}_1^t] + \ldots + \tr[{\mat{X}}_n^t]) \ge (\norm{{\mat{X}}_1}_t + \ldots + \norm{{\mat{X}}_n}_t)^t$. By triangle inequality of Schatten norms, this is at least $\norm{{\mat{X}}_1 + \ldots + {\mat{X}}_n}_t^t$. Finally, because ${\mat{X}}_1 + \ldots + {\mat{X}}_n \succeq {\mat{X}}\succeq 0$, we can use the monotonicity of trace functions (see \cite[Proposition 1]{petz1994survey}) where we use the increasing function $f(x) = x^t$ on $x \in [0, \infty)$. This proves the result.
\end{proof}


\begin{lemma}[Jensen's operator trace inequality]\cite[Corollary 2.5]{hansen2003jensen}\label{lem: jensen_trace}
	Let $f$ be a convex, continuous function defined on an interval $I$ and suppose that $0 \in I$ and $f(0) \le 0$. Then, for all integers $m, n \ge 1$, for every tuple ${\mat{B}}_1, \ldots, {\mat{B}}_n$ of real symmetric $m \times m$ matrices with spectra contained in $I$ and every tuple ${\mat{A}}_1, \ldots, {\mat{A}}_n$ of $m \times m$ matrices with $\sum_{i = 1}^n {\mat{A}}_i^T{\mat{A}}_i \preceq {\mat{I}}$, we have
	\[\tr[f(\sum_{i = 1}^n {\mat{A}}_i^T {\mat{B}}_i {\mat{A}}_i)] \le \tr[\sum_{i = 1}^n {\mat{A}}_i^T f({\mat{B}}_i) {\mat{A}}_i]\]
\end{lemma}

\section{Introduction}\label{sec: intro}
\input{efron-stein-sos/intro}

\section{Preliminaries}\label{sec: prelims}
\input{efron-stein-sos/prelims}

\section{The basic framework for Rademacher random variables} \label{sec: basic_recursion}

\input{efron-stein-sos/rademacher_recursion}

\section{Applications}\label{sec:rademacher-applications}

To illustrate our framework, we apply it to obtain concentration bounds for nonlinear random matrices that have been considered in the literature before. The first one is a simple tensor network that arose in the analysis of spectral algorithms for a variant of principal components analysis (PCA) \cite{hopkins2015tensor, hopkins2018statistical}.
The second application is to obtain norm bounds on dense graph matrices \cite{medarametla2016bounds, ahn2016graph}. In the second application, the norm bounds are governed by a combinatorial structure called \textit{the minimum vertex separator of a shape}. We will see how this notion arises naturally under our framework, while prior works that derived such bounds used the trace power method and required nontrivial combinatorial insights.

\subsection{A simple tensor network}

\input{efron-stein-sos/tensor_network_norm_bound}

\subsection{Graph matrices}\label{sec: dense_graph_matrices}

\input{efron-stein-sos/dense_graph_matrices}

\section{Why a na\"ive application of \cite{paulin2016} may fail for general product distributions} \label{sec: failure_of_basic}

\input{efron-stein-sos/failure_of_basic}

\section{The general recursion framework}\label{sec: general_recursion}

\input{efron-stein-sos/general_recursion}

\section{A generalization of \cite{paulin2016} and proof of \cref{lem: main_general}}\label{sec: proof_of_general}

\input{efron-stein-sos/proof_of_general}

\section{Application: Sparse graph matrices} \label{sec: sparse_graph_matrices}

\input{efron-stein-sos/sparse_graph_matrices}
\section{Nonlinear concentration for non-product distributions}

Our techniques in \cref{chap: efron_stein} apply to a collection of random variables that are sampled independently of each other. A natural question is to ask if we can generalize to the case when they are not independent. For example, this is useful when instead of analyzing Erd\H{o}s\xspace-R\'enyi\xspace random graphs, we wish to analyze uniform $d$-regular graphs. Such a generalization seems extremely likely because our proof techniques essentially require a Markov Chain that mixes rapidly to the given distribution, and then we can recursively apply the Poincar\'e\xspace inequality. We leave this for future work.

\section{Sum of Squares lower bounds}

In this dissertation, we saw several SoS lower bounds and while they build on fundamental conceptual building blocks such as the nonlinear concentration results we show and simple heuristics like pseudocalibration, an important technical barrier in the current proofs is that the proofs are highly technical and have many moving parts. It's an important research question to understand if the proofs can be simplified. Apart from enabling a better understanding of the SoS hierarchy, this will also help us understand the computational barriers of several fundamental problems in computer science. Examples of such problems follow.

\subsection{Sparse Independent Set}

In a follow-up work \cite{jones2022sum}, we prove SoS lower bounds for the important problem of maximum independent set on sparse Erd\H{o}s\xspace-R\'enyi\xspace random graphs.

In this dissertation, the SoS lower bounds studied were in the setting when the input was sampled from product distributions where each distribution was either Rademacher or Gaussian. This is also the case in many prior works on SoS lower bounds. Recall that this was termed the \textit{dense setting} in \cref{chap: efron_stein}. It's equally important to study problems in the fascinating average-case \textit{sparse setting} where the input distribution could have high Orlicz norm, for example when the input is an Erd\H{o}s\xspace-R\'enyi\xspace random graph sampled from $G_{n, p}$ instead of $G_{n, \frac{1}{2}}$ for some $p = o(1)$. The techniques developed in this work and prior works for high degree SoS lower bounds do not easily generalize to this setting. The work \cite{jones2022sum} initiates this research direction for the fundamental problem of maximum independent set on random sparse graphs.

Consider the independent set problem on a graph $G \sim G_{n, \frac{d}{n}}$ where $d$ is the average degree. If $d = \frac{n}{2}$, then this is the same as the maximum clique problem and SoS lower bounds were obtained in \cite{BHKKMP16}. We now focus on the setting $d \ll n$. We first state the size of the true optimum.
\begin{fact}[\cite{COE15, DM11, DSS16}]
    W.h.p. the max independent set in $G$ has size $(1+o_d(1)) \cdot \frac{2\ln d}{d} \cdot n$.
\end{fact}

The famous Lov\'asz\xspace $\vartheta$ function efficiently computes an upper bound on this value and its value is well-known on such random graphs.
\begin{fact}[\cite{CO05}]
    W.h.p. $\vartheta(G) =\Theta(\frac{n}{\sqrt{d}} )$.
\end{fact}

The value of the $\vartheta$ function is also the output of the degree $2$ SoS relaxation for this problem. So, there is an integrality gap of approximately $\sqrt{d}$. We therefore naturally ask whether higher degree SoS can perform better or this gap persists. In our work, we show that this $\sqrt{d}$ integrality gap persists for higher degrees of SoS as well.

We prove two main results, one in the setting $(\log n)^2 \le d \le \sqrt{n}$ and the other in the setting $n^{\Omega(1)}  \le d \le \frac{n}{2}$.
Note that we have not covered the case when the average degree $d$ is constant. This is an interesting direction for future work.

In the first setting $(\log n)^2 \le d \le \sqrt{n}$, we show a tradeoff between the degree $D_{\text{SoS}}$ of the SoS relaxation and the integrality gap.
\begin{theorem}
    There is an absolute constant $c_0 \in {\mathbb{N}}$ such that for sufficiently large $n \in {\mathbb{N}}$ and ${d \in [(\log n)^2, n^{0.5}]}$, and parameters $k, D_{\text{SoS}}$ satisfying
    %
    $k ~\leq~ \frac{n}{D_{\text{SoS}}^{c_0}\cdot \log n \cdot d^{1/2}},$
    %
    w.h.p. over $G~\sim~ G_{n,~d/n}$, there exists a degree-$D_{\text{SoS}}$ pseudoexpectation for the maximum independent set problem with objective value $(1-o(1)) k$.
\end{theorem}
In particular, when $d \in [n^{\Omega(1)}, \sqrt{n}]$, this exhibits an SoS lower bound against polynomial degree $n^{\Omega(1)}$ SoS.
In the second setting $n^{\Omega(1)}  \le d \le \frac{n}{2}$, we show an SoS lower bound for logarithmic degree SoS.
\begin{theorem}
    \label{thm:informal-logd}
    For any $\varepsilon_1, \varepsilon_2 >0$ there is $\delta > 0$, such that for $d \in [n^{\varepsilon_1}, n/2]$ and $k \leq \frac{n}{d^{1/2+\varepsilon_2}}$, w.h.p. over $G~\sim~ G_{n,~d/n}$, there exists a degree-$(\delta \log d)$ pseudoexpectation with objective value $(1-o(1))k$.
\end{theorem}

We remark that these theorems rule out polynomial-time certification (i.e. constant
degree SoS) for any $d \geq {\mathrm{polylog}}(n)$.

Broadly speaking, we utilize similar techniques to show these results, namely pseudo-calibration, graph matrices and approximate PSD decomposition. However, the approach does not readily work and we overcome the difficulties with several new ideas and techniques. We summarize some of them below.
\begin{itemize}
    \item The first conceptual difficulty we overcome is that we are unable to apply pseudo-calibration due to the lack of a good candidate planted distribution. For most natural choices of the planted distribution, simple statistics distinguish the random distribution from the planted distribution. While a suitable planted distribution that enables the use of pseudo-calibration may very well exist, we are yet to find one. Instead, in this work, we simply use the na\"ive planted distribution but instead modify the heuristic of pseudo-calibration (that we term \textit{pseudo-calibration with connected truncation}) to construct our candidate moment matrix.
    \item The second conceptual difficulty was the lack of good norm bounds for graph matrices built from sparse graphs. In that work, we utilized the trace method with a careful analysis to obtain better norm bounds. Moreover, as we saw in \cref{chap: efron_stein}, we are able to obtain similar norm bounds without the trace method, using our general recursion theorem.
\end{itemize}
Apart from the above developments, we develop several technical tools such as conditioning, a generalization of the intersection tradeoff lemma, etc.
For more details, see \cite{jones2022sum}.

\subsection{Planted Affine Planes and Maximum Cut}\label{sec:open-problems}

For the Planted Affine Planes problem from \cref{chap: sk} where we sampled $m$ vectors $d_1, \ldots, d_m$ independently from $\mathcal{N}(0, I_n)$, we showed an SoS lower bound for $m \le n^{3/2 - \varepsilon}$. However, from the analysis of $\tilde{\EE}[1]$ in \cref{rmk:pe-one}, we expect a lower bound to hold for $m \ll n^{2 - \varepsilon}$. This is because, as we saw in \cref{chap: sos} and which we will revisit in the next section, analyzing $\tilde{\EE}[1]$ is an established way to hypothesize about the power of SoS. Therefore, we conjecture
\begin{conjecture}
    \cref{theo:sos-bounds} holds with the bound on the number of sampled vectors $m$ loosened to $m \leq n^{2-\varepsilon}$.
\end{conjecture}

Dual to this (in fact, we exploit the duality in our proof in \cref{chap: sk}), we conjecture an SoS lower bound for the Planted Boolean Vector problem holds whenever $p \geq n^{1/2+\varepsilon}$.
\begin{conjecture}
    \cref{theo:boolean-subspace} holds with the bound on the dimension $p$ of a random subspace
    loosened to $p \geq n^{1/2+\varepsilon}$.
\end{conjecture}

We remark that recent work \cite{zadik2021latticebased} has exhibited a polynomial time algorithm for the search variant of Planted Affine Planes for $m \ge n + 1$, as opposed to prior known algorithms that required $m \gg n^2$ \cite{mao2021optimal}. The algorithm in \cite{mao2021optimal} is spectral and robust to noise, moreover it is likely captured by SoS. On the other hand, the algorithm in \cite{zadik2021latticebased} is lattice-based and is not robust to noise, (i.e. it assumes that all vectors must exactly lie in the two planes), and is not captured by SoS.

In our SoS lower bounds for the Planted Boolean Vector problem and the Planted Affine Planes problem, we assumed that the input entries were chosen i.i.d Gaussian or Boolean. In fact, it's plausible that our proof techniques go through when the distribution is ``random enough'', such as the uniform distribution from the sphere. One potential extension of this intuition is as follows: In the Planted Boolean Vector problem, if the subspace is the eigenspace of the bottom
eigenvectors of a random adjacency matrix, the instance should still be
difficult. This last setting arises in Maximum Cut, for which we conjecture the following.

\begin{conjecture}\label{conj: sos_for_max_cut}
    Let $d \geq 3$, and let $G$ be a random $d$-regular graph on $n$ vertices. For some $\delta > 0$, w.h.p. there is a degree-$n^\delta$ pseudoexpectation operator $\tilde{\EE}$ on boolean variables $x_i$ with maximum cut value at least
    \[ \frac{1}{2} + \frac{\sqrt{d-1}}{d}(1 - \operatorname{o}_{d,n}(1)) \]
\end{conjecture}

The above expression is w.h.p. the value of the spectral relaxation for Maximum Cut, therefore qualitatively this conjecture expresses that degree $n^\delta$ SoS cannot significantly tighten the basic spectral relaxation.

We should remark that, with respect to the goal of showing SoS cannot significantly outperform the Goemans-Williamson relaxation, random instances are not integrality gap instances. The main difficulty in comparing (even degree 4) SoS to the Goemans-Williamson algorithm seems to be the lack of a candidate hard input distribution.

Evidence for this conjecture comes from the fact that the only property
required of the random inputs $d_1, \dots, d_m$ was that norm bounds hold for
the graph matrix with Hermite polynomial entries. When the variables
$\{d_{u,i}\}$ are i.i.d from some other distribution, if we use graph matrices
for the orthonormal polynomials under the distribution and assuming suitable
bounds on the moments of the distribution, the same norm bounds
hold~\cite{ahn2016graph}.
When $d_u$ is sampled uniformly from the sphere or another distribution for which the coordinates are not
i.i.d, it seems likely that
similar norm bounds hold. Moreover, as explained in the previous section, the techniques from \cref{chap: efron_stein} will likely be useful to obtain such norm bounds.

\subsection{Unique Games}

The famous Unique Games conjecture (UGC) \cite{Khot02:unique} postulates that a graph theory problem known as the Unique Games problem is NP-hard. This conjecture gained tremendous traction in the community because of its numerous consequences (e.g. \cite{Khot02:unique, KhotKMO04, Raghavendra08}) and connections to various other fields such as metric geometry \cite{KhotV05} and discrete Fourier analysis \cite{KR03}. An exciting array of recent works \cite{dinur2018towards, barak2018small, subhash2018pseudorandom} has shown that a problem closely related to unique games, known as $2$-to-$2$ games, is NP-hard. This is an important step towards proving the UGC and offers evidence that the UGC is true.

On the algorithmic side, there have been various attempts (see e.g. \cite{T05:unique, CharikarMM06, arora2015subexponential}) to disprove the UGC. In particular, Barak et al. \cite{barak2012hypercontractivity} showed that degree $8$ SoS can efficiently solve integrality gap instances of the Unique Games problem that were proposed for linear programs and SDPs considered earlier. This work caused significant interest in the community, since it suggests that SoS might be a way to refute the UGC.

Therefore, it's tremendously important to understand the performance of SoS on the unique games problem. A good first step would be to understand the performance of SoS for the problem of maximum cut, which is a special case of the Unique Games problem. In fact, we can be even more concrete and ask for the performance of SoS for the problem of maximum cut on random graphs, more precisely \cref{conj: sos_for_max_cut}. Lower bounds were shown for degree $2$ and degree $4$ in \cite{MS16, mohanty2020lifting} and generalizing their analyses for higher degree SoS is a nontrivial but important open problem.

\section{Low degree likelihood ratio hypothesis}

As explained in \cref{chap: sos}, the low-degree likelihood ratio hypothesis analytically predicts the computational barriers for hypothesis testing in bounded time, for \textit{sufficiently nice} distributions. See \cite{hop18, kunisky19notes, holmgren2020counterexamples} and references therein for more details. A full proof of this hypothesis is beyond current techniques, since it's likely harder than proving say $P \neq NP$. Despite this, confirming the hypothesis in restricted proof systems is a fascinating and important field for future research. In particular, building on the notation from \cref{chap: sos}, we would like to prove that for sufficiently nice distributions $\nu, \mu$, after pseudo-calibrating, if $\tilde{\EE}[1] = 1 + o(1)$, then there exists an SoS lower bound. Indeed, in this work, we confirm this for several fundamental problems. Proving this in general will go a long way towards understanding the power of bounded-time algorithms.

\section{Technical improvements}

Having covered some general directions for future research, we now specify a few directions for improving some technical aspects of our results.

\subsubsection{Improving parameter dependences}

In many of our lower bounds, we require polynomial decay in the Fourier coefficients. For example, we require a decay of $n^{\varepsilon}$ for each new Fourier character, where $n$ is the input size. This is done to handle various other factors that appear in norm bounds when doing the charging arguments. In the proofs, we term these as vertex or edge decay, corresponding to how they are encoded in the graph matrix arguments we use.
By doing this, we obtain a slightly weaker lower bound. For example, instead of getting a $n^{1/4}$ lower bound (up to polylogarithmic factors) for Tensor PCA, we obtain a $n^{1/4 - \varepsilon}$ lower bound for any $\varepsilon > 0$. In general, while they facilitate the proof, it's not clear that this sort of decay is necessary and it's open to find a tighter analysis so as to close the gap from known upper bounds up to a polylogarithmic factor.

Related to the above discussion, another open problem is to push the degree of SoS higher in our lower bounds. For example, in the Sherrington-Kirkpatrick lower bound, it's open to push the SoS degree from $n^{\varepsilon}$ to $\widetilde{\Omega}(n)$. Our current techniques do not handle this but we expect the lower bound to nevertheless hold.

\subsubsection{Satisfying constraints exactly}

In some of our lower bounds, our planted distributions only approximately satisfy constraints such as having a subgraph of size $k$, having a unit vector $u$, and having $u$ be $k$-sparse. While we would like to use planted distributions which satisfy such constraints exactly, the moment matrix becomes much harder to analyze.

We do resolve it for the Sherrington-Kirkpatrick lower bound by using a rounding technique \cite{ghosh2020sum}.
This same issue also appeared in the SoS lower bounds for planted clique \cite{BHKKMP16}, which was fixed in a recent paper by Pang \cite{Pang21}.
We leave it to future work to resolve this in general.

\section{Implications for Computer Science}

As we saw in the introduction, the current state of affairs in Theoretical Computer Science research seems to be to understand the limits of computation for various problems. Even though there may be potential ultimate goals such as settling the P vs NP problem or even relatively modest goals such as settling the Unique Games Conjecture, there's much to be learnt and uncovered from this process. For example, for various problems, there seems to be a discernible gap between what's information theoretically possible and what's computationally feasible. Our work adds insight into this intriguing phenomenon, known as the information-computation tradeoff.
However, there are also other questions that need answering. For example, what makes certification seemingly harder than estimation or recovery? Can we characterize the precise property of problems that potentially make them hard or easy for various classes of algorithms such as Sum of Squares?
While much rich structure is slowly being uncovered in this general pursuit, a proper understanding still eludes us.
However, applications of what we've discovered so far, both technically and philosophically, are already numerous in various branches of mathematics and science, therefore research in this field is more than for the sake of mere curiosity.
We hope our work serves as meaningful progress towards this grand goal.
\section{Certification problems}

As opposed to search or decision problems, certification problems, given an input, ask for a bound on the objective value that holds with probability $1$, along with a certificate of the output bound. The quality of the algorithm is usually measured in terms of how close the bound gets to the true optimum.

In the running example of maximum cut, given a graph, the task could be to output a value that's always an upper bound on the size of the maximum cut. A simple algorithm could be to simply return the total number of edges in the graph. Indeed, this is a valid certification algorithm but we could ask if one could do better.

This is fundamentally a different approach to algorithm design. Consider the scenario when we are maximizing some objective function and so we desire an upper bound on the optimal value. Then, designing a certification algorithm can be construed as attacking a problem from \textit{above} as opposed to from \textit{below}, the latter of which is the more standard notion of algorithm design.

The notion of linear programming relaxations already provides such certification algorithms. Given a problem that can be formulated as an integer program (as many can be), a natural way to obtain a certification solution is to widen the search space from integral variables to real variables, adding other appropriate constraints as necessary. This is known as relaxing the program. This enables a faster algorithm to attempt to compute the solution, but comes at a loss of only obtaining an approximate solution. More importantly, the objective value obtained by the returned solution is a definite bound on the optimal solution, no matter the input. This is what a certification algorithm desires. Measuring the quality of the returned output often depends on the type of relaxation considered and problem specific structure.

In many cases, it's possible to obtain an approximation algorithm to a problem by looking at a relaxation of the program, obtaining a non-integral solution and rounding it to a valid solution. For the maximum cut problem, this was done by Goemans and Williamson in their seminal work \cite{GW94} where they used a semidefinite programming relaxation, which is more powerful than linear programming relaxations.

In this dissertation, we will focus on a specific class of such certification algorithms, namely the Sum of Squares (SoS) hierarchy, sometimes referred to as the Lasserre hierarchy. The SoS hierarchy is a series of convex relaxations to a given program. By virtue of being a relaxation, they can be used for certification.
Due to its tremendous success for various fundamental optimization problems such as maximum cut, constraint satisfaction, etc., the SoS hierarchy has become a powerful optimization technique. This is further amplified by results that say that the SoS hierarchy is the optimal relaxation among a broad class of semidefinite programming relaxations \cite{lrs15}, and assuming the famous unique games conjecture, it's the best approximation algorithm for every constraint satisfaction problem \cite{Raghavendra08}.
A chief goal of this dissertation is to understand the limits of this powerful technique. We especially focus on the so-called average-case setting, that we will define now.

\section{Average-case analysis}

An important theme in this work is the study of random instances of problems, which is termed average-case analysis. As opposed to traditional worst-case algorithm design, where we wish to design an algorithm that performs well on the worst possible input, there has been an exciting development of research on problems where the input is randomly sampled from a distribution. For instance, in the maximum cut problem, we could assume that the input comes from the Erd\H{o}s\xspace-R\'enyi\xspace family of random graphs, where the number of vertices in the graph is chosen beforehand and each edge is present independently with probability $0.5$.

In average-case algorithm design, we wish to design algorithms that perform well on average-case inputs with high probability, as opposed to all inputs.
This is important because studying the worst case complexity of a problem may not shed light on the intrinsic hardness of the problem. This happens because the worst-case instance input for an algorithm could be highly artificial and contrived. Put another way, in real world scenarios, the inputs for various optimization or search problems we encounter are unlikely to be such instances. This is seen in practice as well. For example, the simplex method for linear programming \cite{dantzig2016linear} is exponentially slow in the worst-case, as was shown by Klee and Minty \cite{klee1972good}, but performs extremely well practically. Various works have tried to explain this behavior, e.g. \cite{borgwardt1982average, smale1983average, borgwardt1988probabilistic, spielman2004smoothed}; a highlight is the work of Spielman and Teng for which they were awarded the G\"{o}del\xspace prize in 2008.

Tremendous effort has been invested to understand the average-case complexity for a wide variety of problems. Research towards designing average-case algorithms brings about a deeper understanding of the core of the problem, enabling the design of worst-case algorithms as well. This can be seen for example for the famous Densest $k$-subgraph problem \cite{bhaskara2010detecting}. In this work, we will focus on average-case analysis.

In our pursuit, fundamental mathematical objects that occur repeatedly are large random matrices. We often desire to understand their behavior.

\section{Underlying theme of this work: Random matrices}

Random matrices are abundant in computer science, especially in the fields of optimization and statistics. Often, the analysis of an algorithm requires analyzing the behavior of certain random matrices that can be constructed from the input. Even outside computer science, random matrix theory is a fundamental field in its own right, having been studied since the early $1900$s, with applications also extending to many branches of mathematics and physics. For a short survey, see \cite{forrester2003developments}.

There has been tremendous effort over the last few decades to develop the theory of random matrices, see the book by Tropp \cite{tropp2015:book}. For example, the matrix-Bernstein inequality studies the behavior of a random weighted sum of matrices; the Wigner semicircle law studies the distribution of the eigenvalues of a random matrix sampled from the Gaussian Orthogonal ensemble. On the other hand, fewer tools are available to understand the behavior of nonlinear random matrices, where each matrix entry is a nonlinear function of the input, say for instance low-degree polynomials.

In our setting, this occurs frequently when trying to analyze the SoS hierarchy for various problems. This is true both when trying to design algorithms via SoS as well as when trying to study the limitations of SoS algorithms, for example, \cite{barak2012hypercontractivity, hopkins2015tensor, schramm2017fast, moitra2019spectral, jones2022sum}. Therefore, we begin with this important endeavor of understanding the behavior of nonlinear random matrices. In the first part of this thesis, we are interested specifically in concentration behavior. We emphasize that this is an important research direction in its own right.

To bound the fluctuations of a random matrix from its mean, measured in terms of spectral or Schatten $t$-norm of the difference, a simple but powerful technique that has been widely used (including in many of the works cited above) is the so-called trace method. In this method, the (centered) random matrix is raised to a large power and the expected trace of the resulting matrix is bounded. While this method gives satisfactory results, it often requires ingenious observations and highly nontrivial combinatorics.

Another approach is as follows. Consider a random matrix that is a function of several independent input variables. We can study its behavior by studying how much it deviates when a single uniformly chosen input entry is resampled. By bounding these local fluctuations, we can bound the global fluctuation of the random matrix.
This technique gives rise to the Efron-Stein inequalities. Originally, they were developed for scalar random variables (which can be thought of as $1 \times 1$ matrices). In this special case, they turned out to be extremely powerful since they have been shown to recover many standard concentration inequalities. Recently, the work \cite{paulin2016} showed a matrix version of the Efron-Stein inequalities.
In this work, we build on this to obtain a general framework for proving concentration of large random matrices.

In the second part of this thesis, in the analysis of SoS algorithms, the fundamental difficulty that appears is to analyze the behavior of a large nonlinear random matrix. In particular, we want to argue that this random matrix is positive semidefinite with high probability over the choice of the input. For this, we exhibit an approximate Cholesky decomposition of the matrix and the proof extensively builds on the concentration results we develop above.

In conclusion, the motif in this work is the study of nonlinear random matrices, where we both build a general framework for analyzing concentration and apply them to study algorithms on fundamental problems.

\section{The Sum of Squares Hierarchy}

Given an optimization problem in the form of a program with polynomial inequality constraints, there have been many works proposing generic approaches to relax the program, in order to obtain good solutions efficiently. Some of the more dominant approaches have been the Lov\'asz\xspace-Schrijver hierarchy \cite{LoS91} and the Sherali-Adams hierarchy \cite{SA90}. Informally speaking, these hierarchies of algorithms lift the program to a larger set of variables, tied together via various constraints, relax and solve the larger program, and finally project the solution down to the original variable space. They are parameterized by an integer known as the degree, where larger degrees offer tighter relaxations at the cost of larger running times.

The Sum of Squares (SoS) hierarchy is a similar optimization technique that harnesses the power of semidefinite programming. For polynomial optimization problems, the SoS hierarchy, first independently investigated by Shor \cite{shor1987approach}, Nesterov \cite{nesterov2000squared}, Parrilo \cite{parrilo2000structured}, Lasserre \cite{lasserre2001global} and Grigoriev \cite{grigoriev2001complexity, Grigoriev01}, offers a sequence of convex relaxations parameterized by an integer called the degree of the SoS hierarchy.
As we increase the degree $d$ of the hierarchy, we get progressively stronger convex relaxations which are solvable in $n^{O(d)}$ time.
This has paved the way for the SoS hierarchy to be almost a blackbox tool for algorithm design. As has been shown in multiple works, it serves as a strong algorithm for various problems, both in the worst case and the average case settings.

Consider our running example of the Maximum Cut problem. The seminal Goemans-Williamson algorithm \cite{GW94:stoc} achieves an approximation factor of $\approx 0.878$ for this problem via a semidefinite programming relaxation. As it turns out, this algorithm is just the degree $2$ SoS hierarchy. This approximation factor is conjectured to be optimal and there has been increasing evidence that this is indeed the case. This highlights an example of why the SoS hierarchy is powerful.

Indeed, there has been tremendous success in using the SoS hierarchy to obtain efficient algorithms for combinatorial optimization problems (e.g., \cite{GW94, AroraRV04, GuruswamiS11, raghavendra2017strongly}) as well as problems stemming from Statistics and Machine Learning (e.g., \cite{barak2012hypercontractivity, bks15, HopSS15, pot17, kothari2017outlier}). In fact, SoS achieves the state-of-the-art approximation guarantees for many fundamental problems such as Sparsest Cut \cite{AroraRV04}, Maximum Cut \cite{GW94}, Tensor PCA \cite{HopSS15} and all Max-$k$-CSPs \cite{Raghavendra08}. As mentioned earlier, for a large class of problems, it's been shown that SoS relaxations are the most efficient among all semidefinite programming relaxations \cite{lrs15}.

The term ``Sum of Squares'' comes from a dual view in proof complexity.
Besides being an algorithmic technique, SoS can be equivalently viewed as giving a proof or certificate of a bound on the optimal value of a polynomial optimization problem.
This work can be traced back to Hilbert's seventeenth problem which has led to work on a proof complexity result known as the Positivstellensatz, which gives conditions under which polynomial systems can be shown to have no solutions, see e.g. \cite{stengle1974nullstellensatz, putinar1993positive, reznick2000some}. The algorithmic implications were originally observed by Lasserre \cite{lasserre2001global} and Parrilo \cite{parrilo2000structured, parrilo2003semidefinite} leading to the interpretation of SoS as an optimization technique as we study in this work.
This duality can be completely formalized and has led to the so-called framework of ``proofs to algorithms'' that has achieved tremendous success, especially recently in robust statistics, see e.g., \cite{kothari2017outlier, karmalkar2019list, hopkins2020mean, bakshi2021robust}. The adage is that if we can find an ``easy'' proof of an identifiability result for a search problem, then it can be automatized to give an algorithm.
We will not explore this in detail here, and we refer the reader to the monograph \cite{FKP19}.

Next, we move on to SoS lower bounds, but before that, we highlight some related techniques that have gained traction in the community recently.

\subsection{Related Algorithmic Techniques}\label{subsec: related_techniques}

Apart from search, decision and certification, researchers have also considered other related types of problems. Consider a problem where the input is sampled from one of two known distributions and we would like to identify which distribution it was sampled from. This is known generally as hypothesis testing. For example, one distribution could be the distribution of Erd\H{o}s\xspace-R\'enyi\xspace random graphs while the other could be the distribution of Erd\H{o}s\xspace-R\'enyi\xspace random graphs but with a large cut planted in them. It's clear that this problem is a different flavor of the maximum cut problem on random graphs. Beyond being interesting in their own right, studying these related formulations offers alternate perspectives and interesting insights into the search or certification variants as well.
Another type of problem, known as recovery problems, is to recover the planted structure when the input is sampled from the latter distribution.

For all the types of problems considered so far, apart from SoS, there have also been several other frameworks of algorithms that have been considered and, in some cases, extensively studied. Examples include
\begin{itemize}
    \item Lov\'asz\xspace-Schrijver and Sherali-Adams hierarchies --- As discussed earlier, these hierarchies lift a program to a larger set of variables and then relax any integrality constraints. The resulting solution is then projected back to the original variables which may then be rounded to an integral solution. These hierarchies are captured by the SoS hierarchy, or in other words, the SoS hierarchy is at least as powerful as these hierarchies \cite{FKP19}.
    \item Low degree polynomials --- For hypothesis testing, low degree polynomials can be used to try and distinguish the two distributions. More precisely, if there is a low degree polynomial such that its expected value on the two distributions behave differently and the variance isn't too large, this can be used to distinguish the two distributions. This is related to the SoS hierarchy and we will revisit this point in more detail later.
    \item Statistical query algorithms --- For hypothesis testing, the statistical query model (SQ) is another popular restricted class of algorithms introduced by \cite{kearns1998efficient}. In this model, for an underlying distribution, we can access it indirectly by querying expected values of functions, up to some error.
    Given access to this oracle, we would like to hypothesis test. SQ algorithms capture a broad class of algorithmic techniques in statistics and machine learning including spectral methods, moment and tensor methods (see e.g. \cite{feldman2017statistical, feldman2021statistical}). SQ algorithms have also been used to study information-computation tradeoffs and more broadly have been studied in other contexts \cite{Feldman2016}. There has also been significant work trying to understand the limits of SQ algorithms (e.g. \cite{feldman2017statistical, feldman2018complexity, diakonikolas2017statistical}). Recent work \cite{brennan2020statistical} has shown that low degree polynomials and statistical query algorithms have equivalent power under mild conditions.
    \item Approximate message passing and other statistical physics techniques such as belief propagation, see e.g. the review \cite{zdeborova2016statistical}.
    \item Local algorithms, see e.g. \cite{elek2010borel, fan2017well, hoppen2018local}.
    \item Circuit models of computation of bounded size, see e.g. \cite{rossman2010average, rossman2014monotone}.
\end{itemize}

\section{Lower bounds against the Sum of Squares Hierarchy}

Because of the incredible success of the SoS hierarchy for a variety of problems, it's an important research direction to study the limits of the SoS hierarchy, which we endeavour to do in this dissertation. In particular, we will focus on average-case problems and as we will see, most of the technical difficulty boils down to the analysis of nonlinear random matrices, to handle which we develop various techniques.

There are many reasons for why studying lower bounds against the SoS hierarchy is important. The SoS hierarchy is general enough to capture a broad class of algorithmic reasoning \cite{FKP19}. In particular, SoS captures the Lov\'asz\xspace-Schrijver and Sherali-Adams hierarchies and under mild restrictions, also statistical query algorithms and algorithms based on low degree polynomials. Therefore, SoS lower bounds indicate to the algorithm designer the intrinsic hardness of the problem and suggest that if they want to break the algorithmic barrier, they need to search for algorithms that are not captured by SoS. Secondly, in average case problem settings, standard complexity theoretic assumptions such as P $\neq$ NP have not been shown to give insight into the limits of efficient algorithms. Instead, lower bounds against powerful techniques such as SoS have served as strong evidence of computational hardness \cite{hop17, hop18}. Thus, understanding the power of the SoS hierarchy on these problems is an important step towards understanding the approximability of these problems. See also the surveys \cite{BS14:ICM, moitra2020sum} for more on this.

There have been relatively fewer works on SoS lower bounds, as opposed to some other classes of algorithms we have discussed, which can be attributed to the sheer technical difficulty of proving such lower bounds. For example, the works \cite{Grigoriev01, Schoenebeck08, KothariMOW17} studied SoS lower bounds for random constraint satisfaction problems. A series of works \cite{feige2000finding, meka2015sum, deshpande2015improved, BHKKMP16, Pang21} studied SoS lower bounds for maximum clique on random graphs. Some other SoS lower bounds, not including the ones in this thesis, are the works \cite{ma_wigderson_15, kothari2018sum, mohanty2020lifting, kunisky2020,  kothari2021stress}.

\section{A summary of our main results}

In the first part of this work, we study concentration behavior of nonlinear random matrices. In the second part, we study lower bounds against the SoS hierarchy for several fundamental problems.

\subsection{Nonlinear matrix concentration via Matrix Efron-Stein}

We start by giving a general theorem on concentration of random matrices whose entries are polynomials of independent random variables. The famous matrix-Bernstein inequality answers this question when we only have linear polynomials. However, understanding the setting of non-linear polynomials is just as important yet it poses significant challenges. When they arise in various applications in the literature, the usual way to handle such random matrices has been the so-called trace method. While this method gives the desired results, sometimes to great effect, applying it usually turns out to be highly nontrivial. In this work, we propose an alternate way to prove matrix concentration via the Matrix Efron-Stein inequalities. We propose a general matrix concentration inequality, the proof of which relies on the powerful method of exchangeable pairs. We show some applications of this inequality and expect it to have significant applications outside what we have explored here.

\subsection{Sum of Squares lower bounds}

We obtain strong sub-exponential time lower bounds against the SoS hierarchy for a variety of fundamental problems in computer science. All our applications start with the so-called pseudocalibration heuristic, reducing the problem to analyzing the behavior of a large random matrix, known as the \textit{moment matrix}. Our conceptual and technical innovations happen at this step. The results we present are as follows.

\subsubsection{Sherrington-Kirkpatrick Hamiltonian}

An important problem in statistical physics, the Sherrington-Kirkpatrick problem is to optimize the quadratic form of a random matrix sampled from the Gaussian Orthogonal Ensemble, over boolean vectors. It's been known for a long time that the true optimal value concentrates at a particular constant, up to scaling. Recently, an efficient algorithm was proposed for this optimization problem. Certification on the other hand was widely believed to be hard beyond the simple spectral algorithm. We provide strong evidence for this by exhibiting lower bounds against SoS for this problem. This work requires us to understand the nullspace of the moment matrix and \textit{nullify it} before applying our matrix concentration tools. Conceptually, this work provides a lot of insight into the behavior of SoS on other fundamental problems such as maximum cut and learning mixtures of Gaussians.

\subsubsection{Sparse PCA}

Sparse PCA is a variant of principal components analysis (PCA), a fundamental routine in statistics and machine learning. We work with the spiked Wishart model, which is the most natural version of this problem, but which has proved quite hard to analyze in SoS.
Prior works have predicted the computational barrier of the recovery of the sparse component, as a tradeoff between the dimension, sparsity and number of samples. We confirm this barrier by proving lower bounds, matching known algorithms, against sub-exponential time SoS. This work involves splitting the random moment matrix into different matrices and using innovative combinatorial charging arguments to study how these matrices interact with each other. Conceptually, this work confirms the computational barrier diagram for this problem, that has been predicted and believed to be true for a long time.

\subsubsection{Planted Slightly Denser subgraph}

Finding a dense subgraph in a given graph is an important problem that has received much scrutiny over the years, both algorithmically as well as from the algorithmic hardness angle. For random instances of the problem under certain parameter regimes, the difficulty of this problem has been conjectured, usually referred to as the PDS conjecture, and this problem has been used as a canonical hard problem to reduce to various other problems and study their computational barriers. Moreover, these hard instances have also been used as a basis for cryptographic schemes.
Therefore, SoS lower bounds against this problem go a long way towards confirming this conjecture. In this work, we exhibit such sub-exponential time lower bounds for certain parameter regimes, where it has been widely believed to require sub-exponential time.

\subsubsection{Tensor PCA}

Tensor PCA is the average-case version of the problem of optimizing homogeneous polynomials over the sphere, which is a fundamental and important problem in optimization due to its connections to a variety of fields. In this work, we prove SoS lower bounds matching known algorithms for this problem, settling the computational barrier for SoS for this problem. It also offers insight on the approximability-inapproximability threshold for general homogeneous polynomial optimization and suggests that random instances may not be the hardest for this problem.

\section{Excluded work}

This dissertation contains the main body of my research conducted during my PhD but there have also been other research directions that have been left out, regrettably. This includes the following works.

\subsection{SoS Lower bounds for Sparse Independent Set}

In our work \cite{jones2022sum}, we show SoS lower bounds for the maximum independent set problem on sparse Erd\H{o}s\xspace-R\'enyi\xspace random graphs, matching the Lov\'asz\xspace theta function up to low order terms. To do this, we build on the tools developed in this dissertation as well as develop a variety of new techniques. In particular, this work is the first venture in the important research direction of understanding the limitations of SoS on sparse random graphs. We highlight that for this work, our nonlinear matrix concentration tools from \cref{chap: efron_stein} are very useful. We will elaborate on this result in \cref{chap: future_work} since it builds on much of the work we will develop in this dissertation.

\subsection{Causal Inference }

Causal inference is the study of discovering and understanding causal relationships in observed data, which has diverse applications in medicine, genetics, economics, epidemics, artificial intelligence, etc. In our work \cite{rajendran2021structure}, we focus on the problem of learning a class of causal models known as Bayesian Networks (BN), from data. This is a classical and fundamental problem since BNs are compact, modular and offer intuitive causal interpretation, which has made them very useful in various fields. We propose and study a new practical algorithm for this problem. It is efficient, provably differs from the widely used Greedy-Equivalence-Search algorithm, and since the algorithm is a general-purpose score-based learning algorithm, it is widely applicable. Also, under some statistical assumptions that are inspired by and which generalize recent works, our algorithm provably recovers the true Bayesian Network, even for non-parametric models, while making no assumptions on linearity, additivity, independent noise or faithfulness. It also suggests interesting potential connections to other machine learning fields such as clustering, forward-backward greedy methods, and kernel methods.

\subsection{Latent Variable modeling}

In our work \cite{kivva2021learning}, we study a relatively understudied but important problem of latent variable modeling of observed data. Building from the previous section, we now have unobserved (sometimes even unmeasurable!) latent causes or confounders for the observed variables. We focus on the setting of probabilistic mixture models, which naturally comes up in machine learning, economics, finance, biology, etc. Under some natural assumptions on the model, we develop an algorithm that takes the observed data and uncovers the hidden variables and the underlying causal relationships. Prior works related to this problem have usually focused on special settings such as linear models. We instead propose an algorithm to this problem in the highly nonlinear mixture models setting which works atop existing algorithms for mixture model order estimation (which is easier than density estimation).

\subsection{Causal representation learning}

An exciting new branch of machine learning, known as causal representation learning, takes as input raw, unstructured data, and aims to learn the underlying generative model that generated it. On top of this, it also aims to learn the causal relationships among the learnt latent variables, hence the name causal representation learning. In particular, this field brings together ideas from two fields which have largely developed separately, namely causal inference and latent variable modeling, the two topics described above. In our work \cite{kivva2022identifiability}, we prove an interesting and surprising result in this direction. We show that a broad class of generative models with a mixture of Gaussians prior is identifiable (which means it can be recovered from raw data). In particular, our models have universal approximation capabilities and have been used extensively (without theoretical validation) in many practical works on deep representation learning \cite{dilokthanakul2016deep, jiang2016variational, willetts2021don}.

In deep learning, there has been tremendous effort to identify the latent features and the mechanisms that generate observed data. Instead of handcrafting low level features of data, this process is largely automated via algorithms that learn low level representations. The models thus learnt are quite useful for a variety of downstream tasks such as sampling, prediction, classification, clustering, interventions, etc. A prominent player here is variational autoencoders \cite{kingma2013auto, rezende2014stochastic}. Various improvements to variational autoencoders have been made over the last decade, with a wide variety of applications. A much-desired property of the training process is stability, i.e. whether repeated trainings will lead to the same latent variable generative model. This can be captured by the mathematical notion of identifiability, which is a crucial primitive which guarantees that there is a unique parameter and generation mechanism that could have generated the data. Putting computational feasibility aside, identifiability is a necessary condition for stable and repeatable training. Apart from stability of training, this also paves the way for other important considerations in machine learning, such as the increasing need to learn representations of data that are robust, interpretable, explainable and fair.

In our work \cite{kivvaidentifiability}, we show that for commonly used variational autoencoders with a mixture of Gaussians prior, identifiability holds under the assumption that the warping mechanism is affine (in particular, deep neural networks with ReLU activations satisfy this property) and importantly, without assuming that auxiliary information is available. This significantly improves upon a flurry of recent works (initiated by \cite{khemakhem2020variational}) that have shown identifiability in the presence of auxiliary variables or side information. Also, several prior works have made empirical observations that a mixture of Gaussians prior often leads to stable and repeatable training for variational autoencoders, thereby suggesting identifiability.  Our work theoretically grounds these observations.

\section{Organization of the thesis}

In \cref{chap: efron_stein}, we develop our nonlinear matrix concentration results and show their applications towards various nonlinear random matrices that have arisen in the literature. We then introduce the Sum of Squares hierarchy in \cref{chap: sos}, introduce the technique of pseudocalibration used for showing SoS lower bounds and show its connections to low-degree algorithms. In \cref{chap: main_results}, we formally state the main SoS lower bounds we show in this thesis and put them in context with known prior works. In \cref{chap: sk}, we prove the SoS lower bound for the Sherrington-Kirkpatrick problem. In the next two chapters, \cref{chap: qual} and \cref{chap: quant}, we prove the SoS lower bounds for Planted Slightly Denser Subgraph, Tensor PCA and Sparse PCA. We conclude with follow-up and potential future works in \cref{chap: future_work}.
\subsection{Our contributions}

In this paper, we consider the following general category of problems: Given a random input, can we certify that it does not contain a given structure?

Some important examples of this kind of problem are as follows.
\begin{enumerate}
    \item Planted clique: Can we certify that a random graph does not have a large clique?
    \item Tensor PCA: Given an order $k$ tensor $T$ with random independent Gaussian entries, can we certify that there is no unit vector $x$ such that $\ip{T}{x \otimes\ldots\otimes x}$ is large?
    \item Wishart model of sparse PCA: Given an $m \times d$ matrix $S$ with random independent Gaussian entries (which corresponds to taking $m$ samples from $\mathcal{N}(0, I_d)$), can we certify that there is no $k$-sparse unit vector $x$ such that $\norm{Sx}$ is large?
\end{enumerate}

These kinds of problems, known as certification problems, are closely related to their optimization or estimation variants. A certification algorithm is required to produce a proof/certificate of a bound that holds for \textit{all} inputs, as opposed to most inputs. The Sum-of-Squares hierarchy provides such certificates in a canonical way for a wide variety of such problems, so analyzing SoS paves the way towards understanding the certification complexity of these problems. We investigate the following question.

\begin{quote}
    \emph{For certification problems, what are the best bounds that SoS can certify?}
\end{quote}

In this work, we build general machinery for proving probabilistic Sum of Squares lower bounds on certification problems. To build our machinery, we generalize the techniques pioneered by \cite{BHKKMP16} for proving Sum of Squares lower bounds for planted clique. We start with the standard framework for proving probabilistic Sum of Squares lower bounds:
\begin{enumerate}
    \item Construct candidate pseudo-expectation values $\tilde{\EE}$ and the corresponding moment matrix $\Lambda$ (see \cref{subsec: sos}).
    \item Show that with high probability, $\Lambda \succeq 0$.
\end{enumerate}
For planted clique, \cite{BHKKMP16} constructed $\tilde{\EE}$ and the corresponding moment matrix $\Lambda$ by introducing the pseudo-calibration technique (see \cref{subsec: pseudocalibration}). They then showed through a careful and highly technical analysis that with high probability $\Lambda \succeq 0$. 

In this paper, we investigate how generally the techniques used for planted clique can be applied. We show that by constructing coefficient matrices based on the coefficients obtained by pseudo-calibration, we can give relatively simple conditions on these coefficient matrices which are sufficient to ensure that the moment matrix $\Lambda$ is PSD with high probability.


By abstracting out a lot of the technical analysis that goes into sum of squares lower bound proofs with a general framework, we pave the way for future works on SoS lower bounds to use our machinery as a blackbox and instead focus on the analysis of the problem specific structure. We exhibit the usefulness of our machinery by achieving strong SoS lower bounds for the problems of Tensor PCA, Sparse PCA and a problem closely related to the Planted Clique problem that we call Planted Slightly Denser Subgraph. We do this with relative ease once the machinery is in place. The Sparse PCA lower bounds complement a long line of work on algorithmic guarantees that stretches for over two decades (the most recent one being \cite{sparse_pca_focs20}), giving a complete picture (up to polylogarithmic factors) for the approximability-inapproximability thresholds for Sparse PCA.

\subsubsection{A brief summary of pseudo-calibration}
A natural way to prove lower bounds on a certification problem is as follows.
\begin{enumerate}
    \item Construct a ``maximum entropy'' planted distribution of inputs which has the given structure.
    \item Show that we cannot distinguish between the random and planted distributions and thus cannot certify that a random input does not have the given structure.
\end{enumerate}
Based on this idea, the pseudo-calibration technique introduced  by \cite{BHKKMP16} constructs candidate pseudo-expectation values $\tilde{\EE}$ so that as far as low degree tests are concerned, $\tilde{\EE}$ for the random distribution mimics the behavior of the given structure for the planted distribution (for details, see \cref{subsec: pseudocalibration}). This gives a candidate moment matrix $\Lambda$ which we can then analyze with our machinery. A majority of known high-degree average-case SoS lower bounds in the literature have pseudo-expectation values that were either obtained by, or could be obtained by pseudocalibration, e.g., Planted Clique \cite{BHKKMP16}, Max-$k$-CSPs \cite{KothariMOW17}, Max-Cut on regular graphs \cite{MRX20}, Sherrington-Kirkpatrick problem \cite{sklowerbounds, MRX20}. It has also been successful for Densest-$k$-subgraph but for the weaker Sherali-Adams Hierarchy \cite{chlamtavc2018sherali}.

Naturally, pseudocalibration is the starting point for our SoS lower bounds. That said, our machinery is quite general and can be applied even if the candidate moment matrix $\Lambda$ is not obtained via pseudo-calibration.

\subsubsection{Our results on Tensor PCA, Sparse PCA, and Planted Slightly Denser Subgraph}
In this section, we formally state the main hardness theorems we show by applying our machinery. We defer discussing prior work, how we improve on them, and other related work to \cref{sec: prior_work}.

We describe the planted distributions we use to show our SoS lower bounds for planted slightly denser subgraph, tensor PCA, and the Wishart model of sparse PCA. We also state the random distributions for completeness and for contrast. We then state our results.

\paragraph{Planted slightly denser subgraph}
We use the following distributions.
\begin{restatable}{itemize}{PLDSdistributions}
    \item Random distribution: Sample $G$ from $G(n, \frac{1}{2})$
    \item Planted distribution: Let $k$ be an integer and let $p > \frac{1}{2}$. Sample a graph $G'$ from $G(n, \frac{1}{2})$. Choose a random subset $S$ of the vertices, where each vertex is picked independently with probability $\frac{k}{n}$. For all pairs $i, j$ of vertices in $S$, rerandomize the edge $(i, j)$ where the probability of $(i, j)$ being in the graph is now $p$. Set $G$ to be the resulting graph.
\end{restatable}
In \cref{sec: plds_qual}, we compute the candidate moment matrix $\Lambda$ obtained by using pseudo-calibration on this planted distribution.

\begin{restatable}{theorem}{PLDSmain}\label{thm: plds_main}
Let $C_p > 0$. There exists a constant $C > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $k \le n^{\frac{1}{2} - \varepsilon}$ and $p =  \frac{1}{2} + \frac{n^{-C_p\varepsilon}}{2}$, then with high probability, the candidate moment matrix $\Lambda$ given by pseudo-calibration for degree $n^{C\varepsilon}$ Sum-of-Squares is PSD.
\end{restatable}

\begin{corollary}\label{cor: plds_main}
Let $C_p > 0$. There exists a constant $C > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $k \le n^{\frac{1}{2} - \varepsilon}$ and $p =  \frac{1}{2} + \frac{n^{-C_p\varepsilon}}{2}$, then with high probability, degree $n^{C\varepsilon}$ Sum-of-Squares cannot certify that a random graph $G$ from $G(n, \frac{1}{2})$ does not have a subgraph of size $\approx k$ with edge density $\approx p$.
\end{corollary}

\paragraph{Tensor PCA}
Let $k \ge 2$ be an integer. We use the following distributions.
\begin{restatable}{itemize}{TPCAdistributions}
    \item Random distribution: Sample $A$ from $\mathcal{N}(0, I_{[n]^k})$.
	\item Planted distribution: Let $\lambda,\Delta > 0$. Sample $u$ from $\{-\frac{1}{\sqrt{\Delta n}}, 0, \frac{1}{\sqrt{\Delta n}}\}^n$ where the values are taken with probabilities $\frac{\Delta}{2}, 1 - \Delta, \frac{\Delta}{2}$ respectively. Then sample $B$ from $\mathcal{N}(0, I_{[n]^k})$. Set $A = B + \lambda \tens{u}{k}$.
\end{restatable}

In \cref{sec: tpca_qual}, we compute the candidate moment matrix $\Lambda$ obtained by using pseudo-calibration on this planted distribution.

\begin{restatable}{theorem}{TPCAmain}\label{thm: tpca_main}
    Let $k \ge 2$ be an integer. There exist constants $C,C_{\Delta} > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $\lambda \le n^{\frac{k}{4} - \varepsilon}$ and $\Delta = n^{-C_{\Delta}\varepsilon}$ then with high probability, the candidate moment matrix $\Lambda$ given by pseudo-calibration for degree $n^{C\varepsilon}$ Sum-of-Squares is PSD.
\end{restatable}
\begin{corollary}\label{cor: tpca_main}
    Let $k \ge 2$ be an integer. There exists a constant $C > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $\lambda \le n^{\frac{k}{4} - \varepsilon}$, then with high probability, degree $n^{C\varepsilon}$ Sum-of-Squares cannot certify that for a random tensor $A$ from $\mathcal{N}(0, I_{[n]^k})$, there is no vector $u$ such that $\norm{u} \approx 1$ and $\ip{A}{\underbrace{u \otimes\ldots\otimes u}_{k \text{ times}}} \approx \lambda$.
\end{corollary}

\paragraph{Wishart model of Sparse PCA}
We use the following distributions.
\begin{restatable}{itemize}{SPCAdistributions}
    \item Random distribution: $v_1, \ldots, v_m$ are sampled from $\mathcal{N}(0, I_d)$ and we take $S$ to be the $m \times d$ matrix with rows $v_1, \ldots, v_m$.
	\item Planted distribution: Sample $u$ from $\{-\frac{1}{\sqrt{k}}, 0, \frac{1}{\sqrt{k}}\}^d$ where the values are taken with probabilities $\frac{k}{2d}, 1 - \frac{k}{d}, \frac{k}{2d}$ respectively. Then sample $v_1, \ldots, v_m$ as follows. For each $i \in [m]$, with probability $\Delta$, sample $v_i$ from $\mathcal{N}(0, I_d + \lambda uu^T)$ and with probability $1 - \Delta$, sample $v_i$ from $\mathcal{N}(0, I_d)$. Finally, take $S$ to be the $m \times d$ matrix with rows $v_1, \ldots, v_m$.
\end{restatable}

In \cref{sec: spca_qual}, we compute the candidate moment matrix $\Lambda$ obtained by using pseudo-calibration on this planted distribution.

\begin{restatable}{theorem}{SPCAmain}\label{thm: spca_main}
    There exists a constant $C > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}, m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$, and there exists a constant $A$ such that $0 < A < \frac{1}{4}$, $d^{4A} \le k \le d^{1 - A\varepsilon}$, and $\frac{\sqrt{\lambda}}{\sqrt{k}} \le d^{-A\varepsilon}$, then with high probability, the candidate moment matrix $\Lambda$ given by pseudo-calibration for degree $d^{C\varepsilon}$ Sum-of-Squares is PSD.
\end{restatable}

\begin{corollary}\label{cor: spca_main}
    There exists a constant $C > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}, m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$, and there exists a constant $A$ such that $0 < A < \frac{1}{4}$, $d^{4A} \le k \le d^{1 - A\varepsilon}$, and $\frac{\sqrt{\lambda}}{\sqrt{k}} \le d^{-A\varepsilon}$, then with high probability, degree $d^{C\varepsilon}$ Sum-of-Squares cannot certify that for a random $m \times d$ matrix $S$ with Gaussian entries, there is no vector $u$ such that $u$ has $\approx k$ nonzero entries, $\norm{u} \approx 1$, and $\norm{Su}^2 \approx m + m{\Delta}\lambda$.
\end{corollary}

\begin{remark}
Note that our planted distributions only approximately satisfy constraints such as having a subgraph of size $k$, having a unit vector $u$, and having $u$ be $k$-sparse. While we would like to use planted distributions which satisfy such constraints exactly, these distributions don't quite satisfy the conditions of our machinery. This same issue appeared in the SoS lower bounds for planted clique \cite{BHKKMP16}. Resolving this issue is a subtle but important open problem.
\end{remark}

\subsection{Relation to prior work on Planted Clique/Dense Subgraph, Tensor PCA, and Sparse PCA}\label{sec: prior_work}

\subsubsection{Planted Dense Subgraph}\label{sec: plds}

In the planted dense subgraph problem, we are given a random graph $G$ where a dense subgraph of size $k$ has been planted and we are asked to find this planted dense subgraph.
This is a natural generalization of the $k$-clique problem \cite{karp1972reducibility} and has been subject to a long line of work over the years (e.g. \cite{feige1997densest, feige2001dense, khot2006ruling, bhaskara2010detecting, bhaskara2012polynomial, braverman2017eth, manurangsi2017almost}).
In this work, we consider the following certification variant of planted dense subgraph.

\begin{quote}
\emph{Given a random graph $G$ sampled from the Erd\H{o}s\xspace-R\'enyi\xspace model $G(n, \frac{1}{2})$, certify an upper bound on the edge density of the densest subgraph on $k$ vertices.}
\end{quote}

For many different parameter regimes of the random and planted distributions (an example being planting $G_{k, q}$ in $G_{n, p}$ for constants $p < q$), and when $k = o(\sqrt{n})$, the hardness of the easier distinguishing version of the planted dense subgraph problem has been posed as a formal conjecture (often referred to as the PDS conjecture) before in the literature (see e.g., \cite{hajek2015computational, chen2014statistical, brennan2018reducibility, brennan2019universality}). This has also led to many reductions to other problems \cite{brennan2019optimal}, although it's not clear if these reductions can be made in the SoS framework without loss in the parameter dependence.

In our case, we consider the planted slightly denser subgraph version where for $k \le n^{\frac{1}{2} - \varepsilon}$, we plant a subgraph of density $\frac{1}{2} + \frac{1}{n^{O(\varepsilon)}}$, i.e. $p = \frac{1}{2}, q = \frac{1}{2} + \frac{1}{n^{O(\varepsilon)}}$. This has been widely believed to require sub-exponential time. Our work provides strong evidence towards this by exhibiting unconditional lower bounds against the powerful SoS hierarchy, even if we consider $n^{O(\varepsilon)}$ levels, which corresponds to $n^{n^{O(\varepsilon)}}$ running time! We expect this to lead to this problem being used as a natural starting point for reductions to show sub-exponential time hardness for various problems.

Within the SoS literature, \cite{BHKKMP16} show that for $k \le n^{\frac{1}{2} - \varepsilon}$ for a constant $\varepsilon > 0$, the degree $o(\log n)$ Sum-of-Squares cannot distinguish between a fully random graph sampled from $G(n, \frac{1}{2})$ and a random graph which has a planted $k$-clique. This implies that degree $o(\log n)$ SoS cannot certify an edge density better than $1$ for the densest $k$-subgraph if $k \le n^{\frac{1}{2} - \varepsilon}$.

In \cref{cor: plds_main}, we show that for $k \le n^{\frac{1}{2} - \varepsilon}$ for a constant $\varepsilon > 0$, degree $n^{\Omega(\varepsilon)}$ SoS cannot certify an edge density better than $\frac{1}{2} + \frac{1}{n^{O(\varepsilon)}}$. The degree of SoS in our setting, $n^{\Omega(\varepsilon)}$, is vastly higher than the earlier known result which uses degree $o(\log n)$. To the best of our knowledge, this is the first result that proves such a high degree lower bound.

We remark that when we take $k = n^{\frac{1}{2} - \varepsilon}$,  the true edge density of the densest $k$-subgraph is $\frac{1}{2} + \frac{\sqrt{\log(n/k)}}{\sqrt{k}} + \operatorname{o}(\frac{1}{\sqrt{k}}) \approx \frac{1}{2} + \frac{1}{n^{1/4 - \varepsilon/2}}$ as was shown in \cite[Corollary 2]{gamarnik2019landscape} whereas, by \cref{cor: plds_main}, the SoS optimum is as large as $\frac{1}{2} + \frac{1}{n^{\varepsilon}}$. This highlights a significant difference in the optimum value.

\subsubsection{Tensor PCA}

The Tensor Principal Component Analysis problem, originally introduced by \cite{richard2014statistical}, is a generalization of the PCA problem from machine learning to higher order tensors. Given an order $k$ tensor of the form $\lambda u^{\otimes k} + B$ where $u \in {\mathbb R}^n$ is a unit vector and $B \in {\mathbb R}^{[n]^k}$ has independent Gaussian entries, we would like to recover $u$. Here, $\lambda$ is known as the signal-to-noise ratio.

This can be equivalently considered to be the problem of optimizing a homogeneous degree $k$ polynomial $f(x)$ with random Gaussian coefficients over the unit sphere $\norm{x} = 1$. In general, polynomial optimization over the unit sphere is a fundamental primitive with a lot of connections to other areas of optimization (e.g. \cite{frieze2008new, brubaker2009random,brandao2017quantum, barak2014rounding, bks15, bhattiprolu2017weak}). Tensor PCA is an average case version of the above problem and has been studied before in the literature \cite{richard2014statistical, HopSS15, tensorpca16, hop17}. In this work, we consider the certification version of this average case problem.

\begin{quote}
\emph{For an integer $k \ge 2$, given a random tensor $A \in {\mathbb R}^{[n]^k}$ with entries sampled independently from $\mathcal{N}(0, 1)$, certify an upper bound on $\ip{A}{x^{\otimes k}}$ over unit vectors $x$.}
\end{quote}

In \cite{tensorpca16}, it was shown that $q \le n$ levels of SoS certifies an upper bound of $\frac{2^{O(k)} (n \cdot \text{polylog}(n))^{k/4}}{q^{k/4 - 1/2}}$ for the Tensor PCA problem. When $q = n^{\varepsilon}$ for sufficiently small $\varepsilon$, this gives an upper bound of $n^{\frac{k}{4} - O(\varepsilon)}$. \cref{cor: tpca_main} shows that this is tight.

In \cite{hop17}, they state a theorem similar to \cref{cor: tpca_main} and observe that it can be proved by applying the techniques used to prove the SoS lower bounds for planted clique. However, they do not give an explicit proof. Also, while they consider the setting where the random distribution has entries from $\{-1, 1\}$, we work with the more natural setting where the distribution is $\mathcal{N}(0, 1)$. We remark that our machinery can also easily recover their result with the entries being restricted to $\{-1, 1\}$.

When $k = 2$, the maximum value of $\ip{\tens{x}{k}}{A}$ over the unit sphere $\norm{x}^2 = 1$ is precisely the largest eigenvalue of $(A + A^T)/2$ which is $\Theta(\sqrt{n})$ with high probability. For any integer $k \ge 2$, the true maximum of $\ip{\tens{x}{k}}{A}$ over $\norm{x}^2 = 1$ is $O(\sqrt{n})$ with high probability \cite{tomioka2014}. In contrast, by \cref{cor: tpca_main}, the optimum value of the degree $n^{\varepsilon}$ SoS is as large as $n^{\frac{k}{4} - O(\varepsilon)}$. This exhibits an integrality gap of $n^{\frac{k}{4} - \frac{1}{2} - O(\varepsilon)}$.

\subsubsection{Wishart model of Sparse PCA}

The Wishart model of Sparse PCA, also known as the Spiked Covariance model, was originally proposed by \cite{johnstone_lu2009}. In this problem, we observe $m$ vectors $v_1, \ldots, v_m \in {\mathbb R}^d$ from the distribution $\mathcal{N}(0, I_d + \lambda uu^T)$ where $u$ is a $k$-sparse unit vector, and we would like to recover $u$. Here, the sparsity of a vector is the number of nonzero entries and $\lambda$ is known as the signal-to-noise ratio.

Sparse PCA is a fundamental problem that has applications in a diverse range of fields (e.g. \cite{wang2012online, naikal2011informative, majumdar2009image, tan2014classification, chun2009expression, allen2011sparse}). It's known that vanilla PCA does not yield good estimators in high dimensional settings \cite{baik2005phase, paul2007asymptotics, johnstone_lu2009}. A large volume of work has gone into studying Sparse PCA and its variants, both from an algorithmic perspective (e.g. \cite{amini_wainwright2008, ma2013sparse, krauthgamer2015, deshpande2016, wang2016statistical}) as well as from an inapproximability perspective (e.g. \cite{berthet2013complexity, ma_wigderson_15, diakonikolas2017statistical, hop17, brennan2019optimal}).

Given the decades of research on this problem and how fundamental it is for a multitude of applications and disciplines, understanding the computational threshold behavior of the Wishart model of Sparse PCA is an extremely important research topic in statistics. In particular, prior works have explored statistical query lower bounds, SDP lower bounds, lower bounds by reductions from widely believed conjectures, etc. On the other hand, there have only been two prior works on lower bounds against SoS, specifically only for degree $2$ and degree $4$ SoS, which can be attributed to the difficulty in proving such lower bounds. In this paper, we vastly strengthen these lower bounds and show almost-tight lower bounds for the SoS hierarchy of degree $d^{\varepsilon}$ which corresponds to a running time of $d^{d^{O(\varepsilon)}}$.

Between this work and prior works, we completely understand the parameter regimes where sparse PCA is easy or conjectured to be hard up to polylogarithmic factors. In \cref{fig: spca_thresholds}, we classify the different parameter regimes into the following categories.
\begin{itemize}
    \item DT: In this regime, Diagonal thresholding \cite{johnstone_lu2009, amini_wainwright2008} recovers the sparse vector. Covariance thresholding \cite{krauthgamer2015, deshpande2016} and SoS \cite{sparse_pca_focs20} can also be used in this regime. Covariance thresholding has better dependence on logarithmic factors and SoS works in the presence of adversarial errors.
    \item PCA: Vanilla PCA can recover the vector, i.e. we do not need to use the fact that the vector is sparse (see e.g. \cite{berthet2013, sparse_pca_focs20}).
    \item Spectral: An efficient spectral algorithm recovers the sparse vector (see e.g. \cite{sparse_pca_focs20}).
    \item Spectral*: A simple spectral algorithm distinguishes the planted distribution from the random distribution but it is information theoretically impossible to recover the sparse vector \cite[Appendix E]{sparse_pca_focs20}.
    \item Hard: A regime where it is conjectured to be hard to distinguish between the random and the planted distributions. We discuss this in more detail below.
\end{itemize}


\begin{figure}[!ht]
    \centering
    \includegraphics[scale=.6]{machinery/images/spca_graph_2}
    \caption{The threshold behavior of the Wishart model of Sparse PCA where the $x$-axis is $d$, the ambient dimension and the $y$-axis is $k$, the sparsity of the hidden vector. [*] indicates the regime studied in this paper.}
    \label{fig: spca_thresholds}
\end{figure}


In the \textit{Hard} parameter regime where $m \ll \frac{k^2}{\lambda^2}$ and $m \ll \frac{d}{\lambda^2}$, degree $2$ and degree $4$ SoS lower bounds have been shown in prior works, while we handle degree $d^{O(\varepsilon)}$. In particular, the works \cite{krauthgamer2015, berthet2013} obtain degree $2$ SoS lower bounds. \cite{ma_wigderson_15} obtain degree $4$ SoS lower bounds using an ad-hoc construction. It's not clear if their construction can be generalized for higher degrees. Moreover, the bounds they obtain are tight up to polylogarithmic factors when $\lambda$ is a constant but are not tight when $\lambda$ is not a constant, so we improve their bounds even in the degree $4$ case. We subsume all these earlier known results in this work with \cref{cor: spca_main}. This is a vast improvement over prior known sum of squares lower bounds and provides compelling evidence for the hardness of Sparse PCA in this parameter range.

The work \cite{hop17} considers the related but qualitatively different Wigner model of Sparse PCA and they state degree $d^{\varepsilon}$ SoS lower bounds, without explicitly proving these bounds. The techniques in that work do not recover our results because the matrix formed by the random samples in the Wishart model is asymmetric, and handling it correctly is far from being a mere technicality. On the other hand, our machinery can recover the results on the Wigner model as well, though we only analyze the Wishart model in this paper.

In \cite{sparse_pca_focs20}, they prove that if $m \le \frac{d}{\lambda^2}$ and $m \le \left(\frac{k^2}{\lambda^2}\right)^{1 - \Omega(\varepsilon)}$, then degree $d^{\varepsilon}$ polynomials cannot distinguish the random and planted distributions.
\cref{cor: spca_main} says that under mildly stronger assumptions, degree $d^{\varepsilon}$ Sum-of-Squares cannot distinguish the random and planted distributions, so we confirm that SoS is no more powerful than low degree polynomials in this setting.

There have also been direct reductions from planted clique to Sparse PCA \cite{brennan2019optimal}, and it's natural to ask if these reductions can obtain SoS lower bounds on Sparse PCA from the known SoS lower bounds on planted clique \cite{BHKKMP16}. To the best of our knowledge, no such reduction is known and constructing such a reduction would be challenging as it would have to be captured by SoS and avoid losing too much in the parameters. Still, it may well be possible to construct such a reduction.







\subsection{An overview of the machinery: All three results from a single main theorem}

Now that we have described our results, 
we want to highlight that all three results are obtained via applications of one main theorem, which we call the machinery. \cref{thm: plds_main}, \cref{thm: tpca_main} and \cref{thm: spca_main} all essentially boil down to showing that a large moment matrix $\Lambda$ is PSD. To show this, the machinery constructs certain \emph{coefficient matrices} from $\Lambda$ and gives conditions on these coefficient matrices which are sufficient to guarantee that $\Lambda$ is PSD with high probability. In this section, we give an informal sketch of the machinery and how it generalizes the techniques used to prove the SoS lower bound for planted clique \cite{BHKKMP16}. We also motivate some of the conditions that arise in the machinery.
\subsubsection{Shapes and graph matrices}
Before we can describe how our machinery works, we need to describe shapes and graph matrices, which were originally introduced by \cite{BHKKMP16, medarametla2016bounds} and later generalized in \cite{AMP20}. Both the planted clique analysis and our analysis use shapes and graph matrices.


Shapes $\alpha$ are graphs that contain extra information about the vertices. Corresponding to each shape $\alpha$, there is a matrix-valued function (i.e. a matrix whose entries depend on the input) $M_{\alpha}$ that we call a graph matrix. Graph matrices are analogous to a Fourier basis, but for matrix-valued functions that exhibit a certain kind of symmetry. In our setting, $\Lambda$ will be such a matrix-valued function, so we can decompose $\Lambda$ as a linear combination of graph matrices.

Shapes and graph matrices have several properties which make them very useful to work with. First, $\norm{M_{\alpha}}$ can be bounded with high probability in terms of simple combinatorial properties of the shape $\alpha$. Second, if two shapes $\alpha$ and $\beta$ match up in a certain way, we can combine them to form a larger shape $\alpha \circ \beta$. We call this operation shape composition. Third, each shape $\alpha$ has a canonical decomposition into three shapes, the left, middle and right parts of $\alpha$, which we call $\sigma$, $\tau$, and ${\sigma'}^T$. For this canonical decomposition, we have that $\alpha = \sigma \circ \tau \circ {\sigma'}^T$ and $M_{\alpha} \approx M_{\sigma}M_{\tau}M_{{\sigma'}^T}$ \footnote{Actually, due to a technical issue related to automorphism groups, this equation is off by a multiplicative constant. For details, see Lemma \ref{lm:morthsimplereexpression}.}. This decomposition turns out to be crucial for both the planted clique analysis and our analysis.
\subsubsection{Summary of the SoS lower bound for planted clique and the machinery}
We now give a brief summary of the techniques for the SoS lower bound for planted clique and for our machinery. We elaborate on these steps in \cref{ideadescriptionsubsection}, we formally describe these steps in \cref{sec: informal_statement}, and we carry out these steps in full generality in \cref{sec: technical_def_and_main_theorem} and \cref{sec: proof_of_main}.






For planted clique, the SoS lower bound analysis works as follows.
\begin{enumerate}
	\item Using the technique of pseudo-calibration, construct a candidate moment matrix $\Lambda$.
	\item Decompose the moment matrix $\Lambda$ as a linear combination $\Lambda = \sum_{\text{shapes } \alpha}{\lambda_{\alpha}M_{\alpha}}$ of graph matrices $M_{\alpha}$.
	\item For each shape $\alpha$, decompose $\alpha$ into a left part $\sigma$, a middle part $\tau$, and a right part ${\sigma'}^T$.
	We then have that $M_{\alpha} \approx M_{\sigma}M_{\tau}M_{{\sigma'}^T}$. 




	\item Using the approximate decompositions $M_{\alpha} \approx M_{\sigma}M_{\tau}M_{{\sigma'}^T}$, give an approximate decomposition $\Lambda \approx LQL^T$ of $\Lambda$ where $Q \succeq 0$ with high probability.
	\item Show that with high probability, $\Lambda = LQL^T - (LQL^T - \Lambda) \succeq 0$ by carefully analyzing the difference $LQL^T - \Lambda$ using similar techniques.
\end{enumerate}

For our machinery, we use a similar framework. The key innovation of our machinery is that we introduce coefficient matrices (step 4) and carry out the analysis in terms of these coefficient matrices.
\begin{enumerate}
	\item Construct a candidate moment matrix $\Lambda$. This can be done either using pseudo-calibration or in a more ad-hoc manner.
	\item Decompose the moment matrix $\Lambda$ as a linear combination $\Lambda = \sum_{\text{shapes } \alpha}{\lambda_{\alpha}M_{\alpha}}$ of graph matrices $M_{\alpha}$.
	\item For each shape $\alpha$, decompose $\alpha$ into a left part $\sigma$, a middle part $\tau$, and a right part ${\sigma'}^T$.
	\item Based on the coefficients $\lambda_{\alpha}$ and the decompositions of the shapes $\alpha$ into left, middle, and right parts, construct coefficient matrices $H_{Id_U}$ and $H_{\tau}$.

	\item Based on the coefficient matrices $H_{Id_U}$ and $H_{\tau}$, obtain an approximate PSD decomposition of $\Lambda$.
	\item Show that the error terms (which we call intersection terms) can be bounded by the approximate PSD decomposition of $\Lambda$.
\end{enumerate}
We show that this analysis will succeed as long as three conditions on the coefficient matrices are satisfied (see \cref{sec: informal_statement} for a qualitative statement of these conditions and \cref{sec: technical_def_and_main_theorem} for the precise statement of these conditions). Thus, in order to use our machinery to prove sum of squares lower bounds, it is sufficient to do the following.
\begin{enumerate}
    \item Construct a candidate moment matrix $\Lambda$.
    \item Decompose the moment matrix $\Lambda$ as a linear combination $\Lambda = \sum_{\text{shapes } \alpha}{\lambda_{\alpha}M_{\alpha}}$ of graph matrices $M_{\alpha}$ (akin to Fourier decomposition) and find the corresponding coefficient matrices.
    \item Verify the required conditions on the coefficient matrices.
\end{enumerate}




\subsubsection{A sketch of the intuition behind the machinery conditions}\label{ideadescriptionsubsection}



\begin{comment}
\paragraph{Graph matrices}

Graph matrices were originally introduced by \cite{BHKKMP16, medarametla2016bounds} and later generalized in \cite{AMP20}. We use the generalized graph matrices in our analysis.

Each graph matrix is a matrix valued function of the input, that can be identified by a graph with labeled edges that we call a shape. Informally, graph matrices will form a basis for all matrix valued functions of the input that have a certain symmetry. In particular, $\Lambda$ is one such matrix valued function and can thus be decomposed into graph matrices. For a shape $\alpha$, the graph matrix associated to $\alpha$ is denoted by $M_{\alpha}$.

Graph matrices have several useful properties. Firstly, $\norm{M_{\alpha}}$ can be bounded with high probability in terms of simple combinatorial properties of the shape $\alpha$. Secondly, when we multiply two graph matrices $M_{\alpha}, M_{\beta}$ corresponding to shapes $\alpha, \beta$, it approximately equals the graph matrix $M_{\alpha \circ \beta}$ where the shape $\alpha \circ \beta$, called the composition of the two shapes $\alpha$ and $\beta$, is easy to describe combinatorially.

These makes graph matrices a convenient tool to analyze the moment matrix. In our setting, the moment matrix decomposes as $\Lambda = \sum \lambda_{\alpha} M_{\alpha}$ where the sum is over all shapes $\alpha$ and $\lambda_{\alpha} \in {\mathbb R}$ are the coefficients that arise from pseudo-calibration.

\paragraph{Decomposing Shapes}

For graph matrices $\alpha, \beta$, $M_{\alpha}M_{\beta} \approx M_{\alpha \circ \beta}$ where we define the composition of two shapes $\alpha \circ \beta$ to be a larger shape that is obtained by concatenating the shapes $\alpha, \beta$. This equality is only approximate and handling it precisely is a significant source of difficulty in our analysis. Shape composition is also associative, hence we can define composition of three or more shapes.

A crucial idea for our machinery is that for any shape $\alpha$, there exists a canonical and unique decomposition of $\alpha$ as $\sigma \circ \tau \circ \sigma'^T$ satisfying some nice structural properties, for shapes $\sigma, \tau$ and $\sigma'^T$. Here, $\sigma, \tau, \sigma'^T$ are called the left part, the middle part and the right part of $\alpha$ respectively.
Using this, our moment matrix can be written as
\[\Lambda = \sum_{\alpha}\lambda_{\alpha}M_{\alpha} = \sum_{\sigma, \tau, \sigma'} \lambda_{\sigma\circ\tau\circ\sigma'^T}M_{\sigma\circ\tau\circ\sigma'^T}\]
\end{comment}

\paragraph{Giving an approximate PSD factorization}
As discussed above, we decompose the moment matrix $\Lambda$ as a linear combination $\Lambda = \sum_{\text{shapes } \alpha}{\lambda_{\alpha}M_{\alpha}}$ of graph matrices $M_{\alpha}$. We then decompose each $\alpha$ into left, middle, and right parts $\sigma$, $\tau$, and ${\sigma'}^T$. We now have that 
\[
\Lambda = \sum_{\alpha = \sigma \circ \tau \circ {\sigma'}^T}{\lambda_{\sigma \circ \tau \circ {\sigma'}^T}M_{\sigma \circ \tau \circ {\sigma'}^T}}
\]

We first consider the terms $\sum_{\sigma, \sigma'} \lambda_{\sigma \circ \sigma'^T}M_{\sigma \circ \sigma'^T} \approx \sum_{\sigma, \sigma'} \lambda_{\sigma \circ \sigma'^T}M_{\sigma} M_{\sigma'^T}$ where $\tau$ corresponds to an identity matrix and can be ignored. 

If there existed real numbers $v_{\sigma}$ for all left shapes $\sigma$ such that $\lambda_{\sigma \circ \sigma'^T} = v_{\sigma}v_{\sigma'}$, then we would have
\[
\sum_{\sigma, \sigma'} \lambda_{\sigma \circ \sigma'^T}M_{\sigma} M_{\sigma'^T} = \sum_{\sigma, \sigma'} v_{\sigma}v_{\sigma'}M_{\sigma} M_{\sigma'^T} = (\sum_{\sigma} v_{\sigma}M_{\sigma})(\sum_{\sigma} v_{\sigma}M_{\sigma})^T \succeq 0
\]
which shows that the contribution from these terms is positive semidefinite. In fact, this turns out to be the case for the planted clique analysis. However, this may not hold in general. To handle this, we note that the existence of $v_{\sigma}$ can be relaxed as follows: Let $H$ be the matrix with rows and columns indexed by left shapes $\sigma$ such that $H(\sigma, \sigma') = \lambda_{\sigma \circ \sigma'^T}$. Up to scaling, $H$ will be one of our coefficient matrices. If $H$ is positive semidefinite then the contribution from these terms will also be positive semidefinite. In fact, this will be the first condition of our main theorem, the qualitative version of which can be found in \cref{informalmaintheoremstatement}.

\paragraph{Handling terms with a non-trivial middle part}

Unfortunately, we also have terms $\lambda_{\sigma \circ \tau \circ \sigma'^T}M_{\sigma \circ \tau \circ \sigma'^T}$ where $\tau$ is non-trivial. Our strategy will be to charge these terms to other terms.

For the sake of simplicity, we will describe how to handle one term. A starting point is the following inequality. For a left shape $\sigma$, a middle shape $\tau$, a right shape $\sigma'^T$, and real numbers $a, b$,
\[(a M_{\sigma} - bM_{\sigma'}M_{\tau^T})(a M_{\sigma} - bM_{\sigma'}M_{\tau^T})^T \succeq 0\]
which rearranges to
\begin{align*}
ab(M_{\sigma}M_{\tau}M_{\sigma'^T} + (M_{\sigma}M_{\tau}M_{\sigma'^T})^T) &\preceq a^2M_{\sigma}M_{\sigma^T} + b^2M_{\sigma'}M_{\tau^T}M_{\tau}M_{\sigma'^T}\\
&\preceq a^2M_{\sigma}M_{\sigma^T} + b^2\norm{M_{\tau}}^2M_{\sigma'}M_{\sigma'^T}
\end{align*}

If $\lambda_{\sigma \circ \tau \circ \sigma'^T}^2\norm{M_{\tau}}^2 \le \lambda_{\sigma \circ \sigma^T}\lambda_{\sigma' \circ \sigma'^T}$, then we can choose $a, b$ such that $a^2 \le \lambda_{\sigma \circ \sigma^T}, b^2 \norm{M_{\tau}}^2 \le \lambda_{\sigma' \circ \sigma'^T}$ and $ab = \lambda_{\sigma \circ \tau \circ \sigma'^T}$. This will approximately imply
\[\lambda_{\sigma \circ \tau \circ \sigma'^T}(M_{\sigma \circ \tau \circ \sigma'^T} + M_{\sigma \circ \tau \circ \sigma'^T}^T) \preceq \lambda_{\sigma \circ \sigma^T}M_{\sigma \circ \sigma^T} + \lambda_{\sigma' \circ \sigma'^T}M_{\sigma' \circ \sigma'^T}\]
which will give us a way to charge terms with a nontrivial middle part against terms with a trivial middle part.

While we could try to apply this inequality term by term, it is not strong enough to give us our results. Instead, we generalize this inequality to work with the entire set of shapes $\sigma, \sigma'$ for a fixed $\tau$. This will lead us to the second condition of our main theorem, the qualitative version of which can be found in \cref{informalmaintheoremstatement}.

\paragraph{Handling intersection terms}

There's one important technicality in the above heuristic calculations. Whenever we decompose $\alpha$ into left, middle, and right parts $\sigma$, $\tau$, and ${\sigma'}^T$, $M_{\sigma}M_{\tau}M_{{\sigma'}^T}$ is only approximately equal to $M_{\alpha} = M_{\sigma \circ \tau \circ {\sigma'}^T}$. All the other error terms have to be carefully handled in our analysis. We call these terms intersection terms.

These intersection terms themselves turn out to be graph matrices and our strategy is to now recursively decompose them into $\sigma_2 \circ \tau_2 \circ \sigma_2'^T$ and apply the previous ideas. To do this methodically, we employ several ideas such as the notion of intersection patterns and the generalized intersection tradeoff lemma (see \cref{sec: proof_of_main}). Properly handling the intersection terms is one of the most technically intensive parts of our work.
This analysis leads us to the third condition of the main theorem, the qualitative version of which can be found in \cref{informalmaintheoremstatement}.

\paragraph{Applying the machinery}

To apply the machinery to our problems of interest, we verify the spectral conditions that our coefficients should satisfy and then we can use our main theorem. The Planted slightly denser subgraph application is straightforward and will serve as a good warmup to understand our machinery. In the applications to Tensor PCA and Sparse PCA, the shapes corresponding to the graph matrices with nonzero coefficients have nice structural properties that will be crucial for our analysis. We exploit this structure and use novel charging arguments to verify the conditions of our machinery.

\subsubsection{A summary of our contributions}
\begin{comment}
\subsubsection{Power and limitations of the machinery}
We note that our machinery has the same level of generality as the generalized graph matrices in \cite{AMP20}, which is quite general. In particular, our machinery can handle inputs This generality allows our machinery to handle problems such as the Wishart model of Sparse PCA which were outside the range of previous lower bound analyses for sum of squares. 

That said, in our experience, in order for the machinery to work well, we need the coefficients $\lambda_{\alpha}$ to decay exponentially in the number of vertices and edges of $\alpha$ and we need this decay to be precise. This is an important reason why our machinery has trouble handling hard global constraints

It is an interesting question whether or not our machinery can be applied to problems on sparse inputs. On the one hand, qualitatively, there is 
\end{comment}
\paragraph{The machinery} In this work, we have attempted to drastically simplify the notoriously hard task of proving PSDness for SoS lower bounds to verifying relatively simple conditions that are assembled from the candidate moment matrix. Coefficient matrices are simple matrices whose entries are filled in a specific but well defined manner with the Fourier coefficients of $\Lambda$, i.e. coefficients $\lambda_{\alpha}$ for which we can write $\Lambda = \sum_{\alpha} \lambda_{\alpha} M_{\alpha}$. Here, $M_{\alpha}$ are called graph matrices, that will play the part of a Fourier basis. Indeed, they can be thought of as an analogue for the Fourier basis for large matrices that satisfy a certain type of symmetry.

The conditions, which are relatively easy to verify for specific applications, moreover give the right tradeoffs for SoS lower bounds that can be shown via coefficient decay techniques. That is, the bounds  $\lambda \ll n^{k/4}$ for Tensor PCA or the bound $m \ll \min(d / \lambda^2, k^2/\lambda^2)$ for the Wishart model of Sparse PCA, almost naturally appear when applying the machinery on these problems. Moreover, as we saw in \cref{sec: prior_work}, these thresholds in fact match the best known algorithmic guarantees, thereby giving conclusive evidence of the correct computational threshold behavior of these problems.

\paragraph{Applicability to certification problems}
The machinery works well for problems where the Fourier coefficients of the candidate moment matrix exhibit a sort of decay behavior. This is akin to saying larger Fourier coefficients should be smaller in the Fourier decomposition. Such coefficient decay is present in many standard certification problems once we find a good planted distribution. Sometimes, noise can be easily introduced to the problem instance to force such kinds of decay, at a cost of slightly lowering the integrality gap we set out to prove, say from $n^{k/4}$ to $n^{k/4 - \varepsilon}$ for a small $\varepsilon > 0$.

The machinery in this work applies for both dense and sparse problems alike. Dense problems include certification problems on $G_{n, 1/2}$ or statistics problems with inputs being standard Gaussian inputs. Sparse problems include problems on $G_{n, p}$ for $p = n^{-\varepsilon}$ or statistics problems with inputs having bounded sub-Gaussian norm. While we have optimized our functions for dense problems, the general main theorem is readily applicable, with almost no modifications, for sparse problems as well. For sparse problems, to get the tightest possible tradeoffs, the functions might have to be optimized even further to reflect the behavior of random matrices on these sparse inputs but we haven't attempted this in this work.

\paragraph{Other contributions}
While the machinery builds on ideas from the work on Planted Clique, the kind of matrices they use, known as graph matrices, are not powerful enough to handle problems on tensors or Gaussian inputs. This is addressed in the work of \cite{AMP20} who introduce the notion of generalized graph matrices. When trying to apply generally the same techniques, there are numerous difficulties that have to be overcome and we systematically handle them using both new ideas and by significantly generalizing known techniques.

We introduce and use the notion of coefficient matrices which attempt to abstract out the precise conditions that are needed in such SoS lower bound proofs. By making the dominant and non-dominant terms in the PSD decomposition explicit, the coefficient matrices shed light on the intrinsic structure of the problem at hand, to prevent getting lost in the technical details.



\subsection{Comparison to Other Sum-of-Squares Lower Bounds on Certification Problems}
\cite{BHKKMP16} proved sum of squares lower bounds for the planted clique problem. Our machinery vastly generalizes the techniques of their paper. For a specific technical reason, our machinery actually doesn't recover the same lower bounds as planted clique in particular (See \cref{rmk: planted_clique_failure}). That said, this is because we have attempted to keep our framework as general as possible so that it is applicable to other problems, at the cost of losing a specific technicality that's needed for planted clique in particular.

\cite{hop17} remarked that the techniques used in \cite{BHKKMP16} can be used to give Sum-of-Squares lower bounds for $\pm{1}$ variants of tensor PCA and sparse PCA, though this is not made explicit. In this paper, we use our machinery to make these lower bounds explicit. We also handle the Wishart model of sparse PCA, which is more natural and significantly harder to prove lower bounds for. In particular, the bounds we prove do not follow solely from the techniques used in prior works.

\cite{KothariMOW17} proved that for random constraint satisfaction problems (CSPs) where the predicate has a balanced pairwise independent distribution of solutions, with high probability, degree $\Omega(n)$ SoS is required to certify that these CSPs do not have a solution. While they don't state it in this manner, the pseudo-expectation values used by \cite{KothariMOW17} can also be derived using pseudo-calibration \cite{rajendran2018combinatorial, brown2020extended}. The analysis for showing that the moment matrix is PSD is very different. It is an interesting question whether or not it is possible to unify these analyses.

\cite{MRX20} showed that it's possible to lift degree $2$ SoS solutions to degree $4$ SoS solutions under suitable conditions, and used it to obtain degree $4$ SoS lower bounds for average case $d$-regular Max-Cut and the Sherrington Kirkpatrick problem. Their construction is inspired by pseudo-calibration and their analysis also goes via graph matrices.

Recently, \cite{sklowerbounds} proved degree $n^{\varepsilon}$ SoS lower bounds for the Sherrington-Kirkpatrick problem via an intermediate problem known as Planted Affine Planes. Their construction and analysis also goes via pseudo-calibration and graph matrices, but since the constructed moment matrix had a nontrivial nullspace, they had to use different techniques to handle them. However, once this nullspace is taken into account, the moment matrix is dominated by its expected value. After this preprocessing, our machinery would imply their result but using it would be overkill.

\cite{kunisky2020} recently proposed a technique to lift degree $2$ SoS lower bounds to higher levels and applied it to construct degree $6$ lower bounds for the Sherrington-Kirkpatrick problem. Interestingly, their construction does not go via pseudo-calibration.

\subsection{Related Algorithmic Techniques}\label{subsec: related_techniques}

\paragraph{Low degree polynomials} Consider a problem where the input is sampled from one of two distributions and we would like to identify which distribution it was sampled from. Usually, one distribution is the completely random distribution while the other is a planted distribution that contains a given structure not present in the random distribution. In this setting, a closely related method is to use low degree polynomials to try and distinguish the two distributions. More precisely, if there is a low degree polynomial such that its expected value on the random distribution is very different than its expected value on the planted distribution, this distinguishes the two distributions.
Recently, this method has been shown to be an excellent heuristic, as it recovers the conjectured hardness thresholds for several problems and is considerably easier to analyze \cite{hop17, hop18, kunisky19notes}.

Under some conditions, the SoS hierarchy is at least as powerful as low degree polynomials \cite{hop17}. It is an important open question whether low degree polynomials generally have the same power as the SoS hierarchy or if there are situations where the SoS hierarchy is more powerful. On the one hand, we know of very few examples where low degree polynomials fail yet some polynomial time algorithm succeeds and the examples we do know of are somewhat contrived \cite{holmgren2020counterexamples}. On the other hand, to the best of our knowledge there is no known way to obtain sum of squares lower bounds from low degree polynomial lower bounds. In this paper, we confirm that for tensor PCA and the Wishart model of sparse PCA with slightly adjusted planted distributions, the SoS hierarchy is no more powerful than low-degree polynomials.

\paragraph{The Statistical Query Model}
The statistical query model is another popular restricted class of algorithms introduced by \cite{kearns1998efficient}. In this model, for an underlying distribution, we can access it by querying the expected values of functions of the distribution. Concretely, for a distribution $D$ on ${\mathbb R}^n$, we have access to it via an oracle that given as query a function $f: {\mathbb R}^n \rightarrow [-1, 1]$ returns $\mathbb{E}_{x \sim D} f(x)$ up to some additive adversarial error. SQ algorithms capture a broad class of algorithms in statistics and machine learning and have also been used to study information-computation tradeoffs. There has also been significant work trying to understand the limits of SQ algorithms (e.g. \cite{feldman2017statistical, feldman2018complexity, diakonikolas2017statistical}). The recent work \cite{brennan2020statistical} showed that low degree polynomials and statistical query algorithms have equivalent power under mild conditions. It's an interesting open question whether or not SQ algorithms have the same power as Sum-of-Squares algorithms. An important distinction to note is that SQ algorithms do not take into account the complexity of the oracle whereas SoS studies the whole computational aspect of the problem. Therefore, with our current knowledge, SoS lower bounds are strictly stronger evidence of intrinsic hardness of problems.

\subsection{Organization of the paper}

In this work, we occasionally distinguish between the qualitative and quantitative versions of theorem statements. Qualitative theorem statements capture the essence of the inequalities we prove, and serve to illustrate the main forms of the bounds we desire, without getting lost in the details. Quantitative theorems on the other hand build on their qualitative counterparts by stating the precise bounds that are needed.

The remainder of this paper is organized as follows. In \cref{sec: prelim}, we give some preliminaries. In particular, we describe the Sum-of-Squares hierarchy and present a brief overview of the machinery and some proof techniques that we use. In \cref{sec: informal_statement}, we present the informal statement of the main theorem. In \cref{sec: plds_qual}, \cref{sec: tpca_qual} and \cref{sec: spca_qual}, we qualitatively verify the conditions of the machinery for planted slightly denser subgraph, tensor PCA, and sparse PCA respectively. While these sections only verify the qualitative conditions, the results in these sections are precise and will be reused in \cref{sec: plds_quant}, \cref{sec: tpca_quant} and \cref{sec: spca_quant} to fully verify the conditions of the machinery. In \cref{sec: technical_def_and_main_theorem}, we introduce all the formal definitions and state the main theorem in full generality. In \cref{sec: proof_of_main}, we prove the main theorem while abstracting out the choice of several functions. In \cref{sec: choosing_funcs}, we choose these functions so that they satisfy the conditions needed for our main theorem. In Section \ref{sec: showing_positivity}, we give tools for verifying a technical condition of our machinery which is related to truncation error. Finally, in \cref{sec: plds_quant}, \cref{sec: tpca_quant} and \cref{sec: spca_quant}, we verify all the conditions necessary to prove \cref{thm: plds_main}, \cref{thm: tpca_main} and \cref{thm: spca_main} respectively.

For a first pass, the reader is encouraged to read the informal description of the machinery as outlined in \cref{sec: informal_statement}. Then, they can read any of the qualitative bounds \cref{sec: plds_qual}, \cref{sec: tpca_qual}, \cref{sec: spca_qual} to get a sense of how the machinery can be applied to these problems qualitatively. For readers interested in seeing complete proofs of the main applications, they can read parts of the quantitative bounds sections of their favorite problem(s) from among \cref{sec: plds_quant}, \cref{sec: tpca_quant}, \cref{sec: spca_quant}. To understand the details of the machinery, the reader may then read the formal definitions and theorem statement in \cref{sec: technical_def_and_main_theorem} whose proof follows in \cref{sec: proof_of_main}. In \cref{sec: technical_def_and_main_theorem}, for the sake of clarity, the reader is given a choice of generality in the definitions and theorem statement.
\subsection{Middle shape bounds}

\begin{lemma}\label{lem: tpca_charging}
	Suppose $\lambda \le n^{\frac{k}{4} - \varepsilon}$. For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, suppose $deg^{\tau}(i)$ is even for all $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, then
	\[\sqrt{n}^{|V(\tau)| - |U_{\tau}|}S(\tau) \le \frac{1}{n^{0.5\varepsilon\sum_{e \in E(\tau)} l_e}}\]
\end{lemma}

\begin{proof}
	Firstly, we claim that $\sum_{e \in E(\tau)} kl_e \ge 2(|V(\tau)| - |U_{\tau}|)$. For any vertex $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, $deg^{\tau}(i)$ is even and is not $0$, hence, $deg^{\tau}(i) \ge 2$. Any vertex $i \in U_{\tau} \setminus V_{\tau}$ cannot have $deg^{\tau}(i) = 0$ otherwise $U_{\tau} \setminus\{i\}$ is a vertex separator of strictly smaller weight than $U_{\tau}$, which is not possible, hence, $deg^{\tau}(i) \ge 1$. By the symmetric argument, any vertex $i \in V_{\tau} \setminus U_{\tau}$ also has $deg^{\tau}(i) \ge 1$. Therefore, using $|U_{\tau}| = |V_{\tau}|$ in the last step,
	\begin{align*}
	\sum_{e \in E(\tau)}kl_e = \sum_{i \in V(\tau)} deg^{\tau}(i)
	&\ge \sum_{i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}} deg^{\tau}(i) + \sum_{i \in U_{\tau} \setminus V_{\tau}} deg^{\tau}(i) + \sum_{i \in V_{\tau} \setminus U_{\tau}} deg^{\tau}(i)\\
	&\ge 2|V(\tau) \setminus U_{\tau} \setminus V_{\tau}| + |U_{\tau} \setminus V_{\tau}| + |V_{\tau} \setminus U_{\tau}|\\
	&= 2(|V(\tau)| - |U_{\tau}|)
	\end{align*}
	By choosing $C_{\Delta}$ sufficiently small, we have
	\begin{align*}
	\sqrt{n}^{|V(\tau)| - |U_{\tau}|}S(\tau) &= \sqrt{n}^{|V(\tau)| - |U_{\tau}|} \Delta^{|V(\tau)| - |U_{\tau}|}\prod_{e \in E(\tau)}\left(\frac{\lambda}{(\Delta n)^{\frac{k}{2}}}\right)^{l_e}\\
	&\le \sqrt{n}^{|V(\tau)| - |U_{\tau}|}\Delta^{|V(\tau)| - |U_{\tau}|}\prod_{e \in E(\tau)}n^{(-\frac{k}{4} - 0.5\varepsilon)l_e}\\
	&= \sqrt{n}^{|V(\tau)| - |U_{\tau}| - \frac{\sum_{e \in E(\tau)}kl_e}{2}}\Delta^{|V(\tau)| - |U_{\tau}|}\prod_{e \in E(\tau)}n^{-0.5\varepsilon l_e}\\
	&\le \Delta^{|V(\tau)| - |U_{\tau}|}\prod_{e \in E(\tau)}n^{-0.5 \varepsilon l_e}\\
	&\le\frac{1}{n^{0.5\varepsilon\sum_{e \in E(\tau)} l_e}}
	\end{align*}
\end{proof}

\begin{corollary}\label{cor: tpca_norm_decay}
	For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, we have \[c(\tau)B_{norm}(\tau)S(\tau) \le 1\]
\end{corollary}

\begin{proof}
	Since $\tau$ is a proper middle shape, we have $w(I_{\tau}) = 0$ and $w(S_{\tau, min}) = w(U_{\tau})$. This implies
	$n^{\frac{w(V(\tau)) + w(I_{\tau}) - w(S_{\tau, min})}{2}} = \sqrt{n}^{|V(\tau)| - |U_{\tau}|}$.
	If $deg^{\tau}(i)$ is odd for any vertex $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, then $S(\tau) = 0$ and the inequality is true. So, assume $deg^{\tau}(i)$ is even for all $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$.	As was observed in the proof of \cref{lem: tpca_charging}, every vertex $i \in V(\tau) \setminus U_{\tau}$ or $i \in V(\tau) \setminus V_{\tau}$ has $deg^{\tau}(i) \ge 1$ and hence, since $\sum_{i \in V(\tau)} deg^{\tau}(i) = k\sum_{e \in E(\tau)} l_e$, we have $|V(\tau)\setminus U_{\tau}| + |V(\tau)\setminus V_{\tau}| \le 2k \sum_{e \in E(\tau)} l_e$. Also, $|E(\tau)| \le \sum_{e \in E(\tau)} l_e$ and $q = n^{O(1) \cdot \varepsilon (C_V + C_E)}$. We can set $C_V, C_E$ sufficiently small so that, using \cref{lem: tpca_charging},
	\begin{align*}
	c(\tau)B_{norm}(\tau)S(\tau)
	&= 100(3D_V)^{|U_{\tau}\setminus V_{\tau}| + |V_{\tau}\setminus U_{\tau}| + k|E(\tau)|}2^{|V(\tau)\setminus (U_{\tau}\cup V_{\tau})|}\\
	&\quad\cdot 2e(6qD_V)^{|V(\tau)\setminus U_{\tau}| + |V(\tau)\setminus V_{\tau}|}\prod_{e \in E(\tau)} (400D_V^2D_E^2q)^{l_e}\sqrt{n}^{|V(\tau)| - |U_{\tau}|}S(\tau)\\
	&\le n^{O(1) \cdot \varepsilon(C_V + C_E) \cdot \sum_{e \in E(\tau)} l_e} \cdot \sqrt{n}^{|V(\tau)| - |U_{\tau}|}S(\tau)\\
	&\le n^{O(1) \cdot \varepsilon(C_V + C_E) \cdot \sum_{e \in E(\tau)} l_e} \cdot \frac{1}{n^{0.5\varepsilon\sum_{e \in E(\tau)} l_e}}\\
	&\le 1
	\end{align*}
\end{proof}

We can now show middle shape bounds.

\begin{lemma}\label{lem: tpca_cond2}
    For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$,
    \[
    \begin{bmatrix}
        \frac{1}{|Aut(U)|c(\tau)}H_{Id_U} & B_{norm}(\tau) H_{\tau}\\
        B_{norm}(\tau) H_{\tau}^T & \frac{1}{|Aut(U)|c(\tau)}H_{Id_U}
    \end{bmatrix}
    \succeq 0
    \]
\end{lemma}

\begin{proof}
	We have
	\begin{align*}
	&\begin{bmatrix}
	\frac{1}{|Aut(U)|c(\tau)}H_{Id_U} & B_{norm}(\tau)H_{\tau}\\
	B_{norm}(\tau)H_{\tau}^T & \frac{1}{|Aut(U)|c(\tau)}H_{Id_U}
	\end{bmatrix}\\
	&\qquad= \begin{bmatrix}
	\left(\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)B_{norm}(\tau)}{|Aut(U)|}\right)H_{Id_U} & 0\\
	0 & \left(\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)B_{norm}(\tau)}{|Aut(U)|}\right)H_{Id_U}
	\end{bmatrix}\\
	&\qquad \qquad+ B_{norm}(\tau)\begin{bmatrix}
	\frac{S(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
	H_{\tau}^T & \frac{S(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	\end{align*}
	By \cref{lem: tpca_cond2_simplified}, $\begin{bmatrix}
	\frac{S(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
	H_{\tau}^T & \frac{S(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	\succeq 0$, so the second term above is positive semidefinite. For the first term, by \cref{lem: tpca_cond1}, $H_{Id_U} \succeq 0$ and by \cref{cor: tpca_norm_decay}, $\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)B_{norm}(\tau)}{|Aut(U)|} \ge 0$, which proves that the first term is also positive semidefinite.
\end{proof}

\subsection{Intersection term bounds}

\begin{lemma}\label{lem: tpca_charging2}
	Suppose $\lambda \le n^{\frac{k}{4} - \varepsilon}$. For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and for all $\gamma \in \Gamma_{U, V}$,
	\[n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2 \le \frac{1}{n^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}}\]
	for some constant $B$ that depends only on $C_{\Delta}$. In particular, it is independent of $C_V$ and $C_E$.
\end{lemma}

\begin{proof}
	Suppose there is a vertex $i \in V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}$ such that $deg^{\gamma}(i)$ is odd, then $S(\gamma) = 0$ and the inequality is true. So, assume $deg^{\gamma}(i)$ is even for all vertices $i \in V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}$.
	We first claim that $k\sum_{e \in E(\gamma)} l_e \ge 2|V(\gamma) \setminus U_{\gamma}|$. Since $\gamma$ is a left shape, all vertices $i$ in $V(\gamma) \setminus U_{\gamma}$ have $deg^{\gamma}(i) \ge 1$. In particular, all vertices $i \in V_{\gamma} \setminus U_{\gamma}$ have $deg^{\gamma}(i) \ge 1$.
	Moreover, if $i \in V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}$, since $deg^{\gamma}(i)$ is even, we must have $deg^{\gamma}(i) \ge 2$.

	Let $S'$ be the set of vertices $i \in U_{\gamma} \setminus V_{\gamma}$ that have $deg^{\gamma}(i) \ge 1$. Then, note that $|S'| + |U_{\gamma} \cap V_{\gamma}| \ge |V_{\gamma}| \Longrightarrow |S'| \ge |V_{\gamma} \setminus U_{\gamma}|$ since otherwise $S' \cup (U_{\gamma} \cap V_{\gamma})$ will be a vertex separator of $\gamma$ of weight strictly less than $V_{\gamma}$, which is not possible. Then,
	\begin{align*}
	\sum_{e \in E(\gamma)}kl_e &= \sum_{i \in V(\gamma)} deg^{\gamma}(i)\\
	&\ge \sum_{i \in V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}} deg^{\gamma}(i) + \sum_{i \in U_{\gamma} \setminus V_{\gamma}} deg^{\gamma}(i) + \sum_{i \in V_{\gamma} \setminus U_{\gamma}} deg^{\gamma}(i)\\
	&\ge 2|V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}| + |S'| + |V_{\gamma} \setminus U_{\gamma}|\\
	&\ge 2|V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}| + 2|V_{\gamma} \setminus U_{\gamma}|\\
	&= 2|V(\gamma) \setminus U_{\gamma}|
	\end{align*}

	Finally, note that $2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}| = |U_{\gamma} \setminus V_{\gamma}| + |V_{\gamma} \setminus U_{\gamma}| + 2|V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}| \ge |V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})|$. By choosing $C_{\Delta}$ sufficiently small, we have
	\begin{align*}
	n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2 &= n^{|V(\gamma)\setminus U_{\gamma}|} \Delta^{2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}|} \prod_{e \in E(\gamma)} \left(\frac{\lambda^2}{(\Delta n)^k}\right)^{l_e}\\
	&\le n^{|V(\gamma)\setminus U_{\gamma}|} \Delta^{2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}|} \prod_{e \in E(\gamma)} n^{-(\frac{k}{2} + \varepsilon)l_e}\\
	&\le \Delta^{2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}|} \prod_{e \in E(\gamma)} n^{-\varepsilon l_e}\\
	&\le \frac{1}{n^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}}
	\end{align*}
for a constant $B$ that depends only on $C_{\Delta}$.
\end{proof}

\begin{remk}
	In the above bounds, note that there is a decay of $n^{B\varepsilon}$ for each vertex in $V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})$.	One of the main technical reasons for introducing the slack parameter $C_{\Delta}$ in the planted distribution was to introduce this decay, which is needed in the current machinery.
\end{remk}

We can now obtain the intersection term bounds.

\begin{lemma}\label{lem: tpca_cond3}
    For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U, V}$, \[c(\gamma)^2N(\gamma)^2B(\gamma)^2H_{Id_V}^{-\gamma, \gamma} \preceq H_{\gamma}'\]
\end{lemma}

\begin{proof}
	By \cref{lem: tpca_cond3_simplified}, we have
	\begin{align*}
	c(\gamma)^2N(\gamma)^2B(\gamma)^2H_{Id_V}^{-\gamma, \gamma} &\preceq c(\gamma)^2N(\gamma)^2B(\gamma)^2 S(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} H'_{\gamma}
	\end{align*}
	Using the same proof as in \cref{lem: tpca_cond1}, we can see that $H'_{\gamma} \succeq 0$. Therefore, it suffices to prove that $c(\gamma)^2N(\gamma)^2B(\gamma)^2 S(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} \le 1$.
	Since $U, V \in {\mathcal I}_{mid}$, $|Aut(U)| = |U|!,|Aut(V)| = |V|!$. Therefore, $\frac{|Aut(U)|}{|Aut(V)|} = \frac{|U|!}{|V|!} \le D_V^{|U_{\gamma} \setminus V_{\gamma}|}$. Also, $|E(\gamma)| \le \sum_{e \in E(\gamma)} l_e$ and $q = n^{O(1) \cdot \varepsilon (C_V + C_E)}$. Let $B$ be the constant from \cref{lem: tpca_charging2}. We can set $C_V, C_E$ sufficiently small so that, using \cref{lem: tpca_charging2},
	\begin{align*}
	c(\gamma)^2&N(\gamma)^2B(\gamma)^2S(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} \\
    &\le 100^2 (3D_V)^{2|U_{\gamma}\setminus V_{\gamma}| + 2|V_{\gamma}\setminus U_{\gamma}| + 2k|E(\gamma)|}4^{|V(\gamma) \setminus (U_{\gamma} \cup V_{\gamma})|}\\
	&\quad\cdot (3D_V)^{4|V(\gamma)\setminus V_{\gamma}| + 2|V(\gamma)\setminus U_{\gamma}|} (6qD_V)^{2|V(\gamma)\setminus U_{\gamma}| + 2|V(\gamma)\setminus V_{\gamma}|} \prod_{e \in E(\gamma)} (400D_V^2D_E^2q)^{2l_e}\\
	&\quad\cdot  n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2 \cdot D_V^{|U_\gamma \setminus V_{\gamma}|} \\
	&\le n^{O(1) \cdot \varepsilon(C_V + C_E) \cdot (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)} \cdot n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2\\
	&\le n^{O(1) \cdot \varepsilon(C_V + C_E) \cdot (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)} \cdot \frac{1}{n^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}}\\
	&\le 1
	\end{align*}
\end{proof}

\subsection{Truncation error bounds}

In this section, we will obtain the truncation error bounds using the strategy sketched in section 10 of \cite{potechin2020machinery}. We also reuse the notation. First, we need the following bound on $B_{norm}(\sigma) B_{norm}(\sigma') H_{Id_U}(\sigma, \sigma')$.

\begin{lemma}\label{lem: tpca_charging3}
	Suppose $\lambda \le n^{\frac{k}{4} - \varepsilon}$. For all $U \in {\mathcal I}_{mid}$ and $\sigma, \sigma' \in {\mathcal L}_U$,
	\[B_{norm}(\sigma) B_{norm}(\sigma') H_{Id_U}(\sigma, \sigma') \le \frac{1}{n^{0.5\varepsilon C_{\Delta}|V(\sigma \circ \sigma')|}\Delta^{D_{sos}}n^{|U|}}\]
\end{lemma}

\begin{proof}
	Suppose there is a vertex $i \in V(\sigma) \setminus V_{\sigma}$ such that $deg^{\sigma}(i) + deg^{U_{\sigma}}(i)$ is odd, then $H_{Id_U}(\sigma, \sigma') = 0$ and the inequality is true. So, assume that $deg^{\sigma}(i) + deg^{U_{\sigma}}(i)$ is even for all $i \in V(\sigma) \setminus V_{\sigma}$. Similarly, assume that $deg^{\sigma'}(i) + deg^{U_{\sigma'}}(i)$ is even for all $i \in V(\sigma') \setminus V_{\sigma'}$. Also, if $\rho_{\sigma} \neq \rho_{\sigma'}$, we will have $H_{Id_U}(\sigma, \sigma') = 0$ and we'd be done. So, assume $\rho_{\sigma} = \rho_{\sigma'}$.

	Let $\alpha = \sigma \circ \sigma'$. We will first prove that $\sum_{e \in E(\alpha)} kl_e + 2deg(\alpha) \ge 2|V(\alpha)| + 2|U|$. Firstly, note that all vertices $i \in V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$ have $deg^{\alpha}(i)$ to be even and nonzero, and hence at least $2$. Moreover, in both the sets $U_{\alpha} \setminus (U_{\alpha} \cap V_{\alpha})$ and $V_{\alpha} \setminus (U_{\alpha} \cap V_{\alpha})$, there are at least $|U| - |U_{\alpha} \cap V_{\alpha}|$ vertices of degree at least $1$, because $U$ is a minimum vertex separator. Also, note that $deg(\alpha) \ge |U_{\alpha}| + |V_{\alpha}|$. This implies that
	\begin{align*}
	\sum_{e \in E(\alpha)} kl_e &+ 2deg(\alpha)\\
     &\ge 2 |V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})| + 2(|U| - |U_{\alpha} \cap V_{\alpha}|) + 2(|U_{\alpha}| + |V_{\alpha}|)\\
	&= 2 (|V(\alpha)| - |U_{\alpha} \cup V_{\alpha}|) + 2(|U| - |U_{\alpha} \cap V_{\alpha}|) + 2(|U_{\alpha} \cup V_{\alpha}| + |U_{\alpha} \cap V_{\alpha}|)\\
	&= 2|V(\alpha)| + 2|U|
	\end{align*}
	where we used the fact that $U_{\alpha} \cap V_{\alpha} \subseteq U$. Finally, by choosing $C_V, C_E$ sufficiently small,
	\begin{align*}
	&B_{norm}(\sigma) B_{norm}(\sigma') H_{Id_U}(\sigma, \sigma') \\
    &= 2e(6qD_V)^{|V(\sigma)\setminus U_{\sigma}| + |V(\sigma)\setminus V_{\sigma}|}\prod_{e \in E(\sigma)} (400D_V^2D_E^2q)^{l_e} n^{\frac{w(V(\sigma)) - w(U)}{2}}\\
	&\quad\cdot 2e(6qD_V)^{|V(\sigma')\setminus U_{\sigma'}| + |V(\sigma')\setminus V_{\sigma'}|}\prod_{e \in E(\sigma')} (400D_V^2D_E^2q)^{l_e} n^{\frac{w(V(\sigma')) - w(U)}{2}}\\
	&\quad\cdot \frac{1}{|Aut(U)|} \Delta^{|V(\alpha)|} \left(\frac{1}{\sqrt{\Delta n}}\right)^{deg(\alpha)} \prod_{e \in E(\alpha)} \left(\frac{\lambda}{(\Delta n)^{\frac{k}{2}}}\right)^{l_e}\\
	&\le n^{O(1) \cdot \varepsilon (C_V + C_E) \cdot (|V(\alpha)| + \sum_{e \in E(\alpha)} l_e)} \Delta^{|V(\alpha)|}\left(\frac{1}{\sqrt{\Delta}}\right)^{deg(\alpha)}\\
	&\quad\cdot \sqrt{n}^{|V(\alpha)| - |U|} \left(\frac{1}{\sqrt{n}}\right)^{deg(\alpha)}\prod_{e \in E(\alpha)}n^{(-\frac{k}{4} - 0.5\varepsilon)l_e}\\
	&\le \frac{n^{O(1) \cdot \varepsilon (C_V + C_E) \cdot (|V(\alpha)| + \sum_{e \in E(\alpha)} l_e)}}{n^{\varepsilon C_{\Delta}|V(\alpha)|}n^{0.5\varepsilon\sum_{e \in E(\alpha)} l_e}} \cdot \frac{1}{\Delta^{D_{sos}}n^{|U|}}\sqrt{n}^{|V(\alpha)| + |U| - deg(\alpha) - \frac{1}{2}\sum_{e \in E(\alpha)} kl_e}\\
	&\le \frac{1}{n^{0.5\varepsilon C_{\Delta}|V(\alpha)|}\Delta^{D_{sos}}n^{|U|}}
	\end{align*}
where we used the facts $\Delta \le 1, deg(\alpha) \le 2D_{sos}$.
\end{proof}

We now apply the strategy by showing the following bounds.

\begin{restatable}{lemma}{TPCAfive}\label{lem: tpca_cond5}
	Whenever $\norm{M_{\alpha}} \le B_{norm}(\alpha)$ for all $\alpha \in {\mathcal M}'$,
	\[
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq \frac{\Delta^{2D_{sos}^2}}{n^{D_{sos}}} Id_{sym}
	\]
\end{restatable}

\begin{proof}
    For $V \in {\mathcal I}_{mid}$, $\lambda_V = \frac{1}{n^{|V|}}$. We then choose $w_V = \left(\frac{1}{n}\right)^{D_{sos} - |V|}$. For all left shapes $\sigma \in {\mathcal L}_V$, it's easy to verify $w_{V} \leq \frac{w_{U_{\sigma}}\lambda_{U_{\sigma}}}{|\mathcal{I}_{mid}|B_{norm}(\sigma)^2{c(\sigma)^2}{H_{Id_V}(\sigma,\sigma)}}$ using \cref{lem: tpca_charging3}. This completes the proof.
\end{proof}

\begin{restatable}{lemma}{TPCAsix}\label{lem: tpca_cond6}
	\[\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} \le \frac{1}{\Delta^{2D_{sos}}2^{D_V}}\]
\end{restatable}

\begin{proof}
    We use the same argument and notation as in \cref{lem: plds_cond6}. When we plug in the bounds, we get
	\begin{align*}
	\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} &\frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} \\
    &\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}} {B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_{U}}(\sigma,\sigma')\frac{1}{2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}}\\
	&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{1}{n^{0.5\varepsilon C_{\Delta}|V(\sigma \circ \sigma')|}\Delta^{D_{sos}}n^{|U|}2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}\\
	&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{1}{n^{0.5\varepsilon C_{\Delta}|V(\sigma \circ \sigma')|}\Delta^{D_{sos}}2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}
	\end{align*}
	where we used \cref{lem: tpca_charging3}. Using $n^{0.5 \varepsilon C_{\Delta} |V(\sigma \circ \sigma')|} \ge n^{0.1\varepsilon C_{\Delta} |V(\sigma \circ \sigma')|}2^{|V(\sigma \circ \sigma')|}$,
	\begin{align*}
	\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} &\frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)}\\
    &\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{1}{n^{0.1 \varepsilon  C_{\Delta}|V(\sigma \circ \sigma')|} \Delta^{D_{sos}} 2^{|V(\sigma \circ \sigma')|}2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}\\
	&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{1}{n^{0.1 \varepsilon C_{\Delta}|V(\sigma \circ \sigma')|}\Delta^{D_{sos}} 2^{D_V}}\\
	&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{1}{D_{sos}^{D_{sos}}n^{0.1 \varepsilon C_{\Delta}|V(\sigma \circ \sigma')|}\Delta^{2D_{sos}} 2^{D_V}}
	\end{align*}
	where we set $C_{sos}$ small enough so that $D_{sos} = n^{\varepsilon C_{sos}} \le n^{\varepsilon C_{\Delta}} = \frac{1}{\Delta}$. The final step will be to argue that $\sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{1}{D_{sos}^{D_{sos}}n^{0.1 C_{\Delta}\varepsilon|V(\sigma \circ \sigma')|}} \le 1$ which will complete the proof. But this will follow if we set $C_V, C_E$ small enough.
\end{proof}

We can finally complete the analysis of the truncation error.

\begin{lemma}\label{lem: tpca_cond4}
    Whenever $\norm{M_{\alpha}} \le B_{norm}(\alpha)$ for all $\alpha \in \mathcal{M}'$,
    \[
    \sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
    \]
\end{lemma}

\begin{proof}
	Choose $C_{sos}$ sufficiently small so that $\frac{\Delta^{2D_{sos}^2}}{n^{D_{sos}}} \ge \frac{6}{\Delta^{2D_{sos}}2^{D_V}}$ which is satisfied by setting $C_{sos} < 0.5 C_V$. Then, since $Id_{sym} \succeq 0$, using \cref{lem: tpca_cond5} and \cref{lem: tpca_cond6},
	\begin{align*}
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} &\succeq \frac{\Delta^{2D_{sos}^2}}{n^{D_{sos}}} Id_{sym}\\
	&\succeq \frac{6}{\Delta^{2D_{sos}}2^{D_V}} Id_{sym}\\
	&\succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
	\end{align*}
\end{proof}
\subsection{Pseudo-calibration}

\begin{definition}[Slack parameter]
	Define the slack parameter to be $\Delta = n^{-C_{\Delta}\varepsilon}$ for a constant $C_{\Delta} > 0$.
\end{definition}

We will pseudo-calibrate with respect to the following pair of random and planted distributions which we denote $\nu$ and $\mu$ respectively.

\TPCAdistributions*

Let the Hermite polynomials be $h_0(x) = 1, h_1(x) = x, h_2(x) = x^2 - 1, \ldots$. For $a \in \mathbb{N}^{[n]^k}$ and variables $A_e$ for $e \in [n]^k$, define $h_a(A) := \prod_{e \in [n]^k} h_{a_e}(A_e)$. We will work with this Hermite basis.

\begin{lemma}
	Let $I \in \mathbb{N}^n, a \in \mathbb{N}^{[n]^k}$. For $i \in [n]$, let $d_i = \sum_{i \in e \in [n]^k} a_e$. Let $c$ be the number of $i$ such that $I_i + d_i$ is nonzero. Then, if $I_i + d_i$ are all even, we have
	\[\mathbb{E}_{\mu}[u^I h_a(A)] = \Delta^c\left(\frac{1}{\sqrt{\Delta n}}\right)^{|I|} \prod_{e \in [n]^k} \left(\frac{\lambda}{(\Delta n)^{\frac{k}{2}}}\right)^{a_e}\]
	Else, $\mathbb{E}_{\mu}[u^I h_a(A)] = 0$.
\end{lemma}

\begin{proof}
	When $A \sim \mu$, for all $e \in [n]^k$, we have $A_e = B_e + \lambda \prod_{i \le k} u_{e_i}$, where $B_e \sim \mathcal{N}(0, 1)$.
	Let's analyze when the required expectation is nonzero. We can first condition on $u$ and use the fact that for a fixed $t$, $\mathbb{E}_{g \sim \mathcal{N}(0, 1)}[h_k(g + t)] = t^k$ to obtain
	\[\mathbb{E}_{(u_i, w_e) \sim \mu}[u^I h_a(A)] = \mathbb{E}_{(u_i) \sim \mu}[u^I\prod_{e \in [n]^k}(\lambda \prod _{i \le k}u_{e_i})^{a_e}] = \mathbb{E}_{(u_i) \sim \mu}[\prod_{i \in [n]} u_i^{I_i + d_i}] \prod_{e \in [n]^k} \lambda^{a_e}\]

	Observe that this is nonzero precisely when all $I_i + d_i$ are even, in which case \[\mathbb{E}_{(u_i) \sim \mu}[\prod_{i \in [n]} u_i^{I_i + d_i}] = \Delta^c\left(\frac{1}{\sqrt{\Delta n}}\right)^{\sum_{i \le n} I_i + d_i} =  \Delta^c\left(\frac{1}{\sqrt{\Delta n}}\right)^{|I|} \prod_{e \in [n]^k} \left(\frac{1}{(\Delta n)^{\frac{k}{2}}}\right)^{a_e}\]
	where we used the fact that $\sum_{i \in [n]} d_i = k \sum_{e \in [n]^k} a_e$.
	This completes the proof.
\end{proof}


Define the degree of SoS to be $D_{sos} = n^{C_{sos}\varepsilon}$ for some constant $C_{sos} > 0$ that we choose later. And define the truncation parameters to be $D_V = n^{C_V\varepsilon}, D_E = n^{C_E\varepsilon}$ for some constants $C_V, C_E > 0$.

\begin{remk}[Choice of parameters]\label{rmk: choice_of_params2}
	We first set $\varepsilon$ to be a sufficiently small constant. Based on the choice of $\varepsilon$, we will set the constant $C_{\Delta} > 0$ sufficiently small so that the planted distribution is well defined. Based on these choices, just as in \cref{rmk: choice_of_params1} we choose $C_V, C_E, C_{sos}$ in that order.
\end{remk}

The underlying graphs for the graph matrices have the following structure: there will be $n$ vertices of a single type and the edges will be ordered hyperedges of arity $k$.
For the analysis of Tensor PCA, we will use the following notation.
\begin{itemize}
	\item For an index shape $U$ and a vertex $i$, define $deg^{U}(i)$ as follows: If $i \in V(U)$, then it is the power of the unique index shape piece $A \in U$ such that $i \in V(A)$. Otherwise, it is $0$.
	\item For an index shape $U$, define $deg(U) = \sum_{i \in V(U)} deg^U(i)$. This is also the degree of the monomial that $U$ corresponds to.
	\item For a shape $\alpha$ and vertex $i$ in $\alpha$, let $deg^{\alpha}(i) = \sum_{i \in e \in E(\alpha)} l_e$.
	\item For any shape $\alpha$, let $deg(\alpha) = deg(U_{\alpha}) + deg(V_{\alpha})$.
\end{itemize}

We will now describe the decomposition of the moment matrix $\Lambda$.

\begin{definition}\label{def: tpca_coeffs}
	If a shape $\alpha$ satisfies the following properties:
	\begin{itemize}
		\item $deg^{\alpha}(i) + deg^{U_{\alpha}}(i) + deg^{V_{\alpha}}(i)$ is even for all $i \in V(\alpha)$,
		\item $\alpha$ is proper,
		\item $\alpha$ satisfies the truncation parameters $D_{sos}, D_V, D_E$.
	\end{itemize}
	then define \[\lambda_{\alpha} = \Delta^{|V(\alpha)|} \left(\frac{1}{\sqrt{\Delta n}}\right)^{deg(\alpha)}  \prod_{e \in E(\alpha)} \left(\frac{\lambda}{(\Delta n)^{\frac{k}{2}}}\right)^{l_e}\]
	Otherwise, define $\lambda_{\alpha} = 0$.
\end{definition}

\begin{corollary}
	$\Lambda = \sum \lambda_{\alpha}M_{\alpha}$.
\end{corollary}

\subsection{Qualitative machinery bounds}

Just as in planted slightly denser subgraph, we prove the PSD mass condition and the qualitative middle shape and intersection term bounds, by first stating them and then introducing appropriate notation to prove them all in a unified manner.

\begin{restatable}[PSD mass]{lemma}{TPCAone}\label{lem: tpca_cond1}
	For all $U \in {\mathcal I}_{mid}$, $H_{Id_U} \succeq 0$
\end{restatable}

We define the following quantities to capture the contribution of the vertices within $\tau, \gamma$ to the Fourier coefficients.

\begin{restatable}{definition}{TPCAstau}\label{def: tpca_stau}
	For $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, if $deg^{\tau}(i)$ is even for all vertices $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, define
	\[S(\tau) = \Delta^{|V(\tau)| - |U_{\tau}|}\prod_{e \in E(\tau)}\left(\frac{\lambda}{(\Delta n)^{\frac{k}{2}}}\right)^{l_e}\]
	Otherwise, define $S(\tau) = 0$.
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and $\gamma \in \Gamma_{U, V}$, if $deg^{\gamma}(i)$ is even for all vertices $i$ in $V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}$, define
	\[S(\gamma) = \Delta^{|V(\gamma)| - \frac{|U_{\gamma}| + |V_{\gamma}|}{2}}\prod_{e \in E(\gamma)}\left(\frac{\lambda}{(\Delta n)^{\frac{k}{2}}}\right)^{l_e}\]
	Otherwise, define $S(\gamma) = 0$.
\end{restatable}

We now state the qualitative bounds in terms of these quantities.

\begin{restatable}[Qualitative middle shape bounds]{lemma}{TPCAtwosimplified}\label{lem: tpca_cond2_simplified}
	For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$,
	\[
	\begin{bmatrix}
		\frac{S(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
		H_{\tau}^T & \frac{S(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	\succeq 0
	\]
\end{restatable}



We again use the canonical definition of $H_{\gamma}'$ from \cref{sec: hgamma_qual}.

\begin{restatable}[Qualitative intersection term bounds]{lemma}{TPCAthreesimplified}\label{lem: tpca_cond3_simplified}
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U, V}$,
	\[\frac{|Aut(V)|}{|Aut(U)|}\cdot\frac{1}{S(\gamma)^2}H_{Id_V}^{-\gamma, \gamma} \preceq H_{\gamma}'\]
\end{restatable}

\subsubsection{Proof of PSD mass condition}

We introduce some notation which makes it easy to show the qualitative bounds and which also sheds light on the structure of the coefficient matrices. When we compose shapes $\sigma, \sigma'$, from \cref{def: tpca_coeffs}, in order for $\lambda_{\sigma\circ \sigma'}$ to be nonzero, observe that all vertices $i$ in $\sigma \circ \sigma'$ should have $deg^{\sigma \circ \sigma'}(i) + deg^{U_{\sigma \circ \sigma'}}(i) + deg^{V_{\sigma \circ \sigma'}}(i)$ to be even. To partially capture this notion conveniently, we will introduce the notion of parity vectors.

\begin{definition}
	Define a parity vector $\rho$ to be a vector whose entries are in $\{0, 1\}$.
	For $U\in {\mathcal I}_{mid}$, define ${\mathcal P}_U$ to be the set of parity vectors $\rho$ whose coordinates are indexed by $U$.
\end{definition}

\begin{definition}
	For a left shape $\sigma$, define $\rho_{\sigma} \in {\mathcal P}_{V_{\sigma}}$, called the parity vector of $\sigma$, to be the parity vector such that for each vertex $i \in V_{\sigma}$, the $i$-th entry of $\rho_{\sigma}$ is the parity of $deg^{U_{\sigma}}(i) + deg^{\sigma}(i)$, that is $(\rho_{\sigma})_i \equiv deg^{U_{\sigma}}(i) + deg^{\sigma}(i) \pmod 2$.
	For $U \in {\mathcal I}_{mid}$ and $\rho \in {\mathcal P}_U$, let ${\mathcal L}_{U, \rho}$ be the set of all left shapes $\sigma \in {\mathcal L}_U$ such that $\rho_{\sigma} = \rho$, that is, the set of all left shapes with parity vector $\rho$.
\end{definition}

For a shape $\tau$, for a $\tau$ coefficient matrix $H_{\tau}$ and parity vectors $\rho \in {\mathcal P}_{U_{\tau}}, \rho' \in {\mathcal P}_{V_{\tau}}$, define the $\tau$-coefficient matrix $H_{\tau, \rho, \rho'}$ as $H_{\tau ,\rho, \rho'}(\sigma, \sigma') = H_{\tau}(\sigma, \sigma')$ if $\sigma \in {\mathcal L}_{U_{\tau}, \rho}, \sigma' \in {\mathcal L}_{V_{\tau}, \rho'}$ and $0$ otherwise.
The following proposition is immediate.

\begin{propn}
	For any shape $\tau$ and $\tau$-coefficient matrix $H_{\tau}$, we have the equality $H_{\tau} = \sum_{\rho \in {\mathcal P}_{U_{\tau}}, \rho' \in {\mathcal P}_{V_{\tau}}} H_{\tau, \rho, \rho'}$
\end{propn}

\begin{propn}
	For any $U \in {\mathcal I}_{mid}$, $H_{Id_U} = \sum_{\rho \in {\mathcal P}_U} H_{Id_U, \rho, \rho}$
\end{propn}

\begin{proof}
	For any $\sigma, \sigma' \in {\mathcal L}_U$, using \cref{def: tpca_coeffs}, note that in order for $H_{Id_U}(\sigma, \sigma')$ to be nonzero, we must have $\rho_{\sigma} = \rho_{\sigma'}$.
\end{proof}

We define the following quantity to capture the contribution of the vertices within $\sigma$ to the Fourier coefficients.

\begin{definition}
	For a shape $\sigma\in {\mathcal L}$, if $deg^{\sigma}(i) + deg^{U_{\sigma}}(i)$ is even for all vertices $i \in V(\sigma) \setminus V_{\sigma}$, define
	\[T(\sigma) = \Delta^{|V(\sigma)| - \frac{|V_{\sigma}|}{2}}\left(\frac{1}{\sqrt{\Delta n}}\right)^{deg(U_{\sigma})}\prod_{e \in E(\sigma)}\left(\frac{\lambda}{(\Delta n)^{\frac{k}{2}}}\right)^{l_e}\]
	Otherwise, define $T(\sigma) = 0$.
	For $U \in {\mathcal I}_{mid}$ and $\rho \in {\mathcal P}_U$, define $v_{\rho}$ to be the vector indexed by $\sigma \in {\mathcal L}$ such that $v_{\rho}(\sigma)$ is $T(\sigma)$ if $\sigma \in {\mathcal L}_{U, \rho}$ and $0$ otherwise.
\end{definition}

With this notation, the PSD mass condition is easily shown.

\begin{proof}[Proof of the PSD mass condition \cref{lem: tpca_cond1}]
    For all $U\in {\mathcal I}_{mid}, \rho \in {\mathcal P}_U$, \cref{def: tpca_coeffs} implies $H_{Id_U, \rho, \rho} = \frac{1}{|Aut(U)|}v_{\rho}v_{\rho}^T$.
	Therefore, \[H_{Id_U} = \sum_{\rho \in {\mathcal P}_U} H_{Id_U, \rho, \rho} = \frac{1}{|Aut(U)|} \sum_{\rho \in {\mathcal P}_U} v_{\rho}v_{\rho}^T \succeq 0\]
\end{proof}

\subsubsection{Qualitative middle shape bounds}

The next proposition captures the fact that when we compose shapes $\sigma, \tau, \sigma'^T$, in order for $\lambda_{\sigma \circ \tau \circ \sigma'^T}$ to be nonzero, the parities of the degrees of the merged vertices should add up correspondingly.

\begin{propn}\label{propn: tpca_coeff_2}
	For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, there exist two sets of parity vectors $P_{\tau}, Q_{\tau} \subseteq {\mathcal P}_{U}$ and a bijection $\pi : P_{\tau} \rightarrow Q_{\tau}$ such that $H_{\tau} = \sum_{\rho \in P_{\tau}} H_{\tau, \rho, \pi(\rho)}$.
\end{propn}

\begin{proof}
	Using \cref{def: tpca_coeffs}, in order for $H_{\tau}(\sigma, \sigma')$ to be nonzero, in $\sigma \circ \tau \circ \sigma'^T$, for all $i \in U_{\tau} \cup V_{\tau}$, $deg^{U_{\sigma}}(i) + deg^{U_{\sigma'}}(i) + deg^{\sigma \circ \tau \circ \sigma'^T}(i)$ must be even. In other words, for any $\rho \in {\mathcal P}_U$, there is at most one $\rho' \in {\mathcal P}_U$ such that if we take $\sigma \in {\mathcal L}_{U, \rho}, \sigma' \in {\mathcal L}_U$ with $H_{\tau}(\sigma, \sigma')$ nonzero, then the parity of $\sigma'$ is $\rho'$. Also, observe that $\rho'$ determines $\rho$. We then take $P_{\tau}$ to be the set of $\rho$ such that $\rho'$ exists, $Q_{\tau}$ to be the set of $\rho'$ and in this case, we define $\pi(\rho) = \rho'$.
\end{proof}



A straightforward verification of the conditions of \cref{def: tpca_coeffs} implies the following proposition.

\begin{propn}
	For any $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, suppose we take $\rho \in P_{\tau}$.  Let $\pi$ be the bijection from \cref{propn: tpca_coeff_2} so that $\pi(\rho) \in Q_{\tau}$. Then, $H_{\tau, \rho, \pi(\rho)} = \frac{1}{|Aut(U)|^2} S(\tau) v_{\rho}v_{\pi(\rho)}^T$.
\end{propn}


We can now prove the qualitative middle shape bounds.


\begin{proof}[Proof of the qualitative middle shape bounds \cref{lem: tpca_cond2_simplified}]
	Let $P_{\tau}, Q_{\tau}, \pi$ be from \cref{propn: tpca_coeff_2}. For $\rho, \rho' \in {\mathcal P}_U$, let $W_{\rho, \rho'} = v_{\rho}(v_{\rho'})^T$. Then, $H_{Id_U} = \sum_{\rho \in {\mathcal P}_U} H_{Id_U, \rho, \rho} = \frac{1}{|Aut(U)|} \sum_{\rho \in {\mathcal P}_U}W_{\rho, \rho}$ and $H_{\tau} = \sum_{\rho \in P_{\tau}} H_{\tau, \rho, \pi(\rho)} = \frac{1}{|Aut(U)|^2}S(\tau)\sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}$. We have

	\begin{align*}
		\begin{bmatrix}
			\frac{S(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
			H_{\tau}^T & \frac{S(\tau)}{|Aut(U)|}H_{Id_U}
		\end{bmatrix}
		&= \frac{S(\tau)}{|Aut(U)|^2}
		\begin{bmatrix}
			\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}\\
			\sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}^T & \sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho}
		\end{bmatrix}
	\end{align*}
	Since $\frac{S(\tau)}{|Aut(U)|^2} \ge 0$, it suffices to prove that $\begin{bmatrix}
		\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}\\
		\sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}^T & \sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho}
	\end{bmatrix}\succeq 0$. Consider
	\begin{align*}
		\begin{bmatrix}
			\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}\\
			\sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}^T & \sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho}
		\end{bmatrix} =& \begin{bmatrix}
			\sum_{\rho \in {\mathcal P}_U \setminus P_{\tau}} W_{\rho, \rho} & 0\\
			0 & \sum_{\rho \in {\mathcal P}_U \setminus Q_{\tau}} W_{\rho, \rho}
		\end{bmatrix}\\
		& + \begin{bmatrix}
			\sum_{\rho \in P_{\tau}} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}\\
			\sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}^T & \sum_{\rho \in P_{\tau}} W_{\pi(\rho), \pi(\rho)}
		\end{bmatrix}\\
	\end{align*}

	We have $\sum_{\rho \in {\mathcal P}_U \setminus P_{\tau}} W_{\rho, \rho} = \sum_{\rho \in {\mathcal P}_U \setminus P_{\tau}} v_{\rho}v_{\rho}^T \succeq 0$. Similarly, $\sum_{\rho \in {\mathcal P}_U \setminus Q_{\tau}} W_{\rho, \rho} \succeq 0$ and so, the first term in the above expression,
	$\begin{bmatrix}
		\sum_{\rho \in {\mathcal P}_U \setminus P_{\tau}} W_{\rho, \rho} & 0\\
		0 & \sum_{\rho \in {\mathcal P}_U \setminus Q_{\tau}} W_{\rho, \rho}
	\end{bmatrix}$ is positive semidefinite. For the second term,
	\begin{align*}
		\begin{bmatrix}
			\sum_{\rho \in P_{\tau}} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}\\
			\sum_{\rho \in P_{\tau}} W_{\rho, \pi(\rho)}^T & \sum_{\rho \in P_{\tau}} W_{\pi(\rho), \pi(\rho)}
		\end{bmatrix} &= \sum_{\rho \in P_{\tau}}
		\begin{bmatrix}
			W_{\rho, \rho} & W_{\rho, \pi(\rho)}\\
			W_{\rho, \pi(\rho)}^T & W_{\pi(\rho), \pi(\rho)}
		\end{bmatrix}\\
		&= \sum_{\rho \in P_{\tau}}
		\begin{bmatrix}
			v_{\rho}v_{\rho}^T & v_{\rho}(v_{\pi(\rho)})^T\\
			v_{\pi(\rho)}(v_{\rho})^T & v_{\pi(\rho)}(v_{\pi(\rho)})^T
		\end{bmatrix}\\
		&= \sum_{\rho \in P_{\tau}}
		\begin{bmatrix}
			v_{\rho}\\
			v_{\pi(\rho)}
		\end{bmatrix}
		\begin{bmatrix}
			v_{\rho}^T &
			v_{\pi(\rho)}^T
		\end{bmatrix}\\
		& \succeq 0
	\end{align*}
\end{proof}

\subsubsection{Qualitative intersection term bounds}

Similar to \cref{propn: tpca_coeff_2}, the next proposition captures the fact that when we compose shapes $\sigma, \gamma, \gamma^T, \sigma'^T$, in order for $\lambda_{\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T}$ to be nonzero, the parities of the degrees of the merged vertices should add up correspondingly.

We use the following notation.
For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, for $\gamma \in \Gamma_{U, V}$ and parity vectors $\rho, \rho' \in {\mathcal P}_U$,  define the $\gamma \circ \gamma^T$-coefficient matrix $H_{Id_V, \rho, \rho'}^{-\gamma, \gamma}$ as $H_{Id_V, \rho, \rho'}^{-\gamma, \gamma}(\sigma, \sigma') = H_{Id_V}^{-\gamma, \gamma}(\sigma, \sigma')$ if $\sigma \in {\mathcal L}_{U, \rho},  \sigma' \in {\mathcal L}_{U, \rho'}$ and $0$ otherwise.

\begin{propn}
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, for all $\gamma \in \Gamma_{U, V}$, there exists a set of parity vectors $P_{\gamma} \subseteq {\mathcal P}_U$ such that
	$H_{Id_V}^{-\gamma, \gamma} = \sum_{\rho \in P_{\gamma}} H_{Id_V, \rho, \rho}^{-\gamma, \gamma}$.
\end{propn}

\begin{proof}
	Take any $\rho \in {\mathcal P}_U$. For $\sigma \in {\mathcal L}_{U, \rho}, \sigma' \in {\mathcal L}_U$,  since $H_{Id_V}^{-\gamma, \gamma}(\sigma, \sigma') = \frac{\lambda_{\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T}}{|Aut(V)|}$, $H_{Id_V}^{-\gamma, \gamma}(\sigma, \sigma')$ is nonzero precisely when $\lambda_{\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T}$ is nonzero. For this quantity to be nonzero, using \cref{def: tpca_coeffs}, we get that it is necessary, but not sufficient, that the parity vector of $\sigma'$ must also be $\rho$. And also observe that there exists a set $P_{\gamma}$ of parity vectors $\rho$ for which $H_{Id_V, \rho, \rho}^{-\gamma, \gamma}$ is nonzero and their sum is precisely $H_{Id_V}^{-\gamma, \gamma}$.
\end{proof}

For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, for all $\gamma \in \Gamma_{U, V}$ and parity vector $\rho \in {\mathcal P}_U$, define the matrix $H'_{\gamma, \rho, \rho}$ as $H'_{\gamma, \rho, \rho}(\sigma, \sigma') = H'_{\gamma}(\sigma, \sigma')$ if $\sigma, \sigma' \in {\mathcal L}_{U, \rho}$ and $0$ otherwise. The following proposition is immediate from the definition.

\begin{propn}
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, for $\gamma \in \Gamma_{U, V}$, $H_{\gamma}' = \sum_{\rho \in {\mathcal P}_U} H_{\gamma, \rho, \rho}'$.
\end{propn}



\begin{propn}
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, for all $\gamma \in \Gamma_{U, V}$ and $\rho \in P_{\gamma}$,
	\[H_{Id_V, \rho, \rho}^{-\gamma, \gamma} = \frac{|Aut(U)|}{|Aut(V)|} S(\gamma)^2 H'_{\gamma, \rho, \rho}\]
\end{propn}

\begin{proof}
	Fix $\sigma, \sigma' \in {\mathcal L}_{U, \rho}$ such that $|V(\sigma \circ \gamma)|, |V(\sigma' \circ \gamma)| \le D_V$. Note that $|V(\sigma)| - \frac{|V_{\sigma}|}{2} + |V(\sigma')| - \frac{|V_{\sigma'}|}{2} + 2(|V(\gamma)| - \frac{|U_{\gamma}| + |V_{\gamma}|}{2}) = |V(\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T)|$. Using \cref{def: tpca_coeffs}, we can easily verify that $\lambda_{\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T} = T(\sigma)T(\sigma') S(\gamma)^2$. Therefore, $H_{Id_V, \rho, \rho}^{-\gamma, \gamma}(\sigma, \sigma') = \frac{|Aut(U)|}{|Aut(V)|} S(\gamma)^2 H_{Id_U, \rho, \rho}(\sigma, \sigma')$. Since $H'_{\gamma, \rho, \rho}(\sigma, \sigma') = H_{Id_U, \rho, \rho}(\sigma, \sigma')$ whenever we have $|V(\sigma \circ \gamma)|, |V(\sigma' \circ \gamma)| \le D_V$, this completes the proof.
\end{proof}

With this, we can prove the qualitative intersection term bounds.

\begin{proof}[Proof of qualitative intersection term bounds \cref{lem: tpca_cond3_simplified}]
	We have
	\begin{align*}
		\frac{|Aut(V)|}{|Aut(U)|}\cdot\frac{1}{S(\gamma)^2}H_{Id_V}^{-\gamma, \gamma} = \sum_{\rho \in P_{\gamma}} \frac{|Aut(V)|}{|Aut(U)|}\cdot\frac{1}{S(\gamma)^2} H_{Id_V, \rho, \rho}^{-\gamma, \gamma}
		= \sum_{\rho \in P_{\gamma}} H'_{\gamma, \rho, \rho}
		&\preceq \sum_{\rho \in {\mathcal P}_U} H'_{\gamma, \rho, \rho}\\
        &= H'_{\gamma}
	\end{align*}
	where we used the fact that for all $\rho \in {\mathcal P}_U$, we have $H'_{\gamma,\rho, \rho} \succeq 0$.
\end{proof}

\subsection{Warm-up: Analysis with no intersection terms}
In this subsection, we show how the analysis works if we ignore the difference between $M^{fact}$ and $M^{orth}$.
\begin{theorem}\label{thm:nointersectionanalysis}
	For all $\varepsilon' \in (0,\frac{1}{2}]$, if the norm bounds hold and the following conditions hold
	\begin{enumerate}
		\item For all $U \in \mathcal{I}_{mid}$,  $H_{Id_{U}} \succeq 0$
		\item For all $U \in \mathcal{I}_{mid}$ and all $\tau \in \mathcal{M}_U$
		\[
		\left[ {\begin{array}{cc}
				\frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}} & B_{norm}(\tau)H_{\tau} \\
				B_{norm}(\tau)H^T_{\tau} & \frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}}
		\end{array}} \right] \succeq 0
		\]
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\tau \in \mathcal{M}_U}{\frac{1}{|Aut(U)|c(\tau)}} \leq \varepsilon'$.
	\end{enumerate}
	then
	\[
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})} + \sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{M^{fact}_{\tau}(H_{\tau})}} \succeq (1-2\varepsilon')\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})} \succeq 0
	\]
\end{theorem}
\begin{proof}
	We first show how a single term $M_{\sigma}M_{\tau}M_{{\sigma'}^T}$ plus its transpose $M_{\sigma'}M_{{\tau}^T}M_{{\sigma}^T}$ can be bounded.
	\begin{lemma}
		If the norm bounds hold then for all $\tau \in \mathcal{M}'$ and shapes $\sigma,\sigma'$ such that $\sigma,\tau,{\sigma'}^T$ are composable, for all $a,b$ such that $a > 0$, $b > 0$, and $ab = B_{norm}(\tau)^2$,
		\[
		M_{\sigma}M_{\tau}M_{{\sigma'}^T} + M_{\sigma'}M_{{\tau}^T}M_{{\sigma}^T} \preceq aM_{\sigma}M_{\sigma^T} + bM_{\sigma'}M_{{\sigma'}^T}
		\]
	\end{lemma}
	\begin{proof}
		Observe that 
		\begin{align*}
			0 \preceq &\left(\sqrt{a}M_{\sigma} - \frac{\sqrt{b}}{B_{norm}(\tau)}M_{{\sigma'}}M_{\tau^T}\right)\left(\sqrt{a}M_{\sigma} - \frac{\sqrt{b}}{B_{norm}(\tau)}M_{{\sigma'}}M_{\tau^T}\right)^T = \\
			&\left(\sqrt{a}M_{\sigma} - \frac{\sqrt{b}}{B_{norm}(\tau)}M_{{\sigma'}}M_{\tau^T}\right)\left(\sqrt{a}M_{{\sigma}^T} - \frac{\sqrt{b}}{B_{norm}(\tau)}M_{{\tau}}M_{{\sigma'}^T}\right) = \\
			&aM_{\sigma}M_{\sigma^T} - M_{\sigma}M_{\tau}M_{{\sigma'}^T} - M_{\sigma'}M_{{\tau}^T}M_{{\sigma}^T} + \frac{b}{B_{norm}(\tau)^2}M_{\sigma'}M_{\tau^{T}}M_{\tau}M_{{\sigma'}^T} \preceq \\
			&aM_{\sigma}M_{\sigma^T} - M_{\sigma}M_{\tau}M_{{\sigma'}^T} - M_{\sigma'}M_{{\tau}^T}M_{{\sigma}^T} + \frac{b}{B_{norm}(\tau)^2}M_{\sigma'}(B_{norm}(\tau)^2{Id})M_{{\sigma'}^T}
		\end{align*}
		Thus, $M_{\sigma}M_{\tau}M_{{\sigma'}^T} + M_{\sigma'}M_{{\tau}^T}M_{{\sigma}^T} \preceq aM_{\sigma}M_{\sigma^T} + bM_{\sigma'}M_{{\sigma'}^T}$, as needed.
	\end{proof}




	Unfortunately, if we try to bound everything term by term, there may be too many terms to bound. Instead, we generalize this argument for vectors and coefficient matrices.
	\begin{definition}
		Let $\tau$ be a shape. We say that a vector $v$ is a left $\tau$-vector if the coordinates of $v$ are indexed by left shapes $\sigma \in \mathcal{L}_{U_{\tau}}$. We say that a vector $w$ is a right $\tau$-vector if the coordinates of $w$ are indexed by left shapes $\sigma' \in \mathcal{L}_{V_{\tau}}$.
	\end{definition}
	\begin{lemma}\label{lm:rankonetosquares}
		For all $\tau \in \mathcal{M}'$, if the norm bounds hold, $v$ is a left $\tau$-vector, and $w$ is a right $\tau$-vector then 
		\[
		M^{fact}_{\tau}(vw^T) + M^{fact}_{{\tau}^T}(wv^T) \preceq B_{norm}(\tau)\left(M^{fact}_{Id_{U_{\tau}}}(vv^T) + M^{fact}_{Id_{V_{\tau}}}(ww^T)\right)
		\]
		and 
		\[
		-M^{fact}_{\tau}(vw^T) - M^{fact}_{{\tau}^T}(wv^T) \preceq B_{norm}(\tau)\left(M^{fact}_{Id_{U_{\tau}}}(vv^T) + M^{fact}_{Id_{V_{\tau}}}(ww^T)\right)
		\]
	\end{lemma}
	\begin{proof}
		Observe that 
		\begin{align*}
			0 \preceq &\left(\sum_{\sigma}{v_{\sigma}M_{\sigma} \mp \frac{w_{\sigma}M_{\sigma}M_{{\tau}^T}}{B_{norm}(\tau)}}\right)
			\left(\sum_{\sigma'}{v_{\sigma'}M_{\sigma'} \mp \frac{w_{\sigma'}M_{\sigma'}M_{{\tau}^T}}{B_{norm}(\tau)}}\right)^T = \\
			&\left(\sum_{\sigma}{v_{\sigma}M_{\sigma} \mp \frac{w_{\sigma}M_{\sigma}M_{{\tau}^T}}{B_{norm}(\tau)}}\right)\left(\sum_{\sigma'}{v_{\sigma'}M_{{\sigma'}^T} \mp \frac{w_{{\sigma'}}M_{\tau}M_{{\sigma'}^T}}{B_{norm}(\tau)}}\right) = \\
			&\sum_{\sigma,\sigma'}{\left(v_{\sigma}v_{\sigma'}\right)M_{\sigma}M_{{\sigma'}^T}} \mp \sum_{\sigma,\sigma'}{\frac{\left(v_{\sigma}w_{\sigma'}\right)}{B_{norm}(\tau)}M_{\sigma}M_{\tau}M_{{\sigma'}^T}} \\
			&\mp \sum_{\sigma,\sigma'}{\frac{\left(w_{\sigma}v_{\sigma'}\right)}{B_{norm}(\tau)}M_{\sigma}M_{{\tau}^T}M_{{\sigma'}^T}} + 
			\frac{1}{B_{norm}(\tau)^2}\sum_{\sigma,\sigma'}{\left(w_{\sigma}w_{\sigma'}\right)M_{\sigma}M_{\tau}M_{{\tau}^T}M_{{\sigma'}^T}}
		\end{align*}
		Further observe that 
		\begin{enumerate}
			\item $\sum_{\sigma,\sigma'}{\left(v_{\sigma}v_{\sigma'}\right)M_{\sigma}M_{{\sigma'}^T}} = M^{fact}_{Id_{U_{\tau}}}(vv^T)$
			\item $\sum_{\sigma,\sigma'}{\left(v_{\sigma}w_{\sigma'}\right)M_{\sigma}M_{\tau}M_{{\sigma'}^T}} = M^{fact}_{\tau}(vw^T)$
			\item $\sum_{\sigma,\sigma'}{\left(w_{\sigma}v_{\sigma'}\right)M_{\sigma}M_{{\tau}^T}M_{{\sigma'}^T}} = M^{fact}_{{\tau}^T}(wv^T)$
			\item 
			\begin{align*}
				\sum_{\sigma,\sigma'}{\left(w_{\sigma}w_{\sigma'}\right)M_{\sigma}M_{\tau}M_{{\tau}^T}M_{{\sigma'}^T}} 
				&= \left(\sum_{\sigma}{w_{\sigma}M_{\sigma}}\right)M_{\tau}M_{{\tau}^T}\left(\sum_{\sigma}{w_{\sigma}M_{\sigma}}\right)^T \\
				&\preceq \left(\sum_{\sigma}{w_{\sigma}M_{\sigma}}\right)B_{norm}(\tau)^2{Id}\left(\sum_{\sigma}{w_{\sigma}M_{\sigma}}\right)^T \\
				&= B_{norm}(\tau)^2\sum_{\sigma,\sigma'}{\left(w_{\sigma}w_{\sigma'}\right)M_{\sigma}M_{{\sigma'}^T}} \\
				&= {B_{norm}(\tau)^2}M^{fact}_{Id_{V_{\tau}}}(ww^T)
			\end{align*}
		\end{enumerate}
		Putting everything together, 
		\[
		\frac{M^{fact}_{\tau}(vw^T) + M^{fact}_{{\tau}^T}(wv^T)}{B_{norm}(\tau)} \preceq M^{fact}_{Id_{U_{\tau}}}(vv^T) + M^{fact}_{Id_{V_{\tau}}}(ww^T)
		\] 
		and 
		\[
		-\frac{M^{fact}_{\tau}(vw^T) + M^{fact}_{{\tau}^T}(wv^T)}{B_{norm}(\tau)} \preceq M^{fact}_{Id_{U_{\tau}}}(vv^T) + M^{fact}_{Id_{V_{\tau}}}(ww^T)
		\] 
		as needed.
	\end{proof}
	\begin{corollary}\label{cor:factorizedmatrixbound}
		For all $\tau \in \mathcal{M}'$, if the norm bounds hold and $H_U$ and $H_V$ are matrices such that 
		\[
		\left[ {\begin{array}{cc}
				H_{U} & B_{norm}(\tau)H_{\tau} \\
				B_{norm}(\tau)H^T_{\tau} & H_{V}
		\end{array}} \right] \succeq 0
		\]
		then $M^{fact}_{\tau}(H_{\tau}) + M^{fact}_{{\tau}^T}(H_{\tau^T}) \preceq M^{fact}_{Id_{U_{\tau}}}(H_{U}) + M^{fact}_{Id_{V_{\tau}}}(H_{V})$
	\end{corollary}
	\begin{proof}
		If $            \left[ {\begin{array}{cc}
				H_{U} & B_{norm}(\tau)H_{\tau} \\
				B_{norm}(\tau)H^T_{\tau} & H_{V}
		\end{array}} \right] \succeq 0$ then we can write 
		\[            \left[ {\begin{array}{cc}
				H_{U} & B_{norm}(\tau)H_{\tau} \\
				B_{norm}(\tau)H^T_{\tau} & H_{V}
		\end{array}} \right] = \sum_{i}{(v_i,w_i)(v_i,w_i)^T}
		\]
		Since the $M^{fact}$ operations are linear, the result now follows by summing the equation
		\[
		M^{fact}_{\tau}({v_i}w_i^T) + M^{fact}_{{\tau}^T}({w_i}v_i^T) \preceq B_{norm}(\tau)\left(M^{fact}_{Id_{U_{\tau}}}({v_i}v_i^T) + M^{fact}_{Id_{V_{\tau}}}({w_i}w_i^T)\right)
		\]
		over all $i$.
	\end{proof}
	Theorem \ref{thm:nointersectionanalysis} now follows directly. For all $U \in \mathcal{I}_{mid}$ and all $\tau \in \mathcal{M}_U$, using Corollary \ref{cor:factorizedmatrixbound} with $H_U = H_V = \frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}}$,
	\[
	M^{fact}_{\tau}(H_{\tau}) + M^{fact}_{{\tau}^T}(H_{\tau^T}) \preceq \frac{1}{|Aut(U)|c(\tau)}M^{fact}_{Id_{U}}(H_{Id_{U}}) + \frac{1}{|Aut(U)|c(\tau)}M^{fact}_{Id_{U}}(H_{Id_{U}})
	\]
	Summing this equation over all $U \in \mathcal{I}_{mid}$ and all $\tau \in \mathcal{M}_U$, we obtain that 
	\[
	\sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{M^{fact}_{\tau}(H_{\tau})}} \preceq 2\varepsilon'\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})}
	\] as needed.
\end{proof}
\subsection{Intersection Term Analysis Strategy}\label{qualitativeintersectionpatternssection}
As we saw in the previous subsection, the analysis works out nicely if we work with $M^{fact}$. Unfortunately, our matrices are expressed in terms of $M^{orth}$. In this subsection, we describe our strategy for analyzing the difference between $M^{fact}$ and $M^{orth}$.

Recall the following expressions for $\left(M^{fact}_{\tau}(H)\right)(A,B)$ and $\left(M^{orth}_{\tau}(H)\right)(A,B)$ where $A$ has shape $U_{\tau}$ and $B$ has shape $V_{\tau}$:
\[
\left(M^{fact}_{\tau}(H)\right)(A,B) = \sum_{\sigma \in \mathcal{L}_{U_{\tau}}, \sigma' \in \mathcal{L}_{V_{\tau}}}{H(\sigma,\sigma')\sum_{A',B'}{
		\sum_{R_1 \in \mathcal{R}(\sigma,A,A'), R_2 \in \mathcal{R}(\tau,A',B'), \atop R_3 \in \mathcal{R}({\sigma'}^T,B',B)}M_{R_1}(A,A')M_{R_2}(A',B')M_{R_3}(B',B)}}
\]
\begin{align*}
	&\left(M^{orth}_{\tau}(H)\right)(A,B) \\
	&= \sum_{\sigma \in \mathcal{L}_{U_{\tau}}, \sigma' \in \mathcal{L}_{V_{\tau}}}{H(\sigma,\sigma')\sum_{A',B'}{
			\sum_{R_1 \in \mathcal{R}(\sigma,A,A'), R_2 \in \mathcal{R}(\tau,A',B'), \atop {R_3 \in \mathcal{R}({\sigma'}^T,B',B), R_1,R_2,R_3 \text{ are properly composable}}}M_{R_1}(A,A')M_{R_2}(A',B')M_{R_3}(B',B)}} 
\end{align*}
This implies that $\left(M^{fact}_{\tau}(H)\right)(A,B) - \left(M^{orth}_{\tau}(H)\right)(A,B)$ is equal to
\[
\sum_{\sigma \in \mathcal{L}_{U_{\tau}}, \sigma' \in \mathcal{L}_{V_{\tau}}}{H(\sigma,\sigma')\sum_{A',B'}{
		\sum_{R_1 \in \mathcal{R}(\sigma,A,A'), R_2 \in \mathcal{R}(\tau,A',B'), \text{ and } R_3 \in \mathcal{R}({\sigma'}^T,B',B) \atop R_1,R_2,R_3 \text{ are not properly composable}}M_{R_1}(A,A')M_{R_2}(A',B')M_{R_3}(B',B)}} 
\]
Thus, to understand the difference between $M^{fact}$ and $M^{orth}$, we need to analyze the terms $\chi_{R_1}\chi_{R_2}\chi_{R_3} = \chi_{R_1 \circ R_2 \circ R_3}$ for ribbons $R_1,R_2,R_3$ which are composable but not properly composable. These terms, which we call intersection terms, are not negligible and must be analyzed carefully. In particular, we decompose each resulting ribbon $R = R_1 \circ R_2 \circ R_3$ into new left, middle, and right parts. We do this as follows:
\begin{enumerate}
	\item Let $V_{*}$ be the set of vertices which appear more than once in $V(R_1 \circ R_2 \circ R_3)$. In other words, $V_{*}$ is the set of vertices involved in the intersections between $R_1$, $R_2$, and $R_3$ (not counting the facts that $B_{R_1} = A_{R_2}$ and $B_{R_2} = A_{R_3}$ because we expect these intersections).
	\item Let $A'$ be the leftmost minimum vertex separator of $A_{R_1}$ and $B_{R_1} \cup V_{*}$ in $R_1$. We turn $A'$ into a matrix index by specifying an ordering $O_{A'}$ for the vertices in $A'$.
	\item Let $B'$ be the rightmost minimum vertex separator of $A_{R_3} \cup V_{*}$ and $B_{R_3}$ in $R_3$. We turn $B'$ into a matrix index by specifying an ordering $O_{B'}$ for the vertices in $B'$.
	\item Decompose $R_1$ as $R_1 = {R'}_1 \cup R_4$ where ${R'}_1$ is the part of $R_1$ between $A_{R_1}$ and $A'$ and $R_4$ is the part of $R_1$ between $A'$ and $B_{R_1} = A_{R_2}$. Similarly, decompose 
	$R_3$ as $R_3 = R_5 \cup {R'}_3 $ where $R_5$ is the part of $R_3$ between $A_{R_3} = B_{R_2}$ and $B'$ and ${R'}_3$ is the part of $R_3$ between $B'$ and $B_{R_3}$.
	\item Take $R'_2 = R_4 \circ R_2 \circ R_5$ and note that $R'_1 \circ R'_2 \circ R'_3 = R_1 \circ R_2 \circ R_3$. We view $R'_1,R'_2,R'_3$ as the left, middle, and right parts of $R = R_1 \circ R_2 \circ R_3$
\end{enumerate}
While we will verify our analysis by checking the coefficients of the ribbons, we want to express everything in terms of shapes. We use the following conventions for the names of the shapes:
\begin{enumerate}
	\item As usual, we let $\sigma$, $\tau$, and ${\sigma'}^T$ be the shapes of $R_1$, $R_2$, and $R_3$.
	\item We let $\gamma$ and ${\gamma'}^T$ be the shapes of $R_4$ and $R_5$.
	\item We let $\sigma_2$, $\tau_{P}$, and ${\sigma'_2}^T$ be the shapes of $R'_1$, $R'_2$, and $R'_3$. Here $P$ is the intersection pattern induced by $R_4$, $R_2$, and $R_5$ which we define in the next subsection.
\end{enumerate}
\begin{remark}
	A key feature of our analysis is that it will work the same way regardless of the shapes $\sigma_2,{\sigma'_2}^T$ of $R'_1$ and $R'_3$. In other words, if we replace $\sigma_2$ by $\sigma_{2a}$ and $\sigma'_2$ by $\sigma'_{2a}$ for a given intersection term, this just replaces $\sigma = \sigma_2 \cup \gamma$ with  $\sigma_{a} = \sigma_{2a} \cup \gamma$ and $\sigma' = \sigma'_2 \cup \gamma'$ with  $\sigma'_{a} = \sigma'_{2a} \cup \gamma'$. This allows us to focus on the shapes $\gamma$, $\tau$, and ${\gamma'}^T$ and is the reason why the $-\gamma,\gamma$ operation appears in our results.
\end{remark}
\subsection{Intersection Term Analysis}\label{intersectiontermanalysissection}
In this section, we implement our strategy for analyzing intersection terms. For simplicity, we only give rough definitions and proof sketches here. For a more rigorous treatment, see Appendix \ref{canonicalmapsection}.

We begin by defining intersection patterns which describe how the ribbons $R_1$, $R_2$, and $R_3$ intersect. 
\begin{definition}[Rough Definition of Intersection Patterns]\label{intersectionpatternroughdef}
	Given $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\}$, $\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}$, and ribbons $R_1$, $R_2$, and $R_3$ of shapes $\gamma$, $\tau$, and ${\gamma'}^T$ which are composable but not properly composable, we define the intersection pattern $P$ induced by $R_1$, $R_2$, and $R_3$ and the resulting shape $\tau_P$ as follows:
	\begin{enumerate}
		\item We take $V(P) = V(\gamma \circ \tau \circ {\gamma'}^T)$.
		\item We take $E(P)$ to be the set of edges $(u,v)$ such that $u,v$ are distinct vertices in $V(\gamma \circ \tau \circ {\gamma'}^T)$ but $u$ and $v$ correspond to the same vertex in $R_1 \circ R_2 \circ R_3$.
		\item We define $\tau_{P}$ to be the shape of the ribbon $R = R_1 \circ R_2 \circ R_3$
	\end{enumerate}
\end{definition}
\begin{definition}\label{setPdefinition}
	Given $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\}$, and $\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}$, we define $\mathcal{P}_{\gamma,\tau,{\gamma'}^T}$ to be the set of all possible intersection patterns $P$ which can be induced by ribbons $R_1$, $R_2$, and $R_3$ of shapes $\gamma$, $\tau$, and ${\gamma'}^T$.
\end{definition}
\begin{remark}
	Note that if $\gamma = Id_{U_{\tau}}$ and $\gamma' = Id_{V_{\tau}}$ then $\mathcal{P}_{\gamma,\tau,{\gamma'}^T} = \emptyset$ as every intersection pattern must have an unexpected intersection so either $\gamma$ or $\gamma'$ must be non-trivial.
\end{remark}
It would be nice if the intersection pattern $P$ together with the ribbon $R$ allowed us to recover the original ribbons $R_1$, $R_2$, and $R_3$. Unfortunately, it is possible for different triples of ribbons to result in the same intersection pattern $P$ and ribbon $R$. That said, the number of such triples cannot be too large, and this is sufficient for our purposes.
\begin{definition}
	Given an intersection pattern $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$, let $R$ be a ribbon of shape $\tau_{P}$. We define $N(P)$ to be the number of different triples of ribbons $R_1,R_2,R_3$ such that $R_1 \circ R_2 \circ R_3 = R$ and $R_1,R_2,R_3$ induce the intersection pattern $P$.
\end{definition}
\begin{lemma}
	For all intersection patterns $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$, $N(P) \leq |V(\tau_{P})|^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma') \setminus U_{\gamma'}|}$
\end{lemma}
\begin{proof}[Proof sketch]
	This can be proved by making the following observations:
	\begin{enumerate}
		\item $A_{R_1} = A_{R}$ and $B_{R_3} = B_{R}$.
		\item All of the remaining vertices in $V(R_1)$ and $V(R_3)$ must be equal to some vertex in $V(R)$.
		\item Once $R_1$ and $R_3$ are determined, there is at most one ribbon $R_2$ such that $R_1,R_2,R_3$ are composable, $R = R_1 \circ R_2 \circ R_3$, and $R_1,R_2,R_3$ induce the intersection pattern $P$.
	\end{enumerate}
\end{proof}
With these definitions, we can now analyze the intersection terms.
\begin{definition}
	Given a left shape $\sigma$, define $e_{\sigma}$ to be the vector which has a $1$ in coordinate $\sigma$ and has a $0$ in all other coordinates.
\end{definition}
\begin{lemma}\label{lm:singleshapeintersections}
	For all $\tau \in \mathcal{M}'$, $\sigma \in \mathcal{L}_{U_{\tau}}$, and $\sigma' \in \mathcal{L}_{V_{\tau}}$, 
	\begin{align*}
		&M^{fact}_{\tau}(e_{\sigma}e^T_{\sigma'}) - M^{orth}_{\tau}(e_{\sigma}e^T_{\sigma'}) = \sum_{\sigma_2 \in \mathcal{L}, \gamma \in \Gamma: \sigma_2 \circ \gamma = \sigma}{\frac{1}{|Aut(U_{\gamma})|}\sum_{P \in \mathcal{P}_{\gamma,\tau,Id_{V_{\tau}}}}N(P)M^{orth}_{\tau_P}(e_{\sigma_2}e^T_{\sigma'})} \\
		&+ \sum_{\sigma'_2 \in \mathcal{L}, \gamma' \in \Gamma: \sigma'_2 \circ \gamma' = \sigma'}{\frac{1}{|Aut(U_{\gamma'})|}\sum_{P \in \mathcal{P}_{Id_{U_{\tau}},\tau,{\gamma'}^T}}N(P)M^{orth}_{\tau_P}(e_{\sigma}e^T_{\sigma'_2})} \\
		&+ \sum_{\sigma_2 \in \mathcal{L}, \gamma \in \Gamma: \sigma_2 \circ \gamma = \sigma}{\sum_{\sigma'_2 \in \mathcal{L}, \gamma' \in \Gamma: \sigma'_2 \circ \gamma' = \sigma'}{
				\frac{1}{|Aut(U_{\gamma})|\cdot|Aut(U_{\gamma'})|}\sum_{P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}}N(P)M^{orth}_{\tau_P}(e_{\sigma_2}e^T_{\sigma'_2})}}
	\end{align*}
\end{lemma}
\begin{proof}[Proof sketch]
	This lemma follows from the following bijection. Consider the third term
	\[
	\sum_{\sigma_2 \in \mathcal{L}, \gamma \in \Gamma: \sigma_2 \circ \gamma = \sigma}{\sum_{\sigma'_2 \in \mathcal{L}, \gamma' \in \Gamma: \sigma'_2 \circ \gamma' = \sigma'}{
			\frac{1}{|Aut(U_{\gamma})|\cdot|Aut(U_{\gamma'})|}\sum_{P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}}N(P)M^{orth}_{\tau_P}(e_{\sigma_2}e^T_{\sigma'_2})}}
	\]
	On one side, we have the following data:
	\begin{enumerate}
		\item Ribbons $R_1$, $R_2$, and $R_3$ of shapes $\sigma,\tau,{\sigma'}^T$ such that $R_1,R_2,R_3$ are composable but $R_1$ and $R_2 \circ R_3$ are not properly composable (i.e. $R_1$ has an unexpected intersection with $R_2$ and/or $R_3$) and $R_1 \circ R_2$ and $R_3$ are not properly composable (i.e. $R_3$ has an unexpected intersection with $R_1$ and/or $R_2$).
		\item An ordering $O_{A'}$ on the leftmost minimum vertex separator $A'$ of $A_{R_1}$ and $V_{*} \cup B_{R_1}$ (recall that $V_{*}$ is the set of vertices which appear more than once in $V(R_1 \circ R_2 \circ R_3)$).
		\item An ordering $O_{B'}$ on the rightmost minimum vertex separator $B'$ of $V_{*} \cup A_{R_3}$ and $B_{R_3}$.
	\end{enumerate}
	On the other side, we have the following data
	\begin{enumerate}
		\item An intersection pattern $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$ where $\gamma$ and ${\gamma'}^T$ are non-trivial.
		\item Ribbons $R'_1$, $R'_2$, $R'_3$ of shapes $\sigma_2$, $\tau_P$, ${\sigma'_2}^T$ which are properly composable
		\item A number in $[N(P)]$ describing which possible triple of ribbons resulted in the intersection pattern $P$ and the ribbon $R'_2$.
	\end{enumerate}
	To see this bijection, note that given the data on the first side, we can recover the ribbons $R'_1$, $R'_2$, and $R'_3$ as follows:
	\begin{enumerate}
		\item We decompose $R_1$ as $R_1 = R'_1 \circ R_4$ where $B_{R'_1} = A_{R_4} = A'$ with the ordering $O_{A'}$.
		\item We decompose $R_3$ as $R_3 = R_5 \circ R'_3$ where $B_{R_5} = A_{R'_3} = B'$ with the ordering $O_{B'}$.
		\item We take $R'_2 = R_4 \circ R_2 \circ R_5$.
	\end{enumerate}
	The intersection pattern $P$ and the number in $[N(P)]$ can be obtained from $R_1$, $R_2$, and $R_3$.
	
	Conversely, with the data on the other side, we can recover the data on the first side as follows:
	\begin{enumerate}
		\item $R'_2$ gives an ordering $O_{A'}$ for $A' = A_{R'_2}$ and an ordering $O_{B'}$ for $B' = B_{R'_2}$.
		\item The ribbon $R'_2$, intersection pattern $P$, and number in $[N(P)]$ allow us to recover $R_4$, $R_2$, and $R_5$.
		\item We take $R_1 = R'_1 \circ R_4$ and $R_3 = R_5 \circ R'_3$.
	\end{enumerate}
	Thus, both sides have the same coefficient for each ribbon.
	
	The analysis for the first term is the same except that when $\gamma'$ is trivial, we always take $\gamma' = Id_{V_{\tau}}$. Thus, we always have that $B' = B_{R'_2} = B_{R_2}$ (with the same ordering) and $R'_3 = R_3 = Id_{B'}$. Because of this, there is no need to specify $R_3$, $R'_3$, $R_5$, or an ordering on $B'$.
	
	Similarly, the analysis for the second term is the same except that when $\gamma$ is trivial, we always take $\gamma = Id_{U_{\tau}}$. Thus, we always have that $A' = A_{R'_2} = A_{R_2}$ (with the same ordering) and $R'_1 = R_1 = Id_{A'}$. Because of this, there is no need to specify $R_1$, $R'_1$, $R_4$, or an ordering on $A'$.
\end{proof}
Applying Lemma \ref{lm:singleshapeintersections} for all $\sigma$ and $\sigma'$ simultaneously, we obtain the following corollary.
\begin{definition}
	For all $U,V \in \mathcal{I}_{mid}$, given a $\gamma \in \Gamma_{U,V}$ and a vector $v$ indexed by left shapes $\sigma \in \mathcal{L}_V$, define $v^{-\gamma}$ to be the vector indexed by left shapes $\sigma_2 \in \mathcal{L}_{U}$ such that $v^{-\gamma}(\sigma_2) = v(\sigma_2 \circ \gamma)$ if $\sigma_2 \circ \gamma \in \mathcal{L}_V$ and $v^{-\gamma}(\sigma_2) = 0$ otherwise.
\end{definition}
\begin{proposition}
	For all composable $\gamma_2,\gamma_1 \in \Gamma$ and all vectors $v$ indexed by left shapes in $\mathcal{L}_{V_{\gamma_1}}$, $(v^{-\gamma_1})^{-\gamma_2} = v^{-\gamma_2 \circ \gamma_1}$
\end{proposition}
\begin{corollary}\label{cor:singlestepintersections}
	For all $\tau \in \mathcal{M}'$, for all left $\tau$-vectors $v$ and all right $\tau$-vectors $w$, 
	\begin{align*}
		&M^{orth}_{\tau}(vw^T) = M^{fact}_{\tau}(vw^T) - \sum_{\gamma \in \Gamma_{*,U_{\tau}}}{\frac{1}{|Aut(U_{\gamma})|}\sum_{P \in \mathcal{P}_{\gamma,\tau,Id_{V_{\tau}}}}N(P)M^{orth}_{\tau_P}(v^{-\gamma}w^T)} \\
		&- \sum_{\gamma' \in \Gamma_{*,V_{\tau}}}{\frac{1}{|Aut(U_{\gamma'})|}\sum_{P \in \mathcal{P}_{Id_{U_{\tau}},\tau,{\gamma'}^T}}N(P)M^{orth}_{\tau_P}(v(w^{-\gamma'})^T)} \\
		&- \sum_{\gamma \in \Gamma_{*,U_{\tau}}}{\sum_{\gamma' \in \Gamma_{*,V_{\tau}}}{
				\frac{1}{|Aut(U_{\gamma})|\cdot|Aut(U_{\gamma'})|}\sum_{P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}}N(P)M^{orth}_{\tau_P}(v^{-\gamma}(w^{-\gamma'})^T)}}
	\end{align*}
\end{corollary}
Applying Corollary \ref{cor:singlestepintersections} iteratively, we obtain the following theorem:
\begin{definition}\label{multigammadefinition}
	Given $\gamma,\gamma' \in \Gamma \cup \{Id_U:U \in \mathcal{I}_{mid}\}$ and $j > 0$, let $\Gamma_{\gamma,\gamma',j}$ be the set of all $\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma \cup \{Id_U:U \in \mathcal{I}_{mid}\}$ such that:
	\begin{enumerate}
		\item $\gamma_j,\ldots,\gamma_1$ are composable and $\gamma_j \circ \ldots \circ \gamma_1 = \gamma$
		\item $\gamma'_j,\ldots,\gamma'_1$ are composable and $\gamma'_j \circ \ldots \circ \gamma'_1 = \gamma'$
		\item For all $i \in [1,j]$, $\gamma_i$ or $\gamma'_i$ is non-trivial (i.e. $\gamma_i \neq Id_{U_{\gamma_i}}$ or $\gamma'_i \neq Id_{U_{\gamma'_i}}$).
	\end{enumerate}
\end{definition}
\begin{remark}
	Note that if $\gamma = Id_{U}$ and $\gamma' = Id_{V}$ then for all $j > 0$, $\Gamma_{\gamma,\gamma',j} = \emptyset$.
\end{remark}
\begin{theorem}\label{thm:mfactmorthdifference}
	For all $\tau \in \mathcal{M}'$, left $\tau$-vectors $v$, and right $\tau$-vectors $w$, 
	\begin{align*}
		&M^{orth}_{\tau}(v{w^T}) = M^{fact}_{\tau}(v{w^T}) + \\
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\},\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}: \atop \gamma \text{ or } \gamma' \text{ is non-trivial }}\sum_{j>0}{(-1)^{j}\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
				\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}} \\
		&\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)M^{fact}_{\tau_{P_j}}(v^{-\gamma}{(w^{-\gamma'})^T})}
	\end{align*}
	where we take $\tau_{P_0} = \tau$.
\end{theorem}
\subsection{Bounding the difference between $M^{fact}$ and $M^{orth}$}\label{boundingdifferencesection}
In this subsection, we bound the difference between $M^{fact}_{\tau}(H_{\tau})$ and $M^{orth}_{\tau}(H_{\tau})$. We recall the following conditions on $B(\gamma)$, $N(\gamma)$, and $c(\gamma)$:
\begin{enumerate}
	\item For all $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\}$, and $\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}$,
	\begin{align*}
		&\sum_{j>0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\left(\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}\right)
				\left(\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}\right)}}\\
		&\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)} \leq \frac{N(\gamma)N(\gamma')}{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}
	\end{align*}
	\item For all $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}}$, and $\gamma' \in \Gamma_{*,V_{\tau}}$, for all $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$, 
	$B_{norm}(\tau_{P}) \leq B(\gamma)B(\gamma')B_{norm}(\tau)$
	\item $\forall V \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{*,V}}{\frac{1}{|Aut(U_{\gamma})|c(\gamma)}} \leq \varepsilon' \leq \frac{1}{20}$
\end{enumerate}
With these conditions, we can now bound the difference between $M^{fact}$ and $M^{orth}$.
\begin{lemma}\label{keyboundinglemma}
	If the norm bounds and the conditions on $B(\gamma)$, $N(\gamma)$, and $c(\gamma)$ hold then for all $\tau \in \mathcal{M}'$, left $\tau$-vectors $v$, and right $\tau$-vectors $w$, 
	\begin{align*}
		&\left(M^{fact}_{\tau}(v{w^T}) + M^{fact}_{{\tau}^T}(w{v^T})\right) - \left(M^{orth}_{\tau}(v{w^T}) + M^{orth}_{{\tau}^T}(w{v^T})\right) \preceq \\
		&\\
		&{\varepsilon'}B_{norm}(\tau)M^{fact}_{Id_{U_{\tau}}}(vv^{T}) + 
		2\sum_{\gamma \in \Gamma_{*,U_{\tau}}}{\frac{B(\gamma)^{2}N(\gamma)^{2}B_{norm}(\tau){c(\gamma)}}{|Aut(U_{\gamma})|}M^{fact}_{Id_{U_{\gamma}}}(v^{-\gamma}(v^{-\gamma})^T)} + \\
		&{\varepsilon'}B_{norm}(\tau)M^{fact}_{Id_{V_{\tau}}}(ww^{T}) + 
		2\sum_{\gamma' \in \Gamma_{*,V_{\tau}}}{\frac{B(\gamma')^{2}N(\gamma')^{2}B_{norm}(\tau){c(\gamma')}}{|Aut(U_{\gamma'})|}M^{fact}_{Id_{U_{\gamma'}}}(w^{-\gamma'}(w^{-\gamma'})^T)}
	\end{align*}
\end{lemma}
\begin{proof}
	By Theorem \ref{thm:mfactmorthdifference}, taking $\tau_{P_0} = \tau$, 
	\begin{align*}
		&M^{orth}_{\tau}(v{w^T}) = M^{fact}_{\tau}(v{w^T}) + \\
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\},\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}: \atop \gamma \text{ or } \gamma' \text{ is non-trivial }}\sum_{j>0}{(-1)^{j}\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
				\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}} \\
		&\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)M^{fact}_{\tau_{P_j}}(v^{-\gamma}{(w^{-\gamma'})^T})}
	\end{align*}
	Taking the transpose of this equation gives
	\begin{align*}
		&M^{orth}_{{\tau}^T}(w{v^T}) = M^{fact}_{{\tau}^T}(w{v^T}) + \\
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\},\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}: \atop \gamma \text{ or } \gamma' \text{ is non-trivial }}\sum_{j>0}{(-1)^{j}\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
				\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}} \\
		&\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)M^{fact}_{\tau^{T}_{P_j}}({w^{-\gamma'}(v^{-\gamma})^T})}
	\end{align*}
	Now observe that by Lemma \ref{lm:rankonetosquares}, if the norm bounds hold, 
	\begin{align*}
		&{\pm}\left(M^{fact}_{\tau_{P_j}}(v^{-\gamma}{(w^{-\gamma'})^T}) + M^{fact}_{\tau^{T}_{P_j}}({w^{-\gamma'}(v^{-\gamma})^T})\right) = \\
		&{\pm}M^{fact}_{\tau_{P_j}}\left(\left(\sqrt{\frac{N(\gamma)B(\gamma){c(\gamma)}}{N(\gamma')B(\gamma'){c(\gamma')}}}v^{-\gamma}\right)
		\left(\sqrt{\frac{N(\gamma')B(\gamma'){c(\gamma')}}{N(\gamma)B(\gamma){c(\gamma)}}}{(w^{-\gamma'})^T}\right)\right) \pm \\
		&M^{fact}_{\tau^{T}_{P_j}}\left(\left(\sqrt{\frac{N(\gamma')B(\gamma'){c(\gamma')}}{N(\gamma)B(\gamma){c(\gamma)}}}w^{-\gamma'}\right)
		\left(\sqrt{\frac{N(\gamma)B(\gamma){c(\gamma)}}{N(\gamma')B(\gamma'){c(\gamma')}}}{(v^{-\gamma})^T}\right)\right) \preceq \\
		&B_{norm}(\tau_{P_j})\left(\frac{N(\gamma)B(\gamma){c(\gamma)}}{N(\gamma')B(\gamma'){c(\gamma')}}M^{fact}_{Id_{U_{\gamma}}}(v^{-\gamma}(v^{-\gamma})^T) + 
		\frac{N(\gamma')B(\gamma'){c(\gamma')}}{N(\gamma)B(\gamma){c(\gamma)}}M^{fact}_{Id_{U_{\gamma'}}}(w^{-\gamma'}(w^{-\gamma'})^T)\right)
	\end{align*}
	Combining these equations,
	\begin{align*}
		&\left(M^{fact}_{\tau}(v{w^T}) + M^{fact}_{{\tau}^T}(w{v^T})\right) - \left(M^{orth}_{\tau}(v{w^T}) + M^{orth}_{{\tau}^T}(w{v^T})\right) \preceq \\
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\},\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}: \atop \gamma \text{ or } \gamma' \text{ is non-trivial }}\sum_{j>0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
				\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}} \\
		&\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)
			B_{norm}(\tau_{P_j})} \\
		&\left(\frac{N(\gamma)B(\gamma){c(\gamma)}}{N(\gamma')B(\gamma'){c(\gamma')}}M^{fact}_{Id_{U_{\gamma}}}(v^{-\gamma}(v^{-\gamma})^T) + 
		\frac{N(\gamma')B(\gamma'){c(\gamma')}}{N(\gamma)B(\gamma){c(\gamma)}}M^{fact}_{Id_{U_{\gamma'}}}(w^{-\gamma'}(w^{-\gamma'})^T)\right)
	\end{align*}
	From the conditions on $B(\gamma)$ and $N(\gamma)$, 
	\begin{enumerate}
		\item $B_{norm}(\tau_{P_j}) \leq B(\gamma)B(\gamma')B_{norm}(\tau)$
		\item
		\begin{align*}
			&\sum_{j>0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\left(\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}\right)
					\left(\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}\right)}}\\
			&\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)} \leq \frac{N(\gamma)N(\gamma')}{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}
		\end{align*}
	\end{enumerate}
	Putting these equations together, 
	\begin{align*}
		&\left(M^{fact}_{\tau}(v{w^T}) + M^{fact}_{{\tau}^T}(w{v^T})\right) - \left(M^{orth}_{\tau}(v{w^T}) + M^{orth}_{{\tau}^T}(w{v^T})\right) \preceq \\
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\},\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}: \atop \gamma \text{ or } \gamma' \text{ is non-trivial }}{\frac{B(\gamma)^{2}N(\gamma)^{2}B_{norm}(\tau){c(\gamma)}}{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}{c(\gamma')}}M^{fact}_{Id_{U_{\gamma}}}(v^{-\gamma}(v^{-\gamma})^T)} + \\
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\},\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}: \atop \gamma \text{ or } \gamma' \text{ is non-trivial }}{\frac{B(\gamma')^{2}N(\gamma')^{2}B_{norm}(\tau){c(\gamma')}}{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}{c(\gamma)}}M^{fact}_{Id_{U_{\gamma'}}}(w^{-\gamma'}(w^{-\gamma'})^T)}
	\end{align*}
	Now observe that
	\begin{align*}
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\},\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}: \atop \gamma \text{ or } \gamma' \text{ is non-trivial }}{\frac{B(\gamma)^{2}N(\gamma)^{2}B_{norm}(\tau){c(\gamma)}}{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}{c(\gamma')}}M^{fact}_{Id_{U_{\gamma}}}(v^{-\gamma}(v^{-\gamma})^T)} \preceq \\
		&\left(\sum_{\gamma' \in \Gamma_{*,V_{\tau}}}{\frac{1}{|Aut(U_{\gamma'})|{c(\gamma')}}}\right)B_{norm}(\tau)M^{fact}_{Id_{U_{\tau}}}(vv^{T}) + \\
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}}}{\left(\sum_{\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}}{\frac{1}{(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}{c(\gamma')}}}\right)\frac{B(\gamma)^{2}N(\gamma)^{2}B_{norm}(\tau){c(\gamma)}}{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}}M^{fact}_{Id_{U_{\gamma}}}(v^{-\gamma}(v^{-\gamma})^T)}  \preceq \\
		&{\varepsilon'}B_{norm}(\tau)M^{fact}_{Id_{U_{\tau}}}(vv^{T}) + 
		2\sum_{\gamma \in \Gamma_{*,U_{\tau}}}{\frac{B(\gamma)^{2}N(\gamma)^{2}B_{norm}(\tau){c(\gamma)}}{|Aut(U_{\gamma})|}M^{fact}_{Id_{U_{\gamma}}}(v^{-\gamma}(v^{-\gamma})^T)}
	\end{align*}
	Following similar logic, 
	\begin{align*}
		&\sum_{\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\},\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}: \atop \gamma \text{ or } \gamma' \text{ is non-trivial }}{\frac{B(\gamma')^{2}N(\gamma')^{2}B_{norm}(\tau){c(\gamma')}}{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}{c(\gamma)}}M^{fact}_{Id_{U_{\gamma'}}}(w^{-\gamma'}(w^{-\gamma'})^T)} \preceq \\
		&{\varepsilon'}B_{norm}(\tau)M^{fact}_{Id_{V_{\tau}}}(ww^{T}) + 
		2\sum_{\gamma' \in \Gamma_{*,V_{\tau}}}{\frac{B(\gamma')^{2}N(\gamma')^{2}B_{norm}(\tau){c(\gamma')}}{|Aut(U_{\gamma'})|}M^{fact}_{Id_{U_{\gamma'}}}(w^{-\gamma'}(w^{-\gamma'})^T)}
	\end{align*}
	Putting everything together,
	\begin{align*}
		&\left(M^{fact}_{\tau}(v{w^T}) + M^{fact}_{{\tau}^T}(w{v^T})\right) - \left(M^{orth}_{\tau}(v{w^T}) + M^{orth}_{{\tau}^T}(w{v^T})\right) \preceq \\
		&\\
		&{\varepsilon'}B_{norm}(\tau)M^{fact}_{Id_{U_{\tau}}}(vv^{T}) + 
		2\sum_{\gamma \in \Gamma_{*,U_{\tau}}}{\frac{B(\gamma)^{2}N(\gamma)^{2}B_{norm}(\tau){c(\gamma)}}{|Aut(U_{\gamma})|}M^{fact}_{Id_{U_{\gamma}}}(v^{-\gamma}(v^{-\gamma})^T)} + \\
		&{\varepsilon'}B_{norm}(\tau)M^{fact}_{Id_{V_{\tau}}}(ww^{T}) + 
		2\sum_{\gamma' \in \Gamma_{*,V_{\tau}}}{\frac{B(\gamma')^{2}N(\gamma')^{2}B_{norm}(\tau){c(\gamma')}}{|Aut(U_{\gamma'})|}M^{fact}_{Id_{U_{\gamma'}}}(w^{-\gamma'}(w^{-\gamma'})^T)}
	\end{align*}
	as needed.
\end{proof}
Using Lemma \ref{keyboundinglemma} we have the following corollaries:
\begin{corollary}\label{notauintersectiontermcorollary}
	For all $U \in \mathcal{I}_{mid}$, if the norm bounds and the conditions on $B(\gamma)$, $N(\gamma)$, and $c(\gamma)$ hold and $H_{Id_U} \succeq 0$ then  
	\[
	M^{fact}_{Id_{U}}(H_{Id_U}) - M^{orth}_{Id_{U}}(H_{Id_U}) \preceq {\varepsilon'}M^{fact}_{Id_{U}}(H_{Id_U}) + 
	2\sum_{\gamma \in \Gamma_{*,U}}{\frac{B(\gamma)^{2}N(\gamma)^{2}{c(\gamma)}}{|Aut(U_{\gamma})|}M^{fact}_{Id_{U_{\gamma}}}(H^{-\gamma,\gamma}_{Id_{U}})}
	\]
\end{corollary}
\begin{corollary}\label{yestauintersectiontermcorollary}
	For all $U \in \mathcal{I}_{mid}$ and all $\tau \in \mathcal{M}_{U}$, if the norm bounds and the conditions on $B(\gamma)$, $N(\gamma)$, and $c(\gamma)$ hold and 
	\[
	\left[ {\begin{array}{cc}
			\frac{1}{|Aut(U)|c(\tau)}H_{Id_U} & B_{norm}(\tau)H_{\tau} \\
			B_{norm}(\tau)H^T_{\tau} & \frac{1}{|Aut(U)|c(\tau)}H_{Id_U}
	\end{array}} \right] \succeq 0
	\]
	then
	\begin{align*}
		&\left(M^{fact}_{\tau}(H_{\tau}) + M^{fact}_{{\tau}^T}(H^{T}_{\tau})\right) - \left(M^{orth}_{\tau}(H_{\tau}) + M^{orth}_{{\tau}^T}(H^{T}_{\tau})\right) \preceq \\
		&\\
		&2{\varepsilon'}\frac{1}{|Aut(U)|c(\tau)}M^{fact}_{Id_{U}}(H_{Id_U}) + 
		4\sum_{\gamma \in \Gamma_{*,U}}{\frac{B(\gamma)^{2}N(\gamma)^{2}{c(\gamma)}}{|Aut(U_{\gamma})|\cdot|Aut(U)|c(\tau)}M^{fact}_{Id_{U_{\gamma}}}(H_{Id_{U}}^{-\gamma,\gamma})}
	\end{align*}
\end{corollary}
\subsection{Proof of the Main Theorem}
We now prove the following theorem which is a slight modification of Theorem \ref{maintheoremviaproperties} and which implies Theorem \ref{maintheoremviaproperties}.
\begin{theorem}\label{maintheoremviapropertiescopy}
	For all $\varepsilon > 0$ and all $\varepsilon' \in (0,\frac{1}{20}]$, for any moment matrix 
	\[
	\Lambda = \sum_{U \in \mathcal{I}_{mid}}{M^{orth}_{Id_U}(H_{Id_U})} + \sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{M^{orth}_{\tau}(H_{\tau})}},
	\]
	if we have that for all $\alpha \in \mathcal{M}', ||M_{\alpha}|| \leq B_{norm}(\alpha)$ and $B(\gamma)$, $N(\gamma)$, and $c(\alpha)$ are functions such that 
	\begin{enumerate}
		\item For all $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}}$, $\gamma' \in \Gamma_{*,V_{\tau}}$, and all intersection patterns $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$, 
		\[
		B_{norm}(\tau_{P}) \leq B(\gamma)B(\gamma')B_{norm}(\tau)
		\]
		\item For all composable $\gamma_1,\gamma_2$, $B(\gamma_1)B(\gamma_2) = B(\gamma_1 \circ \gamma_2)$.
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{U,*}}{\frac{1}{|Aut(U)|c(\gamma)}} < \varepsilon'$ 
		\item $\forall V \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{*,V}}{\frac{1}{|Aut(U_{\gamma})|c(\gamma)}} < \varepsilon'$ 
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\tau \in \mathcal{M}_{U}}{\frac{1}{|Aut(U)|c(\tau)}} < \varepsilon'$
		\item For all $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\}$, and $\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}$,
		\begin{align*}
			&\sum_{j>0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
					\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}}\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)} \\
			&\leq \frac{N(\gamma)N(\gamma')}
			{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}
		\end{align*}
	\end{enumerate}
	and we have SOS-symmetric coefficient matrices $\{H'_{\gamma}: \gamma \in \Gamma\}$ such that the following conditions hold:
	\begin{enumerate}
		\item For all $U \in \mathcal{I}_{mid}$,  $H_{Id_{U}} \succeq 0$
		\item For all $U \in \mathcal{I}_{mid}$ and $\tau \in \mathcal{M}_U$,
		\[
		\left[ {\begin{array}{cc}
				\frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}} & B_{norm}(\tau)H_{\tau} \\
				B_{norm}(\tau)H^T_{\tau} & \frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}}
		\end{array}} \right] \succeq 0
		\]
		\item For all $U,V \in \mathcal{I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U,V}$, 
		\[
		c(\gamma)^2{N(\gamma)}^2{B(\gamma)^2}H^{-\gamma,\gamma}_{Id_{V}} \preceq H'_{\gamma}
		\]
	\end{enumerate}
	then 
	\[
	\Lambda \succeq \frac{1}{2}\left(\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}}\right) - 3\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
	\]
	If it is also true that 
	\[
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
	\]
	then $\Lambda \succeq 0$.
\end{theorem}
\begin{proof}
	We make the following observations:
	\begin{enumerate}
		\item By Theorem \ref{thm:nointersectionanalysis},
		\[
		\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})} + \sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{M^{fact}_{\tau}(H_{\tau})} \succeq (1-2\varepsilon')\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})}}
		\]
		\item By Corollary \ref{notauintersectiontermcorollary}, 
		\[
		\sum_{U \in \mathcal{I}_{mid}}{\left(M^{fact}_{Id_{U}}(H_{Id_U}) - M^{orth}_{Id_{U}}(H_{Id_U})\right)} \preceq {\varepsilon'}\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_{U}}(H_{Id_U})} + 
		2\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{*,U}}{\frac{M^{fact}_{Id_{U_{\gamma}}}(H'_{\gamma})}{c(\gamma)|Aut(U_{\gamma})|}}}
		\]
		\item By Corollary \ref{yestauintersectiontermcorollary}, 
		\begin{align*}
			&\sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{\left(M^{fact}_{\tau}(H_{\tau}) - M^{orth}_{\tau}(H_{\tau})\right)}} \preceq \\
			&\\
			&\sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{\left(\frac{2{\varepsilon'}}{|Aut(U)|c(\tau)}M^{fact}_{Id_{U}}(H_{Id_U}) + 
					4\sum_{\gamma \in \Gamma_{*,U}}{\frac{B(\gamma)^{2}N(\gamma)^{2}{c(\gamma)}}{|Aut(U_{\gamma})|\cdot|Aut(U)|c(\tau)}M^{fact}_{Id_{U_{\gamma}}}(H_{Id_{U}}^{-\gamma,\gamma})}\right)}} \preceq \\
			&2{\varepsilon'}^2\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_{U}}(H_{Id_U})} + 
			4{\varepsilon'}\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{*,U}}{\frac{M^{fact}_{Id_{U_{\gamma}}}(H'_{\gamma})}{c(\gamma)|Aut(U_{\gamma})|}}}
		\end{align*}
		\item
		\begin{align*}
			&\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{*,U}}{\frac{M^{fact}_{Id_{U_{\gamma}}}(H'_{\gamma})}{c(\gamma)|Aut(U_{\gamma})|}}} = 
			\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{*,U}}{\frac{M^{fact}_{Id_{U_{\gamma}}}(H_{Id_{U_\gamma}}) + \left(M^{fact}_{Id_{U_{\gamma}}}(H'_{\gamma}) - M^{fact}_{Id_{U_{\gamma}}}(H_{Id_{U_\gamma}})\right)}{c(\gamma)|Aut(U_{\gamma})|}}} \preceq \\
			&\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{*,U}}{\frac{M^{fact}_{Id_{U_{\gamma}}}(H_{Id_{U_{\gamma}}})}{c(\gamma)|Aut(U_{\gamma})|}}} + 
			\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U_{\gamma}}}(H'_{\gamma},H_{Id_{U_{\gamma}}})}{|Aut(U_{\gamma})|c(\gamma)}}}\right)Id_{sym} \preceq \\
			&{\varepsilon'}\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_{U}}(H_{Id_{U}})} + 
			\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U_{\gamma}}}(H'_{\gamma},H_{Id_{U_{\gamma}}})}{|Aut(U_{\gamma})|c(\gamma)}}}\right)Id_{sym}
		\end{align*}
	\end{enumerate}
	Putting everything together, 
	\begin{align*}
		&\Lambda = \sum_{U \in \mathcal{I}_{mid}}{M^{orth}_{Id_U}(H_{Id_U})} + \sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{M^{orth}_{\tau}(H_{\tau})}} = \\
		&\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})} + \sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{M^{fact}_{\tau}(H_{\tau})}} - 
		\sum_{U \in \mathcal{I}_{mid}}{\left(M^{fact}_{Id_{U}}(H_{Id_U}) - M^{orth}_{Id_{U}}(H_{Id_U})\right)} - \\
		&\sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{\left(M^{fact}_{\tau}(H_{\tau}) - M^{orth}_{\tau}(H_{\tau}) \right)}} \succeq \\
		&(1 - 3{\varepsilon'} - 2{\varepsilon'}^2)\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})} - 
		(2 + 4\varepsilon')\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{*,U}}{\frac{M^{fact}_{Id_{U_{\gamma}}}(H'_{\gamma})}{c(\gamma)|Aut(U_{\gamma})|}}} \succeq \\
		&(1 - 5{\varepsilon'} - 6{\varepsilon'}^2)\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})} - 
		(2 + 4\varepsilon')\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U_{\gamma}}}(H'_{\gamma},H_{Id_{U_{\gamma}}})}{|Aut(U_{\gamma})|c(\gamma)}}}\right)Id_{sym} \succeq \\
		&\frac{1}{2}\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}(H_{Id_U})} - 
		3\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U_{\gamma}}}(H'_{\gamma},H_{Id_{U_{\gamma}}})}{|Aut(U_{\gamma})|c(\gamma)}}}\right)Id_{sym}
	\end{align*}
\end{proof}

\subsection{Theorem Statements}
Recall the following definitions from Section \ref{quantitativetheoremstatementsection}.
\begin{definition}
	We define $S_{\alpha}$ to be the leftmost minimum vertex separator of $\alpha$
\end{definition}
\begin{definition}[Simplified Isolated Vertices]
	Under our simplifying assumptions, we define 
	\[
	I_{\alpha} = \{v \in W_{\alpha}: v \text{ is not incident to any edges in } E(\alpha)\}
	\]
\end{definition}
\begin{theorem}[Simplified $B_{norm}(\alpha)$, $B(\gamma)$, $N(\gamma)$, and $c(\alpha)$]\label{simplifiedfunctionstheorem}
	Under our simplifying assumptions, for all $\varepsilon, \varepsilon' > 0$ and all $D_V \in \mathbb{N}$, if we take
	\begin{enumerate}
		\item $q = 3\left\lceil{{D_V}ln(n) + \frac{ln(\frac{1}{\varepsilon})}{3} + {D_V}ln(5) + 3{D^2_V}ln(2)}\right\rceil$
		\item $B_{vertex} = 6{D_V}\sqrt[4]{2eq}$
		\item $B_{norm}(\alpha) = {B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}$
		\item $B(\gamma) = B_{vertex}^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}|}n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}$
		\item $N(\gamma) = (3D_V)^{2|V(\gamma) \setminus V_{\gamma}| + |V(\gamma) \setminus U_{\gamma}|}$
		\item $c(\alpha) = \frac{5(3D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + 2|E(\alpha)|}2^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}}{\varepsilon'}$
	\end{enumerate}
	then the following conditions hold:
	\begin{enumerate}
		\item With probability at least $(1-\varepsilon)$, $\forall \alpha \in \mathcal{M}'$, $||M_{\alpha}|| \leq B_{norm}(\alpha)$
		\item For all $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\}$, $\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}$, and intersection patterns $P \in \mathcal{P}_{\gamma,\tau,\gamma'}$,  
		\[
		B_{norm}(\tau_{P}) \leq B(\gamma)B(\gamma')B_{norm}(\tau)
		\]
		\item For all composable $\gamma_1,\gamma_2$, $B(\gamma_1)B(\gamma_2) = B(\gamma_1 \circ \gamma_2)$.
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{U,*}}{\frac{1}{|Aut(U)|c(\gamma)}} < \varepsilon'$ 
		\item $\forall V \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{*,V}}{\frac{1}{|Aut(U_{\gamma})|c(\gamma)}} < \varepsilon'$ 
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\tau \in \mathcal{M}_{U}}{\frac{1}{|Aut(U)|c(\tau)}} < \varepsilon'$
		\item For all $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\}$, and $\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}$,
		\begin{align*}
			&\sum_{j>0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
					\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}}\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)} \\
			&\leq \frac{N(\gamma)N(\gamma')}
			{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}
		\end{align*}
	\end{enumerate}
\end{theorem}
\subsubsection{General functions $B_{norm}(\alpha)$, $B(\gamma)$, $N(\gamma)$, and $c(\alpha)$*}
Recall the following definitions from Section \ref{generalmaintheoremstatementsection}.
\begin{definition}[$S_{\alpha,min}$ and $S_{\alpha,max}$]
	Given a shape $\alpha \in \mathcal{M}'$, define $S_{\alpha,min}$ to be the leftmost minimum vertex separator of $\alpha$ if all edges with multiplicity at least $2$ are deleted and define $S_{\alpha,max}$ to be the leftmost minimum vertex separator of $\alpha$ if all edges with multiplicity at least $2$ are present.
\end{definition}
\begin{definition}[General $I_{\alpha}$]
	Given a shape $\alpha$, define $I_{\alpha}$ to be the set of vertices in $V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$ such that all edges incident with that vertex have multiplicity at least $2$.
\end{definition}
\begin{definition}[$B_{\Omega}$]
	We take $B_{\Omega}(j)$ to be a non-decreasing function such that for all $j \in \mathbb{N}$, $E_{\Omega}[x^{j}] \leq B_{\Omega}(j)^{j}$ 
\end{definition}
\begin{definition}
	For all $i$, we define $h^{+}_i$ to be the polynomial $h_i$ where we make all of the coefficients have positive sign.
\end{definition}
\begin{lemma}
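	If $\Omega = N(0,1)$ then we can take $B_{\Omega}(j) = \sqrt{j}$ and we have that 
	\[
	h^{+}_j(x) \leq \frac{1}{\sqrt{j!}}(x^2 + j)^{\frac{j}{2}} \leq \left(\frac{e}{j}(x^2 + j)\right)^{\frac{j}{2}}
	\]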
\end{lemma}
\begin{theorem}[General $B_{norm}(\alpha)$, $B(\gamma)$, $N(\gamma)$, and $c(\alpha)$]\label{generalfunctionstheorem}
	For all $\varepsilon, \varepsilon' > 0$ and all $D_V,D_E \in \mathbb{N}$, if we take
	\begin{enumerate}
		\item $q = \left\lceil{3{D_V}ln(n) + ln(\frac{1}{\varepsilon}) + {(3D_V)^k}ln(D_E + 1) + 3{D_V}ln(5)}\right\rceil$
		\item $B_{vertex} = 6q{D_V}$
		\item $B_{edge}(e) = 2h^{+}_{l_e}(B_{\Omega}(6{D_V}D_E))
		\max_{j \in [0,3{D_V}D_E]}{\left\{\left(h^{+}_{j}(B_{\Omega}(2qj))\right)^{\frac{l_e}{\max{\{j,l_e\}}}}\right\}}$
		\item $B_{norm}(\alpha) = 
		2e{B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}}\left(\prod_{e \in E(\alpha)}{B_{edge}(e)}\right)n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}$
		\item $B(\gamma) = B_{vertex}^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}|}\left(\prod_{e \in E(\gamma)}{B_{edge}(e)}\right)n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}$
		\item $N(\gamma) = (3D_V)^{2|V(\gamma) \setminus V_{\gamma}| + |V(\gamma) \setminus U_{\gamma}|}$
		\item $c(\alpha) = \frac{5(3{t_{max}}D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + k|E(\alpha)|}(2t_{max})^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}}{\varepsilon'}$
	\end{enumerate}
	then the following conditions hold:
	\begin{enumerate}
		\item With probability at least $(1-\varepsilon)$, $\forall \alpha \in \mathcal{M}'$, $||M_{\alpha}|| \leq B_{norm}(\alpha)$
		\item For all $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\}$, $\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}$, and intersection patterns $P \in \mathcal{P}_{\gamma,\tau,\gamma'}$,  
		\[
		B_{norm}(\tau_{P}) \leq B(\gamma)B(\gamma')B_{norm}(\tau)
		\]
		\item For all composable $\gamma_1,\gamma_2$, $B(\gamma_1)B(\gamma_2) = B(\gamma_1 \circ \gamma_2)$.
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{U,*}}{\frac{1}{|Aut(U)|c(\gamma)}} < \varepsilon'$ 
		\item $\forall V \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{*,V}}{\frac{1}{|Aut(U_{\gamma})|c(\gamma)}} < \varepsilon'$ 
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\tau \in \mathcal{M}_{U}}{\frac{1}{|Aut(U)|c(\tau)}} < \varepsilon'$
		\item For all $\tau \in \mathcal{M}'$, $\gamma \in \Gamma_{*,U_{\tau}} \cup \{Id_{U_{\tau}}\}$, and $\gamma' \in \Gamma_{*,V_{\tau}} \cup \{Id_{V_{\tau}}\}$,
		\begin{align*}
			&\sum_{j>0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
					\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}}\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)} \\
			&\leq \frac{N(\gamma)N(\gamma')}
			{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}
		\end{align*}
	\end{enumerate}
\end{theorem}
\begin{remark}
	Recall that if $\Omega = N(0,1)$ then we may take $B_{\Omega}(j) = \sqrt{j}$ and we have that 
	\[
	h^{+}_j(x) \leq \frac{1}{\sqrt{j!}}(x^2 + j)^{\frac{j}{2}} \leq \left(\frac{e}{j}(x^2 + j)\right)^{\frac{j}{2}}
	\]
	Thus, when $\Omega = N(0,1)$ we can take 
	\[
	B_{edge}(e) = 2\left(\frac{e}{l_e}(6{D_V}D_E + l_e)\right)^{l_e}\left(e(6{D_V}{D_E}q + 1)\right)^{l_e} \leq \left(400{D^2_V}{D^2_E}q\right)^{l_e}
	\]
\end{remark}
\subsection{Choosing $B_{norm}(\alpha)$}
We need matrix norm bounds which hold for all $\alpha \in \mathcal{M}'$. For convenience, we recall the definition of $\mathcal{M}'$ below.
\begin{definition}[$\mathcal{M}'$]
	We define $\mathcal{M}'$ to be the set of all shapes $\alpha$ such that
	\begin{enumerate}
		\item[1.] $|V(\alpha)| \leq 3D_V$
		\item[2.*] $\forall e \in E(\alpha), l_e \leq D_E$
		\item[3.*] All edges $e \in E(\alpha)$ have multiplicity at most $3D_V$.
	\end{enumerate}
\end{definition}
To obtain such norm bounds, we start with the norm bounds in the graph matrix norm bound paper. We then modify these bounds as follows:
\begin{enumerate}
	\item We make the bounds more compatible with the conditions of our machinery. To do this, we upper bound many of the terms in the norm bound by $B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}$ where $B_{vertex}$ is a function of our parameters. In general, we will also need to upper bound some of the terms by $\prod_{e \in E(\alpha)}(B_{edge}(e))$ where $B_{edge}(e)$ is a function of $l_e$, $\Omega$, and our parameters.
	\item We generalize the bounds so that they apply to improper shapes as well as proper shapes. Under our simplifying assumptions, all we need to do here is to take isolated vertices into account. In general, we also need to handle multi-edges. 
\end{enumerate}
\subsubsection{Simplified $B_{norm}(\alpha)$}
Under our simplifying assumptions, we start with the following norm bound from the updated graph matrix norm bound paper \cite{AMP20}:
\begin{theorem}[Simplified Graph Matrix Norm Bounds]\label{originalsimplifiednormbounds}
	Under our simplifying assumptions, for all $\varepsilon > 0$ and all proper shapes $\alpha$, taking $c_{\alpha} = |V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})| + |S_{\alpha} \setminus (U_{\alpha} \cap V_{\alpha})|$,
	\[
	Pr\left(||M_{\alpha}|| > (2|V(\alpha) \setminus (U_{\alpha} \cap V_{\alpha})|)^{|V(\alpha) \setminus (U_{\alpha} \cap V_{\alpha})|}(2eq)^{\frac{c_{\alpha}}{2}}n^{\frac{w(V(\alpha)) - w(S_{\alpha})}{2}}\right) < \varepsilon
	\]
	where $q = 3\left\lceil\frac{ln(\frac{n^{w(S_{\alpha})}}{\varepsilon})}{3c_{\alpha}}\right\rceil$
\end{theorem}
\begin{corollary}\label{firsttweakedsimplifiednormbound}
	For all shapes $\alpha$ and all $\varepsilon > 0$, 
	\[
	Pr\left(||M_{\alpha}|| > \left(2|V_{\alpha}|\sqrt[4]{2eq}\right)^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}\right) < \varepsilon
	\]
	where $q = 3\left\lceil\frac{ln(\frac{n^{w(S_{\alpha})}}{\varepsilon})}{3c_{\alpha}}\right\rceil$.
\end{corollary}

\begin{proof}
	Observe that adding an isolated vertex to $\alpha$ is equivalent to multiplying $M_{\alpha}$ by $n - |V(\alpha)|$. Thus, if the bound holds for all proper $\alpha$ then it will hold for improper $\alpha$ as well. 
	
	We now make the following observations:
	\begin{enumerate}
		\item $|S_{\alpha} \setminus (U_{\alpha} \cap V_{\alpha})| \leq |U_{\alpha} \setminus V_{\alpha}|$, so $c_{\alpha} = |W_{\alpha}| + |S_{\alpha} \setminus (U_{\alpha} \cap V_{\alpha})| \leq |V(\alpha) \setminus V_{\alpha}|$. Similarly, $|S_{\alpha} \setminus (U_{\alpha} \cap V_{\alpha})| \leq |V_{\alpha} \setminus U_{\alpha}|$, so $c_{\alpha} \leq |V(\alpha) \setminus U_{\alpha}|$. Thus, $c_{\alpha} \leq \frac{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}{2}$.
		\item $|V(\alpha) \setminus (U_{\alpha} \cap V_{\alpha})| \leq |V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|$
	\end{enumerate}
	Thus, by Theorem \ref{originalsimplifiednormbounds}, for all proper shapes $\alpha$ and all $\varepsilon > 0$, 
	\[
	Pr\left(||M_{\alpha}|| > \left(2|V_{\alpha}|\sqrt[4]{2eq}\right)^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}\right) < \varepsilon
	\]
	where $q = 3\left\lceil\frac{ln(\frac{n^{w(S_{\alpha})}}{\varepsilon})}{3c_{\alpha}}\right\rceil$.
\end{proof}
\begin{corollary}\label{tweakedsimplifiednormbound}
	For all $z \in \mathbb{N}$ and all $\varepsilon > 0$, taking $\varepsilon'' = \frac{\varepsilon}{5^{z}2^{z^2}}$, with probability at least $1-\varepsilon$ we have that for all shapes $\alpha$ such that $|V(\alpha)| \leq z$, 
	\[
	||M_{\alpha}|| \leq \left(2|V_{\alpha}|\sqrt[4]{2eq}\right)^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}
	\]
	where $q = 3\left\lceil\frac{ln(\frac{n^{w(S_{\alpha})}}{\varepsilon''})}{3c_{\alpha}}\right\rceil$.
\end{corollary}
\begin{proof}
	This result can be proved from Corollary \ref{firsttweakedsimplifiednormbound} using a union bound and the following proposition:
	\begin{proposition}\label{simplifiedcountingalpha}
		Under our simplifying assumptions, for all $z \in \mathbb{N}$, there are at most $5^{z}2^{z^2}$ proper shapes $\alpha$ such that $|V(\alpha)| \leq z$.
	\end{proposition}
	\begin{proof}
		Observe that we can construct any proper shape $\alpha$ with at most $z$ vertices as follows:
		\begin{enumerate}
			\item Start with $z$ vertices $v_1,\ldots,v_z$.
			\item For each vertex $v_i$, choose whether $v_i \in V(\alpha) \setminus U_{\alpha} \setminus V_{\alpha}$, $v_i \in U_{\alpha} \setminus V_{\alpha}$, $v_i \in V_{\alpha} \setminus U_{\alpha}$, $v_i \in U_{\alpha} \cap V_{\alpha}$, or $v_i \notin V(\alpha)$.
			\item For each pair of vertices $v_i,v_j \in V(\alpha)$, choose whether or not $(v_i,v_j) \in E(\alpha)$
		\end{enumerate}
	\end{proof}
\end{proof}
\begin{corollary}\label{finalsimplifiednormbound}
	For all $D_V \in \mathbb{N}$ and all $\varepsilon > 0$, taking 
	\[
	q = 3\left\lceil\frac{ln(\frac{5^{3D_V}2^{9D^2_V}n^{3D_V}}{\varepsilon})}{3}\right\rceil = 
	3\left\lceil{{D_V}ln(n) + \frac{ln(\frac{1}{\varepsilon})}{3} + {D_V}ln(5) + 3{D^2_V}ln(2)}\right\rceil,
	\] 
	$B_{vertex} = 6{D_V}\sqrt[4]{2eq}$, and
	\[
	B_{norm}(\alpha) = {B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}},
	\] 
	with probability at least $(1-\varepsilon)$ we have that for all shapes $\alpha \in \mathcal{M}'$, $||M_{\alpha}|| \leq B_{norm}(\alpha)$
\end{corollary}
\begin{proof}
	This follows from Corollary \ref{tweakedsimplifiednormbound} and the fact that for all $\alpha \in \mathcal{M}'$, $w(S_{\alpha}) \leq |V(\alpha)| \leq 3D_V$.
\end{proof}
\subsubsection{General $B_{norm}(\alpha)$}
In general, we start with the following norm bound from the updated graph matrix norm bound paper \cite{AMP20}:
\begin{theorem}[General Graph Matrix Norm Bounds]\label{originalgeneralnormbounds}
	For all $\varepsilon > 0$ and all proper shapes $\alpha$, taking $q = \lceil{ln(\frac{n^{w(S_{\alpha})}}{\varepsilon})}\rceil$
	\[
	P\left(||M_{\alpha}|| > 2e(2q|V(\alpha)|)^{|V(\alpha) \setminus (U_{\alpha} \cap V_{\alpha})|}\left(\prod_{e \in E(\alpha)}{h^{+}_{l_{e}}(B_{\Omega}(2q{l_e}))}\right)
	n^{\frac{(w(V(\alpha)) - w(S_{\alpha}))}{2}}\right) < \varepsilon
	\]
\end{theorem}
\begin{corollary}
	For all $\varepsilon > 0$, for all $z,l_{max},m \in \mathbb{N}$, taking $\varepsilon'' = \frac{\varepsilon}{5^{z}(l_{max} + 1)^{z^k}}$, with probability at least $1-\varepsilon$, for all shapes $\alpha$ such that 
	\begin{enumerate}
		\item $|V(\alpha)| \leq z$.
		\item All edges in $E(\alpha)$ have label at most $l_{max}$.
		\item All edges in $E(\alpha)$ have multiplicity at most $m$.
	\end{enumerate}
	we have that
	\begin{align*}
		||M_{\alpha}|| \leq &2e(2q|V(\alpha)|)^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}\left(\prod_{e \in E(\alpha)}{2h^{+}_{l_e}(B_{\Omega}(2ml_{max}))
			\max_{j \in [0,ml_{max}]}{\left\{\left(h^{+}_{j}(B_{\Omega}(2qj))\right)^{\frac{l_e}{\max{\{j,l_e\}}}}\right\}}}\right)\\
		&n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha,min})}{2}}
	\end{align*}
	where $q = \left\lceil{ln\left(\frac{n^{w(S_{\alpha,max})}}{\varepsilon''}\right)}\right\rceil$
\end{corollary}
\begin{proof}
	Observe that for each $\alpha$ which has multi-edges, we can write $M_{\alpha} = \sum_{i}{{c_i}M_{\alpha_i}}$ where each $\alpha_i$ has no multiple edges. We first upper bound $\sum_{i}{|c_i|}$.
	\begin{lemma}
		For any $a_1,\ldots,a_m \in \mathbb{N} \cup \{0\}$, taking $p_{max} = \sum_{i=1}^{m}{a_i}$ and writing $\prod_{i=1}^{m}{h_{a_i}} = \sum_{k=0}^{p_{max}}{{c_k}h_k}$,
		\[
		\sum_{k=0}^{p_{max}}{|c_k|} \leq (p_{max}+1)\prod_{i=1}^{m}{h^{+}_{a_i}(B_{\Omega}(2p_{max}))} \leq \prod_{i=1}^{m}{2h^{+}_{a_i}(B_{\Omega}(2p_{max}))}
		\]
	\end{lemma}
	\begin{proof}
		Suppose $\prod_{i = 1}^m (h_{a_i}(x))^2 = \sum_{k = 0}^{2p_{max}} u_kx^k$ and $\prod_{i = 1}^m (h^+_{a_i}(x))^2 = \sum_{k = 0}^{2p_{max}} v_kx^k$. Then, note that $|u_k| \le v_k$ and so,
		\[E_{\Omega}[\prod_{i = 1}^{m}(h_{a_i}(x))^2] = \sum_{k = 0}^{2p_{max}} u_k E_{\Omega}[x^k] \le \sum_{k = 0}^{2p_{max}} v_k|E_{\Omega}[x^k]| \le \sum_{k = 0}^{2p_{max}} v_k(B_{\Omega}(2p_{max}))^k = \prod_{i = 1}^m (h_{a_i}^+(B_{\Omega}(2p_{max})))^2\]
		Therefore, using the fact that $h_k$ form an orthonormal basis,
		\[\sum_{k = 0}^{p_{max}} c_k^2 = E_{\Omega}[(\sum_{k = 0}^{p_{max}} c_kh_k(x))^2] = E_{\Omega}[\prod_{i = 1}^m (h_{a_i}(x))^2] \le  \prod_{i = 1}^m (h_{a_i}^+(B_{\Omega}(2p_{max})))^2\]
		
		This implies
		\[(\sum_{k = 0}^{p_{max}}|c_k|)^2 \le (p_{max} + 1)(\sum_{k = 0}^{p_{max}} c_k^2) \le  (p_{max} + 1)  \prod_{i = 1}^m (h_{a_i}^+(B_{\Omega}(2p_{max})))^2\]
		Taking square roots gives the inequality.
	\end{proof}
	\begin{corollary}
		For any shape $\alpha$ such that every edge of $\alpha$ has multiplicity at most $m$ and label at most $l_{max}$, if we write 
		$M_{\alpha} = \sum_{i}{{c_i}M_{\alpha_i}}$ where each $\alpha_i$ has no multi-edges then $\sum_{i}{|c_i|} \leq \prod_{e \in E(\alpha)}{2h^{+}_{l_e}(B_{\Omega}(2ml_{max}))}$
	\end{corollary}
	The result now follows from Theorem \ref{originalgeneralnormbounds} and the following observations:
	\begin{enumerate}
		\item $|V(\alpha) \setminus (U_{\alpha} \cap V_{\alpha})| \leq |V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|$.
		\item For any $\alpha$, writing $M_{\alpha} = \sum_{i}{{c_i}M_{\alpha_i}}$ where each $\alpha_i$ has no multi-edges, for all $\alpha_i$,
		\[
		w(V(\alpha_i)) + w(I_{\alpha_i}) - w(S_{\alpha_i}) \leq w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha,min})
		\]
		\item For any $a_1,\ldots,a_m \in \mathbb{N} \cup \{0\}$ such that $\forall i' \in [m], a_{i'} \leq l_{max}$, for all $j \in [0,ml_{max}]$
		\[
		h^{+}_{j}(B_{\Omega}(2qj)) \leq \prod_{i'=1}^{m}{\left(h^{+}_{j}(B_{\Omega}(2qj))\right)^{\frac{a_{i'}}{\max{\{j,a_{i'}\}}}}} \leq 
		\prod_{i'=1}^{m}{\max_{j' \in [0,ml_{max}]}{\left\{\left(h^{+}_{j'}(B_{\Omega}(2qj'))\right)^{\frac{a_{i'}}{\max{\{j',a_{i'}\}}}}\right\}}}
		\]
	\end{enumerate}
	\begin{proposition}
		For all $z,l_{max} \in \mathbb{N}$, there are at most $5^{z}(l_{max} + 1)^{z^k}$ proper shapes $\alpha$ such that $|V(\alpha)| \leq z$ and every edge in $E(\alpha)$ has label at most $l_{max}$.
	\end{proposition}
	\begin{proof}
		This can be proved in the same way as before. Observe that we can construct any proper shape $\alpha$ with at most $z$ vertices as follows:
		\begin{enumerate}
			\item Start with $z$ vertices $v_1,\ldots,v_z$.
			\item For each vertex $v_i$, choose whether $v_i \in V(\alpha) \setminus U_{\alpha} \setminus V_{\alpha}$, $v_i \in U_{\alpha} \setminus V_{\alpha}$, $v_i \in V_{\alpha} \setminus U_{\alpha}$, $v_i \in U_{\alpha} \cap V_{\alpha}$, or $v_i \notin V(\alpha)$.
			\item For each $k$ tuple of vertices in $V(\alpha)$, choose the label of the hyperedge between these vertices (or $0$ if the hyperedge is not in $E(\alpha)$).
		\end{enumerate}
	\end{proof}
\end{proof}
\begin{corollary}\label{finalgeneralnormbounds}
	For all $D_V,D_E \in \mathbb{N}$ and all $\varepsilon > 0$, taking 
	\[
	B_{norm}(\alpha) = 2e{B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}}\left(\prod_{e \in E(\alpha)}{B_{edge}(e)}\right)n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}
	\] where 
	\begin{enumerate}
		\item $q = \left\lceil{ln\left(\frac{n^{3D_V}}{\varepsilon''}\right)}\right\rceil = \left\lceil{3{D_V}ln(n) + ln(\frac{1}{\varepsilon}) + {(3D_V)^k}ln(D_E + 1) + 3{D_V}ln(5)}\right\rceil$
		\item $B_{vertex} = 6q{D_V}$
		\item $B_{edge}(e) = 2h^{+}_{l_e}(B_{\Omega}(6{D_V}D_E))
		\max_{j \in [0,3{D_V}D_E]}{\left\{\left(h^{+}_{j}(B_{\Omega}(2qj))\right)^{\frac{l_e}{\max{\{j,l_e\}}}}\right\}}$
	\end{enumerate}
	with probability at least $(1-\varepsilon)$, for all shapes $\alpha \in \mathcal{M}'$, $||M_{\alpha}|| \leq B_{norm}(\alpha)$.
\end{corollary}
\subsection{Choosing $B(\gamma)$}
We now describe how to choose the function $B(\gamma)$. Recall that we want the following conditions to hold:
\begin{enumerate}
	\item For all $\gamma, \tau, \gamma'$ and all intersection patterns $P \in \mathcal{P}_{\gamma,\tau,\gamma'}$, 
	\[
	B_{norm}(\tau_{P}) \leq B(\gamma)B(\gamma')B_{norm}(\tau)
	\]
	\item For all composable $\gamma_1,\gamma_2$, $B(\gamma_1)B(\gamma_2) = B(\gamma_1 \circ \gamma_2)$.
\end{enumerate}
The most important part of choosing $B(\gamma)$ is to make sure that the factors of $n$ are controlled. For this, we use the following intersection tradeoff lemma. Under our simplifying assumptions, this lemma follows from \cite[Lemma 7.12]{BHKKMP16}. We defer the general proof of this lemma to the end of this section.
\begin{lemma}[Intersection Tradeoff Lemma]
	For all $\gamma, \tau, \gamma'$ and all intersection patterns $P \in \mathcal{P}_{\gamma,\tau,\gamma'}$, 
	\[
	w(V(\tau_P)) + w(I_{\tau_P}) - w(S_{\tau_P,min}) \leq w(V(\tau)) + w(I_{\tau}) - w(S_{\tau,min})+ w(V(\gamma) \setminus U_{\gamma}) + w(V(\gamma') \setminus U_{\gamma'})
	\]
\end{lemma}
Based on this intersection tradeoff lemma, we can choose the function $B(\gamma)$ as follows.
\begin{corollary}
	If we take 
	\[
	B_{norm}(\alpha) = C \cdot B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}\left(\prod_{e \in E(\alpha)}{B_{edge}(e)}\right)n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}
	\] 
	for some constant $C > 0$ and take
	\[
	B(\gamma) = B_{vertex}^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}|}\left(\prod_{e \in E(\gamma)}{B_{edge}(e)}\right)n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}
	\]
	then the following conditions hold:
	\begin{enumerate}
		\item For all $\gamma, \tau, \gamma'$ and all intersection patterns $P \in \mathcal{P}_{\gamma,\tau,\gamma'}$, 
		\[
		B_{norm}(\tau_{P}) \leq B(\gamma)B(\gamma')B_{norm}(\tau)
		\]
		\item For all composable $\gamma_1,\gamma_2$, $B(\gamma_1)B(\gamma_2) = B(\gamma_1 \circ \gamma_2)$.
	\end{enumerate}
\end{corollary}
\begin{proof}
	We have that 
	\[
	B_{norm}(\tau_{P}) = C \cdot B_{vertex}^{|V(\tau_P) \setminus U_{\tau_P}| + |V(\tau_P) \setminus V_{\tau_P}|}\left(\prod_{e \in E(\tau_P)}{B_{edge}(e)}\right)n^{\frac{w(V(\tau_P)) + w(I_{\tau_P}) - w(S_{\tau_P})}{2}}
	\]
	and 
	\begin{align*}
		B(\gamma)B(\gamma')B_{norm}(\tau) &= C \cdot B_{vertex}^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}| + |V(\gamma') \setminus U_{\gamma'}| + |V(\gamma') \setminus V_{\gamma'}| + |V(\tau) \setminus U_{\tau}| + |V(\tau) \setminus V_{\tau}|} \\
		&\left(\prod_{e \in E(\gamma) \cup E(\gamma') \cup E(\tau)}{B_{edge}(e)}\right)n^{\frac{w(V(\gamma) \setminus U_{\gamma}) + w(V(\gamma') \setminus U_{\gamma'}) + 
				w(V(\tau)) + w(I_{\tau}) - w(S_{\tau})}{2}}
	\end{align*}
	The first condition now follows immediately from the following observations:
	\begin{enumerate}
		\item \begin{align*}
			&|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}| + |V(\gamma') \setminus U_{\gamma'}| + |V(\gamma') \setminus V_{\gamma'}| + |V(\tau) \setminus U_{\tau}| + |V(\tau) \setminus V_{\tau}| \\
			&= |V(\gamma \circ \tau \circ {\gamma'}^T) \setminus U_{\gamma \circ \tau \circ {\gamma'}^T}| + |V(\gamma \circ \tau \circ {\gamma'}^T) \setminus V_{\gamma \circ \tau \circ {\gamma'}^T}| 
			\geq |V(\tau_P) \setminus U_{\tau_P}| + |V(\tau_P) \setminus V_{\tau_P}|
		\end{align*}
		\item $E(\tau_P) = E(\gamma) \cup E(\tau) \cup E({\gamma'}^T)$ so 
		$\prod_{e \in E(\tau_P)}{B_{edge}(e)} = \prod_{e \in E(\gamma) \cup E(\gamma') \cup E(\tau)}{B_{edge}(e)}$.
		\item By the intersection tradeoff lemma, 
		\[
		w(V(\tau_P)) + w(I_{\tau_P}) - w(S_{\tau_P}) \leq w(V(\tau)) + w(I_{\tau}) - w(S_{\tau}) + w(V(\gamma) \setminus U_{\gamma}) + w(V(\gamma') \setminus U_{\gamma'})
		\]
	\end{enumerate}
	The second condition follows  from the form of $B(\gamma)$.
\end{proof}
\subsection{Choosing $N(\gamma)$}
To choose $N(\gamma)$, we use the following lemma:
\begin{lemma}
	For all $D_V \in \mathbb{N}$, for all composable $\gamma,\tau,{\gamma'}^T$ such that $|V(\gamma)| \leq D_V$, $|V(\tau)| \leq D_V$, and $|V(\gamma')| \leq D_V$, 
	\begin{align*}
		&\sum_{j>0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
				\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}}\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)} \\
		&\leq \frac{{(3D_V)}^{2(|V(\gamma) \setminus V_{\gamma}| + |V(\gamma') \setminus V_{\gamma'}|) + (|V(\gamma) \setminus U_{\gamma}| + |V(\gamma') \setminus U_{\gamma'}|)}}
		{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}
	\end{align*}
\end{lemma}
\begin{proof}[Proof sketch]
	Observe that aside from the orderings (which are canceled out by the $|Aut(U_{\gamma_i})|$ and $|Aut(U_{\gamma'_i})|$ factors), the intersection patterns $\{P_i: i \in [j]\}$ are determined by the following data on each vertex $v \in (V(\gamma) \setminus V_{\gamma}) \cup (V({\gamma'}^T) \setminus V_{{\gamma'}^T})$:
	\begin{enumerate}
		\item The first $i \in [j]$ such that $v \in (V(\gamma_i) \setminus V_{\gamma_i}) \cup (V({\gamma'_i}^T) \setminus V_{{\gamma'_i}^T})$. There are at most $j$ possibilities for this.
		\item A vertex $u$ (if one exists) in $V(\gamma_{i-1} \circ \ldots \circ \gamma_1 \circ \tau \circ {\gamma'_1}^T \ldots \circ {\gamma'_{i-1}}^T)$ such that $u$ and $v$ are equal. There are at most $3D_V$ possibilities for this.
	\end{enumerate}
	Using these observations and taking $j_{max} = |V(\gamma) \setminus V_{\gamma}| + |V(\gamma') \setminus V_{\gamma'}|$,
	\begin{align*}
		&\sum_{j > 0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
				\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}}\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{1} \\
		&\leq \sum_{j =1}^{j_{max}}{\frac{{(3jD_V)}^{|V(\gamma) \setminus V_{\gamma}| + |V(\gamma') \setminus V_{\gamma'}|}}
			{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}} \\
		&\leq j_{max}\left(\frac{2}{3}\right)^{j_{max}}\frac{{(3D_V)}^{2(|V(\gamma) \setminus V_{\gamma}| + |V(\gamma') \setminus V_{\gamma'}|)}}
		{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}} \\
		&< \frac{{(3D_V)}^{2(|V(\gamma) \setminus V_{\gamma}| + |V(\gamma') \setminus V_{\gamma'}|)}}
		{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}
	\end{align*}
	Now recall that by Lemma ,
	for any $\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^{T}$ and any intersection pattern $P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}$, 
	\[
	N(P_i) \leq |V(\tau_{P_i})|^{|V(\gamma_i) \setminus U_{\gamma_i}| + |V(\gamma'_i) \setminus U_{\gamma'_i}|}
	\]
	Thus, for any $P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}$, $\prod_{i=1}^{j}{N(P_i)} \leq (3D_V)^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma') \setminus U_{\gamma'}|}$.
	Putting everything together, the result follows.
\end{proof}
\begin{corollary}
	For all $D_V \in \mathbb{N}$, if we take $N(\gamma) = (3D_V)^{2|V(\gamma) \setminus V_{\gamma}| + |V(\gamma) \setminus U_{\gamma}|}$ then for all composable $\gamma, \tau, {\gamma'}^T$ such that $|V(\gamma)| \leq D_V$, $|V(\tau)| \leq D_V$, and $|V(\gamma')| \leq D_V$, 
	\begin{align*}
		&\sum_{j>0}{\sum_{\gamma_1,\gamma'_1,\cdots,\gamma_j,\gamma'_j \in \Gamma_{\gamma,\gamma',j}}{\prod_{i:\gamma_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma_i})|}}
				\prod_{i:\gamma'_i \text{ is non-trivial}}{\frac{1}{|Aut(U_{\gamma'_i})|}}}}\sum_{P_1,\cdots,P_j:P_i \in \mathcal{P}_{\gamma_i,\tau_{P_{i-1}},{\gamma'_i}^T}}{\left(\prod_{i=1}^{j}{N(P_i)}\right)} \\
		&\leq \frac{N(\gamma)N(\gamma')}
		{(|Aut(U_{\gamma})|)^{1_{\gamma \text{ is non-trivial}}}(|Aut(U_{\gamma'})|)^{1_{\gamma' \text{ is non-trivial}}}}
	\end{align*}
\end{corollary}
\subsection{Choosing $c(\alpha)$}
In this section, we describe how to choose $c(\alpha)$. For simplicity, we first describe how to choose $c(\alpha)$ under our simplifying assumptions. We then describe the minor adjustments that are needed when we have hyperedges and multiple types of vertices.
\begin{lemma}\label{calphalemma}
	Under our simplifying assumptions, for all $U \in \mathcal{I}_{mid}$,
	\[
	\sum_{\alpha: U_{\alpha} \equiv U, \alpha \text{ is proper and non-trivial}}{\frac{1}{|Aut(U_{\alpha} \cap V_{\alpha})|(3D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + 2|E(\alpha)|}2^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}}} < 5
	\]
\end{lemma} 
\begin{proof}
	In order to choose $\alpha$, it is sufficient to choose the following:
	\begin{enumerate}
		\item The number $j_1$ of vertices in $U_{\alpha} \setminus V_{\alpha}$, the number $j_2$ of vertices in $V_{\alpha} \setminus U_{\alpha}$, and the number $j_3$ of vertices in $V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$.
		\item A mapping in $Aut(U_{\alpha} \cap V_{\alpha})$ determining how the vertices in $U_{\alpha} \cap V_{\alpha}$ match up with each other.
		\item The position of each vertex $u \in U_{\alpha} \setminus V_{\alpha}$ within $U_{\alpha}$ (there are at most $|U_{\alpha}| \leq D_V$ choices for this).
		\item The position of each vertex $v \in V_{\alpha} \setminus U_{\alpha}$ within $V_{\alpha}$ (there are at most $|V_{\alpha}| \leq D_V$ choices for this).
		\item The number $j_4$ of edges in $E(\alpha)$.
		\item The endpoints of each edge in $E(\alpha)$.
	\end{enumerate}
	This implies that for all $j_1,j_2,j_3,j_4 \geq 0$
	\[
	\sum_{\alpha: U_{\alpha} \equiv U, |U_{\alpha} \setminus V_{\alpha}| = j_1, |V_{\alpha} \setminus U_{\alpha}| = j_2 \atop
		|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})| = j_3, |E(\alpha)| = j_4}{\frac{1}{|Aut(U_{\alpha} \cap V_{\alpha})|(D_V)^{j_1 + j_2}
			(D_V)^{2j_4}}} \leq 1
	\]
	Using this, we have that
	\begin{align*}
		&\sum_{\alpha: U_{\alpha} \equiv U, \alpha \text{ is proper and non-trivial}}{\frac{1}{|Aut(U_{\alpha} \cap V_{\alpha})|(3D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + 2|E(\alpha)|}2^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}}} \\
		&\leq \sum_{j_1,j_2,j_3,j_4 \in \mathbb{N} \cup \{0\}: j_1 + j_2 + j_3 + j_4 \geq 1}{\frac{1}{3^{j_1 + j_2}9^{j_4}2^{j_3}}} \leq 2\left(\frac{3}{2}\right)^2\frac{9}{8} - 1 < 5
	\end{align*}
\end{proof}
\begin{corollary}
	For all $\varepsilon' > 0$, if we take 
	\[
	c(\alpha) = \frac{5(3D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + 2|E(\alpha)|}2^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}}{\varepsilon'}
	\]
	then
	\begin{enumerate}
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{U,*}}{\frac{1}{|Aut(U)|c(\gamma)}} < \varepsilon'$ 
		\item $\forall V \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{*,V}}{\frac{1}{|Aut(U_{\gamma})|c(\gamma)}} < \varepsilon'$ 
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\tau \in \mathcal{M}_{U}}{\frac{1}{|Aut(U)|c(\tau)}} < \varepsilon'$
	\end{enumerate}
\end{corollary}
\subsubsection{Choosing $c(\alpha)$ in general*}
When we have multiple types of vertices and hyperedges of arity $k$, Lemma \ref{calphalemma} can be generalized as follows:
\begin{lemma}
	Under our simplifying assumptions, for all $U \in \mathcal{I}_{mid}$,
	\[
	\sum_{\alpha: U_{\alpha} \equiv U, \alpha \text{ is proper and non-trivial}}{\frac{1}{|Aut(U_{\alpha} \cap V_{\alpha})|(3D_V{t_{max}})^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + k|E(\alpha)|}(2t_{max})^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}}} < 5
	\]
\end{lemma}
\begin{proof}[Proof sketch]
	This can be proved in the same way as Lemma \ref{calphalemma} with the following modifications:
	\begin{enumerate}
		\item In addition to choosing the number of vertices in $U_{\alpha} \setminus V_{\alpha}$, $V_{\alpha} \setminus U_{\alpha}$, and $V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$, we also have to choose the types of these vertices.
		\item For each hyperedge, we have to choose $k$ endpoints rather than $2$ endpoints.
	\end{enumerate}
\end{proof}

\begin{corollary}\label{cor: calphachoice}
	For all $\varepsilon' > 0$, if we take
	\[c(\alpha) = \frac{5(3{t_{max}}D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + k|E(\alpha)|}(2t_{max})^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}}{\varepsilon'}\]
	then
	\begin{enumerate}
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{U,*}}{\frac{1}{|Aut(U)|c(\gamma)}} < \varepsilon'$ 
		\item $\forall V \in \mathcal{I}_{mid}, \sum_{\gamma \in \Gamma_{*,V}}{\frac{1}{|Aut(U_{\gamma})|c(\gamma)}} < \varepsilon'$ 
		\item $\forall U \in \mathcal{I}_{mid}, \sum_{\tau \in \mathcal{M}_{U}}{\frac{1}{|Aut(U)|c(\tau)}} < \varepsilon'$
	\end{enumerate}
\end{corollary}

For technical reasons, we will need a more refined bound when the sum is over all shapes $\gamma$ of at least a prescribed size.

\begin{lemma}\label{lem: tpca_refined_bound}
	For all $\varepsilon' > 0$, for the same choice of $c(\alpha)$ as in \cref{cor: calphachoice}, for any $U \in {\mathcal I}_{mid}$ and integer $m \ge 1$, we have \[\sum_{\gamma \in \Gamma_{U, *}: |V(\gamma)| \ge |U| + m} \frac{1}{|Aut(U)|c(\gamma)} \le \frac{\varepsilon'}{5\cdot 2^{m - 1}}\]
\end{lemma}

\begin{proof}[Proof sketch]
	The proof is similar to the proof of \cref{cor: calphachoice}, but we now have the extra condition $j_2 + j_3 \ge m$ in the proof of \cref{calphalemma}. Then,
	\[\sum_{j_1, j_2, j_3, j_4 \in \mathbb{N}\cup \{0\}: j_2 + j_3 \ge m} \frac{1}{3^{j_1 + j_2}9^{j_4}2^{j_3}} \le \sum_{j_1, j_4 \in \mathbb{N}\cup \{0\}} \frac{1}{2^m3^{j_1}9^{j_4}} = \frac{27}{16\cdot 2^m}\le \frac{1}{2^{m - 1}}\]
\end{proof}

\subsection{Proof of the Generalized Intersection Tradeoff Lemma}
We now prove the generalized intersection tradeoff lemma.
\begin{lemma}
	For all $\gamma, \tau, \gamma'$ and all intersection patterns $P \in \mathcal{P}_{\gamma,\tau,\gamma'}$, 
	\[
	w(V(\tau_P)) + w(I_{\tau_P}) - w(S_{\tau_P,min}) \leq w(V(\tau)) + w(I_{\tau}) - w(S_{\tau,min})+ w(V(\gamma) \setminus U_{\gamma}) + w(V(\gamma') \setminus U_{\gamma'})
	\]
\end{lemma}
\begin{proof}
	\begin{definition} \ 
		\begin{enumerate}
			\item We define $I_{LM}$ to be the set of vertices which, after intersections, touch $\gamma$ and $\tau$ but not ${\gamma'}^T$. In particular, $I_{LM}$ consists of the vertices which result from intersecting a pair of vertices in $V(\gamma) \setminus V_{\gamma}$ and $V(\tau) \setminus U_{\tau} \setminus V_{\tau}$ and the vertices which are in $U_{\tau} \setminus V_{\tau}$ and are not intersected with any other vertex.
			\item We define $I_{MR}$ to be the set of vertices which, after intersections, touch $\tau$ and ${\gamma'}^T$ but not $\gamma$. In particular, $I_{MR}$ consists of the vertices which result from intersecting a pair of vertices in $V(\tau) \setminus U_{\tau} \setminus V_{\tau}$ and $V({\gamma'}^T) \setminus U_{{\gamma'}^T}$ and the vertices which are in $V_{\tau} \setminus U_{\tau}$ and are not intersected with any other vertex.
			\item We define $I_{LR}$ to be the set of vertices which, after intersections, touch $\gamma$ and ${\gamma'}^T$ but not $\tau$. In particular, $I_{LR}$ consists of the vertices which result from intersecting a pair of vertices in $V(\gamma) \setminus V_{\gamma}$ and $V({\gamma'}^T) \setminus U_{{\gamma'}^T}$.
			\item We define $I_{LMR}$ to be the set of vertices which, after intersections, touch $\gamma$, $\tau$, and ${\gamma'}^T$. In particular, $I_{LMR}$ consists of the vertices which result from intersecting a triple of vertices in $V(\gamma) \setminus V_{\gamma}$,  $V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, and $V({\gamma'}^T) \setminus U_{{\gamma'}^T}$, intersecting a pair of vertices in $V(\gamma) \setminus V_{\gamma}$ and $V_{\tau} \setminus U_{\tau}$, intersecting a pair of vertices in $U_{\tau} \setminus V_{\tau}$ and $V({\gamma'}^T) \setminus U_{{\gamma'}^T}$, and single vertices in $U_{\tau} \cap V_{\tau}$. 
		\end{enumerate}
	\end{definition}
	The main idea is as follows. A priori, any of the vertices in $I_{LM} \cup I_{MR} \cup I_{LR} \cup I_{LMR}$ could become isolated. We handle this by keeping track of the following types of flows:
	\begin{enumerate}
		\item Flows from $U_{\gamma}$ to $I_{LM} \cup I_{LR} \cup I_{LMR}$
		\item Flows from $I_{LR} \cup I_{MR} \cup I_{LMR}$ to $V_{{\gamma'}^T}$
		\item Flows from $I_{LM}$ to $I_{MR}$. For technical reasons, we also view vertices in $I_{LMR}$ as having flow to themselves.
	\end{enumerate}
	We then observe that flows to and from these vertices prevent these vertices from being isolated and can provide flow from $U_{\gamma}$ to $V_{{\gamma'}^T}$, which gives a lower bound on $w(S_{\tau_P})$.
	
	We now implement this idea.
	\begin{definition}[Flow Graph]
		Given a shape $\alpha$, we define the directed graph $H_{\alpha}$ as follows:
		\begin{enumerate}
			\item For each vertex $v \in V(\alpha)$, we create two vertices $v_{in}$ and $v_{out}$. We then create a directed edge from $v_{in}$ to $v_{out}$ with capacity $w(v)$
			\item For each pair of vertices $(v,w)$ which is an edge of multiplicity $1$ in $E(\alpha)$ (or part of a hyperedge of multiplicity $1$ in $E(\alpha)$), we create a directed edge with infinite capacity from $v_{out}$ to $w_{in}$ and we create a directed edge with infinite capacity from $w_{out}$ to $v_{in}$.
			\item We define $U_{H_{\alpha}}$ to be $U_{H_{\alpha}} = \{u_{in}: u \in U_{\alpha}\}$ and we define $V_{H_{\alpha}}$ to be $V_{H_{\alpha}} = \{v_{out}: v \in V_{\alpha}\}$
		\end{enumerate}
	\end{definition}
	\begin{lemma}
		The maximum flow from $U_{H_{\alpha}}$ to $V_{H_{\alpha}}$ is equal to the minimum weight of a separator between $U_{\alpha}$ and $V_{\alpha}$.
	\end{lemma}
	\begin{proof}
		This can be proved using the max flow min cut theorem.
	\end{proof}
	\begin{definition}[Modified Flow Graph]
		Given a shape $\alpha$ together with a set $I_L \subseteq V(\alpha)$ of vertices in $\alpha$ (which will be the vertices in $\alpha$ which are intersected with a vertex to the left of $\alpha$) and a set $I_R \subseteq V(\alpha)$ of vertices in $\alpha$ (which will be the vertices in $\alpha$ which are intersected with a vertex to the right of $\alpha$), we define the modified flow graph $H^{I_L,I_R}_{\alpha}$ as follows:
		\begin{enumerate}
			\item We start with the flow graph $H_{\alpha}$
			\item For each vertex $u \in I_L$, we delete all of the edges into $u_{in}$ and add $u_{in}$ to $U_{H_{\alpha}}$
			\item For each vertex $v \in I_R$, we delete all of the edges out of $v_{out}$ and add $v_{out}$ to $V_{H_{\alpha}}$
			\item We call the resulting graph $H^{I_L,I_R}_{\alpha}$ and the resulting sets $U_{H^{I_L,I_R}_{\alpha}}$ and $V_{H^{I_L,I_R}_{\alpha}}$
		\end{enumerate}
	\end{definition}
	\begin{lemma}\label{increasedflowlemma}
		The maximum flow from $U_{H^{I_L,I_R}_{\alpha}}$ to $V_{H^{I_L,I_R}_{\alpha}}$ in $H^{I_L,I_R}_{\alpha}$ is at least as large as the maximum flow from $U_{H_{\alpha}}$ to $V_{H_{\alpha}}$ in $H_{\alpha}$
	\end{lemma}
	\begin{proof}[Proof sketch]
		Observe that if we have a cut $C$ in $H^{I_L,I_R}_{\alpha}$ which separates $U_{H^{I_L,I_R}_{\alpha}}$ and $V_{H^{I_L,I_R}_{\alpha}}$ then $C$ separates $U_{H_{\alpha}}$ and $V_{H_{\alpha}}$ in $H_{\alpha}$
	\end{proof}
	Before the intersections, we have the following flows.
	\begin{enumerate}
		\item We take $F_1$ to be the maximum flow from $U_{\gamma}$ to $V_{\gamma}$ in $\gamma$. Note that $F_1$ has value $w(V_{\gamma})$
		\item We take $F_2$ to be the maximum flow from $U_{\tau}$ to $V_{\tau}$ in $\tau$. Note that $F_2$ has value $w(S_{\tau,min})$
		\item We take $F_3$ to be the maximum flow from $U_{{\gamma'}^T}$ to $V_{{\gamma'}^T}$ in ${\gamma'}^T$. Note that $F_3$ has value $w(U_{{\gamma'}^T})$
	\end{enumerate}
	After the intersections, we take the following flows:
	\begin{enumerate}
		\item We take $F'_1$ to be the maximum flow from $U_{H^{\emptyset,I_{LM} \cup I_{LR} \cup I_{LMR}}_{\gamma}}$ to $V_{H^{\emptyset,I_{LM} \cup I_{LR} \cup I_{LMR}}_{\gamma}}$ in \\
		$H^{\emptyset,I_{LM} \cup I_{LR} \cup I_{LMR}}_{\gamma}$.
		\item We take $F'_2$ to be the maximum flow from $U_{H^{I_{LM} \cup I_{LMR},I_{MR} \cup I_{LMR}}_{\tau}}$ to $V_{H^{I_{LM} \cup I_{LMR},I_{MR} \cup I_{LMR}}_{\tau}}$ in \\
		$H^{I_{LM} \cup I_{LMR},I_{MR} \cup I_{LMR}}_{\tau}$
		\item We take $F'_3$ to be the maximum flow from $U_{H^{I_{MR} \cup I_{LR} \cup I_{LMR},\emptyset}_{{\gamma'}^T}}$ to $V_{H^{I_{MR} \cup I_{LR} \cup I_{LMR},\emptyset}_{{\gamma'}^T}}$ in \\
		$H^{I_{MR} \cup I_{LR} \cup I_{LMR},\emptyset}_{{\gamma'}^T}$.
	\end{enumerate}
	Observe that because of how intersection patterns are defined, $val(F'_1) = w(U_{\gamma})$ and $val(F'_3) = w(V_{{\gamma'}^T})$. By Lemma \ref{increasedflowlemma}, the value of $F'_2$ is at least as large as the value of $F_2$, so $val(F'_2) \geq w(S_{\tau,min})$.
	
	We now consider $F'_1 + F'_2 + F'_3$. As is, this is not a flow, but we can fix this.
	\begin{definition}
		For each vertex $v \in V(\tau_P)$,
		\begin{enumerate}
			\item We define $f_{in}(v)$ to be the flow into $v_{in}$ in $F'_1 + F'_2 + F'_3$.
			\item We define $f_{out}(v)$ to be the flow out of $v_{out}$ in $F'_1 + F'_2 + F'_3$.
			\item We define $f_{through}(v)$ to be the flow from $v_{in}$ to $v_{out}$ in $F'_1 + F'_2 + F'_3$
			\item We define $f_{imbalance}(v)$ to be $f_{imbalance}(v) = |f_{in}(v) - f_{out}(v)|$
			\item We define $f_{excess}(v)$ to be $f_{excess}(v) = f_{through}(v) - \max{\{f_{in}(v),f_{out}(v)\}}$
		\end{enumerate}
		With this information, we fix the flow $F'_1 + F'_2 + F'_3$ as follows. For each vertex $v \in V(\tau_P)$, 
		\begin{enumerate}
			\item If $f_{in}(v) > f_{out}(v)$ then we create a vertex $v_{supplemental,out}$ and an edge from $v_{out}$ to $v_{supplemental,out}$ with capacity $f_{imbalance}(v)$ and we route $f_{imbalance}(v)$ of flow along this edge. We then add $v_{supplemental,out}$ to a set of vertices $V_{supplemental}$.
			\item If $f_{in}(v) < f_{out}(v)$ then we create a vertex $v_{supplemental,in}$ and an edge from $v_{supplemental,in}$ to $v_{in}$ with capacity $f_{imbalance}(v)$ and we route $f_{imbalance}(v)$ of flow along this edge. We then add $v_{supplemental,in}$ to a set of vertices $U_{supplemental}$.
			\item We reduce the flow on the edge from $v_{in}$ to $v_{out}$ by $f_{excess}(v)$
		\end{enumerate}
		We call the resulting flow $F'$
	\end{definition}
	\begin{proposition}
		$F'$ is a flow from $U_{H^{\emptyset,I_{LM} \cup I_{LR} \cup I_{LMR}}_{\gamma}} \cup U_{supplemental}$ to $V_{H^{I_{MR} \cup I_{LR} \cup I_{LMR},\emptyset}_{{\gamma'}^T}} \cup V_{supplemental}$ with value $val(F') = val(F'_1) + val(F'_2) + val(F'_3) - \sum_{v \in V(\tau_P)}{f_{excess}(v)}$
	\end{proposition}
	\begin{corollary}\label{fixedflowcorollary}
		There exists a flow $F''$ from $U_{H^{\emptyset,I_{LM} \cup I_{LR} \cup I_{LMR}}_{\gamma}}$ to $V_{H^{I_{MR} \cup I_{LR} \cup I_{LMR},\emptyset}_{{\gamma'}^T}}$ with value $val(F'') \geq val(F'_1) + val(F'_2) + val(F'_3) - \sum_{v \in V(\tau_P)}{(f_{excess}(v) + f_{imbalance}(v))}$
	\end{corollary}
	\begin{proof}
		Consider the minimum cut $C$ between $U_{H^{\emptyset,I_{LM} \cup I_{LR} \cup I_{LMR}}_{\gamma}}$ and $V_{H^{I_{MR} \cup I_{LR} \cup I_{LMR},\emptyset}_{{\gamma'}^T}}$. If we add all of the supplemental edges to $C$ then this gives a cut $C'$ between $U_{H^{\emptyset,I_{LM} \cup I_{LR} \cup I_{LMR}}_{\gamma}}$ and $V_{H^{I_{MR} \cup I_{LR} \cup I_{LMR},\emptyset}_{{\gamma'}^T}}$ with capacity 
		\[
		capacity(C') = capacity(C) + \sum_{v \in V(\tau_P)}{f_{imbalance}(v)} \geq val(F')
		\]
		Thus, $capacity(C) \geq val(F') - \sum_{v \in V(\tau_P)}{f_{imbalance}(v)}$ so there exists a flow $F''$ from $U_{H^{\emptyset,I_{LM} \cup I_{LR} \cup I_{LMR}}_{\gamma}}$ to $V_{H^{I_{MR} \cup I_{LR} \cup I_{LMR},\emptyset}_{{\gamma'}^T}}$ with value 
		\[
		val(F'') = capacity(C) \geq val(F'_1) + val(F'_2) + val(F'_3) - \sum_{v \in V(\tau_P)}{(f_{excess}(v) + f_{imbalance}(v))}
		\]
	\end{proof}
	We now make the following observations:
	\begin{lemma} \ 
		\begin{enumerate}
			\item For all vertices $v \notin I_{LM} \cup I_{MR} \cup I_{LR} \cup I_{LMR}$, $f_{excess}(v) = f_{imbalance}(v) = 0$ (and these vertices can never be isolated).
			\item For all vertices $v \in I_{LM}$, $f_{excess}(v) + f_{imbalance}(v) \leq w(v)$. Moreover, for all vertices $v \in I_{LM}$ which are isolated, $f_{excess}(v) = f_{imbalance}(v) = 0$.
			\item For all vertices $v \in I_{MR}$, $f_{excess}(v) + f_{imbalance}(v) \leq w(v)$. Moreover, for all vertices $v \in I_{MR}$ which are isolated, $f_{excess}(v) = f_{imbalance}(v) = 0$.
			\item For all vertices $v \in I_{LR}$, $f_{excess}(v) + f_{imbalance}(v) \leq w(v)$. Moreover, for all vertices $v \in I_{LR}$ which are isolated, $f_{excess}(v) = f_{imbalance}(v) = 0$.
			\item For all vertices $v \in I_{LMR}$, $f_{excess}(v) + f_{imbalance}(v) \leq 2w(v)$. Moreover, for all vertices $v \in I_{LMR}$ which are isolated, $f_{excess}(v) = w(v)$ and $f_{imbalance}(v) = 0$.
		\end{enumerate}
	\end{lemma}
	\begin{proof}
		For the first statement, observe that for vertices $v \notin I_{LM} \cup I_{MR} \cup I_{LR} \cup I_{LMR}$, neither $v_{in}$ nor $v_{out}$ is ever a sink or source so the flow into these vertices must equal the flow out of these vertices and thus $f_{in}(v) = f_{out}(v) = f_{through}(v)$.
		
		For the second statement, observe that for a vertex $v \in I_{LM}$, 
		\begin{enumerate}
			\item $F'_1$ will have a flow of $f_{in}(v)$ into $v_{in}$ and along the edge from $v_{in}$ to $v_{out}$ 
			\item $F'_2$ will have a flow of $f_{out}(v)$ along the edge from $v_{in}$ to $v_{out}$ and out of $v_{out}$.
		\end{enumerate}
		Thus, $f_{excess}(v) = f_{in}(v) + f_{out}(v) - \max\{f_{in}(v),f_{out}(v)\}$. Since $f_{imbalance}(v) = |f_{in}(v) - f_{out}(v)|$, 
		$f_{excess}(v) + f_{imbalance}(v) = f_{in}(v) + f_{out}(v) - \min\{f_{in}(v),f_{out}(v)\} \leq w(v)$.
		
		If $v$ is isolated then neither $F'_1$ nor $F'_2$ can have any flow to $v_{in}$ or out of $v_{out}$ so $f_{in}(v) = f_{through}(v) = f_{out}(v) = 0$
		
		The third and fourth statements can be proved in the same way as the second statement.
		
		For the fifth statement, observe that for a vertex $v \in I_{LMR}$, 
		\begin{enumerate}
			\item $F'_1$ will have a flow of $f_{in}(v)$ into $v_{in}$ and along the edge from $v_{in}$ to $v_{out}$.
			\item $F'_2$ will have a flow of $w(v)$ along the edge from $v_{in}$ to $v_{out}$
			\item $F'_3$ will have a flow of $f_{out}(v)$ along the edge from $v_{in}$ to $v_{out}$ and out of $v_{out}$.
		\end{enumerate}
		Thus, $f_{excess}(v) = w(v) + f_{in}(v) + f_{out}(v) - \max\{f_{in}(v),f_{out}(v)\}$. Since $f_{imbalance}(v) = |f_{in}(v) - f_{out}(v)|$, 
		$f_{excess}(v) + f_{imbalance}(v) = w(v) + f_{in}(v) + f_{out}(v) - \min\{f_{in}(v),f_{out}(v)\} \leq 2w(v)$.
		
		If $v$ is isolated then neither $F'_1$ nor $F'_3$ can have any flow to $v_{in}$ or out of $v_{out}$ so $f_{in}(v) = f_{out}(v) = 0$ and $f_{through}(v) = w(v)$.
	\end{proof}
	Putting everything together, we have the following corollary:
	\begin{corollary} \ 
		\[
		\sum_{v \in V(\tau_P)}{(f_{excess}(v) + f_{imbalance}(v))} \leq w(I_{LM}) + w(I_{LR}) + w(I_{MR}) + 2w(I_{LMR}) - (w(I_{\tau_P}) - w(I_{\tau}))
		\]
	\end{corollary}
	Combining this with Corollary \ref{fixedflowcorollary},
	\begin{align*}
		w(S_{\tau_P,min}) &\geq val(F'_1) + val(F'_2) + val(F'_3) - \sum_{v \in V(\tau_P)}{(f_{excess}(v) + f_{imbalance}(v))}\\
		&\geq w(U_{\gamma}) + w(S_{\tau,min}) + w(V_{{\gamma'}^T}) - w(I_{LM}) - w(I_{LR}) - w(I_{MR}) - 2w(I_{LMR}) + (w(I_{\tau_P}) - w(I_{\tau}))
	\end{align*}
	Since $w(V(\tau_P)) = w(V(\tau)) + w(V(\gamma)) + w(V(\gamma')) - w(I_{LM}) - w(I_{LR}) - w(I_{MR}) - 2w(I_{LMR})$, 
	\[
	w(S_{\tau_P,min}) \geq w(U_{\gamma}) + w(S_{\tau,min}) + w(V_{{\gamma'}^T}) + w(V(\tau_P)) - w(V(\tau)) - w(V(\gamma)) - w(V(\gamma')) + (w(I_{\tau_P}) - w(I_{\tau}))
	\]
	Rearranging this gives 
	\[
	w(V(\tau_P)) - w(S_{\tau_P,min}) + w(I_{\tau_P}) \leq w(V(\tau)) - w(S_{\tau,min}) + w(I_{\tau}) + w(V(\gamma) \setminus U_{\gamma}) + w(V(\gamma') \setminus U_{\gamma'})
	\]
	which is the generalized intersection tradeoff lemma.
\end{proof}

\subsubsection{Additional Parameters for the General Case*}
In the general case we will need a few additional parameters which we define here.
\begin{definition} \
	\begin{enumerate}
		\item We define $k$ to be the arity of the hyperedges corresponding to the input.
		\item We define $t_{max}$ to be the number of different types of indices. We define $n_i$ to be the number of possibilities for indices of type $i$ and we define $n = \max{\{n_i: i \in [t_{max}]\}}$.
	\end{enumerate}
\end{definition}
\subsection{Indices, Input Entries, Vertices, and Edges}
Note: For this section, we use $X$ to denote the input, we use $x$ to denote entries of the input and we use $y$ to denote solution variables.
\begin{definition}[Vertices: Simplified Case]
	When the input and solution variables are indexed by one type of index which takes values in $[n]$ then we represent the index $i$ by a vertex labeled $i$.

	If we want to leave an index unspecified, we instead represent it by a vertex labeled with a variable (we will generally use $u$, $v$, or $w$ for these variables).
\end{definition}
\begin{definition}[Vertices: General Case*]
	When the input and solution variables are indexed by several types of indices where indices of type $t$ take values in $[n_t]$, we represent an index of type $t$ with value $i$ as a vertex labeled by the tuple $(t,i)$. We say that such a vertex has type $t$.

	If we want to leave an index of type $t$ unspecified, we instead represent it by a vertex labeled with a tuple $(t,?)$ where $?$ is a variable (which will generally be $u$, $v$, or $w$).
\end{definition}
\begin{definition}[Edges: Simplified Case]
	When the input is $X \in \{-1,+1\}^{\binom{n}{2}}$, we represent the entries of the input by the undirected edges $\{(i,j): i < j \in [n]\}$. Given an edge $e = (i,j)$, we take $x_e = x_{ij}$ to be the input entry corresponding to $e$.
\end{definition}
\begin{definition}[Edges: General Case*]
	In general, we represent the entries of the input by hyperedges whose form depends on the nature of the input. We still take $x_e$ to be the input entry corresponding to $e$.
\end{definition}
\begin{example}
	If the input is an $n_1 \times n_2$ matrix $X$ then we will have two types of indices, one for the row and one for the column. Thus, we will have the vertices $\{(1,i): i \in [n_1]\} \cup \{(2,j): j \in [n_2]\}$. In this case, we have an edge $((1,i),(2,j))$ for each entry $x_{ij}$ of the input.
\end{example}
\begin{example}
	If the input is an $n \times n$ matrix $X$ which is not symmetric then we only need the indices $[n]$. In this case, we have a directed edge $(i,j)$ for each entry $x_{ij}$ where $i \neq j$. If the entries $x_{ii}$ are also part of the input then we also have loops $(i,i)$ for these entries.
\end{example}
\begin{example}
	If our input is a symmetric $n \times n \times n$ tensor $X$ (i.e. $x_{ijk} = x_{ikj} = x_{jik} = x_{jki} = x_{kij} = x_{kji}$) and $x_{ijk} = 0$ whenever $i,j,k$ are not distinct then we only need the indices $[n]$. In this case, we have an undirected hyperedge $e = (i,j,k)$ for each entry $x_{e} = x_{ijk}$ of the input where $i,j,k$ are distinct.
\end{example}
\begin{example}
	If the input is an $n_1 \times n_2 \times n_3$ tensor $X$ then we will have three types of indices. Thus, we will have the vertices $\{(1,i): i \in [n_1]\} \cup \{(2,j): j \in [n_2]\} \cup \{(3,k): k \in [n_3]\}$. In this case, we have a hyperedge $e = ((1,i),(2,j),(3,k))$ for each entry $x_e = x_{ijk}$ of the input.
\end{example}
\subsection{Matrix Indices and Monomials}
In this subsection, we discuss how our matrices are indexed and how we associate matrix indices with monomials. We also describe the automorphism groups of matrix indices.
\begin{definition}[Matrix Indices: Simplified Case]
	If there is only one type of index and we have the constraints $y^2_i = 1$ or $y^2_i = y_i$ on the solution variables then we define a matrix index $A$ to be a tuple of indices $(a_1,\ldots,a_{|A|})$. We make the following definitions about matrix indices:
	\begin{enumerate}
		\item We associate the monomial $\prod_{j=1}^{|A|}{y_{a_j}}$ to $A$.
		\item We define $V(A)$ to be the set of vertices $\{a_i: i \in [|A|]\}$. For brevity, we will often write $A$ instead of $V(A)$ when it is clear from context that we are referring to $A$ as a set of vertices rather than a matrix index.
		\item We take the automorphism group of $A$ to be $Aut(A) = S_{|A|}$ (the permutations of the elements of $A$)
	\end{enumerate}
\end{definition}
\begin{example}
	The matrix index $A = (4,6,1)$ represents the monomial ${y_4}{y_6}{y_1} = {y_1}{y_4}{y_6}$ and $Aut(A) = S_3$
\end{example}
\begin{remark}
	We take $A$ to be an ordered tuple rather than a set for technical reasons.
\end{remark}
In general, we need a more intricate definition for matrix indices. We start by defining matrix index pieces
\begin{definition}[Matrix Index Piece Definition*]
	We define a matrix index piece $A_i = ((a_{i1},\ldots,a_{i|A_i|}), t_i, p_i)$ to be a tuple of indices $(a_{i1},\ldots,a_{i|A_i|})$ together with a type $t_i$ and a power $p_i$. We make the following definitions about matrix index pieces:
	\begin{enumerate}
		\item We associate the monomial $p_{A_i} = \prod_{j = 1}^{|A_i|}{y^{p_i}_{{t_i}{a_{ij}}}}$ with $A_i$.
		\item We define $V(A_i)$ to be the set of vertices $\{(t_i,a_{ij}): j \in [|A_i|]\}$.
		\item We take the automorphism group of $A_i$ to be $Aut(A_i) = S_{|A_i|}$
		\item We say that $A_i$ and $A_j$ are disjoint if $V(A_i) \cap V(A_j) = \emptyset$ (i.e. $t_i \neq t_j$ or $\{a_{i1},\ldots,a_{i|A_i|}\} \cap \{a_{j1},\ldots,a_{j|A_j|}\} = \emptyset$)
	\end{enumerate}
\end{definition}
\begin{definition}[General Matrix Index Definition*]
	We define a matrix index $A = \{A_i\}$ to be a set of disjoint matrix index pieces. We make the following definitions about matrix indices:
	\begin{enumerate}
		\item We associate the monomial $p_{A} = \prod_{A_i \in A}{p_{A_i}}$ with $A$.
		\item We define $V(A)$ to be the set of vertices $\cup_{A_i \in A}{V(A_i)}$. For brevity, we will often write $A$ instead of $V(A)$ when it is clear from context that we are referring to $A$ as a set of vertices rather than a matrix index.
		\item We take the automorphism group of $A$ to be $Aut(A) = \prod_{A_i \in A}{Aut(A_i)}$
	\end{enumerate}
\end{definition}
\begin{example}[*]
	If $A_1 = ((2),1,1)$, $A_2 = ((3,1),1,2)$, and $A_3 = ((1,2,3),2,1)$ then $A = \{A_1,A_2,A_3\}$ represents the monomial $p = {y_{12}}{y^2_{13}}{y^2_{11}}{y_{21}}{y_{22}}{y_{23}}$ and we have $Aut(A) = S_1 \times S_2 \times S_3$
\end{example}
\subsection{Fourier Characters and Ribbons}
A key idea is to analyze Fourier characters of the input.
\begin{definition}[Simplified Fourier Characters]
	If the input distribution is $\Omega = \{-1,1\}$ then given a multi-set of edges $E$, we define $\chi_{E}(X) = \prod_{e \in E}{x_e}$.
\end{definition}
\begin{example}
	If the input is a graph $G \in \{-1,1\}^{\binom{n}{2}}$ and $E$ is a set of potential edges of $G$ (with no multiple edges) then $\chi_E(G) = (-1)^{|E \setminus E(G)|}$.
\end{example}
In general, the Fourier characters are somewhat more complicated.
\begin{definition}[Orthonormal Basis for ${\Omega}$*]
	We define the polynomials $\{h_i: i \in \mathbb{Z} \cap [0,|supp(\Omega)|-1]\}$ to be the unique polynomials (which can be found through the Gram-Schmidt process) such that
	\begin{enumerate}
		\item $\forall i, E_{\Omega}[h^2_i(x)] = 1$
		\item $\forall i \neq j, E_{\Omega}[h_i(x)h_j(x)] = 0$
		\item For all $i$, the leading coefficient of $h_i(x)$ is positive.
	\end{enumerate}
\end{definition}
\begin{example}\label{example: hermite_basis}
	If $\Omega$ is the normal distribution then the polynomials $\{h_i\}$ are the Hermite polynomials with the appropriate normalization so that for all $i$, $E_{\Omega}[h^2_i(x)] = 1$. In particular, $h_0(x) = 1$, $h_1(x) = x$, $h_2(x) = \frac{x^2 - 1}{\sqrt{2!}}$, $h_3(x) = \frac{x^3 - 3x}{\sqrt{3!}}$, etc.
\end{example}
\begin{definition}[General Fourier Characters*]
	Given a multi-set of hyperedges $E$, each of which has a label $l(e) \in [|support(\Omega)|-1]$ (or $\mathbb{N}$ if $\Omega$ has infinite support), we define $\chi_E = \prod_{e \in E}{h_{l(e)}{(x_e)}}$.

	We say that such a multi-set of hyperedges $E$ is proper if it contains no duplicate hyperedges, i.e. it is a set (though the labels on the hyperedges can be arbitrary non-negative integers). Otherwise, we say that $E$ is improper.
\end{definition}
\begin{remark}
	The Fourier characters are $\{\chi_{E}: E \text{ is proper}\}$. For improper $E$, $\chi_{E}$ can be decomposed as a linear combination of $\chi_{E_j}$ where each $E_j$ is proper. We allow improper $E$ because it is sometimes more convenient to have improper $E$ in the middle of the analysis and then do this decomposition at the end.
\end{remark}
\begin{definition}[Ribbons]\label{def: ribbons}
	A ribbon $R$ is a tuple $(H_R,A_R,B_R)$ where $H_R$ is a multi-graph (*or multi-hypergraph with labeled edges in the general case) whose vertices are indices of the input and $A_R$ and $B_R$ are matrix indices such that $V(A_R) \subseteq V(H_R)$ and $V(B_R) \subseteq V(H_R)$. We make the following definitions about ribbons:
	\begin{enumerate}
		\item We define $V(R) = V(H_R)$ and $E(R) = E(H_R)$
		\item We define $\chi_R = \chi_{E(R)}$.
		\item We define $M_R$ to be the matrix such that $(M_R)_{{A_R}{B_R}} = \chi_R$ and $(M_R)_{AB} = 0$ whenever $A \neq A_R$ or $B \neq B_R$.
	\end{enumerate}
	We say that $R$ is a proper ribbon if $H_R$ contains no isolated vertices outside of $A_R \cup B_R$ and $E(R)$ is proper. If there is an isolated vertex in $(V(R) \setminus A_R) \setminus B_R$ or $E(R)$ is improper then we say that $R$ is an improper ribbon.
\end{definition}
Proper ribbons are useful because they give an orthonormal basis for the space of matrix valued functions.
\begin{definition}[Inner products of matrix functions]
	For a pair of real matrices $M_1,M_2$ of the same dimension, we write $\langle{M_1,M_2}\rangle = tr({M_1}{M_2}^T)$ (i.e. $\langle{M_1,M_2}\rangle$ is the entrywise dot product of $M_1$ and $M_2$). For a pair of matrix-valued functions $M_1, M_2$ (of the same dimensions), we define
	\[
	\langle{M_1,M_2}\rangle = E_{X}\left[\langle{M_1(X),M_2(X)}\rangle\right]
	\]
\end{definition}
\begin{proposition}
	If $R$ and $R'$ are two proper ribbons then $\langle{M_R,M_{R'}}\rangle = 1$ if $R = R'$ and is $0$ otherwise.
\end{proposition}
We have already seen examples of ribbons in earlier chapters, but we provide an example here for convenience.
For more examples, see \cite{AMP20}.

\begin{example}[Simplified Ribbon]
    In \cref{fig: ribbon_shape}, consider the ribbon $R$ as shown. We have $A_R = (1, 3), B_R = (4), V(R) = \{1, 2, 3, 4\}, E_R = \{\{1, 2\}, \{3, 2\}, \{2, 4\}\}$. The Fourier character is $\chi_{E_R} = \chi_{1, 2}\chi_{3, 2}\chi_{2, 4}$. And finally, $M_R$ is a matrix with rows and columns indexed by tuples of length $|A_R| = 2$ and $|B_R| = 1$ respectively, with exactly one nonzero entry $M_R((1, 3), (4)) = \chi_{E_R}$. Succinctly, \[M_R =
    \begin{blockarray}{rl@{}c@{}r}
        & & \makebox[0pt]{column $(4)$} \\[-0.5ex]
        & & \,\downarrow \\[-0.5ex]
        \begin{block}{r(l@{}c@{}r)}
            & \makebox[3.1em]{\Large $0$\bigstrut[t]} & \vdots &\makebox[4.2em]{\Large $0$} \\[-0.2ex]
            \text{row }(1, 3) \rightarrow \mkern-9mu & \raisebox{0.5ex}{\makebox[3.2em][l]{\dotfill}} & \chi_{1, 2}\chi_{3, 2}\chi_{2, 4} & \raisebox{0.5ex}{\makebox[4.2em][r]{\dotfill}} \\[+0ex]
            & \makebox[3.1em]{\Large $0$} & \vdots &\makebox[4.2em]{\bigstrut\Large $0$} \\
        \end{block}
    \end{blockarray}\]
\end{example}

\begin{figure}[!h]
    \centering
    \includegraphics[scale=.6, trim={0 5cm 2 5cm},clip]{machinery/images/ribbon_shape}
    \caption{Example of a ribbon and a shape}
    \label{fig: ribbon_shape}
\end{figure}

\subsection{Shapes}
In this subsection, we describe a basis for $S$-invariant matrix valued functions where each matrix in this basis can be described by a relatively small \emph{shape} $\alpha$. The fundamental idea behind shapes is that we keep the structure of the objects we are working with but leave the elements of the object unspecified.
\subsubsection{Simplified Index Shapes}
\begin{definition}[Simplified Index shapes]
	With our simplifying assumptions, an index shape $U$ is a tuple of unspecified indices $(u_1,\cdots,u_{|U|})$. We make the following definitions about index shapes:
	\begin{enumerate}
		\item We define $V(U)$ to be the set of vertices $\{u_i: i \in [|U|]\}$. For brevity, we will often write $U$ instead of $V(U)$ when it is clear from context that we are referring to $U$ as a set of vertices rather than an index shape.
		\item We define the weight of $U$ to be $w(U) = |U|$.
		\item We take the automorphism group of $U$ to be $Aut(U) = S_{|U|}$ (the permutations of the elements of $U$)
	\end{enumerate}
\end{definition}
\begin{definition}
	We say that a matrix index $A = (a_1,\ldots,a_{|A|})$ has index shape $U = (u_1,\ldots,u_{|U|})$ if $|U| = |A|$. Note that in this case, if we take the map $\varphi: \{u_j: j \in [|U|]\} \rightarrow [n]$ where $\varphi(u_j) = a_j$ then $\varphi(U) = (\varphi(u_1),\ldots,\varphi(u_{|U|})) = (a_1,\ldots,a_{|A|}) = A$
\end{definition}
\begin{definition}
	We say that index shapes $U = (u_1,\ldots,u_{|U|})$ and $V = (v_1,\ldots,v_{|V|})$ are equivalent (which we write as $U \equiv V$) if $|U| = |V|$. If $U \equiv V$ then we can set $U = V$ by setting $v_j = u_j$ for all $j \in [|U|]$.
\end{definition}
\begin{example}
	The matrix index $A = (4,6,1)$ has shape $U = (u_1,u_2,u_3)$ which has weight $3$.
\end{example}
\subsubsection{General Index Shapes*}
In general, we define general index shapes in the same way that we defined general matrix indices (just with unspecified indices)
\begin{definition}[Index Shape Piece Definition]
	We define an index shape piece $U_i = ((u_{i1},\ldots,u_{i|U_i|}), t_i, p_i)$ to be a tuple of indices $(u_{i1},\ldots,u_{i|U_i|})$ together with a type $t_i$ and a power $p_i$. We make the following definitions about index shape pieces:
	\begin{enumerate}
		\item We define $V(U_i)$ to be the set of vertices $\{(t_i,u_{ij}): j \in [|U_i|]\}$.
		\item We define $w(U_i) = |U_i|\log_{n}(n_{t_i})$
		\item We take the automorphism group of $U_i$ to be $Aut(U_i) = S_{|U_i|}$
	\end{enumerate}
\end{definition}
\begin{definition}[General Index Shape Definition]
	We define an index shape $U = \{U_i\}$ to be a set of index shape pieces such that for all $i' \neq i$, either $t_{i'} \neq t_i$ or $p_{i'} \neq p_i$. We make the following definitions about index shapes:
	\begin{enumerate}
		\item We define $V(U)$ to be the set of vertices $\cup_{U_i \in U}{V(U_i)}$. For brevity, we will often write $U$ instead of $V(U)$ when it is clear from context that we are referring to $U$ as a set of vertices rather than an index shape.
		\item We define $w(U)$ to be $w(U) = \sum_{U_i \in U}{w(U_i)}$
		\item We take the automorphism group of $U$ to be $Aut(U) = \prod_{U_i \in U}{Aut(U_i)}$
	\end{enumerate}
\end{definition}
\begin{remark}
	For technical reasons, we want to ensure that if two index shapes $U$ and $U'$ have the same weight then $U$ and $U'$ have the same number of each type of vertex. To ensure this, we add an infinitesimal perturbation to each $n_i$ if necessary.
\end{remark}
\begin{definition}
	We say that a matrix index $A$ has index shape $U$ if there is an assignment of values to the unspecified indices of $U$ which results in $A$. More precisely, we say that $A$ has index shape $U$ if there is a map $\varphi: \{u_{ij}\} \rightarrow \mathbb{N}$ such that if we define $\varphi(U_i)$ to be $\varphi(U_i) = ((\varphi(u_{i1}),\ldots,\varphi(u_{i|U_i|})),t_i,p_i)$ then $\varphi(U) = \{\varphi(U_i)\} = \{A_i\} = A$.
\end{definition}
\begin{definition}
	If $U$ and $V$ are two index shapes, we say that $U$ is equivalent to $V$ (which we write as $U \equiv V$) if $U$ and $V$ have the same number of index shape pieces and we can order the index shape pieces of $U$ and $V$ so that writing $U = \{U_i\}$ and $V = \{V_i\}$ where $U_i = ((u_{i1},\ldots,u_{i|U_i|}), t_i, p_i)$ and $V_{i} = ((v_{i1},\ldots,v_{i|V_i|}), t'_i, p'_i)$, we have that for all $i$, $|V_i| = |U_i|$, $t'_i = t_i$, and $p'_i = p_i$. If $U \equiv V$ then we can set $U = V$ by setting $u_{ij} = v_{ij}$ for all $i$ and all $j \in [|U_i|]$.
\end{definition}
\subsubsection{Ribbon Shapes}
With these definitions, we are now ready to define shapes and the matrices associated to them.
\begin{definition}[Shapes]\label{def: shapes}
	A ribbon shape $\alpha$ (which we call a shape for brevity) is a tuple $\alpha = (H_{\alpha},U_{\alpha},V_{\alpha})$ where $H_{\alpha}$ is a multi-graph (*or multi-hypergraph with labeled edges in the general case) whose vertices are unspecified distinct indices of the input (*whose type is specified in the general case) and $U_{\alpha}$ and $V_{\alpha}$ are index shapes such that $V(U_{\alpha}) \subseteq V(H_\alpha)$ and $V(V_{\alpha}) \subseteq V(H_\alpha)$. We make the following definitions about shapes:
	\begin{enumerate}
		\item We define $V(\alpha) = V(H_{\alpha})$ (note that $V(\alpha)$ and $V_{\alpha}$ are not the same thing) and we define $E(\alpha) = E(H_{\alpha})$.
		\item We say that a shape $\alpha$ is proper if it contains no isolated vertices outside of $V(U_{\alpha}) \cup V(V_{\alpha})$, $E(\alpha)$ has no multiple edges/hyperedges and edges in $E(\alpha)$ do not have label $0$. If there is an isolated vertex in $V(\alpha) \setminus V(U_{\alpha}) \setminus V(V_{\alpha})$ or $E(\alpha)$ has a multiple edge/hyperedge then we say that $\alpha$ is an improper shape.
	\end{enumerate}
	Note: For brevity, we will often write $U_{\alpha}$ and $V_{\alpha}$ instead of $V(U_{\alpha})$ and $V(V_{\alpha})$ when it is clear from context that we are referring to $U_{\alpha}$ and $V_{\alpha}$ as sets of vertices rather than index shapes.
\end{definition}
\begin{definition}[Trivial shapes]
	We say that a shape $\alpha$ is trivial if $V(\alpha) = V(U_{\alpha}) = V(V_{\alpha})$ and $E(\alpha) = \emptyset$. Otherwise, we say that $\alpha$ is non-trivial.
\end{definition}
\begin{remark}
	Note that all trivial shapes can do is permute the order of the vertices in $V(U_{\alpha}) = V(V_{\alpha})$.
\end{remark}
\begin{definition}
	Informally, we say that a ribbon $R$ has shape $\alpha$ if replacing the indices in $R$ with unspecified labels results in $\alpha$. Formally, we say that $R$ has shape $\alpha$ if there is an injective mapping $\varphi:V(\alpha) \rightarrow [n]$ (*or $[t_{max}] \times [n]$ in the general case) such that $\varphi(\alpha) = R$, i.e. $\varphi(H_{\alpha}) = H_R$, $\varphi(U_{\alpha}) = A_R$, and $\varphi(V_{\alpha}) = B_R$
\end{definition}
\begin{definition}
	We say that two shapes $\alpha$ and $\beta$ are equivalent (which we write as $\alpha \equiv \beta$) if they are the same up to renaming their indices. More precisely, we say that $\alpha \equiv \beta$ if there is a bijective map $\pi: V(H_\alpha) \rightarrow V(H_\beta)$ such that $\pi(H_\alpha) = H_{\beta}$, $\pi(U_{\alpha}) = U_{\beta}$, and $\pi(V_{\alpha}) = V_{\beta}$.
\end{definition}
\begin{definition}
	Given a shape $\alpha$ and matrix indices $A,B$ of shapes $U_\alpha$ and $V_\alpha$ respectively, we define $\mathcal{R}(\alpha,A,B)$ to be the set of ribbons $R$ such that $R$ \emph{has shape $\alpha$}, $A_R = A$, and $B_R = B$.
\end{definition}
\begin{definition}
	For a shape $\alpha$, we define the matrix-valued function $M_\alpha$ to have entries $M_{\alpha}(A,B)$ given by
	\[
	(M_\alpha)_{A,B}(X) = \sum_{R \in \mathcal{R}(\alpha, A,B )} \chi_R(X)
	\]
\end{definition}

We have seen examples of shapes in earlier chapters, but we provide one here for convenience.
For more examples, see \cite{AMP20}.

\begin{example}[Simplified Shape]
    In \cref{fig: ribbon_shape}, consider the shape $\alpha$ as shown. We have $U_{\alpha} = (u_1, u_2), V_{\alpha} = (v_1), V(\alpha) = \{u_1, u_2, v_1, w_1\}$ and $E(\alpha) = \{\{u_1, w_1\}, \{u_2, w_1\}, \{w_1, v_1\}\}$. $M_{\alpha}$ is a matrix with rows and columns indexed by tuples of length $|U_{\alpha}| = 2$ and $|V_{\alpha}| = 1$ respectively. The nonzero entries will have rows and columns indexed by $(a_1, a_2)$ and $b_1$ respectively for all distinct $a_1, a_2, b_1$, with the corresponding entry being $M_{\alpha}((a_1, a_2), (b_1)) = \sum_{c_1 \in [n] \setminus \{a_1, a_2, b_1\}} \chi_{a_1, c_1}\chi_{a_2, c_1} \chi_{c_1, b_1}$. Here, the injective map $\varphi$ maps $u_1, u_2, w_1, v_1$ to $a_1, a_2, c_1, b_1$ respectively and we sum over all such maps. Succinctly, \[M_{\alpha} =
    \begin{blockarray}{rl@{}c@{}r}
        & & \makebox[0pt]{column $(b_1)$} \\[-0.5ex]
        & & \,\downarrow \\[-0.5ex]
        \begin{block}{r(l@{}c@{}r)}
            &  & \vdots & \\[-0.2ex]
            \text{row }(a_1, a_2) \rightarrow \mkern-9mu & \raisebox{0.5ex}{\makebox[3.2em][l]{\dotfill}} & \sum_{c_1 \in [n] \setminus \{a_1, a_2, b_1\}} \chi_{a_1, c_1}\chi_{a_2, c_1} \chi_{c_1, b_1} & \raisebox{0.5ex}{\makebox[4.2em][r]{\dotfill}} \\[+.5ex]
            &  & \vdots & \\
        \end{block}
    \end{blockarray}\]
\end{example}

\begin{proposition}
    The $M_\alpha$'s for proper shapes $\alpha$ are an orthogonal basis for the $S$-invariant functions.\footnote{
        Because of orthogonality of the underlying Fourier characters, it is not hard to check that when $\alpha \neq \alpha'$ and $M_\alpha, M_{\alpha'}$ have the same dimensions, $\langle{M_\alpha, M_{\alpha'}}\rangle = 0$.}
\end{proposition}
\begin{remark}
    Conceptually, one may think of forming an orthonormal basis for this space with the functions $M_\alpha / \sqrt{\langle{M_\alpha, M_\alpha}\rangle}$, but for technical reasons it is easiest to work with these functions without normalizing them to $1$.
    By orthogonality and the fact that every Boolean function is a polynomial, any $S$-invariant matrix-valued function $\Lambda$ is expressible as
    \[
    \Lambda = \sum_{\alpha} \frac{\langle{\Lambda, M_\alpha}\rangle}{\langle{M_\alpha, M_\alpha}\rangle} \cdot M_\alpha
    \]
\end{remark}


\subsection{Composing Ribbons and Shapes}
\begin{definition}[Composing Ribbons]
	We say that ribbons $R_1$ and $R_2$ are composable if $B_{R_1} = A_{R_2}$. Note that this definition is not symmetric so we may have that $R_1$ and $R_2$ are composable but $R_2$ and $R_1$ are not composable.

	We say that $R_1$ and $R_2$ are properly composable if we also have that $V(R_1) \cap V(R_2) = V(B_{R_1}) = V(A_{R_2})$ (there are no unexpected intersections between $R_1$ and $R_2$).

	If $R_1$ and $R_2$ are composable ribbons then we define the composition of $R_1$ and $R_2$ to be the ribbon $R_1 \circ R_2$ such that
	\begin{enumerate}
		\item $A_{R_1 \circ R_2} = A_{R_1}$ and $B_{R_1 \circ R_2} = B_{R_2}$
		\item $V(R_1 \circ R_2) = V(R_1) \cup V(R_2)$
		\item $E(R_1 \circ R_2) = E(R_1) \cup E(R_2)$ (and thus $\chi_{R_1 \circ R_2} = \chi_{R_1}\chi_{R_2}$)
	\end{enumerate}
	We say that ribbons $R_1,\ldots,R_k$ are composable/properly composable if for all $j \in [k-1]$, $R_1 \circ \ldots \circ R_j$ and $R_{j+1}$ are composable/properly composable. If $R_1,\ldots,R_k$ are composable then we define $R_1 \circ \ldots \circ R_k$ to be
	$R_1 \circ \ldots \circ R_k = (R_1 \circ \ldots \circ R_{k-1}) \circ R_k$
\end{definition}
\begin{proposition}
	Ribbon composition is associative, i.e. if $R_1,R_2,R_3$ are composable/properly composable ribbons then $R_2, R_3$ are composable/properly composable, $R_1, (R_2 \circ R_3)$ are composable/properly composable, and $R_1 \circ (R_2 \circ R_3) = (R_1 \circ R_2) \circ R_3$
\end{proposition}
\begin{proposition}
	If $R_1$ and $R_2$ are composable ribbons then $M_{R_1 \circ R_2} = M_{R_1}M_{R_2}$.
\end{proposition}
We have similar definitions for composing shapes.
\begin{definition}[Composing Shapes]
	We say that shapes $\alpha$ and $\beta$ are composable if $U_{\beta} \equiv V_{\alpha}$. Note that this definition is not symmetric so we may have that $\alpha$ and $\beta$ are composable but $\beta$ and $\alpha$ are not composable.

	If $\alpha$ and $\beta$ are composable shapes then we define the composition of $\alpha$ and $\beta$ to be the shape $\alpha \circ \beta$ such that
	\begin{enumerate}
		\item $U_{\alpha \circ \beta} = U_{\alpha}$ and $V_{\alpha \circ \beta} = V_{\beta}$
		\item After setting $U_{\beta} = V_{\alpha}$, we take $V(\alpha \circ \beta) = V(\alpha) \cup V(\beta)$
		\item $E(\alpha \circ \beta) = E(\alpha) \cup E(\beta)$
	\end{enumerate}
	We say that shapes $\alpha_1,\ldots,\alpha_k$ are composable if for all $j \in [k-1]$, $\alpha_1 \circ \ldots \circ \alpha_j$ and $\alpha_{j+1}$ are composable. If $\alpha_1,\ldots,\alpha_k$ are composable then we define the shape $\alpha_1 \circ \ldots \circ \alpha_k$ to be
	$\alpha_1 \circ \ldots \circ \alpha_k = (\alpha_1 \circ \ldots \circ \alpha_{k-1}) \circ \alpha_k$
\end{definition}
\begin{proposition}
	Shape composition is associative, i.e. if $\alpha_1,\alpha_2,\alpha_3$ are composable shapes then $\alpha_2, \alpha_3$ are composable, $\alpha_1, (\alpha_2 \circ \alpha_3)$ are composable, and $\alpha_1 \circ (\alpha_2 \circ \alpha_3) = (\alpha_1 \circ \alpha_2) \circ \alpha_3$
\end{proposition}

\begin{example}
    \cref{fig: basic_shape_comp} illustrates an example of shape composition in the simplified case. Observe how the shapes $\sigma \circ \sigma'^T$ and $\sigma \circ \tau \circ \sigma'^T$ are obtained from the shapes $\sigma, \tau$ and $\sigma'^T$.
\end{example}

\begin{example}
    \cref{fig: shape_comp} illustrates an example of shape composition in the general case. We have two types of vertices that we diagrammatically represent by squares and circles. Observe how the shapes $\sigma \circ \sigma'^T$ and $\sigma \circ \tau \circ \sigma'^T$ are obtained from the shapes $\sigma, \tau$ and $\sigma'^T$.
\end{example}

\begin{figure}[!h]
    \centering
    \includegraphics[scale=0.45, trim={4.5cm 2cm 0 2cm},clip]{machinery/images/basic_shape_comp}
    \caption{Illustration of shape composition and decomposition in the simplified case.}
    \label{fig: basic_shape_comp}
\end{figure}

\begin{figure}[!h]
    \centering
    \includegraphics[scale=0.38, trim={8cm 2cm 0 2cm},clip]{machinery/images/shape_comp}
    \caption{Illustration of shape composition and decomposition in the general case.}
    \label{fig: shape_comp}
\end{figure}

\subsection{Decomposition of Shapes into Left, Middle, and Right parts}
In this subsection, we describe how shapes can be decomposed into left, middle, and right parts based on the leftmost and rightmost \emph{minimum vertex separators}, which is a crucial idea for the analysis.
\begin{definition}[Paths]
	A \emph{path} in a shape $\alpha$ is a sequence of vertices $v_1,\ldots,v_t$ such that for all $i \in [t-1]$, $v_i$ and $v_{i+1}$ are in some edge/hyperedge together.
	A pair of paths is vertex-disjoint if the corresponding sequences of vertices are disjoint.
\end{definition}
\begin{definition}[Vertex separators]
	Let $\alpha$ be a shape and let $U$ and $V$ be sets of vertices in $\alpha$. We say that a set of vertices $S \subseteq V(\alpha)$ is a \emph{vertex separator} of $U$ and $V$ if every path in $\alpha$ from $U$ to $V$ contains at least one vertex in $S$. Note that any vertex separator $S$ of $U$ and $V$ must contain all of the vertices in $U \cap V$.

	As a special case, we say that $S$ is a vertex separator of $\alpha$ if $S$ is a vertex separator of $U_{\alpha}$ and $V_{\alpha}$
\end{definition}
We define the weight of a set of vertices $S \subseteq V(\alpha)$ in the same way that weight is defined for index shapes.
\begin{definition}[Simplified Weight]
	When there is only one type of index, the weight of a set of vertices $S \subseteq V(\alpha)$ is simply $|S|$.
\end{definition}
\begin{definition}[General Weight*]
	In general, given a set of vertices $S \subseteq V(\alpha)$, writing $S = \cup_{t}{S_t}$ where $S_t$ is the set of vertices of type $t$ in $S$, we define the weight of $S$ to be $w(S) = \sum_{t}{|S_t|\log_{n}(n_t)}$
\end{definition}
\begin{remark}[*]
	Again, if necessary, we add an infinitesimal perturbation to $n_1,n_2,\ldots,n_{t_{max}}$ so that if two separators $S$ and $S'$ have the same weight then $S$ and $S'$ have the same number of each type of vertex.
\end{remark}
\begin{definition}[Leftmost and rightmost minimum vertex separators]
	The \emph{leftmost} minimum vertex separator is the vertex separator $S$ of minimum weight such that for every other minimum-weight vertex separator $S'$, $S$ is a separator of $U_\alpha$ and $S'$.
	The \emph{rightmost} minimum vertex separator is the vertex separator $T$ of minimum weight such that for every other minimum-weight vertex separator $T'$, $T$ is a separator of $T'$ and $V_{\alpha}$
\end{definition}
The work \cite{BHKKMP16} showed that under the simplifying assumptions, leftmost and rightmost minimum vertex separators are well defined. For a general proof that leftmost and rightmost minimum vertex separators are well defined, see \cite{potechin2020machinery}.

We now have the following crucial idea. Every shape $\alpha$ can be decomposed into the composition of three composable shapes $\sigma,\tau,{\sigma'}^T$ based on the leftmost and rightmost minimum vertex separators $S,T$ of $\alpha$ together with orderings of $S$ and $T$.
\begin{definition}[Simplified Separators With Orderings]
	Under our simplifying assumptions, given a set of vertices $S \subseteq V(\alpha)$ and an ordering $O_S = s_1,\ldots,s_{|S|}$ of the vertices of $S$, we define the index shape $(S, O_S)$ to be $(S, O_S) = (s_1,\ldots,s_{|S|})$.
\end{definition}
\begin{definition}[General Separators With Orderings*]
	In the general case, we need to give an ordering for each type of vertex. Let $S \subseteq V(\alpha)$ be a subset of the vertices of $\alpha$ and write $S = \cup_{t}{S_t}$ where $S_t$ is the set of vertices in $S$ of type $t$. Given $O_S = \{O_t\}$ where $O_t = s_{t1},\ldots,s_{t|S_t|}$ is an ordering of the vertices of $S_t$, we define the index shape piece $(S_t, O_t)$ to be $(S_t, O_t) = ((s_{t1},\ldots,s_{t|S_t|}), t, 1)$ and we define the index shape $(S,O_S)$ to be $(S,O_S) = \{(S_t,O_t)\}$.
\end{definition}
\begin{proposition}
	The number of possible orderings $O_S$ for $S$ is equal to $|Aut((S,O_S))|$.
\end{proposition}
\begin{definition}[Shape transposes]
	Given a shape $\alpha$, we define $\alpha^{T}$ to be the shape $\alpha$ with $U_{\alpha}$ and $V_{\alpha}$ swapped, i.e. $U_{\alpha^{T}} = V_{\alpha}$ and $V_{\alpha^{T}} = U_{\alpha}$.
\end{definition}
\begin{definition}[Left, middle, and right parts]
	Let $\alpha$ be a shape. Let $S$ and $T$ be the leftmost and rightmost minimum vertex separators of $\alpha$ together with orderings $O_S,O_T$ of $S$ and $T$.
	\begin{itemize}
		\item We define the \emph{left part} $\sigma_{\alpha}$ of $\alpha$ to be the shape such that
		\begin{enumerate}
			\item $H_{\sigma_{\alpha}}$ is the induced subgraph of $H_{\alpha}$ on all of the vertices of $\alpha$ reachable from $U_{\alpha}$ without passing through $S$ (note that $H_{\sigma_{\alpha}}$ includes the vertices of $S$) except that we remove any edges/hyperedges which are contained entirely within $S$.
			\item $U_{\sigma_{\alpha}} = U_{\alpha}$ and $V_{\sigma_{\alpha}} = (S,O_S)$
		\end{enumerate}
		\item We define the \emph{right part} ${\sigma'}^T_{\alpha}$ of $\alpha$ to be the shape such that
		\begin{enumerate}
			\item $H_{{\sigma'}^T_{\alpha}}$ is the induced subgraph of $H_{\alpha}$ on all of the vertices of $\alpha$ reachable from $V_{\alpha}$ without passing through $T$ (note that $H_{{\sigma'}^T_{\alpha}}$ includes the vertices of $T$) except that we remove any edges/hyperedges which are contained entirely within $T$.
			\item $V_{{\sigma'}^T_{\alpha}} = V_{\alpha}$ and $U_{{\sigma'}^T_{\alpha}} = (T,O_T)$
		\end{enumerate}
		\item We define the \emph{middle part} $\tau_{\alpha}$ of $\alpha$ to be the shape such that
		\begin{enumerate}
			\item $H_{\tau_{\alpha}}$ is the induced subgraph of $H_{\alpha}$ on all of the vertices of $\alpha$ which are not reachable from $U_{\alpha}$ and $V_{\alpha}$ without touching $S$ and $T$ (note that $H_{\tau_{\alpha}}$ includes the vertices of $S$ and $T$). $H_{\tau_{\alpha}}$ also includes the hyperedges entirely within $S$ and the hyperedges entirely within $T$.
			\item $U_{\tau_{\alpha}} = (S,O_S)$ and $V_{\tau_{\alpha}} = (T,O_T)$
		\end{enumerate}
	\end{itemize}
\end{definition}

\begin{example}
    \cref{fig: basic_shape_comp} illustrates an example decomposition in the simplified case.
    \begin{enumerate}
        \item If we start with the shape $\alpha$ denoted as $\sigma \circ \sigma'^T$, observe that there is a unique minimum vertex separator, which consists of the middle vertex of degree $5$, i.e. the one that's not in either $U_{\sigma \circ \sigma'^T}$ or $V_{\sigma \circ \sigma'^T}$.
        Then, $\alpha$ is decomposed into the left part $\sigma$, a trivial middle part $\tau$ (not shown in this figure) which has $V(\tau) = \{u\}, U_{\tau} = V_{\tau} = (u), E(\tau) = \emptyset$, and the right part $\sigma'^T$.
        \item If we start with the shape $\alpha$ denoted as $\sigma \circ \tau \circ \sigma'^T$, then the leftmost minimum vertex separator is the vertex of degree $4$ and the rightmost minimum vertex separator is the vertex of degree $5$. Then, $\alpha$ is decomposed into the left part $\sigma$, the middle part $\tau$ and the right part $\sigma'^T$, which are all shown in this figure.
    \end{enumerate}
\end{example}


\begin{example}
    \cref{fig: shape_comp} illustrates an example decomposition in the general case. We have two types of vertices that we diagrammatically represent by squares and circles. In this example, we assume that the set containing a single circle vertex has a lower weight compared to a set of two square vertices.
    \begin{enumerate}
        \item If we start with the shape $\sigma \circ \sigma'^T$, then it can be decomposed uniquely into the composition of the left shape $\sigma$ and the right shape $\sigma'^T$. In this case, the middle shape (not shown in this figure) is trivial.
        \item If we start with the shape $\sigma \circ \tau \circ \sigma'^T$, then it can be decomposed uniquely into the composition of the left shape $\sigma$, the middle shape $\tau$ and the right shape $\sigma'^T$, which are all shown in this figure.
    \end{enumerate}
\end{example}

\begin{proposition}
	If $\sigma,\tau,{\sigma'}^{T}$ are the left, middle, and right parts for $\alpha$ for given orderings $O_S,O_T$ of $S$ and $T$ then $\alpha = \sigma \circ \tau \circ {\sigma'}^T$.
\end{proposition}
\begin{remark}
	One may ask which ordering(s) we should take of $S$ and $T$. The answer is that we will take all of the possible orderings of $S$ and $T$ simultaneously, giving equal weight to each.
\end{remark}
Based on this decomposition and the following claim, we make the following definitions for what it means for a shape to be a left, middle, or right part.
\begin{claim}[Proved in Section 6.1 in \cite{BHKKMP16}]
	\footnote{The proof in \cite{BHKKMP16} only explicitly treats the case when the shapes $\alpha$ are graphs, but the proof easily generalizes to the case when the $\alpha$ are hypergraphs.}
	\begin{itemize}
		\item Every shape $\sigma$ which is the left part of some other shape $\alpha$ has that $V_\sigma$ is its left-most and right-most minimum-weight separator.
		\item Every shape ${\sigma}^T$ which is the right part of some other shape $\alpha$ has that $U_{{\sigma}^T}$ is its left-most and right-most minimum-weight separator.
		\item Every shape $\tau$ which is the middle part of some other shape $\alpha$ has $U_\tau$ as its left-most minimum-weight separator and $V_{\tau}$ as its right-most minimum-weight separator.
	\end{itemize}
\end{claim}
\begin{definition}\label{leftmiddlerightshapedefinitions} \
	\begin{enumerate}
		\item We say that a shape $\sigma$ is a left shape if $\sigma$ is a proper shape, $V_{\sigma}$ is the left-most and right-most minimum-weight separator of $\sigma$, every vertex in $V(\sigma) \setminus V_{\sigma}$ is reachable from $U_{\sigma}$ without touching $V_{\sigma}$, and $\sigma$ has no hyperedges entirely within $V_{\sigma}$.
		\item We say that a shape $\tau$ is a proper middle shape if $\tau$ is a proper shape, $U_{\tau}$ is the left-most minimum-weight separator of $\tau$, and $V_{\tau}$ is the right-most minimum-weight separator of $\tau$. In the analysis, we will also need to consider improper middle shapes $\tau$ which may not be proper shapes and which may have smaller separators between $U_{\tau}$ and $V_{\tau}$.
		\item We say that a shape ${\sigma}^{T}$ is a right shape if ${\sigma}^{T}$ is a proper shape, $U_{{\sigma}^{T}}$ is the left-most and right-most minimum-weight separator of ${\sigma}^{T}$, every vertex in $V({\sigma}^{T}) \setminus U_{{\sigma}^{T}}$ is reachable from $V_{{\sigma}^{T}}$ without touching $U_{{\sigma}^{T}}$, and ${\sigma}^{T}$ has no hyperedges entirely within $U_{{\sigma}^{T}}$.
	\end{enumerate}
\end{definition}
\begin{proposition}
	For all shapes $\sigma$, $\sigma$ is a left shape if and only if $\sigma^{T}$ is a right shape.
\end{proposition}
\begin{remark}
	As the reader has likely guessed, throughout this section we use $\sigma$ to denote left parts and $\tau$ to denote middle parts. Instead of having a separate letter for right parts, we express right parts as the transpose of a left part.
\end{remark}
\subsection{Coefficient matrices}\label{fullcoefficientmatrixsubsection}
We will have that $\Lambda = \sum_{\alpha}{\lambda_{\alpha}M_{\alpha}}$. To analyze $\Lambda$, it is extremely useful to express these coefficients in terms of matrices. To do this, we will need a few more definitions. We start by defining the sets of index shapes that can appear when analyzing $\Lambda$.
\begin{definition}
	Given a moment matrix $\Lambda$, we define the following sets of index shapes.
	\begin{enumerate}
		\item We define $\mathcal{I}(\Lambda) = \{U: \exists \text{ matrix index } A: A \text{ is a row index of } \Lambda, A \text{ has shape } U\}$ to be the set of index shapes which describe row and column indices of $\Lambda$.
		\item We define $w_{max}$ to be $w_{max} = \max{\{w(U):U \in \mathcal{I}(\Lambda)\}}$.
		\item With our simplifying assumptions, we define $\mathcal{I}_{mid}$ to be $\mathcal{I}_{mid} = \{U: |U| \leq w_{max}\}$
		\item[3*.] In general, we define $\mathcal{I}_{mid}$ to be $\mathcal{I}_{mid} = \{U: w(U) \leq w_{max}, \forall U_i \in U, p_i = 1\}$
	\end{enumerate}
\end{definition}
We also need to define the sets of shapes which can appear when analyzing $\Lambda$.
\begin{definition}[Truncation Parameters]
	Given a moment matrix $\Lambda = \sum_{\alpha}{\lambda_{\alpha}M_{\alpha}}$, we define $D_V,D_E$ to be the smallest natural numbers such that for all shapes $\alpha$ such that $\lambda_{\alpha} \neq 0$, decomposing $\alpha$ as $\alpha = \sigma \circ \tau \circ {\sigma'}^T$,
	\begin{enumerate}
		\item $|V(\sigma)| \leq D_V$, $|V(\tau)| \leq D_V$, and $|V(\sigma')| \leq D_V$.
		\item[2*.] For all edges $e \in E(\sigma) \cup E(\tau) \cup E(\sigma')$, $l_e \leq D_E$.
	\end{enumerate}
\end{definition}
\begin{remark}
	Under our simplifying assumptions, all edges have label $1$ so we will take $D_E = 1$ and ignore conditions involving $D_E$.
\end{remark}
\begin{definition}
	Given a moment matrix $\Lambda$, we define the following sets of shapes:
	\begin{enumerate}
		\item $\mathcal{L} = \{\sigma: \sigma \text{ is a left shape}, U_{\sigma} \in \mathcal{I}(\Lambda), V_{\sigma} \in \mathcal{I}_{mid}, |V(\sigma)| \leq D_V, \forall e \in E(\sigma), l_e \leq D_E\}$
		\item Given $V \in \mathcal{I}_{mid}$, we define $\mathcal{L}_V = \{\sigma \in \mathcal{L}: V_{\sigma} \equiv V\}$
		\item Given $U \in \mathcal{I}_{mid}$, we define $\mathcal{M}_U = \{\tau: \tau \text{ is a non-trivial proper middle shape}, U_{\tau} \equiv V_{\tau} \equiv U,
		|V(\tau)| \leq D_V, \forall e \in E(\tau), l_e \leq D_E\}$
	\end{enumerate}
\end{definition}
\begin{definition}
	Given a moment matrix $\Lambda$, we define a $\Lambda$-coefficient matrix (which we call a coefficient matrix for brevity) to be a matrix whose rows and columns are indexed by left shapes $\sigma,\sigma' \in \mathcal{L}$.

	We say that a coefficient matrix $H$ is SOS-symmetric if $H(\sigma,\sigma')$ is invariant under permuting the vertices of $U_{\sigma}$ and permuting the vertices of $U_{\sigma'}$ (*more precisely, for the general case we permute the vertices within each index shape piece of $U_{\sigma}$ and permute the vertices within each index shape piece of $U_{\sigma'}$).
\end{definition}
\begin{definition}
	Given a shape $\tau$, we say that a coefficient matrix $H$ is a $\tau$-coefficient matrix if $H(\sigma,\sigma') = 0$ whenever $V_{\sigma} \not\equiv U_{\tau}$ or $V_{\tau} \not\equiv U_{{\sigma'}^T}$.
\end{definition}
\begin{definition}
	Given an index shape $U$, we define $Id_{U}$ to be the shape with $U_{Id_{U}} = V_{Id_{U}} = U$, no other vertices, and no edges.
\end{definition}
Given a shape $\tau$ and a $\tau$-coefficient matrix $H$, for convenience, we define a matrix-valued function $M^{fact}_{\tau}(H)$ as follows.


\begin{definition}
	Given a shape $\tau$ and a $\tau$-coefficient matrix $H$, define
	\[
	M^{fact}_{\tau}(H) = \sum_{\sigma \in \mathcal{L}_{U_{\tau}},\sigma' \in \mathcal{L}_{V_{\tau}}}{H(\sigma,\sigma')M_{\sigma}M_{\tau}M_{\sigma'}^T}
	\]
\end{definition}

Given a matrix-valued function $\Lambda$, we can associate coefficient matrices to $\Lambda$ as follows:
\begin{definition}
	Given a matrix-valued function $\Lambda = \sum_{\alpha: \alpha \text{ is proper}}{\lambda_{\alpha}M_{\alpha}}$,
	\begin{enumerate}
		\item For each index shape $U \in \mathcal{I}_{mid}$ and every $\sigma,\sigma' \in \mathcal{L}_{U}$, we take $H_{Id_U}(\sigma,\sigma') = \frac{1}{|Aut(U)|}\lambda_{\sigma \circ {\sigma'}^T}$
		\item For each $U \in \mathcal{I}_{mid}$, $\tau \in \mathcal{M}_U$ and $\sigma, \sigma' \in \mathcal{L}_{U}$, we take
		$H_{\tau}(\sigma,\sigma') = \frac{1}{|Aut(U_{\tau})|\cdot|Aut(V_{\tau})|}\lambda_{\sigma \circ \tau \circ {\sigma'}^T}$
	\end{enumerate}
\end{definition}
\subsection{The $-\gamma,-\gamma$ operation and qualitative theorem statement}
In the intersection term analysis, we will need to further decompose left shapes $\sigma$ as $\sigma = \sigma_2 \circ \gamma$ where $\sigma_2$ and $\gamma$ are themselves left shapes. Accordingly, we make the following definitions
\begin{definition} Given a moment matrix $\Lambda$, we define the following sets of left shapes:
	\begin{enumerate}
		\item $\Gamma = \{\gamma: \gamma \text{ is a non-trivial left shape}, U_{\gamma}, V_{\gamma} \in \mathcal{I}_{mid}, |V(\gamma)| \leq D_V, \forall e \in E(\gamma), l_e \leq D_E\}$
		\item Given $U,V \in \mathcal{I}_{mid}$ such that $w(U) > w(V)$, define $\Gamma_{U,V} = \{\gamma \in \Gamma: U_{\gamma} \equiv U, V_{\gamma} \equiv V\}$.
		\item Given $U \in \mathcal{I}_{mid}$, define $\Gamma_{U,*} = \{\gamma \in \Gamma: U_{\gamma} \equiv U\}$
		\item Given $V \in \mathcal{I}_{mid}$, define $\Gamma_{*,V} = \{\gamma \in \Gamma: V_{\gamma} \equiv V\}$
	\end{enumerate}
	\begin{remark}
		Under our simplifying assumptions, $\Gamma$ is the same as $\mathcal{L}$ except that $\Gamma$ excludes the trivial shapes. In general, while $\mathcal{L}$ requires that $U_{\sigma} \in \mathcal{I}(\Lambda)$, $\Gamma$ requires that $U_{\gamma} \in \mathcal{I}_{mid}$. Note that $\mathcal{I}(\Lambda)$ and $\mathcal{I}_{mid}$ may be incomparable because
		\begin{enumerate}
			\item There may be index shapes $U \in \mathcal{I}_{mid}$ such that no matrix index of $\Lambda$ has shape $U$.
			\item All index shape pieces $U_i$ for index shapes $U \in \mathcal{I}_{mid}$ must have $p_i = 1$ while this is not the case for $\mathcal{I}(\Lambda)$.
		\end{enumerate}
	\end{remark}
\end{definition}
We now state the theorem qualitatively after giving one more definition.
\begin{definition}
	Given a shape $\tau$, left shapes $\gamma \in {\Gamma}_{*,U_{\tau}}$ and $\gamma' \in {\Gamma}_{*,V_{\tau}}$, and a $\tau$-coefficient matrix $H$, define $H^{-\gamma,\gamma'}$ to be the $(\gamma \circ \tau \circ {\gamma'}^T)$-coefficient matrix with entries
	\begin{enumerate}
		\item $H^{-\gamma,\gamma'}(\sigma,\sigma') = H(\sigma \circ \gamma,\sigma' \circ \gamma')$ if $|V(\sigma \circ \gamma)| \leq D_V$ and $|V(\sigma' \circ \gamma')| \leq D_V$.
		\item $H^{-\gamma,\gamma'}(\sigma,\sigma') = 0$ if $|V(\sigma \circ \gamma)| > D_V$ or $|V(\sigma' \circ \gamma')| > D_V$.
	\end{enumerate}
\end{definition}
\begin{remark}
	For the theorem, we will only need the case when $\gamma' = \gamma$
\end{remark}
The qualitative theorem statement is as follows:
\begin{theorem}\label{thm:mainqualitative}
	Let $\Lambda = \sum_{U \in \mathcal{I}_{mid}}{M^{orth}_{Id_U}(H_{Id_U})} + \sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{M^{orth}_{\tau}(H_{\tau})}}$ be an SOS-symmetric matrix valued function.

	There exist functions $f(\tau)$ and $f(\gamma)$ depending on $n$ and other parameters such that if the following conditions hold:
	\begin{enumerate}
		\item For all $U \in \mathcal{I}_{mid}$,  $H_{Id_{U}} \succeq 0$
		\item For all $U \in \mathcal{I}_{mid}$ and all $\tau \in \mathcal{M}_{U}$,
		\[
		\left[ {\begin{array}{cc}
				H_{Id_{U}} & f(\tau)H_{\tau} \\
				f(\tau)H^T_{\tau} & H_{Id_{U}}
		\end{array}} \right] \succeq 0
		\]
		\item For all $U,V \in \mathcal{I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U,V}$, $H^{-\gamma,\gamma}_{Id_{V}} \preceq f(\gamma)H_{Id_{U}}$
	\end{enumerate}
	then with high probability $\Lambda \succeq 0$
\end{theorem}
\begin{remark}
	Roughly speaking, conditions 1 and 2 give us an approximate PSD decomposition for the moment matrix $M$. Condition 3 comes from the intersection term analysis, which is the most technically intensive part of the proof.
\end{remark}
\begin{remark}
    As we will demonstrate in this work, the machinery works well when the coefficients $\lambda_{\alpha}$ have a polynomial decay for each vertex or edge in the shape. In many settings, this can be done quite easily by adding noise to the distribution, such as resampling part of the input, or by lowering the parameters slightly, such as $m \le n^{k/4 - \varepsilon}$ instead of $m \le n^{k/4}$.
   
\end{remark}
\subsubsection{Choice of functions $f(\tau)$ and $f(\gamma)$ in the simplified case}

We will describe some intuition for the functions $f(\tau), f(\gamma)$. For simplicity, consider the simplified case.

In a rough sense, $f(\tau)$ measures the blow-up in the norm by using $M_{\tau}$ instead of $M_{Id_U}$ in the corresponding term of the Fourier decomposition. So we choose $f(\tau)$ to be $\norm{M_{\tau}}$, up to lower order terms. Our second condition verifies that the coefficients that arise because of this $\tau$ (which are encoded in $H_{\tau}$) are sufficiently small to overpower this norm blowup.

The fact that $\norm{M_{\tau}}$ is equal to $\tilde{O}(n^{\frac{|V(\tau)|-|U_{\tau}|}{2}})$ has been shown in previous works \cite{BHKKMP16, AMP20}. So, we choose $f(\tau)$ to be $\tilde{O}(n^{\frac{|V(\tau)|-|U_{\tau}|}{2}})$ where the problem instance is on $G_{n, 1/2}$. For problems with Gaussian or other inputs, similar forms of $f(\tau)$ can be used, which have been shown formally in the work of \cite{AMP20}. When we state the main theorem in general, we use a single $f(\tau)$ that incorporates all of these settings.

$f(\gamma)$ is a bit trickier to describe. In the machinery analysis, we roughly collect intersection terms from the approximate PSD decomposition and charge them to shapes of the form $\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T$. Using the same idea as the previous step, we charge these to shapes with trivial middle shapes. Roughly, $f(\gamma)$ for a fixed $\gamma$ upper bounds the blowup from the norms of the original shape as compared to the new intersection shape. And the third condition argues that the original coefficients are sufficiently small to compensate for these blowups.

For problems on $G_{n, 1/2}$, we set $f(\gamma) = \tilde{O}(n^{|V(\gamma) \setminus U_{\gamma}|})$. For problems with Gaussian inputs, we choose essentially the same function, but they fall under the umbrella of generalized graph matrices, where $V(\gamma)$ and $U_{\gamma}$ are defined accordingly. Indeed, in the main theorem, we encompass both these settings with a single choice of $f(\gamma)$.

\subsection{Quantitative theorem statement}\label{quantitativetheoremstatementsection}
To state the theorem quantitatively, we will need a few more things. First, the conditions of the theorem will involve functions $B_{norm}(\alpha)$, $B(\gamma)$, $N(\gamma)$, and $c(\alpha)$. Roughly speaking, these functions will be used as follows in the analysis:
\begin{enumerate}
	\item $B_{norm}(\alpha)$ will bound the norms of the matrices $M_{\alpha}$
	\item $B(\gamma)$ and $N(\gamma)$ will help us bound the intersection terms.
	\item $c(\alpha)$ will help us sum over the possible $\gamma$ and $\tau$.
\end{enumerate}
Second, for technical reasons it turns out that comparing $H^{-\gamma,\gamma}_{Id_{V_{\gamma}}}$ to $H_{Id_{U_{\gamma}}}$ doesn't quite work. Instead, we compare $H^{-\gamma,\gamma}_{Id_{V_{\gamma}}}$ to a matrix $H'_{\gamma}$ of our choice where $H'_{\gamma}$ is very close to $H_{Id_{U_{\gamma}}}$ ($H'_{\gamma}$ will be the same as $H_{Id_{U_{\gamma}}}$ up to truncation error).
\begin{definition}
	Given a function $B_{norm}(\alpha)$, we define the distance $d_{\tau}(H_{\tau},H'_{\tau})$ between two $\tau$-coefficient matrices $H_{\tau}$ and $H'_{\tau}$ to be
	\[
	d_{\tau}(H_{\tau},H'_{\tau}) = \sum_{\sigma \in \mathcal{L}_{U_{\tau}},\sigma' \in \mathcal{L}_{V_{\tau}}}{|H'_{\tau}(\sigma,\sigma') - H_{\tau}(\sigma,\sigma')|B_{norm}(\sigma)B_{norm}(\tau)B_{norm}(\sigma')}
	\]
\end{definition}
Third, we need an SOS-symmetric analogue of the identity matrix.
\begin{definition}
	We define $Id_{Sym}$ to be the matrix such that
	\begin{enumerate}
		\item The rows and columns of $Id_{Sym}$ are indexed by the matrix indices $A,B$ whose index shape is in $\mathcal{I}(\Lambda)$.
		\item $Id_{Sym}(A,B) = 1$ if $p_A = p_B$ and $Id_{Sym}(A, B) = 0$ if $p_A \neq p_B$.
	\end{enumerate}
\end{definition}
\begin{proposition}
	If $M$ has SOS-symmetry and the rows and columns of $Id_{Sym}$ are indexed by matrix indices $A,B$ whose index shape is in $\mathcal{I}(\Lambda)$ then $M \preceq \norm{M}Id_{Sym}$
\end{proposition}
\begin{corollary}\label{distanceboundingcorollary}
	For all $\tau$ and all SOS-symmetric $\tau$-coefficient matrices $H_{\tau}$ and $H'_{\tau}$,
	\[
	M^{fact}_{\tau}(H'_{\tau}) + M^{fact}_{{\tau}^T}(H'_{{\tau}^T}) - M^{fact}_{\tau}(H_{\tau}) - M^{fact}_{{\tau}^T}(H_{{\tau}^T}) \preceq 2d_{\tau}(H_{\tau},H'_{\tau})Id_{Sym}
	\]
	Note that if $\tau$, $H_{\tau}$ and $H'_{\tau}$ are all symmetric then
	\[
	M^{fact}_{\tau}(H'_{\tau}) - M^{fact}_{\tau}(H_{\tau}) \preceq d_{\tau}(H_{\tau},H'_{\tau})Id_{Sym}
	\]
\end{corollary}
Finally, we need a few more definitions about shapes $\alpha$.
\begin{definition}[$\mathcal{M}'$]
	We define $\mathcal{M}'$ to be the set of all shapes $\alpha$ such that
	\begin{enumerate}
		\item[1.] $|V(\alpha)| \leq 3D_V$
		\item[2.*] $\forall e \in E(\alpha), l_e \leq D_E$
		\item[3.*] All edges $e \in E(\alpha)$ have multiplicity at most $3D_V$.
	\end{enumerate}
\end{definition}
\begin{definition}[$S_{\alpha}$]
	Given a shape $\alpha$, define $S_{\alpha}$ to be the leftmost minimum vertex separator of $\alpha$
\end{definition}
\begin{definition}[$I_{\alpha}$]
	Given a shape $\alpha$, define $I_{\alpha}$ to be the set of vertices in $V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$ which are isolated.
\end{definition}
We can now state the main theorem.
\begin{theorem}\label{simplifiedmaintheorem}
	Given the moment matrix $\Lambda = \sum_{\alpha}{\lambda_{\alpha}M_{\alpha}}$,
	for all $\varepsilon > 0$, if we take
	\begin{enumerate}
		\item $q = 3\left\lceil{{D_V}ln(n) + \frac{ln(\frac{1}{\varepsilon})}{3} + {D_V}ln(5) + 3{D^2_V}ln(2)}\right\rceil$
		\item $B_{vertex} = 6{D_V}\sqrt[4]{2eq}$
		\item $B_{norm}(\alpha) = {B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}$
		\item $B(\gamma) = B_{vertex}^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}|}n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}$
		\item $N(\gamma) = (3D_V)^{2|V(\gamma) \setminus V_{\gamma}| + |V(\gamma) \setminus U_{\gamma}|}$
		\item $c(\alpha) = 100(3D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + 2|E(\alpha)|}2^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}$
	\end{enumerate}
	and we have SOS-symmetric coefficient matrices $\{H'_{\gamma}: \gamma \in \Gamma\}$ such that the following conditions hold:
	\begin{enumerate}
		\item For all $U \in \mathcal{I}_{mid}$,  $H_{Id_{U}} \succeq 0$
		\item For all $U \in \mathcal{I}_{mid}$ and $\tau \in \mathcal{M}_U$,
		\[
		\left[ {\begin{array}{cc}
				\frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}} & B_{norm}(\tau)H_{\tau} \\
				B_{norm}(\tau)H^T_{\tau} & \frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}}
		\end{array}} \right] \succeq 0
		\]
		\item For all $U,V \in \mathcal{I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U,V}$,
		\[
		c(\gamma)^2{N(\gamma)}^2{B(\gamma)^2}H^{-\gamma,\gamma}_{Id_{V}} \preceq H'_{\gamma}
		\]
	\end{enumerate}
	then with probability at least $1 - \varepsilon$,
	\[
	\Lambda \succeq \frac{1}{2}\left(\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}}\right) - 3\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{Sym}
	\]
	If it is also true that whenever $\norm{M_{\alpha}} \leq B_{norm}(\alpha)$ for all $\alpha \in \mathcal{M}'$,
	\[
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{Sym}
	\]
	then with probability at least $1 - \varepsilon$, $\Lambda \succeq 0$.
\end{theorem}
\subsubsection{General Main Theorem}\label{generalmaintheoremstatementsection}
Before stating the general main theorem, we need to modify a few definitions for $\alpha$ and give a few definitions for $\Omega$
\begin{definition}[$S_{\alpha,min}$ and $S_{\alpha,max}$]
	Given a shape $\alpha \in \mathcal{M}'$, define $S_{\alpha,min}$ to be the leftmost minimum vertex separator of $\alpha$ if all edges with multiplicity at least $2$ are deleted and define $S_{\alpha,max}$ to be the leftmost minimum vertex separator of $\alpha$ if all edges with multiplicity at least $2$ are present.
\end{definition}
\begin{definition}[General $I_{\alpha}$]
	Given a shape $\alpha$, define $I_{\alpha}$ to be the set of vertices in $V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$ such that all edges incident with that vertex have multiplicity at least $2$.
\end{definition}
\begin{definition}[$B_{\Omega}$]
	We take $B_{\Omega}(j)$ to be a non-decreasing function such that for all $j \in \mathbb{N}$, $E_{\Omega}[x^{j}] \leq B_{\Omega}(j)^{j}$
\end{definition}
\begin{definition}[$h^{+}_j$]
	For all $j$, we define $h^{+}_j$ to be the polynomial $h_j$ where we make all of the coefficients have positive sign.
\end{definition}
\begin{lemma}
If $\Omega = N(0,1)$ then we can take $B_{\Omega}(j) = \sqrt{j}$ and we have that
	\[
	h^{+}_j(x) \leq \frac{1}{\sqrt{j!}}(x^2 + j)^{\frac{j}{2}} \leq \left(\frac{e}{j}(x^2 + j)\right)^{\frac{j}{2}}
	\]
\end{lemma}

For a proof, see \cite[Lemma 8.15]{AMP20}.

\begin{theorem}\label{generalmaintheorem}
	Given the moment matrix $\Lambda = \sum_{\alpha}{\lambda_{\alpha}M_{\alpha}}$,
	for all $\varepsilon > 0$, if we take
	\begin{enumerate}
		\item $q = \left\lceil{3{D_V}ln(n) + ln(\frac{1}{\varepsilon}) + {(3D_V)^k}ln(D_E + 1) + 3{D_V}ln(5)}\right\rceil$
		\item $B_{vertex} = 6q{D_V}$
		\item $B_{edge}(e) = 2h^{+}_{l_e}(B_{\Omega}(6{D_V}D_E))
		\max_{j \in [0,3{D_V}D_E]}{\left\{\left(h^{+}_{j}(B_{\Omega}(2qj))\right)^{\frac{l_e}{\max{\{j,l_e\}}}}\right\}}$

		As a special case, if $\Omega = N(0,1)$ then we can take $B_{edge}(e) = \left(400{D^2_V}{D^2_E}q\right)^{l_e}$
		\item $B_{norm}(\alpha) =
		2e{B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}}\left(\prod_{e \in E(\alpha)}{B_{edge}(e)}\right)n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha,min})}{2}}$
		\item $B(\gamma) = B_{vertex}^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}|}\left(\prod_{e \in E(\gamma)}{B_{edge}(e)}\right)n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}$
		\item $N(\gamma) = (3D_V)^{2|V(\gamma) \setminus V_{\gamma}| + |V(\gamma) \setminus U_{\gamma}|}$
		\item $c(\alpha) = 100(3{t_{max}}D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + k|E(\alpha)|}(2t_{max})^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}$
	\end{enumerate}
	and we have SOS-symmetric coefficient matrices $\{H'_{\gamma}: \gamma \in \Gamma\}$ such that the following conditions hold:
	\begin{enumerate}
		\item For all $U \in \mathcal{I}_{mid}$,  $H_{Id_{U}} \succeq 0$
		\item For all $U \in \mathcal{I}_{mid}$ and $\tau \in \mathcal{M}_U$,
		\[
		\left[ {\begin{array}{cc}
				\frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}} & B_{norm}(\tau)H_{\tau} \\
				B_{norm}(\tau)H^T_{\tau} & \frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}}
		\end{array}} \right] \succeq 0
		\]
		\item For all $U,V \in \mathcal{I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U,V}$,
		\[
		c(\gamma)^2{N(\gamma)}^2{B(\gamma)^2}H^{-\gamma,\gamma}_{Id_{V}} \preceq H'_{\gamma}
		\]
	\end{enumerate}
	then with probability at least $1 - \varepsilon$,
	\[
	\Lambda \succeq \frac{1}{2}\left(\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}}\right) - 3\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{Sym}
	\]
	If it is also true that whenever $\norm{M_{\alpha}} \leq B_{norm}(\alpha)$ for all $\alpha \in \mathcal{M}'$,
	\[
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq 6\left(\sum_{U \in {\mathcal I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{Sym}
	\]
	then with probability at least $1 - \varepsilon$, $\Lambda \succeq 0$.
\end{theorem}
The full proof of the main theorems can be found in \cite{potechin2020machinery}. In this work, we will  apply the machinery to prove our main SoS lower bounds.
\subsection{Choosing $H'_{\gamma}$ and Truncation Error}\label{sec: choosing_hgamma}
A canonical choice for $H'_{\gamma}$ is to take
\begin{enumerate}
	\item $H'_{\gamma}(\sigma,\sigma') = H_{Id_U}(\sigma, \sigma')$ whenever $|V(\sigma \circ \gamma)| \leq D_V$ and $|V(\sigma' \circ \gamma)| \leq D_V$.
	\item $H'_{\gamma}(\sigma,\sigma') = 0$ whenever $|V(\sigma \circ \gamma)| > D_V$ or $|V(\sigma' \circ \gamma)| > D_V$.
\end{enumerate}
With this choice, the truncation error is
\[
d_{Id_{U_{\gamma}}}(H_{Id_{U_{\gamma}}},H'_{\gamma}) = \sum_{\sigma,\sigma' \in \mathcal{L}_{U_{\gamma}}: |V(\sigma)| \leq D_V, |V(\sigma')| \leq D_V,
	\atop |V(\sigma \circ \gamma)| > D_V \text{ or } |V(\sigma' \circ \gamma)| > D_V}{B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_{U_{\gamma}}}(\sigma,\sigma')}
\]

\end{comment}
\subsection{Middle shape bounds}

\begin{lemma}\label{lem: spca_charging}
	Suppose $0 < A < \frac{1}{4}$ is a constant such that $\frac{\sqrt{\lambda}}{\sqrt{k}} \le d^{-A\varepsilon}$ and $\frac{1}{\sqrt{k}} \le d^{-2A}$. For all $m$ such that $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}, m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$, for all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, suppose $deg^{\tau}(i)$ is even for all $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, then
	\[\sqrt{d}^{|\tau|_1 - |U_{\tau}|_1}\sqrt{m}^{|\tau|_2 - |U_{\tau}|_2}S(\tau) \le \prod_{j \in V_2(\tau) \setminus U_{\tau} \setminus V_{\tau}} (deg^{\tau}(j) - 1)!!\cdot \frac{1}{d^{A\varepsilon\sum_{e \in E(\tau)} l_e}}\]
\end{lemma}

\begin{proof}
	Let $r_1 = |\tau|_1 - |U_{\tau}|_1, r_2 = |\tau|_2 - |U_{\tau}|_2$. Since $\Delta \le 1$, it suffices to prove
	\[E := \sqrt{d}^{r_1}\sqrt{m}^{r_2}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{\sum_{e \in E(\tau)} l_e} \le  \frac{1}{d^{A\varepsilon\sum_{e \in E(\tau)} l_e}}\]

	We will need the following claim.
	\begin{claim}
		$\sum_{e \in E(\tau)} l_e \ge 2 \max(r_1, r_2)$.
	\end{claim}

	\begin{proof}
		We will first prove $\sum_{e \in E(\tau)} l_e \ge 2 r_1$. For any vertex $i \in V_1(\tau) \setminus U_{\tau} \setminus V_{\tau}$, $deg^{\tau}(i)$ is even and is not $0$, hence, $deg^{\tau}(i) \ge 2$. Any vertex $i \in U_{\tau} \setminus V_{\tau}$ cannot have $deg^{\tau}(i) = 0$ otherwise $U_{\tau} \setminus\{i\}$ is a vertex separator of strictly smaller weight than $U_{\tau}$, which is not possible, hence, $deg^{\tau}(i) \ge 1$. Similarly, for $i \in  V_{\tau} \setminus U_{\tau}$, $deg^{\tau}(i) \ge 1$. Also, since $H_{\tau}$ is bipartite, we have $\sum_{i \in V_1(\tau)} deg^{\tau}(i) = \sum_{j \in V_2(\tau)} deg^{\tau}(j)= \sum_{e \in E(\tau)} l_e$. Consider

		\begin{align*}
		\sum_{e \in E(\tau)} l_e &= \sum_{i \in V_1(\tau)} deg^{\tau}(i)\\
		&\ge \sum_{i \in V_1(\tau) \setminus U_{\tau} \setminus V_{\tau}} deg^{\tau}(i) + \sum_{i \in (U_{\tau})_1 \setminus V_{\tau}} deg^{\tau}(i) + \sum_{i \in (V_{\tau})_1 \setminus U_{\tau}} deg^{\tau}(i)\\
		&\ge 2|V_1(\tau) \setminus U_{\tau} \setminus V_{\tau}| + |(U_{\tau})_1 \setminus V_{\tau}| + |(V_{\tau})_1 \setminus U_{\tau}|\\
		&= 2r_1
		\end{align*}
		We can similarly prove $\sum_{e \in E(\tau)} l_e \ge 2 r_2$
	\end{proof}

	To illustrate the main idea, we will start by proving the weaker bound $E \le 1$. Observe that our assumptions imply $m \le \frac{d}{\lambda^2}, m \le \frac{k^2}{\lambda^2}$ and also, using the fact $\frac{\sqrt{\lambda}}{\sqrt{k}} \le d^{-A\varepsilon} \le 1$, we have $E \le \sqrt{d}^{r_1}\sqrt{m}^{r_2}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2\max(r_1, r_2)}$.

	\begin{claim}\label{claim: spca_decay}
		For integers $r_1, r_2 \ge 0$, if $m \le \frac{d}{\lambda^2}$ and $m \le \frac{k^2}{\lambda^2}$, then,
		\[\sqrt{d}^{r_1}\sqrt{m}^{r_2}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2\max(r_1, r_2)} \le  1\]
	\end{claim}

	\begin{proof}
	We will consider the cases $r_1 \ge r_2$ and $r_1 < r_2$ separately. If $r_1 \ge r_2$, we have
	\begin{align*}
		\sqrt{d}^{r_1}\sqrt{m}^{r_2}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_1} &\le \sqrt{d}^{r_1}\left(\frac{\sqrt{d}}{\lambda}\right)^{r_2}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_1}\\
		&= \left(\frac{\lambda}{\sqrt{d}}\right)^{r_1 - r_2}\\
		&\le \left(\frac{1}{\sqrt{m}}\right)^{r_1 - r_2}\\
		&\le 1
	\end{align*}
	And if $r_1 < r_2$, we have
	\begin{align*}
	\sqrt{d}^{r_1}\sqrt{m}^{r_2}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_2} &= \sqrt{d}^{r_1}\sqrt{m}^{r_2 - r_1}\sqrt{m}^{r_1}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_2}\\
	&\le \sqrt{d}^{r_1}\left(\frac{k}{\lambda}\right)^{r_2 - r_1}\left(\frac{\sqrt{d}}{\lambda}\right)^{r_1}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_2}\\
		&= 1
	\end{align*}
	\end{proof}

	For the desired bounds, we mimic this argument while carefully keeping track of factors of $d^{\varepsilon}$.

	\begin{claim}\label{claim: spca_decay2}
		For integers $r_1, r_2 \ge 0$ and an integer $r \ge 2\max(r_1, r_2)$, if $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}$ and $m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$, then,
		\[\sqrt{d}^{r_1}\sqrt{m}^{r_2}\left(\frac{k}{d}\right)^{r_1} \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^r \le  \left(\frac{1}{d^{A\varepsilon}}\right)^r\]
	\end{claim}
	\begin{proof}
	If $r_1 \ge r_2$,
	\begin{align*}
		E &= \sqrt{d}^{r_1}\sqrt{m}^{r_2} \left(\frac{k}{d}\right)^{r_1}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_1}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{r - 2r_1}\\
		&\le \sqrt{d}^{r_1}\left(\frac{\sqrt{d}^{1 - \varepsilon}}{\lambda}\right)^{r_2} \left(\frac{k}{d}\right)^{r_1}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_1}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{r - 2r_1}\\
		& = \left(\frac{\lambda}{\sqrt{d}^{1 - \varepsilon}}\right)^{r_1 - r_2} \left(\frac{1}{\sqrt{d}}\right)^{\varepsilon r_1}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{r - 2r_1}\\
		& \le \left(\frac{1}{\sqrt{m}}\right)^{r_1 - r_2} \left(\frac{1}{\sqrt{d}}\right)^{\varepsilon r_1}\left(\frac{1}{d^{A\varepsilon}}\right)^{r - 2r_1}\\
		&\le \left(\frac{1}{d^{2A}}\right)^{\varepsilon r_1}\left(\frac{1}{d^{A\varepsilon}}\right)^{r - 2r_1}\\
		&= \left(\frac{1}{d^{A\varepsilon}}\right)^r
	\end{align*}
	And if $r_1 < r_2$,
	\begin{align*}
	E &= \sqrt{d}^{r_1}\sqrt{m}^{r_2 - r_1} \sqrt{m}^{r_1} \left(\frac{k}{d}\right)^{r_1}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_2}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{r - 2r_2}\\
	&\le \sqrt{d}^{r_1}\left(\frac{\sqrt{k}^{2 - \varepsilon}}{\lambda}\right)^{r_2 - r_1}\left(\frac{\sqrt{d}^{1 - \varepsilon}}{\lambda}\right)^{r_1} \left(\frac{k}{d}\right)^{r_1}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{2r_2}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{r - 2r_2}\\
	&= \left(\frac{\sqrt{k}}{\sqrt{d}}\right)^{\varepsilon r_1}\left(\frac{1}{\sqrt{k}}\right)^{\varepsilon r_2}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{r - 2r_2}\\
	&\le \left(\frac{1}{\sqrt{k}}\right)^{\varepsilon r_2}\left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^{r - 2r_2}\\
	&\le \left(\frac{1}{d^{2A}}\right)^{\varepsilon r_2}\left(\frac{1}{d^{A\varepsilon}}\right)^{r - 2r_2}\\
	&= \left(\frac{1}{d^{A\varepsilon}}\right)^r
	\end{align*}
\end{proof}
The result follows by setting $r = \sum_{e \in E(\tau)} l_e$ in the above claim.
\end{proof}

\begin{corollary}\label{cor: spca_norm_decay}
	For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, we have
	\[c(\tau) B_{norm}(\tau)S(\tau)R(\tau) \le 1\]
\end{corollary}

\begin{proof}
	First, note that if $deg^{\tau}(i)$ is odd for any vertex $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, then $S(\tau) = 0$ and the inequality is true. So, assume that $deg^{\tau}(i)$ is even for all $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$.
	Since $\tau$ is a proper middle shape, we have $w(I_{\tau}) = 0$ and $w(S_{\tau, min}) = w(U_{\tau})$. This implies
	$n^{\frac{w(V(\tau)) + w(I_{\tau}) - w(S_{\tau, min})}{2}} = \sqrt{d}^{|\tau|_1 - |U_{\tau}|_1}\sqrt{m}^{|\tau|_2 - |U_{\tau}|_2}$.
	As was observed in the proof of \cref{lem: spca_charging}, every vertex $i \in V(\tau) \setminus U_{\tau}$ or $i \in V(\tau) \setminus V_{\tau}$ has $deg^{\tau}(i) \ge 1$ and hence, $|V(\tau)\setminus U_{\tau}| + |V(\tau)\setminus V_{\tau}| \le 4 \sum_{e \in E(\tau)} l_e$. Also, $q = d^{O(1)\cdot \varepsilon(C_V + C_E)}$. We can set $C_V, C_E$ sufficiently small so that
    {\footnotesize
	\begin{align*}
	c(\tau)B_{norm}(\tau)S(\tau)R(\tau)	&=100(6D_V)^{|U_{\tau}\setminus V_{\tau}| + |V_{\tau}\setminus U_{\tau}| + 2|E(\tau)|}4^{|V(\tau)\setminus (U_{\tau}\cup V_{\tau})|}\\
	&\cdot 2e(6qD_V)^{|V(\tau)\setminus U_{\tau}| + |V(\tau)\setminus V_{\tau}|}\prod_{e \in E(\tau)} (400D_V^2D_E^2q)^{l_e}\\
	&\cdot \sqrt{d}^{|\tau|_1 - |U_{\tau}|_1}\sqrt{m}^{|\tau|_2 - |U_{\tau}|_2} S(\tau) (C_{disc}\sqrt{D_E})^{\sum_{j \in (U_{\tau})_2 \cup (V_{\tau})_2} deg^{\tau}(j)}\\
	&\le d^{O(1) \cdot (C_V + C_E) \cdot \varepsilon\sum_{e \in E(\tau)} l_e}\cdot \prod_{j \in V_2(\tau) \setminus V_2(U_{\tau}) \setminus V_2(V_{\tau})} (deg^{\tau}(j) - 1)!!\cdot \frac{1}{d^{A\varepsilon\sum_{e \in E(\tau)} l_e}}\\
	&\le d^{O(1) \cdot (C_V + C_E) \cdot \varepsilon\sum_{e \in E(\tau)} l_e}\cdot (D_VD_E)^{\sum_{e \in E(\tau)} l_e}\cdot \frac{1}{d^{A\varepsilon\sum_{e \in E(\tau)} l_e}}\\
	&\le 1
	\end{align*}}
\end{proof}

We can now obtain our desired middle shape bounds.

\begin{lemma}\label{lem: spca_cond2}
    For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$,
    \[
    \begin{bmatrix}
        \frac{1}{|Aut(U)|c(\tau)}H_{Id_U} & B_{norm}(\tau) H_{\tau}\\
        B_{norm}(\tau) H_{\tau}^T & \frac{1}{|Aut(U)|c(\tau)}H_{Id_U}
    \end{bmatrix}
    \succeq 0
    \]
\end{lemma}

\begin{proof}
	We have
	\begin{align*}
		&\begin{bmatrix}
			\frac{1}{|Aut(U)|c(\tau)}H_{Id_U} & B_{norm}(\tau)H_{\tau}\\
			B_{norm}(\tau)H_{\tau}^T & \frac{1}{|Aut(U)|c(\tau)}H_{Id_U}
		\end{bmatrix}\\
		&\qquad= \begin{bmatrix}
			\left(\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)R(\tau)B_{norm}(\tau)}{|Aut(U)|}\right)H_{Id_U} & 0\\
			0 & \left(\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)R(\tau)B_{norm}(\tau)}{|Aut(U)|}\right)H_{Id_U}
		\end{bmatrix}\\
		&\qquad\quad+ B_{norm}(\tau)\begin{bmatrix}
			\frac{S(\tau)R(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
			H_{\tau}^T & \frac{S(\tau)R(\tau)}{|Aut(U)|}H_{Id_U}
		\end{bmatrix}
	\end{align*}
	By \cref{lem: spca_cond2_simplified}, $\begin{bmatrix}
		\frac{S(\tau)R(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
		H_{\tau}^T & \frac{S(\tau)R(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	\succeq 0$, so the second term above is positive semidefinite. For the first term, by \cref{lem: spca_cond1}, $H_{Id_U} \succeq 0$ and by \cref{cor: spca_norm_decay}, $\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)R(\tau)B_{norm}(\tau)}{|Aut(U)|} \ge 0$, which proves that the first term is also positive semidefinite.
\end{proof}

\subsection{Intersection term bounds}

\begin{lemma}\label{lem: spca_charging2}
	Suppose $0 < A < \frac{1}{4}$ is a constant such that $\frac{\sqrt{\lambda}}{\sqrt{k}} \le d^{-A\varepsilon}, \frac{1}{\sqrt{k}} \le d^{-2A}$ and $\frac{k}{d} \le d^{-A\varepsilon}$. For all $m$ such that $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}, m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$, for all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and for all $\gamma \in \Gamma_{U, V}$,
	\[n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2 \le \left(\prod_{j \in V_2(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}}(deg^{\gamma}(j)- 1)!!\right)^2\frac{1}{d^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}}\]
	for some constant $B > 0$ that depends only on $C_{\Delta}$. In particular, it is independent of $C_V$ and $C_E$.
\end{lemma}

\begin{proof}
	Suppose there is a vertex $i \in V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}$ such that $deg^{\gamma}(i)$ is odd, then $S(\gamma) = 0$ and the inequality is true. So, assume $deg^{\gamma}(i)$ is even for all vertices $i \in V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}$.
	We have $n^{w(V(\gamma) \setminus U_{\gamma})} = d^{|\gamma|_1 - |U_{\gamma}|_1}m^{|\gamma|_2 - |U_{\gamma}|_2}$. Plugging in $S(\gamma)$, we get that we have to prove
    {\footnotesize
	\begin{align*}
		E := d^{|\gamma|_1 - |U_{\gamma}|_1}m^{|\gamma|_2 - |U_{\gamma}|_2} \left(\frac{k}{d}\right)^{2|\gamma|_1 - |U_{\gamma}|_1 - |V_{\gamma}|_1}\Delta^{2|\gamma|_2 - |U_{\gamma}|_2 - |V_{\gamma}|_2} \prod_{e \in E(\gamma)} \frac{\lambda^{l_e}}{k^{l_e}} \le \frac{1}{d^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}}
	\end{align*}}

	Let $S'$ be the set of vertices $i \in U_{\gamma} \setminus V_{\gamma}$ that have $deg^{\gamma}(i) \ge 1$. Let $e, f$ be the number of type $1$ vertices and the number of type $2$ vertices in $S'$ respectively. Observe that $S' \cup (U_{\gamma} \cap V_{\gamma})$ is a vertex separator of $\gamma$.
	Let $g = |V_{\gamma} \setminus U_{\gamma}|_1$ (resp. $h = |V_{\gamma} \setminus U_{\gamma}|_2$) be the number of type $1$ vertices (resp. type $2$ vertices) in $V_{\gamma} \setminus U_{\gamma}$.
	We first claim that $d^em^f \ge d^gm^h$. To see this, note that the vertex separator $S' \cup (U_{\gamma} \cap V_{\gamma})$ has weight $\sqrt{d}^{e + |U_{\gamma} \cap V_{\gamma}|_1}\sqrt{m}^{f + |U_{\gamma} \cap V_{\gamma}|_2}$. On the other hand, $V_{\gamma}$ has weight $\sqrt{d}^{g + |U_{\gamma} \cap V_{\gamma}|_1}\sqrt{m}^{h + |U_{\gamma} \cap V_{\gamma}|_2}$. Since $\gamma$ is a left shape, $V_{\gamma}$ is the unique minimum vertex separator and hence, we have the inequality $\sqrt{d}^{e + |U_{\gamma} \cap V_{\gamma}|_1}\sqrt{m}^{f + |U_{\gamma} \cap V_{\gamma}|_2} \ge \sqrt{d}^{g + |U_{\gamma} \cap V_{\gamma}|_1}\sqrt{m}^{h + |U_{\gamma} \cap V_{\gamma}|_2}$ which implies $d^em^f \ge d^gm^h$.
	Let $p = |V(\gamma) \setminus (U_{\gamma} \cup V_{\gamma})|_1$ (resp. $q = |V(\gamma) \setminus (U_{\gamma} \cup V_{\gamma})|_2$) be the number of type $1$ vertices (resp. type $2$ vertices) in $V(\gamma) \setminus (U_{\gamma} \cup V_{\gamma})$.
	To illustrate the main idea, we will first prove the weaker inequality $E \le 1$. Since $\Delta \le 1$, it suffices to prove
	\begin{align*}
		d^{|\gamma|_1 - |U_{\gamma}|_1}m^{|\gamma|_2 - |U_{\gamma}|_2} \left(\frac{k}{d}\right)^{2|\gamma|_1 - |U_{\gamma}|_1 - |V_{\gamma}|_1} \prod_{e \in E(\gamma)} \frac{\lambda^{l_e}}{k^{l_e}} \le 1
	\end{align*}
	We have
	$d^{|\gamma|_1 - |U_{\gamma}|_1}m^{|\gamma|_2 - |U_{\gamma}|_2} = d^{p + g}m^{q + h} \le d^{p + \frac{e + g}{2}}m^{q + \frac{f + h}{2}}$
	since $d^em^f \ge d^gm^h$. Also, $2|\gamma|_1 - |U_{\gamma}|_1 - |V_{\gamma}|_1 = 2p + e + g$. So, it suffices to prove
	\begin{align*}
		d^{p + \frac{e + g}{2}}m^{q + \frac{f + h}{2}}\left(\frac{k}{d}\right)^{2p + e + g} \prod_{e \in E(\gamma)} \left(\frac{\lambda}{k}\right)^{l_e} \le 1
	\end{align*}

	We will need the following claim.
	\begin{claim}
		$\sum_{e \in E(\gamma)} l_e \ge \max(2p + e + g, 2q + f + h)$
	\end{claim}
	\begin{proof}
		Since $H_{\gamma}$ is bipartite, we have $\sum_{e \in E(\gamma)}l_e = \sum_{i \in V_1(\gamma)} deg^{\gamma}(i) = \sum_{i \in V_2(\gamma)} deg^{\gamma}(i)$. Observe that all vertices $i \in V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}$ have $deg^{\gamma}(i)$ nonzero and even, and hence, $deg^{\gamma}(i) \ge 2$. Then,
	\begin{align*}
		\sum_{e \in E(\gamma)}l_e &= \sum_{i \in V_1(\gamma)} deg^{\gamma}(i)\\
		&\ge \sum_{i \in V_1(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}} deg^{\gamma}(i) + \sum_{i \in (U_{\gamma})_1 \setminus V_{\gamma}} deg^{\gamma}(i) + \sum_{i \in (V_{\gamma})_1 \setminus U_{\gamma}} deg^{\gamma}(i)\\
		&\ge 2p + e + g
	\end{align*}
	Similarly,
	\begin{align*}
	\sum_{e \in E(\gamma)}l_e &= \sum_{i \in V_2(\gamma)} deg^{\gamma}(i)\\
	&\ge \sum_{i \in V_2(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}} deg^{\gamma}(i) + \sum_{i \in (U_{\gamma})_2 \setminus V_{\gamma}} deg^{\gamma}(i) + \sum_{i \in (V_{\gamma})_2 \setminus U_{\gamma}} deg^{\gamma}(i)\\
	&\ge 2q + f + h
\end{align*}
Therefore, $\sum_{e \in E(\gamma)} l_e \ge \max(2p + e + g, 2q + f + h)$.
\end{proof}

Now, let $r_1 = p + \frac{e + g}{2}, r_2 = q + \frac{f + h}{2}$. Then, $\sum_{e \in E(\gamma)} l_e \ge 2\max(r_1, r_2)$ and we wish to prove
	$d^{r_1}m^{r_2} \left(\frac{k}{d}\right)^{2r_1} \left(\frac{\lambda}{k}\right)^{2\max(r_1, r_2)} \le 1$
This expression simply follows by squaring \cref{claim: spca_decay}.

Now, to prove that $E \le \frac{1}{d^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}}$, we mimic this argument while carefully keeping track of factors of $d^{\varepsilon}$. Again, using $d^em^f \ge d^gm^h$, it suffices to prove that
\begin{align*}
	d^{p + \frac{e + g}{2}}m^{q + \frac{f + h}{2}} \left(\frac{k}{d}\right)^{2|\gamma|_1 - |U_{\gamma}|_1 - |V_{\gamma}|_1}&\Delta^{2|\gamma|_2 - |U_{\gamma}|_2 - |V_{\gamma}|_2} \prod_{e \in E(\gamma)} \frac{\lambda^{l_e}}{k^{l_e}}\\
    &\le \frac{1}{d^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}}
\end{align*}

The idea is that the $d^{B\varepsilon}$ decay for the edges is obtained from the stronger assumption on $m$, namely $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}, m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$. The $d^{B\varepsilon}$ decay for the type $1$ vertices of $V(\gamma) \setminus(U_{\gamma} \cap V_{\gamma})$ is obtained both from the stronger assumption on $m$ and from the factors of $\frac{k}{d}$, the latter being especially useful for the degree $0$ vertices. Finally, the $d^{B\varepsilon}$ decay for the type $2$ vertices of $V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})$ is obtained from the factors of $\Delta$.
Indeed, note that for a constant $B$ that depends on $C_{\Delta}$, $\Delta^{2|\gamma|_2 - |U_{\gamma}|_2 - |V_{\gamma}|_2} \le d^{-B\varepsilon|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})|_2}$. So, we would be done if we prove
\begin{align*}
	d^{p + \frac{e + g}{2}}m^{q + \frac{f + h}{2}} \left(\frac{k}{d}\right)^{2|\gamma|_1 - |U_{\gamma}|_1 - |V_{\gamma}|_1}\left(\frac{\lambda}{k}\right)^{\sum_{e \in E(\gamma)} l_e} \le \frac{1}{d^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})|_1 + \sum_{e \in E(\gamma)} l_e)}}
\end{align*}

Let $c_0$ be the number of type $1$ vertices $i$ in $V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})$ such that $deg^{\gamma}(i) = 0$. Since they have degree $0$, they must be in $(U_{\gamma})_1 \setminus V_{\gamma}$. Also, we have $2|\gamma|_1 - |U_{\gamma}|_1 - |V_{\gamma}|_1 = 2p + e + g + c_0$ and hence, $\left(\frac{k}{d}\right)^{2|\gamma|_1 - |U_{\gamma}|_1 - |V_{\gamma}|_1} = \left(\frac{k}{d}\right)^{2p + e + g + c_0}$. For these degree $0$ vertices, we have that the factors of $\frac{k}{d} \le d^{-A\varepsilon}$ offer a decay of $\frac{1}{d^{B\varepsilon}}$. Therefore, it suffices to prove
\begin{align*}
	d^{p + \frac{e + g}{2}}m^{q + \frac{f + h}{2}} \left(\frac{k}{d}\right)^{2p + e + g}\left(\frac{\lambda}{k}\right)^{\sum_{e \in E(\gamma)} l_e} \le \frac{1}{d^{B\varepsilon (p + q + e + f + g + h + \sum_{e \in E(\gamma)} l_e)}}
\end{align*}
for a constant $B > 0$. Observe that $p + q + e + f + g + h \le 2(\sum_{e \in E(\gamma)} l_e)$. Therefore, using the notation $r_1 = p + \frac{e + g}{2}, r_2 = q + \frac{f + h}{2}$, it suffices to prove
\begin{align*}
	d^{r_1}m^{r_2} \left(\frac{k}{d}\right)^{2r_1}\left(\frac{\lambda}{k}\right)^{\sum_{e \in E(\gamma)} l_e} \le \frac{1}{d^{B\varepsilon \sum_{e \in E(\gamma)} l_e}}
\end{align*}
for a constant $B > 0$. But this follows by squaring \cref{claim: spca_decay2} where we set $r = \sum_{e \in E(\gamma)} l_e$.
\end{proof}

\begin{remk}
	In the above bounds, note that there is a decay of $d^{B\varepsilon}$ for each vertex in $V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})$.	One of the main technical reasons for introducing the slack parameter $C_{\Delta}$ in the planted distribution was to introduce this decay, which is needed in the current machinery.
\end{remk}

With this, we obtain intersection term bounds.

\begin{lemma}\label{lem: spca_cond3}
    For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U, V}$, \[c(\gamma)^2N(\gamma)^2B(\gamma)^2H_{Id_V}^{-\gamma, \gamma} \preceq H_{\gamma}'\]
\end{lemma}

\begin{proof}
	By \cref{lem: spca_cond3_simplified}, we have
	\begin{align*}
		c(\gamma)^2N(\gamma)^2B(\gamma)^2H_{Id_V}^{-\gamma, \gamma} &\preceq c(\gamma)^2N(\gamma)^2B(\gamma)^2 S(\gamma)^2R(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} H'_{\gamma}
	\end{align*}
	Using the same proof as in \cref{lem: spca_cond1}, we can see that $H'_{\gamma} \succeq 0$. Therefore, it suffices to prove that $c(\gamma)^2N(\gamma)^2B(\gamma)^2 S(\gamma)^2R(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} \le 1$.
	Since $U, V \in {\mathcal I}_{mid}$, $|Aut(U)| = |U|_1!|U|_2!, |Aut(V)| = |V|_1!|V|_2!$. Therefore, $\frac{|Aut(U)|}{|Aut(V)|} = \frac{|U|_1!|U|_2!}{|V|_1!|V|_2!} \le D_V^{|U_{\gamma} \setminus V_{\gamma}|}$. Also, $|E(\gamma)| \le \sum_{e \in E(\gamma)} l_e$ and $q = d^{O(1) \cdot \varepsilon (C_V + C_E)}$. Note $R(\gamma)^2 = (C_{disc}\sqrt{D_E})^{2\sum_{j \in (U_{\gamma})_2 \cup (V_{\gamma})_2} deg^{\gamma}(j)} \le d^{O(1)\cdot \varepsilon C_E \cdot \sum_{e \in E(\gamma)} l_e}$ and \[\left(\prod_{j \in V_2(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}}(deg^{\gamma}(j)- 1)!!\right)^2 \le (D_VD_E)^{2\sum_{e \in E(\gamma)} l_e} \le d^{O(1)\cdot \varepsilon (C_V + C_E) \cdot \sum_{e \in E(\gamma)} l_e}\]

	Let $B$ be the constant from \cref{lem: spca_charging2}. We can set $C_V, C_E$ sufficiently small so that, using \cref{lem: spca_charging2},
	\begin{align*}
		c(\gamma)^2&N(\gamma)^2B(\gamma)^2S(\gamma)^2R(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} \\
		&\le 100^2 (6D_V)^{2|U_{\gamma}\setminus V_{\gamma}| + 2|V_{\gamma}\setminus U_{\gamma}| + |E(\gamma)|}16^{|V(\gamma) \setminus (U_{\gamma} \cup V_{\gamma})|}\\
		&\quad\cdot (3D_V)^{4|V(\gamma)\setminus V_{\gamma}| + 2|V(\gamma)\setminus U_{\gamma}|} (6qD_V)^{2|V(\gamma)\setminus U_{\gamma}| + 2|V(\gamma)\setminus V_{\gamma}|} \prod_{e \in E(\gamma)} (400D_V^2D_E^2q)^{2l_e}\\
		&\quad\cdot  n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2 d^{O(1)\cdot \varepsilon C_E \cdot \sum_{e \in E(\gamma)} l_e}\cdot D_V^{|U_\gamma \setminus V_{\gamma}|} \\
		&\le d^{O(1) \cdot \varepsilon(C_V + C_E) \cdot (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)} \cdot n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2\\
		&\le d^{O(1) \cdot \varepsilon(C_V + C_E) \cdot (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}\cdot \frac{1}{d^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + \sum_{e \in E(\gamma)} l_e)}}\\
		&\le 1
	\end{align*}
\end{proof}

\subsection{Truncation error bounds}

In this section, we will obtain truncation error bounds using the strategy sketched in \cite[Section 10]{potechin2020machinery}. We also reuse the notation. To do this, we need to first obtain a bound on the quantity $B_{norm}(\sigma) B_{norm}(\sigma') H_{Id_U}(\sigma, \sigma')$.

\begin{lemma}\label{lem: spca_charging3}
	Suppose $0 < A < \frac{1}{4}$ is a constant such that $\frac{\sqrt{\lambda}}{\sqrt{k}} \le d^{-A\varepsilon}$ and $\frac{1}{\sqrt{k}} \le d^{-2A}$. Suppose $m$ is such that $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}, m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$. For all $U \in {\mathcal I}_{mid}$ and $\sigma, \sigma' \in {\mathcal L}_U$,
	\[B_{norm}(\sigma) B_{norm}(\sigma') H_{Id_U}(\sigma, \sigma') \le \frac{1}{d^{0.5A\varepsilon(|V(\sigma \circ \sigma')| + \sum_{e \in E(\alpha)} l_e)}} \cdot \frac{1}{d^{|U_{\sigma}|_1 + |U_{\sigma'}|_1}m^{|U_{\sigma}|_2 + |U_{\sigma'}|_2}}\]
\end{lemma}

\begin{proof}
	Suppose there is a vertex $i \in V(\sigma) \setminus V_{\sigma}$ such that $deg^{\sigma}(i) + deg^{U_{\sigma}}(i)$ is odd, then $H_{Id_U}(\sigma, \sigma') = 0$ and the inequality is true. So, assume that $deg^{\sigma}(i) + deg^{U_{\sigma}}(i)$ is even for all $i \in V(\sigma) \setminus V_{\sigma}$. Similarly, assume that $deg^{\sigma'}(i) + deg^{U_{\sigma'}}(i)$ is even for all $i \in V(\sigma') \setminus V_{\sigma'}$. Also, if $\rho_{\sigma} \neq \rho_{\sigma'}$, we will have $H_{Id_U}(\sigma, \sigma') = 0$ and we would be done. So, assume $\rho_{\sigma} = \rho_{\sigma'}$.

	Let there be $e$ (resp. $f$) vertices of type $1$ (resp. type $2$) in $V(\sigma) \setminus U_{\sigma} \setminus V_{\sigma}$. Then, $n^{\frac{w(V(\sigma)) - w(U)}{2}} = \sqrt{d}^{|V(\sigma)|_1 - |U|_1}\sqrt{m}^{|V(\sigma)|_2 - |U|_2} = \sqrt{d}^{|U_{\sigma}|_1}\sqrt{m}^{|U_{\sigma}|_2} \sqrt{d}^e\sqrt{m}^f$. Let there be $g$ (resp. $h$) vertices of type $1$ (resp. type $2$) in $V(\sigma') \setminus U_{\sigma'} \setminus V_{\sigma'}$. Then, similarly, $n^{\frac{w(V(\sigma')) - w(U)}{2}} \le \sqrt{d}^{|U_{\sigma'}|_1}\sqrt{m}^{|U_{\sigma'}|_2}\sqrt{d}^g\sqrt{m}^h$.

	Let $\alpha = \sigma \circ \sigma'$. Since all vertices in $V(\alpha) \setminus U_{\alpha} \setminus V_{\alpha}$ have degree at least $2$, we have $\sum_{e \in E(\alpha)} l_e \ge \sum_{i \in V_1(\alpha) \setminus U_{\alpha} \setminus V_{\alpha}} deg^{\alpha}(i) \ge 2(e + g) + |U_{\sigma}|_1 + |U_{\sigma}|_2$. Similarly, $\sum_{e \in E(\alpha)} l_e \ge 2(f + h) + |U_{\sigma'}|_1 + |U_{\sigma'}|_2$. Therefore, by setting $r_1 = e + g, r_2 = f + h$ in \cref{claim: spca_decay2}, we have
	\[\sqrt{d}^{e + g}\sqrt{m}^{f + h} \left(\frac{k}{d}\right)^{e + g}\prod_{e \in E(\alpha)} \frac{\sqrt{\lambda}^{l_e}}{\sqrt{k}^{l_e}} \le \frac{1}{d^{A\varepsilon \sum_{e \in E(\alpha)} l_e}}\]
	Also, \[\left(\frac{k}{d}\right)^{|\alpha|_1} \le \left(\frac{k}{d}\right)^{e + g + |U_{\sigma}|_1 + |U_{\sigma'}|_1}\]
    and \[\prod_{j \in V_2(\alpha)} (deg^{\alpha}(j) - 1)!! \le d^{\varepsilon C_V \sum_{e \in E(\alpha)} l_e}\]
    Therefore,
	\begin{align*}
		&n^{\frac{w(V(\sigma)) - w(U)}{2}}n^{\frac{w(V(\sigma')) - w(U)}{2}}H_{Id_U}(\sigma, \sigma')\\
		&\le d^{O(1)D_{sos}}\sqrt{d}^e\sqrt{m}^f d^{O(1)D_{sos}}\sqrt{d}^g\sqrt{m}^h\\
        &\qquad \cdot\frac{1}{|Aut(U)|}\left(\frac{1}{\sqrt{k}}\right)^{deg(\alpha)}\left(\frac{k}{d}\right)^{|\alpha|_1}\Delta^{|\alpha|_2} \prod_{j \in V_2(\alpha)} (deg^{\alpha}(j) - 1)!!\prod_{e \in E(\alpha)} \frac{\sqrt{\lambda}^{l_e}}{\sqrt{k}^{l_e}}\\
		&\le d^{O(1)D_{sos}} d^{\varepsilon C_V \sum_{e \in E(\alpha)} l_e} \sqrt{d}^{e + g}\sqrt{m}^{f + h} \left(\frac{k}{d}\right)^{e + g}\prod_{e \in E(\alpha)} \frac{\sqrt{\lambda}^{l_e}}{\sqrt{k}^{l_e}} \cdot \frac{1}{d^{|U_{\sigma}|_1 + |U_{\sigma'}|_1}m^{|U_{\sigma}|_2 + |U_{\sigma'}|_2}}\\
		&\le \frac{d^{\varepsilon C_V \sum_{e \in E(\alpha)} l_e}}{d^{A\varepsilon \sum_{e \in E(\alpha)} l_e}} \cdot \frac{1}{d^{|U_{\sigma}|_1 + |U_{\sigma'}|_1}m^{|U_{\sigma}|_2 + |U_{\sigma'}|_2}}
	\end{align*}
	By setting $C_V, C_E$ sufficiently small and plugging in the expressions for $B_{norm}(\sigma), B_{norm}(\sigma')$, we obtain the result.
\end{proof}

We can now apply the strategy.

\begin{restatable}{lemma}{SPCAfive}\label{lem: spca_cond5}
	Whenever $\norm{M_{\alpha}} \le B_{norm}(\alpha)$ for all $\alpha \in {\mathcal M}'$,
	\[
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq \frac{1}{d^{K_1D_{sos}^2}} Id_{sym}
	\]
	for a constant $K_1 > 0$ that can depend on $C_{\Delta}$.
\end{restatable}

\begin{proof}
    For $V \in {\mathcal I}_{mid}$, $\lambda_V = \frac{\Delta^{|V|_2}}{d^{|V|_1}k^{|V|_2}}$. Let the minimum value of this quantity over all $V$ be $N$. We then choose $w_V = N / \lambda_V$ so that for all left shapes $\sigma \in {\mathcal L}_V$, \cref{lem: spca_charging3} implies $w_{V} \leq \frac{w_{U_{\sigma}}\lambda_{U_{\sigma}}}{|\mathcal{I}_{mid}|B_{norm}(\sigma)^2{c(\sigma)^2}{H_{Id_V}(\sigma,\sigma)}}$, completing the proof.
\end{proof}

\begin{restatable}{lemma}{SPCAsix}\label{lem: spca_cond6}
	\[\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} \le \frac{d^{K_2 D_{sos}}}{2^{D_V}}\]
	for a constant $K_2 > 0$ that can depend on $C_{\Delta}$.
\end{restatable}

\begin{proof}
    We do the same calculations as in the proof of \cref{lem: plds_cond6}, until
	\begin{align*}
		\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} &\frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)}\\
        &\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}} {B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_{U}}(\sigma,\sigma')\frac{1}{2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}}\\
		&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{d^{O(1) D_{sos}}}{d^{0.5A\varepsilon|V(\sigma \circ \sigma')|}2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}
	\end{align*}
	where we used \cref{lem: spca_charging3}. Using $d^{0.5A\varepsilon |V(\sigma \circ \sigma')|} \ge d^{0.1A\varepsilon |V(\sigma \circ \sigma')|}2^{|V(\sigma \circ \sigma')|}$,
	\begin{align*}
		\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} &\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{d^{O(1) D_{sos}}}{d^{0.1A\varepsilon|V(\sigma \circ \sigma')|}  2^{|V(\sigma \circ \sigma')|}2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}\\
		&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{d^{O(1) D_{sos}}}{d^{0.1A\varepsilon|V(\sigma \circ \sigma')|} 2^{D_V}}\\
		&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{d^{O(1) D_{sos}}}{D_{sos}^{D_{sos}}d^{0.1A\varepsilon|V(\sigma \circ \sigma')|} 2^{D_V}}
	\end{align*}
	The final step will be to argue that $\sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{1}{D_{sos}^{D_{sos}}d^{0.1 A\varepsilon|V(\sigma \circ \sigma')|}} \le 1$ which will complete the proof. But this will follow if we set $C_V, C_E$ small enough.
\end{proof}

We can finally show that truncation errors can be handled.

\begin{restatable}{lemma}{SPCAfour}\label{lem: spca_cond4}
    Whenever $\norm{M_{\alpha}} \le B_{norm}(\alpha)$ for all $\alpha \in \mathcal{M}'$,
    \[
    \sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
    \]
\end{restatable}

\begin{proof}
	Choose $C_{sos}$ sufficiently small so that $\frac{1}{d^{K_1D_{sos}^2}} \ge 6\frac{d^{K_2D_{sos}}}{2^{D_V}}$ which can be satisfied by setting $C_{sos} < K_3 C_V$ for a sufficiently small constant $K_3 > 0$. Then, since $Id_{sym} \succeq 0$, using \cref{lem: spca_cond5} and \cref{lem: spca_cond6},
	\begin{align*}
		\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} &\succeq \frac{1}{d^{K_1D_{sos}^2}} Id_{sym}\\
		&\succeq 6\frac{d^{K_2D_{sos}}}{2^{D_V}} Id_{sym}\\
		&\succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
	\end{align*}
\end{proof}
\subsection{Fourier analysis for matrix-valued functions: ribbons, shapes, and graph matrices}
For our machinery, we need the definitions of ribbons, shapes, and graph matrices from \cite{AMP20}.


\subsubsection{Ribbons}
\emph{Ribbons} lift the usual Fourier basis for functions $\{ f \, : \, \{ \pm 1\}^{n \choose 2} \rightarrow {\mathbb R} \}$ to  matrix-valued functions.





\begin{definition}[Simplified ribbons -- see \cref{def: ribbons}]
	Let $n \in {\mathbb{N}}$.
	A ribbon $R$ is a tuple $(E_R,A_R,B_R)$ where $E_R \subseteq {[n] \choose 2}$ and $A_R,B_R$ are tuples of elements in $[n]$.
	$R$ thus specifies:
	\begin{enumerate}
		\item A Fourier character $\chi_{E_R}$.
		\item Row and column indices $A_R$ and $B_R$.
	\end{enumerate}
	We think of $R$ as a graph with vertices
	\[
	V(R) = \{ \text{ endpoints of $(i,j) \in E_R$ } \} \cup A_R \cup B_R
	\]
	and edges $E(R) = E_R$, where $A_R, B_R$ are distinguished tuples of vertices.
\end{definition}

\begin{definition}[Matrix-valued function for a ribbon $R$]
	Given a ribbon $R$, we define the matrix valued function $M_R \, : \, \{ \pm 1\}^{n \choose 2} \rightarrow {\mathbb R}^{\frac{n!}{(n - |A_R|)!} \times \frac{n!}{(n - |B_R|)!}}$ to have entries $M_R(A_R,B_R) = \chi_{E_R}$ and $M_R(A',B') = 0$ whenever $A' \neq A_R$ or $B' \neq B_R$.


\end{definition}

The following proposition captures the main property of the matrix-valued functions $M_R$ -- they are an orthonormal basis. We leave the proof to the reader.
\begin{proposition}
	The matrix-valued functions $M_R$ form an orthonormal basis for the vector space of matrix valued functions with respect to the inner product 
	\[
	\iprod{M,M'} = {\mathbb E}_{G \sim \{ \pm 1\}^{n \choose 2} }\left[\Tr \left(M(G) (M'(G))^\top\right)\right].
	\]
\end{proposition}

We don't directly utilize this proposition in our work, but it gives insight into the structure of the matrix-valued functions we define and motivates the definition of graph matrices, which we use extensively.


\begin{example}
    In \cref{fig: ribbon_shape}, consider the ribbon $R$ as shown. We have $A_R = (1, 3), B_R = (4), V(R) = \{1, 2, 3, 4\}, E_R = \{\{1, 2\}, \{3, 2\}, \{2, 4\}\}$. The Fourier character is $\chi_{E_R} = \chi_{1, 2}\chi_{3, 2}\chi_{2, 4}$. And finally, $M_R$ is a matrix with rows and columns indexed by tuples of length $|A_R| = 2$ and $|B_R| = 1$ respectively, with exactly one nonzero entry $M_R((1, 3), (4)) = \chi_{E_R}$. Succinctly, \[M_R =
  \begin{blockarray}{rl@{}c@{}r}
    & & \makebox[0pt]{column $(4)$} \\[-0.5ex]
    & & \,\downarrow \\[-0.5ex]
    \begin{block}{r(l@{}c@{}r)}
    & \makebox[3.1em]{\Large $0$\bigstrut[t]} & \vdots &\makebox[4.2em]{\Large $0$} \\[-0.2ex]
    \text{row }(1, 3) \rightarrow \mkern-9mu & \raisebox{0.5ex}{\makebox[3.2em][l]{\dotfill}} & \chi_{1, 2}\chi_{3, 2}\chi_{2, 4} & \raisebox{0.5ex}{\makebox[4.2em][r]{\dotfill}} \\[+0ex]
    & \makebox[3.1em]{\Large $0$} & \vdots &\makebox[4.2em]{\bigstrut\Large $0$} \\
    \end{block}
  \end{blockarray}\]
\end{example}

\begin{figure}[!h]
    \centering
    \includegraphics[scale=.6, trim={0 5cm 2 5cm},clip]{machinery/images/ribbon_shape}
    \caption{Example of a ribbon and a shape}
    \label{fig: ribbon_shape}
\end{figure}


\subsubsection{Shapes and Graph Matrices}
As described above, \emph{ribbons} are an orthonormal basis for matrix-valued functions. However, we will need an orthogonal basis for the subset of those functions which are symmetric with respect to the action of $S_n$. For this, we use \emph{graph matrices}, which are described by \emph{shapes}. The idea is that each ribbon $R$ has a shape $\alpha$ which is obtained by replacing the vertices of $R$ with unspecified indices. Up to scaling, the graph matrix $M_{\alpha}$ is the average of $M_{\pi(R)}$ over all permutations $\pi \in S_n$.



\begin{definition}[Simplified shapes -- see \cref{def: shapes}]
	Informally, a shape $\alpha$ is just a ribbon $R$ where the vertices are specified by variables rather than having specific values in $[n]$. More precisely, a shape $\alpha = (V(\alpha),E(\alpha),U_{\alpha},V_{\alpha})$ is a graph on vertices $V(\alpha)$, with
	\begin{enumerate}
		\item Edges $E(\alpha) \subseteq {{V(\alpha)} \choose 2}$
		\item Distinguished tuples of vertices $U_\alpha = (u_1,u_2,\dots)$ and $V_\alpha = (v_1,v_2,\dots)$, where $u_i,v_i \in V(\alpha)$.
	\end{enumerate}
	(Note that $V(\alpha)$ and $V_\alpha$ are not the same object!)
\end{definition}

\begin{definition}[Shape transposes]
	Given a shape $\alpha$, we define $\alpha^{\top}$ to be the shape $\alpha$ with $U_{\alpha}$ and $V_{\alpha}$ swapped, i.e., $U_{\alpha^{\top}} = V_{\alpha}$ and $V_{\alpha^{\top}} = U_{\alpha}$.
	Note that $M_{\alpha^{\top}} = M_\alpha^{\top}$, where $M_\alpha^{\top}$ is the usual transpose of the matrix-valued function $M_\alpha$.
\end{definition}

\begin{definition}[Graph matrices]
	Let $\alpha$ be a shape.
	The graph matrix $M_{\alpha} \, : \, \{ \pm 1\}^{n \choose 2} \rightarrow {\mathbb R}^{\frac{n!}{(n - |U_{\alpha}|)!} \times \frac{n!}{(n - |V_{\alpha}|)!}}$ is defined to be the matrix-valued function with $A,B$-th entry
	\[
	M_{\alpha}(A,B) = \sum_{\substack{R \text{ s.t. } A_R = A, B_R = B \\ \exists \varphi:V(\alpha) \rightarrow [n]: \\ \varphi \text{ is injective}, \varphi(\alpha) = R}}{\chi_{E_R}}
	\]
	In other words, $M_\alpha = \sum_{R} M_R$ where the sum is over ribbons $R$ which can be obtained by assigning each vertex in $V(\alpha)$ a label from $[n]$.

\end{definition}

\begin{example}
In \cref{fig: ribbon_shape}, consider the shape $\alpha$ as shown. We have $U_{\alpha} = (u_1, u_2), V_{\alpha} = (v_1), V(\alpha) = \{u_1, u_2, v_1, w_1\}$ and $E(\alpha) = \{\{u_1, w_1\}, \{u_2, w_1\}, \{w_1, v_1\}\}$. $M_{\alpha}$ is a matrix with rows and columns indexed by tuples of length $|U_{\alpha}| = 2$ and $|V_{\alpha}| = 1$ respectively. The nonzero entries will have rows and columns indexed by $(a_1, a_2)$ and $(b_1)$ respectively for all distinct $a_1, a_2, b_1$, with the corresponding entry being $M_{\alpha}((a_1, a_2), (b_1)) = \sum_{c_1 \in [n] \setminus \{a_1, a_2, b_1\}} \chi_{a_1, c_1}\chi_{a_2, c_1}\chi_{c_1, b_1}$. Here, the injective map $\varphi$ maps $u_1, u_2, w_1, v_1$ to $a_1, a_2, c_1, b_1$ respectively and we sum over all such maps. Succinctly, \[M_{\alpha} =
  \begin{blockarray}{rl@{}c@{}r}
    & & \makebox[0pt]{column $(b_1)$} \\[-0.5ex]
    & & \,\downarrow \\[-0.5ex]
    \begin{block}{r(l@{}c@{}r)}
    &  & \vdots & \\[-0.2ex]
    \text{row }(a_1, a_2) \rightarrow \mkern-9mu & \raisebox{0.5ex}{\makebox[3.2em][l]{\dotfill}} & \sum_{c_1 \in [n] \setminus \{a_1, a_2, b_1\}} \chi_{a_1, c_1}\chi_{a_2, c_1}\chi_{c_1, b_1} & \raisebox{0.5ex}{\makebox[4.2em][r]{\dotfill}} \\[+.5ex]
    &  & \vdots & \\
    \end{block}
  \end{blockarray}\]
\end{example}

\begin{remk}
    The fact that we are summing over all ``free'' vertices in $V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$ is how we are incorporating symmetry into the definition of these graph matrices.
\end{remk}

The following examples illustrate that simple matrices such as the adjacency matrix of a graph and the identity matrix are also graph matrices.

\begin{example}[Adjacency matrix]\label{ex:adj-matrix}
	Let $\alpha$ be the shape with two vertices $V(\alpha) = \{u_1,v_1\}$ and a single edge $E(\alpha) = \{\{ u_1,v_1\}\}$. The tuples $U_\alpha, V_\alpha$ are $(u_1), (v_1)$, respectively.
	Then $M_\alpha$ has entries $(M_\alpha)_{i,j}(G) = G_{ij}$ if $i \neq j$ and $(M_\alpha)_{i,i} = 0$.
	If $G \in \{ \pm 1\}^{n \choose 2}$ is thought of as a graph, then $M_\alpha$ is precisely its $\pm 1$ adjacency matrix with zeros on the diagonal.
\end{example}

\begin{example}[Identity matrix]
	If $V(\alpha) = \{u\}$ is a singleton, $E(\alpha) = \emptyset$, and $U_{\alpha} = V_{\alpha} = (u)$, then $M_\alpha(G)$ is identically equal to the $n \times n$ identity matrix, independent of $G$.
\end{example}



For more examples of graph matrices and why they can be a useful tool to work with, see \cite{AMP20}.
\begin{remark}
As noted in \cite{AMP20}, we index graph matrices by tuples rather than sets so that they are symmetric (as a function of the input) under permutations of $[n]$.
\end{remark}

\subsection{Factoring Graph Matrices and Decomposing Shapes into Left, Middle, and Right Parts}

A crucial idea in our analysis is the idea from \cite{BHKKMP16} of decomposing each shape $\alpha$ into left, middle, and right parts. This will allow us to give an approximate factorization of each graph matrix $M_{\alpha}$.

\subsubsection{Leftmost and Rightmost Minimum Vertex Separators and Decomposition of Shapes into Left, Middle, and Right Parts}
For each shape $\alpha$ we will identify three other shapes, which we denote by $\sigma,\tau,{\sigma'}^T$ and call (for reasons we will see soon) the \emph{left, middle, and right parts of $\alpha$}, respectively.
The idea is that $M_{\alpha} \approx M_{\sigma} M_{\tau} M_{{\sigma'}^T}$.
We obtain $\sigma, \tau$, and ${\sigma'}^T$ by splitting the shape $\alpha$ along the \emph{leftmost and rightmost minimum vertex separators}.

\begin{definition}[Vertex Separators]
	We say that a set of vertices $S$ is a vertex separator of $\alpha$ if every path from $U_{\alpha}$ to $V_{\alpha}$ in $\alpha$ (including paths of length $0$) intersects $S$. Note that for any vertex separator $S$, $U_{\alpha} \cap V_{\alpha} \subseteq S$. 
\end{definition}



\begin{definition}[Minimum Vertex Separators]
	We say that $S$ is a minimum vertex separator of $\alpha$ if $S$ is a vertex separator of $\alpha$ and for any other vertex separator $S'$ of $\alpha$, $|S| \leq |S'|$.
\end{definition}

\begin{definition}[Leftmost and Rightmost Minimum Vertex Separators] \
	\begin{enumerate}
		\item We say that $S$ is the leftmost minimum vertex separator of $\alpha$ if $S$ is a minimum vertex separator of $\alpha$ and for every other minimum vertex separator $S'$ of $\alpha$, every path from $U_{\alpha}$ to $S'$ intersects $S$. 
		\item We say that $T$ is the rightmost minimum vertex separator of $\alpha$ if $T$ is a minimum vertex separator of $\alpha$ and for every other minimum vertex separator $S'$ of $\alpha$, every path from $S'$ to $V_{\alpha}$ intersects $T$. 
	\end{enumerate}
\end{definition}
It is not immediately obvious that leftmost and rightmost minimum vertex separators are well-defined. For the simplified setting we are considering here, this was shown by \cite{BHKKMP16}. We give a more general proof in \cref{separatorswelldefinedsection}.

We now describe how to split $\alpha$ into left, middle, and right parts $\sigma, \tau$, and ${\sigma'}^T$.

\begin{definition}[Decomposition Into Left, Middle, and Right Parts]
	Let $\alpha$ be a shape and let $S$ and $T$ be the leftmost and rightmost minimum vertex separators of $\alpha$. Given orderings $O_S$ and $O_T$ for $S$ and $T$, we decompose $\alpha$ into left, middle, and right parts $\sigma$, $\tau$, and ${\sigma'}^T$ as follows.

	\begin{enumerate}
		\item The left part $\sigma$ of $\alpha$ is the part of $\alpha$ reachable from $U_\alpha$ without passing through $S$. It includes $S$ but excludes all edges which are entirely within $S$.
		More formally, 
		\begin{enumerate}
		    \item $V(\sigma) = \{u \in V(\alpha): \text{ there is a path } P \text{ from } U_{\alpha} \text{ to } u \text{ in } \alpha \text{ such that } (V(P) \setminus \{u\}) \cap S = \emptyset\}$
		    \item $U_\sigma = U_\alpha$ and $V_\sigma = S$ with the ordering $O_S$
		    \item $E(\sigma) = \{\{u,v\} \in E(\alpha): u,v \in V(\sigma), u \notin S \text{ or } v \notin S\}$
		\end{enumerate}
		\item
		The right part ${\sigma'}^T$ of $\alpha$ is the part of $\alpha$ reachable from $V_\alpha$ without intersecting $T$ more than once. It includes $T$ but excludes all edges which are entirely within $T$.
		More formally, 		
		\begin{enumerate}
		    \item $V({\sigma'}^T) = \{u \in V(\alpha): \text{ there is a path } P \text{ from } V_{\alpha} \text{ to } u \text{ in } \alpha \text{ such that } (V(P) \setminus \{u\}) \cap T = \emptyset\}$
		    \item $U_{{\sigma'}^T} = T$ with the ordering $O_T$ and $V_{{\sigma'}^T} = V_\alpha$.
		    \item $E({\sigma'}^T) = \{\{u,v\} \in E(\alpha): u,v \in V({\sigma'}^T), u \notin T \text{ or } v \notin T\}$
		\end{enumerate}
		\item The middle part $\tau$ of $\alpha$ is, informally, the part of $\alpha$ between $S$ and $T$ (including $S$ and $T$ and all edges which are entirely within $S$ or within $T$).
		More formally, let $U_\tau = S$ with the ordering $O_S$, let $V_\tau = T$ with the ordering $O_T$, and let $E(\tau) = E(\alpha) \setminus (E(\sigma) \cup E(\sigma'))$ be all of the edges of $E(\alpha)$ which do not appear in $E(\sigma)$ or $E(\sigma')$.
		Then $V(\tau)$ is all of the vertices incident to edges in $E(\tau)$ together with $S, T$.
	\end{enumerate}
\end{definition}

\begin{example}
    \cref{fig: basic_shape_comp} illustrates an example decomposition.
    \begin{enumerate}
        \item If we start with the shape $\alpha$ denoted as $\sigma \circ \sigma'^T$, observe that there is a unique minimum vertex separator, which consists of the middle vertex of degree $5$, i.e. the one that's not in either $U_{\sigma \circ \sigma'^T}$ or $V_{\sigma \circ \sigma'^T}$.
        Then, $\alpha$ is decomposed into the left part $\sigma$, a trivial middle part $\tau$ (not shown in this figure) which has $V(\tau) = \{u\}, U_{\tau} = V_{\tau} = (u), E(\tau) = \emptyset$, and the right part $\sigma'^T$.
        \item If we start with the shape $\alpha$ denoted as $\sigma \circ \tau \circ \sigma'^T$, then the leftmost minimum vertex separator is the vertex of degree $4$ and the rightmost minimum vertex separator is the vertex of degree $5$. Then, $\alpha$ is decomposed into the left part $\sigma$, the middle part $\tau$ and the right part $\sigma'^T$, which are all shown in this figure.
    \end{enumerate}
\end{example}

\begin{remark}
Note that the decomposition into left, middle, and right parts depends on the ordering for the vertices in $S$ and $T$. As we will discuss later (see Section \ref{fullcoefficientmatrixsubsection}), we will use all possible orderings simultaneously and then scale things by an appropriate constant.
\end{remark}

\begin{figure}[!h]
    \centering
    \includegraphics[scale=0.45, trim={4.5cm 2cm 0 2cm},clip]{machinery/images/basic_shape_comp}
    \caption{Illustration of shape composition and decomposition.}
    \label{fig: basic_shape_comp}
\end{figure}


\noindent Because of the minimality and leftmost/rightmost-ness of the vertex separators $S,T$ used to define $\sigma, \tau, \sigma'$, the shapes $\sigma, \tau, \sigma'$ have some special combinatorial structure, which we capture in the following proposition. We defer the proof until \cref{sec: technical_def_and_main_theorem} where we state a generalized version.

\begin{proposition}
	$\sigma$, $\tau$, and ${\sigma'}^{T}$ have the following properties:
	\begin{enumerate}
		\item $V_{\sigma} = S$ is the unique minimum vertex separator of $\sigma$.
		\item $S$ and $T$ are the leftmost and rightmost minimum vertex separators of $\tau$.
		\item $T = U_{{\sigma'}^T}$ is the unique minimum vertex separator of ${\sigma'}^T$.
	\end{enumerate}
\end{proposition}

\noindent Based on this, we define sets of shapes which can appear as left, middle, or right parts.

\begin{definition}[Left, Middle, and Right Parts] Let $\alpha$ be a shape.
	\begin{enumerate}
		\item We say that $\alpha$ is a left part if $V_{\alpha}$ is the unique minimum vertex separator of $\alpha$, all vertices of $\alpha$ are reachable from $U_{\alpha}$ without passing through $V_{\alpha}$, and $E(\alpha)$ has no edges which are entirely contained in $V_{\alpha}$.
		\item We say that $\alpha$ is a proper middle part if $U_{\alpha}$ is the leftmost minimum vertex separator of $\alpha$ and $V_{\alpha}$ is the rightmost minimum vertex separator of $\alpha$
		\item We say that $\alpha$ is a right part if $U_{\alpha}$ is the unique minimum vertex separator of $\alpha$, all vertices of $\alpha$ are reachable from $V_{\alpha}$ without passing through $U_{\alpha}$, and $E(\alpha)$ has no edges which are entirely contained in $U_{\alpha}$.
	\end{enumerate}
\end{definition}
\begin{remark}
	For technical reasons, later on we will need to consider improper middle parts $\tau$ where $U_{\tau}$ and $V_{\tau}$ are not the leftmost and rightmost minimum vertex separators of $\tau$, which is why we make this distinction here.
\end{remark}

\noindent The following proposition is also straightforward from the definitions.

\begin{proposition}
	A shape $\sigma$ is a left part if and only if $\sigma^{T}$ is a right part
\end{proposition}

\subsubsection{Products of Graph Matrices}
We now analyze what happens when we take the products of graph matrices. Roughly speaking, we will have that if $\alpha$ can be decomposed into left, middle, and right parts $\sigma$, $\tau$, and ${\sigma'}^{T}$ then $M_{\alpha} \approx M_{\sigma}M_{\tau}M_{{\sigma'}^T}$. However, this is only an approximation rather than an equality, and this will be the source of considerable technical difficulties.

We begin with a concatenation operation on ribbons.
\begin{definition}[Ribbon Concatenation]
	If $R_1$ and $R_2$ are two ribbons such that $V(R_1) \cap V(R_2) = B_{R_1} = A_{R_2}$ and either $R_1$ or $R_2$ contains no edges entirely within $B_{R_1} = A_{R_2}$ then we define $R_1 \circ R_2$ to be the ribbon formed by glueing together $R_1$ and $R_2$ along $B_{R_1} = A_{R_2}$.
	In other words,
	\begin{enumerate}
		\item $V(R_1 \circ R_2) = V(R_1) \cup V(R_2)$
		\item $E(R_1 \circ R_2) = E(R_1) \cup E(R_2)$
		\item $A_{R_1 \circ R_2} = A_{R_1}$ and $B_{R_1 \circ R_2} = B_{R_2}$.
	\end{enumerate}
\end{definition}

\noindent The following proposition is easy to check.

\begin{proposition}
	Whenever $R_1, R_2$ are ribbons such that $R_1 \circ R_2$ is defined, $M_{R_1}M_{R_2} = M_{R_1 \circ R_2}$
\end{proposition}

\noindent We have an analogous definition for concatenating shapes:

\begin{definition}[Shape Concatenation]
	If $\alpha_1$ and $\alpha_2$ are two shapes such that $V(\alpha_1) \cap V(\alpha_2) = V_{\alpha_1} = U_{\alpha_2}$ and either $\alpha_1$ or $\alpha_2$ contains no edges entirely within $V_{\alpha_1} = U_{\alpha_2}$ then we define $\alpha_1 \circ \alpha_2$ to be the shape formed by glueing together $\alpha_1$ and $\alpha_2$ along $V_{\alpha_1} = U_{\alpha_2}$. In other words,
	\begin{enumerate}
		\item $V(\alpha_1 \circ \alpha_2) = V(\alpha_1) \cup V(\alpha_2)$
		\item $E(\alpha_1 \circ \alpha_2) = E(\alpha_1) \cup E(\alpha_2)$
		\item $U_{\alpha_1 \circ \alpha_2} = U_{\alpha_1}$ and $V_{\alpha_1 \circ \alpha_2} = V_{\alpha_2}$.
	\end{enumerate}
\end{definition}

\begin{example}
    \cref{fig: basic_shape_comp} illustrates an example of shape composition. Observe how the shapes $\sigma \circ \sigma'^T$ and $\sigma \circ \tau \circ \sigma'^T$ are obtained from the shapes $\sigma, \tau$ and $\sigma'^T$.
\end{example}

\noindent The next proposition, again easy to check, shows that the shape concatenation operation respects the left/middle/right part decomposition.

\begin{proposition}
	If $\alpha$ can be decomposed into left, middle, and right parts $\sigma,\tau,{\sigma'}^{T}$ then $\alpha = \sigma \circ \tau \circ {\sigma'}^T$.
\end{proposition}

We now discuss why $M_\alpha = M_{\sigma \circ \tau \circ {\sigma'}^T} \approx M_\sigma M_\tau M_{{\sigma'}^T}$ is only an approximation rather than an equality. Consider the difference $M_{\sigma}M_\tau M_{{\sigma'}^T} - M_{\sigma \circ \tau \circ {\sigma'}^T}$. The graph matrix $M_{\sigma \circ \tau \circ {\sigma'}^T}$ decomposes (by definition) into a sum over injective maps $\varphi \, : \, V(\sigma \circ \tau \circ {\sigma'}^T) \rightarrow [n]$. Also by expanding definitions, the product $M_{\sigma}M_\tau M_{{\sigma'}^T}$ expands into a sum over triples of injective maps $(\varphi_1, \varphi_2, \varphi_3)$, where $\varphi_1 \, : \, V(\sigma) \rightarrow [n], \varphi_2 \, : \, V(\tau) \rightarrow [n], \varphi_3 \, : \, V(\sigma') \rightarrow [n]$ where $\varphi_1$ and $\varphi_2$ agree on $V_{\sigma} = U_{\tau}$ and $\varphi_2$ and $\varphi_3$ agree on $V_{\tau} = U_{{\sigma'}^T}$.

If $\varphi_1, \varphi_2, \varphi_3$ are combined into one map $\varphi \, : \, V(\sigma) \cup V(\tau) \cup V({\sigma'}^T) \rightarrow [n]$, the resulting $\varphi$ may not be injective because $\varphi_1(V(\sigma)), \varphi_2(V(\tau)), \varphi_3(V({\sigma'}^T))$ may have nontrivial intersection (beyond $\varphi_1(V_\sigma)$ and $\varphi_2(V_\tau)$).
We call the resulting terms \emph{intersection terms} and handling them properly is a major part of the technical analysis.
\begin{remark}
    Actually, the approximation $M_\alpha = M_{\sigma \circ \tau \circ {\sigma'}^T} \approx M_\sigma M_\tau M_{{\sigma'}^T}$ is also off by a multiplicative constant because there is also a subtle issue involving the automorphism groups of these shapes. For now, we ignore this issue. For details about this issue, see Lemma \ref{lm:morthsimplereexpression}.
\end{remark}




\subsection{Shape Coefficient Matrices}
The idea for our analysis is as follows. Given a matrix-valued function $\Lambda$ which is symmetric under permutations of $[n]$, we write $\Lambda = \sum_{\alpha}{\lambda_{\alpha}M_{\alpha}}$. We then break each shape $\alpha$ up into left, middle, and right parts $\sigma$, $\tau$, and ${\sigma'}^{T}$.

For this analysis, we use \emph{shape coefficient matrices} $H_{\tau}$ whose rows and columns are indexed by left shapes and whose entries depend on the coefficients $\lambda_{\alpha}$. We choose these matrices so that 
\[
\Lambda = \sum_{\sigma,\tau,\sigma'}{H_{\tau}(\sigma,\sigma')M_{\sigma \circ \tau \circ {\sigma'}^T}} \approx \sum_{\sigma,\tau,\sigma'}{H_{\tau}(\sigma,\sigma')M_{\sigma}M_{\tau}M_{{\sigma'}^T}}.
\]
To set this up, we separate the possible middle parts $\tau$ into groups based on the size of $U_{\tau}$ and whether or not they are trivial.
\begin{definition}
    We define $\mathcal{I}_{mid}$ to be the set of all possible $U_{\tau}$. Here $\mathcal{I}_{mid}$ is the set of tuples of unspecified vertices of the form $U = (u_1,\ldots,u_k)$ where $0 \leq k \leq d$.
\end{definition}
\begin{definition}
    We say that a proper middle shape $\tau$ is trivial if $E(\tau) = \emptyset$ and $|U_{\tau} \cap V_{\tau}| = |U_{\tau}| = |V_{\tau}|$ (i.e. $V_{\tau}$ is a permutation of $U_{\tau}$).
\end{definition}
For simplicity, the only proper trivial middle parts $\tau$ we consider are shapes $Id_U$ corresponding to identity matrices. 
\begin{definition}
    Given a tuple of unspecified vertices $U = (u_1,\ldots,u_{|U|})$, we define $Id_U$ to be the shape where $V(Id_U) = U$, $U_{Id_U} = V_{Id_U} = U$, and $E(Id_U) = \emptyset$.
\end{definition}
We group all of the proper non-trivial middle parts $\tau$ into sets $\mathcal{M}_U$ based on the size of $U_{\tau}$.
\begin{definition}
    Given a tuple of unspecified vertices $U = (u_1,\ldots,u_{|U|})$, we define $\mathcal{M}_U$ to be the set of proper non-trivial middle parts $\tau$ such that $U_{\tau}$ and $V_{\tau}$ have the same size as $U$. Note that $U_{\tau}$ and $V_{\tau}$ may intersect each other arbitrarily.
\end{definition}
With these definitions, we can now define our shape coefficient matrices.
\begin{definition}
    Given $U \in \mathcal{I}_{mid}$, we define $\mathcal{L}_U$ to be the set of left shapes $\sigma$ such that $|V_{\sigma}| = |U|$.
\end{definition}
\begin{definition}
    For each $U \in \mathcal{I}_{mid}$, we define the shape coefficient matrix $H_{Id_U}$ to be the matrix indexed by left shapes $\sigma,\sigma' \in \mathcal{L}_{U}$ with entries $H_{Id_U}(\sigma,\sigma') = \frac{1}{|U|!}\lambda_{\sigma \circ {\sigma'}^T}$
\end{definition}
\begin{definition}
    For each $U \in \mathcal{I}_{mid}$, for each $\tau \in \mathcal{M}_{U}$, we define the shape coefficient matrix $H_{\tau}$ to be the matrix indexed by left shapes $\sigma,\sigma' \in \mathcal{L}_{U}$ with entries $H_{\tau}(\sigma,\sigma') = \frac{1}{(|U|!)^2}\lambda_{\sigma \circ \tau \circ {\sigma'}^T}$
\end{definition}
With these shape coefficient matrices, we have the following decomposition of $\Lambda = \sum_{\alpha}{\lambda_{\alpha}M_{\alpha}}$.
\begin{lemma}
$\Lambda = \sum_{U \in \mathcal{I}_{mid}}{\sum_{\sigma,\sigma' \in \mathcal{L}_U}{H_{Id_U}(\sigma,\sigma')M_{\sigma \circ {\sigma'}^T}}} + \sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_{U}}{\sum_{\sigma,\sigma' \in \mathcal{L}_U}{H_{\tau}(\sigma,\sigma')M_{\sigma \circ \tau \circ  {\sigma'}^T}}}}$
\end{lemma}
We defer the proof of this lemma to \cref{lm:determiningcoefficientmatrices}.

For technical reasons, we need to define one more operation to handle intersection terms. We call this operation \emph{the $-\gamma,\gamma$ operation.}

\begin{definition}
Given $U,V \in \mathcal{I}_{mid}$ where $|U| > |V|$, we define $\Gamma_{U,V}$ to be the set of left parts $\gamma$ such that $|U_{\gamma}| = |U|$ and $|V_{\gamma}| = |V|$.
\end{definition}
\begin{definition}
	Given $U,V \in \mathcal{I}_{mid}$ where $|U| > |V|$, a shape coefficient matrix $H_{Id_V}$, and a $\gamma \in \Gamma_{U,V}$, we define the shape coefficient matrix $H^{-\gamma,\gamma}_{Id_V}$ to be the matrix indexed by left shapes $\sigma,\sigma' \in \mathcal{L}_{U}$ with entries $H^{-\gamma,\gamma}_{Id_V}(\sigma,\sigma') = H_{Id_V}(\sigma \circ \gamma, \sigma' \circ \gamma)$.
\end{definition}

\subsection{Informal Theorem Statement}

We are now ready to state a simplified, qualitative version of our main theorem. For the full, quantitative version of our main theorem, see \cref{simplifiedmaintheorem}.

\begin{theorem}\label{informalmaintheoremstatement}
	There exist functions $f(\tau): \mathcal{M}_U \rightarrow {\mathbb R}$ and $f(\gamma): \Gamma_{U,V} \rightarrow {\mathbb R}$ depending on $n$ and other parameters such that if $\Lambda = \sum_{\alpha}{\lambda_{\alpha}M_{\alpha}}$ and the following conditions hold:
	\begin{enumerate}
		\item For all $U \in \mathcal{I}_{mid}$,  $H_{Id_U} \succeq 0$
		\item For all $U \in \mathcal{I}_{mid}$ and all $\tau \in \mathcal{M}_{U}$,
		\[
		\left[ {\begin{array}{cc}
				H_{Id_U} & f(\tau)H_{\tau} \\
				f(\tau)H^T_{\tau} & H_{Id_U}
		\end{array}} \right] \succeq 0
		\]
	
		\item For all $U,V \in \mathcal{I}_{mid}$ such that $|U| > |V|$ and all $\gamma \in \Gamma_{U,V}$, $H^{-\gamma,\gamma}_{Id_{V_{\gamma}}} \preceq f(\gamma)H_{Id_{U_{\gamma}}}$
	\end{enumerate}
	then with probability at least $1-o(1)$ over $G \sim \{ \pm 1\}^{\binom{n}{2}}$ it holds that $\Lambda(G) \succeq 0$.
\end{theorem}

\begin{remark}
	Condition $1$ of \cref{informalmaintheoremstatement} will follow from condition $2$ but we state it explicitly since it will correspond to the dominating terms of the approximate PSD decomposition. And in applications, it will be both easy to verify and will shed light on the structure of the coefficients which in turn will be useful for verifying conditions $2$ and $3$.
\end{remark}

\begin{remark}
As we will demonstrate in the remainder of this paper, our machinery works well when the coefficients $\lambda_{\alpha}$ have a polynomial decay for each vertex or edge in the shape. In many settings, this can be achieved quite easily by adding noise to the distribution, such as resampling part of the input, or by lowering the parameters slightly, such as $m \le n^{k/4 - \varepsilon}$ instead of $m \le n^{k/4}$. That said, it is possible to optimize many of our functions so that a significantly better bound can be obtained (for example, $m \le n^{k/4} / \mathrm{polylog}(n)$ is potentially feasible), but we do not attempt this in this work.
\end{remark}

\subsubsection{Choice of functions $f(\tau)$ and $f(\gamma)$, up to lower order terms}

In a rough sense, $f(\tau)$ measures the blow-up in the norm by using $M_{\tau}$ instead of $M_{Id_U}$ in the corresponding term of the Fourier decomposition. So we choose $f(\tau)$ to be $\norm{M_{\tau}}$, up to lower order terms. Our second condition verifies that the coefficients that arise because of this $\tau$ (which are encoded in $H_{\tau}$) are sufficiently small to overpower this norm blowup.

The fact that $\norm{M_{\tau}}$ is equal to $\tilde{O}(n^{\frac{|V(\tau)|-|U_{\tau}|}{2}})$ has been shown in previous works \cite{BHKKMP16, AMP20}. So, we choose $f(\tau)$ to be $\tilde{O}(n^{\frac{|V(\tau)|-|U_{\tau}|}{2}})$ where the problem instance is on $G_{n, 1/2}$. For problems with Gaussian or other inputs, similar forms of $f(\tau)$ can be used, which have been shown formally in the work of \cite{AMP20}. When we state the main theorem in general, we use a single $f(\tau)$ that incorporates all of these settings.

$f(\gamma)$ is a bit trickier to describe. In our analysis, we roughly collect intersection terms from the approximate PSD decomposition and charge them to shapes of the form $\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T$. Using the same idea as the previous step, we charge these to shapes with trivial middle shapes. Roughly, $f(\gamma)$ for a fixed $\gamma$ upper bounds the blowup from the norms of the original shape as compared to the new intersection shape. And the third condition argues that the original coefficients are sufficiently small to compensate for these blowups.

For problems on $G_{n, 1/2}$, we set $f(\gamma) = \tilde{O}(n^{|V(\gamma) \setminus U_{\gamma}|})$. For problems with Gaussian inputs, we choose essentially the same function, but they fall under the umbrella of generalized graph matrices, where $V(\gamma)$ and $U_{\gamma}$ are defined accordingly. Indeed, in our main theorem, we encompass both these settings with a single choice of $f(\gamma)$.

\subsection{An informal application to planted clique}
Before we move on to further definitions needed for a more complete statement of the main theorem, we present an informal example.

\begin{example}
	When the pseudo-calibration method is applied to prove an SoS lower bound for the planted clique problem in $n$ node graphs with clique size $k$, as in \cite{BHKKMP16}, the matrix-valued function which results is $\Lambda =  \sum_{\alpha \, : \, |V(\alpha)| \leq t}{\left(\frac{k}{n}\right)^{|V(\alpha)|}M_{\alpha}}$ where $t \approx \log(n)$.
	One may then compute that the matrices $H_{Id_U}$ and $H_{\tau}$ are as follows (at least so long as $|V(\sigma)|,|V(\tau)|,|V(\sigma')| \ll t$; we ignore this detail for now).
	For all $r \in [0,\frac{d}{2}]$,
	\begin{enumerate}
		\item For $U$ with $|U|  = r$, $H_{Id_U}(\sigma,\sigma') = \left(\frac{k}{n}\right)^{|V(\sigma)| + |V(\sigma')| - r}$
		\item For all proper, non-trivial middle shapes $\tau$ such that $|U_{\tau}| = |V_{\tau}| = r$, 
		\[
		H_{\tau}(\sigma,\sigma') = \left(\frac{k}{n}\right)^{|V(\sigma)| + |V(\sigma')| + |V(\tau)|- 2r}
		\]
	\end{enumerate}
	Defining $v_r$ to be the vector such that $v_r(\sigma) = \left(\frac{k}{n}\right)^{|V(\sigma)| - \frac{r}{2}}$, we have that 
	\begin{enumerate}
		\item For $U$ with $|U|  = r$, $H_{Id_U} = {v_{|U|}}{v^T_{|U|}}$
		\item For all proper, non-trivial middle shapes $\tau$ such that $|U_{\tau}| = |V_{\tau}| = r$, $H_{\tau} = \left(\frac{k}{n}\right)^{|V(\tau)|- r}{v_r}{v^T_r}$
		\item For all left parts $\gamma$, $H^{-\gamma,\gamma}_{Id_{V_{\gamma}}} = \left(\frac{k}{n}\right)^{2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}|}v_{|U_{\gamma}|}v^{T}_{|U_{\gamma}|}$
	\end{enumerate}
	It turns out in this setting that we can take $f(\tau)$ to be $\tilde{O}(n^{\frac{|V(\tau)|-|U_{\tau}|}{2}})$ and $f(\gamma)$ to be $\tilde{O}(n^{|V(\gamma) \setminus U_{\gamma}|})$. Thus, as long as $k \ll \sqrt{n}$,
	\begin{enumerate}
		\item For any $U$ and all $\tau$ such that $V_{\tau} \neq U_{\tau}$ with $|U_{\tau}| = |V_{\tau}| = |U|$, $f(\tau)H_{\tau} \preceq H_{Id_U}$.
		\item For all non-trivial left parts $\gamma$, $H^{-\gamma,\gamma}_{Id_{V_{\gamma}}} \preceq f(\gamma)H_{Id_{U_{\gamma}}}$
	\end{enumerate}
\end{example}
\begin{remark}\label{rmk: planted_clique_failure}
This does not quite satisfy the conditions of Theorem \ref{informalmaintheoremstatement} because there are $\tau$ such that $V_{\tau} = U_{\tau}$ but which are non-trivial because $E(\tau) \neq \emptyset$. For these $\tau$, condition 2 of Theorem \ref{informalmaintheoremstatement} fails. In order to prove their SoS lower bounds for planted clique, \cite{BHKKMP16} handle this issue by grouping together all of the $\tau$ where $V_{\tau} = U_{\tau}$ into the indicator function for whether $V_{\tau} = U_{\tau}$ is a clique.

Since this issue is specific to planted clique, we don't try to incorporate it into the machinery to avoid losing generality.

\end{remark}

For the sake of exposition, a detailed analysis with figures of all the shapes and all the coefficient matrices that appear for the degree-$4$ SoS lower bound for planted clique is given in \cref{sec: deg_4_planted_clique}.

\subsection{Generalizing the machinery}
In this section, we restricted ourselves to the case when the input is from $\{-1,1\}^{\binom{n}{2}}$ for simplicity. However, for our results we will need to handle more general types of inputs. We now briefly describe which kinds of inputs we will need to handle and how we handle them.
\begin{enumerate}
    \item In general, the entries of the input may be labeled by more than $2$ indices. For example, for tensor PCA on order $3$ tensors, the entries of the input are indexed by $3$ indices. To handle this, we will have shapes which have hyperedges rather than edges.
    \item In general, the entries of the input will come from a distribution $\Omega$ rather than being $\pm{1}$. To handle this, we will take an orthonormal basis $\{h_k\}$ for $\Omega$. We will then give each edge/hyperedge a label $l$ to specify which polynomial $h_l$ should be applied to that entry of the input.   
    \item In general, there may be $t$ different types of indices rather than just one type of index. In this case, the symmetry group will be $S_{n_1} \times \ldots \times S_{n_t}$ rather than $S_n$. To handle this, we will have shapes with different types of vertices.
\end{enumerate}
We formally make these generalizations in \cref{sec: technical_def_and_main_theorem}.

\subsection{Further definitions needed for our applications}

We will describe some more notations and definitions that will be useful to us to describe the qualitative bounds for our applications. For each of our applications, we will describe the corresponding modifications needed to the definitions already in place and present new definitions where necessary.

\subsubsection{Planted slightly denser subgraph}

Since the input is a graph $G \in \{-1, 1\}^{\binom{[n]}{2}}$, most of what we introduced already applies to this setting. To describe the moment matrix, we need to define the truncation parameters.

\begin{definition}[Truncation parameters]
For integers $D_{sos}, D_V \ge 0$, say that a shape $\alpha$ satisfies the truncation parameters $D_{sos}, D_V$ if
\begin{itemize}
    \item The degrees of the monomials that $U_{\alpha}$ and $V_{\alpha}$ correspond to, are at most $\frac{D_{sos}}{2}$
    \item The left part $\sigma$, the middle part $\tau$ and the right part $\sigma'^T$ of $\alpha$ satisfy $|V(\sigma)|, |V(\tau)|, |V(\sigma'^T)| \le D_V$
\end{itemize}
\end{definition}

\subsubsection{Tensor PCA}

We consider the input to be a tensor $A \in {\mathbb R}^{[n]^k}$. The input entries are now sampled from the distribution $\mathcal{N}(0, 1)$ instead of $\{-1, 1\}$. So, we will work with the Hermite basis of polynomials.

Let the standard unnormalized Hermite polynomials be denoted as $h_0(x) = 1, h_1(x) = x, h_2(x) = x^2 - 1, \ldots$. Then, we work with the basis $h_a(A) := \prod_{e \in [n]^k} h_{a_e}(A_e)$ over $a \in \mathbb{N}^{[n]^k}$. Accordingly, we will modify the graphs that represent ribbons (and by extension, shapes), to have labeled hyperedges of arity $k$. So, a hyperedge $e$ with a label $t$ will correspond to the Hermite polynomial $h_t(A_e)$.

\begin{definition}[Hyperedges]
Instead of standard edges, we will have labeled hyperedges of arity $k$ in the underlying graphs for our ribbons as well as shapes. The label for a hyperedge $e$, denoted $l_e$, is an element of $\mathbb{N}$ which will correspond to the Hermite polynomial being evaluated on that entry.
\end{definition}

Note that our hyperedges are ordered since the tensor $A$ is not necessarily symmetric.

For variables $x_1, \ldots, x_n$, the rows and columns of our moment matrix will now correspond to monomials of the form $\prod_{i \le n} x_i^{p_i}$ for $p_i \ge 0$. To capture this, we use the notion of index shape pieces and index shapes. Informally, we split the above monomial product into groups based on their powers and each such group will form an index shape piece.

\begin{definition}[Index shape piece]
    An index shape piece $U_i= ((U_{i, 1}, \ldots, U_{i, t}), p_i)$ is a tuple of indices $(U_{i, 1}, \ldots, U_{i, t})$ along with a power $p_i \in \mathbb{N}$. Let $V(U_i)$ be the set $\{U_{i, 1}, \ldots, U_{i, t}\}$ of vertices of this index shape piece. When clear from context, we use $U_i$ instead of $V(U_i)$.
\end{definition}

If we realize $U_{i, 1}, \ldots, U_{i, t}$ to be indices $a_1, \ldots, a_t \in [n]$, then, this realization of this index shape piece corresponds to the monomial $\prod_{j \le t} x_{a_j}^{p_i}$.

\begin{definition}[Index shape]
An index shape $U$ is a set of index shape pieces $U_i$ that have different powers. Let $V(U)$ be the set of vertices $\cup_i V(U_i)$. When clear from context, we use $U$ instead of $V(U)$.
\end{definition}

Observe that each realization of an index shape corresponds to a row or column of the moment matrix.

\begin{definition}
For two index shapes $U, V$, we write $U \equiv V$ if for all powers $p$, the index shape pieces of power $p$ in $U$ and $V$ have the same length.
\end{definition}

\begin{definition}
Define ${\mathcal I}_{mid}$ to be the set of all index shapes $U$ that contain only index shape pieces of power $1$.
\end{definition}

In the definition of shapes, the distinguished set of vertices should now be replaced by index shapes.

\begin{definition}[Shapes]
Shapes are tuples $\alpha = (H_{\alpha}, U_{\alpha}, V_{\alpha})$ where $H_{\alpha}$ is a graph with hyperedges of arity $k$ and $U_{\alpha}, V_{\alpha}$ are index shapes such that $U_{\alpha}, V_{\alpha} \subseteq V(H_{\alpha})$.
\end{definition}

\begin{definition}[Proper shape]
A shape $\alpha$ is proper if it has no isolated vertices outside $U_{\alpha} \cup V_{\alpha}$, no multi-edges and all the edges have a nonzero label.
\end{definition}

To define the notion of vertex separators, we modify the notion of paths for hyperedges.

\begin{definition}[Path]
A path is a sequence of vertices $u_1, \ldots, u_t$ such that $u_i, u_{i + 1}$ are in the same hyperedge, for all $i \le t - 1$.
\end{definition}

The notions of vertex separator and decomposition into left, middle and right parts are identically defined with the above notion of hyperedges and paths. In \cref{sec: technical_def_and_main_theorem}, we will show that they are well defined.

In the definition of trivial shape $\tau$, we now require $U_{\tau} \equiv V_{\tau}$. For $U \in {\mathcal I}_{mid}$, ${\mathcal M}_U$ will be the set of proper non-trivial middle parts $\tau$ with $U_{\tau} \equiv V_{\tau} \equiv U$ and ${\mathcal L}_U$ will be the set of left parts $\sigma$ such that $V_{\sigma} \equiv U$. Similarly, for $U, V \in {\mathcal I}_{mid}$, $\Gamma_{U, V}$ will be the set of left parts $\gamma$ such that $U_{\gamma} \equiv U$ and $V_{\gamma} \equiv V$.

In order to define the moment matrix, we need to truncate our shapes based on the number of vertices and the labels on our hyperedges. So, we make the following definition.

\begin{definition}[Truncation parameters]
For integers $D_{sos}, D_V, D_E \ge 0$, say that a shape $\alpha$ satisfies the truncation parameters $D_{sos}, D_V, D_E$ if
\begin{itemize}
    \item The degrees of the monomials that $U_{\alpha}$ and $V_{\alpha}$ correspond to, are at most $\frac{D_{sos}}{2}$
    \item The left part $\sigma$, the middle part $\tau$ and the right part $\sigma'^T$ of $\alpha$ satisfy $|V(\sigma)|, |V(\tau)|, |V(\sigma'^T)| \le D_V$
    \item For each $e \in E(\alpha)$, $l_e \le D_E$.
\end{itemize}
\end{definition}

\subsubsection{Sparse PCA}

We consider the $m$ vectors $v_1, \ldots, v_m \in {\mathbb R}^n$ to be the input. Similar to Tensor PCA, we will work with the Hermite basis of polynomials since the entries are sampled from the distribution $\mathcal{N}(0, 1)$.

In particular, if we denote the unnormalized Hermite polynomials by $h_0(x) = 1, h_1(x) = x, h_2(x) = x^2 - 1, \ldots$, then, we work with the basis $h_a(v) := \prod_{i \in [m], j \in [n]} h_{a_{i, j}}(v_{i, j})$ over $a \in \mathbb{N}^{m \times n}$. To capture these bases, we will modify the graphs that represent ribbons (and by extension, shapes), to be bipartite graphs with two types of vertices, and have labeled edges that go across vertices of different types. So, an edge $(i, j)$ with label $t$ between a vertex $i$ of type $1$ and a vertex $j$ of type $2$ will correspond to $h_t(v_{i, j})$.

\begin{definition}[Vertices]
We will have two types of vertices, the vertices corresponding to the $m$ input vectors that we call type $1$ vertices and the vertices corresponding to ambient dimension of the space that we call type $2$ vertices.
\end{definition}

\begin{definition}[Edges]
Edges will go across vertices of different types, thereby forming a bipartite graph. An edge between a type $1$ vertex $i$ and a type $2$ vertex $j$ corresponds to the input entry $v_{i, j}$. Each edge will have a label in $\mathbb{N}$ corresponding to the Hermite polynomial evaluated on that entry.
\end{definition}

We will have variables $x_1, \ldots, x_n$ in our SoS program, so we will work with index shape pieces and index shapes as in Tensor PCA, since the rows and columns of our moment matrix will now correspond to monomials of the form $\prod_{i \le n} x_i^{p_i}$ for $p_i \ge 0$. But since in our decompositions into left, right and middle parts, we will have type $2$ vertices as well in the vertex separators, we will define a generalized notion of index shape pieces and index shapes.

\begin{definition}[Index shape piece]
    An index shape piece $U_i= ((U_{i, 1}, \ldots, U_{i, t}), t_i, p_i)$ is a tuple of indices $(U_{i, 1}, \ldots, U_{i, t})$ along with a type $t_i \in \{1, 2\}$ and a power $p_i \in \mathbb{N}$. Let $V(U_i)$ be the set $\{U_{i, 1}, \ldots, U_{i, t}\}$ of vertices of this index shape piece. When clear from context, we use $U_i$ instead of $V(U_i)$.
\end{definition}

For an index shape piece $((U_{i, 1}, \ldots, U_{i, t}), t_i, p_i)$ with type $t_i = 2$, if we realize $U_{i, 1}, \ldots, U_{i, t}$ to be indices $a_1, \ldots, a_t \in [n]$, then this realization of the index shape piece corresponds to the monomial $\prod_{j \le t} x_{a_j}^{p_i}$.

\begin{definition}[Index shape]
An index shape $U$ is a set of index shape pieces $U_i$ that either have different types or different powers. Let $V(U)$ be the set of vertices $\cup_i V(U_i)$. When clear from context, we use $U$ instead of $V(U)$.
\end{definition}

Observe that each realization of an index shape corresponds to a row or column of the moment matrix. For our moment matrix, the only nonzero rows correspond to index shapes that have only index shape pieces of type $2$, since the only SoS variables are $x_1, \ldots, x_n$, but in order to do our analysis, we need to work with the generalized notion of index shapes that allow index shape pieces of both types.

\begin{definition}
For two index shapes $U, V$, we write $U \equiv V$ if for all types $t$ and all powers $p$, the index shape pieces of type $t$ and power $p$ in $U$ and $V$ have the same length.
\end{definition}

\begin{definition}
Define ${\mathcal I}_{mid}$ to be the set of all index shapes $U$ that contain only index shape pieces of power $1$.
\end{definition}

Since we are working with standard graphs, the notions of path and vertex separator need no modifications, but we will now use the minimum weight vertex separator instead of the minimum vertex separator, where we define the weight as follows.

\begin{definition}[Weight of an index shape]
Suppose we have an index shape $U = \{U_1, U_2\} \in {\mathcal I}_{mid}$ where $U_1 = ((U_{1, 1}, \ldots, U_{1, |U_1|}), 1, 1)$ is an index shape piece of type $1$ and $U_2 = ((U_{2, 1}, \ldots, U_{2, |U_2|}), 2, 1)$ is an index shape piece of type $2$. Then, define the weight of this index shape to be $w(U) = \sqrt{m}^{|U_1|}\sqrt{n}^{|U_2|}$.
\end{definition}

We now give the modified definition of shapes.

\begin{definition}[Shapes]
Shapes are tuples $\alpha = (H_{\alpha}, U_{\alpha}, V_{\alpha})$ where $H_{\alpha}$ is a graph with two types of vertices, has labeled edges only across vertices of different types and $U_{\alpha}, V_{\alpha}$ are index shapes such that $U_{\alpha}, V_{\alpha} \subseteq V(H_{\alpha})$.
\end{definition}

\begin{definition}[Proper shape]
A shape $\alpha$ is proper if it has no isolated vertices outside $U_{\alpha} \cup V_{\alpha}$, no multi-edges and all the edges have a nonzero label.
\end{definition}

In \cref{sec: technical_def_and_main_theorem}, we will show that with this new definition of weight and shapes, any shape $\alpha$ has a unique decomposition into $\sigma \circ \tau \circ \sigma'^T$ where $\sigma, \tau, \sigma'^T$ are left, middle and right parts respectively. Here, $\tau$ may possibly be improper.

In the definition of trivial shape $\tau$, we now require $U_{\tau} \equiv V_{\tau}$. For $U \in {\mathcal I}_{mid}$, ${\mathcal M}_U$ will be the set of proper non-trivial middle parts $\tau$ with $U_{\tau} \equiv V_{\tau} \equiv U$ and ${\mathcal L}_U$ will be the set of left parts $\sigma$ such that $V_{\sigma} \equiv U$. Similarly, for $U, V \in {\mathcal I}_{mid}$, $\Gamma_{U, V}$ will be the set of left parts $\gamma$ such that $U_{\gamma} \equiv U$ and $V_{\gamma} \equiv V$.

Finally, in order to define the moment matrix, we need to truncate our shapes based on the number of vertices and the labels on our edges. So, we make the following definition.

\begin{definition}[Truncation parameters]
For integers $D_{sos}, D_V, D_E \ge 0$, say that a shape $\alpha$ satisfies the truncation parameters $D_{sos}, D_V, D_E$ if
\begin{itemize}
    \item The degrees of the monomials that $U_{\alpha}$ and $V_{\alpha}$ correspond to, are at most $\frac{D_{sos}}{2}$
    \item The left part $\sigma$, the middle part $\tau$ and the right part $\sigma'^T$ of $\alpha$ satisfy $|V(\sigma)|, |V(\tau)|, |V(\sigma'^T)| \le D_V$
    \item For each $e \in E(\alpha)$, $l_e \le D_E$.
\end{itemize}
\end{definition}

\subsubsection{Relaxing the third condition}\label{sec: hgamma_qual}

In \cref{informalmaintheoremstatement}, the third qualitative condition we'd like to show is as follows:
For all $U,V \in \mathcal{I}_{mid}$ such that $|U| > |V|$ and all $\gamma \in \Gamma_{U,V}$, $H^{-\gamma,\gamma}_{Id_{V_{\gamma}}} \preceq f(\gamma)H_{Id_{U_{\gamma}}}$.

For technical reasons, we won't be able to show this directly. To handle this, we instead work with a slight modification of $H_{Id_{U_{\gamma}}}$, a matrix $H_{\gamma}'$ that's very close to $H_{Id_{U_{\gamma}}}$. So, what we will end up showing is:
For all $U,V \in \mathcal{I}_{mid}$ such that $|U| > |V|$ and all $\gamma \in \Gamma_{U,V}$, $H^{-\gamma,\gamma}_{Id_{V_{\gamma}}} \preceq f(\gamma)H'_{\gamma}$

Let $D_V$ be the truncation parameter. A canonical choice for $H'_{\gamma}$ is to take 
\begin{enumerate}
	\item $H'_{\gamma}(\sigma,\sigma') = H_{Id_U}(\sigma, \sigma')$ whenever $|V(\sigma \circ \gamma)| \leq D_V$ and $|V(\sigma' \circ \gamma)| \leq D_V$.
	\item $H'_{\gamma}(\sigma,\sigma') = 0$ whenever $|V(\sigma \circ \gamma)| > D_V$ or $|V(\sigma' \circ \gamma)| > D_V$.
\end{enumerate}

With this choice, $H_{\gamma}'$ is the same as $H_{Id_{U_{\gamma}}}$ up to truncation error. We will formally bound the errors in the quantitative sections after we introduce the full machinery.
\subsection{Pseudo-calibration}

We will pseudo-calibrate with respect to the following pair of random and planted distributions which we denote $\nu$ and $\mu$ respectively.

\PLDSdistributions*

We assume that the input is given as $G_{i, j}$ for $\{i, j\} \in \binom{[n]}{2}$ where $G_{i, j}$ is $1$ if the edge $(i, j)$ is present in the graph and $-1$ otherwise. We work with the Fourier basis $\chi_E$ defined as $\chi_E(G) := \prod_{(i, j) \in E} G_{i, j}$. For a subset $I \subseteq [n]$, define $x_I := \prod_{i \in I} x_i$.

\begin{lemma}
Let $I \subseteq [n], E \subseteq \binom{[n]}{2}$. Then,
\[\mathbb{E}_{\mu}[x_I \chi_E(G)] = \left(\frac{k}{n}\right)^{|I \cup V(E)|} (2p - 1)^{|E|}\]
\end{lemma}

\begin{proof}
When we sample $(G, S)$ from $\mu$, we condition on whether $I \cup V(E) \subseteq S$.
\begin{align*}
\mathbb{E}_{(G, S)\sim \mu}[x_I \chi_E(G)] &= Pr_{(G, S) \sim \mu}[I \cup V(E) \subseteq S]\mathbb{E}_{(G, S) \sim \mu}[x_I\chi_E(G)|I \cup V(E) \subseteq S]\\
&\qquad + Pr_{(G, S) \sim \mu}[I \cup V(E) \not\subseteq S]\mathbb{E}_{(G, S) \sim \mu}[x_I\chi_E(G)|I \cup V(E) \not\subseteq S]
\end{align*}
We claim that the second term is $0$. In particular, $\mathbb{E}_{(G, S) \sim \mu}[x_I\chi_E(G)|I \cup V(E) \not\subseteq S] = 0$ because when $I \cup V(E) \not\subseteq S$, either $S$ doesn't contain a vertex in $I$ or an edge $(i, j) \in E$ is outside $S$. If $S$ doesn't contain a vertex in $I$, then $x_I = 0$ and hence, the quantity is $0$. And if an edge $(i, j) \in E$ is outside $S$, since this edge is sampled with probability $\frac{1}{2}$, by taking expectations, the quantity $\mathbb{E}_{(G, S) \sim \mu}[x_I\chi_E(G)|I \cup V(E) \not\subseteq S]$ is $0$.

Finally, note that $Pr_{(G, S) \sim \mu}[I \cup V(E) \subseteq S] = \left(\frac{k}{n}\right)^{|I \cup V(E)|}$ and
\[\mathbb{E}_{(G, S) \sim \mu}[x_I\chi_E(G)|I \cup V(E) \subseteq S] = \mathbb{E}_{(G, S) \sim \mu}[\chi_E(G)|V(E) \subseteq S] = (2p - 1)^{|E|}\]
The last equality follows because for each edge $e \in E$, since $e$ is present independently with probability $p$, the expected value of $\chi_e$ is $1\cdot p + (-1) \cdot (1 - p) = 2p - 1$.
\end{proof}


Define the degree of SoS to be $D_{sos} = n^{C_{sos}\varepsilon}$ for some constant $C_{sos} > 0$ that we choose later. And define the truncation parameter to be $D_V = n^{C_V\varepsilon}$ for some constant $C_V > 0$.

\begin{remk}[Choice of parameters]\label{rmk: choice_of_params1}
	We first set $\varepsilon$ to be a sufficiently small constant. Based on this choice, we will set $C_V$ to be a sufficiently small constant to satisfy all the inequalities we use in our proof. Based on these choices, we can choose $C_{sos}$ to be sufficiently small to satisfy the inequalities we use.
\end{remk}

We will now describe the decomposition of the moment matrix $\Lambda$.

\begin{definition}\label{def: plds_coeffs}
	If a shape $\alpha$ satisfies the following properties:
	\begin{itemize}
		\item $\alpha$ is proper,
		\item $\alpha$ satisfies the truncation parameter $D_{sos}, D_V$.
	\end{itemize}
	then define \[\lambda_{\alpha} = \left(\frac{k}{n}\right)^{|V(\alpha)|}  (2p - 1)^{|E(\alpha)|}\]
	Otherwise, define $\lambda_{\alpha} = 0$.
\end{definition}

\begin{corollary}
	$\Lambda = \sum \lambda_{\alpha}M_{\alpha}$.
\end{corollary}

\subsection{Qualitative machinery bounds}

In this section, we will prove the PSD mass condition and the qualitative versions of the middle shape and intersection term bounds.

\begin{restatable}[PSD mass]{lemma}{PLDSone}\label{lem: plds_cond1}
	For all $U \in {\mathcal I}_{mid}$, $H_{Id_U} \succeq 0$
\end{restatable}

While this is easy to prove directly, we would like to introduce appropriate notation so that this lemma as well as the qualitative bounds to follow are immediate.
Therefore, we state the qualitative conditions next and then prove them all together.
Now, we define the following quantities which capture the contribution of the vertices within $\tau, \gamma$ to the Fourier coefficients.

\begin{restatable}{definition}{PLDSstau}\label{def: plds_stau}
	For $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, define
	$S(\tau) = \left(\frac{k}{n}\right)^{|V(\tau)| - |U_{\tau}|}(2p - 1)^{|E(\tau)|}$.
	And for all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and $\gamma \in \Gamma_{U, V}$, define
	$S(\gamma) = \left(\frac{k}{n}\right)^{|V(\gamma)| - \frac{|U_{\gamma}| + |V_{\gamma}|}{2}}(2p - 1)^{|E(\gamma)|}$.
\end{restatable}

We can now state our qualitative bounds, which we prove shortly.

\begin{restatable}[Qualitative middle shape bounds]{lemma}{PLDStwosimplified}\label{lem: plds_cond2_simplified}
	For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$,
	\[
	\begin{bmatrix}
		\frac{S(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
		H_{\tau}^T & \frac{S(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	\succeq 0
	\]
\end{restatable}

In the following qualitative intersection term bounds, we use the canonical definition of $H_{\gamma}'$ from \cref{sec: hgamma_qual}.

\begin{restatable}[Qualitative intersection term bounds]{lemma}{PLDSthreesimplified}\label{lem: plds_cond3_simplified}
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U, V}$,
	\[\frac{|Aut(V)|}{|Aut(U)|}\cdot\frac{1}{S(\gamma)^2}H_{Id_V}^{-\gamma, \gamma} = H_{\gamma}'\]

\end{restatable}

In order to prove these bounds, we define the following quantity to capture the contribution of the vertices within $\sigma$ to the Fourier coefficients.

\begin{definition}
	For a shape $\sigma\in {\mathcal L}$, define
	$T(\sigma) = \left(\frac{k}{n}\right)^{|V(\sigma)| - \frac{|V_{\sigma}|}{2}}(2p - 1)^{|E(\sigma)|}$.
	For $U \in {\mathcal I}_{mid}$, define $v_U$ to be the vector indexed by $\sigma \in {\mathcal L}$ such that $v_U(\sigma) = T(\sigma)$ if $\sigma \in {\mathcal L}_U$ and $0$ otherwise.
\end{definition}

The following propositions are immediate from \cref{def: plds_coeffs}.

\begin{propn}
	For all $U\in {\mathcal I}_{mid}$, $H_{Id_U} = \frac{1}{|Aut(U)|}v_Uv_U^T$.
\end{propn}





\begin{propn}
	For any $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, $H_{\tau} = \frac{1}{|Aut(U)|^2} S(\tau) v_Uv_U^T$.
\end{propn}


The first proposition implies that for all $U \in {\mathcal I}_{mid}$, $H_{Id_U} \succeq 0$, which is the PSD mass condition \cref{lem: plds_cond1}.
\cref{lem: plds_cond2_simplified} and \cref{lem: plds_cond3_simplified} also follow easily.

\begin{proof}[Proof of \cref{lem: plds_cond2_simplified}]
\begin{align*}
    \begin{bmatrix}
		\frac{S(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
		H_{\tau}^T & \frac{S(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix} &= \begin{bmatrix}
		\frac{S(\tau)}{|Aut(U)|^2}v_Uv_U^T & \frac{S(\tau)}{|Aut(U)|^2}v_Uv_U^T\\
		\frac{S(\tau)}{|Aut(U)|^2}v_Uv_U^T & \frac{S(\tau)}{|Aut(U)|^2}v_Uv_U^T
	\end{bmatrix} \succeq 0
\end{align*}
\end{proof}




\begin{proof}[Proof of \cref{lem: plds_cond3_simplified}]
    Fix $\sigma, \sigma' \in {\mathcal L}_{U}$ such that $|V(\sigma \circ \gamma)|, |V(\sigma' \circ \gamma)| \le D_V$. Note that $|V(\sigma)| - \frac{|V_{\sigma}|}{2} + |V(\sigma')| - \frac{|V_{\sigma'}|}{2} + 2(|V(\gamma)| - \frac{|U_{\gamma}| + |V_{\gamma}|}{2}) = |V(\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T)|$. Using \cref{def: plds_coeffs}, we can easily verify that $\lambda_{\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T} = T(\sigma)T(\sigma') S(\gamma)^2$. Therefore, we have $H_{Id_V}^{-\gamma, \gamma}(\sigma, \sigma') = \frac{|Aut(U)|}{|Aut(V)|} S(\gamma)^2 H_{Id_U}(\sigma, \sigma')$. Since $H'_{\gamma}(\sigma, \sigma') = H_{Id_U}(\sigma, \sigma')$ whenever $|V(\sigma \circ \gamma)|, |V(\sigma' \circ \gamma)| \le D_V$, this completes the proof.
\end{proof}



\subsection{The Sum of Squares Hierarchy}\label{subsec: sos}

The SoS hierarchy is a powerful class of algorithms parameterized by its degree. As we increase the degree, we get progressively stronger algorithms (with longer running times). It's been shown formally to obtain the state-of-the-art guarantees for many problems both in the worst case and the average case setting.
For constant degree SoS, the hierarchy can be optimized in polynomial time\footnote{There is a caveat, see \cite{o2017sos}}. In general, for degree-$d$ SoS, we can solve it in $n^{O(d)}$ time. Most of our applications in this paper focus on showing hardness for the SoS hierarchy when the degree is $n^{\varepsilon}$, which corresponds to a subexponential running time.

We now formally describe the sum of squares hierarchy.

\begin{definition}[Pseudo-expectation values]\label{def: pseudoexpectation}
Given polynomial constraints $g_1 = 0$,\ldots,$g_m = 0$, degree $d$ pseudo-expectation values are a linear map $\tilde{\EE}$ from polynomials of degree at most $d$ to $\mathbb{R}$ satisfying the following conditions:
  \begin{enumerate}
    \item $\tilde{\EE}[1] = 1$, \label{pe:normalized}
    \item $\tilde{\EE}[f \cdot g_i] = 0$ for every $i \in [m]$ and polynomial $f$ such that $\deg(f \cdot g_i) \leq d$. \label{pe:feasible}
    \item $\tilde{\EE}[f^2] \geq 0$ for every polynomial $f$ such that $\deg(f^2) \le d$. \label{pe:psdness}
  \end{enumerate}
\end{definition}
The intuition behind pseudo-expectation values is that the conditions on the pseudo-expectation values are conditions that would be satisfied by any actual expected values over a distribution of solutions, so optimizing over pseudo-expectation values gives a relaxation of the problem. Moreover, the conditions on pseudo-expectation values can be captured by a semidefinite program.

In particular, \cref{pe:psdness} in \cref{def: pseudoexpectation} can be reexpressed in terms of a matrix called the moment matrix.

\begin{definition}[Moment Matrix of $\tilde{\EE}$]
Given degree $d$ pseudo-expectation values $\tilde{\EE}$, define the associated moment matrix $\Lambda$ to be a matrix with rows and columns indexed by monomials $p$ and $q$ such that the entry corresponding to row $p$ and column $q$ is
  \[
  \Lambda[p, q] := \tilde{\EE}\left[pq\right].
  \]
\end{definition}

It is easy to verify that \cref{pe:psdness} in~\cref{def: pseudoexpectation} is equivalent to $\Lambda \succeq 0$.

For our setting, we are investigating the following kind of question. Given polynomial constraints $g_1 = 0$,\ldots,$g_m = 0$, can degree $d$ SoS certify that some other polynomial $h$ has value at most $c$?

If there do not exist pseudo-expectation values $\tilde{\EE}$ satisfying the conditions in~\cref{def: pseudoexpectation} such that $\tilde{\EE}[h] > c$ then degree $d$ SoS certifies that $\tilde{\EE}[h] \leq c$. More precisely, by duality, there exists a degree $d$ SoS/Positivstellensatz proof that $h \leq c$.

On the other hand, if there exist degree $d$ pseudo-expectation values $\tilde{\EE}$ satisfying the conditions in~\cref{def: pseudoexpectation} such that $\tilde{\EE}[h] > c$ then degree $d$ SoS fails to certify that $h \leq c$. This is what we need to show in order to prove SoS lower bounds on certification problems.

\begin{comment}

We will first introduce the notion of a pseudoexpectation operator for a set of polynomial constraints and then describe the Sum-of-Squares relaxation for a polynomial optimization problem.

For an integer $d$, Let ${\mathbb R}^{\le d}[x_1, \ldots, x_n]$ be the set of polynomials on $x_1 \ldots, x_n$ of degree at most $d$. We denote the degree of a polynomial
$f \in {\mathbb R}[x_1,\dots,x_n]$ by $\deg(f)$.

\begin{definition}[Pseudoexpectation operator]\label{def: pseudoexpectation}
  Given polynomial constraints $g_1(x) = 0, \ldots, g_m(x) = 0$ on variables $x_1,\dots,x_n$ such that $deg(g_i) \le D$ for an integer $D \ge 0$. For an even integer $d \ge D$, a degree-$d$ pseudoexpectation operator $\tilde{\EE}$ satisfying these constraints is an operator
  $\tilde{\EE} \colon {\mathbb R}^{\le d}[x_1,\dots,x_n] \rightarrow \mathbb{R}$ satisfying:
  \begin{enumerate}
    \item $\tilde{\EE}[1] = 1$, \label{pe:normalized}
    \item $\tilde{\EE}$ is an ${\mathbb R}$-linear operator, i.e., $\tilde{\EE}[f+cg] = \tilde{\EE}[f] + c\tilde{\EE}[g]$ for every $f,g \in {\mathbb R}^{\le d}[x_1,\dots,x_n], c \in {\mathbb R}$, \label{pe:linear}
    \item $\tilde{\EE}[g_i \cdot f] = 0$ for every $i = 1, \ldots, m$ and $f \in {\mathbb R}^{\le d}[x_1,\dots,x_n]$ with $\deg(f \cdot g_i) \le d$. \label{pe:feasible}
    \item $\tilde{\EE}[f^2] \ge 0$ for every $f \in {\mathbb R}^{\le d}[x_1,\dots,x_n]$ with $\deg(f^2) \le d$. \label{pe:psdness}
  \end{enumerate}
\end{definition}

The notion of $\tilde{\EE}$ generalizes the standard expectation operator. The idea is that optimizing over this larger space of pseudoexpectation operators can be formulated as a semidefinite programming problem and hence, will serve as a relaxation of our program that can be solved efficiently.

Formally, consider an optimization task on $n$ variables $x_1, \ldots, x_n \in {\mathbb R}$ formulated as maximizing a polynomial $f(x)$ subject to polynomial constraints $g_1(x) = 0, \ldots, g_m(x) = 0$. Suppose all the polynomials $f, g_1, \ldots, g_m$ have degree at most $D$.  Then, for an even integer $d \ge D$, the degree $d$ Sum-of-Squares relaxation of this program is as follows: Over all pseudoexpectation operators $\tilde{\EE}$ satisfying $\tilde{\EE}[g_1(x)] = 0, \ldots, \tilde{\EE}[g_m(x)] = 0$, output the maximum value of $\tilde{\EE}[f(x)]$.

To prove an SoS lower bound, we need to exhibit an operator $\tilde{\EE}$ that satisfies these constraints with optimum value of $\tilde{\EE}[f(x)]$ being far away from the true optimum.

In most cases, when constructing $\tilde{\EE}$, the condition \cref{pe:psdness} in \cref{def: pseudoexpectation} is the most technically challenging condition to satisfy. It can be equivalently stated as a positive semidefiniteness condition of an associated matrix called the moment matrix.

To define the moment matrix, we need to set up some more notation.

For an integer $d \ge 0$, let ${\mathcal I}_d$ denote the set of all tuples $(t_1, \ldots, t_n)$ such that $t_i \ge 0$ for all $i$ and $\sum t_i \le d$. For $I = (t_1, \ldots, t_n) \in {\mathcal I}_d$, denote by $x^I := x_1^{t_1}x_2^{t_2} \ldots x_n^{t_n}$.

\begin{definition}[Moment Matrix of $\tilde{\EE}$]
For a degree $2d$ pseudoexpectation operator $\tilde{\EE}$ on variables $x_1, \ldots, x_n$, define the associated moment matrix $\Lambda$ to be a matrix with rows and columns indexed by ${\mathcal I}_d$ such that the entry corresponding to row $I$ and column $J$ is
  \[
  \Lambda[I, J] := \tilde{\EE}\left[ x^I \cdot x^J \right].
  \]
\end{definition}

It is easy to verify that \cref{pe:psdness} in~\cref{def: pseudoexpectation} equivalent to $\Lambda \succeq 0$.

The machinery in this paper provides general conditions under which we can show that with high probability, $\Lambda \succeq 0$.
\end{comment}
\subsection{Pseudo-calibration}\label{subsec: pseudocalibration}

To obtain SoS integrality gaps on random instances, we need to construct valid pseudo-expectation values for a random input instance of an optimization problem. Naturally, these pseudo-expectation values will depend on the input. Pseudo-calibration is a heuristic introduced by \cite{BHKKMP16} to construct such candidate pseudo-expectation values almost mechanically by considering a planted distribution supported on instances of the problem with large objective value and using this planted distribution as a guide to construct the pseudo-expectation values.

Unfortunately, pseudo-calibration doesn't guarantee feasibility of these candidate pseudo-expectation values and the corresponding moment matrix and this has to be verified separately for different problems. This verification of feasibility is relatively easy except for the PSDness condition, which often leads to highly technical and involved analyses. The machinery attempts to mitigate this problem by providing easily verifiable conditions to prove PSDness, regardless of whether the moment matrix was obtained via pseudo-calibration.

For our applications, pseudo-calibration is used to obtain a candidate pseudoexpectation operator $\tilde{\EE}$ and a corresponding moment matrix $\Lambda$
from the random vs planted problem. This will be the starting point for all our applications. Here, we do not attempt to motivate and describe it in great detail. Instead, we will briefly describe the heuristic, the intuition behind it and show an example of how to use it. A detailed treatment can be found in \cite{BHKKMP16}.

Let $\nu$ denote the random distribution and $\mu$ denote the planted distribution. Let $v$ denote the input and $x$ denote the variables for our SoS relaxation. The main idea is that, for an input $v$ sampled from $\nu$ and any polynomial $f(x)$ of degree at most the SoS degree, pseudo-calibration proposes that for any low-degree test $g(v)$, the correlation of $\tilde{\EE}[f]$ should match in the planted and random distributions. That is,
\[\mathbb{E}_{v \sim \nu}[\tilde{\EE}[f(x)]g(v)] = \mathbb{E}_{(x, v) \sim \mu}[f(x)g(v)]\]

Here, the notation $(x, v) \sim \mu$ means that in the planted distribution $\mu$, the input is $v$ and $x$ denotes the planted structure in that instance. For example, in planted clique, $x$ would be the indicator vector of the clique. If there are multiple, pick an arbitrary one.

Let ${\mathcal F}$ denote the Fourier basis of polynomials for the input $v$. By choosing different basis functions from ${\mathcal F}$ as choices for $g$ such that the degree is at most $n^{\varepsilon}$ (hence the term low-degree test), we get all lower order Fourier coefficients for $\tilde{\EE}[f(x)]$ when considered as a function of $v$. Furthermore, the higher order coefficients are set to be $0$ so that the candidate pseudoexpectation operator can be written as
\[\tilde{\EE} f(x) = \sum_{\substack{g \in {\mathcal F}\\deg(g) \le n^{\varepsilon}}} \mathbb{E}_{v \sim \nu}[\tilde{\EE}[f(x)]g(v)] g(v) = \sum_{\substack{g \in {\mathcal F}\\deg(g) \le n^{\varepsilon}}} \mathbb{E}_{(x, v) \sim \mu}[f(x)g(v)] g(v)\]

The coefficients $\mathbb{E}_{(x, v) \sim \mu}[f(x)g(v)]$ can be explicitly computed in many settings, which therefore gives an explicit pseudoexpectation operator $\tilde{\EE}$.

One intuition for pseudo-calibration is as follows. The planted distribution is usually chosen to be a maximum entropy distribution which still has the planted structure. This conforms to the philosophy that random instances are hard for SoS, such as the uniform Bernoulli distribution for planted clique or the Gaussian distribution for Tensor PCA. By conditioning on the lower order moments matching such a planted distribution, pseudo-calibration can be interpreted as sort of interpolating between the random and planted distributions by only looking at lower order Fourier characters. This intuition has proven to be successful, since pseudo-calibration has been successfully exploited to construct SoS lower bounds for a wide variety of dense as well as sparse problems.

An advantage of pseudo-calibration is that this construction automatically satisfies some nice properties that the pseudoexpectation $\tilde{\EE}$ should satisfy. It's linear in $f$ by construction. For all polynomial equalities of the form $f(x) = 0$ that are satisfied in the planted distribution, it's true that $\tilde{\EE}[f(x)] = 0$. For other polynomial equalities of the form $f(x, v) = 0$ that are satisfied in the planted distribution, the equality $\tilde{\EE}[f(x, v)] = 0$ is approximately satisfied. In most cases, $\tilde{\EE}$ can be mildly adjusted to satisfy these exactly.

The condition $\tilde{\EE}[1] = 1$ is not automatically satisfied but in most applications, we usually require that $\tilde{\EE}[1] = 1 \pm \operatorname{o}(1)$. Indeed, this has been the case for all known successful applications of pseudo-calibration. Once we have this, we simply set our final pseudoexpectation operator to be $\tilde{\EE}'$ defined as $\tilde{\EE}'[f(x)] = \tilde{\EE}[f(x)] / \tilde{\EE}[1]$.

We remark that the condition $\tilde{\EE}[1] = 1 \pm \operatorname{o}(1)$ has been quite successful in predicting the right thresholds between approximability and inapproximability~\cite{hop17, hop18, kunisky19notes}.

\paragraph{Example: Planted Clique} 
As a warmup, we review the pseudo-calibration calculation for planted clique. Here, the random distribution $\nu$ is $G(n, \frac{1}{2})$.

The planted distribution $\mu$ is as follows. For a given integer $k$, first sample $G'$ from $G(n, \frac{1}{2})$, then choose a random subset $S$ of the vertices where each vertex is picked independently with probability $\frac{k}{n}$. For all pairs $i, j$ of distinct vertices in $S$, add the edge $(i, j)$ to the graph if not already present. Set $G$ to be the resulting graph.

The input is given by $G \in \{-1, 1\}^{\binom{[n]}{2}}$ where $G_{i, j}$ is $1$ if the edge $(i, j)$ is present and $-1$ otherwise. Let $x_1, \ldots, x_n$ be the boolean variables for our SoS program such that $x_i$ indicates if $i$ is in the clique.

\begin{definition}
Given a set of vertices $V \subseteq [n]$, define $x_V = \prod_{v \in V}{x_v}$.
\end{definition}
\begin{definition}
Given a set of possible edges $E \subseteq \binom{[n]}{2}$, define $\chi_E = (-1)^{|E \setminus E(G)|} = \prod_{(i, j) \in E}G_{i, j}$.
\end{definition}

Pseudo-calibration says that for all small $V$ and $E$,
\[
\mathbb{E}_{G \sim \nu}\left[\tilde{\EE}[x_V]\chi_E\right] = \mathbb{E}_{\mu}\left[x_V{\chi_E}\right]
\]
Using standard Fourier analysis, this implies that if we take 
\[
c_E = \mathbb{E}_{\mu}\left[x_V{\chi_E}\right] = \left(\frac{k}{n}\right)^{|V \cup V(E)|}
\]
where $V(E)$ is the set of the endpoints of the edges in $E$, then for all small $V$,
\[
\tilde{\EE}[x_V] = \sum_{E:E \text{ is small}}{{c_E}\chi_E} = \sum_{E:E \text{ is small}}{\left(\frac{k}{n}\right)^{|V \cup V(E)|}\chi_E}
\]

Since the values of $\tilde{\EE}[x_V]$ are known, by multi-linearity, this can be naturally extended to obtain values $\tilde{\EE}[f(x)]$ for any polynomial $f$ of degree at most the SoS degree.


\subsection{Pseudo-calibration}

\begin{definition}[Slack parameter]
	Define the slack parameter to be $\Delta = d^{-C_{\Delta}\varepsilon}$ for a constant $C_{\Delta} > 0$.
\end{definition}

We will pseudo-calibrate with respect to the following pair of random and planted distributions which we denote $\nu$ and $\mu$ respectively.

\SPCAdistributions*

We will again work with the Hermite basis of polynomials. For $a \in \mathbb{N}^{m \times d}$ and variables $v_{i, j}$ for $i \in [m], j \in [d]$, define $h_a(v) := \prod_{i \in [m], j \in [d]} h_{a_{i, j}}(v_{i, j})$.
For a nonnegative integer $t$, define $t!! = \frac{(t+1)!}{((t+1)/2)!\,2^{(t+1)/2}} = 1 \times 3 \times \ldots \times t$ if $t$ is odd and $0$ otherwise.

\begin{lemma}
	Let $I \in \mathbb{N}^d, a \in \mathbb{N}^{m \times d}$. For $i \in [m]$, let $e_i = \sum_{j \in [d]} a_{ij}$ and for $j \in [d]$, let $f_j = I_j + \sum_{i \in [m]} a_{ij}$. Let $c_1$ (resp. $c_2$) be the number of $i$ (resp. $j$) such that $e_i > 0$ (resp. $f_j > 0$). Then, if $e_i, f_j$ are all even, we have
	\[\mathbb{E}_{\mu}[u^I h_a(v)] = \left(\frac{1}{\sqrt{k}}\right)^{|I|} \left(\frac{k}{d}\right)^{c_2} \Delta^{c_1}\prod_{i \in [m]} (e_i - 1)!! \prod_{i, j} \frac{\sqrt{\lambda}^{a_{ij}}}{\sqrt{k}^{a_{ij}}}\]
	Else, $\mathbb{E}_{\mu}[u^I h_a(v)] = 0$.
\end{lemma}

\begin{proof}
	$v_1, \ldots, v_m \sim \mu$ can be written as $v_i = g_i + \sqrt{\lambda} b_i l_i u$ where $g_i \sim \mathcal{N}(0, I_d), l_i \sim \mathcal{N}(0, 1), b_i \in \{0, 1\}$ where $b_i = 1$ with probability $\Delta$.
	Let's analyze when the required expectation is nonzero. We can first condition on $b_i, l_i, u$ and use the fact that for a fixed $t$, $\mathbb{E}_{g \sim \mathcal{N}(0, 1)}[h_k(g + t)] = t^k$ to obtain
    \begin{align*}
	\mathbb{E}_{(u, l_i, b_i, g_i) \sim \mu}[u^I h_a(v)] &= \mathbb{E}_{(u, l_i, b_i) \sim \mu}[u^I\prod_{i, j}(\sqrt{\lambda}b_il_iu_j)^{a_{ij}}]\\
    &= \mathbb{E}_{(u, l_i, b_i) \sim \mu}[\prod_{i \in [m]} (b_il_i)^{e_i}\prod_{j \in [d]} u_j^{f_j}] \prod_{i, j} \sqrt{\lambda}^{a_{ij}}
    \end{align*}
	For this to be nonzero, the set of $c_1$ indices $i$ such that $e_i > 0$, should not have been resampled otherwise $b_i = 0$, each of which happens independently with probability $\Delta$. And the set of $c_2$ indices $j$ such that $f_j > 0$ should have been such that $u_j$ is nonzero, each of which happens independently with probability $\frac{k}{d}$. Since $l_i, u_j$ have zero expectation in $\mu$, we need $e_i, f_j$ to be even. The expectation then becomes
    {\footnotesize
    \begin{align*}
	\Delta^{c_1} \left(\frac{k}{d}\right)^{c_2}\mathbb{E}_{(u, l_i) \sim \mu}[\prod_{i \in [m]} l_i^{e_i}\prod_{j \in [d]} u_j^{f_j}] \prod_{i, j} \sqrt{\lambda}^{a_{ij}} = \left(\frac{1}{\sqrt{k}}\right)^{|I|} \left(\frac{k}{d}\right)^{c_2} \Delta^{c_1}\prod_{i \in [m]} (e_i - 1)!! \prod_{i, j} \frac{\sqrt{\lambda}^{a_{ij}}}{\sqrt{k}^{a_{ij}}}
    \end{align*}}
	The last equality follows because, for each $j$ such that $u_j$ is nonzero, we have $u_j^t = (\frac{1}{\sqrt{k}})^t$ and $\mathbb{E}_{g \sim \mathcal{N}(0, 1)}[g^t] = (t - 1)!!$ if $t$ is even.
\end{proof}

Define the degree of SoS to be $D_{sos} = d^{C_{sos}\varepsilon}$ for some constant $C_{sos} > 0$ that we choose later.
Define the truncation parameters to be $D_V = d^{C_V\varepsilon}, D_E = d^{C_E\varepsilon}$ for some constants $C_V, C_E > 0$. Regarding the choice of parameters, although we are working with a different problem, \cref{rmk: choice_of_params2} directly applies.

The underlying graphs for the graph matrices have the following structure:
There will be two types of vertices - $d$ type $1$ vertices corresponding to the dimensions of the space and $m$ type $2$ vertices corresponding to the different input vectors. The shapes will correspond to bipartite graphs with edges going between vertices of different types.
For the analysis of Sparse PCA, we will use the following notation.
\begin{itemize}
	\item For a shape $\alpha$ and type $t \in \{1, 2\}$, let $V_t(\alpha)$ denote the vertices of $V(\alpha)$ that are of type $t$. Let $|\alpha|_t = |V_t(\alpha)|$.
	\item For an index shape $U$ and a vertex $i$, define $deg^{U}(i)$ as follows: If $i \in V(U)$, then it is the power of the unique index shape piece $A \in U$ such that $i \in V(A)$. Otherwise, it is $0$.
	\item For an index shape $U$, define $deg(U) = \sum_{i \in V(U)} deg^U(i)$. This is also the degree of the monomial $p_U$.
	\item For a shape $\alpha$ and vertex $i$ in $\alpha$, let $deg^{\alpha}(i) = \sum_{i \in e \in E(\alpha)} l_e$.
	\item For any shape $\alpha$, let $deg(\alpha) = deg(U_{\alpha}) + deg(V_{\alpha})$.
	\item For an index shape $U \in {\mathcal I}_{mid}$ and type $t \in \{1, 2\}$, let $U_t \in U$ denote the index shape piece of type $t$ in $U$ if it exists, otherwise define $U_t$ to be $\emptyset$. Note that this is well defined since for each type $t$, there is at most one index shape piece of type $t$ in $U$ since $U \in {\mathcal I}_{mid}$. Also, denote by $|U|_t$ the length of the tuple $U_t$.
\end{itemize}

We will now describe the decomposition of the moment matrix $\Lambda$.

\begin{definition}\label{def: spca_coeffs}
	If a shape $\alpha$ satisfies the following properties:
	\begin{itemize}
		\item Both $U_{\alpha}$ and $V_{\alpha}$ only contain index shape pieces of type $1$,
		\item $deg^{\alpha}(i) + deg^{U_{\alpha}}(i) + deg^{V_{\alpha}}(i)$ is even for all $i \in V(\alpha)$,
		\item $\alpha$ is proper,
		\item $\alpha$ satisfies the truncation parameters $D_{sos}, D_V, D_E$.
	\end{itemize}
	then define \[\lambda_{\alpha} = \left(\frac{1}{\sqrt{k}}\right)^{deg(\alpha)}\left(\frac{k}{d}\right)^{|\alpha|_1}\Delta^{|\alpha|_2} \prod_{j \in V_2(\alpha)} (deg^{\alpha}(j) - 1)!!\prod_{e \in E(\alpha)} \frac{\sqrt{\lambda}^{l_e}}{\sqrt{k}^{l_e}}\]
	Otherwise, define $\lambda_{\alpha} = 0$.
\end{definition}

\begin{corollary}
	$\Lambda = \sum \lambda_{\alpha}M_{\alpha}$.
\end{corollary}

\subsection{Qualitative machinery bounds}

In this section, we will prove the main PSD mass condition and obtain qualitative bounds of the other two conditions, which we will reuse in the full verification.
As in prior sections, we will state the bounds first, introduce notation and then prove them all in a unified manner.

\begin{restatable}[PSD mass]{lemma}{SPCAone}\label{lem: spca_cond1}
	For all $U \in {\mathcal I}_{mid}$, $H_{Id_U} \succeq 0$
\end{restatable}

We define the following quantities to capture the contribution of the vertices within $\tau, \gamma$ to the Fourier coefficients.

\begin{restatable}{definition}{SPCAstau}\label{def: spca_stau}
	For $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, if $deg^{\tau}(i)$ is even for all vertices $i \in V(\tau) \setminus U_{\tau} \setminus V_{\tau}$, define
	\[S(\tau) =
	\left(\frac{k}{d}\right)^{|\tau|_1 - |U_{\tau}|_1}\Delta^{|\tau|_2 - |U_{\tau}|_2} \prod_{j \in V_2(\tau) \setminus U_{\tau} \setminus V_{\tau}} (deg^{\tau}(j) - 1)!!\prod_{e \in E(\tau)} \frac{\sqrt{\lambda}^{l_e}}{\sqrt{k}^{l_e}}\]
	Otherwise, define $S(\tau) = 0$. 	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and $\gamma \in \Gamma_{U, V}$, if $deg^{\gamma}(i)$ is even for all vertices $i$ in $V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}$, define
	\[S(\gamma) =
	\left(\frac{k}{d}\right)^{|\gamma|_1 - \frac{|U_{\gamma}|_1 + |V_{\gamma}|_1}{2}}\Delta^{|\gamma|_2 - \frac{|U_{\gamma}|_2 + |V_{\gamma}|_2}{2}} \prod_{j \in V_2(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}} (deg^{\gamma}(j) - 1)!!\prod_{e \in E(\gamma)} \frac{\sqrt{\lambda}^{l_e}}{\sqrt{k}^{l_e}}\]
	Otherwise, define $S(\gamma) = 0$.
\end{restatable}

For getting the best bounds, it will be convenient to discretize the Normal distribution. The following fact follows from standard results on Gaussian quadrature, see e.g.\ \cite[Lemma 4.3]{diakonikolas2017statistical}.

\begin{fact}[Discretizing the Normal distribution]\label{fact: quadrature}
	There is an absolute constant $C_{disc}$ such that, for any positive integer $D$, there exists a distribution ${\mathcal E}$ over the real numbers supported on $D$ points $p_1, \ldots, p_D$, such that $|p_i| \le C_{disc} \sqrt{D}$ for all $i \le D$ and
    $\mathbb{E}_{g \sim {\mathcal E}}[g^t] = \mathbb{E}_{g \sim \mathcal{N}(0, 1)}[g^t]$ for all $t = 0, 1, \ldots, 2D - 1$.
\end{fact}

\begin{definition} For any shape $\tau$, suppose $U' = (U_{\tau})_2, V' = (V_{\tau})_2$ are the type $2$ vertices in $U_{\tau}, V_{\tau}$ respectively. Define
$R(\tau) = (C_{disc}\sqrt{D_E})^{\sum_{j \in U' \cup V'} deg^{\tau}(j)}$.
\end{definition}

We can now state our qualitative bounds.

\begin{restatable}[Qualitative middle shape bounds]{lemma}{SPCAtwosimplified}\label{lem: spca_cond2_simplified}
	For all $U \in{\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$,
	\[
	\begin{bmatrix}
	\frac{S(\tau)R(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
	H_{\tau}^T & \frac{S(\tau)R(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	\succeq 0\]
\end{restatable}



We again use the canonical definition of $H_{\gamma}'$ from \cref{sec: hgamma_qual}.

\begin{restatable}[Qualitative intersection term bounds]{lemma}{SPCAthreesimplified}\label{lem: spca_cond3_simplified}
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U, V}$,
	\[\frac{|Aut(V)|}{|Aut(U)|}\cdot\frac{1}{S(\gamma)^2R(\gamma)^2}H_{Id_V}^{-\gamma, \gamma} \preceq H_{\gamma}'\]
\end{restatable}

\subsubsection{Proof of the PSD mass condition}

Most of the notation and analysis here are similar to the case of Tensor PCA; we just need to appropriately modify them since there are two types of vertices in the Sparse PCA application.
When we compose shapes $\sigma, \sigma'$, from \cref{def: spca_coeffs}, in order for $\lambda_{\sigma\circ \sigma'}$ to be nonzero, observe that all vertices $i$ in $\lambda_{\sigma \circ \sigma'}$ should have $deg^{\sigma \circ \sigma'}(i) + deg^{U_{\sigma \circ \sigma'}}(i) + deg^{V_{\sigma \circ \sigma'}}(i)$ to be even. To capture this notion conveniently, we again use the notion of parity vectors.

\begin{definition}
	Define a parity vector $\rho$ to be a vector whose entries are in $\{0, 1\}$.
	For $U\in {\mathcal I}_{mid}$, define ${\mathcal P}_U$ to be the set of parity vectors $\rho$ whose coordinates are indexed by $U_1$ followed by $U_2$.
\end{definition}

\begin{definition}
	For a left shape $\sigma$, define $\rho_{\sigma} \in {\mathcal P}_{V_{\sigma}}$, called the parity vector of $\sigma$, to be the parity vector such that for each vertex $i \in V_{\sigma}$, the $i$-th entry of $\rho_{\sigma}$ is the parity of $deg^{U_{\sigma}}(i) + deg^{\sigma}(i)$, that is, $(\rho_{\sigma})_i \equiv deg^{U_{\sigma}}(i) + deg^{\sigma}(i) \pmod 2$.
	For $U \in {\mathcal I}_{mid}$ and $\rho \in {\mathcal P}_U$, let ${\mathcal L}_{U, \rho}$ be the set of all left shapes $\sigma \in {\mathcal L}_U$ such that $\rho_{\sigma} = \rho$, that is, the set of all left shapes with parity vector $\rho$.
\end{definition}


For a shape $\tau$, for a $\tau$ coefficient matrix $H_{\tau}$ and parity vectors $\rho \in {\mathcal P}_{U_{\tau}}, \rho' \in {\mathcal P}_{V_{\tau}}$, define the $\tau$-coefficient matrix $H_{\tau, \rho, \rho'}$ as $H_{\tau ,\rho, \rho'}(\sigma, \sigma') = H_{\tau}(\sigma, \sigma')$ if $\sigma \in {\mathcal L}_{U_{\tau}, \rho}, \sigma' \in {\mathcal L}_{V_{\tau}, \rho'}$ and $0$ otherwise. This immediately implies the following proposition.

\begin{propn}
	For any shape $\tau$ and $\tau$-coefficient matrix $H_{\tau}$, we have the equality $H_{\tau} = \sum_{\rho \in {\mathcal P}_{U_{\tau}}, \rho' \in {\mathcal P}_{V_{\tau}}} H_{\tau, \rho, \rho'}$
\end{propn}

\begin{propn}
	For any $U \in {\mathcal I}_{mid}$, $H_{Id_U} = \sum_{\rho \in {\mathcal P}_U} H_{Id_U, \rho, \rho}$
\end{propn}

\begin{proof}
	For any $\sigma, \sigma' \in {\mathcal L}_U$, using \cref{def: spca_coeffs}, note that in order for $H_{Id_U}(\sigma, \sigma')$ to be nonzero, we must have $\rho_{\sigma} = \rho_{\sigma'}$.
\end{proof}

We now discretize the normal distribution while matching the first $2D_E - 1$ moments.

\begin{definition}\label{def: discretized_gaussian}
	Let ${\mathcal D}$ be a distribution over the real numbers obtained by setting $D = D_E$ in \cref{fact: quadrature}. So, in particular, for any $x$ sampled from ${\mathcal D}$, we have $|x| \le C_{disc}\sqrt{D_E}$ and, for every even $t \le 2D_E - 1$, $\mathbb{E}_{x \sim {\mathcal D}}[x^t] = (t - 1)!!$, while $\mathbb{E}_{x \sim {\mathcal D}}[x^t] = 0$ for every odd $t \le 2D_E - 1$.
\end{definition}

We define the following quantities to capture the contribution of the vertices within $\sigma$ to the Fourier coefficients.

\begin{definition}
	For a shape $\sigma\in {\mathcal L}$, if $deg^{\sigma}(i) + deg^{U_{\sigma}}(i)$ is even for all vertices $i \in V(\sigma) \setminus V_{\sigma}$, define
	\[T(\sigma) = \left(\frac{1}{\sqrt{k}}\right)^{deg(U_{\sigma})}\left(\frac{k}{d}\right)^{|\sigma|_1 - \frac{|V_{\sigma}|_1}{2}}\Delta^{|\sigma|_2 - \frac{|V_{\sigma}|_2}{2}} \prod_{j \in V_2(\sigma) \setminus V_{\sigma}} (deg^{\sigma}(j) - 1)!!\prod_{e \in E(\sigma)} \frac{\sqrt{\lambda}^{l_e}}{\sqrt{k}^{l_e}}\]
	Otherwise, define $T(\sigma) = 0$.
\end{definition}

\begin{definition}
	Let $U \in {\mathcal I}_{mid}$. Let $x_i$ for $i \in U_2$ be variables. Denote them collectively as $x_{U_2}$. For $\rho \in {\mathcal P}_U$, define $v_{\rho, x_{U_2}}$ to be the vector indexed by left shapes $\sigma \in {\mathcal L}$ such that the $\sigma$th entry is $T(\sigma) \prod_{i \in {U_2}} x_i^{deg^{\sigma}(i)}$ if $\sigma \in {\mathcal L}_{U, \rho}$ and $0$ otherwise.
\end{definition}

The following proposition is obvious and immediately implies the PSD mass condition.

\begin{propn}
	For any $U\in {\mathcal I}_{mid}, \rho \in {\mathcal P}_U$, suppose $x_i$ for $i \in U_2$ are random variables sampled from ${\mathcal D}$. Then,
	$H_{Id_U, \rho, \rho} = \frac{1}{|Aut(U)|}\mathbb{E}_{x}[v_{\rho, x_{U_2}}v_{\rho, x_{U_2}}^T]$.
\end{propn}

\begin{proof}
	Observe that for $\sigma, \sigma' \in {\mathcal L}_{U, \rho}$ and $t \in \{1, 2\}$, $(|\sigma|_t - \frac{|V_{\sigma}|_t}{2}) + (|\sigma'|_t - \frac{|V_{\sigma'}|_t}{2}) = |\sigma \circ \sigma'|_t$. The result follows by verifying the conditions of \cref{def: spca_coeffs} and using \cref{def: discretized_gaussian}.
\end{proof}


\begin{proof}[Proof of the PSD mass condition \cref{lem: spca_cond1}]
	We have $H_{Id_U} = \sum_{\rho \in {\mathcal P}_U} H_{Id_U, \rho, \rho} \succeq 0$ because of the above proposition.
\end{proof}

\subsubsection{Qualitative middle shape bounds}

The next proposition captures the fact that when we compose shapes $\sigma, \tau, \sigma'^T$, in order for $\lambda_{\sigma \circ \tau \circ \sigma'^T}$ to be nonzero, the parities of the degrees of the merged vertices should add up correspondingly.

\begin{propn}\label{propn: spca_coeff_2}
	For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, there exist two sets of parity vectors $P_{\tau}, Q_{\tau} \subseteq {\mathcal P}_{U}$ and a bijection $\pi : P_{\tau} \rightarrow Q_{\tau}$ such that $H_{\tau} = \sum_{\rho \in P_{\tau}} H_{\tau, \rho, \pi(\rho)}$.
\end{propn}

\begin{proof}
	Using \cref{def: spca_coeffs}, in order for $H_{\tau}(\sigma, \sigma')$ to be nonzero, we must have that, in $\sigma \circ \tau \circ \sigma'^T$, for all $i \in U_{\tau} \cup V_{\tau}$, $deg^{U_{\sigma}}(i) + deg^{U_{\sigma'}}(i) + deg^{\sigma \circ \tau \circ \sigma'^T}(i)$ must be even. In other words, for any $\rho \in {\mathcal P}_U$, there is at most one $\rho' \in {\mathcal P}_U$ such that if we take $\sigma \in {\mathcal L}_{U, \rho}, \sigma' \in {\mathcal L}_U$ with $H_{\tau}(\sigma, \sigma')$ nonzero, then the parity of $\sigma'$ is $\rho'$. Also, observe that $\rho'$ determines $\rho$. We then take $P_{\tau}$ to be the set of $\rho$ such that $\rho'$ exists, $Q_{\tau}$ to be the set of $\rho'$ and in this case, we define $\pi(\rho) = \rho'$.
\end{proof}



\begin{propn}
	For any $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, suppose we take $\rho \in P_{\tau}$.  Let $\pi$ be the bijection from \cref{propn: spca_coeff_2} so that $\pi(\rho) \in Q_{\tau}$. Let $U' = (U_{\tau})_2, V' = (V_{\tau})_2$ be the type $2$ vertices in $U_{\tau}, V_{\tau}$ respectively. Let $x_i$ for $i \in U' \cup V'$ be random variables independently sampled from ${\mathcal D}$. Define $x_{U'}$ (resp. $x_{V'}$) to be the subset of variables $x_i$ for $i \in U'$ (resp. $i \in V'$). Then,
	\[H_{\tau, \rho, \pi(\rho)} = \frac{1}{|Aut(U)|^2} S(\tau) \mathbb{E}_x\left[v_{\rho, x_{U'}}\left(\prod_{i \in U' \cup V'} x_i^{deg^{\tau}(i)}\right)v_{\pi(\rho), x_{V'}}^T\right]\]
\end{propn}

\begin{proof}
	For $\sigma \in {\mathcal L}_{U, \rho}, \sigma' \in {\mathcal L}_{U, \pi(\rho)}$ and $t \in \{1, 2\}$, we have $(|\tau|_t - |U_{\tau}|_t) + (|\sigma|_t - \frac{|V_{\sigma}|_t}{2}) + (|\sigma'|_t - \frac{|V_{\sigma'}|_t}{2}) = |\sigma \circ \tau\circ \sigma'^T|_t$.
	The result then follows by a straightforward verification of the conditions of \cref{def: spca_coeffs} using \cref{def: discretized_gaussian}.
\end{proof}


We are ready to show the qualitative middle shape bounds.

\begin{proof}[Proof of the qualitative middle shape bounds \cref{lem: spca_cond2_simplified}]
	Let $P_{\tau}, Q_{\tau}, \pi$ be from \cref{propn: spca_coeff_2}. Let $U' = (U_{\tau})_2, V' = (V_{\tau})_2$ be the type $2$ vertices in $U_{\tau}, V_{\tau}$ respectively. Let $x_i$ for $i \in U' \cup V'$ be random variables independently sampled from ${\mathcal D}$. Define $x_{U'}$ (resp. $x_{V'}$) to be the subset of variables $x_i$ for $i \in U'$ (resp. $i \in V'$).

	For $\rho \in {\mathcal P}_U$, define $W_{\rho, \rho} = \mathbb{E}_{y_{U_2} \sim {\mathcal D}^{U_2}}[v_{\rho, y_{U_2}}v_{\rho, y_{U_2}}^T]$ so that $H_{Id_U, \rho, \rho} = \frac{1}{|Aut(U)|} W_{\rho, \rho}$. Observe that $W_{\rho, \rho} = \mathbb{E}[v_{\rho, x_{U'}}v_{\rho, x_{U'}}^T] = \mathbb{E}[v_{\rho, x_{V'}}v_{\rho, x_{V'}}^T]$ because $x_{U'}$ and $x_{V'}$ are also sets of variables sampled from ${\mathcal D}$ and, $U'$, $V'$ have the same size as $U_2$ because $U_{\tau} = V_{\tau} = U$.

	For $\rho, \rho' \in {\mathcal P}_U$, define $Y_{\rho, \rho'} = \mathbb{E}\left[v_{\rho, x_{U'}}\left(\prod_{i \in U' \cup V'} x_i^{deg^{\tau}(i)}\right)v_{\rho', x_{V'}}^T\right]$. Then, $H_{\tau} = \sum_{\rho \in P_{\tau}} H_{\tau, \rho, \pi(\rho)} = \frac{1}{|Aut(U)|^2}S(\tau)\sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}$. We have

	\begin{align*}
	\begin{bmatrix}
	\frac{S(\tau)R(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
	H_{\tau}^T & \frac{S(\tau)R(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	&= \frac{S(\tau)}{|Aut(U)|^2}
	\begin{bmatrix}
	R(\tau)\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}\\
	\sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}^T & R(\tau)\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho}
	\end{bmatrix}
	\end{align*}
	Since $\frac{S(\tau)}{|Aut(U)|^2} \ge 0$, it suffices to prove that $\begin{bmatrix}
	R(\tau)\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}\\
	\sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}^T & R(\tau)\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho}
	\end{bmatrix}\succeq 0$. Consider
	\begin{align*}
		\begin{bmatrix}
			R(\tau)\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}\\
			\sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}^T & R(\tau)\sum_{\rho \in {\mathcal P}_U} W_{\rho, \rho}
		\end{bmatrix} =& R(\tau)\begin{bmatrix}
		\sum_{\rho \in {\mathcal P}_U \setminus P_{\tau}} W_{\rho, \rho} & 0\\
		0 & \sum_{\rho \in {\mathcal P}_U \setminus Q_{\tau}} W_{\rho, \rho}
		\end{bmatrix}\\
		& + \begin{bmatrix}
		R(\tau)\sum_{\rho \in P_{\tau}} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}\\
		\sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}^T & R(\tau)\sum_{\rho \in P_{\tau}} W_{\pi(\rho), \pi(\rho)}
		\end{bmatrix}\\
	\end{align*}

	We have $\sum_{\rho \in {\mathcal P}_U \setminus P_{\tau}} W_{\rho, \rho} = \sum_{\rho \in {\mathcal P}_U \setminus P_{\tau}} \mathbb{E}[v_{\rho, x_{U'}}v_{\rho, x_{U'}}^T] \succeq 0$. Similarly, $\sum_{\rho \in {\mathcal P}_U \setminus Q_{\tau}} W_{\rho, \rho} \succeq 0$. Also, $R(\tau) \ge 0$ and therefore, we have that the first term in the above expression,
	$R(\tau)\begin{bmatrix}
	\sum_{\rho \in {\mathcal P}_U \setminus P_{\tau}} W_{\rho, \rho} & 0\\
	0 & \sum_{\rho \in {\mathcal P}_U \setminus Q_{\tau}} W_{\rho, \rho}
	\end{bmatrix}$, is positive semidefinite. For the second term,
{\footnotesize
	\begin{align*}
	&\begin{bmatrix}
	R(\tau)\sum_{\rho \in P_{\tau}} W_{\rho, \rho} & \sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}\\
	\sum_{\rho \in P_{\tau}} Y_{\rho, \pi(\rho)}^T & R(\tau)\sum_{\rho \in P_{\tau}} W_{\pi(\rho), \pi(\rho)}
	\end{bmatrix}\\
	&\qquad= \sum_{\rho \in P_{\tau}}
	\begin{bmatrix}
	R(\tau)\mathbb{E}[v_{\rho, x_{U'}}v_{\rho, x_{U'}}^T] & \mathbb{E}\left[v_{\rho, x_{U'}}\left(\prod_{i \in U' \cup V'} x_i^{deg^{\tau}(i)}\right)v_{\pi(\rho), x_{V'}}^T\right]\\
	\mathbb{E}\left[v_{\pi(\rho), x_{V'}}\left(\prod_{i \in U' \cup V'} x_i^{deg^{\tau}(i)}\right)v_{\rho, x_{U'}}^T\right] & R(\tau)\mathbb{E}[v_{\pi(\rho), x_{V'}}v_{\pi(\rho), x_{V'}}^T]
	\end{bmatrix}\\
	&\qquad= \sum_{\rho \in P_{\tau}}\mathbb{E}
	\begin{bmatrix}
	R(\tau)v_{\rho, x_{U'}}v_{\rho, x_{U'}}^T & v_{\rho, x_{U'}}\left(\prod_{i \in U' \cup V'} x_i^{deg^{\tau}(i)}\right)v_{\pi(\rho), x_{V'}}^T\\
	v_{\pi(\rho), x_{V'}}\left(\prod_{i \in U' \cup V'} x_i^{deg^{\tau}(i)}\right)v_{\rho, x_{U'}}^T & R(\tau)v_{\pi(\rho), x_{V'}}v_{\pi(\rho), x_{V'}}^T
	\end{bmatrix}
	\end{align*}}

We will prove that the term inside the expectation is positive semidefinite for each $\rho \in P_{\tau}$ and each sampling of the $x_i$ from ${\mathcal D}$, which will complete the proof. Fix $\rho \in P_{\tau}$ and any sampling of the $x_i$ from ${\mathcal D}$. Let $w_1 = v_{\rho, x_{U'}}, w_2 = v_{\pi(\rho), x_{V'}}$. Let $E = \prod_{i \in U' \cup V'} x_i^{deg^{\tau}(i)}$. We would like to prove that $\begin{bmatrix}
	R(\tau)w_1w_1^T & Ew_1w_2^T\\
	Ew_2w_1^T & R(\tau)w_2w_2^T
\end{bmatrix} \succeq 0$. For all $y$ sampled from ${\mathcal D}$, $|y| \le C_{disc}\sqrt{D_E}$ and so, $|E| \le (C_{disc}\sqrt{D_E})^{\sum_{j \in U' \cup V'} deg^{\tau}(j)} = R(\tau)$.

If $E \ge 0$, then
{\footnotesize
\begin{align*}
	\begin{bmatrix}
		R(\tau)w_1w_1^T & Ew_1w_2^T\\
		Ew_2w_1^T & R(\tau)w_2w_2^T
	\end{bmatrix} &= (R(\tau) - E)
	\begin{bmatrix}
		w_1w_1^T & 0\\
		0 & w_2w_2^T
	\end{bmatrix}
	+ E\begin{bmatrix}
		w_1w_1^T & w_1w_2^T\\
		w_2w_1^T & w_2w_2^T
	\end{bmatrix}\\
	&= (R(\tau) - E)\left(
	\begin{bmatrix}
	w_1\\
	0
	\end{bmatrix}
	\begin{bmatrix}
	w_1^T & 0
	\end{bmatrix} +
	\begin{bmatrix}
		0\\
		w_2
	\end{bmatrix}
	\begin{bmatrix}
		0 & w_2^T
	\end{bmatrix}\right) +
	E\begin{bmatrix}
	w_1\\
	w_2
	\end{bmatrix}
	\begin{bmatrix}
		w_1^T & w_2^T
	\end{bmatrix}\\
& \succeq 0
\end{align*}}
since $R(\tau) - E \ge 0$. And if $E < 0$,
{\footnotesize
\begin{align*}
	\begin{bmatrix}
		R(\tau)w_1w_1^T & Ew_1w_2^T\\
		Ew_2w_1^T & R(\tau)w_2w_2^T
	\end{bmatrix} &= (R(\tau) + E)
	\begin{bmatrix}
		w_1w_1^T & 0\\
		0 & w_2w_2^T
	\end{bmatrix}
	- E\begin{bmatrix}
		w_1w_1^T & -w_1w_2^T\\
		-w_2w_1^T & w_2w_2^T
	\end{bmatrix}\\
	&= (R(\tau) + E)\left(
	\begin{bmatrix}
		w_1\\
		0
	\end{bmatrix}
	\begin{bmatrix}
		w_1^T & 0
	\end{bmatrix} +
	\begin{bmatrix}
		0\\
		w_2
	\end{bmatrix}
	\begin{bmatrix}
		0 & w_2^T
	\end{bmatrix}\right)
	- E\begin{bmatrix}
		w_1\\
		-w_2
	\end{bmatrix}
	\begin{bmatrix}
		w_1^T & -w_2^T
	\end{bmatrix}\\
	& \succeq 0
\end{align*}}
since $R(\tau) + E \ge 0$.
\end{proof}

\subsubsection{Qualitative intersection term bounds}

Just as in \cref{propn: spca_coeff_2}, the next proposition captures the fact that when we compose shapes $\sigma, \gamma, \gamma^T, \sigma'^T$, in order for $\lambda_{\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T}$ to be nonzero, the parities of the degrees of the merged vertices should add up correspondingly.
Just as in the tensor PCA application, we similarly define $H_{Id_V, \rho, \rho'}^{-\gamma, \gamma}$ and $H'_{\gamma, \rho, \rho}$. The following propositions are simple and proved the same way.


\begin{propn}
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, for all $\gamma \in \Gamma_{U, V}$, there exists a set of parity vectors $P_{\gamma} \subseteq {\mathcal P}_U$ such that
	$H_{Id_V}^{-\gamma, \gamma} = \sum_{\rho \in P_{\gamma}} H_{Id_V, \rho, \rho}^{-\gamma, \gamma}$.
\end{propn}



\begin{propn}
	For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, for $\gamma \in \Gamma_{U, V}$, $H_{\gamma}' = \sum_{\rho \in P_{\gamma}} H_{\gamma, \rho, \rho}'$.
\end{propn}

We will now define vectors which are truncations of $v_{\rho, x_{U_2}}$. This definition and the following proposition are mostly a matter of technicality and they are essentially similar to the PSD mass condition analysis.

\begin{definition}
    Let $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, and let $\gamma \in \Gamma_{U, V}$. Let $x_i$ for $i \in U_2$ be variables. Denote them collectively as $x_{U_2}$. For $\rho \in {\mathcal P}_U$, define $v_{\rho, x_{U_2}}^{-\gamma}$ to be the vector indexed by left shapes $\sigma \in {\mathcal L}$ such that the $\sigma$th entry is $v_{\rho, x_{U_2}}(\sigma)$ if $|V(\sigma \circ \gamma)| \le D_V$ and $0$ otherwise.
\end{definition}



With this, we can decompose each slice $H_{Id_V, \rho, \rho}^{-\gamma, \gamma}$.

\begin{propn}\label{lem: spca_decomp}
	For any $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, and for any $\gamma \in \Gamma_{U, V}$, suppose we take $\rho \in P_{\gamma}$. When we compose $\gamma$ with $\gamma^T$ to get $\gamma \circ \gamma^T$, let $U' = (U_{\gamma \circ \gamma^T})_2, V' = (V_{\gamma \circ \gamma^T})_2$ be the type $2$ vertices in $U_{\gamma \circ \gamma^T}, V_{\gamma \circ \gamma^T}$ respectively. And let $W'$ be the set of type $2$ vertices in $\gamma \circ \gamma^T$ that were identified in the composition when we set $V_{\gamma} = U_{\gamma^T}$. Let $x_i$ for $i \in U' \cup W' \cup V'$ be random variables independently sampled from ${\mathcal D}$. Define $x_{U'}$ (resp. $x_{V'}, x_{W'}$) to be the subset of variables $x_i$ for $i \in U'$ (resp. $i \in V', i \in W'$). Then,
	\[H_{Id_V, \rho, \rho}^{-\gamma, \gamma} = \frac{1}{|Aut(V)|}S(\gamma)^2 \mathbb{E}_x\left[(v_{\rho, x_{U'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{V'}}^{-\gamma})^T\right]\]
\end{propn}

\begin{proof}
	Fix $\sigma, \sigma' \in {\mathcal L}_{U, \rho}$ such that $|V(\sigma \circ \gamma)|, |V(\sigma' \circ \gamma)| \le D_V$. Note that for $t \in \{1, 2\}$, $|\sigma|_t - \frac{|V_{\sigma}|_t}{2} + |\sigma'|_t - \frac{|V_{\sigma'}|_t}{2} + 2(|\gamma|_t - \frac{|U_{\gamma}|_t + |V_{\gamma}|_t}{2}) = |\sigma \circ \gamma \circ \gamma^T \circ \sigma'^T|_t$. We can easily verify the equality using \cref{def: spca_coeffs} and \cref{def: discretized_gaussian}.
\end{proof}

\begin{propn}
	For any $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$, and for any $\gamma \in \Gamma_{U, V}$, suppose we take $\rho \in {\mathcal P}_U$. Then,
	\[H'_{\gamma, \rho, \rho} = \frac{1}{|Aut(U)|}\mathbb{E}_{y_{U_2}\sim {\mathcal D}^{U_2}}\left[(v_{\rho, y_{U_2}}^{-\gamma})(v_{\rho, y_{U_2}}^{-\gamma})^T\right]\]
\end{propn}



We can finally show the qualitative intersection term bounds.

\begin{proof}[Proof of the qualitative intersection term bounds \cref{lem: spca_cond3_simplified}]
    Let $U', V', W'$ be defined as in \cref{lem: spca_decomp}. We have
    {\footnotesize
	\begin{align*}
	\frac{|Aut(V)|}{|Aut(U)|}\cdot\frac{1}{S(\gamma)^2R(\gamma)^2}H_{Id_V}^{-\gamma, \gamma} &= \sum_{\rho \in P_{\gamma}} \frac{|Aut(V)|}{|Aut(U)|}\cdot\frac{1}{S(\gamma)^2R(\gamma)^2} H_{Id_V, \rho, \rho}^{-\gamma, \gamma}\\
	&= \sum_{\rho \in P_{\gamma}} \frac{1}{|Aut(U)|}\cdot\frac{1}{R(\gamma)^2} \mathbb{E}_x\left[(v_{\rho, x_{U'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{V'}}^{-\gamma})^T\right]
\end{align*}}

We will now prove that, for all $\rho \in P_{\gamma}$,
\begin{align*}
    \frac{1}{|Aut(U)|}\cdot \frac{1}{R(\gamma)^2} \mathbb{E}_x\left[(v_{\rho, x_{U'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{V'}}^{-\gamma})^T\right] \preceq H'_{\gamma, \rho, \rho}
\end{align*}
which reduces to proving that
{\footnotesize
\begin{align*}
    \frac{2}{R(\gamma)^2} \mathbb{E}_x\left[(v_{\rho, x_{U'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{V'}}^{-\gamma})^T\right] &\preceq 2\mathbb{E}_{y_{U_2}\sim {\mathcal D}^{U_2}}\left[(v_{\rho, y_{U_2}}^{-\gamma})(v_{\rho, y_{U_2}}^{-\gamma})^T\right]\\
    &= \mathbb{E}_{x}\left[(v_{\rho, x_{U'}}^{-\gamma})(v_{\rho, x_{U'}}^{-\gamma})^T + (v_{\rho, x_{V'}}^{-\gamma})(v_{\rho, x_{V'}}^{-\gamma})^T\right]
\end{align*}}
where the last equality follows from linearity of expectation and the fact that $U' \equiv V' \equiv U_2$.

Since $H_{Id_V, \rho, \rho}^{-\gamma, \gamma}$ is symmetric, we have
{\footnotesize
\[\mathbb{E}_x\left[(v_{\rho, x_{U'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{V'}}^{-\gamma})^T\right] = \mathbb{E}_x\left[(v_{\rho, x_{V'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{U'}}^{-\gamma})^T\right]\]}
So, it suffices to prove
{\footnotesize
\begin{align*}
    \frac{1}{R(\gamma)^2}&\mathbb{E}_x\left[(v_{\rho, x_{U'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{V'}}^{-\gamma})^T + (v_{\rho, x_{V'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{U'}}^{-\gamma})^T\right]\\
    &\preceq \mathbb{E}_{x}\left[(v_{\rho, x_{U'}}^{-\gamma})(v_{\rho, x_{U'}}^{-\gamma})^T + (v_{\rho, x_{V'}}^{-\gamma})(v_{\rho, x_{V'}}^{-\gamma})^T\right]
\end{align*}}

We will prove that for every sampling of the $x_i$ from ${\mathcal D}$, we have
{\footnotesize
\begin{align*}
    \frac{1}{R(\gamma)^2}&\left((v_{\rho, x_{U'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{V'}}^{-\gamma})^T + (v_{\rho, x_{V'}}^{-\gamma})\left(\prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}\right)(v_{\rho, x_{U'}}^{-\gamma})^T\right) \\
    &\preceq (v_{\rho, x_{U'}}^{-\gamma})(v_{\rho, x_{U'}}^{-\gamma})^T + (v_{\rho, x_{V'}}^{-\gamma})(v_{\rho, x_{V'}}^{-\gamma})^T
\end{align*}}
Then, taking expectations will give the result. Indeed, fix a sampling of the $x_i$ from ${\mathcal D}$. Let $E = \prod_{i \in U' \cup W' \cup V'} x_i^{deg^{\gamma \circ \gamma^T}(i)}$ and let $w_1 = v_{\rho, x_{U'}}^{-\gamma}, w_2 = v_{\rho, x_{V'}}^{-\gamma}$. Then, the inequality we need to show is
\[\frac{E}{R(\gamma)^2}(w_1w_2^T + w_2w_1^T) \preceq w_1w_1^T + w_2w_2^T\]
Now, since $|x_i| \le C_{disc}\sqrt{D_E}$ for all $i$, we have $|E| \le \prod_{i \in U' \cup W' \cup V'} (C_{disc}\sqrt{D_E})^{deg^{\gamma \circ \gamma^T}(i)} =  R(\gamma)^2$.
If $E \ge 0$, using $\frac{E}{R(\gamma)^2}(w_1 - w_2)(w_1 - w_2)^T \succeq 0$ gives
\begin{align*}
    \frac{E}{R(\gamma)^2} (w_1w_2^T + w_2w_1^T) &\preceq \frac{E}{R(\gamma)^2} (w_1w_1^T + w_2w_2^T)
    \preceq w_1w_1^T + w_2w_2^T
\end{align*}
since $0 \le E \le R(\gamma)^2$.
And if $E < 0$, using $\frac{-E}{R(\gamma)^2}(w_1 + w_2)(w_1 + w_2)^T \succeq 0$ gives
\begin{align*}
    \frac{E}{R(\gamma)^2} (w_1w_2^T + w_2w_1^T) &\preceq \frac{-E}{R(\gamma)^2} (w_1w_1^T + w_2w_2^T)
    \preceq w_1w_1^T + w_2w_2^T
\end{align*}
since $0 \le -E \le R(\gamma)^2$.
Finally, we use the fact that for all $\rho \in {\mathcal P}_U$, we have $H'_{\gamma,\rho, \rho} \succeq 0$ which can be proved the same way as the proof of \cref{lem: spca_cond1}. Therefore,
\begin{align*}
	\frac{|Aut(V)|}{|Aut(U)|}\cdot\frac{1}{S(\gamma)^2R(\gamma)^2}H_{Id_V}^{-\gamma, \gamma} &\preceq \sum_{\rho \in P_{\gamma}} H'_{\gamma, \rho, \rho}
	\preceq \sum_{\rho \in {\mathcal P}_U} H'_{\gamma, \rho, \rho}
	= H'_{\gamma}
	\end{align*}
\end{proof}

\subsection{Intuition for quantitative bounds}

In this section, we will give some intuition on the bounds needed for our main theorem \cref{thm: spca_main}, which is formally proved in \cref{sec: spca_quant}. Informally, the theorem states that when $m \le \frac{d}{\lambda^2}$ and $m \le \frac{k^2}{\lambda^2}$, then $\Lambda \succeq 0$ with high probability.

We will try and understand why the inequality $\lambda_{\sigma \circ \tau \circ \sigma'^T}^2\norm{M_{\tau}}^2 \le \lambda_{\sigma \circ \sigma^T}\lambda_{\sigma' \circ \sigma'^T}$ holds. Assume for simplicity that $d < n$ and consider the shapes in \cref{fig: sparse_pca}. The assumption $d < n$ is used in this example since otherwise, if $d > n$, the decomposition differs from what's shown in the figure.

\begin{figure}[!ht]
    \centering
    \includegraphics[scale=0.9, trim={2cm 5cm 0 4cm},clip]{machinery/images/spca}
    \caption{Shapes $\sigma \circ \tau_1\circ \sigma^T, \sigma \circ \tau_2 \circ \sigma^T$ and $\sigma \circ \sigma^T$. All edges have label $1$.}
    \label{fig: sparse_pca}
\end{figure}

Firstly, the shape $\sigma \circ \sigma^T$ has a coefficient of $\lambda_{\sigma \circ \sigma^T} \approx \left(\frac{1}{\sqrt{k}}\right)^4\left(\frac{k}{d}\right)^2$.
The first shape $\sigma \circ \tau_1 \circ \sigma^T$ has a coefficient of $\lambda_{\sigma \circ \tau_1 \circ \sigma^T} \approx \left(\frac{1}{\sqrt{k}}\right)^4\left(\frac{k}{d}\right)^4 \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^4$ and with high probability, up to lower order terms, $\norm{M_{\tau_1}} \le md$. So, the inequality $\lambda_{\sigma \circ \tau_1 \circ \sigma^T}^2\norm{M_{\tau_1}}^2 \le \lambda_{\sigma \circ \sigma^T}\lambda_{\sigma \circ \sigma^T}$ rearranges to $m \le \frac{d}{\lambda^2}$. But this is precisely one of the assumptions on $m$. Moreover, this also confirms that we need this assumption on $m$ in order for our strategy to go through.

The second shape $\sigma \circ \tau_2 \circ \sigma^T$ has a coefficient of $\lambda_{\sigma \circ \tau_2 \circ \sigma^T} \approx \left(\frac{1}{\sqrt{k}}\right)^4\left(\frac{k}{d}\right)^4 \left(\frac{\sqrt{\lambda}}{\sqrt{k}}\right)^8$ and with high probability, up to lower order terms, $\norm{M_{\tau_2}} \le m^2d$. So, the inequality $\lambda_{\sigma \circ \tau_2 \circ \sigma^T}^2\norm{M_{\tau_2}}^2 \le \lambda_{\sigma \circ \sigma^T}\lambda_{\sigma \circ \sigma^T}$ rearranges to $m^2 \le \frac{k^2d}{\lambda^4}$. But this is obtained simply by multiplying our assumptions on $m$, namely $m \le \frac{k^2}{\lambda^2}$ and $m \le \frac{d}{\lambda^2}$.

Moreover, consider a shape of the form $\sigma \circ \tau_3 \circ \sigma^T$ where $\tau_3$ is similar to $\tau_2$ except it has $t$ (instead of $3$) different circle vertices that are common neighbors to the top 2 square vertices. Analyzing our required inequality, we get for our strategy to go through, $m$ has to satisfy $m \le \frac{k^2}{\lambda^2} \cdot \left(\frac{d}{k^2}\right)^{\frac{2}{t + 1}}$. By taking $t$ arbitrarily large, we can see that the condition $m \le \frac{k^2}{\lambda^2}$ is needed.

So, we get that for our analysis to go through, the assumptions $m \le \frac{d}{\lambda^2}$ and $m \le \frac{k^2}{\lambda^2}$ are necessary. We will prove that in fact, these are sufficient. To do this, we use a charging argument that exploits the special structure of the shapes $\alpha$ that appear in our decomposition of $\Lambda$ and their coefficients $\lambda_{\alpha}$, as we obtained in \cref{def: spca_coeffs}. For details, see \cref{sec: spca_quant}.
\subsection{Middle shape bounds}

\begin{lemma}\label{lem: plds_charging}
    Suppose $k \le n^{1/2 - \varepsilon}$. For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$,
	\[\sqrt{n}^{|V(\tau)| - |U_{\tau}|}S(\tau) \le \frac{1}{n^{C_p\varepsilon|E(\tau)|}}\]
\end{lemma}

\begin{proof}
    This result follows by plugging in the value of $S(\tau)$. Using $k \le n^{1/2 - \varepsilon}$,
	\begin{align*}
	\sqrt{n}^{|V(\tau)| - |U_{\tau}|}S(\tau) &= \sqrt{n}^{|V(\tau)| - |U_{\tau}|} \left(\frac{k}{n}\right)^{|V(\tau)| - |U_{\tau}|}(2(\frac{1}{2} + \frac{1}{2n^{C_p\varepsilon}}) -1 )^{|E(\tau)|}
	\le \frac{1}{n^{C_p\varepsilon|E(\tau)|}}
	\end{align*}
\end{proof}

\begin{corollary}\label{cor: plds_norm_decay}
	For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$, we have \[c(\tau)B_{norm}(\tau)S(\tau) \le 1\]
\end{corollary}

\begin{proof}
	Since $\tau$ is a proper middle shape, we have $w(I_{\tau}) = 0$ and $w(S_{\tau}) = w(U_{\tau})$. This implies
	$n^{\frac{w(V(\tau)) + w(I_{\tau}) - w(S_{\tau})}{2}} = \sqrt{n}^{|V(\tau)| - |U_{\tau}|}$.
	Since $\tau$ is proper, every vertex $i \in V(\tau) \setminus U_{\tau}$ or $i \in V(\tau) \setminus V_{\tau}$ has $deg^{\tau}(i) \ge 1$ and hence, $|V(\tau)\setminus U_{\tau}| + |V(\tau)\setminus V_{\tau}| \le 4|E(\tau)|$. Also, $q = n^{O(1) \cdot \varepsilon C_V}$. We can set $C_V$ sufficiently small so that, using \cref{lem: plds_charging},
	{\footnotesize
	\begin{align*}
	c(\tau)&B_{norm}(\tau)S(\tau)\\
	&= 100(3D_V)^{|U_{\tau}\setminus V_{\tau}| + |V_{\tau}\setminus U_{\tau}| + 2|E(\tau)|}2^{|V(\tau)\setminus (U_{\tau}\cup V_{\tau})|}
	\cdot (6D_V\sqrt[4]{2eq})^{|V(\tau)\setminus U_{\tau}| + |V(\tau)\setminus V_{\tau}|}\sqrt{n}^{|V(\tau)| - |U_{\tau}|}S(\tau)\\
	&\le n^{O(1) \cdot \varepsilon C_V \cdot |E(\tau)|} \cdot \sqrt{n}^{|V(\tau)| - |U_{\tau}|}S(\tau)\\
	&\le n^{O(1) \cdot \varepsilon C_V \cdot |E(\tau)|} \cdot \frac{1}{n^{C_p\varepsilon|E(\tau)|}}\\
	&\le 1
	\end{align*}
	}
\end{proof}

We can now obtain middle shape bounds.

\begin{lemma}
    For all $U \in {\mathcal I}_{mid}$ and $\tau \in {\mathcal M}_U$,
    \[
\begin{bmatrix}
    \frac{1}{|Aut(U)|c(\tau)}H_{Id_U} & B_{norm}(\tau) H_{\tau}\\
    B_{norm}(\tau) H_{\tau}^T & \frac{1}{|Aut(U)|c(\tau)}H_{Id_U}
\end{bmatrix}
\succeq 0
\]
\end{lemma}

\begin{proof}
	We have
    {\footnotesize
	\begin{align*}
	&\begin{bmatrix}
	\frac{1}{|Aut(U)|c(\tau)}H_{Id_U} & B_{norm}(\tau)H_{\tau}\\
	B_{norm}(\tau)H_{\tau}^T & \frac{1}{|Aut(U)|c(\tau)}H_{Id_U}
	\end{bmatrix}\\
	&\qquad = \begin{bmatrix}
	\left(\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)B_{norm}(\tau)}{|Aut(U)|}\right)H_{Id_U} & 0\\
	0 & \left(\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)B_{norm}(\tau)}{|Aut(U)|}\right)H_{Id_U}
	\end{bmatrix}\\
	&\qquad \qquad + B_{norm}(\tau)\begin{bmatrix}
	\frac{S(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
	H_{\tau}^T & \frac{S(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	\end{align*}}
	By \cref{lem: plds_cond2_simplified}, $\begin{bmatrix}
	\frac{S(\tau)}{|Aut(U)|}H_{Id_U} & H_{\tau}\\
	H_{\tau}^T & \frac{S(\tau)}{|Aut(U)|}H_{Id_U}
	\end{bmatrix}
	\succeq 0$, so the second term above is positive semidefinite. For the first term, by \cref{lem: plds_cond1}, $H_{Id_U} \succeq 0$ and by \cref{cor: plds_norm_decay}, $\frac{1}{|Aut(U)|c(\tau)} - \frac{S(\tau)B_{norm}(\tau)}{|Aut(U)|} \ge 0$, which proves that the first term is also positive semidefinite.
\end{proof}

\subsection{Intersection term bounds}

\begin{lemma}\label{lem: plds_charging2}
	Suppose $k \le n^{1/2 - \varepsilon}$. For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and for all $\gamma \in \Gamma_{U, V}$,
	\[n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2 \le \frac{1}{n^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + |E(\gamma)|)}}\]
	for some constant $B$ that depends only on $C_p$. In particular, it is independent of $C_V$.
\end{lemma}

\begin{proof}
	Since $\gamma$ is a left shape, we have $|U_{\gamma}| \ge |V_{\gamma}|$ as $V_{\gamma}$ is the unique minimum vertex separator of $\gamma$ and so, $n^{w(V(\gamma) \setminus U_{\gamma})} = n^{|V(\gamma)| - |U_{\gamma}|} \le n^{|V(\gamma)| - \frac{|U_{\gamma}| + |V_{\gamma}|}{2}}$. Also, note that $2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}| = |U_{\gamma} \setminus V_{\gamma}| + |V_{\gamma} \setminus U_{\gamma}| + 2|V(\gamma) \setminus U_{\gamma} \setminus V_{\gamma}| \ge |V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})|$. Therefore,
	\begin{align*}
	n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2 &= n^{|V(\gamma)\setminus U_{\gamma}|} \left(\frac{k}{n}\right)^{2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}|} (2(\frac{1}{2} + \frac{1}{2n^{C_p\varepsilon}}) - 1)^{2|E(\gamma)|}\\
	&\le n^{|V(\gamma)| - \frac{|U_{\gamma}| + |V_{\gamma}|}{2}}\left(\frac{1}{n^{1/2 + \varepsilon}}\right)^{2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}|}\left(\frac{1}{n^{2C_p\varepsilon}}\right)^{|E(\gamma)|}\\
	&\le  \left(\frac{1}{n^{\varepsilon}}\right)^{2|V(\gamma)| - |U_{\gamma}| - |V_{\gamma}|}\left(\frac{1}{n^{2C_p\varepsilon}}\right)^{|E(\gamma)|}\\
	&\le \frac{1}{n^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + |E(\gamma)|)}}
	\end{align*}
for a constant $B$ that depends only on $C_p$.
\end{proof}

We obtain intersection term bounds.

\begin{lemma}
    For all $U, V \in {\mathcal I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U, V}$, \[c(\gamma)^2N(\gamma)^2B(\gamma)^2H_{Id_V}^{-\gamma, \gamma} \preceq H_{\gamma}'\]
\end{lemma}

\begin{proof}
	By \cref{lem: plds_cond3_simplified}, we have
	\begin{align*}
	c(\gamma)^2N(\gamma)^2B(\gamma)^2H_{Id_V}^{-\gamma, \gamma} &= c(\gamma)^2N(\gamma)^2B(\gamma)^2 S(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} H'_{\gamma}
	\end{align*}
	Using the same proof as in \cref{lem: plds_cond1}, we can see that $H'_{\gamma} \succeq 0$. Therefore, it suffices to prove that $c(\gamma)^2N(\gamma)^2B(\gamma)^2 S(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} \le 1$.
	Since $U, V \in {\mathcal I}_{mid}$, $|Aut(U)| = |U|!,|Aut(V)| = |V|!$. Therefore, $\frac{|Aut(U)|}{|Aut(V)|} = \frac{|U|!}{|V|!} \le D_V^{|U_{\gamma} \setminus V_{\gamma}|}$. Also, $q = n^{O(1) \cdot \varepsilon C_V}$. Let $B$ be the constant from \cref{lem: plds_charging2}. We can set $C_V$ sufficiently small so that, using \cref{lem: plds_charging2},

    {\footnotesize
	\begin{align*}
	c(\gamma)^2N(\gamma)^2B(\gamma)^2S(\gamma)^2 \frac{|Aut(U)|}{|Aut(V)|} &\le 100^2 (3D_V)^{2|U_{\gamma}\setminus V_{\gamma}| + 2|V_{\gamma}\setminus U_{\gamma}| + 4|E(\gamma)|}4^{|V(\gamma) \setminus (U_{\gamma} \cup V_{\gamma})|}\\
	&\quad\cdot (3D_V)^{4|V(\gamma)\setminus V_{\gamma}| + 2|V(\gamma)\setminus U_{\gamma}|} (6D_V\sqrt[4]{2eq})^{2|V(\gamma)\setminus U_{\gamma}| + 2|V(\gamma)\setminus V_{\gamma}|}\\
	&\quad\cdot n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2 \cdot D_V^{|U_\gamma \setminus V_{\gamma}|} \\
	&\le n^{O(1) \cdot \varepsilon C_V \cdot (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + |E(\gamma)|)} \cdot n^{w(V(\gamma)\setminus U_{\gamma})} S(\gamma)^2\\
	&\le n^{O(1) \cdot \varepsilon C_V \cdot (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + |E(\gamma)|)} \cdot \frac{1}{n^{B\varepsilon (|V(\gamma) \setminus (U_{\gamma} \cap V_{\gamma})| + |E(\gamma)|)}}\\
	&\le 1
	\end{align*}}
\end{proof}

\subsection{Truncation error bounds}

In this section, we will prove truncation error bounds.
We use the strategy and notation from \cite[Section 10]{potechin2020machinery}.
First, we will need a bound on $B_{norm}(\sigma) B_{norm}(\sigma') H_{Id_U}(\sigma, \sigma')$ that is obtained below.

\begin{lemma}\label{lem: plds_charging3}
	Suppose $k \le n^{1/2 - \varepsilon}$. For all $U \in {\mathcal I}_{mid}$ and $\sigma, \sigma' \in {\mathcal L}_U$, letting $\alpha = \sigma \circ \sigma'$,
	\[B_{norm}(\sigma) B_{norm}(\sigma') H_{Id_U}(\sigma, \sigma') \le \frac{1}{n^{0.5\varepsilon|V(\alpha)| + C_p\varepsilon|E(\alpha)|}} \left(\frac{k}{n}\right)^{|U|}
	\]
\end{lemma}

\begin{proof}
	Let $\alpha = \sigma \circ \sigma'$. Observe that $|V(\sigma)| + |V(\sigma')| = |V(\alpha)| + |U|$. By choosing $C_V$ sufficiently small,
    {\footnotesize
	\begin{align*}
	B_{norm}(\sigma) B_{norm}(\sigma') H_{Id_U}(\sigma, \sigma') &= (6D_V\sqrt[4]{2eq})^{|V(\sigma)\setminus U_{\sigma}| + |V(\sigma)\setminus V_{\sigma}|} n^{\frac{w(V(\sigma)) - w(U)}{2}}\\
	&\quad\cdot (6D_V\sqrt[4]{2eq})^{|V(\sigma')\setminus U_{\sigma'}| + |V(\sigma')\setminus V_{\sigma'}|} n^{\frac{w(V(\sigma')) - w(U)}{2}}\\
	&\quad\cdot \frac{1}{|Aut(U)|} \left(\frac{k}{n}\right)^{|V(\alpha)|} (2(\frac{1}{2} + \frac{1}{2n^{C_p\varepsilon}}) - 1)^{|E(\alpha)|}\\
	&\le n^{O(1) \cdot \varepsilon C_V \cdot |V(\alpha)|} \sqrt{n}^{|V(\sigma)| - |U|}\sqrt{n}^{|V(\sigma')| - |U|} \left(\frac{k}{n}\right)^{|V(\alpha)|}\frac{1}{n^{C_p\varepsilon|E(\alpha)|}}\\
	&\le \frac{1}{n^{0.5\varepsilon|V(\alpha)| + C_p\varepsilon|E(\alpha)|}} \left(\frac{k}{n}\right)^{|U|}
	\end{align*}}
\end{proof}

Now, we are ready to apply the strategy.

\begin{restatable}{lemma}{PLDSfive}\label{lem: plds_cond5}
	Whenever $\norm{M_{\alpha}} \le B_{norm}(\alpha)$ for all $\alpha \in {\mathcal M}'$,
	\[
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq \frac{1}{n^{K_1D_{sos}^2}} Id_{sym}
	\]
	for a constant $K_1 > 0$.
\end{restatable}

\begin{proof}
    For $V \in {\mathcal I}_{mid}$, we have $\lambda_V = \left(\frac{k}{n}\right)^{|V|}$. Now, we choose $w_V = \left(\frac{k}{n}\right)^{D_{sos} - |V|}$. Then, for all $\sigma \in {\mathcal L}_{V}$, we have $w_{V} \leq \frac{w_{U_{\sigma}}\lambda_{U_{\sigma}}}{|\mathcal{I}_{mid}|B_{norm}(\sigma)^2{c(\sigma)^2}{H_{Id_V}(\sigma,\sigma)}}$ which is easily verified using \cref{lem: plds_charging3}. The result now follows.
\end{proof}

\begin{restatable}{lemma}{PLDSsix}\label{lem: plds_cond6}
	\[\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} \le \frac{n^{K_2 D_{sos}}}{2^{D_V}}\]
	for a constant $K_2 > 0$.
\end{restatable}

\begin{proof}
	We have
	\begin{align*}
	&\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} \\
    &= \sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{1}{|Aut(U)|c(\gamma)}\sum_{\substack{\sigma,\sigma' \in \mathcal{L}_{U_{\gamma}}: |V(\sigma)| \leq D_V, |V(\sigma')| \leq D_V,\\
		|V(\sigma \circ \gamma)| > D_V \text{ or } |V(\sigma' \circ \gamma)| > D_V}}{B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_{U_{\gamma}}}(\sigma,\sigma')}
	\end{align*}
	The set of $\sigma, \sigma'$ that could appear in the above sum must necessarily be non-trivial and hence, $\sigma, \sigma' \in {\mathcal L}_U'$. Then,
    {\footnotesize
	\begin{align*}
	&\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)}\\
	&= \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}} {B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_{U}}(\sigma,\sigma')}\sum_{\gamma \in \Gamma_{U, *}: |V(\sigma \circ \gamma)| > D_V \text{ or } |V(\sigma' \circ \gamma)| > D_V} \frac{1}{|Aut(U)|c(\gamma)}
	\end{align*}}
	For $\sigma \in {\mathcal L}'_{U}$, define $m_{\sigma} = D_V + 1 - |V(\sigma)| \ge 1$. This is precisely set so that for all $\gamma \in \Gamma_{U, *}$, we have $|V(\sigma \circ \gamma)| > D_V$ if and only if $|V(\gamma)| \ge |U| + m_{\sigma}$. So, for $\sigma, \sigma' \in {\mathcal L}'_U$,
	\begin{align*}
	\sum_{\gamma \in \Gamma_{U, *}: |V(\sigma \circ \gamma)| > D_V \text{ or } |V(\sigma' \circ \gamma)| > D_V} &\frac{1}{|Aut(U)|c(\gamma)} \\
    &=
	\sum_{\gamma \in \Gamma_{U, *}: |V(\gamma)| \ge |U| + \min(m_{\sigma}, m_{\sigma'})} \frac{1}{|Aut(U)|c(\gamma)}\\
	&\le \frac{1}{2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}
	\end{align*}
	Also, for $\sigma, \sigma' \in {\mathcal L}_U'$, we have $|V(\sigma \circ \sigma')| + \min(m_{\sigma}, m_{\sigma'}) - 1 \ge D_V$.
	Therefore,
	\begin{align*}
		\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} &\frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} \\
        &\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}} {B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_{U}}(\sigma,\sigma')\frac{1}{2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}}\\
		&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{n^{O(1) D_{sos}}}{n^{0.5\varepsilon|V(\sigma \circ \sigma')|}2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}
	\end{align*}
	where we used \cref{lem: plds_charging3}. Using $n^{0.5\varepsilon |V(\sigma \circ \sigma')|} \ge n^{0.1\varepsilon |V(\sigma \circ \sigma')|}2^{|V(\sigma \circ \sigma')|}$,
	\begin{align*}
		\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} &\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{n^{O(1) D_{sos}}}{n^{0.1\varepsilon|V(\sigma \circ \sigma')|} 2^{|V(\sigma \circ \sigma')|}2^{\min(m_{\sigma}, m_{\sigma'}) - 1}}\\
		&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{n^{O(1) D_{sos}}}{n^{0.1\varepsilon|V(\sigma \circ \sigma')|} 2^{D_V}}\\
		&\le \sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{n^{O(1) D_{sos}}}{D_{sos}^{D_{sos}}n^{0.1\varepsilon|V(\sigma \circ \sigma')|} 2^{D_V}}
	\end{align*}
	The final step will be to argue that $\sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma,\sigma' \in \mathcal{L}'_{U}}\frac{1}{D_{sos}^{D_{sos}}n^{0.1 \varepsilon|V(\sigma \circ \sigma')|}} \le 1$ which will complete the proof. But this will follow if we set $C_V$ small enough.
\end{proof}

We conclude the following.

\begin{lemma}
    Whenever $\norm{M_{\alpha}} \le B_{norm}(\alpha)$ for all $\alpha \in \mathcal{M}'$,
    \[
    \sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
    \]
\end{lemma}

\begin{proof}
	Choose $C_{sos}$ sufficiently small so that $\frac{1}{n^{K_1D_{sos}^2}} \ge 6\frac{n^{K_2D_{sos}}}{2^{D_V}}$ which can be satisfied by setting $C_{sos} < K_3 C_V$ for a sufficiently small constant $K_3 > 0$. Then, since $Id_{Sym} \succeq 0$, using \cref{lem: plds_cond5} and \cref{lem: plds_cond6},
	\begin{align*}
		\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} &\succeq \frac{1}{n^{K_1D_{sos}^2}} Id_{sym}\\
		&\succeq 6\frac{n^{K_2D_{sos}}}{2^{D_V}} Id_{sym}\\
		&\succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
	\end{align*}
\end{proof}
\subsection{General strategy to lower bound $\sum_{V \in \mathcal{I}_{mid}}{M^{fact}(H_{Id_V})}$}\label{subsec: positivity_lower_bound_strategy}

In this section, we describe how to show that $\sum_{V \in \mathcal{I}_{mid}}{M^{fact}(H_{Id_V})} \succeq {\delta}Id_{Sym}$ for some $\delta > 0$ where $\delta$ will depend on $n$ and other parameters. For now, we assume that the indices of $\Lambda$ are multilinear monomials. We will then describe the adjustments that are needed to handle non-multilinear matrix indices.

We start with a few more definitions.
\begin{definition}
	For all $V \in \mathcal{I}_{mid}$ we define $Id_{Sym,V}$ to be the matrix such that 
	\begin{enumerate}
		\item $Id_{Sym,V}(A,B) = 1$ if $A$ and $B$ both have index shape $V$.
		\item Otherwise, $Id_{Sym,V}(A,B) = 0$.
	\end{enumerate}
\end{definition}
\begin{proposition}
	$Id_{Sym} = \sum_{V \in \mathcal{I}_{mid}}{Id_{Sym,V}}$
\end{proposition}
\begin{definition}
	For all $V \in \mathcal{I}_{mid}$ we define $\lambda_V = |Aut(V)|H_{Id_V}(Id_V,Id_V)$
\end{definition}
We now describe our strategy for showing $\sum_{V \in \mathcal{I}_{mid}}{M^{fact}(H_{Id_V})} \succeq {\delta}Id_{Sym}$. The idea is as follows. We will consider the index shapes $V \in \mathcal{I}_{mid}$ from largest weight to smallest weight and we will show that for each $V \in \mathcal{I}_{mid}$, there exists a $\delta_V > 0$ such that $\sum_{V' \in \mathcal{I}_{mid}}{M^{fact}(H_{Id_{V'}})} \succeq \delta_{V}\sum_{U \in \mathcal{I}_{mid}: w(U) \geq w(V)}{Id_{Sym,U}}$.

For the first step, letting $V_{max}$ be the maximum weight index shape in $\mathcal{I}_{mid}$, $M^{fact}(H_{Id_{V_{max}}}) = \lambda_{V_{max}}Id_{Sym,V_{max}}$ because there are no non-trivial left shapes $\sigma$ such that $V_{\sigma} = V_{max}$. For other $V \in \mathcal{I}_{mid}$, $\lambda_{V}Id_{Sym,V}$ is a part of $M^{fact}(H_{Id_V})$ but $M^{fact}(H_{Id_V})$ will also contain terms of the form $H_{Id_V}(\sigma,\sigma')M_{\sigma}M_{{\sigma'}^T}$ where $U_{\sigma} \neq V$ or $U_{\sigma'} \neq V$. 

We can handle the terms $H_{Id_V}(\sigma,\sigma')M_{\sigma}M_{{\sigma'}^T}$ where $U_{\sigma} \neq V$ and $U_{\sigma'} \neq V$ by bounding these terms in terms of $Id_{Sym,U_{\sigma}}$ and $Id_{Sym,U_{\sigma'}}$. Since $w(U_{\sigma}) > w(V)$ and $w(U_{\sigma'}) > w(V)$, $Id_{Sym,U_{\sigma}}$ and $Id_{Sym,U_{\sigma'}}$ are already available to us. To handle the terms $H_{Id_V}(\sigma,\sigma')M_{\sigma}M_{{\sigma'}^T}$ where exactly one of $U_{\sigma}$ and $U_{\sigma'}$ is equal to $V$, we use the following trick.
\begin{definition}
	Given $V \in \mathcal{I}_{mid}$, define $H''_{Id_V}$ to be the coefficient matrix such that 
	\begin{enumerate}
		\item If $U_{\sigma} = U_{\sigma'} = V$ then $H''_{Id_V}(\sigma,\sigma') = \frac{1}{2}H_{Id_V}(\sigma,\sigma')$
		\item If exactly one of $U_{\sigma}$ and $U_{\sigma'}$ is equal to $V$ then $H''_{Id_V}(\sigma,\sigma') = H_{Id_V}(\sigma,\sigma')$
		\item If $U_{\sigma} \neq V$ and $U_{\sigma'} \neq V$ then $H''_{Id_V}(\sigma,\sigma') = 2H_{Id_V}(\sigma,\sigma')$
	\end{enumerate}
\end{definition}
\begin{proposition}
	$M^{fact}(H''_{Id_V}) \succeq 0$
\end{proposition}
\begin{proof}
	Since $H_{Id_V} \succeq 0$, $H''_{Id_V} \succeq 0$ and thus $M^{fact}(H''_{Id_V}) \succeq 0$.
\end{proof}
\begin{corollary}\label{cor: showing_positivity_corollary}
	For all $V \in \mathcal{I}_{mid}$, 
	\[
	M^{fact}(H_{Id_V}) + \sum_{\sigma,\sigma' \in \mathcal{L}_V: U_{\sigma} \neq V, U_{\sigma'} \neq V}{H_{Id_V}(\sigma,\sigma')M_{\sigma}M_{{\sigma'}^T}} \succeq \frac{\lambda_V}{2}Id_{Sym,V}
	\]
\end{corollary}
\begin{proof}
	Observe that 
	\[
	M^{fact}(H_{Id_V})  -  \frac{\lambda_V}{2}Id_{Sym,V} + \sum_{\sigma,\sigma' \in \mathcal{L}_V: U_{\sigma} \neq V, U_{\sigma'} \neq V}{H_{Id_V}(\sigma,\sigma')M_{\sigma}M_{{\sigma'}^T}} = 
	M^{fact}(H''_{Id_V}) \succeq 0
	\]
\end{proof}
We now analyze the terms $ \sum_{\sigma,\sigma' \in \mathcal{L}_V: U_{\sigma} \neq V, U_{\sigma'} \neq V}{H_{Id_V}(\sigma,\sigma')M_{\sigma}M_{{\sigma'}^T}}$.
\begin{definition}
	Given $U,V \in \mathcal{I}_{mid}$ with $w(U) > w(V)$, we define $W(U,V)$ to be
	\[
	W(U,V) = \frac{1}{|Aut(U)|}\sum_{\sigma \in \mathcal{L}_{V}: U_{\sigma} = U}{\sum_{\sigma' \in \mathcal{L}_{V}: U_{\sigma'} \neq V}{B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_V}(\sigma,\sigma')}}
	\]
\end{definition}
\begin{lemma}\label{showingpositivitylemma}
	For all $V \in \mathcal{I}_{mid}$, 
	\[
	\sum_{\sigma,\sigma' \in \mathcal{L}_V: U_{\sigma} \neq V, U_{\sigma'} \neq V}{H_{Id_V}(\sigma,\sigma')M_{\sigma}M_{{\sigma'}^T}} \preceq \sum_{U \in \mathcal{I}_{mid}: w(U) > w(V)}{W(U,V)Id_{Sym,U}}
	\]
\end{lemma}
\begin{proof}
	Observe that for all $\sigma,\sigma' \in \mathcal{L}_{V}$ such that $U_{\sigma} \neq V$ and $U_{\sigma'} \neq V$, $||M_{\sigma}M_{{\sigma'}^T}|| \leq B_{norm}(\sigma)B_{norm}(\sigma')$ and thus 
	\[
	\frac{1}{2}\left(M_{\sigma}M_{{\sigma'}^T} + M_{\sigma'}M_{{\sigma}^T}\right) \preceq \frac{1}{2}B_{norm}(\sigma)B_{norm}(\sigma')\left(M_{Id_{U_{\sigma}}} + M_{Id_{U_{\sigma'}}}\right)
	\]
	Summing this equation over all $\sigma,\sigma' \in \mathcal{L}_{V}$ such that $U_{\sigma} \neq V$ and $U_{\sigma'} \neq V$,
	\begin{align*}
		&\sum_{\sigma,\sigma' \in \mathcal{L}_V: U_{\sigma} \neq V, U_{\sigma'} \neq V}{H_{Id_V}(\sigma,\sigma')M_{\sigma}M_{{\sigma'}^T}} \preceq 
		\sum_{\sigma,\sigma' \in \mathcal{L}_V: U_{\sigma} \neq V, U_{\sigma'} \neq V}{B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_V}(\sigma,\sigma')M_{Id_{U_{\sigma}}}} \\
		&\preceq \sum_{U \in \mathcal{I}_{mid}: w(U) > w(V)}{\sum_{\sigma \in \mathcal{L}_{V}: U_{\sigma} = U}{\sum_{\sigma' \in \mathcal{L}_{V}: U_{\sigma'} \neq V}{B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_V}(\sigma,\sigma')M_{Id_U}}}} \\
		&\preceq \sum_{U \in \mathcal{I}_{mid}: w(U) > w(V)}|Aut(U)|W(U,V)M_{Id_U}
	\end{align*}
	Since all of the coefficient matrices have SOS-symmetry, we can replace $M_{Id_U}$ by $\frac{1}{|Aut(U)|}Id_{Sym,U}$ and this completes the proof.
\end{proof}
Using this lemma, we can show the following theorem:
\begin{theorem}
	Let $G$ be the following directed graph:
	\begin{enumerate}
		\item The vertices of $G$ are the index shapes $V \in \mathcal{I}_{mid}$
		\item For each $U,V \in \mathcal{I}_{mid}$ such that $w(U) > w(V)$, we have an edge $e = (V,U)$ with weight $w(e) = \frac{2W(U,V)}{\lambda_V}$
	\end{enumerate}
	For all $V \in \mathcal{I}_{mid}$,
	\[
	Id_{Sym,V} \preceq 2\sum_{U \in \mathcal{I}_{mid}: w(U) \geq w(V)}{\left(\sum_{P: P \text{ is a path from V to U in } G}{\prod_{e \in E(P)}{w(e)}}\right)\frac{1}{{\lambda_{U}}}M^{fact}(H_{Id_{U}})}
	\]
\end{theorem}
\begin{proof}[Proof sketch]
	This can be proved by starting with \cref{cor: showing_positivity_corollary} and iteratively applying Lemma \ref{showingpositivitylemma} and \cref{cor: showing_positivity_corollary}.
\end{proof}
\subsubsection{Handling Non-multilinear Matrix Indices*}
In order to handle non-multilinear matrix indices, we need to make a few adjustments. First, we need to modify the definition of $Id_{Sym,V}$.
\begin{definition}
    For all $V \in \mathcal{I}_{mid}$ we define $Id_{Sym,V}$ to be the matrix such that 
	\begin{enumerate}
		\item $Id_{Sym,V}(A,B) = 1$ if $A$ and $B$ have the same index shape $U$ and $U$ has the same number of each type of vertex as $V$. Note that $B$ may be a permutation of $A$ and $U$ may have different powers than $V$.
		\item Otherwise, $Id_{Sym,V}(A,B) = 0$.
	\end{enumerate}
\end{definition}

Observe that with this modified definition, we will still have $Id_{Sym} = \sum_{V \in \mathcal{I}_{mid}}{Id_{Sym,V}}$.

We also need to adjust how we define $\lambda_V$ as there are left shapes $\sigma$ such that $U_{\sigma}$ and $V_{\sigma}$ have the same numbers and types of vertices but $U_{\sigma}$ has different powers.
\begin{definition}
Given $V \in \mathcal{I}_{mid}$, we define $\mathcal{T}_V \subseteq \mathcal{L}_V$ to be the set of left shapes $\sigma \in \mathcal{L}_V$ such that $U_{\sigma}$ has the same numbers and types of vertices as $V$ (which automatically implies that $E(\sigma) = \emptyset$).
\end{definition}

\begin{definition}
Define $Id^{*}_{Sym,V}$ to be the matrix indexed by left shapes $\sigma,\sigma' \in \mathcal{T}_V$ such that $Id^{*}_{Sym,V}(\sigma,\sigma') = \frac{1}{|Aut(V)|}$ if $U_{\sigma} \equiv U_{\sigma'}$ and $Id^{*}_{Sym,V}(\sigma,\sigma') = 0$ otherwise.
\end{definition}
\begin{proposition}
$M^{fact}(Id^{*}_{Sym,V}) = Id_{Sym,V}$
\end{proposition}
\begin{definition}
Let $H'_{Id_V}$ be the matrix $H_{Id_V}$ restricted to rows and columns $\sigma,\sigma'$ where $\sigma,\sigma' \in \mathcal{T}_V$. We define $\lambda_V$ to be the largest constant $\lambda$ such that $H'_{Id_V} \succeq {\lambda}Id^{*}_{Sym,V}$
\end{definition}

Finally, whenever we have the condition that $U_{\sigma} \neq V$, it should instead be the condition that $U_{\sigma}$ does not have the same number of each type of vertex as $V$. With these adjustments, the same arguments go through.
\subsection{Strategy to prove positivity in our applications}

Now, we will illustrate the final ingredients needed to show positivity for our applications.

To use \cref{generalmaintheorem}, we would need to prove a statement of the form: Whenever $\norm{M_{\alpha}} \le B_{norm}(\alpha)$ for all $\alpha \in \mathcal{M}'$,
\[
	\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{sym}
\]

We will sketch the strategy we use to prove this. Let $D_{sos}$ be the degree of the SoS program.

For the left hand side, we will prove a lower bound of the form: Whenever $\norm{M_{\alpha}} \le B_{norm}(\alpha)$ for all $\alpha \in {\mathcal M}'$,
\[
\sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq \frac{1}{n^{K_1D_{sos}^2}} Id_{sym}
\]
for a constant $K_1 > 0$. For this, we use the strategy from \cref{subsec: positivity_lower_bound_strategy}. Then, we prove an upper bound on the right hand side of the form
\[\sum_{U\in {\mathcal I}_{mid}} \sum_{\gamma \in \Gamma_{U, *}} \frac{d_{Id_{U}}(H_{Id_{U}}, H'_{\gamma})}{|Aut(U)|c(\gamma)} \le \frac{n^{K_2D_{sos}}}{2^{D_V}}\]
for a constant $K_2 > 0$.

Now, we put these two together. Using the fact that $Id_{Sym} \succeq 0$, by simply setting $\frac{1}{n^{K_1D_{sos}^2}} \ge 6\frac{n^{K_2D_{sos}}}{2^{D_V}}$, which can be obtained by choosing $D_{sos}$ small enough, we obtain the desired result.

We will also need the following bound, which lets us sum over all shapes: it says that if we have sufficient decay for each vertex, then the sum of this decay, over all shapes $\sigma \circ \sigma'$ for $\sigma, \sigma' \in {\mathcal L}_U'$, is bounded.

\begin{definition}
	For $U \in {\mathcal I}_{mid}$, let ${\mathcal L}_{U}' \subset {\mathcal L}_U$ be the set of non-trivial shapes in ${\mathcal L}_U$.
\end{definition}

\begin{lemma}\label{lem: gp_sum}
    Suppose $D_V = n^{C_V\varepsilon}$ and $D_E = n^{C_E\varepsilon}$, for constants $C_V, C_E > 0$, are the truncation parameters for our shapes. Then,
	\begin{align*}
	\sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma, \sigma' \in {\mathcal L}'_U} \frac{1}{D_{sos}^{D_{sos}}n^{F \varepsilon|V(\sigma \circ \sigma')|}} \le 1
	\end{align*}
	for a constant $F > 0$ that depends only on $C_V, C_E$. In particular, by setting $C_V, C_E$ small enough, we can make this constant arbitrarily small.
\end{lemma}

\begin{proof}
	For a given $j = |U|$, the number of ways to choose $U$ is at most $t_{max}^j$. For a given $U \in {\mathcal I}_{mid}$, we will bound the number of ways to choose $\sigma, \sigma' \in {\mathcal L}_U'$. To choose $\sigma, \sigma'\in {\mathcal L}'_U$, it is sufficient to choose
	\begin{itemize}
		\item The number of vertices $j_1 \ge 1$ (resp. $j_1' \ge 1$) in $U_{\sigma} \setminus V_{\sigma}$ (resp. $U_{\sigma'} \setminus V_{\sigma'}$), their types of which there are at most $t_{max}$, and their powers which have at most $D_{sos}$ choices.
		\item The number of vertices $j_2$ (resp. $j_2'$) in $V(\sigma) \setminus (U_{\sigma} \cup V_{\sigma})$ (resp. $V(\sigma') \setminus (U_{\sigma'} \cup V_{\sigma'})$) and also their types, of which there are at most $t_{max}$.
		\item The position of each vertex $i$ in $U_{\sigma} \setminus V_{\sigma}$ (resp. $U_{\sigma'} \setminus V_{\sigma'}$) within $U_{\sigma}$ (resp. $U_{\sigma'}$). There are at most $D_V$ choices for each vertex.
		\item The subset of $U_{\sigma}$ (resp. $U_{\sigma'}$) that is in $V_{\sigma}$ (resp. $V_{\sigma'}$) and a mapping in $Aut(U_{\sigma} \cap V_{\sigma})$ (resp. $Aut(U_{\sigma'} \cap V_{\sigma'})$) that determines the matching between the vertices in $U_{\sigma} \cap V_{\sigma}$ (resp. $U_{\sigma'} \cap V_{\sigma'}$).
		\item The number $j_3$ (resp. $j_3'$) of edges in $E(\sigma)$ (resp. $E(\sigma')$) and the $k$ endpoints of each edge. Each endpoint has at most $D_V$ choices.
	\end{itemize}
	Therefore, for all $j \ge 0, j_1, j_1' \ge 1, j_2, j_2', j_3, j_3' \ge 0$, we have
	\begin{align*}
	\sum_{U \in {\mathcal I}_{mid}}\sum_{\substack{\sigma, \sigma' \in {\mathcal L}'_U \\ |U_{\sigma} \setminus V_{\sigma}| = j_1, |U_{\sigma'} \setminus V_{\sigma'}| = j_1' \\ |V(\sigma) \setminus (U_{\sigma} \cup V_{\sigma})| = j_2, |V(\sigma') \setminus (U_{\sigma'} \cup V_{\sigma'})| = j_2'\\ |E(\sigma)| = j_3, |E(\sigma')| = j_3'}} \frac{1}{|Aut(U_{\sigma'} \cap V_{\sigma'})||Aut(U_{\sigma} \cap V_{\sigma})|(2t_{max})^{j + j_2 + j_2'}(D_Vt_{max}D_{sos})^{j_1 + j_1'}(D_V)^{k(j_3 + j_3')}} \le 1
	\end{align*}
	This implies that
	\begin{align*}
	\sum_{U\in {\mathcal I}_{mid}} \sum_{\sigma, \sigma' \in {\mathcal L}'_U} \frac{1}{D_{sos}^{D_{sos}}n^{F \varepsilon|V(\sigma \circ \sigma')|}} \le 1
	\end{align*}
	for a constant $F > 0$ that only depends on $C_V, C_E$.
\end{proof}
\section{Omitted technical details}
	\subsection{Proof that the Leftmost and Rightmost Minimum Vertex Separators are Well-defined}\label{separatorswelldefinedsection}
	In this section, we give a general proof that the leftmost and rightmost minimum vertex separators are well-defined.
	\begin{lemma}\label{leftrightseparatorlemma}
		For any two distinct vertex separators $S_1$ and $S_2$ of $\alpha$, there exist vertex separators $S_L$ and $S_R$ of $\alpha$ such that:
		\begin{enumerate}
			\item $S_L$ is a vertex separator of $U_{\alpha}$ and $S_1$ and a vertex separator of $U_{\alpha}$ and $S_2$.
			\item $S_R$ is a vertex separator of $S_1$ and $V_{\alpha}$ and a vertex separator of $S_2$ and $V_{\alpha}$.
			\item $w(S_L) + w(S_R) \leq w(S_1) + w(S_2)$
		\end{enumerate}
	\end{lemma}
	\begin{proof}
		Take $S_L$ to be the set of vertices $v \in V(\alpha) \cap (S_1 \cup S_2)$ such that there is a path from $U_{\alpha}$ to $v$ which doesn't intersect $S_1 \cup S_2$ before reaching $v$. Similarly, take $S_R$ to be the set of vertices $v \in V(\alpha) \cap (S_1 \cup S_2)$ such that there is a path from $V_{\alpha}$ to $v$ which doesn't intersect $S_1 \cup S_2$ before reaching $v$.
		
		Now observe that $S_L$ is a vertex separator between $U_{\alpha}$ and $S_1$. To see this, note that for any path $P$ from $U_{\alpha}$ to a vertex $v \in S_1$, either $P$ intersects $S_L$ before reaching $v$ or $P$ does not intersect $S_L$ before reaching $v$. In the latter case, $v \in S_L$. Thus, in either case, $P$ intersects $S_L$. Following similar logic, $S_L$ is also a vertex separator between $U_{\alpha}$ and $S_2$, $S_R$ is a vertex separator between $S_1$ and $V_{\alpha}$, and $S_R$ is also a vertex separator between $S_2$ and $V_{\alpha}$.
		
		To show that $w(S_L) + w(S_R) \leq w(S_1) + w(S_2)$, observe that $w(S_L) + w(S_R) = w(S_L \cup S_R) + w(S_L \cap S_R)$ and $w(S_1) + w(S_2) = w(S_1 \cup S_2) + w(S_1 \cap S_2)$. Thus, to show that $w(S_L) + w(S_R) \leq w(S_1) + w(S_2)$, it is sufficient to show that
		\begin{enumerate}
			\item $S_L \cup S_R \subseteq S_1 \cup S_2$
			\item $S_L \cap S_R \subseteq S_1 \cap S_2$
		\end{enumerate}
		For the first statement, note that by definition any vertex in $S_L \cup S_R$ must be in $S_1 \cup S_2$. For the second statement, note that if $v \in S_L \cap S_R$ then there is a path from $U_{\alpha}$ to $v$ which does not intersect any other vertices in $S_1 \cup S_2$ and there is a path from $v$ to $V_{\alpha}$ which does not intersect any other vertices in $S_1 \cup S_2$. Combining these paths, we obtain a path $P$ from $U_{\alpha}$ to $V_{\alpha}$ such that $v$ is the only vertex in $P$ which is in $S_1 \cup S_2$. This implies that $v \in S_1 \cap S_2$ as otherwise either $S_1$ or $S_2$ would not be a vertex separator between $U_{\alpha}$ and $V_{\alpha}$.
	\end{proof}
	\begin{corollary}
		The leftmost and rightmost minimum vertex separators between $U_{\alpha}$ and $V_{\alpha}$ are well-defined.
	\end{corollary}
	\begin{proof}
		Assume that there is no leftmost minimum vertex separator. If so, then there exists a minimum vertex separator $S_1$ between $U_{\alpha}$ and $V_{\alpha}$ such that 
		\begin{enumerate}
			\item There does not exist a minimum vertex separator $S' \neq S_1$ of $\alpha$ such that $S'$ is also a minimum vertex separator of $U_{\alpha}$ and $S_1$ (otherwise we would take $S'$ rather than $S_1$)
			\item There exists a minimum vertex separator $S_2$ of $\alpha$ such that $S_1$ is not a minimum vertex separator of $U_{\alpha}$ and $S_2$ (as otherwise $S_1$ would be the leftmost minimum vertex separator)
		\end{enumerate}
		Now let $S_L$ and $S_R$ be the vertex separators of $\alpha$ obtained by applying Lemma \ref{leftrightseparatorlemma} to $S_1$ and $S_2$. Since $S_1$ and $S_2$ are minimum vertex separators of $\alpha$, we must have that $w(S_L) = w(S_R) = w(S_1) = w(S_2)$. Since $S_L$ is a vertex separator of $U_{\alpha}$ and $S_2$, $S_L \neq S_1$. However, $S_L$ is a vertex separator of $U_{\alpha}$ and $S_1$, which contradicts our choice of $S_1$.
		
		Thus, there must be a leftmost minimum vertex separator of $\alpha$. Following similar logic, there must be a rightmost minimum vertex separator of $\alpha$ as well.
	\end{proof}
	\subsection{Proofs with Canonical Maps}\label{canonicalmapsection}
	In this section, we give alternative proofs of Lemmas \ref{lm:morthsimplereexpression} and \ref{lm:singleshapeintersections} using canonical maps.
	\begin{definition}[Canonical Maps]
		For each shape $\alpha$ and each ribbon $R$ of shape $\alpha$, we arbitrarily choose a canonical map $\varphi_R: V(\alpha) \rightarrow V(R)$ such that $\varphi_R(H_{\alpha}) = H_R$, $\varphi_{R}(U_{\alpha}) = A_R$, and $\varphi_{R}(V_{\alpha}) = B_R$. Note that there are $|Aut(\alpha)|$ possible choices for this map.
	\end{definition}
	\subsection{Proof of Lemma \ref{lm:morthsimplereexpression}}
	\begin{lemma}
		\[
		M^{orth}_{\tau}(H) = \sum_{\sigma \in Row(H),\sigma' \in Col(H)}{H(\sigma,\sigma')|Decomp(\sigma,\tau,{\sigma'}^T)|M_{\sigma \circ \tau \circ {\sigma'}^T}}
		\]
	\end{lemma}
	\begin{proof}
		Observe that there is a bijection between ribbons $R$ with shape $\sigma \circ \tau \circ {\sigma'}^T$ together with an element $\pi \in Decomp(\sigma,\tau,\sigma')$ and triples of ribbons $(R_1,R_2,R_3)$ such that
		\begin{enumerate}
			\item $R_1,R_2,R_3$ have shapes $\sigma$, $\tau$, and ${\sigma'}^T$, respectively.
			\item $V(R_1) \cap V(R_2) = A_{R_2} = B_{R_1}$,  $V(R_2) \cap V(R_3) = A_{R_3} = B_{R_2}$, and $V(R_1) \cap V(R_3) = A_{R_2} \cap B_{R_2}$
		\end{enumerate}
		To see this, note that given such ribbons $R_1,R_2,R_3$, the ribbon $R = R_1 \circ R_2 \circ R_3$ has shape $\sigma \circ \tau \circ {\sigma'}^T$. Further note that we have two bijective maps from $V(\sigma \circ \tau \circ {\sigma'}^T)$ to $V(R)$. The first map is $\varphi_R$. The second map is $\varphi_{R_1} \circ \varphi_{R_2} \circ \varphi_{R_3}$. Using this, we can take $\pi = \varphi^{-1}_R(\varphi_{R_1} \circ \varphi_{R_2} \circ \varphi_{R_3})$
		
		Conversely, given a ribbon $R$ of shape $\sigma \circ \tau \circ {\sigma'}^T$ and an element $\pi \in Decomp(\sigma,\tau,\sigma')$, let $R_1 = \varphi_R(\pi(\sigma))$, let $R_2 = \varphi_R(\pi(\tau))$, and let $R_3 = \varphi_R(\pi({\sigma'}^T))$. Note that this is well defined because for any element $\pi' \in Aut(\sigma) \times Aut(\tau) \times Aut({\sigma'}^T)$, $\varphi_R(\pi\pi'(\sigma)) = \varphi_R(\pi(\pi'(\sigma))) = \varphi_R(\pi(\sigma))$. Similarly, $\varphi_R(\pi\pi'(\tau)) = \varphi_R(\pi(\tau))$ and $\varphi_R(\pi\pi'({\sigma'}^T)) = \varphi_R(\pi({\sigma'}^T))$.
		
		To confirm that this is a bijection, we have to show that these two maps are inverses of each other. Given $R_1$, $R_2$, and $R_3$, applying these two maps gives us ribbons $R'_1 = \varphi_R\varphi^{-1}_R(\varphi_{R_1} \circ \varphi_{R_2} \circ \varphi_{R_3})(H_{\sigma}) = R_1$, $R'_2 = \varphi_R\varphi^{-1}_R(\varphi_{R_1} \circ \varphi_{R_2} \circ \varphi_{R_3})(H_{\tau}) = R_2$, and $R'_3 = \varphi_R\varphi^{-1}_R(\varphi_{R_1} \circ \varphi_{R_2} \circ \varphi_{R_3})(H_{{\sigma'}^T}) = R_3$. Conversely, given $R$ and an element $\pi \in Decomp(\sigma,\tau,\sigma')$ (which we represent by an element $\pi \in Aut(\sigma \circ \tau \circ {\sigma'}^T)$), applying these two maps gives us the ribbon 
		\[
		R' = \varphi_R(\pi(\sigma)) \circ \varphi_R(\pi(\tau)) \circ \varphi_R(\pi({\sigma'}^T)) = {\varphi_R}\pi(\sigma \circ \tau \circ {\sigma'}^T) = R
		\]
		and gives us the map 
		\[
		\varphi^{-1}_R(\varphi_{\varphi_R(\pi(\sigma))} \circ \varphi_{\varphi_R(\pi(\tau))} \circ \varphi_{\varphi_R(\pi({\sigma'}^T))})
		\]
		Now observe that both ${\varphi_R}\pi$ and $\varphi_{\varphi_R(\pi(\sigma))}$ give bijective maps from $\sigma$ to the ribbon ${\varphi_R}\pi(\sigma)$ so $\varphi^{-1}_{\varphi_R(\pi(\sigma))}{\varphi_R}\pi \in Aut(\sigma)$. Following similar logic for $\tau$ and ${\sigma'}^T$, in $Decomp(\sigma,\tau,\sigma')$ this map is equivalent to $
		\varphi^{-1}_R({\varphi_R}\pi) = \pi$
	\end{proof}
	\subsection{Proof of Lemma \ref{lm:singleshapeintersections}}
	\begin{definition}[Rigorous definition of intersection patterns]
		We define an intersection pattern $P$ on composable shapes $\gamma,\tau,{\gamma'}^T$ to consist of the shape $\gamma \circ \tau \circ {\gamma'}^T$ together with a non-empty set of constraint edges $E(P)$ on $V(\gamma \circ \tau \circ {\gamma'}^T)$ such that:
		\begin{enumerate}
			\item For all vertices $u,v,w \in V(\gamma \circ \tau \circ {\gamma'}^T)$, if $(u,v),(v,w) \in E(P)$ then $(u,w) \in E(P)$
			\item $E(P)$ does not contain a path between two vertices of $\gamma$, two vertices of $\tau$, or two vertices of ${\gamma'}^T$. This ensures that when we consider $\gamma,\tau,\gamma'$ individually, their vertices are distinct.
			\item Defining $V_{*}(\gamma) \subseteq V(\gamma)$ to be the vertices of $\gamma$ which are incident to an edge in $E(P)$, $U_{\gamma}$ is the unique minimum-weight vertex separator between $U_{\gamma}$ and $V_{*}(\gamma) \cup V_{\gamma}$
			\item Similarly, defining $V_{*}({\gamma'}^T) \subseteq V({\gamma'}^T)$ to be the vertices of ${\gamma'}^T$ which are incident to an edge in $E(P)$, $V_{{\gamma'}^T}$ is the unique minimum-weight vertex separator between $V_{*}({\gamma'}^T) \cup U_{{\gamma'}^T}$ and $V_{{\gamma'}^T}$
			\item[5.*] All edges in $E(P)$ are between vertices of the same type.
		\end{enumerate}
	\end{definition}
	\begin{definition}
		We say that two intersection patterns $P,P'$ on shapes $\gamma,\tau,{\gamma'}^T$ are equivalent (which we write as $P \equiv P'$) if there is an automorphism $\pi \in Aut(\gamma) \times Aut(\tau) \times Aut({\gamma'}^T)$ such that $\pi(P) = P'$ (i.e. if $E(P)$ and $E(P')$ are the constraint edges for $P$ and $P'$ respectively then $\pi(E(P)) = E(P')$).
	\end{definition}
	\begin{definition}
		Given composable shapes $\gamma,\tau,{\gamma'}^T$, we define $\mathcal{P}_{\gamma,\tau,{\gamma'}^T}$ to be the set of all possible intersection patterns $P$ on $\gamma,\tau,{\gamma'}^T$ (up to equivalence)
	\end{definition}
	\begin{definition}
		Given composable (but not properly composable) ribbons $R_1$, $R_2$, $R_3$ of shapes $\gamma, \tau, {\gamma'}^T$, we define the intersection pattern $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$ induced by $R_1,R_2,R_3$ as follows:
		\begin{enumerate}
			\item Take the canonical maps $\varphi_{R_1}: V(\gamma) \rightarrow V(R_1)$, $\varphi_{R_2}: V(\tau) \rightarrow V(R_2)$, and $\varphi_{R_3}: V({\gamma'}^T) \rightarrow V(R_3)$
			\item Given vertices $u \in V(\gamma)$ and $v \in V(\tau)$, add a constraint edge between $u$ and $v$ if and only if $\varphi_{R_1}(u) = \varphi_{R_2}(v)$. Similarly, given vertices $u \in V(\gamma)$ and $w \in V({\gamma'}^T)$, add a constraint edge between $u$ and $w$ if and only if $\varphi_{R_1}(u) = \varphi_{R_3}(w)$ and given vertices $v \in V(\tau)$ and $w \in V({\gamma'}^T)$, add a constraint edge between $v$ and $w$ if and only if $\varphi_{R_2}(v) = \varphi_{R_3}(w)$.
		\end{enumerate}
	\end{definition}
	\begin{definition}
		Given an intersection pattern $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$, we define $V(\gamma \circ \tau \circ {\gamma'}^T)/E(P)$ to be $V(\gamma \circ \tau \circ {\gamma'}^T)$ where all of the edges in $E(P)$ are contracted (i.e. if $(u,v) \in E(P)$ then $u = v$ and $u = v$ only appears once).
	\end{definition}
	\begin{definition}
		Given an intersection pattern $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$, we define $\tau_{P}$ to be the shape such that:
		\begin{enumerate}
			\item $V(H_{\tau_P}) = V(\gamma \circ \tau \circ {\gamma'}^T)/E(P)$
			\item $E(H_{\tau_P}) = E(\gamma) \cup E(\tau) \cup E({\gamma'}^{T})$
			\item $U_{\tau_P} = U_{\gamma}$
			\item $V_{\tau_P} = V_{{\gamma'}^T}$
		\end{enumerate}
	\end{definition}
	\begin{definition}
		Given an intersection pattern $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$, we make the following definitions:
		\begin{enumerate}
			\item We define $Aut(P) = \{\pi \in Aut(\gamma \circ \tau \circ {\gamma'}^T): \pi(E(P)) = E(P)\}$
			\item We define $Aut_{pieces}(P) = \{\pi \in Aut(\gamma) \times Aut(\tau) \times Aut({\gamma'}^T): \pi(E(P)) = E(P)\}$
			\item We define $N(P) = |Aut(P)/Aut_{pieces}(P)|$
		\end{enumerate}
	\end{definition}
	\begin{lemma}
		For all composable $\sigma$, $\tau$, and ${\sigma'}^T$ (including improper $\tau$), 
		\begin{align*}
			&M^{fact}_{\tau}(e_{\sigma}e^T_{\sigma'}) - M^{orth}_{\tau}(e_{\sigma}e^T_{\sigma'}) = \sum_{\sigma_2, \gamma: \gamma \text{ is non-trivial }, \atop \sigma_2 \cup \gamma = \sigma}{\frac{1}{|Aut(U_{\gamma})|}\sum_{P \in \mathcal{P}_{\gamma,\tau,Id_{V_{\tau}}}}N(P)M^{orth}_{\tau_P}(e_{\sigma_2}e^T_{\sigma'})} \\
			&+ \sum_{\sigma'_2, \gamma': \gamma' \text{ is non-trivial }, \atop \sigma'_2 \cup \gamma' = \sigma'}{\frac{1}{|Aut(U_{\gamma'})|}\sum_{P \in \mathcal{P}_{Id_{U_{\tau}},\tau,{\gamma'}^T}}N(P)M^{orth}_{\tau_P}(e_{\sigma}e^T_{\sigma'_2})} \\
			&+ \sum_{\sigma_2, \gamma: \gamma \text{ is non-trivial }, \atop \sigma_2 \cup \gamma = \sigma}{\sum_{\sigma'_2, \gamma': \gamma' \text{ is non-trivial }, \atop \sigma'_2 \cup \gamma' = \sigma'}{
					\frac{1}{|Aut(U_{\gamma})|\cdot|Aut(U_{\gamma'})|}\sum_{P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}}N(P)M^{orth}_{\tau_P}(e_{\sigma_2}e^T_{\sigma'_2})}}
		\end{align*}
	\end{lemma}
	\begin{proof}
		This lemma follows from the following bijection. Consider the third term
		\[
		\sum_{\sigma_2, \gamma: \gamma \text{ is non-trivial }, \atop \sigma_2 \cup \gamma = \sigma}{\sum_{\sigma'_2, \gamma': \gamma' \text{ is non-trivial }, \atop \sigma'_2 \cup \gamma' = \sigma'}{
				\frac{1}{|Aut(U_{\gamma})|\cdot|Aut(U_{\gamma'})|}\sum_{P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}}N(P)M^{orth}_{\tau_P}(e_{\sigma_2}e^T_{\sigma'_2})}}
		\]
		On one side, we have the following data:
		\begin{enumerate}
			\item Ribbons $R_1$, $R_2$, and $R_3$ such that 
			\begin{enumerate}
				\item $R_1,R_2,R_3$ have shapes $\sigma$, $\tau$, and ${\sigma'}^T$, respectively.
				\item $A_{R_2} = B_{R_1}$ and $A_{R_3} = B_{R_2}$
				\item $\left(V(R_1) \cup V(R_2)\right) \cap V(R_3) \neq A_{R_3}$ and $\left(V(R_2) \cup V(R_3)\right) \cap V(R_1) \neq B_{R_1}$
			\end{enumerate}
			\item An ordering $O_{S'}$ on the leftmost minimum vertex separator $S'$ between $A_{R_1}$ and $V_{*} \cup B_{R_1}$.
			\item An ordering $O_{T'}$ on the rightmost minimum vertex separator $T'$ between $V_{*} \cup A_{R_3}$ and $B_{R_3}$.
		\end{enumerate}
		On the other side, we have the following data
		\begin{enumerate}
			\item An intersection pattern $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$ where $\gamma$ and ${\gamma'}^T$ are non-trivial.
			\item Ribbons $R'_1$, $R'_2$, $R'_3$ of shapes $\sigma_2$, $\tau_P$, ${\sigma'_2}^T$ such that $V(R'_1) \cap V(R'_2) = A_{R'_2} = B_{R'_1}$, $V(R'_2) \cap V(R'_3) = B_{R'_2} = A_{R'_3}$, and $V(R'_1) \cap V(R'_3) = A_{R'_2} \cap B_{R'_2}$
			\item An element $\pi \in Aut(P)/Aut_{pieces}(P)$
		\end{enumerate}
		To see this bijection, given $R_1,R_2,R_3$, we again implement our strategy for analyzing intersection terms. Recall that $V_{*}$ is the set of vertices in $V(R_1) \cup V(R_2) \cup V(R_3)$ which have an unexpected equality with another vertex, $S'$ is the leftmost minimum vertex separator between $A_{R_1}$ and $B_{R_1} \cup V_{*}$, and $T'$ is the rightmost minimum vertex separator between $A_{R_3} \cup V_{*}$ and $B_{R_3}$.
		\begin{enumerate}
			\item Decompose $R_1$ as $R_1 = {R'}_1 \circ R_4$ where ${R'}_1$ is the part of $R_1$ between $A_{R_1}$ and $(S',O_{S'})$ and $R_4$ is the part of $R_1$ between $(S',O_{S'})$ and $B_{R_1} = A_{R_2}$. Decompose $R_3$ as $R_5 \cup R'_3$ where $R_5$ is the part of $R_3$ between $A_{R_3}$ and $(T',O_{T'})$ and $R'_3$ is the part of $R_3$ between $(T',O_{T'})$ and $B_{R_3}$
			\item Take the intersection pattern $P$ and the ribbon $R'_2$ induced by $R_4$, $R_2$, and $R_5$.
			\item Observe that we have two bijective maps from $V(\gamma \circ \tau \circ {\gamma'}^T)/E(P)$ to $V(R_4) \cup V(R_2) \cup V(R_5)$. The first map is $\varphi_{R_4} \circ \varphi_{R_2} \circ \varphi_{R_5}$ and the second map is $\varphi_{R'_2}$. We take $\pi = \varphi^{-1}_{R'_2}(\varphi_{R_4} \circ \varphi_{R_2} \circ \varphi_{R_5})$.
		\end{enumerate}
		Conversely, given an intersection pattern $P \in \mathcal{P}_{\gamma,\tau,{\gamma'}^T}$, $R'_1$, $R'_2$, $R'_3$, and an element $\pi \in Aut(P)/Aut_{pieces}(P)$:
		\begin{enumerate}
			\item Take $R_4 = \varphi_{R'_2}\pi(V(\gamma))$, $R_2 = \varphi_{R'_2}\pi(V(\tau))$, and $R_5 = \varphi_{R'_2}\pi(V({\gamma'}^T))$.
			\item Take $R_1 = R'_1 \cup R_4$ and take $R_3 = R_5 \cup R'_3$.
			\item Take $O_S$ and $O_T$ based on $B_{R'_1} = A_{R_4}$ and $B_{R_5} = A_{R'_3}$.
		\end{enumerate}
		To confirm that this is a bijection, we need to show that these maps are inverses of each other.
		
		If we apply the first map and then the second, we obtain the following:
		\begin{enumerate}
			\item We obtain the ribbons 
			\begin{enumerate}
				\item $R''_1 = R'_1 \circ \varphi_{R'_2}\varphi^{-1}_{R'_2}(\varphi_{R_4} \circ \varphi_{R_2} \circ \varphi_{R_5})(V(\gamma))$
				\item $R''_2 = \varphi_{R'_2}\varphi^{-1}_{R'_2}(\varphi_{R_4} \circ \varphi_{R_2} \circ \varphi_{R_5})(V(\tau))$
				\item $R''_3 = \varphi_{R'_2}\varphi^{-1}_{R'_2}(\varphi_{R_4} \circ \varphi_{R_2} \circ \varphi_{R_5})(V({\gamma'}^T)) \circ R'_3$
			\end{enumerate} 
			where 
			\begin{enumerate}
				\item $R'_1$ is the part of $R_1$ between $A_{R_1}$ and $(S',O_{S'})$ where $S'$ is the minimum vertex separator between $A_{R_1}$ and $V_{*} \cup B_{R_1}$.
				\item $R_4$ is the part of $R_1$ between $(S',O_{S'})$ and $B_{R_1}$
				\item $R'_2$ is the ribbon of shape $\tau_{P}$ induced (along with the intersection pattern $P$) by $R_4$, $R_2$, and $R_5$.
				\item $R_5$ is the part of $R_3$ between $A_{R_3}$ and $(T',O_{T'})$.
				\item $R'_3$ is the part of $R_3$ between $(T',O_{T'})$ and $B_{R_3}$
			\end{enumerate}
			This implies that $R''_1 = R'_1 \circ R_4 = R_1$, $R''_2 = R_2$, and $R''_3 = R_5 \circ R'_3 = R_3$. Since the second map leaves $R'_1$ and $R'_3$ unchanged, we recover the orderings $O_S$ and $O_T$ as well.
		\end{enumerate}
		
		Conversely, if we apply the second map, we have that $R_1 = R'_1 \circ \varphi_{R'_2}\pi(V(\gamma))$, $R_2 = \varphi_{R'_2}\pi(V(\tau))$, and $R_3 = \varphi_{R'_2}\pi(V({\gamma'}^T)) \circ R'_3$ and we have the orderings $O_S$ and $O_T$ corresponding to $B_{R'_1}$ and $A_{R'_3}$ respectively. If we apply the first map, 
		\begin{enumerate}
			\item $R'_1$ and $R'_3$ are preserved.
			\item $R''_2$ and $P''$ are the ribbon and intersection pattern induced by the ribbons $\varphi_{R'_2}\pi(\gamma)$, $\varphi_{R'_2}\pi(\tau)$, and $\varphi_{R'_2}\pi({\gamma'}^T)$. To see that $R''_2 = R'_2$, observe that 
			\[
			R''_2 = \varphi_{R'_2}\pi(V(\gamma)) \circ \varphi_{R'_2}\pi(V(\tau)) \circ \varphi_{R'_2}\pi(V({\gamma'}^T)) = \varphi_{R'_2}{\pi(\gamma \circ \tau \circ {\gamma'}^T)} = \varphi_{R'_2}(\gamma \circ \tau \circ {\gamma'}^T) = R'_2
			\]
			To see that $P'' \equiv P$, observe that:
			\begin{enumerate}
				\item We have two bijective maps from $V(\gamma)$ to $V(\varphi_{R'_2}\pi(\gamma))$. These two maps are $\varphi_{R'_2}\pi$ and $\varphi_{\varphi_{R'_2}\pi(\gamma)}$.
				\item We have two bijective maps from $V(\tau)$ to $V(\varphi_{R'_2}\pi(\tau))$. These two maps are $\varphi_{R'_2}\pi$ and $\varphi_{\varphi_{R'_2}\pi(\tau)}$.
				\item We have two bijective maps from $V({\gamma'}^T)$ to $V(\varphi_{R'_2}\pi({\gamma'}^T))$. These two maps are $\varphi_{R'_2}\pi$ and $\varphi_{\varphi_{R'_2}\pi({\gamma'}^T)}$.
				\item For $P''$, the constraint edges are 
				\[
				\left(\varphi^{-1}_{\varphi_{R'_2}\pi(\gamma)}\varphi_{R'_2}\pi \circ \varphi^{-1}_{\varphi_{R'_2}\pi(\tau)}\varphi_{R'_2}\pi \circ \varphi^{-1}_{\varphi_{R'_2}\pi({\gamma'}^T)}\varphi_{R'_2}\pi\right)(E(P))
				\]
			\end{enumerate}
			\item We have that 
			\[
			\pi'' = \varphi^{-1}_{R'_2}(\varphi_{\varphi_{R'_2}\pi(V(\gamma))} \circ \varphi_{\varphi_{R'_2}\pi(V(\tau))} \circ \varphi_{\varphi_{R'_2}\pi(V({\gamma'}^T))})
			\]
			To see that $\pi'' \equiv \pi$, note that 
			\[
			\pi = \pi''\left(\varphi^{-1}_{\varphi_{R'_2}\pi(V(\gamma))}\varphi_{R'_2}\pi \circ \varphi^{-1}_{\varphi_{R'_2}\pi(V(\tau))}\varphi_{R'_2}\pi \circ \varphi^{-1}_{\varphi_{R'_2}\pi(V({\gamma'}^T))}\varphi_{R'_2}\pi\right)
			\]
		\end{enumerate}
		The analysis for the first term is the same except that when $\gamma'$ is trivial, we always take $\gamma'$ to be the identity so $T = V(V_{\tau}) = V(U_{{\sigma'}^T})$ and the ordering $O_{T}$ is given by $V_{\tau} = U_{{\sigma'}^T}$. 
		Similarly, the analysis for the second term is the same except that when $\gamma$ is trivial, we always take $\gamma$ to be the identity so $S = V(V_{\sigma}) = V(U_{\tau})$ and the ordering $O_{S}$ is given by $V_{\sigma} = U_{\tau}$.
	\end{proof}

\section{Degree 4 Planted Clique Analysis}\label{sec: deg_4_planted_clique}
For this example, we name the shapes based on what they look like to make them easier to keep track of. With the exception of $Id_{U}$, these names only appear in this section.
\subsection{The shapes $\alpha$ and coefficients $\lambda_{\alpha}$}
After several preprocessing steps, the moment matrix which needs to be analyzed is $M \approx \sum_{\alpha}{\lambda_{\alpha}M_{\alpha}}$ for the following shapes $\alpha$ and coefficients $\lambda_{\alpha}$
\begin{definition} \ 
\begin{enumerate}
\item Given $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$, we define $\alpha_{E}$ to be the shape where $U_{\alpha_E} = (u_1,u_2)$, $V_{\alpha_E} = (v_1,v_2)$, and $E(\alpha_E) = E$.
\item Given $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$, we define $\alpha_{X,E}$ to be the shape where $U_{\alpha_{X,E}} = (u_1,u_2)$, $V_{\alpha_{X,E}} = (v_1,v_2)$, there is one additional vertex $w_1$, and $E(\alpha_{X,E}) = E \cup \{(u_1,w_1), (u_2,w_1), (w_1,v_1), (w_1,v_2)\}$.
\item Given $i,j \in \{1,2\}$, we define $\alpha_{u_i = v_j,e}$ to be the shape where $U_{\alpha_{u_i = v_j,e}} = (u_1,u_2)$, $V_{\alpha_{u_i = v_j,e}} = (v_1,v_2)$, $u_i = v_j$, and $E(\alpha_{u_i = v_j,e}) = \{(u_{3-i},v_{3-j})\}$.
\item Given $i,j \in \{1,2\}$, we define $\alpha_{u_i = v_j,\emptyset}$ to be the shape where $U_{\alpha_{u_i = v_j,\emptyset}} = (u_1,u_2)$, $V_{\alpha_{u_i = v_j,\emptyset}} = (v_1,v_2)$, $u_i = v_j$, and $E(\alpha_{u_i = v_j,\emptyset}) = \emptyset$.
\item We define $\alpha_{Id:} = Id_{(u_1,u_2)}$ to be the shape where $U_{Id_{(u_1,u_2)}} = V_{Id_{(u_1,u_2)}} = (u_1,u_2)$ and $E(Id_{(u_1,u_2)}) = \emptyset$.
\item We define $\alpha_{swap}$ to be the shape where $U_{\alpha_{swap}} = (u_1,u_2)$, $V_{\alpha_{swap}} = (u_2,u_1)$ and $E(\alpha_{swap}) = \emptyset$.
\end{enumerate}
\end{definition}
For illustrations of these shapes $\alpha$, see Figures~\ref{zerointersectionalphasfigure}, \ref{oneintersectionalphasfigure}, and~\ref{twointersectionalphasfigure}.\\
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/ZeroIntersectionAlphas}}
\caption{This figure shows the shapes $\alpha$ where $|U_{\alpha} \cap V_{\alpha}| = 0$. On the left we have $\alpha_{E}$ and on the right we have $\alpha_{X,E}$.}
\label{zerointersectionalphasfigure}
\end{figure}
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/OneIntersectionAlphas}}
\caption{This figure shows the shapes $\alpha$ where $|U_{\alpha} \cap V_{\alpha}| = 1$. From left to right, we have $\alpha_{u_2 = v_2,\emptyset}$ and $\alpha_{u_2 = v_2,e}$, $\alpha_{u_2 = v_1,\emptyset}$ and $\alpha_{u_2 = v_1,e}$, $\alpha_{u_1 = v_1,\emptyset}$ and $\alpha_{u_1 = v_1,e}$, and $\alpha_{u_1 = v_2,\emptyset}$ and $\alpha_{u_1 = v_2,e}$.}
\label{oneintersectionalphasfigure}
\end{figure}
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/TwoIntersectionAlphas}}
\caption{This figure shows the shapes $\alpha$ where $|U_{\alpha} \cap V_{\alpha}| = 2$. On the left we have $\alpha_{Id:}$ and on the right we have $\alpha_{swap}$.}
\label{twointersectionalphasfigure}
\end{figure}
We have the following coefficients on these shapes.
\begin{enumerate}
\item For each $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$, $\lambda_{\alpha_E} = \frac{k^4}{n^4}$.
\item For each $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$, $\lambda_{\alpha_{X,E}} = C\frac{k^5}{n^5}$ for some constant $C > 1$. These coefficients are the ad-hoc fix to the candidate pseudo-expectation values for planted clique in \cite{meka2015sum}.
\item For each $i,j \in \{1,2\}$, $\lambda_{\alpha_{u_i = v_j,e}} = \lambda_{\alpha_{u_i = v_j,\emptyset}} = \frac{k^3}{n^3}$.
\item $\lambda_{\alpha_{Id:}} = \lambda_{\alpha_{swap}} = \frac{k^2}{n^2}$
\end{enumerate}
\subsection{Decomposing $\alpha$ and coefficient matrices}
To find the coefficient matrices $H_{Id_{\emptyset}}$, $H_{Id_{(u_1)}}$, $H_{Id_{(u_1,u_2)}}$, and $H_{\tau}$, we need to decompose each $\alpha$ into a left part $\sigma$, a proper middle part $\tau$, and a right part ${\sigma'}^T$. 

The following left shapes will appear in these decompositions
\begin{definition} \ 
\begin{enumerate}
\item Define $\sigma_{Id:} = Id_{(u_1,u_2)}$. Note that $\sigma_{Id:} = \alpha_{Id:}$ but it is playing a different role.
\item Define $\sigma_{swap} = \alpha_{swap}$.
\item Define $\sigma_7$ to be the shape where $U_{\sigma_{7}} = (u_1,u_2)$, $V_{\sigma_{7}} = (v_1)$, and $E(\sigma_{7}) = \{(u_1,v_1),(u_2,v_1)\}$.
\item Define $\sigma_{u_1,u_2 \rightarrow u_1}$ to be the shape where $U_{\sigma_{u_1,u_2 \rightarrow u_1}} = (u_1,u_2)$, $V_{\sigma_{u_1,u_2 \rightarrow u_1}} = (u_1)$, and $E(\sigma_{u_1,u_2 \rightarrow u_1}) = \emptyset$.
\item Similarly, define $\sigma_{u_1,u_2 \rightarrow u_2}$ to be the shape where $U_{\sigma_{u_1,u_2 \rightarrow u_2}} = (u_1,u_2)$, $V_{\sigma_{u_1,u_2 \rightarrow u_2}} = (u_2)$, and $E(\sigma_{u_1,u_2 \rightarrow u_2}) = \emptyset$.
\item Define $\sigma_{u_1,u_2 \rightarrow \emptyset}$ to be the shape where $U_{\sigma_{u_1,u_2 \rightarrow \emptyset}} = (u_1,u_2)$, $V_{\sigma_{u_1,u_2 \rightarrow \emptyset}} = \emptyset$, and $E(\sigma_{u_1,u_2 \rightarrow \emptyset}) = \emptyset$.
\end{enumerate}
\end{definition}
These left shapes are illustrated in Figures~\ref{onerightsidevertexsigmasfigure} and~\ref{zeroortworightvertexsigmasfigure}. \\
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/OneRightSideVertexSigmas}}
\caption{This figure shows the left shapes $\sigma$ where $|V_{\sigma}| = 1$. From left to right we have $\sigma_{u_1,u_2 \rightarrow u_1}$, $\sigma_{u_1,u_2 \rightarrow u_2}$, and $\sigma_7$.}
\label{onerightsidevertexsigmasfigure}
\end{figure}
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/ZeroorTwoRightVertexSigmas}}
\caption{This figure shows the left shapes $\sigma$ where $|V_{\sigma}| = 2$ or $|V_{\sigma}| = 0$. On the left we have $\sigma_{Id:}$ and $\sigma_{swap}$. On the right we have $\sigma_{u_1,u_2 \rightarrow \emptyset}$.}
\label{zeroortworightvertexsigmasfigure}
\end{figure}

The following proper middle shapes will appear in these decompositions.
\begin{definition} \ 
\begin{enumerate}
\item Define $\tau_{Id:} = Id_{(u_1,u_2)}$
\item Given $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$ such that all four vertices $u_1,u_2,v_1,v_2$ are incident to at least one edge in $E$, we define $\tau_{E} = \alpha_{E}$.
\item Given $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$ such that $E \neq \emptyset$, we define $\tau_{X,E} = \alpha_{X,E}$.
\item Given $i,j \in \{1,2\}$, we define $\tau_{u_i = v_j,e} = \alpha_{u_i = v_j,e}$.
\item Define $\tau_{Id\cdot} = Id_{(u_1)}$ to be the shape where $U_{Id_{(u_1)}} = V_{Id_{(u_1)}} = (u_1)$ and $E(Id_{(u_1)}) = \emptyset$.
\item Define $\tau_{e}$ to be the shape where $U_{\tau_{e}} = (u_1)$, $V_{\tau_{e}} = (v_1)$, and $E(\tau_{e}) = \{(u_1,v_1)\}$.
\item Define $\tau_{\emptyset}$ to be the empty shape with no vertices. 
\end{enumerate}
\end{definition}
These proper middle shapes (except for $\tau_{\emptyset}$) are illustrated in Figures~\ref{twovertexseparatortausfigure} and~\ref{onevertexseparatortausfigure}. \\
\begin{figure}[ht]
\centerline{\includegraphics[height=8cm]{machinery/images/TwoVertexSeparatorTaus}}
\caption{This figure shows the proper middle shapes $\tau$ where $|U_{\tau}| = |V_{\tau}| = 2$. In the upper row, we have $\tau_{Id:}$, $\tau_{E}$, and $\tau_{X,E}$. In the bottom row, we have  $\tau_{u_2 = v_2,e}$, $\tau_{u_2 = v_1,e}$, $\tau_{u_1 = v_1,e}$, and $\tau_{u_1 = v_2,e}$.}
\label{twovertexseparatortausfigure}
\end{figure}
\begin{figure}[ht]
\centerline{\includegraphics[height=3cm]{machinery/images/OneVertexSeparatorTaus}}
\caption{This figure shows the proper middle shapes $\tau$ where $|U_{\tau}| = |V_{\tau}| = 1$. On the left we have $\tau_{Id\cdot}$ and on the right we have $\tau_{e}$.}
\label{onevertexseparatortausfigure}
\end{figure}

Some decompositions are as follows:
\begin{example} \ 
\begin{enumerate}
\item $\alpha_{X,\emptyset} = \sigma_7 \circ \tau_{Id\cdot} \circ \sigma_7^T$ (see Figure \ref{decompositiononefigure}).
\item $\alpha_{\{(u_1,v_1),(u_2,v_1)\}} = \sigma_7 \circ \tau_{Id\cdot} \circ \sigma_{u_1,u_2 \rightarrow u_1}^T$ (see Figure \ref{decompositiontwofigure}).
\item $\alpha_{\{(u_2,v_1)\}} = \sigma_{u_1,u_2 \rightarrow u_2} \circ \tau_{e} \circ \sigma_{u_1,u_2 \rightarrow u_1}^T$ (see Figure \ref{decompositionthreefigure}).
\item \begin{align*}
\alpha_{\{(u_1,v_1),(u_2,v_2)\}} &= \sigma_{Id:} \circ \alpha_{\{(u_1,v_1),(u_2,v_2)\}} \circ \sigma_{Id:}^T = \sigma_{Id:} \circ \alpha_{\{(u_1,v_2),(u_2,v_1)\}} \circ \sigma_{swap}^T \\
&= \sigma_{swap} \circ \alpha_{\{(u_1,v_2),(u_2,v_1)\}} \circ \sigma_{Id:}^T = \sigma_{swap} \circ \alpha_{\{(u_1,v_1),(u_2,v_2)\}} \circ \sigma_{swap}^T
\end{align*}
\end{enumerate}
\end{example}
\begin{remark}
Since there are $4$ different ways to decompose $\alpha_{\{(u_1,v_1),(u_2,v_2)\}}$, we split the coefficient $\lambda_{\alpha_{\{(u_1,v_1),(u_2,v_2)\}}}$ among these four decompositions. This is the reason for the factor of $4$ in the denominator in the entries of the matrix $H_{\tau}$.
\end{remark}
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/DecompositionOne}}
\caption{This figure shows the decomposition $\alpha_{X,\emptyset} = \sigma_7 \circ \tau_{Id\cdot} \circ \sigma_7^T$.}
\label{decompositiononefigure}
\end{figure}
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/DecompositionTwo}}
\caption{This figure shows the decomposition $\alpha_{\{(u_1,v_1),(u_2,v_1)\}} = \sigma_7 \circ \tau_{Id\cdot} \circ \sigma_{u_1,u_2 \rightarrow u_1}^T$.}
\label{decompositiontwofigure}
\end{figure}
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/DecompositionThree}}
\caption{This figure shows the decomposition $\alpha_{\{(u_2,v_1)\}} = \sigma_{u_1,u_2 \rightarrow u_2} \circ \tau_{e} \circ \sigma_{u_1,u_2 \rightarrow u_1}^T$.}
\label{decompositionthreefigure}
\end{figure}

Our coefficient matrices are as follows (ignoring zero rows and columns):
\begin{enumerate}
\item $H_{Id_{(u_1,u_2)}}$ has two rows and columns indexed by $\sigma_{Id:}$ and $\sigma_{swap}$ and has entries $\left(\begin{matrix}
\frac{k^2}{2n^2} & \frac{k^2}{2n^2}\\
\frac{k^2}{2n^2} & \frac{k^2}{2n^2}\\
\end{matrix}\right)$.
\item $H_{Id_{(u_1)}}$ has rows and columns indexed by $\sigma_{u_1,u_2 \rightarrow u_1}$, $\sigma_{u_1,u_2 \rightarrow u_2}$, and $\sigma_7$ and has entries
\[
\left(\begin{matrix}
\frac{k^3}{n^3} & \frac{k^3}{n^3} & \frac{k^4}{n^4}\\
\frac{k^3}{n^3} & \frac{k^3}{n^3} & \frac{k^4}{n^4}\\
\frac{k^4}{n^4} & \frac{k^4}{n^4} & C\frac{k^5}{n^5}\\
\end{matrix}\right) \]
\item $H_{Id_{\emptyset}}$ has a single row and column indexed by $\sigma_{u_1,u_2 \rightarrow \emptyset}$ and has a single entry which is $\frac{k^4}{n^4}$.
\item For all $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$ such that all four vertices $u_1,u_2,v_1,v_2$ are incident to at least one edge in $E$, $H_{\tau_{E}}$ has two rows and columns indexed by $\sigma_{Id:}$ and $\sigma_{swap}$ and has entries $\left(\begin{matrix}
\frac{k^4}{4n^4} & \frac{k^4}{4n^4}\\
\frac{k^4}{4n^4} & \frac{k^4}{4n^4}\\
\end{matrix}\right)$.
\item For all $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$ such that $E \neq \emptyset$,  $H_{\tau_{X,E}}$ has two rows and columns indexed by $\sigma_{Id:}$ and $\sigma_{swap}$ and has entries $\left(\begin{matrix}
\frac{k^5}{4n^5} & \frac{k^5}{4n^5}\\
\frac{k^5}{4n^5} & \frac{k^5}{4n^5}\\
\end{matrix}\right)$.
\item For all $i,j \in \{1,2\}$, $H_{\tau_{u_i = v_j,e}}$ has two rows and columns indexed by $\sigma_{Id:}$ and $\sigma_{swap}$ and has entries $\left(\begin{matrix}
\frac{k^3}{4n^3} & \frac{k^3}{4n^3}\\
\frac{k^3}{4n^3} & \frac{k^3}{4n^3}\\
\end{matrix}\right)$.
\item $H_{\tau_{e}}$ has rows and columns indexed by $\sigma_{u_1,u_2 \rightarrow u_1}$ and $\sigma_{u_1,u_2 \rightarrow u_2}$ and has entries 
\[
\left(\begin{matrix}
\frac{k^4}{n^4} & \frac{k^4}{n^4}\\
\frac{k^4}{n^4} & \frac{k^4}{n^4}\\
\end{matrix}\right)\]
\end{enumerate}
\subsection{Verifying the first and second conditions of the machinery}
We can verify the first and second conditions of the machinery as follows.
\begin{enumerate}
\item $H_{Id_{(u_1,u_2)}} \succeq 0$ and $H_{Id_{\emptyset}} \succeq 0$
\item As long as $C \geq 1$, $H_{Id_{(u_1)}} \succeq 0$. This condition is the reason why we need to add this term in.
\item For all $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$ such that all four vertices $u_1,u_2,v_1,v_2$ are incident to at least one edge in $E$, $||M_{\tau_{E}}||$ is $\tilde{O}(n)$ so $||M_{\tau_{E}}||H_{\tau_{E}} \preceq H_{Id_{(u_1,u_2)}}$ as long as $k << \sqrt{n}$.
\item For all $E \subseteq \{(u_1,v_1), (u_1,v_2), (u_2,v_1), (u_2,v_2)\}$ such that $E \neq \emptyset$, 
$||M_{\tau_{X,E}}||$ is $\tilde{O}(n^{3/2})$ so $||M_{\tau_{X,E}}||H_{\tau_{X,E}} \preceq H_{Id_{(u_1,u_2)}}$ as long as $k << \sqrt{n}$.
\item For all $i,j \in \{1,2\}$, $||M_{\tau_{u_i = v_j,e}}||$ is $\tilde{O}(\sqrt{n})$ so $||M_{\tau_{u_i = v_j,e}}||H_{\tau_{u_i = v_j,e}} \preceq H_{Id_{(u_1,u_2)}}$ as long as $k << \sqrt{n}$.
\item Since $\left(\begin{matrix}
\frac{k^3}{n^3} & \frac{k^3}{n^3} & \frac{k^4}{n^4}\\
\frac{k^3}{n^3} & \frac{k^3}{n^3} & \frac{k^4}{n^4}\\
\frac{k^4}{n^4} & \frac{k^4}{n^4} & C\frac{k^5}{n^5}\\
\end{matrix}\right) \succeq (1 - \frac{1}{C})\left(\begin{matrix}
\frac{k^3}{n^3} & \frac{k^3}{n^3} & 0\\
\frac{k^3}{n^3} & \frac{k^3}{n^3} & 0\\
0 & 0 & 0\\
\end{matrix}\right)$ and $||M_{\tau_{e}}||$ is $\tilde{O}(\sqrt{n})$, $||M_{\tau_{e}}||H_{\tau_{e}} \preceq H_{Id_{(u_1)}}$ as long as $C > 1$ and $k << \sqrt{n}$. Note that for pseudo-calibration we take $C = 1$. We can do this because we have more terms which allows us to have a more delicate factorization.
\end{enumerate}
\subsection{Verifying the third condition of the machinery}
The following left shapes $\gamma$ appear.
\begin{definition} \ 
\begin{enumerate}
\item Define $\gamma_7 = \sigma_7$.
\item Define $\gamma_{u_1,u_2 \rightarrow u_1} = \sigma_{u_1,u_2 \rightarrow u_1}$.
\item Define $\gamma_{u_1,u_2 \rightarrow u_2} = \sigma_{u_1,u_2 \rightarrow u_2}$.
\item Define $\gamma_{u_1,u_2 \rightarrow \emptyset} = \sigma_{u_1,u_2 \rightarrow \emptyset}$.
\item Define $\gamma_{u_1 \rightarrow \emptyset}$ to be the shape with $U_{\gamma_{u_1 \rightarrow \emptyset}} = (u_1)$ and $V_{\gamma_{u_1 \rightarrow \emptyset}} = \emptyset$
\end{enumerate}
\end{definition}
For illustrations of these gammas, see Figure \ref{possiblegammasfigure}. \\
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/PossibleGammas}}
\caption{This figure shows the left shapes $\gamma$ which appear in the analysis. From left to right we have $\gamma_{u_1,u_2 \rightarrow u_1}$, $\gamma_{u_1,u_2 \rightarrow u_2}$, $\gamma_7$, $\gamma_{u_1,u_2 \rightarrow \emptyset}$, and $\gamma_{u_1 \rightarrow \emptyset}$.}
\label{possiblegammasfigure}
\end{figure}
We have the following compositions.
\begin{enumerate}
\item $\sigma_{Id:} \circ \gamma_7 = \sigma_{swap} \circ \gamma_7 = \sigma_7$ (see Figure \ref{compositiononefigure}).
\item $\sigma_{Id:} \circ \gamma_{u_1,u_2 \rightarrow u_1} = \sigma_{u_1,u_2 \rightarrow u_1}$ and $\sigma_{swap} \circ \gamma_{u_1,u_2 \rightarrow u_1} = \sigma_{u_1,u_2 \rightarrow u_2}$ (see Figure \ref{compositiontwofigure}).
\item Similarly, $\sigma_{Id:} \circ \gamma_{u_1,u_2 \rightarrow u_2} = \sigma_{u_1,u_2 \rightarrow u_2}$ and $\sigma_{swap} \circ \gamma_{u_1,u_2 \rightarrow u_2} = \sigma_{u_1,u_2 \rightarrow u_1}$.
\item $\sigma_{Id:} \circ \gamma_{u_1,u_2 \rightarrow \emptyset} = \sigma_{swap} \circ \gamma_{u_1,u_2 \rightarrow \emptyset} = \sigma_{u_1,u_2 \rightarrow \emptyset}$.
\item $\sigma_{u_1,u_2 \rightarrow u_1} \circ \gamma_{u_1 \rightarrow \emptyset} = \sigma_{u_1,u_2 \rightarrow u_2} \circ \gamma_{u_1 \rightarrow \emptyset} = \sigma_{u_1,u_2 \rightarrow \emptyset}$ (see Figure \ref{compositionthreefigure}).
\end{enumerate}
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/CompositionOne}}
\caption{This figure shows the compositions $\sigma_{Id:} \circ \gamma_{7} = \sigma_{7}$ and $\sigma_{swap} \circ \gamma_{7} = \sigma_{7}$.}
\label{compositiononefigure}
\end{figure}
\begin{figure}[ht]
\centerline{\includegraphics[height=8cm]{machinery/images/CompositionTwo}}
\caption{This figure shows the compositions $\sigma_{Id:} \circ \gamma_{u_1,u_2 \rightarrow u_1} = \sigma_{u_1,u_2 \rightarrow u_1}$ and $\sigma_{swap} \circ \gamma_{u_1,u_2 \rightarrow u_1} = \sigma_{u_1,u_2 \rightarrow u_2}$.}
\label{compositiontwofigure}
\end{figure}
\begin{figure}[ht]
\centerline{\includegraphics[height=4cm]{machinery/images/CompositionThree}}
\caption{This figure shows the compositions $\sigma_{u_1,u_2 \rightarrow u_1} \circ \gamma_{u_1 \rightarrow \emptyset} = \sigma_{u_1,u_2 \rightarrow \emptyset}$ and $\sigma_{u_1,u_2 \rightarrow u_2} \circ \gamma_{u_1 \rightarrow \emptyset} = \sigma_{u_1,u_2 \rightarrow \emptyset}$.}
\label{compositionthreefigure}
\end{figure}

Based on these compositions, we have the following matrices:
\begin{enumerate}
\item $H_{Id_{(u_1)}}^{-\gamma_7,\gamma_7}$ has two rows and columns indexed by $\sigma_{Id:}$ and $\sigma_{swap}$ and has entries $\left(\begin{matrix}
\frac{k^5}{n^5} & \frac{k^5}{n^5}\\
\frac{k^5}{n^5} & \frac{k^5}{n^5}\\
\end{matrix}\right)$.
\item $H_{Id_{(u_1)}}^{-\gamma_{u_1,u_2 \rightarrow u_1},\gamma_{u_1,u_2 \rightarrow u_1}}$ has two rows and columns indexed by $\sigma_{Id:}$ and $\sigma_{swap}$ and has entries $\left(\begin{matrix}
\frac{k^3}{n^3} & \frac{k^3}{n^3}\\
\frac{k^3}{n^3} & \frac{k^3}{n^3}\\
\end{matrix}\right)$.
\item $H_{Id_{(u_1)}}^{-\gamma_{u_1,u_2 \rightarrow u_2},\gamma_{u_1,u_2 \rightarrow u_2}} = H_{Id_{(u_1)}}^{-\gamma_{u_1,u_2 \rightarrow u_1},\gamma_{u_1,u_2 \rightarrow u_1}}$.
\item $H_{Id_{\emptyset}}^{-\gamma_{u_1,u_2 \rightarrow \emptyset},\gamma_{u_1,u_2 \rightarrow \emptyset}}$  has two rows and columns indexed by $\sigma_{Id:}$ and $\sigma_{swap}$ and has entries $\left(\begin{matrix}
\frac{k^4}{n^4} & \frac{k^4}{n^4}\\
\frac{k^4}{n^4} & \frac{k^4}{n^4}\\
\end{matrix}\right)$.
\item $H_{Id_{\emptyset}}^{-\gamma_{u_1 \rightarrow \emptyset},\gamma_{u_1 \rightarrow \emptyset}}$ has two rows and columns indexed by $\sigma_{u_1,u_2 \rightarrow u_1}$ and $\sigma_{u_1,u_2 \rightarrow u_2}$ and has entries $\left(\begin{matrix}
\frac{k^4}{n^4} & \frac{k^4}{n^4}\\
\frac{k^4}{n^4} & \frac{k^4}{n^4}\\
\end{matrix}\right)$.
\end{enumerate}
We can qualitatively verify the third condition of the machinery as follows
\begin{enumerate}
\item $B(\gamma_7)$ is $\tilde{O}(n^{\frac{|V(\gamma_7) \setminus U_{\gamma_7}|}{2}}) = \tilde{O}(\sqrt{n})$ so ${B(\gamma_7)^2}H_{Id_{(u_1)}}^{-\gamma_7,\gamma_7} \preceq H_{Id_{(u_1,u_2)}}$ as long as $k << n^{\frac{2}{3}}$.
\item $B(\gamma_{u_1,u_2 \rightarrow u_1})$ is $\tilde{O}(n^{\frac{|V(\gamma_{u_1,u_2 \rightarrow u_1}) \setminus U_{\gamma_{u_1,u_2 \rightarrow u_1}}|}{2}}) = \tilde{O}(1)$ so ${B(\gamma_{u_1,u_2 \rightarrow u_1})^2}H_{Id_{(u_1)}}^{-\gamma_{u_1,u_2 \rightarrow u_1},\gamma_{u_1,u_2 \rightarrow u_1}} \preceq H_{Id_{(u_1,u_2)}}$. Following the same logic, ${B(\gamma_{u_1,u_2 \rightarrow u_2})^2}H_{Id_{(u_1)}}^{-\gamma_{u_1,u_2 \rightarrow u_2},\gamma_{u_1,u_2 \rightarrow u_2}} \preceq H_{Id_{(u_1,u_2)}}$.
\item $B(\gamma_{u_1,u_2 \rightarrow \emptyset})$ is $\tilde{O}(n^{\frac{|V(\gamma_{u_1,u_2 \rightarrow \emptyset}) \setminus U_{\gamma_{u_1,u_2 \rightarrow \emptyset}}|}{2}}) = \tilde{O}(1)$ so ${B(\gamma_{u_1,u_2 \rightarrow \emptyset})^2}H_{Id_{\emptyset}}^{-\gamma_{u_1,u_2 \rightarrow \emptyset},\gamma_{u_1,u_2 \rightarrow \emptyset}} \preceq H_{Id_{(u_1,u_2)}}$.
\item $B(\gamma_{u_1 \rightarrow \emptyset})$ is $\tilde{O}(n^{\frac{|V(\gamma_{u_1 \rightarrow \emptyset}) \setminus U_{\gamma_{u_1 \rightarrow \emptyset}}|}{2}}) = \tilde{O}(1)$ so ${B(\gamma_{u_1 \rightarrow \emptyset})^2}H_{Id_{\emptyset}}^{-\gamma_{u_1 \rightarrow \emptyset},\gamma_{u_1 \rightarrow \emptyset}} \preceq H_{Id_{(u_1)}}$ as 
\[
\left(\begin{matrix}
\frac{k^3}{n^3} & \frac{k^3}{n^3} & \frac{k^4}{n^4}\\
\frac{k^3}{n^3} & \frac{k^3}{n^3} & \frac{k^4}{n^4}\\
\frac{k^4}{n^4} & \frac{k^4}{n^4} & C\frac{k^5}{n^5}\\
\end{matrix}\right) \succeq (1 - \frac{1}{C})\left(\begin{matrix}
\frac{k^3}{n^3} & \frac{k^3}{n^3} & 0\\
\frac{k^3}{n^3} & \frac{k^3}{n^3} & 0\\
0 & 0 & 0\\
\end{matrix}\right) \succeq \tilde{O}(1)\left(\begin{matrix}
\frac{k^4}{n^4} & \frac{k^4}{n^4} & 0\\
\frac{k^4}{n^4} & \frac{k^4}{n^4} & 0\\
0 & 0 & 0\\
\end{matrix}\right).
\]
\end{enumerate}	


\subsection{Shapes and graph matrices}

Consider the setting when the input distribution is a Rademacher $G_{n, 1/2}$ graph with the input entries being $\chi_e \in \{-1, 1\}$. For $T \subseteq \binom{[n]}{2}$, let $\chi_T = \prod_{e \in T} \chi_e$ be the standard Fourier basis. In this setting, shapes were already defined in \cref{chap: efron_stein}. Here, for technical reasons, we slightly modify the definitions so that the rows and columns are indexed by sub-tuples of $[n]$ rather than subsets of $[n]$. The techniques developed in \cref{chap: efron_stein} still carry over to bound the norms of such graph matrices.

\begin{definition}[Shapes in the setting of Rademacher $G_{n, 1/2}$ inputs]
    A shape $\alpha = (V(\alpha),E(\alpha),U_{\alpha},V_{\alpha})$ is a graph on vertices $V(\alpha)$ and edges $E(\alpha)$ with two distinguished tuples of vertices $U_{\alpha}, V_{\alpha} \subseteq V(\alpha)$. Note that $U_{\alpha}, V_{\alpha}$ are ordered subsets (tuples).
\end{definition}

As we saw earlier, we can define corresponding matrices for each shape, that are termed graph matrices. Recall that a realization is an injective map from $V(\alpha)$ to $[n]$. The main difference here, as compared to \cref{chap: efron_stein}, is that in the definition of graph matrices, we sum over realizations $\varphi$ that correspond to distinct characters, rather than all realizations $\varphi$.

To capture this notion precisely, we use the following definition. Define two realizations (injective maps from $V(\alpha)$ to $[n]$) $\varphi, \varphi'$ to be equivalent if $\varphi(U_{\alpha}) = \varphi'(U_{\alpha}), \varphi(V_{\alpha}) = \varphi'(V_{\alpha})$ as tuples and $\varphi(E(\alpha)) = \varphi'(E(\alpha))$ as sets. Let the set of non-equivalent realizations of $\alpha$ be denoted $\mathrm{Real}(\alpha)$.

\begin{definition}[Graph matrices in the setting of Rademacher $G_{n, 1/2}$ inputs]
    For a shape $\alpha$, the graph matrix $M_{\alpha}$ is a matrix-valued function with rows and columns indexed by sub-tuples of $[n]$ of sizes $|U_{\alpha}|, |V_{\alpha}|$ respectively, which is defined as follows: It maps input graph $G \in \{\pm 1\}^{\binom{n}{2}}$ (with associated Fourier characters $\chi_E$) to a matrix with the $A, B$-th entry being \[M_{\alpha}(A,B) = \sum_{\substack{\varphi(U_{\alpha}) = A, \varphi(V_{\alpha}) = B\\\varphi \in \mathrm{Real}(\alpha)}}{\chi_{\varphi(E(\alpha))}}\]

   
\end{definition}

\begin{definition}[Shape transposes]
    For a shape $\alpha = (V(\alpha),E(\alpha),U_{\alpha},V_{\alpha})$, define its transpose  $\alpha^T$ to be $\alpha^T = (V(\alpha),E(\alpha),V_{\alpha},U_{\alpha})$.
    Note that $M_{\alpha^T} = M_{\alpha}^T$ as matrix transpose.
\end{definition}

\begin{example}
    In \cref{fig: sample_shape}, consider the shape $\alpha$ as shown. We have $U_{\alpha} = (u_1, u_2), V_{\alpha} = (v_1), V(\alpha) = \{u_1, u_2, v_1, w_1\}$ and $E(\alpha) = \{\{u_1, w_1\}, \{u_2, w_1\}, \{w_1, v_1\}\}$. $M_{\alpha}$ is a matrix with rows and columns indexed by tuples of length $|U_{\alpha}| = 2$ and $|V_{\alpha}| = 1$ respectively. The nonzero entries will have rows and columns indexed by $(a_1, a_2)$ and $b_1$ respectively for all distinct $a_1, a_2, b_1$, with the corresponding entry being $M_{\alpha}((a_1, a_2), (b_1)) = \sum_{c_1 \in [n] \setminus \{a_1, a_2, b_1\}} \chi_{a_1, c_1}\chi_{a_2, c_1}\chi_{c_1, b_1}$. Here, the injective map $\varphi$ maps the vertices $u_1, u_2, w_1, v_1$ to $a_1, a_2, c_1, b_1$ respectively and we sum over all such maps (as they are all pairwise non-equivalent). Succinctly, \[M_{\alpha} =
    \begin{blockarray}{rl@{}c@{}r}
        & & \makebox[0pt]{column $(b_1)$} \\[-0.5ex]
        & & \,\downarrow \\[-0.5ex]
        \begin{block}{r(l@{}c@{}r)}
            &  & \vdots & \\[-0.2ex]
            \text{row }(a_1, a_2) \rightarrow \mkern-9mu & \raisebox{0.5ex}{\makebox[3.2em][l]{\dotfill}} & \sum_{c_1 \in [n] \setminus \{a_1, a_2, b_1\}} \chi_{a_1, c_1}\chi_{a_2, c_1}\chi_{c_1, b_1} & \raisebox{0.5ex}{\makebox[4.2em][r]{\dotfill}} \\[+.5ex]
            &  & \vdots & \\
        \end{block}
    \end{blockarray}\]
\end{example}

\begin{figure}[!h]
    \centering
    \includegraphics[scale=.3, trim={0 0 0 0},clip]{machinery/images/sample_shape}
    \caption{Example of a shape}
    \label{fig: sample_shape}
\end{figure}

Some simple matrices such as the adjacency matrix of a graph and the identity matrix are also graph matrices, as we see below:
\begin{itemize}
    \item Take $\alpha$ to be a shape with two vertices $u, v$ with $U_{\alpha} = (u), V_{\alpha} = (v)$ and exactly one edge $\{u, v\}$. Then, $M_{\alpha}$ has rows and columns indexed by $[n]$ (more specifically tuples of length $1$) with the $i, j$-th entry being $G_{ij}$ if $i \neq j$ and $0$ otherwise. Therefore, $M_{\alpha}$ is just the $\pm 1$ adjacency matrix of the graph $G$.
    \item Take $\alpha$ to be the shape with exactly $1$ vertex $u$, no edges and $U_{\alpha} = V_{\alpha} = (u)$. Then, $M_{\alpha}$ is the identity matrix of size $n \times n$.
\end{itemize}

For more examples of graph matrices and why they can be a useful tool to work with, see \cite{ahn2016graph}.
We now define some terms to capture the rows and columns of graph matrices.

\paragraph{Matrix indices and index shapes}
In the above setting of Rademacher $G_{n, 1/2}$ inputs, a matrix index $A$ is a tuple of indices $(a_1, \ldots, a_{|A|})$ where $a_i \in [n]$. When the SoS variables are $y_1, \ldots, y_n$, we associate to this matrix index $A$ the monomial $\prod_{i \le |A|} y_{a_i}$. With this definition, graph matrices have as rows and columns matrix indices.

Define an index shape $U = (u_1, \ldots, u_{|U|})$ to be a tuple of formal variables $u_i$, or in other words, unspecified indices. If $|U| = t$, we say that any matrix index $A$ of length $t$ has shape $U$. We say two index shapes $U, V$ are equivalent, denoted $U \equiv V$ if $|U| = |V|$. Finally, define the weight of $U$ to be $w(U) = |U|$ and the automorphism group $Aut(U) = S_{|U|}$ (the permutations of the elements of $U$). The latter definition is needed for describing coefficients.

\paragraph{Shape definitions}
We say a shape $\alpha$ is proper if it has no isolated vertices (i.e. no degree $0$ vertices) outside $U_{\alpha} \cup V_{\alpha}$. We say a shape $\alpha$ is trivial if $U_{\alpha}$ and $V_{\alpha}$ are equal as sets, they constitute all the vertices in $\alpha$, and moreover, there are no edges in $\alpha$.

A path is a sequence of vertices of $V(\alpha)$ such that every consecutive pair of vertices form an edge in $E(\alpha)$. A vertex separator of $\alpha$ is a set of vertices $S$ such that every path from $U_{\alpha}$ to $V_{\alpha}$ passes through $S$. As we saw in \cref{chap: efron_stein}, the norm bounds of the graph matrix $M_{\alpha}$ rely on the size of the minimum vertex separator of $\alpha$. Define the weight of a vertex separator $S$ as $|S|$.

The above definitions are sufficient for the application to the Planted Slightly Denser subgraph problem.  But when we work with Tensor PCA and Sparse PCA, we need to generalize the notion of shapes and graph matrices. These generalized shapes and graph matrices were studied in \cite{ahn2016graph}. Now, we describe the required generalizations.

\subsubsection{Definitions for Tensor PCA}

In the Tensor PCA application, the input is a tensor $A \in {\mathbb R}^{[n]^k}$. To incorporate this, we modify our definitions of shapes and index shapes accordingly. The input entries are now sampled from the distribution $\mathcal{N}(0, 1)$ instead of $\{-1, 1\}$. So, we will work with the Hermite basis of polynomials.
Let the standard unnormalized Hermite polynomials be denoted as $h_0(x) = 1, h_1(x) = x, h_2(x) = x^2 - 1, \ldots$. Then, we work with the basis $h_a(A) := \prod_{e \in [n]^k} h_{a_e}(A_e)$ over $a \in \mathbb{N}^{[n]^k}$. Accordingly, we will modify the graphs that represent shapes, to have labeled hyperedges of arity $k$. So, a hyperedge $e$ with a label $t$ will correspond to the Hermite polynomial $h_t(A_e)$.

\begin{definition}[Hyperedges]
    Instead of standard edges, we will have labeled hyperedges of arity $k$ in the underlying graphs for our ribbons as well as shapes. The label for a hyperedge $e$, denoted $l_e$, is an element of $\mathbb{N}$ which will correspond to the Hermite polynomial being evaluated on that entry.
\end{definition}

Note that our hyperedges are ordered since the tensor $A$ is not necessarily symmetric.
For variables $x_1, \ldots, x_n$, the rows and columns of our moment matrix will now correspond to monomials of the form $\prod_{i \le n} x_i^{p_i}$ for $p_i \ge 0$. To capture this, we use the notion of index shape pieces and index shapes. Informally, we split the above monomial product into groups based on their powers and each such group will form an index shape piece.

\begin{definition}[Index shape piece]
    An index shape piece $U_i= ((U_{i, 1}, \ldots, U_{i, t}), p_i)$ is a tuple of indices $(U_{i, 1}, \ldots, U_{i, t})$ along with a power $p_i \in \mathbb{N}$. Let $V(U_i)$ be the set $\{U_{i, 1}, \ldots, U_{i, t}\}$ of vertices of this index shape piece. When clear from context, we use $U_i$ instead of $V(U_i)$.
\end{definition}

If we realize $U_{i, 1}, \ldots, U_{i, t}$ to be indices $a_1, \ldots, a_t \in [n]$, then this realization of this index shape piece corresponds to the monomial $\prod_{j \le t} x_{a_j}^{p_i}$.

\begin{definition}[Index shape]
    An index shape $U$ is a set of index shape pieces $U_i$ that have different powers. Let $V(U)$ be the set of vertices $\cup_i V(U_i)$. When clear from context, we use $U$ instead of $V(U)$.
\end{definition}

Observe that each realization of an index shape corresponds to a row or column of the moment matrix.
Equivalence of index shapes is analogous, namely, for two index shapes $U, V$, we write $U \equiv V$ if for all powers $p$, the index shape pieces of power $p$ in $U$ and $V$ have the same length.
We also define the automorphism group of $U$ as $Aut(U) = \prod_{U_i \in U}{Aut(U_i)}$ where the automorphism group of an index shape piece $U_i$ is $Aut(U_i) = S_{|U_i|}$.
In the definition of shapes, the distinguished set of vertices should now be replaced by index shapes.

\begin{definition}[Shapes]
    Shapes are tuples $\alpha = (V(\alpha), E(\alpha), U_{\alpha}, V_{\alpha})$ where $(V(\alpha), E(\alpha))$ is a graph with hyperedges of arity $k$ and $U_{\alpha}, V_{\alpha}$ are index shapes such that $U_{\alpha}, V_{\alpha} \subseteq V(\alpha)$.
\end{definition}

A shape $\alpha$ is proper if it has no isolated vertices outside $U_{\alpha} \cup V_{\alpha}$, no multi-edges and all the edges have a nonzero label.
To define the notion of vertex separators, we accordingly modify the notion of paths for hyperedges instead of edges. Formally, a path is a sequence of vertices $u_1, \ldots, u_t$ such that $u_i, u_{i + 1}$ are in the same hyperedge, for all $i \le t - 1$.
The notion of vertex separator is identically defined with the above notion of hyperedges and paths.
Finally, the definition of trivial shape $\tau$ is similar, the only change being that we now require $U_{\tau} \equiv V_{\tau}$ instead of saying they're equal as sets.

\subsubsection{Definitions for Sparse PCA}

We are given the $m$ vectors $v_1, \ldots, v_m \in {\mathbb R}^d$ as input. Similar to Tensor PCA, we will work with the Hermite basis of polynomials since the entries are sampled from the distribution $\mathcal{N}(0, 1)$.
In particular, if we denote the unnormalized Hermite polynomials by $h_0(x) = 1, h_1(x) = x, h_2(x) = x^2 - 1, \ldots$, then, we work with the basis $h_a(v) := \prod_{i \in [m], j \in [n]} h_{a_{i, j}}(v_{i, j})$ over $a \in \mathbb{N}^{m \times n}$. To capture this basis, we will modify the graphs that represent shapes to be bipartite graphs with two types of vertices, and have labeled edges that go across vertices of different types. So, an edge $(i, j)$ with label $t$ between a vertex $i$ of type $1$ and a vertex $j$ of type $2$ will correspond to $h_t(v_{i, j})$.

Formally, we will have two types of vertices, the vertices corresponding to the $m$ input vectors that we call type $1$ vertices and the vertices corresponding to ambient dimension of the space that we call type $2$ vertices. For a shape with such vertices, edges will go across vertices of different types, thereby forming a bipartite graph. An edge between a type $1$ vertex $i$ and a type $2$ vertex $j$ corresponds to the input entry $v_{i, j}$. Each edge will have a label in $\mathbb{N}$ corresponding to the Hermite polynomial evaluated on that entry.

We will have variables $x_1, \ldots, x_n$ in our SoS program, so we will work with index shape pieces and index shapes as in Tensor PCA, since the rows and columns of our moment matrix will now correspond to monomials of the form $\prod_{i \le n} x_i^{p_i}$ for $p_i \ge 0$. But since we have $2$ types of vertices, we need to slightly modify the notion of index shape pieces and index shapes.

\begin{definition}[Index shape piece]
    An index shape piece $U_i= ((U_{i, 1}, \ldots, U_{i, t}), t_i, p_i)$ is a tuple of indices $(U_{i, 1}, \ldots, U_{i, t})$ along with a type $t_i \in \{1, 2\}$ and a power $p_i \in \mathbb{N}$. Let $V(U_i)$ be the set $\{U_{i, 1}, \ldots, U_{i, t}\}$ of vertices of this index shape piece. When clear from context, we use $U_i$ instead of $V(U_i)$.
\end{definition}

For an index shape piece $((U_{i, 1}, \ldots, U_{i, t}), t_i, p_i)$ with type $t_i = 2$, if we realize $U_{i, 1}, \ldots, U_{i, t}$ to be indices $a_1, \ldots, a_t \in [n]$, then this realization of this index shape piece corresponds to the monomial $\prod_{j \le t} x_{a_j}^{p_i}$.

\begin{definition}[Index shape]
    An index shape $U$ is a set of index shape pieces $U_i$ that either have different types or different powers. Let $V(U)$ be the set of vertices $\cup_i V(U_i)$. When clear from context, we use $U$ instead of $V(U)$.
\end{definition}

Each realization of an index shape will correspond to a row or column of the moment matrix. For our moment matrix, the only nonzero rows correspond to index shapes that have only index shape pieces of type $2$, since the only SoS variables are $x_1, \ldots, x_n$, but in order to do our analysis, we need to work with the generalized notion of index shapes that allow index shape pieces of both types.

Analogous to our previous definitions, for two index shapes $U, V$, we write $U \equiv V$ if for all types $t$ and all powers $p$, the index shape pieces of type $t$ and power $p$ in $U$ and $V$ have the same length.
Since we are working with standard graphs, the original notion of path and vertex separator will work, but we will now use the minimum weight vertex separator instead of the minimum vertex separator where we define the weight as follows.

\begin{definition}[Weight of an index shape]
    Suppose we have an index shape $U = \{U_1, U_2\} \in {\mathcal I}_{mid}$ where $U_1 = ((U_{1, 1}, \ldots, U_{1, |U_1|}), 1, 1)$ is an index shape piece of type $1$ and $U_2 = ((U_{2, 1}, \ldots, U_{2, |U_2|}), 2, 1)$ is an index shape piece of type $2$. Then, define the weight of this index shape to be $w(U) = \sqrt{m}^{|U_1|}\sqrt{n}^{|U_2|}$.
\end{definition}

The definition carries over for a vertex separator as well. We also define the automorphism group of $U$ as $Aut(U) = \prod_{U_i \in U}{Aut(U_i)}$ where the automorphism group of an index shape piece $U_i$ is $Aut(U_i) = S_{|U_i|}$. We now give the modified definition of shapes.

\begin{definition}[Shapes]
    Shapes are tuples $\alpha = (V(\alpha), E(\alpha), U_{\alpha}, V_{\alpha})$ where $(V(\alpha), E(\alpha))$ is a graph with two types of vertices, has labeled edges only across vertices of different types and $U_{\alpha}, V_{\alpha}$ are index shapes such that $U_{\alpha}, V_{\alpha} \subseteq V(\alpha)$.
\end{definition}

The other definitions that follow are analogous. A shape $\alpha$ is proper if it has no isolated vertices outside $U_{\alpha} \cup V_{\alpha}$, no multi-edges and all the edges have a nonzero label. In the definition of trivial shape $\tau$, just as in Tensor PCA, we require $U_{\tau} \equiv V_{\tau}$ instead of saying they're equal as sets.

\subsection{Decomposing shapes}

Compared to the lower bound strategy in the Sherrington-Kirkpatrick lower bound in \cref{chap: sk}, the main strategy in the machinery is to provide an approximate PSD decomposition by decomposing shapes $\alpha$ into three other shapes $\sigma, \tau, \sigma'^T$ such that $M_{\alpha} \approx M_{\sigma}M_{\tau} M_{\sigma'^T}$. Then, the idea is to argue that the graph matrix coefficients of the moment matrix also decompose similarly, ending with a PSD decomposition showing that the moment matrix is PSD.

We first need to define composition of shapes. We say that shapes $\alpha$ and $\beta$ are composable if $U_{\beta} \equiv V_{\alpha}$. In this case, define their composition to be the shape $\alpha \circ \beta$ which is obtained by concatenating $\alpha, \beta$ while gluing together $U_{\beta}, V_{\alpha}$. Formally, $\alpha\circ \beta$ is such that $U_{\alpha \circ \beta} = U_{\alpha}$, $V_{\alpha \circ \beta} = V_{\beta}$, and after setting $U_{\beta} = V_{\alpha}$, we take $V(\alpha \circ \beta) = V(\alpha) \cup V(\beta)$, and finally, $E(\alpha \circ \beta) = E(\alpha) \cup E(\beta)$.

Note that by doing this, the concatenated shape could become improper if edges repeat.
We remark that shape composition is not necessarily commutative, but it is associative.

\begin{figure}[!ht]
    \centering
    \includegraphics[scale=0.45, trim={4.5cm 2cm 0 2cm},clip]{machinery/images/basic_shape_comp}
    \caption{Illustration of shape composition and decomposition.}
    \label{fig: basic_shape_comp}
\end{figure}

\begin{example}
    \cref{fig: basic_shape_comp} illustrates an example of shape composition in the setting where there is only one type of vertex. Observe how the shapes $\sigma \circ \sigma'^T$ and $\sigma \circ \tau \circ \sigma'^T$ are obtained from the shapes $\sigma, \tau$ and $\sigma'^T$.
\end{example}

\begin{example}
    \cref{fig: shape_comp} illustrates an example of shape composition in the setting where there are two types of vertices. We have two types of vertices that we diagrammatically represent by squares and circles. Observe how the shapes $\sigma \circ \sigma'^T$ and $\sigma \circ \tau \circ \sigma'^T$ are obtained from the shapes $\sigma, \tau$ and $\sigma'^T$.
\end{example}

\begin{figure}[!ht]
    \centering
    \includegraphics[scale=0.38, trim={8cm 2cm 0 2cm},clip]{machinery/images/shape_comp}
    \caption{Illustration of shape composition and decomposition.}
    \label{fig: shape_comp}
\end{figure}

Previously, we defined the notion of minimum vertex separators (and analogously, minimum weight vertex separators). In what follows, we collectively term either of them as minimum weight vertex separators. Define the \textit{leftmost} (resp. \textit{rightmost}) minimum-weight vertex separator $S$ (resp. $T$) to be a minimum-weight vertex separator such that for every other minimum-weight vertex separator $S'$ (resp. $T'$), $S$ separates $U_{\alpha}$ from $S'$ (resp. $T'$ from $V_{\alpha}$).
In \cite{BHKKMP16, potechin2020machinery}, it's shown that these are well-defined.

With these definitions in hand, we can now define how to decompose a shape $\alpha$ into its left, middle and right parts $\sigma, \tau, \sigma'^T$ respectively.

\begin{definition}[Shape decomposition]
    Let $\alpha$ be a shape. Let $S$ and $T$ be the leftmost and rightmost minimum-weight vertex separators of $\alpha$ together with some orderings $O_S,O_T$ of $S$ and $T$.
    \begin{itemize}
        \item We define the left part $\sigma$ of $\alpha$ to be the shape formed by taking the induced subgraph on all of the vertices of $\alpha$ reachable from $U_{\alpha}$ without passing through $S$ (but including the vertices of $S$) where all edges and hyperedges within $S$ are removed, and we take $U_{\sigma} = U_{\alpha}$ and $V_{\sigma} = (S,O_S)$.
        \item We define the right part ${\sigma'}^T$ of $\alpha$ to be the shape formed by taking the induced subgraph on all of the vertices of $\alpha$ reachable from $V_{\alpha}$ without passing through $T$ (but we include the vertices of $T$) where all edges and hyperedges within $T$ are removed, and we take $V_{{\sigma'}^T} = V_{\alpha}$ and $U_{{\sigma'}^T} = (T,O_T)$.
        \item Finally, we define the middle part $\tau$ of $\alpha$ to be the shape formed by the induced subgraph on all of the vertices of $\alpha$ which are not reachable from $U_{\alpha}$ and $V_{\alpha}$ without touching $S$ and $T$ respectively (but we include the vertices of $S$ and $T$), where we also include the edges or hyperedges entirely within $S$ and the edges or hyperedges entirely within $T$, and we take $U_{\tau} = (S,O_S)$ and $V_{\tau} = (T,O_T)$.
    \end{itemize}
\end{definition}

It's evident from the definition that $\alpha = \sigma \circ \tau \circ \sigma'^T$.

\begin{example}
    \cref{fig: basic_shape_comp} illustrates an example decomposition in the setting where there is only one type of vertex.
    \begin{enumerate}
        \item If we start with the shape $\alpha$ denoted as $\sigma \circ \sigma'^T$, observe that there is a unique minimum vertex separator, which consists of the middle vertex of degree $5$, i.e. the one that's not in either $U_{\sigma \circ \sigma'^T}$ or $V_{\sigma \circ \sigma'^T}$.
        Then, $\alpha$ is decomposed into the left part $\sigma$, a trivial middle part $\tau$ (not shown in this figure) which has $V(\tau) = \{u\}, U_{\tau} = V_{\tau} = (u), E(\tau) = \emptyset$, and the right part $\sigma'^T$.
        \item If we start with the shape $\alpha$ denoted as $\sigma \circ \tau \circ \sigma'^T$, then the leftmost minimum vertex separator is the vertex of degree $4$ and the rightmost minimum vertex separator is the vertex of degree $5$. Then, $\alpha$ is decomposed into the left part $\sigma$, the middle part $\tau$ and the right part $\sigma'^T$, which are all shown in this figure.
    \end{enumerate}
\end{example}

\begin{example}
    \cref{fig: shape_comp} illustrates an example decomposition in the setting where there are two types of vertices. We have two types of vertices that we diagrammatically represent by squares and circles. In this example, we assume that the set containing a single circle vertex has a lower weight compared to a set of two square vertices.
    \begin{enumerate}
        \item If we start with the shape $\sigma \circ \sigma'^T$, then it can be decomposed uniquely into the composition of the left shape $\sigma$ and the right shape $\sigma'^T$. In this case, the middle shape (not shown in this figure) is trivial.
        \item If we start with the shape $\sigma \circ \tau \circ \sigma'^T$, then it can be decomposed uniquely into the composition of the left shape $\sigma$, the middle shape $\tau$ and the right shape $\sigma'^T$, which are all shown in this figure.
    \end{enumerate}
\end{example}

At this point, the definitions of left shapes, middle shapes and right shapes are natural.
We say that a shape $\sigma$ is a left shape if $\sigma$ is a proper shape, $V_{\sigma}$ is the left-most and right-most minimum-weight separator of $\sigma$, every vertex in $V(\sigma) \setminus V_{\sigma}$ is reachable from $U_{\sigma}$ without touching $V_{\sigma}$, and $\sigma$ has no hyperedges entirely within $V_{\sigma}$.
Similarly, we say that a shape $\tau$ is a proper middle shape if $\tau$ is a proper shape, $U_{\tau}$ is the left-most minimum-weight separator of $\tau$, and $V_{\tau}$ is the right-most minimum-weight separator of $\tau$. We do not define improper middle shapes, which are needed in the machinery proof, but not here.
Finally, we say that a shape ${\sigma}^{T}$ is a right shape if it is the transpose of a left shape.

\subsection{Coefficient matrices}

We have all the necessary definitions in place for shapes and graph matrices. To apply the machinery, we will decompose the given moment matrix $\Lambda$ as $\Lambda = \sum \lambda_{\alpha}M_{\alpha}$ where the sum is over all shapes $\alpha$. The coefficients $\lambda_{\alpha}$ are then assembled into matrices, termed coefficient matrices, that we will define next. The conditions of the machinery will be in terms of these coefficient matrices.

We will begin with some notations for different sets of index shapes. Given a moment matrix $\Lambda$, define $\mathcal{I}(\Lambda)$ to be the set of index shapes $U$ such that some row or column index of $\Lambda$ has shape $U$. Define $w_{max} = \max{\{w(U):U \in \mathcal{I}(\Lambda)\}}$ to be the maximum possible weight of an index shape in ${\mathcal I}(\Lambda)$. Finally, define $\mathcal{I}_{mid}$ to be $\mathcal{I}_{mid} = \{U: w(U) \leq w_{max}, \forall U_i \in U, p_i = 1\}$.
Observe that in the setting of Rademacher $G_{n, 1/2}$ inputs, we have $\mathcal{I}_{mid} = \{U: |U| \leq w_{max}\}$.

In pseudo-calibration, we only keep the shapes that satisfy certain truncation parameters that we choose. Formally, satisfaction of truncation parameters is defined as follows.

\begin{definition}[Truncation parameters for the setting of Rademacher $G_{n, 1/2}$ inputs]
    For integers $D_{sos}, D_V \ge 0$, say that a shape $\alpha$ satisfies the truncation parameters $D_{sos}, D_V$ if
    \begin{itemize}
        \item The degrees of the monomials that $U_{\alpha}$ and $V_{\alpha}$ correspond to, are at most $\frac{D_{sos}}{2}$
        \item The left part $\sigma$, the middle part $\tau$ and the right part $\sigma'^T$ of $\alpha$ satisfy the bounds $|V(\sigma)|, |V(\tau)|, |V(\sigma'^T)| \le D_V$.
    \end{itemize}
\end{definition}

\begin{definition}[Truncation parameters for Tensor PCA and Sparse PCA]
    For integers $D_{sos}, D_V, D_E \ge 0$, say that a shape $\alpha$ satisfies the truncation parameters $D_{sos}, D_V, D_E$ if
    \begin{itemize}
        \item The degrees of the monomials that $U_{\alpha}$ and $V_{\alpha}$ correspond to, are at most $\frac{D_{sos}}{2}$
        \item The left part $\sigma$, the middle part $\tau$ and the right part $\sigma'^T$ of $\alpha$ satisfy the bounds $|V(\sigma)|, |V(\tau)|, |V(\sigma'^T)| \le D_V$
        \item For each $e \in E(\alpha)$, $l_e \le D_E$.
    \end{itemize}
\end{definition}

We also need to define the sets of shapes which can appear when analyzing $\Lambda$.
Given a moment matrix $\Lambda$, define $\mathcal{L} = \{\sigma: \sigma \text{ is a left shape}, U_{\sigma} \in \mathcal{I}(\Lambda), V_{\sigma} \in \mathcal{I}_{mid}, |V(\sigma)| \leq D_V, \forall e \in E(\sigma), l_e \leq D_E\}$. Moreover, given $V \in \mathcal{I}_{mid}$, define $\mathcal{L}_V = \{\sigma \in \mathcal{L}: V_{\sigma} \equiv V\}$. Finally, given $U \in \mathcal{I}_{mid}$, define $\mathcal{M}_U = \{\tau: \tau \text{ is a non-trivial proper middle shape}, U_{\tau} \equiv V_{\tau} \equiv U, |V(\tau)| \leq D_V, \forall e \in E(\tau), l_e \leq D_E\}$.

We are now ready to define coefficient matrices. Given a moment matrix $\Lambda$, a coefficient matrix is a matrix $H$ whose rows and columns are indexed by left shapes $\sigma,\sigma' \in \mathcal{L}$. $H$ is called SoS-symmetric if  $H(\sigma,\sigma')$ is invariant under the action of the symmetric group, i.e. if we permute the vertices of $U_{\sigma}$ and the vertices of $U_{\sigma'}$ (where we only permute within the same type) under the same permutation, then the entry doesn't change.

For a shape $\tau$, we say that a coefficient matrix $H$ is a $\tau$-coefficient matrix if $H(\sigma,\sigma') = 0$ whenever $V_{\sigma} \not\equiv U_{\tau}$ or $V_{\tau} \not\equiv U_{{\sigma'}^T}$. Given an index shape $U$, we define $Id_{U}$ to be the shape with $U_{Id_{U}} = V_{Id_{U}} = U$, no other vertices, and no edges.

As stated earlier, the coefficients $\lambda_{\alpha}$ of the moment matrix $\Lambda$ are assembled to form coefficient matrices, which are used to state the machinery conditions.
Given a shape $\tau$ and a $\tau$-coefficient matrix $H$, we consider the matrix-valued function $M^{fact}_{\tau}(H)$ defined as
\[
M^{fact}_{\tau}(H) = \sum_{\sigma \in \mathcal{L}_{U_{\tau}},\sigma' \in \mathcal{L}_{V_{\tau}}}{H(\sigma,\sigma')M_{\sigma}M_{\tau}M_{\sigma'}^T}
\]

The motivation for this definition is as follows. First observe that $\Lambda$ is essentially an expression of the form $\sum_{\tau}\sum_{\sigma, \sigma'}{H(\sigma,\sigma')M_{\sigma \circ \tau \circ {\sigma'}^T}}$. For each $\tau$, the inner expression sort of looks like $M_{\tau}^{fact}(H)$. However, there is a technical difference.
Recall that if we expand out the definition of the graph matrices $M_{\sigma}, M_{\tau}$ and $M_{\sigma'}^T$, then $M_{\tau}^{fact}(H)$ sums over non-equivalent realizations coming from the sets $\textrm{Real}(\sigma), \textrm{Real}(\tau), \textrm{Real}(\sigma'^T)$ respectively. A priori, it's not guaranteed that for each choice of realizations $\varphi_1, \varphi_2, \varphi_3$ from these sets, the corresponding subset of labels $\varphi_1(V(\sigma)), \varphi_2(V(\tau)), \varphi_3(V(\sigma'^T)) \subseteq [n]$ are disjoint. However, if we enforce that they are disjoint, then we will obtain a matrix closely related to what we desire. The work \cite{potechin2020machinery} terms this matrix obtained by enforcing disjointness of the realizations as $M^{orth}_{\tau}(H)$.

As they remark, it is not true that $M^{orth}_{\tau}(H) = \sum_{\sigma,\sigma'}{H(\sigma,\sigma')M_{\sigma \circ \tau \circ {\sigma'}^T}}$ because of additional terms involving automorphism groups.
Nevertheless, because of this enforced condition that the realizations don't overlap, $\Lambda$ can be easily expressed in terms of $M^{orth}$. Indeed, as they show via careful counting, $\Lambda = \sum_{U \in \mathcal{I}_{mid}}{M^{orth}_{Id_U}(H_{Id_U})} + \sum_{U \in \mathcal{I}_{mid}}{\sum_{\tau \in \mathcal{M}_U}{M^{orth}_{\tau}(H_{\tau})}}$, where $H_{Id_U}$ and $H_{\tau}$, formally defined below, are simple coefficient matrices assembled from $\lambda_{\alpha}$.

Despite wanting to work with $M^{orth}$, the machinery instead works with $M^{fact}$ because showing PSDness is easier with $M^{fact}$ due to the product structure. The proof strategy in the machinery is to then show that the error terms when going from $M^{fact}$ to $M^{orth}$ (and therefore, $\Lambda$) are negligible with high probability, concluding the PSDness proof.

Given a matrix-valued function $\Lambda$, we assemble the following coefficient matrices.
\begin{definition}
    Given a matrix-valued function $\Lambda = \sum_{\alpha: \alpha \text{ is proper}}{\lambda_{\alpha}M_{\alpha}}$,
    \begin{enumerate}
        \item For each index shape $U \in \mathcal{I}_{mid}$ and every $\sigma,\sigma' \in \mathcal{L}_{U}$, set $H_{Id_U}(\sigma,\sigma') = \frac{1}{|Aut(U)|}\lambda_{\sigma \circ {\sigma'}^T}$
        \item For each $U \in \mathcal{I}_{mid}$, $\tau \in \mathcal{M}_U$ and $\sigma, \sigma' \in \mathcal{L}_{U}$, set
        $H_{\tau}(\sigma,\sigma') = \frac{1}{|Aut(U_{\tau})|\cdot|Aut(V_{\tau})|}\lambda_{\sigma \circ \tau \circ {\sigma'}^T}$
    \end{enumerate}
\end{definition}

We need a final definition, that of the coefficient matrix $H^{-\gamma, \gamma}$.
In order to handle error terms in the approximate PSD decomposition, the machinery has to further decompose left shapes $\sigma$ as $\sigma = \sigma_2 \circ \gamma$ where $\sigma_2$ and $\gamma$ are themselves left shapes. In order to capture this operation, the following definitions are needed.

Given a moment matrix $\Lambda$, define $\Gamma = \{\gamma: \gamma \text{ is a non-trivial left shape with } U_{\gamma}, V_{\gamma} \in \mathcal{I}_{mid}, |V(\gamma)| \leq D_V, \forall e \in E(\gamma), l_e \leq D_E\}$. Moreover, given $U,V \in \mathcal{I}_{mid}$ such that $w(U) > w(V)$, define $\Gamma_{U,V} = \{\gamma \in \Gamma: U_{\gamma} \equiv U, V_{\gamma} \equiv V\}$. Finally, given $U \in \mathcal{I}_{mid}$, define $\Gamma_{U,*} = \{\gamma \in \Gamma: U_{\gamma} \equiv U\}$ and, analogously, given $V \in \mathcal{I}_{mid}$, define $\Gamma_{*,V} = \{\gamma \in \Gamma: V_{\gamma} \equiv V\}$.

We finally define the coefficient matrix $H^{-\gamma, \gamma}$ given the truncation parameter $D_V$. Given a shape $\tau$ with $U_{\tau} \equiv V_{\tau}$, left shape $\gamma \in {\Gamma}_{*,U_{\tau}}$ (and therefore, $\gamma \in {\Gamma}_{*,V_{\tau}}$), and a $\tau$-coefficient matrix $H$, define $H^{-\gamma,\gamma}$ to be the $(\gamma \circ \tau \circ {\gamma}^T)$-coefficient matrix with entries
    \begin{itemize}
        \item $H^{-\gamma,\gamma}(\sigma,\sigma') = H(\sigma \circ \gamma,\sigma' \circ \gamma)$ if $|V(\sigma \circ \gamma)| \leq D_V$ and $|V(\sigma' \circ \gamma)| \leq D_V$.
        \item $H^{-\gamma,\gamma}(\sigma,\sigma') = 0$ if $|V(\sigma \circ \gamma)| > D_V$ or $|V(\sigma' \circ \gamma)| > D_V$.
    \end{itemize}


\subsection{Main theorems}

For a problem ${\mathcal P}$, let $\Lambda_{{\mathcal P}}$ be the moment matrix obtained via pseudo-calibration. We then state the conditions that the machinery requires in order to show positivity with high probability.
We will use the following notion of distance between coefficient matrices, which will be useful to bound truncation error.

\begin{definition}
    Given a function $B_{norm}(\alpha)$, define the distance $d_{\tau}(H_{\tau},H'_{\tau})$ between two $\tau$-coefficient matrices $H_{\tau}$ and $H'_{\tau}$ as
    \[
    d_{\tau}(H_{\tau},H'_{\tau}) = \sum_{\sigma \in \mathcal{L}_{U_{\tau}},\sigma' \in \mathcal{L}_{V_{\tau}}}{|H'_{\tau}(\sigma,\sigma') - H_{\tau}(\sigma,\sigma')|B_{norm}(\sigma)B_{norm}(\tau)B_{norm}(\sigma')}
    \]
\end{definition}

We also define $Id_{Sym}$, which is the SoS-symmetric analogue of the identity matrix. For a matrix index $A$, denote by $p_A$ the formal monomial (in terms of the SoS program variables) it corresponds to. Define $Id_{Sym}$ to be the matrix such that the rows and columns of $Id_{Sym}$ are indexed by the matrix indices $A,B$ whose index shape is in $\mathcal{I}(\Lambda)$ and $Id_{Sym}(A,B) = 1$ if $p_A = p_B$ and $Id_{Sym}(A, B) = 0$ if $p_A \neq p_B$.

We introduce a few more notations about shapes in order to state our parameters.
Define $\mathcal{M}'$ to be the set of all shapes $\alpha$ such that $|V(\alpha)| \leq 3D_V$, $\forall e \in E(\alpha), l_e \leq D_E$ and all edges $e \in E(\alpha)$ have multiplicity at most $3D_V$. Note that the latter two conditions are not needed for the setting of Rademacher $G_{n, 1/2}$ inputs but they're needed in the setting of Gaussian inputs.
In the setting of Rademacher $G_{n, 1/2}$ inputs, for a shape $\alpha$, define $S_{\alpha}$ to be the leftmost minimum vertex separator of $\alpha$ and define $I_{\alpha}$ to be the set of vertices in $V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$ which are isolated. In the setting of Gaussian $\mathcal{N}(0, 1)$ inputs, for a shape $\alpha \in \mathcal{M}'$, define $S_{\alpha,min}$ to be the leftmost minimum vertex separator of $\alpha$ if all edges with multiplicity at least $2$ are deleted. Moreover, define $I_{\alpha}$ to be the set of vertices in $V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})$ such that all edges incident with that vertex have multiplicity at least $2$.

\subsubsection{Choice of parameters in the setting of Rademacher $G_{n, 1/2}$ inputs}

We first state some parameters we will use in this work and then state the main conditions that are needed for the main theorem statement, which is stated after this.

 Let $\varepsilon > 0$ and $D_V, D_E$ be truncation parameters. Define
\begin{itemize}
    \item $q = 3\left\lceil{{D_V}\ln n + \frac{\ln(\frac{1}{\varepsilon})}{3} + {D_V}\ln 5 + 3{D^2_V}\ln 2}\right\rceil$
    \item $B_{vertex} = 6{D_V}\sqrt[4]{2eq}$
    \item $B_{norm}(\alpha) = {B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}$
    \item $B(\gamma) = B_{vertex}^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}|}n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}$
    \item $N(\gamma) = (3D_V)^{2|V(\gamma) \setminus V_{\gamma}| + |V(\gamma) \setminus U_{\gamma}|}$
    \item $c(\alpha) = 100(3D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + 2|E(\alpha)|}2^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}$
\end{itemize}

In our application, as stated earlier, we show SoS lower bounds for degree-$n^{\varepsilon}$ SoS, where the input size is $n^{O(1)}$. In this setting, we take $D_V, D_E$ to be of the order of $n^{O(\varepsilon)}$. Therefore, for simplicity, we can interpret the parameters as
\[q = n^{O(\varepsilon)}, B_{vertex} = n^{O(\varepsilon)}, B_{norm}(\alpha) =n^{O(\varepsilon)|V(\alpha)|}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha})}{2}}\]
\[B(\gamma) = n^{O(\varepsilon)|V(\gamma)|}n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}, N(\gamma) = n^{O(\varepsilon)|V(\gamma)|}, c(\alpha) = n^{O(\varepsilon)|V(\alpha)|}\]

\subsubsection{Choice of parameters in the setting of Gaussian inputs on hypergraphs}

We now state the parameters needed for the more general statement of the machinery where we have Gaussian inputs on hypergraphs.
In this setting, let there be at most $t_{max}$ types of vertices and let $k$ be the maximum arity of a hyperedge. In the setting of Tensor PCA, we take $t_{max} = 1$ and in the setting of Sparse PCA, we take $k = t_{max} = 2$.
For all $\varepsilon > 0$ and truncation parameters $D_V, D_E$, define
\begin{enumerate}
    \item $q = \left\lceil{3{D_V}\ln n + \ln(\frac{1}{\varepsilon}) + {(3D_V)^k}\ln(D_E + 1) + 3{D_V}\ln 5}\right\rceil$
    \item $B_{vertex} = 6q{D_V}$
    \item $B_{edge}(e) = \left(400{D^2_V}{D^2_E}q\right)^{l_e}$
    \item $B_{norm}(\alpha) =
    2e{B_{vertex}^{|V(\alpha) \setminus U_{\alpha}| + |V(\alpha) \setminus V_{\alpha}|}}\left(\prod_{e \in E(\alpha)}{B_{edge}(e)}\right)n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha,min})}{2}}$
    \item $B(\gamma) = B_{vertex}^{|V(\gamma) \setminus U_{\gamma}| + |V(\gamma) \setminus V_{\gamma}|}\left(\prod_{e \in E(\gamma)}{B_{edge}(e)}\right)n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}$
    \item $N(\gamma) = (3D_V)^{2|V(\gamma) \setminus V_{\gamma}| + |V(\gamma) \setminus U_{\gamma}|}$
    \item $c(\alpha) = 100(3{t_{max}}D_V)^{|U_{\alpha} \setminus V_{\alpha}| + |V_{\alpha} \setminus U_{\alpha}| + k|E(\alpha)|}(2t_{max})^{|V(\alpha) \setminus (U_{\alpha} \cup V_{\alpha})|}$
\end{enumerate}

In our applications, we can interpret the above parameters in a much simpler manner again. More specifically, $k$ is a constant and we work with SoS degree $n^{\varepsilon}$. Then, we can think of each vertex or edge of the shape $\alpha$ or $\gamma$ essentially contributing a factor of $n^{\varepsilon}$. Therefore, we can interpret
\[q = n^{O(\varepsilon)}, B_{vertex} = n^{O(\varepsilon)}, B_{edge} = n^{O(\varepsilon)|E(\alpha)|}\]
\[B_{norm}(\alpha) = n^{O(\varepsilon)(|V(\alpha)| + |E(\alpha)|)}n^{\frac{w(V(\alpha)) + w(I_{\alpha}) - w(S_{\alpha,min})}{2}}\]
\[B(\gamma) = n^{O(\varepsilon)(|V(\gamma)| + |E(\gamma)|)}n^{\frac{w(V(\gamma) \setminus U_{\gamma})}{2}}\]
\[N(\gamma) = n^{O(\varepsilon)|V(\gamma)|}, c(\alpha) = n^{O(\varepsilon)(|V(\alpha)| + |E(\alpha)|)}\]

\subsubsection{Statement of the machinery}

As discussed above, consider the appropriate choice of parameters suited for the problem. Now, we can state our conditions on the problem ${\mathcal P}$ in terms of its correspondingly constructed pseudo-calibrated moment matrix $\Lambda_{{\mathcal P}}$ and coefficient matrices $H_{Id_U}, H_{\tau}$.

\begin{definition}[PSD mass]
    We say that ${\mathcal P}$ satisfies \normalfont{(PSD mass) } if for all $U \in \mathcal{I}_{mid}$,  $H_{Id_{U}} \succeq 0$.
\end{definition}

\begin{definition}[Middle shape bounds]
    We say that ${\mathcal P}$ satisfies \normalfont{(Middle shape bounds) } if for all $U \in \mathcal{I}_{mid}$ and $\tau \in \mathcal{M}_U$,
    \[
    \left[ {\begin{array}{cc}
            \frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}} & B_{norm}(\tau)H_{\tau} \\
            B_{norm}(\tau)H^T_{\tau} & \frac{1}{|Aut(U)|c(\tau)}H_{Id_{U}}
    \end{array}} \right] \succeq 0
    \]
\end{definition}

\begin{definition}[Intersection term bounds]
    For some SoS-symmetric coefficient matrices $\{H'_{\gamma}: \gamma \in \Gamma\}$, ${\mathcal P}$ satisfies \normalfont{(Intersection term bounds) } with respect to them if for all $U,V \in \mathcal{I}_{mid}$ where $w(U) > w(V)$ and all $\gamma \in \Gamma_{U,V}$,
    \[
    c(\gamma)^2{N(\gamma)}^2{B(\gamma)^2}H^{-\gamma,\gamma}_{Id_{V}} \preceq H'_{\gamma}
    \]
\end{definition}

\begin{definition}[Truncation error bounds]
    For some SoS-symmetric coefficient matrices $\{H'_{\gamma}: \gamma \in \Gamma\}$,
    ${\mathcal P}$ satisfies \normalfont{(Truncation error bounds) } with respect to them if the following condition holds: Whenever $\norm{M_{\alpha}} \leq B_{norm}(\alpha)$ for all $\alpha \in \mathcal{M}'$,
    \[
    \sum_{U \in \mathcal{I}_{mid}}{M^{fact}_{Id_U}{(H_{Id_U})}} \succeq 6\left(\sum_{U \in \mathcal{I}_{mid}}{\sum_{\gamma \in \Gamma_{U,*}}{\frac{d_{Id_{U}}(H'_{\gamma},H_{Id_{U}})}{|Aut(U)|c(\gamma)}}}\right)Id_{Sym}
    \]
\end{definition}

Finally, we can state our main theorem.
\begin{theorem}\label{generalmaintheorem}
    For all $\varepsilon > 0$, if we take the parameters defined above, and we have SoS-symmetric coefficient matrices $\{H'_{\gamma}: \gamma \in \Gamma\}$ such that ${\mathcal P}$ satisfies \normalfont{(PSD mass) }, \normalfont{(Middle shape bounds) }, \normalfont{(Intersection term bounds) } and \normalfont{(Truncation error bounds) }\hspace{-.8em},
    then with probability at least $1 - \varepsilon$, $\Lambda_{{\mathcal P}} \succeq 0$.
\end{theorem}

In our applications, for problems ${\mathcal P}$ of interest, we pseudo-calibrate, decompose into graph matrices, exhibit the desired conditions on ${\mathcal P}$ and invoke the machinery to prove our lower bounds.

\subsubsection{Choice of $H'_{\gamma}$ for our applications}\label{sec: hgamma_qual}
In our applications, we choose $H'_{\gamma}$ as follows.
\begin{enumerate}
    \item $H'_{\gamma}(\sigma,\sigma') = H_{Id_U}(\sigma, \sigma')$ whenever $|V(\sigma \circ \gamma)| \leq D_V$ and $|V(\sigma' \circ \gamma)| \leq D_V$.
    \item $H'_{\gamma}(\sigma,\sigma') = 0$ whenever $|V(\sigma \circ \gamma)| > D_V$ or $|V(\sigma' \circ \gamma)| > D_V$.
\end{enumerate}
Then, the truncation error that we need to bound is
\[
d_{Id_{U_{\gamma}}}(H_{Id_{U_{\gamma}}},H'_{\gamma}) = \sum_{\substack{\sigma,\sigma' \in \mathcal{L}_{U_{\gamma}}: |V(\sigma)| \leq D_V, |V(\sigma')| \leq D_V,\\
    |V(\sigma \circ \gamma)| > D_V \text{ or } |V(\sigma' \circ \gamma)| > D_V}}{B_{norm}(\sigma)B_{norm}(\sigma')H_{Id_{U_{\gamma}}}(\sigma,\sigma')}
\]

\chapter{Introduction}\label{chap: intro}

\input{intro}

\chapter{Nonlinear matrix concentration}\label{chap: efron_stein}

\input{efron_stein}

\chapter{The Sum of Squares Hierarchy}\label{chap: sos}

\input{sos}

\chapter{Our main results on Sum of Squares lower bounds}\label{chap: main_results}

\input{main_results}

\chapter{The Sherrington-Kirkpatrick Hamiltonian}\label{chap: sk}
\input{sk}

\chapter{The machinery and Qualitative bounds}\label{chap: qual}
\input{qual}

\chapter{Quantitative bounds}\label{chap: quant}
\input{quant}

\chapter{Followup and Future work}\label{chap: future_work}

\input{future_work}

\makebibliography


\end{document}



\section{The Sherrington-Kirkpatrick Hamiltonian}

We first define the Gaussian Orthogonal Ensemble, $\GOE(n)$, a random matrix model for $n \times n$ matrices.
\begin{definition}
	The Gaussian Orthogonal Ensemble, denoted $\GOE(n)$, is the distribution of $\frac{1}{\sqrt{2}}(A + A^\intercal)$
	where $A$ is a random $n\times n$ matrix with i.i.d. standard Gaussian entries.
\end{definition}

Equivalently, we could define $\GOE(n)$ to be a probability distribution over symmetric matrices $W$ such that $W_{ii} \sim \mathcal{N}(0, 2)$ for $i \le n$ and for $i\neq j$, $W_{ij} = W_{ji} \sim \mathcal{N}(0, 1)$ independently.

We consider the main optimization task
\begin{equation}\label{eq:general_opt}
	\OPT(W) := \max_{x \in \{\pm 1\}^n} x^\intercal W x,
\end{equation}
where $W$ is a random symmetric matrix in ${\mathbb R}^{n\times n}$. This is an important task that arises in computer science and statistical physics.

In computer science, a natural choice of $W$ is to take it to be the Laplacian of a graph~\cite[Section 4]{HooryLW06}. Then, the problem is equivalent to the
Maximum Cut problem, a well-known NP-hard problem in the worst
case~\cite{K72}. The equivalence is immediate by observing that
$x \in \{\pm 1\}^n$ can be thought of as encoding a bipartition of $[n] = \{1,
2, \ldots, n\}$.

In particular, an interesting special case is when we consider sparse random graphs, sampled either from the Erd\H{o}s\xspace-R\'enyi\xspace graphs $G(n, \frac{d}{n})$ with average degree $d$ or a uniformly chosen $d$-regular graph, where $d \ge 3$ is a fixed integer.
In this case, it is known that the true size of the maximum cut is asymptotically $n(\frac{d}{4} + f(d) \sqrt{d})$.
Moreover, it was shown in \cite{dembo2017extremal} (originally conjectured in \cite{zdeborova2010conjecture}) that $\lim_{d \rightarrow \infty} f(d) = \frac{1}{2}P^* \approx 0.382$, where
\[P^* := \frac{1}{2}\lim_{n\rightarrow\infty}{\mathbb E}_{W \sim \GOE(n)}[\frac{1}{n^{3/2}}\OPT(W)] \approx 0.7632\]
is referred to as the Parisi constant. This already strongly motivates the problem of studying \cref{eq:general_opt} when $W \sim \GOE(n)$. Interestingly, this problem is motivated for another fantastic reason.

In statistical physics, when $W \sim \GOE(n)$, our objective, up to scaling, is the Hamiltonian of the famous Sherrington-Kirkpatrick model. Here, $x$ can be thought of as encoding
spin values in a spin-glass model.
$-W_{i,j}$ models the interaction between spin $x_i$ and $x_j$
(with $-W_{i,j} \ge 0$ being ferromagnetic and $-W_{i,j} < 0$ being
anti-ferromagnetic). Then, the optimal value corresponds to the
minimum-energy, or ground state of the system, up to sign.
The works \cite{P79, parisi1980sequence, crisanti2002analysis} predicted, using non-rigorous means, that $P^* \approx 0.7632$. This was eventually formalized
in the works \cite{Tal06, Panchenko2014, guerra2003broken}.

In this work, we will focus on this average case optimization problem when $W \sim \GOE(n)$. The first natural question is whether there exists
a polynomial-time algorithm that
given $W \sim \GOE(n)$ computes an $x$ achieving close to $\OPT(W)$.
In a recent breakthrough work, Montanari~\cite{Montanari19} showed that, for any $\varepsilon > 0$, there exists a polynomial time algorithm that outputs $x$ given $W$ such that with high probability it achieves a value of $(2P^* - \varepsilon)n^{3/2}$ (assuming a widely believed conjecture).

Now we move onto certification: Is there an
efficient algorithm to certify an upper bound on $\OPT(W)$ for any
input $W$?

A simple algorithm will be the spectral algorithm where we just output the largest eigenvalue of $W$, up to scaling, for an upper bound.
Note that $\GOE(n)$ is a particular kind
of Wigner matrix ensemble, thereby satisfying the semicircle law, which
in this case establishes that the largest eigenvalue of $W$ is
$(2+\operatorname{o}_n(1)) \cdot \sqrt{n}$ with probability
$1-\operatorname{o}_n(1)$. Thus, a trivial spectral bound establishes
$\OPT(W) \le (2+\operatorname{o}_n(1)) \cdot n^{3/2}$ with probability
$1-\operatorname{o}_n(1)$.

Now, we can ask if it's possible to beat this spectral algorithm for certification. In
particular, we can ask how well SoS does as a certification
algorithm. The natural upper bound of $(2+\operatorname{o}_n(1)) \cdot
n^{3/2}$ obtained via the spectral norm of $W$ is also the value of
the degree-$2$ SoS relaxation~\cite{MS16}. Two independent recent
works of Mohanty--Raghavendra--Xu~\cite{mohanty2020lifting} and
Kunisky--Bandeira~\cite{KuniskyBandeira19} show that degree-4 SoS does
not perform much better, and a heuristic argument from~\cite{bkw19} suggests that even degree-$(n/\log n)$ SoS cannot certify anything stronger than the trivial spectral bound. Thus we ask,

\begin{center}
	\emph{Can higher-degree SoS certify better upper bounds for the Sherrington--Kirkpatrick problem, \\
		hopefully closer to the true bound $2 \cdot P^* \cdot n^{3/2}$?}
\end{center}

In this work, we answer the question above negatively by showing that even at degree as large as
$n^\delta$, SoS cannot improve upon the basic spectral
algorithm.
\begin{restatable}{theorem}{SKbounds}\label{theo:sk-bounds}
	There exists a constant $\delta > 0$ such that, w.h.p. for $W \sim \GOE(n)$, there is a degree-$n^\delta$ SoS solution
	for the Sherrington--Kirkpatrick problem with value at least $(2-\operatorname{o}_n(1)) \cdot n^{3/2}$.
\end{restatable}

An independent and concurrent work by Kunisky~\cite{kunisky2020} also showed a special case of the above theorem for degree-$6$ SoS, using different techniques.

We will present the proof of this theorem in \cref{chap: sk}.
The above theorem and its proof originally appeared in \cite{sklowerbounds}, from which the material here is adapted.
We now present the high level ideas behind the proof of this theorem.


\subsection{Our approach}
In order to prove~\cref{theo:sk-bounds}, we first introduce a new
average-case problem we call Planted Affine Planes (PAP) for which we directly prove a SoS lower bound. We then use
the PAP lower bound to prove a lower bound on the
Sherrington--Kirkpatrick problem. The PAP problem can be informally
described as follows (see~\cref{def:prob:pap} for the formal definition).
\begin{definition}[Informal statement of PAP]
	Given $m$ random vectors $d_1,\ldots,d_m$ in $\mathbb{R}^n$, can we
	prove that there is no vector $v \in {\mathbb R}^n$ such that for all
	$u \in [m]$, $\langle v, d_u\rangle^2 = 1$? In other words, can we
	prove that $m$ random vectors are not all contained in two parallel
	hyperplanes at equal distance from the origin?
\end{definition}
This problem, when we restrict $v$ to a Boolean vector in $\set{\pm \frac{1}{\sqrt{n}}}^n$,
can be encoded as the feasibility of the polynomial system
\begin{align*}
	\exists v \in {\mathbb R}^n~\text{s.t.} \qquad & \forall i \in [n], \; v_i^2 = \frac{1}{n},\\
	& \forall u \in [m], \; \ip{v}{d_u}^2 = 1.
\end{align*}
Hence it is a ripe candidate for SoS. However, we show that SoS fails
to refute a random instance with high probability over the input. The Boolean restriction on $v$ actually
makes the lower bound result stronger since SoS cannot refute even a
smaller subset of vectors in ${\mathbb R}^n$. In this work, we will consider two
different random distributions, namely when $d_1, \ldots, d_m$ are
independent samples from the multivariate normal distribution and when
they are independent samples from the uniform distribution on the
boolean hypercube.
\begin{theorem}\label{theo:sos-bounds}
	For both the Gaussian and Boolean settings, there exists a constant $c > 0$ such that for all $\varepsilon > 0$ and $\delta \le c\varepsilon$, for $m \leq n^{3/2 - \varepsilon}$, w.h.p. there is a feasible degree-$n^\delta$ SoS solution for Planted Affine Planes.
\end{theorem}

It turns out that the Planted Affine Plane problem introduced above is
closely related to the following ``Boolean vector in a random
subspace'' problem, which we call the Planted Boolean Vector problem,
introduced by~\cite{mohanty2020lifting} in the context of studying the performance
of SoS on computing the Sherrington--Kirkpatrick Hamiltonian.

The Planted Boolean Vector problem is to certify that a random subspace of ${\mathbb R}^n$ is far from containing a boolean vector.
Specifically, we want to certify an upper bound for
\[
\OPT(V) :=  \frac{1}{n}\max_{b \in \{\pm 1\}^n} b^\intercal \Pi_V b,
\]
where $V$ is a uniformly random $p$-dimensional subspace\footnote{$V$
	can be specified by a basis, which consists of $p$ i.i.d.  samples
	from ${\mathcal N}(0, I)$.} of ${\mathbb R}^n$, and $\Pi_V$ is the projector onto
$V$. In brief, the relationship to the Planted Affine Plane problem is
that the PAP vector $v$ represents the coefficients on a linear
combination for the vector $b$ in the span of a basis of
$V$.

An argument of~\cite{mohanty2020lifting} shows that, when $p \ll n$, w.h.p.,
$\OPT(V) \approx \frac{2}{\pi}$, whereas they also show that
w.h.p. assuming $p \geq n^{0.99}$, there is a degree-4 SoS solution
with value $1-\operatorname{o}_n(1)$. They ask whether or not there is a
polynomial time algorithm that can certify a tighter bound; we rule
out SoS-based algorithms for a larger regime both in terms of SoS
degree and the dimension $p$ of the random subspace.

\begin{restatable}{theorem}{booleanSubspace}\label{theo:boolean-subspace}
	There exists a constant $c > 0$ such that, for all $\varepsilon > 0$ and $\delta \le c\varepsilon$, for $p \geq n^{2/3 + \varepsilon}$, w.h.p. over $V$ there is a
	degree-$n^\delta$ SoS solution for Planted Boolean Vector of value $1$.
\end{restatable}

The bulk of our technical contribution lies
in the SoS lower bound for the Planted Affine Planes
problem,~\cref{theo:sos-bounds}. We then show that Planted Affine
Planes in the Gaussian setting is equivalent to the Planted Boolean
Vector problem. The reduction from Sherrington-Kirkpatrick to the
Planted Boolean Vector problem is due to
Mohanty--Raghavendra--Xu~\cite{mohanty2020lifting}.

As a starting point to the PAP lower bound, we employ pseudocalibration to produce a good
candidate SoS solution $\tilde{\EE}$. The operator $\tilde{\EE}$ unfortunately does
not exactly satisfy the PAP constraints ``$\ip{v}{d_u}^2 = 1$'', it
only satisfies them up to a tiny error. In the original work, we use an interesting and
rather generic approach to round $\tilde{\EE}$ to a nearby pseudoexpectation
operator $\tilde{\EE}'$ which does exactly satisfy the constraints. We have omitted this in this thesis for the sake of brevity, but it can be found in the original work \cite{sklowerbounds}.

For degree $D$, the candidate SoS solution can be viewed as a
(pseudo) moment matrix ${\mathcal M}$ with rows and columns indexed by
subsets $I,J\subset [n]$ with size bounded by $D/2$ and with entries
\[{\mathcal M}[I,J] := \tilde{\EE}[v^{I} v^{J}].\]
The matrix ${\mathcal M}$ is a random function of the inputs $d_1, \dots, d_m$, and the most challenging part of the
analysis consists of showing that ${\mathcal M}$ is positive
semi-definite (PSD) with high probability.

Similarly to~\cite{BHKKMP16}, we decompose
${\mathcal M}$ as a linear combination of graph matrices, i.e., ${\mathcal M} = \sum_{\alpha} \lambda_{\alpha} \cdot M_{\alpha}$, where $M_{\alpha}$
is the graph matrix associated with shape $\alpha$. In brief, each
graph matrix aggregates all terms with shape $\alpha$  in the Fourier expansions of the entries of ${\mathcal M}$ -- the shape $\alpha$ is informally a graph with labeled edges
with size bounded by ${\mathrm{poly}}(D)$. A graph
matrix decomposition of ${\mathcal M}$ is particularly handy in the PSD
analysis since the operator norm of individual graph matrices $M_{\alpha}$ is (with high probability)
determined by simple combinatorial properties of the graph
$\alpha$. One technical difference from~\cite{BHKKMP16} is that our
graph matrices have two types of vertices $\square{}$ and $\circle{}$; these graph matrices fall into the general framework developed by Ahn et al. in~\cite{ahn2016graph}.

To show that the matrix ${\mathcal M}$ is PSD, we need to study the graph matrices that appear with nonzero coefficients in the decomposition. The matrix ${\mathcal M}$ can be split into blocks and each diagonal block contains in the decomposition a (scaled) identity matrix. From the graph matrix perspective, this means that certain ``trivial'' shapes appear in the decomposition, with appropriate coefficients. If we could bound the norms of all other graph matrices that appear against these trivial shapes and show that, together, they have negligible norm compared to the sum of these scaled identity blocks, then we would be in good shape.

Unfortunately, this approach will not work. The kernel of the matrix ${\mathcal M}$ is nontrivial, as a consequence of satisfying the PAP constraints ``$\ip{v}{d_u}^2 = 1$'', and hence there is no hope of showing that the contribution of all nontrivial shapes in the decomposition of ${\mathcal M}$ has small norm. Indeed, certain shapes $\alpha$ appearing in the
decomposition of ${\mathcal M}$ are such that $\norm{\lambda_{\alpha} \cdot M_{\alpha}}$ is large. As it turns out, all such shapes have a simple graphical substructure, and so we call these shapes \textit{spiders}.

To get around the null space issue, we restrict ourselves to $\nullspace({\mathcal M})^\perp$, which is the orthogonal complement of the nullspace of ${\mathcal M}$.
We show that the substructure present in a spider implies that the spider is close to the zero matrix in $\nullspace({\mathcal M})^\perp$. Because of this, we can almost freely
add and subtract $M_\alpha$ for spiders $\alpha$ while preserving the action of ${\mathcal M}$ on $\nullspace({\mathcal M})^\perp$. Our strategy is to ``kill'' the spiders
by subtracting off $\lambda_\alpha \cdot M_\alpha$ for each spider $\alpha$. However, because $M_{\alpha}$ is only approximately in $\nullspace({\mathcal M})^\perp$, this
strategy could potentially introduce new graph matrix terms, and in particular it could introduce new spiders. To handle this,
we recursively kill them while carefully analyzing how the coefficients of all the graph matrices change. After all spiders
are killed, the resulting moment matrix becomes
\[
\sum_{0 \le k \le D/2} \frac{1}{n^{k}} \cdot I_k + \sum_{\gamma \colon \textup{non-spiders}} \lambda_{\gamma}' \cdot M_{\gamma},
\]
for some new coefficients $\lambda_{\gamma}'$. Here, $I_k$ is the
matrix which has an identity in the $k$th block and the remaining
entries $0$. Using a novel charging argument, we finally show that the
latter term is negligible compared to the former term, thus
establishing ${\mathcal M} \succeq 0$.

\subsection{Related work}
Degree-$4$ SoS lower bounds on the
Sherrington-Kirkpatrick Hamiltonian problem were proved independently
by Mohanty--Raghavendra--Xu~\cite{mohanty2020lifting} and
Kunisky--Bandeira~\cite{KuniskyBandeira19}. The concurrent and independent work by Kunisky~\cite{kunisky2020} obtained degree $6$ SoS lower bounds. In this work, we prove an
improved degree-$n^{\delta}$ SoS lower bound for some constant $\delta
> 0$.  Our result is obtained by reducing the Sherrington-Kirkpatrick
problem to the ``Boolean Vector in a Random Subspace'' problem which
is equivalent to our new Planted Affine Planes problem on the normal
distribution. The reduction from Sherrington-Kirkpatrick problem to
the ``Boolean Vector in a Random Subspace'' is due to
Mohanty--Raghavendra--Xu~\cite{mohanty2020lifting}. The results of
Mohanty--Raghavendra--Xu~\cite{mohanty2020lifting} and
Kunisky--Bandeira~\cite{KuniskyBandeira19} build on the degree-$2$ SoS
lower bounds of Montanari and Sen~\cite{MS16}.


Degree-$4$ SoS lower bounds on the ``Boolean Vector in a Random
Subspace'' problem for $p~\ge~n^{0.99}$ were proved by
Mohanty--Raghavendra--Xu in~\cite{mohanty2020lifting} where this problem was
introduced. We improve the dependence on $p$ to $p \ge n^{2/3
	+ \varepsilon}$ for any $\varepsilon > 0$ and obtain a stronger
degree-$n^{c\varepsilon}$ SoS lower bound for some absolute constant $c > 0$.

Interestingly, the recent work \cite{zadik2021latticebased} exhibited a polynomial-time algorithm for the search variant of Planted Affine Planes for $m \ge n + 1$, achieving statistical optimality. In particular, they beat prior known polynomial time algorithms, including SoS based ones, all of which required $m \gg n^2$ \cite{mao2021optimal}. This new algorithm is a lattice-based method that uses the specific algebraic structure present in this problem. Because of this, their algorithm is not robust to small perturbations, that is, they require the points to lie exactly on the planes. On the other hand, the spectral algorithms such as the work of \cite{mao2021optimal} are robust to noise. Because it requires the absence of noise, the lattice based algorithm is of a similar flavor to how Gaussian elimination can beat SoS lower bounds in the absence of noise. Specifically, this means that this lattice based algorithm does not refute our certification lower bound, or the low degree likelihood ratio hypothesis described in \cref{subsec: ldlr}.


\section{Sparse PCA}

Principal components analysis (PCA) \cite{joliffe1992principal} is a popular data processing and dimension reduction routine that is widely used. It has numerous applications in Machine Learning, Statistics, Engineering, Biology, etc. Given a dataset, PCA projects the data to a lower dimensional space spanned by the principal components. The intuition is that PCA sheds lower order information such as noise but importantly preserves much of the intrinsic information present in the data that is needed for downstream tasks.

However, despite great optimality properties, PCA has its drawbacks. Firstly, because the principal components are linear combinations of all the original variables, it's notoriously hard to interpret them \cite{mahoney2009cur}. Secondly, it's well known that PCA does not yield good estimators in high dimensional settings \cite{baik2005phase, paul2007asymptotics, johnstone_lu2009}.

To address these issues, a variant of PCA known as Sparse PCA is often used. Sparse PCA searches for principal components of the data with the added constraint of sparsity.
Concretely, consider given data $v_1, v_2, \ldots, v_m \in {\mathbb R}^d$. In Sparse PCA, we want to find the top principal component of the data under the extra constraint that it has sparsity at most $k$. That is, we want to find a vector $v \in {\mathbb R}^d$ that maximizes $\sum_{i = 1}^m \ip{v}{v_i}^2$ such that $\norm{v}_0 \le k$.

Sparse PCA has enjoyed applications in a diverse range of fields ranging from medicine, computational biology, economics, image and signal processing, finance and of course, machine learning and statistics (e.g. \cite{wang2012online, naikal2011informative, majumdar2009image, tan2014classification, chun2009expression, allen2011sparse}).
Moreover, sparse PCA comes with the important benefit that the components are easier to interpret. A notable example of this is to recover topics from documents \cite{d2004direct, papailiopoulos2013sparse}. Moreover, interpretability has important benefits for algorithmic fairness in machine learning.

A large volume of research has been devoted to study Sparse PCA and its variants.
Algorithms have been proposed and studied by several works, e.g. \cite{amini_wainwright2008, ma2013sparse, krauthgamer2015, deshpande2016, wang2016statistical, berthet2013complexity, ma_wigderson_15, diakonikolas2017statistical, hop17, brennan2019optimal, ding2019subexponential,  chowdhury2020approximation, d2020sparse}.
For example, simple variants of PCA such as thresholding on top of standard PCA \cite{johnstone_lu2009, chowdhury2020approximation} work well in certain parameter settings. This leads to the natural question whether more sophisticated algorithms can do better either for these settings or other parameter settings.

On the other hand, there have been works from the inapproximability perspective as well (e.g. \cite{berthet2013complexity, hop17, brennan2019optimal, krauthgamer2015, ding2019subexponential, wang2016statistical}, we will give a more detailed overview after stating our main result).
In particular, a lot of these inapproximability results have relied on various other conjectures, due to the difficulty of proving unconditional lower bounds.
Despite these prior works, exactly understanding the limits of efficient algorithms to this problem is still an active research area. This is natural considering the importance of sparse PCA and how fundamental it is to a multitude of applications.

Therefore, we naturally ask the following question (also posed as an open problem in the works \cite{ma_wigderson_15, hop17, hop18}):

\begin{itshape}
    Can Sum of Squares algorithms beat known algorithms for Sparse PCA?
\end{itshape}

In this work, we show that SoS algorithms cannot beat known spectral algorithms, even if we allow sub-exponential time! Therefore, this suggests that currently used algorithms such as thresholding or other spectral algorithms are in a sense optimal for this problem.

To prove our results, we will consider random instances of Sparse PCA and show that they are naturally hard for SoS. In particular, we focus on the Wishart random model of Sparse PCA. This model is a more natural modeling assumption compared to other random models that have been studied before, such as the Wigner random model.

Note importantly that our model assumptions only strengthen our results because we are proving impossibility results. In other words, if SoS algorithms do not work for this restricted version of sparse PCA, then it will not work for more general models, e.g. with general covariance or multiple spikes.
We now describe the model.

The Wishart model of Sparse PCA, also known as the Spiked Covariance model, was originally proposed by \cite{johnstone_lu2009}. In this model, we observe $m$ vectors $v_1, \ldots, v_m \in {\mathbb R}^d$ from the distribution $\mathcal{N}(0, I_d + \lambda uu^T)$ where $u$ is a $k$-sparse unit vector, that is, $\norm{u}_0 \le k$ and we would like to recover the principal component $u$. Here, the sparsity of a vector is the number of nonzero entries and $\lambda$ is known as the signal-to-noise ratio.

As the signal to noise ratio $\lambda$ gets lower, it becomes harder and maybe even impossible to recover $u$ since the signature left by $u$ in the data becomes fainter. However, it's possible that this may be mitigated if the number of samples $m$ grows. Therefore, there is a tradeoff between $m, d, k$ and $\lambda$ at play here. Algorithms proposed earlier have been able to recover $u$ at various regimes.
For example, if the number of samples is really large, namely $m \gg \max(\frac{d}{\lambda}, \frac{d}{\lambda^2})$, then standard PCA will work. If this is not the case, we may still be able to recover $u$ by assuming that the sparsity is not too large compared to the number of samples, namely $m \gg \frac{k^2}{\lambda^2}$. To do this, we use a variant of standard PCA known as diagonal thresholding. Similar results have been obtained for various regimes, while some regimes have resisted attack by algorithms.

Our results here complete the picture by showing that in the regimes that have so far resisted attack by efficient algorithms, the powerful Sum of Squares algorithms also cannot recover the principal component. We now state our theorem informally, postponing the formal statement to \cref{cor: spca_main}.

\begin{theorem}\label{thm: spca_main_informal}
    For the Wishart model of Sparse PCA, sub-exponential time SoS algorithms fail to recover the principal component when the number of samples $m \ll \min(\frac{d}{\lambda^2}, \frac{k^2}{\lambda^2})$.
\end{theorem}

In particular, this theorem resolves an open problem posed by \cite{ma_wigderson_15} and \cite{hop17, hop18}.

In almost all other regimes, algorithms to recover the principal component $u$ exist. We give a summary of such algorithms shortly, captured succinctly in \cref{fig: spca_thresholds}.
We say almost all other regimes because there is one interesting regime, namely $\frac{d}{\lambda^2} \le m \le \frac{\min(d, k)}{\lambda}$ marked by light green in \cref{fig: spca_thresholds}, where we can show that information theoretically, we cannot recover $u$ but it's possible to do hypothesis testing of Sparse PCA. That is, in this regime, we can distinguish purely random unspiked samples from the spiked samples. However, we will not be able to recover the principal component even if we use an exponential time brute force algorithm.

\begin{figure}[!h]
    \centering
    \begin{subfigure}{\textwidth}
        \centering
        \includegraphics[scale=.4, trim={0 0 0 0},clip]{SNRaboveOne_og}
        \caption{SNR $\lambda \ge 1$}
        \label{fig: spca_thresholds1}
       
       
       
    \end{subfigure}%

    \begin{subfigure}{\textwidth}
        \centering
        \includegraphics[scale=.4, trim={0 0 0 0},clip]{SNRbelowOne_og}
        \caption{SNR $\lambda < 1$}
        \label{fig: spca_thresholds2}
       
       
       
    \end{subfigure}
    \caption{Computational barrier diagram for Sparse PCA}
    \label{fig: spca_thresholds}
\end{figure}

Now, we state our results a bit more formally.
First, we will assume that the entries of $u$ are in $\{-\frac{1}{\sqrt{k}}, 0, \frac{1}{\sqrt{k}}\}$ chosen such that the sparsity is $k$ (and hence, the norm is $1$). Note importantly that this assumption is only strengthening our result: If SoS cannot solve this problem even for this specific $u$, it cannot do any better for the general problem with arbitrary $u$.

Let the vectors from the given dataset be $v_1, \ldots, v_m$. Let them form the rows of a matrix $S \in {\mathbb R}^{m \times d}$.
Let $\Sigma = \frac{1}{m} \sum_{i = 1}^m v_iv_i^T$ be the sample covariance matrix. Then the standard PCA objective is to maximize $x^T\Sigma x$ and recover $x = \sqrt{k}u$. Therefore, the sparse PCA problem can be rephrased as
\[\text{maximize } m\cdot x^T\Sigma x = \sum_{i = 1}^m \ip{x}{v_i}^2\text{ such that }x_i^3 = x_i\text{ for all $i \le d$ and } \sum_{i = 1}^d x_i^2 = k\]
where the program variables are $x_1, \ldots, x_d$.
The constraint $x_i^3 = x_i$ enforces that the entries of $x$ are in $\{-1, 0, 1\}$ and along with these constraints, the last condition $\sum_{i = 1}^d x_i^2 = k$ enforces $k$-sparsity (but we remark that, due to technical reasons, we will only satisfy this condition up to $o(1)$ error in our lower bounds). Then, the vector $u$ can be recovered by setting $u = \frac{1}{\sqrt{k}} x$.

Now, we will consider the series of convex relaxations for Sparse PCA obtained by SoS algorithms. In particular, we will consider SoS degree of $d^{\varepsilon}$ for a small constant $\varepsilon > 0$. Note that this corresponds to SoS algorithms of subexponential running time in the input size $d^{O(1)}$.

Our main result states that for choices of $m$ below a certain threshold, when the vectors $v_1, \ldots, v_m$ are sampled from the unspiked standard Gaussian $\mathcal{N}(0, I_d)$, then sub-exponential time SoS algorithms will have optimal value close to $m + m\lambda$. This is also the optimal value in the case when the vectors $v_1, \ldots, v_m$ are indeed sampled from the spiked Gaussian $\mathcal{N}(0, I_d + \lambda uu^T)$ and $x = \sqrt{k}u$.
Therefore, SoS is unable to distinguish $\mathcal{N}(0, I_d)$ from $\mathcal{N}(0, I_d + \lambda uu^T)$ and hence cannot solve sparse PCA. Formally,

\begin{theorem}\label{cor: spca_main}
    For all sufficiently small constants $\varepsilon > 0$, suppose $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}, m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$, and for some $A > 0$, $d^{A} \le k \le d^{1 - A\varepsilon}, \frac{\sqrt{\lambda}}{\sqrt{k}} \le d^{-A\varepsilon}$, then for an absolute constant $C > 0$, with high probability over a random $m \times d$ input matrix $S$ with Gaussian entries, the sub-exponential time SoS algorithm of degree $d^{C\varepsilon}$ for sparse PCA has optimal value at least $m + m\lambda - o(1)$.
\end{theorem}


In other words, sub-exponential time SoS cannot certify that for a random dataset with Gaussian entries, there is no unit vector $u$ with $k$ nonzero entries and $m \cdot u^T\Sigma u \approx m + m\lambda$.

A few remarks are in order.
\begin{enumerate}
    \item Note here that $m + m\lambda$ is approximately the value of the SoS program when the input vectors $v_1, \ldots, v_m$ are indeed sampled from the spiked model $\mathcal{N}(0, I_d + \lambda uu^T)$ and $x = \sqrt{k}u$. Therefore, sub-exponential time SoS is unable to distinguish a completely random distribution from the spiked distribution and hence is unable to solve sparse PCA.
    \item The constant $A$ can be thought of as $\approx 0$ and it appears for technical reasons, to ensure that we have sufficient decay in our bounds. In particular, most values of $k, \lambda$ fall under the conditions of the theorem.
    \item For technical reasons, the constraint $\sum_{i = 1}^d x_i^2 = k$ is satisfied up to $o(1)$ error in our lower bounds. We leave to future work the problem of satisfying this constraint exactly.
\end{enumerate}

Informally, our main result says that when $m \ll \min\left(\frac{d}{\lambda^2}, \frac{k^2}{\lambda^2}\right)$, then subexponential time SoS cannot recover the principal component $u$. This is the content of \cref{thm: spca_main_informal}.

To show our results, we use the strategy from \cref{sec: strategy_for_sos_lower_bounds}.
For the Wishart model of Sparse PCA, we use the following distributions.
\begin{restatable}{itemize}{SPCAdistributions}
   
        \item Random distribution $\nu$: $v_1, \ldots, v_m$ are sampled from $\mathcal{N}(0, I_d)$ and we take $S$ to be the $m \times d$ matrix with rows $v_1, \ldots, v_m$.
        \item Planted distribution $\mu$: Sample $u$ from $\{-\frac{1}{\sqrt{k}}, 0, \frac{1}{\sqrt{k}}\}^d$ where the values are taken with probabilities $\frac{k}{2d}, 1 - \frac{k}{d}, \frac{k}{2d}$ respectively. Then sample $v_1, \ldots, v_m$ as follows. For each $i \in [m]$, with probability $\Delta$, sample $v_i$ from $\mathcal{N}(0, I_d + \lambda uu^T)$ and with probability $1 - \Delta$, sample $v_i$ from $\mathcal{N}(0, I_d)$. Finally, take $S$ to be the $m \times d$ matrix with rows $v_1, \ldots, v_m$.
       
\end{restatable}

In \cref{sec: spca_qual}, we compute the SoS solution obtained by pseudo-calibration. We prove the following theorem.

\begin{restatable}{theorem}{SPCAmain}\label{thm: spca_main}
    There exists a constant $C > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $m \le \frac{d^{1 - \varepsilon}}{\lambda^2}, m \le \frac{k^{2 - \varepsilon}}{\lambda^2}$, and there exists a constant $A$ such that $0 < A < \frac{1}{4}$, $d^{4A} \le k \le d^{1 - A\varepsilon}$, and $\frac{\sqrt{\lambda}}{\sqrt{k}} \le d^{-A\varepsilon}$, then with high probability, the SoS solution given by pseudo-calibration for degree $d^{C\varepsilon}$ Sum of Squares is feasible.
\end{restatable}

Since we use an average case distribution, this SoS lower bound is a lower bound for certification. An overview of our proof is in \cref{sec: global_approach}.  From this theorem, \cref{cor: spca_main} follows as a corollary.

\paragraph{Prior work on algorithms}

Due to its widespread importance, a tremendous amount of work has been devoted to obtaining algorithms for sparse PCA, both theoretically and practically, \cite{amini_wainwright2008, ma2013sparse, krauthgamer2015, deshpande2016, wang2016statistical, berthet2013complexity, ma_wigderson_15, diakonikolas2017statistical, hop17, brennan2019optimal, ding2019subexponential,  chowdhury2020approximation, d2020sparse} to cite a few.

We now place our result in the context of known algorithms for Sparse PCA and explain why it offers tight tradeoffs between approximability and inapproximability.
Between this work and prior works, we completely understand the parameter regimes where sparse PCA is easy or conjectured to be hard up to polylogarithmic factors. In \cref{fig: spca_thresholds1} and \cref{fig: spca_thresholds2}, we assign the different parameter regimes into the following categories.
\begin{itemize}
    \item Diagonal thresholding: In this regime, Diagonal thresholding \cite{johnstone_lu2009, amini_wainwright2008} recovers the sparse vector. Covariance thresholding \cite{krauthgamer2015, deshpande2016} and SoS algorithms \cite{sparse_pca_focs20} can also be used in this regime. The benefits of these alternate algorithms are that covariance thresholding has better dependence on logarithmic factors and SoS algorithms work in the presence of adversarial errors.
    \item Vanilla PCA: Vanilla PCA (i.e. standard PCA) can recover the vector, i.e. we do not need to use the fact that the vector is sparse (see e.g. \cite{berthet2013, sparse_pca_focs20}).
    \item Spectral: An efficient spectral algorithm recovers the sparse vector (see e.g. \cite{sparse_pca_focs20}).
    \item Can test but not recover: A simple spectral algorithm can solve the hypothesis testing version of Sparse PCA but it is information theoretically impossible to recover the sparse vector \cite[Appendix E]{sparse_pca_focs20}.
    \item Hard: A regime where it is conjectured to be hard for algorithms to recover the sparse principal component. We discuss this in more detail below.
\end{itemize}

In \cref{fig: spca_thresholds1} and \cref{fig: spca_thresholds2}, the regimes corresponding to Diagonal thresholding, Vanilla PCA and Spectral are dark green, while the regimes corresponding to Spectral* (i.e. Can test but not recover) and Hard are light green and red respectively.

\paragraph{Prior work on hardness}

Prior works have explored statistical query lower bounds \cite{brennan2020statistical}, basic SDP lower bounds \cite{krauthgamer2015}, reductions from conjectured hard problems \cite{berthet2013, berthet2013complexity, brennan2019optimal, gao2017sparse, wang2016statistical}, lower bounds via the low-degree conjecture \cite{ding2019subexponential, sparse_pca_focs20}, lower bounds via statistical physics \cite{ding2019subexponential, arous2020free}, etc.
We note that threshold behaviors similar to ours have been predicted by \cite{sparse_pca_focs20}, but importantly, they assume a conjecture known as the low-degree likelihood conjecture. Similarly, many of these other lower bounds rely on various conjectures. To put this in context, the low-degree likelihood conjecture is a stronger assumption than P $\neq$ NP. In contrast, our results are unconditional and do not assume any conjectures.

Compared to these other lower bounds, there have only been a few prior works on lower bounds against SoS algorithms \cite{krauthgamer2015, berthet2013, ma_wigderson_15} which are only for degree $2$ and degree $4$ SoS. In particular, degree $2$ SoS lower bounds have been studied in \cite{krauthgamer2015, berthet2013} although they don't state it this way. Moreover, \cite{ma_wigderson_15} obtained degree $4$ SoS lower bounds but they were very lossy, i.e. they hold for a strict subset of the \textit{Hard} regime $m \ll \frac{k^2}{\lambda^2}$ and $m \ll \frac{d}{\lambda^2}$. Moreover, the ideas used in these prior works do not generalize for higher degrees.
The lack of other SoS lower bounds can be attributed to the difficulty in proving such lower bounds. In this paper, we vastly strengthen these known results and show almost-tight lower bounds for SoS algorithms of degree $d^{\varepsilon}$ which correspond to sub-exponential running time $d^{d^{O(\varepsilon)}}$.
We note that SoS algorithms get stronger as the degree increases, therefore our results immediately imply these prior results and even in the special case of degree $4$ SoS, we improve the known lossy bounds. In summary, \cref{cor: spca_main} subsumes all these earlier known results and is a vast improvement over prior known SoS lower bounds which provides compelling evidence for the hardness of Sparse PCA in this parameter range.



The work \cite{hop17} also states SoS lower bounds for Sparse PCA but it differs from our work in three important aspects. First, they handle the related but qualitatively different Wigner model of Sparse PCA. Their techniques fail for the Wishart model of Sparse PCA, which is more natural in practice. We overcome this shortcoming and work with the Wishart model. We emphasize that their techniques are insufficient to handle this generality and overcoming this is far from being a mere technicality. On the other hand, our techniques can easily recover their results.
Second, while they sketch a high level proof overview for their lower bound, they don't give a proof. On the other hand, our proofs are fully explicit.
Finally, they assume the input distribution has entries in $\{\pm 1\}$, that is, they work with the $\pm 1$ variant of PCA.
On the other hand, we work with the more realistic setting where the distribution is $\mathcal{N}(0, 1)$.
Again, our techniques can easily recover their results as well.

\section{Tensor PCA}

We use our techniques to also obtain strong results for the related Tensor Principal Component Analysis (Tensor PCA) problem.
Tensor PCA, originally introduced by \cite{richard2014statistical}, is a generalization of PCA to higher order tensors. Formally, given an order $k$ tensor of the form $\lambda u^{\otimes k} + B$ where $u \in {\mathbb R}^n$ is a unit vector and $B \in {\mathbb R}^{[n]^k}$ has independent Gaussian entries, we would like to recover the principal component $u$. Here, $\lambda$ is known as the signal-to-noise ratio.

Tensor PCA is a remarkably useful statistical and computational technique to exploit higher order moments of the data.
It was originally envisaged to be applied in latent variable modeling and indeed, it has found multiple applications in this context (e.g. \cite{anandkumar2014tensor, kivva2021learning, anandkumar2014analyzing}). Here, a tensor containing statistics of the input data is computed and then it's decomposed in order to recover the latent variables.
Because of the technique's versatility, it has gathered a lot of attention in machine learning with applications in topic modeling, video processing, collaborative filtering,  community detection, etc. (see e.g. \cite{hsu2012spectral, anandkumar2014guaranteed, richard2014statistical, anandkumar2014tensor, anandkumar2014analyzing, duchenne2011tensor, li2010tensor} and references therein.)

For Tensor PCA, similar to sparse PCA, there has been wide interest in the community to study algorithms (e.g. \cite{arous2020algorithmic, tensorpca16, HopSS15, hopkins2016fast, richard2014statistical, zheng2015interpolating, wein2019kikuchi, kim2017community, anandkumar2017homotopy}) as well as approximability and hardness (e.g. \cite{montanari2015limitation, kunisky19notes, brennan2020reducibility, hop17}, a more detailed overview is presented after stating our main results).
It's worth noting that many of these hardness results are conditional, that is, they rely on various conjectures, sometimes stronger than P $\neq$ NP.
Moreover, there has been widespread interest from the statistics community as well, e.g. \cite{jagannath2020statistical, perry2016statistical, lesieur2017statistical, chen2019phase, chen2018phase}, due to fascinating connections to random matrix theory and statistical physics.

In this work, we study the performance of sub-exponential time Sum of Squares algorithms for Tensor PCA.
Our main result is stated informally below and formally in \cref{cor: tpca_main}.

\begin{theorem}\label{thm: tpca_main_informal}
    For Tensor PCA, sub-exponential time SoS algorithms fail to recover the principal component when the signal to noise ratio $\lambda \ll n^{\frac{k}{4}}$.
\end{theorem}

In particular, this resolves an open question posed by the works \cite{HopSS15, tensorpca16, hop17, hop18}.

Let's make this theorem formal. Recall that we are given an order $k$ tensor $A$ of the form $A = \lambda u^{\otimes k} + B$ where $u \in {\mathbb R}^n$ is a unit vector and $B \in {\mathbb R}^{[n]^k}$ has independent Gaussian entries and we would like to recover the principal component $u$.
Tensor PCA can be rephrased by the program
\[\text{maximize }\ip{A}{x^{\otimes k}} = \ip{A}{\underbrace{x\otimes\ldots\otimes x}_{\text{$k$ times}}}\text{ such that } \sum_{i = 1}^n x_i^2 = 1\]
where the program variables are $x_1, \ldots, x_n$.
The principal component $u$ will then just be the returned solution $x$.
Just like in Sparse PCA, we remark that for technical reasons, we will satisfy the unit vector condition only up to $o(1)$ error in our lower bounds and satisfying the condition exactly is left for future work.
We will again consider sub-exponential time SoS algorithms, in particular degree $n^{\varepsilon}$ SoS, for this problem. This is sub-exponential time because the input size is $n^{O(1)}$.

We then show that if the signal to noise ratio $\lambda$ is below a certain threshold, then sub-exponential time SoS for the unspiked input $A \sim \mathcal{N}(0, I_{[n]^k})$ will have optimal value close to $\lambda$, which is also the optimal value in the spiked case when $A = \lambda u^{\otimes k} + B, B\sim \mathcal{N}(0, I_{[n]^k})$ and $x = u$. In other words, SoS cannot distinguish the unspiked and spiked distributions and hence cannot recover the principal component $u$.

\begin{theorem}\label{cor: tpca_main}
    Let $k \ge 2$ be an integer. For all sufficiently small $\varepsilon > 0$, if $\lambda \le n^{\frac{k}{4} - \varepsilon}$, for an absolute constant $C > 0$, with high probability over a random tensor $A \sim\mathcal{N}(0, I_{[n]^k})$, the sub-exponential time SoS algorithm of degree $n^{C\varepsilon}$ for Tensor PCA has optimal value at least $\lambda - o(1)$.
\end{theorem}


Therefore, sub-exponential time SoS cannot certify that for a random tensor $A$ sampled from $\mathcal{N}(0, I_{[n]^k})$, there is no unit vector $u$ such that $\ip{A}{\underbrace{u \otimes\ldots\otimes u}_{\text{$k$ times}}} \approx \lambda$.

We again remark that when the tensor $A$ is actually sampled from the spiked model $A = \lambda u^{\otimes k} + B$, the optimal value of the SoS program is approximately $\lambda$ when $x = u$. Therefore, this shows that sub-exponential time SoS algorithms cannot solve Tensor PCA.

Informally, the theorem says that when the signal to noise ratio $\lambda \ll n^{\frac{k}{4}}$, SoS algorithms cannot solve Tensor PCA, as stated in \cref{thm: tpca_main_informal}.

To show our results for Tensor PCA, we apply the strategy from \cref{sec: strategy_for_sos_lower_bounds} where we use the following distributions. Let $k \ge 2$ be an integer.
\begin{restatable}{itemize}{TPCAdistributions}
   
        \item Random distribution $\nu$: Sample $A$ from $\mathcal{N}(0, I_{[n]^k})$.
        \item Planted distribution $\mu$: Let $\lambda,\Delta > 0$. Sample $u$ from $\{-\frac{1}{\sqrt{\Delta n}}, 0, \frac{1}{\sqrt{\Delta n}}\}^n$ where the values are taken with probabilities $\frac{\Delta}{2}, 1 - \Delta, \frac{\Delta}{2}$ respectively. Then sample $B$ from $\mathcal{N}(0, I_{[n]^k})$. Set $A = B + \lambda \tens{u}{k}$.
       
\end{restatable}

In \cref{sec: tpca_qual}, we apply pseudo-calibration and we prove the following theorem.

\begin{restatable}{theorem}{TPCAmain}\label{thm: tpca_main}
    Let $k \ge 2$ be an integer. There exist constants $C,C_{\Delta} > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $\lambda \le n^{\frac{k}{4} - \varepsilon}$ and $\Delta = n^{-C_{\Delta}\varepsilon}$ then with high probability, the SoS solution given by pseudo-calibration for degree $n^{C\varepsilon}$ Sum of Squares is feasible.
\end{restatable}

This theorem can also be naturally interpreted as an SoS lower bound for the certification problem of Tensor PCA. A sketch of our proof follows in \cref{sec: global_approach}. From this theorem, \cref{cor: tpca_main} follows as a corollary.

\paragraph{Prior work}
Algorithms for Tensor PCA have been studied in the works \cite{arous2020algorithmic, tensorpca16, HopSS15, hopkins2016fast, richard2014statistical, zheng2015interpolating, wein2019kikuchi, kim2017community, anandkumar2017homotopy}. It was shown in \cite{tensorpca16} that the degree $q$ SoS algorithm certifies an upper bound of $\frac{2^{O(k)} (n \cdot \text{polylog}(n))^{k/4}}{q^{k/4 - 1/2}}$ for the Tensor PCA problem. When $q = n^{\varepsilon}$ this gives an upper bound of $n^{\frac{k}{4} - O(\varepsilon)}$. Therefore, our result is tight, giving insight into the computational threshold for Tensor PCA.

Lower bounds for Tensor PCA have been studied in various forms including statistical query lower bounds \cite{brennan2020statistical, dudeja2021statistical}, reductions from conjectured hard problems \cite{zhang2018tensor, brennan2020reducibility}, lower bounds from the low-degree conjecture \cite{hop17, hop18, kunisky19notes}, evidence based on the landscape behavior \cite{arous2019landscape, montanari2015limitation}, etc. Compared to a lot of these works which rely on various conjectures, we remark that our lower bounds are unconditional and do not rely on any conjectures.

In \cite{hop17}, similar to Sparse PCA, they state a similar theorem for a different variant of Tensor PCA. However, they do not give a proof whereas we give explicit proofs.
In particular, they state their result without proof for the $\pm{1}$ variant of Tensor PCA whereas we work with the more realistic setting where the distribution is $\mathcal{N}(0, 1)$. We remark that their techniques do not recover our results but on the other hand, our techniques can recover theirs.


\section{Planted Slightly Denser Subgraph}

In the planted dense subgraph problem, we are given a random graph $G$ where a dense subgraph of size $k$ has been planted and we are asked to find this planted dense subgraph.
This is a natural generalization of the $k$-clique problem \cite{karp1972reducibility} and has been subject to a long line of work over the years (e.g. \cite{feige1997densest, feige2001dense, khot2006ruling, bhaskara2010detecting, bhaskara2012polynomial, braverman2017eth, manurangsi2017almost}).
In this work, we consider the following certification variant of planted dense subgraph.

\begin{quote}
	\emph{Given a random graph $G$ sampled from the Erd\H{o}s\xspace-R\'enyi\xspace model $G(n, \frac{1}{2})$, certify an upper bound on the edge density of the densest subgraph on $k$ vertices.}
\end{quote}

We show a high degree SoS lower bound for this problem using the strategy from \cref{sec: strategy_for_sos_lower_bounds}. In particular, we use the following distributions.
\begin{restatable}{itemize}{PLDSdistributions}

	\item Random distribution $\nu$: Sample $G$ from $G(n, \frac{1}{2})$
	\item Planted distribution $\mu$: Let $k$ be an integer and let $p > \frac{1}{2}$. Sample a graph $G'$ from $G(n, \frac{1}{2})$. Choose a random subset $S$ of the vertices, where each vertex is picked independently with probability $\frac{k}{n}$. For all pairs $i, j$ of vertices in $S$, rerandomize the edge $(i, j)$ where the probability of $(i, j)$ being in the graph is now $p$. Set $G$ to be the resulting graph.

\end{restatable}
In \cref{sec: plds_qual}, we compute the candidate SoS solution obtained via pseudo-calibration. Our main theorem is as follows, with a proof sketch following in \cref{sec: global_approach}.

\begin{restatable}{theorem}{PLDSmain}\label{thm: plds_main}
	Let $C_p > 0$. There exists a constant $C > 0$ such that for all sufficiently small constants $\varepsilon > 0$, if $k \le n^{\frac{1}{2} - \varepsilon}$ and $p =  \frac{1}{2} + \frac{n^{-C_p\varepsilon}}{2}$, then with high probability, the candidate solution given by pseudo-calibration for degree $n^{C\varepsilon}$ Sum of Squares is feasible.
\end{restatable}


\paragraph{Related work}
For many different parameter regimes of the random and planted distributions (an example being planting $G_{k, q}$ in $G_{n, p}$ for constants $p < q$), and when $k = o(\sqrt{n})$, the hardness of the easier distinguishing version of the planted dense subgraph problem has been posed as a formal conjecture (often referred to as the PDS conjecture) before in the literature (see e.g., \cite{hajek2015computational, chen2014statistical, brennan2018reducibility, brennan2019universality}). This has also led to many reductions to other problems \cite{brennan2019optimal}, although it's not clear if these reductions can be made in the SoS framework without loss in the parameter dependence.

In our case, we consider the planted slightly denser subgraph version where for $k \le n^{\frac{1}{2} - \varepsilon}$, we plant a subgraph of density $\frac{1}{2} + \frac{1}{n^{O(\varepsilon)}}$, i.e. $p = \frac{1}{2}, q = \frac{1}{2} + \frac{1}{n^{O(\varepsilon)}}$. This has been widely believed to require sub-exponential time. Our work provides strong evidence towards this by exhibiting unconditional lower bounds against the powerful SoS hierarchy, even if we consider $n^{O(\varepsilon)}$ levels, which corresponds to $n^{n^{O(\varepsilon)}}$ running time! We expect this to lead to this problem being used as a natural starting point for reductions to show sub-exponential time hardness for various problems.

Within the SoS literature, \cite{BHKKMP16} show that for $k \le n^{\frac{1}{2} - \varepsilon}$ for a constant $\varepsilon > 0$, the degree $o(\log n)$ Sum of Squares cannot distinguish between a fully random graph sampled from $G(n, \frac{1}{2})$ and a random graph which has a planted $k$-clique. This implies that degree $o(\log n)$ SoS cannot certify an edge density better than $1$ for the densest $k$-subgraph if $k \le n^{\frac{1}{2} - \varepsilon}$.

In \cref{thm: plds_main}, we show that for $k \le n^{\frac{1}{2} - \varepsilon}$ for a constant $\varepsilon > 0$, degree $n^{\Omega(\varepsilon)}$ SoS cannot certify an edge density better than $\frac{1}{2} + \frac{1}{n^{O(\varepsilon)}}$. The degree of SoS in our setting, $n^{\Omega(\varepsilon)}$, is vastly higher than the earlier known result which uses degree $o(\log n)$. To the best of our knowledge, this is the first result that proves such a high degree lower bound for this problem.

We remark that when we take $k = n^{\frac{1}{2} - \varepsilon}$,  the true edge density of the densest $k$-subgraph is $\frac{1}{2} + \frac{\sqrt{\log(n/k)}}{\sqrt{k}} + \operatorname{o}(\frac{1}{\sqrt{k}}) \approx \frac{1}{2} + \frac{1}{n^{1/4 - \varepsilon/2}}$ as was shown in \cite[Corollary 2]{gamarnik2019landscape} whereas, by \cref{thm: plds_main}, the SoS optimum is as large as $\frac{1}{2} + \frac{1}{n^{\varepsilon}}$. This highlights a significant difference in the optimum value.

\section{Our approach}\label{sec: global_approach}

In this section, we briefly describe how to prove \cref{thm: spca_main}, \cref{thm: tpca_main} and \cref{thm: plds_main}. We naturally start with pseudocalibration and all constraints except positivity are easily shown to hold by construction. To show positivity and hence the lower bound,  we will essentially apply a general meta-theorem called the machinery. The machinery enables us to show SoS lower bounds for certain kinds of ``noisy'' problems.

In this work, we state and use the machinery, whose proof can be found in the original work where it appeared \cite{potechin2020machinery}. To show PSDness, the machinery constructs certain \emph{coefficient matrices} from the moment matrix $\Lambda$ and gives conditions on these coefficient matrices which are sufficient to guarantee that $\Lambda$ is PSD with high probability. Some of the ideas involved in the machinery are a generalization of the techniques used to prove the SoS lower bound for planted clique \cite{BHKKMP16}. In this section, we give an informal sketch of the machinery. We also motivate some of the conditions that arise.

\paragraph{Shapes and graph matrices}
We start by describing shapes and graph matrices, which were originally introduced by \cite{BHKKMP16, medarametla2016bounds} and later generalized in \cite{ahn2016graph}. They will be convenient for our analysis.

Shapes $\alpha$ are graphs that contain extra information about the vertices. Corresponding to each shape $\alpha$, there is a matrix-valued function $M_{\alpha}$ (i.e. a matrix whose entries depend on the input) that we call a graph matrix. Graph matrices are analogous to a Fourier basis, but for matrix-valued functions that exhibit a certain kind of symmetry. In our setting, $\Lambda$ will be such a matrix-valued function, so we can decompose $\Lambda$ as a linear combination of graph matrices $\Lambda = \sum_{\text{shapes } \alpha}{\lambda_{\alpha}M_{\alpha}}$.

Shapes and graph matrices have several properties which make them very useful to work with. First, $\norm{M_{\alpha}}$ can be bounded with high probability in terms of simple combinatorial properties of the shape $\alpha$. Second, if two shapes $\alpha$ and $\beta$ match up in a certain way, we can combine them to form a larger shape $\alpha \circ \beta$. We call this operation shape composition. Third, each shape $\alpha$ has a canonical decomposition into three shapes, the left, middle and right parts of $\alpha$, which we call $\sigma$, $\tau$, and ${\sigma'}^T$. For this canonical decomposition, we have that $\alpha = \sigma \circ \tau \circ {\sigma'}^T$ and $M_{\alpha} \approx M_{\sigma}M_{\tau}M_{{\sigma'}^T}$. This decomposition is crucial for our analysis.

\paragraph{A general framework for SoS lower bounds}
We now sketch the strategy of the machinery.
\begin{enumerate}
   
    \item Decompose the moment matrix $\Lambda$ as a linear combination $\Lambda = \sum_{\text{shapes } \alpha}{\lambda_{\alpha}M_{\alpha}}$ of graph matrices $M_{\alpha}$.
    \item For each shape $\alpha$, decompose $\alpha$ into a left part $\sigma$, a middle part $\tau$, and a right part ${\sigma'}^T$.
    \item Based on the coefficients $\lambda_{\alpha}$ and the decompositions of the shapes $\alpha$ into left, middle, and right parts, construct coefficient matrices $H_{Id_U}$ and $H_{\tau}$.
   
    \item Based on the coefficient matrices $H_{Id_U}$ and $H_{\tau}$, obtain an approximate PSD decomposition of $\Lambda$.
    \item Show that the error terms (which we call intersection terms) can be bounded by the approximate PSD decomposition of $\Lambda$.
\end{enumerate}
This is broadly similar to the work of \cite{BHKKMP16} who showed SoS lower bounds for the planted clique problem.

The machinery shows that this analysis will succeed by distilling it as three conditions on the coefficient matrices.
The rough blueprint to use the machinery to prove SoS lower bounds is as follows.
\begin{enumerate}
    \item Construct a candidate moment matrix $\Lambda$.
    \item Decompose the moment matrix $\Lambda$ as a linear combination $\Lambda = \sum_{\text{shapes } \alpha}{\lambda_{\alpha}M_{\alpha}}$ of graph matrices $M_{\alpha}$ (akin to Fourier decomposition) and find the corresponding coefficient matrices.
    \item Verify the required conditions on the coefficient matrices.
\end{enumerate}

\subsubsection{A sketch of the intuition behind the conditions}\label{ideadescriptionsubsection}

We now motivate and sketch the conditions we present in the machinery.

\paragraph{Giving an approximate PSD factorization}
As discussed above, we decompose the moment matrix $\Lambda$ as a linear combination $\Lambda = \sum_{\text{shapes } \alpha}{\lambda_{\alpha}M_{\alpha}}$ of graph matrices $M_{\alpha}$. We then decompose each $\alpha$ into left, middle, and right parts $\sigma$, $\tau$, and ${\sigma'}^T$. We now have that
\[
\Lambda = \sum_{\alpha = \sigma \circ \tau \circ {\sigma'}^T}{\lambda_{\sigma \circ \tau \circ {\sigma'}^T}M_{\sigma \circ \tau \circ {\sigma'}^T}}
\]

We first consider the terms $\sum_{\sigma, \sigma'} \lambda_{\sigma \circ \sigma'^T}M_{\sigma \circ \sigma'^T} \approx \sum_{\sigma, \sigma'} \lambda_{\sigma \circ \sigma'^T}M_{\sigma} M_{\sigma'^T}$ where $\tau$ corresponds to an identity matrix and can be ignored.

If there existed real numbers $v_{\sigma}$ for all left shapes $\sigma$ such that $\lambda_{\sigma \circ \sigma'^T} = v_{\sigma}v_{\sigma'}$, then we would have
\[
\sum_{\sigma, \sigma'} \lambda_{\sigma \circ \sigma'^T}M_{\sigma} M_{\sigma'^T} = \sum_{\sigma, \sigma'} v_{\sigma}v_{\sigma'}M_{\sigma} M_{\sigma'^T} = (\sum_{\sigma} v_{\sigma}M_{\sigma})(\sum_{\sigma} v_{\sigma}M_{\sigma})^T \succeq 0
\]
which shows that the contribution from these terms is positive semidefinite. In fact, this turns out to be the case for the planted clique analysis. However, this may not hold in general. To handle this, we note that the existence of $v_{\sigma}$ can be relaxed as follows: Let $H$ be the matrix with rows and columns indexed by left shapes $\sigma$ such that $H(\sigma, \sigma') = \lambda_{\sigma \circ \sigma'^T}$. Up to scaling, $H$ will be one of our coefficient matrices. If $H$ is positive semidefinite then the contribution from these terms will also be positive semidefinite. In fact, this will be
the PSD mass condition of the main theorem.

\paragraph{Handling terms with a non-trivial middle part}

Unfortunately, we also have terms $\lambda_{\sigma \circ \tau \circ \sigma'^T}M_{\sigma \circ \tau \circ \sigma'^T}$ where $\tau$ is non-trivial. The strategy of the machinery is to charge these terms to other terms.
For the sake of simplicity, we will describe how to handle one term. A starting point is the following inequality. For a left shape $\sigma$, a middle shape $\tau$, a right shape $\sigma'^T$, and real numbers $a, b$,
\[(a M_{\sigma} - bM_{\sigma'}M_{\tau^T})(a M_{\sigma} - bM_{\sigma'}M_{\tau^T})^T \succeq 0\]
which rearranges to
\begin{align*}
    ab(M_{\sigma}M_{\tau}M_{\sigma'^T} + (M_{\sigma}M_{\tau}M_{\sigma'^T})^T) &\preceq a^2M_{\sigma}M_{\sigma^T} + b^2M_{\sigma'}M_{\tau^T}M_{\tau}M_{\sigma'^T}\\
    &\preceq a^2M_{\sigma}M_{\sigma^T} + b^2\norm{M_{\tau}}^2M_{\sigma'}M_{\sigma'^T}
\end{align*}

If $\lambda_{\sigma \circ \tau \circ \sigma'^T}^2\norm{M_{\tau}}^2 \le \lambda_{\sigma \circ \sigma^T}\lambda_{\sigma' \circ \sigma'^T}$, then we can choose $a, b$ such that $a^2 \le \lambda_{\sigma \circ \sigma^T}$, $ b^2 \norm{M_{\tau}}^2 \le \lambda_{\sigma' \circ \sigma'^T}$ and $ab = \lambda_{\sigma \circ \tau \circ \sigma'^T}$. This will approximately imply
\[\lambda_{\sigma \circ \tau \circ \sigma'^T}(M_{\sigma \circ \tau \circ \sigma'^T} + M_{\sigma \circ \tau \circ \sigma'^T}^T) \preceq \lambda_{\sigma \circ \sigma^T}M_{\sigma \circ \sigma^T} + \lambda_{\sigma' \circ \sigma'^T}M_{\sigma' \circ \sigma'^T}\]
which will give us a way to charge terms with a nontrivial middle part against terms with a trivial middle part.

While we could try to apply this inequality term by term, it is not strong enough to give us the main machinery result. Instead, they generalize this inequality to work with the entire set of shapes $\sigma, \sigma'$ for a fixed $\tau$. This will lead us to
the middle shape bounds condition.

\paragraph{Handling intersection terms}

There's one important technicality in the above calculations. Whenever we decompose $\alpha$ into left, middle, and right parts $\sigma$, $\tau$, and ${\sigma'}^T$, $M_{\sigma}M_{\tau}M_{{\sigma'}^T}$ is only approximately equal to $M_{\alpha} = M_{\sigma \circ \tau \circ {\sigma'}^T}$. All the other error terms have to be carefully handled in the analysis. We call these terms intersection terms.

We exploit the fact that these intersection terms themselves are graph matrices. Therefore, we recursively decompose them into $\sigma_2 \circ \tau_2 \circ \sigma_2'^T$ and apply the previous ideas. To do this methodically, the machinery employs several ideas such as the notion of intersection patterns and the generalized intersection tradeoff lemma. Properly handling the intersection terms is one of the most technically intensive parts of their work.
This analysis leads us to the intersection term bounds condition.

\paragraph{Applying the machinery}

To apply the machinery to our problems of interest, we verify the spectral conditions that our coefficients should satisfy and then we can use the main theorem. The Planted slightly denser subgraph application is straightforward and will serve as a good warmup to understand the machinery. In the applications to Tensor PCA and Sparse PCA, the shapes corresponding to the graph matrices with nonzero coefficients have nice structural properties that will be crucial for our analysis. We exploit this structure and use novel charging arguments to verify the conditions of the machinery. We do this in this work.

\section{Related work on Sum of Squares Lower Bounds for Certification Problems}

\cite{KothariMOW17} proved that for random constraint satisfaction problems (CSPs) where the predicate has a balanced pairwise independent distribution of solutions, with high probability, degree $\Omega(n)$ SoS is required to certify that these CSPs do not have a solution. While they don't state it in this manner, the pseudo-expectation values used by \cite{KothariMOW17} can also be derived using pseudo-calibration \cite{rajendran2018combinatorial, brown2020extended}. The analysis for showing that the moment matrix is PSD is very different. It is an interesting question whether or not it is possible to unify these analyses.

\cite{mohanty2020lifting} showed that it's possible to lift degree $2$ SoS solutions to degree $4$ SoS solutions under suitable conditions, and used it to obtain degree $4$ SoS lower bounds for average case $d$-regular Max-Cut and the Sherrington-Kirkpatrick problem. Their construction is inspired by pseudo-calibration and their analysis also goes via graph matrices.


\cite{kunisky2020} recently proposed a technique to lift degree $2$ SoS lower bounds to higher levels and applied it to construct degree $6$ lower bounds for the Sherrington-Kirkpatrick problem. Interestingly, their construction does not go via pseudo-calibration.

\section{Organization of the proofs}

We prove the Sherrington-Kirkpatrick lower bound, \cref{theo:sk-bounds}, in \cref{chap: sk}. The proofs for planted slightly denser subgraph, tensor PCA and sparse PCA, namely \cref{thm: plds_main}, \cref{thm: tpca_main} and \cref{thm: spca_main}, are split between \cref{chap: qual} and \cref{chap: quant}. The latter proofs are split into qualitative and quantitative versions. Qualitative theorem statements capture the essence of the inequalities we prove, and serve to illustrate the main forms of the bounds we desire, without getting lost in the details. Quantitative theorems on the other hand build on their qualitative counterparts by stating the precise bounds that are needed. In \cref{chap: qual}, we introduce the machinery and in \cref{sec: plds_qual}, \cref{sec: tpca_qual} and \cref{sec: spca_qual}, we qualitatively verify the conditions of the machinery for planted slightly denser subgraph, tensor PCA, and sparse PCA respectively. While these sections only verify the qualitative conditions, the results in these sections are precise and will be reused in \cref{chap: quant}, where we fully verify the conditions of the machinery in \cref{sec: plds_quant}, \cref{sec: tpca_quant} and \cref{sec: spca_quant}.

\section{The machinery}\label{quantitativetheoremstatementsection}

In this section, we describe the machinery we apply to show SoS lower bounds.
\input{machinery_statement}

\section{Qualitative bounds for Planted slightly denser subgraph}\label{sec: plds_qual}

\input{machinery/planted_ds_qual}

\section{Qualitative bounds for Tensor PCA}\label{sec: tpca_qual}

\input{machinery/tensor_pca_qual}

\section{Qualitative bounds for Sparse PCA}\label{sec: spca_qual}

\input{machinery/sparse_pca_qual}
\section{Planted slightly denser subgraph: Full verification}\label{sec: plds_quant}

\input{machinery/planted_ds_quant}

\section{Tensor PCA: Full verification}\label{sec: tpca_quant}

\input{machinery/tensor_pca_quant}

\section{Sparse PCA: Full verification}\label{sec: spca_quant}

\input{machinery/sparse_pca_quant}

\subsection{PAP planted distribution}\label{subsec:pap:dist}


We formally define the random and the planted distributions for the
Planted Affine Planes problem in the Gaussian and boolean
settings. These two (families of) distributions are required by the
pseudocalibration machinery in order to define a candidate
pseudoexpectation operator $\tilde{\EE}$. For the Gaussian setting, we have
the following distributions.

\begin{definition}[Gaussian PAP distributions]\label{def:prob:pap:gauss:dist}
  The Gaussian PAP distributions are as follows.
  \begin{enumerate}
      \item (Random distribution) $m$ i.i.d. vectors $d_u \sim \gauss{0}{I}$.
      \item (Planted distribution) A vector $v$ is sampled uniformly from $\left\{\pm \frac{1}{\sqrt{n}}\right\}^n$, as well as signs $b_u \in_{\text{R}} \{\pm 1\}$,
             and $m$ vectors $d_u$ are drawn from $\mathcal{N}(0, I)$ conditioned on $\ip{d_u}{v} = b_u$.
  \end{enumerate}
\end{definition}

For the boolean setting, we have the following distributions.
\begin{definition}[Boolean PAP distributions]\label{def:prob:pap:bool:dist}
  The boolean PAP distributions are as follows.
  \begin{enumerate}
      \item (Random distribution) $m$ i.i.d. vectors $d_u \in_{\text{R}} \{-1,+1\}^n$.
      \item (Planted distribution) A vector $v$ is sampled uniformly from $\left\{\pm \frac{1}{\sqrt{n}}\right\}^n$, as well as signs $b_u \in_{\text{R}} \{\pm 1\}$, and $m$ vectors $d_u$ are drawn from $\left\{\pm 1\right\}^n$ conditioned on $\ip{d_u}{v} = b_u$.
  \end{enumerate}
\end{definition}

\subsection{Pseudocalibration technique}\label{subsec:pseudo_calib_technique}

We will use the shorthand ${\mathbb E}_{\text{ra}}$ and ${\mathbb E}_{\text{pl}}$ for
the expectation under the random and planted distributions.
Pseudocalibration gives a method for constructing a candidate
pseudoexpectation operator $\tilde{\EE}$.
The idea behind pseudocalibration is that
${\mathbb E}_{\text{ra}} \tilde{\EE} f(v)$ should match with ${\mathbb E}_{\text{pl}} f(v)$ for
every low-degree test of the data $t = t(d) = t(d_1, \dots, d_m)$,
\[{\mathbb E}_{\text{ra}} t(d) \tilde{\EE} f(v) = {\mathbb E}_{\text{pl}} t(d) f(v) .\]
When pseudocalibrating, one can freely choose the ``outer'' basis in
which to express the polynomial $f(v)$, as well as the ``inner'' basis
of low-degree tests which should agree with the planted
distribution. Though we attempted to use alternate bases to simplify
the analysis, ultimately we opted for the standard choice of bases: a
Fourier basis for the inner basis in each setting (Hermite functions
for the Gaussian setting, parity functions for the boolean setting),
and the coordinate basis $v^I$ for the outer basis.

When the inner basis is orthonormal under the random distribution (as a Fourier basis is), the pseudocalibration condition
gives a formula for the coefficients of $\tilde{\EE} f(v)$ in the orthonormal basis (though it only gives the coefficients of the low-degree functions $t(d)$). Concretely, letting the inner basis be indexed by $\alpha \in {\mathcal F}$, as a function of $d$ the pseudocalibration condition enforces
\[ \tilde{\EE} f(v) = \displaystyle\sum_{\substack{\alpha \in {\mathcal F}: \\ \abs{\alpha} \leq n^\tau}} \left( {\mathbb E}_{\text{pl}} t_\alpha(d) f(v) \right)t_\alpha (d).\]
Here we use ``$\abs{\alpha} \leq n^\tau$'' to describe the set of low-degree tests. The pseudocalibration condition does not prescribe any coefficients for functions $t_\alpha(d)$ with $\abs{\alpha} > n^\tau$ and an economical choice is to set these coefficients to zero.

When pseudocalibrating, our pseudoexpectation operator is guaranteed to be linear, as the expression above is linear in $f$. It is guaranteed to satisfy all constraints of the form ``$f(v) =0$''. It will approximately satisfy constraints of the form ``$f(v, d) = 0$'', though only up to truncation error.

\begin{fact}[Proof in \cite{sklowerbounds}]\label{lem:pE-constraints}
  If $p(v)$ is a polynomial which is uniformly zero on the planted
  distribution, then $\tilde{\EE}[p]$ is the zero function. If $p(v,d)$ is a polynomial which is uniformly zero on the planted distribution, then the only nonzero Fourier coefficients of $\tilde{\EE}[p]$ are those with size between $n^\tau \pm \deg_d(p)$.
\end{fact}

Truncation
introduces a tiny error in the constraints. This error is handled in \cite{sklowerbounds}; we omit the details in this work for brevity.

For the pseudocalibration we truncate to only Fourier coefficients of
size at most $n^\tau$. The relationship between the parameters is $\delta \le c\tau \le c'\varepsilon$ where $c' < c < 1$ are absolute constants. We will assume that they are sufficiently small for all our proofs to go through.

Pseudocalibration also by default does not enforce the condition $\tilde{\EE}[1] = 1$. However, this is easily fixed by dividing the operator by $\tilde{\EE}[1]$. As will be pointed out in~\cref{rmk:pe-one}, w.h.p. in the unnormalized pseudocalibration, $\tilde{\EE}[1] = 1 + \operatorname{o}_n(1)$ and so the error introduced does not impact the statement of any lemmas.


\subsection{Gaussian setting pseudocalibration}\label{subsec:calib:gauss}


We start by computing the pseudocalibration for the Gaussian setting. Here the natural choice of Fourier basis is the Hermite polynomials. Let $\alpha \in ({\mathbb{N}}^n)^m$ denote a Hermite polynomial index. Define $\alpha! := \prod_{u,i} \alpha_{u,i}!$ and $\abs{\alpha} := \sum_{u, i} \alpha_{u,i}$ and $\abs{\alpha_u} := \sum_i \alpha_{u,i}$. We let $h_\alpha(d_1, \dots, d_m)$ denote an unnormalized Hermite polynomial, so that $h_{\alpha}/\sqrt{\alpha!}$ forms an orthonormal basis for polynomials in the entries of the vectors $d_1, \dots, d_m$, under the inner product $\ip{p}{q} = {\mathbb E}_{d_1, \dots, d_m \sim {\mathcal N}(0, I)} [p \cdot q]$.

We can view $\alpha$ as an $m\times n$ matrix of natural numbers, and with this view we also define $\alpha^\intercal \in ({\mathbb{N}}^m)^n$.
\begin{lemma}\label{lem:gaussian-pseudocal}
For any $I \subseteq [n]$, the pseudocalibration value is
\[\tilde{\EE} v^I = \displaystyle\sum_{\substack{\alpha: \abs{\alpha} \leq n^\tau,\\ \abs{\alpha_u} \text{ even}, \\ \abs{(\alpha^\intercal)_i} \equiv I_i \pmod{2}}} \left(\prod_{u = 1}^m h_{\abs{\alpha_u}}(1) \right)\cdot\frac{1}{n^{\abs{I}/2 + \abs{\alpha}/2}} \cdot\frac{h_{\alpha}(d_1, \dots, d_m)}{\alpha!}. \]
\end{lemma}
In words, the nonzero Fourier coefficients are those which have even row sums, and whose column sums match the parity of $I$.
\begin{proof}
The truncated pseudocalibrated value is defined to be
\[\tilde{\EE} v^I = \displaystyle\sum_{\alpha : \abs{\alpha} \leq n^\tau} \frac{h_{\alpha}(d_1, \dots, d_m)}{\alpha!} \cdot {\mathbb E}_{\text{pl}}[h_{\alpha}(d_1, \dots, d_m) \cdot v^I] \]
So we set about to compute the planted moments. For this computation, the following lemma is crucial. Here, we give a short proof of this lemma using generating functions. For a different combinatorial proof, see \cite{sklowerbounds}.


\begin{lemma}\label{lem:fixed-moments}
Let $\alpha \in {\mathbb{N}}^n$. When $v$ is fixed and $b$ is fixed (not necessarily $\pm1$) and $d \sim N(0, I)$ conditioned on $\ip{v}{d} = b\norm{v}$,
\[{\mathbb E}_{d}[h_{\alpha}(d)] = \frac{v^\alpha}{\norm{v}^{\abs{\alpha}}} \cdot h_{\abs{\alpha}}(b).\]
\end{lemma}
\begin{proof}
It suffices to prove the claim when $\norm{v} = 1$ since the left-hand side is independent of $\norm{v}$. Express $d = bv + (I - vv^\intercal)x$ where $x \sim N(0,I)$ is a standard normal variable. Now we want
\[{\mathbb E}_{x \sim N(0,I)} h_{\alpha}\left(bv + (I - vv^\intercal)x\right). \]
The Hermite polynomial generating function is
\[\displaystyle\sum_{\alpha \in {\mathbb{N}}^n} {\mathbb E}_{x \sim N(0,I)} h_{\alpha}\left(bv + (I - vv^\intercal)x\right)\frac{t^{\alpha}}{\alpha!} = {\mathbb E}_x\exp\left(\ip{bv + (I - vv^\intercal)x}{t} - \frac{\norm{t}_2^2}{2}\right)\]
\[= \int_{\mathbb{R}^n} \frac{1}{(2\pi)^{\frac{n}{2}}} \cdot \exp\left(\ip{bv + (I - vv^\intercal)x}{t} - \frac{\norm{t}_2^2}{2} - \frac{\norm{x}_2^2}{2}\right) \; dx. \]
Completing the square,
\begin{align*}= &  \int_{\mathbb{R}^n} \frac{1}{(2\pi)^{\frac{n}{2}}} \cdot \exp\left(\ip{bv}{t} - \frac{\ip{v}{t}^2}{2} - \frac{1}{2} \cdot\norm{x- (t - \ip{v}{t}v)}_2^2\right) \; dx \\
=&  \exp\left(\ip{bv}{t} - \frac{\ip{v}{t}^2}{2}\right) \\
=& \exp\left(b\ip{v}{t} - \frac{1}{2} \cdot \ip{v}{t}^2\right).
\end{align*}
How can we Taylor expand this in terms of $t$? The Taylor expansion of $\exp(by - \frac{y^2}{2})$ is $\sum_{i=0}^\infty h_i(b) \frac{y^i}{i!}$. That is, the $i$-th derivative in $y$ of $\exp(by - \frac{y^2}{2})$, evaluated at 0, is $h_i(b)$. Using the chain rule with $y=\ip{v}{t}$, the $\alpha$-derivative in $t$ of our expression, evaluated at 0, is $v^\alpha \cdot h_{\abs{\alpha}}(b)$. This is the expression we wanted when $\norm{v} = 1$, and along with the aforementioned remark about homogeneity in $\norm{v}$ this completes the proof.
\end{proof}

Now we can finish the calculation.
To compute ${\mathbb E}_{\text{pl}}[h_{\alpha}(d_1, \dots, d_m) \cdot v^I]$, marginalize $v$ and the $b_u$ and factor the conditionally independent $b_u$ and $d_u$.
\begin{align*}
    {\mathbb E}_{\text{pl}}[h_{\alpha}(d_1, \dots, d_m) v^I] &= {\mathbb E}_{v, b_u} v^I \prod_{u=1}^m {\mathbb E}_{d}\left[h_{\alpha_u}(d_u) \mid v, b_u\right]\\
    &= {\mathbb E}_{v, b_u} v^I \cdot \prod_{u=1}^m \frac{v^{\alpha_u}}{\norm{v}^{\abs{\alpha_u}}} \cdot h_{\abs{\alpha_u}}(b_u) && (\text{\cref{lem:fixed-moments})}\\
    &= \left({\mathbb E}_{v} \frac{v^{I + \sum_{u=1}^m \alpha_u}}{\norm{v}^{\sum_{u=1}^m\abs{\alpha_u}}} \right) \cdot \left( \prod_{u=1}^m {\mathbb E}_{b_u}h_{\abs{\alpha_u}}(b_u)\right)
\end{align*}
The Hermite polynomial expectations will be zero in expectation over $b_u$ if the degree is odd, and otherwise $b_u$ is raised to an even power and can be replaced by 1. This requires that $\abs{\alpha_u}$ is even for all $u$. The norm $\norm{v}$ is constantly $1$ and can be dropped. The numerator will be $\frac{1}{n^{\abs{I}/2 + \abs{\alpha}/2}}$ if the parity of every $\abs{(\alpha^\intercal)_i}$ matches $I_i$, and 0 otherwise. This completes the pseudocalibration calculation.
\end{proof}

We can now write ${\mathcal M}$ in terms of graph matrices.

\begin{definition}\label{def:calL_valid_shapes}
	Let $\mathcal{L}$ be the set of all proper shapes $\alpha$ with the following properties
	\begin{itemize}
		\item $U_{\alpha}$ and $V_{\alpha}$ only contain square vertices and $|U_{\alpha}|, |V_{\alpha}| \le n^{\delta}$
		\item $W_\alpha$ has no degree $0$ vertices
		\item $\deg(\square{i}) + U_\alpha(\square{i}) + V_\alpha(\square{i})$ is even for all $\square{i} \in V(\alpha)$
		\item $\deg(\circle{u})$ is even and $\deg(\circle{u}) \ge 4$ for all $\circle{u} \in V(\alpha)$
		\item $|E(\alpha)| \le n^\tau$
	\end{itemize}
\end{definition}

\begin{remark}
  Note that the shapes in $\mathcal{L}$ can have isolated vertices in $U_{\alpha} \cap V_{\alpha}$.
\end{remark}

\begin{remark}\label{rmk:circle_deg_bound}
	${\mathcal L}$ captures all the shapes that have nonzero coefficient when we write ${\mathcal M}$ in terms of graph matrices. The constraint $\deg(\circle{u}) \ge 4$ arises because pseudocalibration gives us that $\deg(\circle{u})$ is even, $\circle{u}$ cannot be isolated, and $h_2(1) = 0$.
\end{remark}


For a shape $\alpha$, we define
\[\alpha! := \prod_{e \in E(\alpha)} l(e)!\] Note that this equals the factorial of the corresponding index of the Hermite polynomial for this shape.

\begin{definition}
	For any shape $\alpha$, if $\alpha \in \mathcal{L}$, define \[\lambda_{\alpha} :=
	\left( \prod_{\circle{u}\in V(\alpha)} h_{\deg(\circle{u})}(1)\right)
	\cdot \frac{1}{ n^{(\abs{U_\alpha} + \abs{V_\alpha} + \abs{E(\alpha)})/2}}
	\cdot \frac{1}{\alpha!}  \]
	Otherwise, define $\lambda_{\alpha} := 0$.
\end{definition}

\begin{corollary} Modulo the footnote\footnote{Technically, the graph matrices $M_\alpha$ have rows and columns indexed by all subsets of ${\mathcal C}_m \cup {\mathcal S}_n$. The submatrix with rows and columns from $\binom{{\mathcal S}_n}{\leq D/2}$ equals the moment matrix for $\tilde{\EE}$.}, ${\mathcal M} = \displaystyle\sum_{\text{shapes }\alpha} \lambda_{\alpha} M_{\alpha}$.
\end{corollary}

\subsection{Boolean setting pseudocalibration}\label{subsec:calib:bool}

We now present the pseudocalibration for the boolean setting. For the
sequel, we need notation for vectors on a slice of the boolean
cube.

\begin{definition}[Slice]
  Let $v \in \set{\pm 1}^n$ and $\theta \in \mathbb{Z}$. The slice $\slice_{v}(\theta)$ is defined as
  $$
  \slice_{v}(\theta) \coloneqq \set{d \in \set{\pm 1}^n ~\vert~ \ip{v}{d} = \theta}.
  $$
  We use $\slice_{v}(\pm \theta)$ to denote $\slice_{v}(\theta) \cup \slice_{v}(-\theta)$ and
  $\slice(\theta)$ to denote $\slice_{v}(\theta)$ when $v$ is the all-ones vector.
\end{definition}


\begin{remark}
  With our notation for the slice, the planted distribution in the boolean setting can be equivalently described as
  \begin{enumerate}
    \item Sample $v \in \set{\frac{\pm 1}{\sqrt{n}}}^n$ uniformly, and then
    \item Sample $d_1,\dots,d_m$ independently and uniformly from $\slice_{\sqrt{n} \cdot v}(\pm\sqrt{n})$.
  \end{enumerate}
\end{remark}
The planted distribution doesn't actually exist for every $n$, but this is immaterial, as we can still define the pseudoexpectation via the same formula.

We will also need the expectation of monomials over the slice
$\slice(\sqrt{n})$ since they will appear in the description of the
pseudocalibrated Fourier coefficients.

\begin{definition}
   $
   e(k) \coloneqq {\mathbb E}_{x \in_{\text{R}} \slice(\sqrt{n})}\left[x_1\cdots x_k\right].
   $
\end{definition}

We now compute the Fourier coefficients of $\tilde{\EE} v^{\beta}$, where
$\beta \in \mathbb{F}_2^n$. The Fourier basis when $d_1, \dots, d_m \in_{\text{R}} \{\pm 1\}^n$ is the set of parity functions. Thus a character can be specified by $\alpha \in ({\mathbb F}_2^n)^m$, where $\alpha$
is composed of $m$ vectors
$\alpha_1,\dots,\alpha_m \in \mathbb{F}_2^n$.  More precisely, the
character $\chi_{\alpha}$ associated to $\alpha$ is defined as
\[ \chi_\alpha(d_1,\dots,d_m) := \prod_{u=1}^m d_u^{\alpha_u}\]
We denote by $\abs{\alpha}$ the number of non-zero entries of $\alpha$
and define $\abs{\alpha_u}$ similarly. Thinking of $\alpha$ as an $m\times n$ matrix with entries in ${\mathbb F}_2$, we also define $\alpha^\intercal \in ({\mathbb F}_2^m)^n$.

\begin{lemma}\label{lem:boolean-pseudocalibration}
We have
  $$
  \tilde{\EE} v^{\beta} = \frac{1}{n^{\abs{\beta}/2}}\sum_{\substack{\alpha \colon \abs{\alpha} \le n^\tau, \\ \abs{\alpha_u} \text{ even},\\ \abs{\alpha^\intercal_i} \equiv \beta_i \pmod{2} }} \prod_{u=1}^m e(\abs{\alpha_u}) \cdot \chi_{\alpha_u}(d_u).
  $$
\end{lemma}
The set of nonzero coefficients has a similar structure as in the Gaussian case: the rows of $\alpha$ must have an even number of entries, and the $i$-th column must have parity matching $\beta_i$.

\begin{proof}
   Given $\alpha \in ({\mathbb F}_2^n)^m$ with $\abs{\alpha} \le n^\tau$, the pseudocalibration equation enforces by construction that
   $$
   {\mathbb E}_{d_1,\dots,d_m \in \set{\pm 1}^n} (\tilde{\EE} v^{\beta})(d_1,\dots,d_m) \cdot \chi_{\alpha}(d_1,\dots,d_m) = {\mathbb E}_{\text{pl}} v^{\beta} \cdot \chi_{\alpha}(d_1,\dots,d_m).
   $$

  Computing the RHS above
  yields
{\footnotesize
  \begin{align*}
    {\mathbb E}_{v \in \set{\pm 1}^n}  {\mathbb E}_{d_1,\dots,d_m \in_{\text{R}} \slice_v(\pm \sqrt{n})}\left[ v^{\beta} \prod_{u=1}^m \chi_{\alpha_u}(d_u) \right] &= {\mathbb E}_{v \in \set{\pm 1}^n}  {\mathbb E}_{d_1,\dots,d_m \in_{\text{R}} \slice(\pm \sqrt{n})} \left[ v^{\beta} \prod_{u=1}^m \chi_{\alpha_u}(v) \chi_{\alpha_u}(d_u) \right] \\
      & = {\mathbb E}_{v \in \set{\pm 1}^n} \chi_{\alpha_1+\cdots +\alpha_m + \beta}(v) {\mathbb E}_{d_1,\dots,d_m \in \slice(\pm \sqrt{n})}\left[ \prod_{i=1}^m \chi_{\alpha_i}(d_i) \right] \\
      & = \mathbf 1_{\left[\alpha_1+\cdots +\alpha_m=\beta\right]} \cdot \prod_{i=1}^m {\mathbb E}_{d_i \in \slice(\pm \sqrt{n})} \left[ \chi_{\alpha_i}(d_i) \right]\\
      & = \mathbf 1_{\left[\alpha_1+\cdots +\alpha_m=\beta\right]} \cdot \prod_{i=1}^m \mathbf 1_{\left[\abs{\alpha_i} \equiv 0 \pmod{2} \right]} \cdot \prod_{i=1}^m e(\abs{\alpha_i}).
\end{align*}
}%
  Since we have a general expression for the Fourier coefficient of each character,
  applying Fourier inversion  concludes the proof.
\end{proof}

We can now express the moment matrix in terms of graph matrices.

\begin{definition}
    Let ${\mathcal L}_{bool}$ be the set of shapes in ${\mathcal L}$ from~\cref{def:calL_valid_shapes} in which the edge labels are all 1.
\end{definition}

\begin{remark}
	${\mathcal L}_{bool}$ captures all the shapes that have nonzero coefficient when we write ${\mathcal M}$ in terms of graph matrices. Similar to~\cref{rmk:circle_deg_bound}, since $e(2) = 0$ (see~\cref{claim:e2}), we have the same condition $\deg(\circle{u}) \ge 4$ for shapes in ${\mathcal L}_{bool}$.
\end{remark}

\begin{definition}
For all shapes $\alpha$, if $\alpha \in {\mathcal L}_{bool}$ define
    \[\lambda_\alpha :=   \frac{1}{n^{(\abs{U_\alpha} + \abs{V_\alpha})/2}}\prod_{\circle{u} \in V(\alpha)} e(\deg(\circle{u}))\]
Otherwise, let $\lambda_\alpha := 0$.
\end{definition}

\begin{corollary}${\mathcal M} = \displaystyle\sum_{\text{shapes }\alpha} \lambda_\alpha M_\alpha$
\end{corollary}

\subsubsection{Unifying the analysis}


It turns out that the analysis of the boolean setting mostly
follows from the analysis in the Gaussian setting.
Initially, the boolean pseudocalibration is essentially equal to
the Gaussian pseudocalibration in which we have removed all shapes
containing at least one edge with a label $k \ge 2$. The coefficients
on the graph matrices will actually be slightly different, but they
both admit an upper bound that is sufficient for our purposes
(see~\cref{prop:coefficient-bound} for the precise statement).

To unify the notation in our analysis, we conveniently set the edge
functions of the graphs in the boolean case to be
\[h_k(x) = \left\{\begin{array}{lr}
    1 & \text{if }k = 0\\
    x & \text{if }k = 1\\
    0 & \text{if }k \geq 2
\end{array}
\right. \]
This choice of $h_k(x)$ preserves the fact that
$\{h_0(x)=1,h_1(x)=x\}$ is an orthogonal polynomial basis in
the boolean setting, while zeroing out graphs with larger labels.


During the course of the analysis, we may multiply two graph matrices
and produce graph matrices with improper parallel edges (so-called
``intersection terms''). For a fixed pair ${u,i}$ of vertices, parallel
edges between $u$ and $i$ with labels $l_1,\dots,l_s$ correspond to
the product of orthogonal polynomials $\prod_{j=1}^s
h_{l_j}(d_{u,i}) \eqqcolon q(d_{u,i})$. We will re-express this product
as a linear combination of polynomials in the orthogonal family, i.e.,\xspace
$q(d_{u,i}) = \sum_{k=0}^{\textup{deg}(q)} \lambda_k \cdot
h_k(d_{u,i})$ for some coefficients $\lambda_k \in \mathbb{R}$. For
the boolean case, the polynomial $q(d_{u,i})$ will be either
$h_0(d_{u,i})=1$ or $h_1(d_{u,i})=d_{u,i}$. However, for the Gaussian
setting there may be up to $\textup{deg}(q)$ non-zero, potentially larger coefficients
$\lambda_k$ for the corresponding Hermite polynomials $h_k$.
For the graphs that arise in this way, we will always bound their
contributions to ${\mathcal M}$ by applying the triangle inequality and norm
bounds. Since we show bounds using the larger coefficients $\lambda_k$ from the Gaussian case,
the same bounds apply when using the 0/1 coefficients in the boolean case.

We will consider separate cases at any point where the analysis differs between the two settings.
\subsection{Finishing the proof}\label{sec:finishing-psdness}

The final step of the proof is to argue that, after the spider killing
process is completed, the newly created non-spider terms in ${\mathcal M}^+$ also
have small norm. Towards this, we would like to prove a statement similar
to~\cref{cor:non_spider_killing}. In that proof, we used special
structural properties of the non-spiders in ${\mathcal L}$ to
prove that non-spiders in the pseudocalibration were negligible.
But now, the non-spiders in ${\mathcal M}^+$ need not have the properties of
${\mathcal L}$ -- for instance, there could be circle vertices of degree $2$ or
isolated vertices. To handle the potentially larger norms, we will use
that the coefficients of these new non-spider terms $\beta$ come with the
coefficients $\lambda_\alpha$ of the spider terms $\alpha$ in whose web
they lie. Since $\alpha$ has more vertices/edges than $\beta$, the power of $\frac{1}{n}$ in $\lambda_\alpha$ is larger than the ``expected pseudocalibration'' coefficient of $\eta^{\abs{U_\beta} + \abs{V_\beta}} \cdot \frac{1}{n^{\abs{E(\beta)}/2}}$.
We prove that these extra factors of $\frac{1}{n}$ are enough to overpower isolated
vertices or a smaller vertex separator using a careful
charging argument.

\begin{lemma}\label{lem:advanced-charging}
	If $\beta$ is a nontrivial non-spider and $\beta \in W(\alpha)$ for some spider $\alpha \in {\mathcal L}$, then
	\[\eta^{\abs{U_\alpha} + \abs{V_\alpha}} \cdot \frac{1}{n^{\abs{E(\alpha)}/2}}\cdot n^{\frac{w(V(\beta)) - w(S_{\min}) + w(W_{iso})}{2}} \leq \eta^{\abs{U_\beta} + \abs{V_\beta}}\cdot \frac{1}{n^{\Omega(\varepsilon\abs{E(\alpha)})}}\]
	where $S_{\min}$ and $W_{iso}$ are the minimum vertex separator of $\beta$ and the set of isolated vertices of $V(\beta) \setminus (U_\beta \cup V_\beta)$ respectively.
\end{lemma}

\begin{proof}
	We start by giving the idea of the proof. Suppose we try to use the same
	distribution scheme as in the proof of \cref{lem:charging}. It doesn't work
	for two reasons. Firstly, the circle vertices in $\beta$ still have even
	degree, which follows from \cref{rmk:parity}, but now, they could have
	degrees $0$ or $2$. For the previous distribution scheme to go through, we
	needed them to have degree at least $4$ which gave the necessary edge decay
	to handle the norm bounds. Secondly, the square vertices can now have degree
	$0$ hence getting no decay from the edges.

	The first issue is relatively easy to handle. Since $\beta$ was obtained by
	collapsing $\alpha$, the circle vertices of degrees $0$ or $2$ in $\beta$
	must have had degree at least $4$ in $\alpha$ to begin with. Hence, we can
	fix a particular sequence of collapses from $\alpha$ to $\beta$ and then
	assume for the sake of analysis that the removed edges are still present. In
	this case, the same charging argument as in \cref{lem:charging} would go
	through. This is made formal by looking at the sequence of improper
	collapses of this chain of collapses.

	To handle the second issue, let's analyze more carefully how degree $0$
	square vertices appear. Fix a sequence of collapses from $\alpha$ to $\beta$
	and consider a specific step where $\gamma$ collapsed to $\gamma'$ and a
	square vertex of degree $0$ was formed. Let the two square vertices that
	collapsed in $\gamma$ be $\square{i}, \square{j}$ and let the square vertex
	of degree $0$ that formed in $\gamma'$ be $\square{k}$. In light of
	\cref{rmk:parity}, since $\square{k}$ has degree $0$, it must not be in
	$(U_{\gamma'} \cup V_{\gamma'})\setminus (U_{\gamma'} \cap V_{\gamma'})$ and
	hence, $U_{\gamma'}(\square{k}) = V_{\gamma'}(\square{k}) = 0$ or
	$U_{\gamma'}(\square{k}) = V_{\gamma'}(\square{k}) = 1$. But in the latter
	case, this vertex does not contribute to norm bounds since it's in
	$U_{\gamma'} \cap V_{\gamma'}$ so it can be safely disregarded. Note that
	it doesn't have to stay in this set since future collapses might collapse
	this vertex, but this is not a problem as we can charge for this collapse
	if it happens.

	So, assume we have $U_{\gamma'}(\square{k}) = V_{\gamma'}(\square{k}) = 0$.
	But by the definition of collapse, at least one of $\square{i}$ or
	$\square{j}$ must have been in $U_\gamma \setminus (U_\gamma \cap V_\gamma)$
	or $V_\gamma \setminus (U_\gamma \cap V_\gamma)$. Also from the definition
	of collapse, we have $U_{\gamma'}(\square{k}) = U_{\gamma}(\square{i}) +
	U_{\gamma}(\square{j}) \pmod 2$ and $V_{\gamma'}(\square{k}) =
	V_{\gamma}(\square{i}) + V_{\gamma}(\square{j}) \pmod 2$. Putting these
	together, we immediately get that the only way this could have happened is
	if either $\square{i}, \square{j} \in U_\gamma \setminus (U_\gamma \cap
	V_\gamma)$ or if $\square{i}, \square{j} \in V_\gamma \setminus (U_\gamma
	\cap V_{\gamma})$.

	When such a collapse happens, observe that $|U_{\gamma}| + |V_{\gamma}| \ge
	|U_{\gamma'}| + |V_{\gamma'}| + 2$. This is precisely where the decay from
	our normalization factor $\eta = \frac{1}{\sqrt{n}}$ kicks in. This
	inequality means that an extra decay factor of $\eta^2 = \frac{1}{n}$ is
	available to us when we compare to the ``expected pseudocalibration''
	coefficient of $\beta$. We will use this factor to charge the new square
	vertex of degree $0$.

	We now make these ideas formal.

	Let $Q = U_\beta \cap V_\beta, P = (U_\beta \cup V_\beta) \setminus Q$ and
	let $P'$ be the set of degree $1$ square vertices in $\beta$ that are not in
	$S_{min}$. Let $s_0$ be the number of degree $0$ square vertices in
	$V(\beta)\setminus Q$. All the square vertices outside $P' \cup Q \cup
	S_{min}$ have degree at least $2$, let there be $s_{\ge 2}$ of them.

	Because of parity constraints, \cref{rmk:parity}, and because there are no
	circle vertices in $U_{\beta} \cup V_{\beta}$, all circle vertices have even
	degree in $\beta$. Let $c_0$ be the number of degree $0$ circle vertices in
	$\beta$. Let $c_2, c_{\ge 4}$ be the number of degree $2$ circle vertices and
	the number of circle vertices of degree at least $4$ in $V(\beta) \setminus
	S_{min}$ respectively. Then, we have \[n^{\frac{w(V(\beta)) - w(S_{\min}) +
	w(W_{iso})}{2}} \le n^{\frac{|P'| + s_{\ge 2} + (1.5 - \varepsilon)(c_2 + c_{\ge
	4})}{2}} \cdot n^{s_0 + (1.5 - \varepsilon)c_0}\]

	Using $\eta = \frac{1}{\sqrt{n}}$, it suffices to show
	\begin{align*}
        \abs{E(\alpha)} + &(|U_\alpha| + |V_\alpha| - |U_\beta| - |V_\beta|)\\
        &\ge |P'| + s_{\ge 2} + (1.5 - \varepsilon)(c_2 + c_{\ge 4}) + 2s_0 + 2(1.5 - \varepsilon)c_0 + \Omega(\varepsilon\abs{E(\alpha)})
    \end{align*}

	There can be many ways to collapse $\alpha$ to $\beta$, fix any one. We first use a charging argument for the degree $0$ square vertices.
	\begin{lemma}\label{lem:phantom_vertex}
		$|U_\alpha| + |V_\alpha| - |U_\beta| - |V_\beta| \ge 2s_0$
	\end{lemma}
	\begin{proof}
		In the collapse process, in each step, a vertex $\square{i} \in U_\gamma \setminus (U_\gamma \cap V_\gamma)$ or $\square{i} \in V_\gamma \setminus (U_\gamma \cap V_\gamma)$ of degree $1$ in an intermediate shape $\gamma$ collapses with another square vertex $\square{k}$. We have that $|U_\gamma| + |V_\gamma|$ decreases precisely when $\square{i}$ collapses with $\square{k} \in U_\gamma$ (resp. $\square{k} \in V_\gamma$). In either case, the quantity decreases by exactly $2$ which we allocate to this new merged vertex. Each degree $0$ square vertex in $V(\beta) \setminus Q$ must have arisen from a collapse, and hence must have had at least an additive quantity of $2$ allocated to it. This proves that $|U_\alpha| + |V_\alpha| - |U_\beta| - |V_\beta| \ge 2s_0$.
	\end{proof}

	We will now prove a structural lemma.
	\begin{lemma}\label{lem:structure}
		Any vertex $\circle{u}$ that has degree at least $2$ in $V(\beta) \setminus S_{min}$ is adjacent to at most $1$ vertex of $P'$.
	\end{lemma}

	\begin{proof}
		Observe that $\circle{u}$ cannot be adjacent to $3$ vertices in $P'$ because otherwise, at least $2$ of them would be in $U_\beta \setminus Q$ or in $V_\beta \setminus Q$ which means $\beta$ would be a spider which is a contradiction. If $\circle{u}$ is adjacent to $2$ vertices in $P'$, then one of them is in $U_\beta \setminus Q$ and the other is in $V_\beta \setminus Q$ respectively. Since both of these vertices are not in $S_{min}$, it follows that $\circle{u}$ is in $S_{min}$ since there is no path from $U_\beta$ to $V_\beta$ that doesn't pass through $S_{min}$. This is a contradiction. Therefore, $\circle{u}$ is adjacent to at most $1$ vertex in $P'$.
	\end{proof}
	This lemma immediately implies $|P'| \le c_2 + c_{\ge 4}$.

	To account for edges of $\alpha$ that are not in $\beta$, we let
	$\widetilde{\beta}$ be the result of improperly collapsing $\alpha$ to
	$\beta$; note that $\abs{E(\alpha)} = \abs{E(\widetilde{\beta})}$.
	%
	We call the edges that disappeared when properly collapsing ``phantom'' edges.
	Let $\deg_{\widetilde{\beta}}(\square{i})$ (resp.\ $\deg_{\widetilde{\beta}}(\circle{u})$) denote the degree of vertex $\square{i}$ (resp.\ $\circle{u}$) in $\widetilde{\beta}$. Observe that any circle vertex $\circle{u}$ in $V(\beta)$ has $\deg_{\widetilde{\beta}}(\circle{u}) \ge 4$.

	\begin{lemma}\label{lem:phantom_edge}
		$\abs{E(\alpha)} \ge |P'| + s_{\ge 2} + (1.5 - \varepsilon)(c_2 + c_{\ge 4}) + 2(1.5 - \varepsilon)c_0 + \Omega(\varepsilon\abs{E(\alpha)})$
	\end{lemma}

	\begin{proof}
		We will use the following charging scheme. Each edge of $\beta$ incident on $P'$ allocates $1$ to the incident square vertex, which is in $P'$. Every other edge of $\beta$ allocates $\frac{1}{2}$ to the incident square vertex and $\frac{1}{2} - \frac{\varepsilon}{10}$ to the incident circle vertex.	Each phantom edge allocates $1 - \frac{\varepsilon}{10}$ to the incident circle vertex $\circle{u}$. So, a total of $\frac{\varepsilon}{10}(\abs{E(\alpha)} - |P'|)$ has not been allocated.

		All square vertices in $P'$ have been allocated a value of $1$. And observe that all square vertices of degree at least $2$ in $\beta$ have been allocated at least $1$ from the incident edges of $\beta$, for a total value of $s_{\ge 2}$. So, the square vertices get a total allocation of at least $|P'| + s_{\ge 2}$.

		Consider any degree-$0$ circle vertex $\circle{u}$ in $V(\beta)$. It must be incident to at least $4$ phantom edges and hence, must be allocated at least a value of $4(1 - \frac{\varepsilon}{10}) > 2(1.5 - \varepsilon)$. Hence, the degree-$0$ circle vertices in $V(\beta)$ have a total allocation of at least $2(1.5 - \varepsilon)c_0$.

		Suppose the degree of $\circle{u}$ in $V(\beta)$ is $2$. Then, it is incident on at least $2$ phantom edges.
		By \cref{lem:structure}, it is also adjacent to at most one vertex of $P'$ and so, must have been allocated a value of at least $2(1 - \frac{\varepsilon}{10}) + (\deg_{\widetilde{\beta}}(\circle{u}) - 3)(\frac{1}{2} - \frac{\varepsilon}{10})$. This is at least $1.5 - \varepsilon + \frac{\varepsilon}{10}$.

		Suppose the degree of $\circle{u}$ in $V(\beta)$ is at least $4$. By \cref{lem:structure}, it is adjacent to at most one vertex of $P'$. Then it must have been allocated a value of at least $(\deg_{\widetilde{\beta}}(\circle{u}) - 1)(\frac{1}{2} - \frac{\varepsilon}{10})$.  Using $\deg_{\widetilde{\beta}}(\circle{u}) \ge 4$, this is at least $1.5 - \varepsilon + \frac{\varepsilon}{10}$.

		This implies
		\[\abs{E(\alpha)} \ge |P'| + s_{\ge 2} + 2(1.5 - \varepsilon)c_0 + (1.5 - \varepsilon + \frac{\varepsilon}{10})(c_2 + c_{\ge 4}) + \frac{\varepsilon}{10}(\abs{E(\alpha)} - |P'|)\]
		Using $|P'| \le c_2 + c_{\ge 4}$ completes the proof.
	\end{proof}

	Adding \cref{lem:phantom_vertex} and \cref{lem:phantom_edge}, we get the result.
\end{proof}

\begin{corollary}\label{cor:non-spider-norm-bound}
	If $\beta$ is a nontrivial non-spider and $\beta \in W(\alpha)$ for some spider $\alpha \in {\mathcal L}$, then
	\[\eta^{\abs{U_\alpha} + \abs{V_\alpha}} \cdot \frac{1}{n^{\abs{E(\alpha)}/2}}\norm{M_\beta} \leq \eta^{\abs{U_\beta} + \abs{V_\beta}}\cdot \frac{1}{n^{\Omega(\varepsilon\abs{E(\alpha)})}}\]
\end{corollary}

\begin{proof}

	From \cref{lem:gaussian-norm-bounds}, we have
	\[ \norm{M_\beta} \leq 2\cdot\left(\abs{V(\beta)} \cdot (1+\abs{E(\beta)}) \cdot \log(n)\right)^{C\cdot (\abs{V_{rel}(\beta)} + \abs{E(\beta)})} \cdot n^{\frac{w(V(\beta)) - w(S_{\min}) + w(W_{iso})}{2}}\]

	We have $\abs{V(\beta)}\cdot (1+\abs{E(\beta)}) \cdot \log(n) \le n^{O(\tau)}$. Also, $|V_{rel}(\beta)| \le 2(|E(\alpha)| + |E(\beta)|)$ since all the degree $0$ vertices in $V_{rel}(\beta)$ would have had vertices of $V_{rel}(\alpha)$ collapse into it in the chain of collapses and there are no degree $0$ vertices in $V_{rel}(\alpha)$. Finally, since $|E(\alpha)| \ge |E(\beta)|$, the factor
	$2\cdot(\abs{V(\beta)} \cdot (1+\abs{E(\beta)}) \cdot \log(n))^{C\cdot (\abs{V_{rel}(\beta)} + \abs{E(\beta)})}$ can be absorbed into $\frac{1}{n^{\Omega(\varepsilon\abs{E(\alpha)})}}$. The result follows from \cref{lem:advanced-charging}.
\end{proof}

\begin{proposition}\label{prop:m+-diag}
If $\beta$ is a trivial shape, $\lambda_\beta^+ = \lambda_\beta$.
\begin{proof}
    A trivial shape cannot appear in $W(\alpha)$ for any $\alpha$, since every collapse of a spider always keeps its circle vertices around.
\end{proof}
\end{proposition}

\begin{lemma}\label{lem:non-spider-psd}
	For $k,l \in \{0, 1, \dots, D/2\}$, let ${\mathcal B}_{k,l}$ denote the set of nontrivial non-spiders on block $(k, l)$. Then
	\[\displaystyle\sum_{\beta \in {\mathcal B}_{k,l}} \abs{\lambda_\beta^+}\norm{M_\beta} \leq \eta^{k+l} \cdot \frac{1}{n^{\Omega(\varepsilon)}} \]
\end{lemma}
\begin{proof}

\begin{align*}
\displaystyle\sum_{\beta \in {\mathcal B}_{k,l}}\norm{\lambda_\beta^+ M_\beta}
\leq & \sum_{\beta \in {\mathcal B}_{k,l}} \abs{\lambda_\beta} \norm{M_\beta} + \sum_{\beta \in {\mathcal B}_{k,l}} \sum_{\substack{\text{spiders }\alpha:\\ \beta \in W(\alpha)}} \abs{v_\beta} \abs{\lambda_\alpha} \norm{M_\beta}
\end{align*}
To bound the first term, we checked previously in~\cref{cor:non-spider-sum} that the total norm of nontrivial non-spiders appearing in the pseudocalibration (i.e.,\ this term) is $\eta^{k+l}o_n(1)$. For the second term, via~\cref{lem:web-leaves} we have a bound on the accumulations $v_\beta$ of one spider on one non-spider, so the second term is
\[\leq \sum_{\beta \in {\mathcal B}_{k,l}}\displaystyle\sum_{\substack{\text{spiders }\alpha:\\ \beta \in W(\alpha)}}(C_1\abs{V(\alpha)} \cdot \abs{E(\alpha)})^{C_2 \abs{E(\alpha)}} \cdot \abs{\lambda_\alpha} \norm{M_\beta}.\]
Use the bound on the coefficients $\abs{\lambda_\alpha}$,~\cref{prop:coefficient-bound},
\begin{align*}
\leq  \sum_{\beta \in {\mathcal B}_{k,l}}\displaystyle\sum_{\substack{\text{spiders }\alpha:\\ \beta \in W(\alpha)}}(C_1\abs{V(\alpha)} \cdot \abs{E(\alpha)})^{C_2 \abs{E(\alpha)}} \cdot\eta^{\abs{U_\alpha} + \abs{V_\alpha}}\cdot  \frac{\abs{E(\alpha)}^{3\abs{E(\alpha)}}}{n^{\abs{E(\alpha)}/2}} \cdot \norm{M_\beta}
\end{align*}
Invoking the norm bound for non-spiders which are collapses, \cref{cor:non-spider-norm-bound},
\begin{align*}
& \leq  \eta^{k+l} \cdot \sum_{\beta \in {\mathcal B}_{k,l}}\displaystyle\sum_{\substack{\text{spiders }\alpha:\\ \beta \in W(\alpha)}} \left(\frac{C_1\abs{V(\alpha)} \cdot \abs{E(\alpha)}}{n^{\Omega(\varepsilon)}}\right)^{C_2' \abs{E(\alpha)}}\\
& \leq  \eta^{k+l} \cdot \sum_{\beta \in {\mathcal B}_{k,l}}\displaystyle\sum_{\substack{\text{spiders }\alpha:\\ \beta \in W(\alpha)}} \left(\frac{C_1n^\tau \cdot n^\tau}{n^{\Omega(\varepsilon)}}\right)^{C_2' \abs{E(\alpha)}}.
\end{align*}

Bound the sum over all spiders by the sum over all shapes. By~\cref{prop:edge-shape-count}, the number of shapes with $i$ edges is $n^{O(\tau (i+1))}$. Summing by the number of edges, observe that $\abs{E(\alpha)} \geq \max(\abs{E(\beta)}, 2)$ since spiders always have at least $2$ edges.
\begin{align*}
    & \leq \eta^{k+l}\sum_{\beta \in {\mathcal B}_{k,l}}\displaystyle\sum_{i=\max(\abs{E(\beta)}, 2)}^\infty
    n^{O(\tau (i+1))} \cdot \left(\frac{C_1n^\tau \cdot n^{\tau}}{n^{\Omega(\varepsilon)}}\right)^{C_2' i} \\
    &\leq \eta^{k+l}\sum_{\beta \in {\mathcal B}_{k,l}} \frac{1}{n^{\Omega(\varepsilon \max(\abs{E(\beta)}, 2))}}\\
    & \leq \eta^{k+l}\sum_{i=0}^\infty \frac{n^{O(\tau (i+1))}}{n^{\Omega(\varepsilon \max(i, 2))}}\\
    & = \eta^{k+l} \cdot \frac{1}{n^{\Omega(\varepsilon)}} \qedhere
\end{align*}
\end{proof}

\begin{corollary}\label{cor:m+-diag}
For $k \in \{0, \dots, D/2\}$, the $(k,k)$ block of ${\mathcal M}^+$ has minimum singular value at least $\eta^{2k}(1 - \frac{1}{n^{\Omega(\varepsilon)}})$, and for $k, l \in \{0, \dots, D/2\}, l \neq k$, the $(k,l)$ off-diagonal block has norm at most $\eta^{k+l} \cdot \frac{1}{n^{\Omega(\varepsilon)}}$.
\end{corollary}
\begin{proof}
    By~\cref{prop:m+-diag} the identity matrix appears on the $(k,k)$ blocks with coefficient $\eta^{2k}$. By construction, ${\mathcal M}^+$ has no spider shapes. By~\cref{lem:non-spider-psd}, the total norm of the non-spider shapes on the $(k,l)$ block is at most $\eta^{k+l}\cdot \frac{1}{n^{\Omega(\varepsilon)}}$.
\end{proof}

\begin{theorem}
    W.h.p. ${\mathcal M}_{fix} \succeq 0$.
\end{theorem}
\begin{proof}
    For any $x \in \nullspace({\mathcal M}_{fix})$, we of course have $x^\intercal {\mathcal M}_{fix} x = 0$.
    For any $x \perp \nullspace({\mathcal M}_{fix})$ with $\norm{x}_2 = 1$,
    \begin{align*}
        x^\intercal {\mathcal M}_{fix} x & = x^\intercal ({\mathcal M} + {\mathcal E}) x\\
        &= x^\intercal {\mathcal M}^+ x + x^\intercal \left(\displaystyle\sum_{\text{spiders }\alpha} \lambda_\alpha\left( {\mathcal M}_\alpha - \sum_{\text{leaves }\gamma\text{ of }W(\alpha)}v_\gamma M_\gamma \right)\right)x + x^\intercal {\mathcal E} x\\
        &= x^\intercal ({\mathcal M}^+ +  {\mathcal E}) x
    \end{align*}
    where  the last equality follows from \cref{prop:web-sum}.
    Because the norm bound on ${\mathcal E}$ is significantly less than $\eta^{D} = n^{-n^{\delta}}$ (see \cite{sklowerbounds}), the bound on the norm of each block of ${\mathcal M}^+$
    in~\cref{cor:m+-diag} also applies to the blocks of ${\mathcal M}^+ + {\mathcal E}$. Therefore,
    we use~\cref{lem:block-psd} to conclude ${\mathcal M}^+ + {\mathcal E} \succeq 0$ and the above expression is nonnegative.
\end{proof}

\subsection{Combinatorial Proof of Lemma \ref{lem:fixed-moments}}\label{app:combinatorialpseudocalibration}
In this appendix, we give a combinatorial proof of Lemma \ref{lem:fixed-moments}. We recall the statement of Lemma \ref{lem:fixed-moments} here.
\begin{lemma}
Let $\alpha \in {\mathbb{N}}^n$. When $v$ is fixed and $b$ is fixed (not necessarily $+1$ or $-1$) and $d \sim N(0, I)$ conditioned on $\ip{v}{d} = b\norm{v}$,
\[{\mathbb E}_{d}[h_{\alpha}(d)] = \frac{v^\alpha}{\norm{v}^{\abs{\alpha}}} \cdot h_{\abs{\alpha}}(b).\]
\end{lemma}
\begin{proof}
Again, it is sufficient to prove this lemma when $\norm{v} = 1$. For this proof, we need the following description of Hermite polynomials in terms of matchings and Isserlis' Theorem/Wick's Theorem.
\begin{fact}
\[
h_k(x) = \sum_{M: M \text{ is a matching on } [k]}{(-1)^{|M|}x^{k - 2|M|}}
\]
\end{fact}
\begin{theorem}[Isserlis' Theorem/Wick's Theorem]
For any vectors $u_1,\ldots,u_k$,
\[
{\mathbb E}_{x \sim N(0,I)}\left[\prod_{j=1}^{k}{\ip{x}{u_j}}\right] = \sum_{M: M \text{ is a perfect matching on } [k]}\prod_{(i,j) \in M}{\ip{u_i}{u_j}}
\]
\end{theorem}
The idea behind this proof is to break up each coordinate vector $e_i$ into a component which is parallel to $v$ and a component which is perpendicular to $v$.
\begin{definition}
For each coordinate $i$, define $e_{i}^{\perp} = e_i - {v_i}v$
\end{definition}
\begin{proposition}
For any coordinate $i$, $\ip{e_{i}^{\perp}}{e_{i}^{\perp}} = 1-{v_i}^2$. For any pair of distinct coordinates $i$ and $i'$, $\ip{e_{i}^{\perp}}{e_{i'}^{\perp}} = -{v_i}{v_{i'}}$.
\end{proposition}
\begin{proof}
Observe that for all $i$,
\[
\ip{e_{i}^{\perp}}{e_{i}^{\perp}} = \ip{e_{i} - {v_i}v}{e_{i} - v_{i}v} = \ip{e_{i}}{e_{i}}  - 2{v_i}\ip{v}{e_{i}} + {v_i}^2\ip{v}{v} = 1-{v_i}^2
\]
and if $i$ and $i'$ are distinct then
\[
\langle{e_{i}^{\perp},e_{i'}^{\perp}}\rangle = \langle{e_{i} - {v_i}v,e_{i'} - v_{i'}v}\rangle = \langle{e_{i},e_{i'}}\rangle  - {v_i}\langle{v,e_{i'}}\rangle - {v_{i'}}\langle{e_i,v}\rangle + {v_i}{v_{i'}}\langle{v,v}\rangle = -{v_i}{v_{i'}}
\]
\end{proof}
To evaluate ${\mathbb E}_{d}[h_{\alpha}(d)]$, we proceed as follows:
\begin{enumerate}
\item Break up each $d_i = \ip{d}{e_i}$ as $d_i = \ip{bv}{e_{i}} + \ip{d^{\perp}}{e_{i}} = bv_{i} + \ip{d^{\perp}}{e_{i}^{\perp}}$
where $d^{\perp}$ is the component of $d$ which is orthogonal to $v$.
\item Observe that since each $e^{\perp}_{i}$ is orthogonal to $v$, we can replace $d^{\perp}$ by a random vector $d' \sim N(0,I)$.
\item Apply Isserlis' Theorem/Wick's Theorem to evaluate these terms.
\end{enumerate}
For this calculation, it is convenient to think of $\alpha$ as a tuple of $|\alpha|$ elements where each $i \in [n]$ appears $\alpha_i$ times.
\begin{definition}
For each $j \in [\abs{\alpha}]$, we define $\alpha(j)$ to be the index $i$ such that $\sum_{i' = 1}^{i-1}{\alpha_{i'}} < j$ and $\sum_{i'=1}^{i}{\alpha_{i'}} \geq j$. For example, if $\alpha = (2,1,0,3)$ then $\alpha(1) = \alpha(2) = 1$, $\alpha(3) = 2$, and $\alpha(4) = \alpha(5) = \alpha(6) = 4$.
\end{definition}
In the special case when $\alpha(1),\ldots,\alpha(\abs{\alpha})$ are all distinct, 
\[
{\mathbb E}_{d}[h_{\alpha}(d)] = {\mathbb E}_d\Big[\prod_{j=1}^{\abs{\alpha}}{\ip{d}{e_{\alpha(j)}}}\Big] = {\mathbb E}_{d' \sim N(0,I)}\Big[\prod_{j=1}^{\abs{\alpha}}{\left(bv_{\alpha(j)} + \ip{d'}{e_{\alpha(j)}^{\perp}}\right)}\Big]
\]
In this case, we can associate a matching $M$ to each term we get after applying Isserlis' Theorem/Wick's Theorem as follows:
\begin{enumerate}
\item For each $j \in [\abs{\alpha}]$ where we have the $bv_{\alpha(j)}$ term, we take $j$ to be isolated.
\item For each pair of distinct $j,j' \in [\abs{\alpha}]$ such that we have the term $\ip{e_{\alpha(j)}^{\perp}}{e_{\alpha(j')}^{\perp}}$ (which only happens if we start with the $\ip{d'}{e_{\alpha(j)}^{\perp}}$ and $\ip{d'}{e_{\alpha(j')}^{\perp}}$ terms and $e_{\alpha(j)}^{\perp}$ and $e_{\alpha(j')}^{\perp}$ are paired together after applying Isserlis' Theorem/Wick's Theorem), we add an edge between $j$ and $j'$ in $M$.
\end{enumerate}
We now have that
\begin{align*}
{\mathbb E}_{d}[h_{\alpha}(d)] &= \sum_{M:M \text{ is a matching on } [\abs{\alpha}]}{\left(\prod_{(j,j') \in M}{-v_{\alpha(j)}v_{\alpha(j')}}\right)\left(\prod_{j: j \text{ is unmatched by } M}{bv_{\alpha(j)}}\right)}\\
&= \left(\sum_{M:M \text{ is a matching on } [\abs{\alpha}]}{(-1)^{|M|}b^{\abs{\alpha} - 2|M|}}\right)\prod_{j=1}^{\abs{\alpha}}{v_{\alpha(j)}} \\
&= h_{\abs{\alpha}}(b)v^{\alpha} 
\end{align*}
For the general case, we use a similar idea although it is somewhat more complicated. In particular, we associate a multi-colored matching $M = M_{blue} \cup M_{red} \cup M_{purple}$ to each term. The idea is that whenever we have a blue edge, we could have had a red edge instead and vice versa, so we can combine terms with red and blue edges to make purple edges which gives us an ordinary matching as before. More precisely, the idea is as follows.
\begin{enumerate}
\item When we expand out $h_{\alpha}(d)$ in terms of matchings, we take $M_{blue}$ to be the union of these matchings.
\item For each $j \in [\abs{\alpha}]$ where we have the $bv_{\alpha(j)}$ term, we take $j$ to be isolated.
\item For each pair of distinct $j,j' \in [\abs{\alpha}]$ such that we have the term $\ip{e_{\alpha(j)}^{\perp}}{e_{\alpha(j')}^{\perp}}$ (which only happens if we start with the $\ip{d'}{e_{\alpha(j)}^{\perp}}$ and $\ip{d'}{e_{\alpha(j')}^{\perp}}$ terms and $e_{\alpha(j)}^{\perp}$ and $e_{\alpha(j')}^{\perp}$ are paired together after applying Isserlis' Theorem/Wick's Theorem), we add an edge between $j$ and $j'$. If $\alpha(j') = \alpha(j)$ then we take this edge to be red and add it to $M_{red}$. If $\alpha(j') \neq \alpha(j)$ then we take this edge to be purple and add it to $M_{purple}$.
\end{enumerate}
We now implement this idea. We have that
\begin{align*}
&{\mathbb E}_{d}[h_{\alpha}(d)] = \sum_{M_{blue}: M_{blue} \text{ is a matching on } [\abs{\alpha}], \atop \forall (j,j') \in M_{blue}, \alpha(j) = \alpha(j')}{
(-1)^{|M_{blue}|}{\mathbb E}_{d}\Big[\prod_{j \in [\abs{\alpha}]: j \text{ is unmatched by } M_{blue}}{\ip{d}{e_{\alpha(j)}}}\Big]} \\
&=  \sum_{M_{blue}: M_{blue} \text{ is a matching on } [\abs{\alpha}], \atop \forall (j,j') \in M_{blue}, \alpha(j) = \alpha(j')}{
(-1)^{|M_{blue}|}{\mathbb E}_{d' \sim N(0,I)}\Big[\prod_{j \in [\abs{\alpha}]: j \text{ is unmatched by } M_{blue}}{\left(bv_{\alpha(j)} + \ip{d'}{e_{\alpha(j)}^{\perp}}\right)}\Big]}
\end{align*}
Expanding out ${\mathbb E}_{d' \sim N(0,I)}\Big[\prod_{j \in [\abs{\alpha}]: j \text{ is unmatched by } M_{blue}}{\left(bv_{\alpha(j)} + \ip{d'}{e_{\alpha(j)}^{\perp}}\right)}\Big]$ and applying Isserlis' Theorem/Wick's Theorem, we have that
\begin{align*}
{\mathbb E}_{d}[h_{\alpha}(d)] &= \sum_{M_{blue},M_{red},M_{purple}}{(-1)^{|M_{blue}|}\prod_{(j,j') \in M_{red}}{(1-v^{2}_{\alpha(j)})}\prod_{(j,j') \in M_{purple}}{(-v_{\alpha(j)}v_{\alpha(j')})}} \\
&\prod_{j: j \text{ is unmatched by } M = M_{blue} \cup M_{red} \cup M_{purple}}{bv_{\alpha(j)}}
\end{align*}
where the sum is taken over all $M_{blue},M_{red},M_{purple}$ such that
\begin{enumerate}
\item $M = M_{blue} \cup M_{red} \cup M_{purple}$ is a matching on $[\abs{\alpha}]$ and $M_{blue},M_{red},M_{purple}$ are disjoint.
\item $\forall (j,j') \in M_{blue}, \alpha(j) = \alpha(j')$.
\item $\forall (j,j') \in M_{red}, \alpha(j) = \alpha(j')$.
\item $\forall (j,j') \in M_{purple}, \alpha(j) \neq \alpha(j')$.
\end{enumerate}
Since whenever we have a blue edge, we could have instead had a red edge and vice versa, for each distinct $j,j'$ such that $\alpha(j') = \alpha(j)$, we can combine terms which have a blue edge between $j$ and $j'$ with terms which have a red edge between $j$ and $j'$. A blue edge between $j$ and $j'$ has a coefficient of $-1$ and a red edge between $j$ and $j'$ has a coefficient of $1 - v^2_{\alpha(j)}$, so this effectively gives a purple edge with coefficient $-v^2_{\alpha(j)} = -v_{\alpha(j)}v_{\alpha(j')}$. Thus,
\begin{align*}
{\mathbb E}_{d}[h_{\alpha}(d)] &= \sum_{M:M \text{ is a matching on } [\abs{\alpha}]}{\left(\prod_{(j,j') \in M}{-v_{\alpha(j)}v_{\alpha(j')}}\right)\left(\prod_{j: j \text{ is unmatched by } M}{bv_{\alpha(j)}}\right)}\\
&= h_{\abs{\alpha}}(b)v^{\alpha} 
\end{align*}
\end{proof}

\subsection{Killing a single spider}
\label{sec:single-spider}

We saw in the Proof Strategy section that the shape $2\beta_1 + \frac{1}{n}\beta_2$ lies in the nullspace of a moment matrix which
satisfies the constraints ``$\ip{v}{d_u}^2 = 1$''. The shape $\beta_1$ is
exactly the kind of substructure that appears in a spider! Therefore it
is natural to hope that if $\alpha$ is a left spider, then
${\mathcal M}_{fix}M_{\alpha} = 0$. This
doesn't quite hold because $\ip{v}{d_u}^2$ is ``missing''
some terms: in realizations of $\alpha$, the end vertices are required to be
distinct from the other squares in $\alpha$, which prevents terms
for all pairs $i,j$ from appearing in the product
${\mathcal M}_{fix}M_\alpha$. There are smaller ``intersection terms''
(which we call
collapses of $\alpha$) that we can add so that the end vertices are permitted to take
on all pairs $i, j$. After adding in these terms, we will produce a matrix $L$ with ${\mathcal M}_{fix}L =
0$.

We first define what it means to collapse a shape into another shape
by merging two vertices. Here, we only define it for merging two
square vertices, since these are the only kind of merges that will
happen in our analysis of intersection terms.

\begin{definition}[Improper collapse]
    Let $\alpha$ be a shape and let $\square{i}, \square{j}$ be two distinct square vertices in $V(\alpha)$. We define the improper collapse of $\square{i}, \square{j}$ by:
    \begin{itemize}
        \item Remove \square{i}, \square{j} from $V(\alpha)$ and replace them by a single new vertex \square{k}.
        \item Replace each edge $\{\square{i}, \circle{u}\}$ and $\{\square{j}, \circle{u}\}$, if present, by $\{\square{k}, \circle{u}\}$, keeping the same labels (note that there may be multiedges and so the new shape may not be proper).
        \item Set $U(\square{k}) = U(\square{i}) + U(\square{j}) \pmod 2$ and $V(\square{k}) = V(\square{i}) + V(\square{j}) \pmod 2$.
    \end{itemize}
\end{definition}

Improper collapses have parallel edges, but we can convert them back to a sum
of proper shapes.
This is done by, for each set of parallel edges, expanding the product of Fourier characters in the Fourier basis. For example, two parallel edges with label 1 should be expanded as
\[h_1(z)^2 = (z^2-1) + 1 = h_2(z) + h_0(z)\]
\begin{definition}[Collapsing a shape]
    Let $\alpha$ be a shape with two distinct square vertices $\square{i}, \square{j}$. We say that $\beta$ is a (proper) collapse of $\square{i}, \square{j}$ if $\beta$ appears in the expansion of the improper collapse of $\square{i},\square{j}$.
\end{definition}

\begin{remark}
    If $l_1, \dots, l_k$ are the labels of a set of parallel edges, then the product $h_{l_1}(z) \cdots h_{l_k}(z)$ is even/odd depending on the parity of $l_1 + \cdots + l_k$. Thus the nonzero Fourier coefficients will be the terms of matching parity. Therefore, in both the boolean and Gaussian cases, the shapes that are proper collapses of a given improper collapse are formed by replacing each set of parallel edges by a single edge $e$ such that $l(e) \le l_1 + \ldots + l_k$ and $l(e)~\equiv~l_1 + \cdots + l_k\pmod 2$.
\end{remark}

\begin{remark}\label{rmk:parity}
	Looking at the definition and in light of the previous remark, we have the following.
	\begin{enumerate}
		\item The number of circle vertices does not change by collapsing a shape but the number of square vertices decreases by $1$.
		\item $\alpha \in {\mathcal L}$ has the property that the vertices have odd degree if and only if they are in $(U_{\alpha} \cup V_{\alpha}) \setminus (U_{\alpha} \cap V_{\alpha})$. When $\alpha$ collapses, this property is preserved.
	\end{enumerate}
\end{remark}

We now define the desired shapes $L_k$ which lie in the null space of ${\mathcal M}_{fix}$.

\begin{definition}
For $k \geq 2$ define the shape $\ell_k$ on $\{\square{1}, \dots, \square{k}, \circle{1} \}$ with two edges $\{\{\square{1}, \circle{1}\}$, $\{\square{2}, \circle{1}\}\}$. The left side of $\ell_k$ consists of $U_{\ell_k} = \{\square{1},\dots,\square{k}\}$. The right side consists of $V_{\ell_k} =\{\square{3}, \dots, \square{k}, \circle{1}\}$.
\end{definition}

\begin{definition}\label{def:lk}
Define the ``completed'' version $L_k$ of $\ell_k$ to be the matrix which is the sum of $c_\beta M_{\beta}$ for $\beta$ being the following shapes with coefficients:
\begin{itemize}
    \item ($L_{k,1}$): $\ell_k$, with coefficient 2.
    \item ($L_{k,2}$): If $k \geq 3$, collapse $\square{1}$ and $\square{3}$ in $\ell_k$ with coefficient $\frac{2}{n}$
    \item ($L_{k,3}$): If $k \geq 4$, collapse $\square{1}$ and $\square{3}$, and collapse $\square{2}$ and $\square{4}$ in $\ell_k$ with coefficient $\frac{2}{n^2}$
    \item ($L_{k,4}$): Collapse $\square{1}$ and $\square{2}$, replacing the edges by an edge with label 2, with coefficient $\frac{1}{n}$
    \item ($L_{k,5}$): If $k \geq 3$, collapse $\square{1}, \square{2}$, and $\square{3}$, replacing the edges by an edge with label 2, with coefficient $\frac{1}{n}$.
\end{itemize}
\end{definition}

For a pictorial representation of the ribbons/shapes, see~\cref{fig:Lk} below.

\begin{lemma}\label{lem:completed-left-side}
    ${\mathcal M}_{fix} L_k = 0$
\end{lemma}
\begin{proof}
    These shapes are constructed so that if we fix a partial realization
    of the vertices $\circle{1}$ and $\square{3}, \dots, \square{k}$ as $\circle{u} \in {\mathcal C}_m$ and $S \in \binom{{\mathcal S}_n}{k-2}$, the squares $\square{1}$ and $\square{2}$ can still be realized as any $j_1,j_2 \in [n]$. That is, exactly the following equality holds,
    \begin{align*}
        ({\mathcal M}_{fix} L_k)_I &= \displaystyle\sum_{\substack{\circle{u} \in {\mathcal C}_m,\\ S \in \binom{{\mathcal S}_n}{k-2}} }\left(\sum_{\substack{j_1, j_2 \in [n]:\\ j_1 \neq j_2}} \tilde{\EE}[v^I v^S v_{j_1}v_{j_2}] d_{uj_1}d_{uj_2} + \sum_{j_1 \in [n]} \tilde{\EE}[v^Iv^Sv_{j_1}^2](d_{uj_1}^2 - 1)\right)\\
        &= \displaystyle\sum_{\substack{\circle{u} \in {\mathcal C}_m,\\ S \in \binom{{\mathcal S}_n}{k-2}}} \tilde{\EE}[v^Iv^S(\ip{v}{d_u}^2 - 1)]\\
        &= 0
    \end{align*}

    To demonstrate how the coefficients arise, we analyze the ribbons $R$ which $L_k$ is composed of and see how they contribute to the output.
    For pictures of the ribbons/shapes, see~\cref{fig:Lk} below.
    Let the ribbon be partially realized as $\circle{u}$ and $S = \{\square{j_3},\dots, \square{j_k}\}$. Let $({\mathcal M}_{fix}L_k)_{I(u, S)}$ denote the terms in $({\mathcal M}_{fix}L_k)_I$ with this partial realization. In this notation we want to show
    \[({\mathcal M}_{fix}L_k)_{I(u, S)} = \sum_{\substack{j_1, j_2 \in [n]:\\ j_1 \neq j_2}} \tilde{\EE}[v^I v^S v_{j_1}v_{j_2}] d_{uj_1}d_{uj_2} + \sum_{j_1 \in [n]} \tilde{\EE}[v^Iv^Sv_{j_1}^2](d_{uj_1}^2 - 1).\]

    \begin{figure}[!ht]
        \centering
        \includegraphics[height=10cm]{sherrington_kirkpatrick/LkFullWithNameU}
        \caption{The five shapes that make up $L_4$.}
        \label{fig:Lk}
    \end{figure}

    \begin{enumerate}
        \item If we take a ribbon $R$ with $A_R = \{\square{j_1}, \dots, \square{j_k}\}$, $B_R = \{\square{j_3}, \dots, \square{j_k}\} \cup \{\circle{u}\}$ and $E(R) = \{\{\square{j_1}, \circle{u}\}, \{\square{j_2}, \circle{u}\}\}$ where $j_1 \neq j_2$ and $j_1, j_2 \notin S$ then
        \[ ({\mathcal M}_{fix}M_R)_{I(u, S)} = \tilde{\EE}[v^Iv^Sv_{j_1}v_{j_2}]d_{uj_1}d_{uj_2}.\]
        This ribbon must ``cover'' both ordered pairs $(j_1, j_2)$ and $(j_2, j_1)$, so we want each such ribbon $R$ to appear with a coefficient of 2 in $L_k$.
        \item If we take a ribbon $R$ with $A_R = \{\square{j_1}, \dots, \square{j_k}\} \setminus \{\square{j_1}, \square{j_3}\}$, $B_R = \{\square{j_3}, \dots, \square{j_k}\} \cup \{\circle{u}\}$ and $E(R) = \{\{\square{j_3}, \circle{u}\}, \{\square{j_2}, \circle{u}\}\}$ where $j_1 = j_3 \in S$ then
        \[ ({\mathcal M}_{fix}M_R)_{I(u, S)} = \tilde{\EE}[v^Iv^{S\setminus \{j_3\}}v_{j_2}]d_{uj_3}d_{uj_2} = n\tilde{\EE}[v^Iv^Sv_{j_1}v_{j_2}]d_{uj_1}d_{uj_2}.\]
        Taking a coefficient of $\frac{2}{n}$ in $L_k$ covers the two pairs $(j_1, j_2)$ and $(j_2, j_1)$ for this case of overlap with $S$.
        \item If we take a ribbon $R$ with $A_R = \{\square{j_1}, \dots, \square{j_k}\} \setminus \{\square{j_1}, \square{j_2}, \square{j_3}, \square{j_4}\}$, $B_R = \{\square{j_3}, \dots, \square{j_k}\} \cup \{\circle{u}\}$ and $E(R) = \{\{\square{j_3}, \circle{u}\}, \{\square{j_4}, \circle{u}\}\}$ where $j_1 = j_3 \in S$ and $j_2 = j_4 \in S$ then
        \[ ({\mathcal M}_{fix}M_R)_{I(u, S)} = \tilde{\EE}[v^Iv^{S\setminus \{j_3, j_4\}}]d_{uj_3}d_{uj_4} = n^2\tilde{\EE}[v^Iv^Sv_{j_1}v_{j_2}]d_{uj_1}d_{uj_2}.\]
        Taking a coefficient of $\frac{2}{n^2}$ in $L_k$ covers the two pairs $(j_1, j_2)$ and $(j_2, j_1)$ for this case of overlap with $S$.
        \item If we take a ribbon $R$ with $A_R = \{\square{j_1}, \dots, \square{j_k}\}\setminus \{\square{j_1},\square{j_2}\}$, $B_R = \{\square{j_3}, \dots, \square{j_k}\} \cup \{\circle{u}\}$ and $E(R) = \{\{\square{j_1}, \circle{u}\}_2\}$ where $j_1 = j_2 \notin S$ then
        \[ ({\mathcal M}_{fix}M_R)_{I(u, S)} = \tilde{\EE}[v^Iv^{S}](d_{uj_1}^2-1) = n\tilde{\EE}[v^Iv^Sv_{j_1}^2](d_{uj_1}^2-1).\]
        Taking a coefficient of $\frac{1}{n}$ in $L_k$ covers these terms.
        \item If we take a ribbon $R$ with $A_R = \{\square{j_1}, \dots, \square{j_k}\} \setminus \{\square{j_1}, \square{j_2}\}$, $B_R = \{\square{j_3}, \dots, \square{j_k}\} \cup \{\circle{u}\}$ and $E(R) = \{\{\square{j_3}, \circle{u}\}_2\}$ where $j_1 = j_2 =j_3\in S$ then
        \[ ({\mathcal M}_{fix}M_R)_{I(u, S)} = \tilde{\EE}[v^Iv^{S}](d_{uj_3}^2-1) = n\tilde{\EE}[v^Iv^Sv_{j_1}^2](d_{uj_1}^2-1).\]
        Taking a coefficient of $\frac{1}{n}$ in $L_k$ covers these terms.
    \end{enumerate}
\end{proof}


One of the key facts about graph matrices is that multiplication of graph matrices approximately equals a new graph matrix, $M_\alpha \cdot M_\beta \approx M_{\gamma}$, where $\gamma$ is the result of gluing $V_\alpha$ with $U_\beta$ (and if $V_\alpha, U_\beta$ do not have the same number of vertices of each type, the product is zero). The error terms in the approximation are intersection terms (collapses) between the variables in $\alpha$ and $\beta$.
\begin{definition}
    Say that shapes $\alpha$ and $\beta$ are composable if $V_\alpha$ and $U_\beta$ have the same number of square and circle vertices. We say a shape $\gamma$ is a gluing of $\alpha$ and $\beta$, if the graph of $\gamma$ is the disjoint union of the graphs of $\alpha$ and $\beta$, followed by identifying $V_\alpha$ and $U_\beta$ under some type-preserving bijection, and if $U_\gamma = U_\alpha$ and $V_\gamma = V_\beta$.
\end{definition}

\begin{proposition}\label{prop:graph-matrix-multiplication}
    Let $\alpha, \beta$ be composable shapes. Assume that $V(\alpha) \setminus V_\alpha$ has only square vertices. Let $\{\gamma_i\}$ be the distinct gluings of $\alpha$ and $\beta$, and let $\widetilde{{\mathcal I}}$ be the set of improper collapses of any number of squares (possibly zero) in $V(\alpha) \setminus V_\alpha$ with distinct squares in $V(\beta) \setminus U_\beta$ in any gluing $\gamma_i$. Then there are coefficients $c_\gamma$ for $\gamma \in \widetilde{{\mathcal I}}$ such that
    \[ M_\alpha\cdot M_\beta = \displaystyle\sum_{\gamma \in \widetilde{{\mathcal I}}} c_\gamma M_\gamma.\]
    Furthermore, the coefficients satisfy $\abs{c_\gamma} \leq
    2^{\abs{V(\alpha) \setminus V_\alpha}}\abs{V(\gamma)}^{\abs{V(\alpha) \setminus U_\alpha}}$.
\end{proposition}
\begin{proof}
    The product $M_\alpha \cdot M_\beta$ is a matrix which is a symmetric function of the inputs $(d_1, \dots, d_m)$, the space of which is spanned by the $M_\gamma$ over all possible shapes $\gamma$ (not restricted to $\widetilde{{\mathcal I}}$), so there exist coefficients $c_\gamma$ if we allow all shapes $\gamma$. We need to check that $M_\alpha \cdot M_\beta$ actually lies in the span of shapes in $\widetilde{{\mathcal I}}$ by showing that all ribbons in $M_\alpha \cdot M_\beta$ have shapes in $\widetilde{{\mathcal I}}$. Expanding the definition,
{\footnotesize
    \[ M_\alpha \cdot M_\beta = \left(\displaystyle\sum_{R \text{ is a ribbon of shape }\alpha} M_R\right)\left(\sum_{S\text{ is a ribbon of shape }\beta} M_S\right) = \displaystyle\sum_{\substack{R \text{ is a ribbon of shape }\alpha,\\ S \text{ is a ribbon of shape }\beta}} M_R M_S.\]
}%
    In order for $M_RM_S$ to be nonzero, we require $B_R = A_S$ as sets; $R$ may assign the labels arbitrarily inside $B_R$, resulting in different gluings of $\alpha$ and $\beta$. Fix $R$ and $S$, and let $\gamma$ be the corresponding gluing of $\alpha$ and $\beta$ for this $R$ and $S$.

    The matrix $M_RM_S$ has one nonzero entry; we claim that it is a Fourier character for a ribbon $T$ which is a collapse of $\gamma$. The labels of $R$ outside of $B_R$ can possibly overlap with the labels of $S$ outside of $A_S$, and naturally the shape of $T$ is the result of collapsing vertices in $\gamma$ with the same label.

    To bound the coefficients $c_\gamma$ that appear, it suffices to bound the coefficient on a ribbon $M_T$, which is bounded by the number of contributing ribbons $R, S$, where we say ribbons $R$ of shape $\alpha$ and $S$ of shape $\beta$ contribute to $T$ if $M_RM_S = M_T$. From $T$, we can completely recover the sets $A_R$ and $B_S$. The labels of $V(R) \setminus A_R$ must be among the labels of $T$; choose them in at most $\abs{V(\gamma)}^{\abs{V(\alpha) \setminus U_\alpha}}$ ways. This also determines $B_R =A_S$. All that remains is to determine the graph structure of $S$. Since improper collapsing doesn't lose any edges, knowing the labels of $R$ we know exactly which edges of $T$ must come from $R$ and $S$. The vertices $V(T) \setminus V(R)$ must come from $S$, as must $B_R$; pick a subset of $V(R) \setminus B_R$ to include in $2^{\abs{V(\alpha) \setminus V_\alpha}}$ ways.
\end{proof}


Let $\alpha$ be a left spider with end vertices $\square{i}, \square{j}$ which are adjacent to a circle $\circle{u}$. Recall that our goal is to argue that ${\mathcal M} M_\alpha \approx 0$. To get there, we can try and factor $M_\alpha$ across the vertex separator $S = U_\alpha \cup \{\circle{u}\} \setminus \{\square{i},\square{j}\}$ which separates $\alpha$ into
\[ M_\alpha \approx L_{\abs{U_\alpha}} \cdot M_{\body(\alpha)}\]
where we have defined,
\begin{definition}
    Let $\alpha$ be a left spider with end vertices $\square{i}, \square{j}$ which are adjacent to a circle $\circle{u}$.
    Define $\body(\alpha)$ as the shape whose graph is $\alpha$ with $\square{i}$ and $\square{j}$ deleted and with $U_{\body(\alpha)} = U_\alpha \cup \{\circle{u}\} \setminus \{\square{i},\square{j}\}$, $V_{\body(\alpha)} = V_\alpha$. The definition is analogous for right spiders.
\end{definition}
Due to~\cref{lem:completed-left-side}, the right-hand side of the approximation is in the null space of ${\mathcal M}$. We now formalize this approximate factorization.

\begin{definition}
	Let $\alpha$ be a spider with end vertices $\square{i}, \square{j}$. Define $\widetilde{{\mathcal I}}_{\alpha}$ to be the set of shapes that can be obtained from $\alpha$ by performing at least one of the following steps:
	\begin{itemize}
	    \item Improperly collapse $\square{i}$ with a square vertex in $\alpha$
	    \item Improperly collapse $\square{j}$ with a square vertex in $\alpha$
	\end{itemize}
	Let ${\mathcal I}_\alpha$ be the set of proper shapes that can be obtained via the same process but using proper collapses.
\end{definition}
In the above definition, we allow $\square{i}, \square{j}$ to collapse with two distinct squares, or to collapse together, or to both collapse with a common third vertex. For technical reasons we need to work with a refinement of ${\mathcal I}_\alpha$ into two sets of shapes and use tighter bounds on coefficients of one set.
\begin{definition}
    Let ${\mathcal I}_{\alpha}^{(1)}$ be the set of shapes that can be obtained from $\alpha$ by performing at least one of the following steps:
	\begin{itemize}
	    \item Collapse $\square{i}$ with a square vertex in $\body(\alpha) \setminus U_\alpha$
	    \item Collapse $\square{j}$ with a square vertex in $\body(\alpha) \setminus U_\alpha$ (distinct from $\square{i}$'s collapse if it happened)
	\end{itemize}
    Let ${\mathcal I}_\alpha^{(2)}:= {\mathcal I}_\alpha \setminus {\mathcal I}_\alpha^{(1)}$
    and define the improper versions $\widetilde{{\mathcal I}}_\alpha^{(1)}, \widetilde{{\mathcal I}}_\alpha^{(2)}$ analogously.
\end{definition}

\begin{lemma}\label{lem:improper-collapse}
	Let $\alpha$ be a left spider with end vertices $\square{i}, \square{j}$. There are coefficients $c_\beta$ for $\beta \in \widetilde{{\mathcal I}}_\alpha$ such that
	\[L_{\abs{U_\alpha}} \cdot M_{\body(\alpha)} = 2M_{\alpha} + \sum_{\beta \in \widetilde{{\mathcal I}}_\alpha}c_\beta M_\beta,\]
	\[\abs{c_\beta} \leq
	\begin{cases}
	 40\abs{V(\alpha)}^3 & \beta \in \widetilde{{\mathcal I}}_\alpha^{(1)}\\
	 \frac{40\abs{V(\alpha)}^3}{n} & \beta \in \widetilde{{\mathcal I}}_\alpha^{(2)}
	\end{cases}.\]
\end{lemma}
\begin{proof}
    First, we can check that the coefficient of $M_\alpha$ is 2. Only the $\ell_k$ term of $L_k$ has the full number of squares, and it has a factor of 2 in $L_k$.

    The shapes in $\widetilde{{\mathcal I}}_\alpha$ are definitionally the intersection
    terms that appear in this graph matrix product, and furthermore the shapes in
    $\widetilde{{\mathcal I}}_\alpha^{(1)}$ are definitionally the intersection terms for the $\ell_k$ term.
    Using~\cref{prop:graph-matrix-multiplication}, for each of the five shapes
    in $L_{\abs{U_\alpha}}$ the coefficient it contributes is bounded by
    $4\abs{V(\alpha)}^3$. The coefficient on $\ell_k$ is 2, so the coefficients
    for $\widetilde{{\mathcal I}}_\alpha^{(1)}$ are at most $8 \abs{V(\alpha)}^3$. The
    maximum coefficient of the other four shapes in $L_{\abs{U_\alpha}}$ is
    $\frac{2}{n}$, so their total contribution to coefficients on
    $\widetilde{{\mathcal I}}_\alpha^{(2)}$ is at most $\frac{32\abs{V(\alpha)}^3}{n}$.
\end{proof}

We now want to turn our improper shapes into proper ones from ${\mathcal I}_\alpha$. Unfortunately it is not quite true that to expand an improper shape, one can just expand each edge individually
(though this is true for improper ribbons).
There is an additional difficulty that arises due to ribbon symmetries. To see the difficulty, consider the example given in \cref{fig:ribbon-symmetry} below.

\begin{figure}[!ht]
  \centering
  \begin{tikzpicture}[scale=0.5,every node/.style={scale=0.5}]
   
    \draw  (-6.5,1.5) rectangle node {\huge $u_1$} (-5,0);
    \draw  (5,1.5) rectangle node {\huge $v_2$} (6.5,0);
    \draw  (0,2.5) ellipse (1 and 1) node {\huge $w_1$};
    \draw  (0,-1) ellipse (1 and 1) node {\huge $w_2$};
    \node (v1) at (-5,0.75) {};
    \node (v3) at (1,2.5) {};
    \node (v2) at (-1,2.5) {};
    \node (v4) at (1,-1) {};
    \draw  (-7,2) rectangle (-4.5,-0.5);
    \node at (-5.5,-1.5) {\huge $U_{\alpha}$};
    \draw  (4.5,2) rectangle (7,-0.5);
    \node at (6,-1.5) {\huge $V_{\alpha}$};
    \node (v6) at (-1,-1) {};
    \node (v5) at (4.95,0.75) {};
    \draw (v3);
    \draw  plot[smooth, tension=.7] coordinates {(v3) (v5)};
    \draw  plot[smooth, tension=.7] coordinates {(v5) (v4)};
    \draw  plot[smooth, tension=.7] coordinates {(v1) (v6)};
    \draw  plot[smooth, tension=.7] coordinates {(v1) (-3,2.5) (v2)};
    \draw  plot[smooth, tension=.7] coordinates {(v1) (-2.5,1) (v2)};
    \node at (-3.5,3) {\Large $1$};
    \node at (-2,0.5) {\Large $1$};
    \node at (-3,-0.5) {\Large $2$};
    \node at (3,2) {\Large $2$};
    \node at (3,-0.5) {\Large $2$};

   
    \draw  (12.5,-3.5) rectangle node {\huge $u_1$} (14,-5);
    \draw  (24,-3.5) rectangle node {\huge $v_2$} (25.5,-5);
    \draw  (19,-2.5) ellipse (1 and 1) node {\huge $w_1$};
    \draw  (19,-6) ellipse (1 and 1) node {\huge $w_2$};
    \node (v11) at (14,-4.25) {};
    \node (v13) at (20,-2.5) {};
    \node (v12) at (18,-2.5) {};
    \node (v14) at (20,-6) {};
    \draw  (12,-3) rectangle (14.5,-5.5);
    \node at (13.5,-6.5) {\huge $U_{\gamma_2}$};
    \draw  (23.5,-3) rectangle (26,-5.5);
    \node at (25,-6.5) {\huge $V_{\gamma_2}$};
    \node (v16) at (18,-6) {};
    \node (v15) at (23.95,-4.25) {};
    \draw (v13);
    \draw  plot[smooth, tension=.7] coordinates {(v13) (v15)};
    \draw  plot[smooth, tension=.7] coordinates {(v15) (v14)};
    \draw  plot[smooth, tension=.7] coordinates {(v11) (v16)};

    \node at (16,5.5) {\Large $2$};
    \node at (16,-5.5) {\Large $2$};
    \node at (22,-3) {\Large $2$};
    \node at (22,-5.5) {\Large $2$};

   
    \draw  (12.5,5) rectangle node {\huge $u_1$} (14,3.5);
    \draw  (24,5) rectangle node {\huge $v_2$} (25.5,3.5);
    \draw  (19,6) ellipse (1 and 1) node {\huge $w_1$};
    \draw  (19,2.5) ellipse (1 and 1) node {\huge $w_2$};
    \node (v11) at (14,4.25) {};
    \node (v13) at (20,6) {};
    \node (v12) at (18,6) {};
    \node (v14) at (20,2.5) {};
    \draw  (12,5.5) rectangle (14.5,3);
    \node at (13.5,2) {\huge $U_{\gamma_1}$};
    \draw  (23.5,5.5) rectangle (26,3);
    \node at (25,2) {\huge $V_{\gamma_1}$};
    \node (v16) at (18,2.5) {};
    \node (v15) at (23.95,4.25) {};
    \draw (v13);
    \draw  plot[smooth, tension=.7] coordinates {(v13) (v15)};
    \draw  plot[smooth, tension=.7] coordinates {(v15) (v14)};
    \draw  plot[smooth, tension=.7] coordinates {(v11) (v16)};
    \node at (16,3) {\Large $2$};
    \node at (22,5.5) {\Large $2$};
    \node at (22,3) {\Large $2$};

    \draw  plot[smooth, tension=.7] coordinates {(v11)};
    \draw  plot[smooth, tension=.7] coordinates {(v11) (v12)};
    \node at (8.5,0.5) {\Huge $=$};
    \node at (19,0) {\Huge $+$};
    \node at (10.5,4) {\Huge \bf $2 \times$};
  \end{tikzpicture}
  \caption{A surprising equality of graph matrices.}
  \label{fig:ribbon-symmetry}
\end{figure}

One would expect both coefficients on the right shapes to be 1 since $h_1(z)^2 = h_2(z) + h_0(z)$. However, in the left shape, the two circles are distinguishable, hence summing over all ribbons includes one with $w_1 = i, w_2 =  j$ and a second with $w_1 = j, w_2 = i$. On the top right shape, the circles are indistinguishable, hence the graph/ribbon where the circles are assigned $\{i, j\}$ is counted only once, and a coefficient of $2$ is needed to match the two ribbons on the left. On the bottom right shape, the circles are distinguishable, so all ribbons are summed once. To bound the new coefficients, we use the concept of shape automorphisms.

\begin{definition}
    An automorphism of a shape $\alpha$ is a function $\varphi:V(\alpha) \rightarrow V(\alpha)$ that preserves the sets $U_\alpha, V_\alpha$ and is an automorphism of the underlying edge-labeled graph. Let $\aut(\alpha)$ denote the automorphism group of $\alpha$.
\end{definition}

\begin{proposition}\label{prop:expand-improper}
    Let $\alpha$ be an improper shape, and let ${\mathcal P}$ be the set of proper shapes that can be obtained by expanding $\alpha$. Then there are coefficients $\abs{c_\gamma} \leq C_{Fourier}\cdot C_{Aut}$ such that
    \[M_\alpha = \displaystyle\sum_{\gamma \in {\mathcal P}} c_\gamma M_\gamma\]
    where $C_{Fourier}$ is a bound on the magnitude of Fourier coefficients in the expansion and $C_{Aut} = \max_{\gamma \in {\mathcal P}} \frac{\abs{\aut(\gamma)}}{\abs{\aut(\alpha)}}$.
\end{proposition}
\begin{proof}
The number of realizations of a graph matrix giving a particular ribbon is exactly the number of automorphisms, therefore
\begin{align*}
    M_\alpha &= \frac{1}{\abs{\aut(\alpha)}}\displaystyle\sum_{\text{realizations }\sigma} M_{\sigma(\alpha)}
\end{align*}
Expand each improper ribbon $M_{\sigma(\alpha)}$ into proper ribbons with coefficients at most $C_{Fourier}$.
Because the realizations of $\alpha$ and any $\gamma$ are the same, this exactly sums over all $\gamma$ and all realizations of $\gamma$. The
Fourier coefficient on each realization of $\gamma$ is the same; let it be
$c_\gamma'$ with $\abs{c_\gamma'} \leq C_{Fourier}$. Continuing,
\begin{align*}
    &= \displaystyle\frac{1}{\abs{\aut(\alpha)}} \sum_{\gamma \in {\mathcal P}} c_\gamma'\sum_{\text{realizations }\sigma} M_{\sigma(\gamma)}\\
    &= \sum_{\gamma \in {\mathcal P}} c_\gamma' \frac{\abs{\aut(\gamma)}}{\abs{\aut(\alpha)}} M_{\gamma}
\end{align*}
\end{proof}

\begin{proposition}\label{prop:hermite-product-coefficients}
Let $l_1 \leq \cdots \leq l_k \in {\mathbb{N}}$ and let $L = l_1 + \cdots + l_k$. Assume $L \ge 1$. In the Fourier expansion of $h_{l_1}(z)\cdots h_{l_k}(z)$, the maximum coefficient is bounded in magnitude by $(2L)^{L-l_k}$.
\end{proposition}
\begin{proof}
In the boolean case, the coefficient is 1. In the Gaussian case, the ``linearization coefficient'' of $h_p(z)$ in this product is given by orthogonality to be
\[\frac{{\mathbb E}_{z \sim {\mathcal N}(0,1)}[h_{l_1}(z) \cdots h_{l_k}(z) \cdot h_p(z)]}{{\mathbb E}_{z \sim {\mathcal N}(0,1)}[h_p^2(z)]}  = \frac{{\mathbb E}_{z \sim {\mathcal N}(0,1)}[h_{l_1}(z) \cdots h_{l_k}(z) \cdot h_p(z)]}{p!}\]
A formula from, e.g.,~\cite[Example G (Continued)]{RotaWallstrom97} shows that ${\mathbb E}[h_{l_1} \cdots h_{l_k} \cdot h_p]$ equals the number of ``block perfect matchings'': perfect matchings on $l_1 + \cdots + l_k + p$ elements
divided into blocks of size $l_i$ or $p$ such that no two elements from the same block are matched. Bound the number of block perfect matchings by:
\begin{itemize}
    \item Pick a partial function from blocks $l_1, \dots, l_{k-1}$ to $[L]$ in at most $(L+1)^{L-l_k}$ ways.
    \item If this forms a valid partial matching and there are $p$ unmatched elements remaining, match them with the elements from the block of size $p$ in $p!$ ways.
\end{itemize}
Therefore the coefficient is bounded by $(L+1)^{L - l_k} \leq (2L)^{L-l_k}$.
\end{proof}

\begin{proposition}\label{prop:automorphism-ratio}
For a shape $\alpha$, let $\alpha\pm e$ denote the shape with edge $e$ added or deleted.
Then
\[ \frac{\abs{\aut(\alpha \pm e)}}{\abs{\aut(\alpha)}}~\leq~\abs{V(\alpha)}^2.\]
\end{proposition}
\begin{proof}
We show that the two groups have large subgroups which are equal. Consider $\aut(\alpha\pm e)$ and $\aut(\alpha)$ as group actions on the set $\binom{V(\alpha)}{2}$. Letting $G^e$ denote the stabilizer of edge $e$, observe that $\aut(\alpha\pm e)^e = \aut(\alpha)^e$. By the orbit-stabilizer lemma, the index $\abs{G : G^e}$ is equal to the size of the orbit of $e$, which is at least 1 and at most $\abs{V(\alpha)}^2$. So,
\[\frac{\abs{\aut(\alpha \pm e)}}{\abs{\aut(\alpha)}} = \frac{\abs{\aut(\alpha \pm e) : \aut(\alpha \pm e)^e}}{\abs{\aut(\alpha) : \aut(\alpha)^e}} \leq \abs{V(\alpha)}^2.\qedhere \]
\end{proof}



\begin{lemma}\label{lem:collapse-lemma}
    If $\alpha$ is a left spider, there are coefficients ${c_\beta}$ for each $\beta \in {\mathcal I}_\alpha$ such that
	\[L_{\abs{U_\alpha}} \cdot M_{\body(\alpha)} = 2M_{\alpha} + \sum_{\beta \in {{\mathcal I}}_\alpha}c_\beta M_\beta,\]
	\[\abs{c_\beta} \leq
	\begin{cases}
	 160\abs{V(\alpha)}^7\abs{E(\alpha)}^2 & \beta \in {{\mathcal I}}_\alpha^{(1)}\\
	 \frac{160\abs{V(\alpha)}^7\abs{E(\alpha)}^2}{n} & \beta \in {{\mathcal I}}_\alpha^{(2)}
	\end{cases}.\]
\end{lemma}
\begin{proof}
    We express each $M_\beta, \beta \in \widetilde{{\mathcal I}}_\alpha$ in~\cref{lem:improper-collapse} in terms of proper shapes. We apply~\cref{prop:expand-improper} using the following bounds on $C_{Fourier}$ and $C_{Aut}$. The only improperness in $\beta$ comes from
    collapsing (at most) the two end vertices, which have a single incident
    edge each. Therefore the set of labels of any parallel edges is either
    $\{1,k\}$ or $\set{1,1,k},$ for some $k \leq \abs{E(\alpha)}$. By~\cref{prop:hermite-product-coefficients}, we have $C_{Fourier} \leq 4\abs{E(\alpha)}^2$. There are at most two extra parallel edges in $\beta$, so we have $C_{Aut} \leq \abs{V(\alpha)}^4$ using~\cref{prop:automorphism-ratio}. Therefore the coefficients increase by at most $C_{Fourier}\cdot C_{Aut} \leq 4\abs{E(\alpha)}^2\abs{V(\alpha)}^4$.
\end{proof}

\begin{corollary}\label{cor:right-spider-coefs}
    If $\alpha$ is a right spider, there are coefficients $c_\beta$ with the same bounds given in~\cref{lem:collapse-lemma} such that
	\[M_{\body(\alpha)} \cdot L_{\abs{V_\alpha}}^\intercal = 2M_{\alpha} + \sum_{\beta \in {{\mathcal I}}_\alpha}c_\beta M_\beta.\]
\end{corollary}

\begin{corollary}\label{cor:spider-killing}
    If $x \perp \nullspace({\mathcal M}_{fix})$ and $\alpha$ is a spider, then for some $c_\beta$ with the same bounds given in~\cref{lem:collapse-lemma},
    \[x^\top(M_\alpha - \displaystyle\sum_{\beta \in {\mathcal I}_\alpha} c_\beta M_\beta) x = 0 \]
\end{corollary}
\begin{proof}
For a left spider, since
\[{\mathcal M}_{fix} (2M_\alpha + \displaystyle\sum_{\beta \in {\mathcal I}_\alpha} c_\beta M_\beta) = {\mathcal M}_{fix} \cdot L_{\abs{U_\alpha}} \cdot M_{\body(\alpha)} = 0\]
we are in position to use~\cref{fact:null-space}. For a right spider, the proof is analogous.
\end{proof}

\subsection{Killing all the spiders}

The strategy is to start with the moment matrix ${\mathcal M}$ and apply~\cref{cor:spider-killing} repeatedly until we end up with no spiders in our decomposition. For each spider, killing it via~\cref{cor:spider-killing} leaves only intersection terms. Some of those intersection terms may themselves be smaller spiders, in which case we will apply the corollary again and again until only non-spiders remain. The difficulty during this procedure is to bound the total coefficient accumulated on each non-spider. To capture this process, we define the web of a spider $\alpha$, which will be a directed acyclic graph that will capture the spider killing process. For the sake of distinction, we will call the vertices of this graph ``nodes''.

\begin{definition}[Web of $\alpha$]
    The web $W(\alpha)$ of a spider $\alpha$ is a rooted directed acyclic graph
    (DAG) whose nodes are shapes and whose root is $\alpha$. Each spider node
    $\gamma$ has edges to nodes $\beta$ for each shape $\beta \in
    {\mathcal I}_{\gamma}$.
   
   
    The non-spider nodes are leaves/sinks of the DAG.
\end{definition}
\begin{remark}
  The DAG structure arises because each shape in ${\mathcal I}_\gamma$ has strictly fewer square vertices than $\gamma$ for any spider $\gamma$. As a consequence, the height of a web $W(\alpha)$ is at most $\abs{V(\alpha)}$.
\end{remark}
Each node $\gamma$ of $W(\alpha)$ also has an associated value $v_\gamma$, which is defined by the following process:
\begin{itemize}
	\item Initially, set $v_\alpha = 1$ and for all other $\gamma$, set $v_\gamma = 0$.
	\item Starting from the root and in topological order, each spider node $\gamma$ adds $v_{\gamma} c_\beta$ to $v_\beta$ for each child $\beta \in {\mathcal I}_{\gamma}$, where the $c_\beta$ are the coefficients from~\cref{cor:spider-killing}.
\end{itemize}

\begin{proposition}\label{prop:web-sum}
	If $x \perp \nullspace({\mathcal M}_{fix})$, then
	\[\displaystyle x^\intercal(M_\alpha - \sum_{\text{leaves } \gamma\text{ of }W(\alpha)} v_\gamma M_\gamma)x = 0.\]
\end{proposition}
\begin{proof}
	Start with the equation $x^\intercal M_{\alpha} x = x^\intercal
	v_{\alpha}M_{\alpha} x$. In each step, we take the topologically first spider $\gamma$, which in this case means the spider closest to the root of $W(\alpha)$, that is present in the right hand side of our equation and using \cref{cor:spider-killing}, we
	replace $v_{\gamma}M_{\gamma}$ by $\sum_{\beta \in
	\text{children}(\gamma)} v_\gamma c_\beta M_\beta$.
	Precisely by the definition of the $v_{\gamma}$, this
	process ends with the equation
	\[\displaystyle x^\intercal M_\alpha x = x^\intercal(\sum_{\text{leaves } \gamma\text{ of }W(\alpha)} v_\gamma M_\gamma)x\]
\end{proof}

\begin{proposition}\label{prop:web-parents}
For any node $\beta$ in $W(\alpha)$, $\abs{\parents(\beta)} \leq 4\abs{V(\alpha)}^3 \cdot \abs{E(\alpha)}^2$ where $\parents(\beta)$ is the set of nodes $\gamma$ in $W(\alpha)$ such that $\beta \in {\mathcal I}_{\gamma}$.
\end{proposition}
\begin{proof}
The following process covers all parent left spiders $\gamma$ which could possibly collapse their end vertices to form $\beta$. Starting from $\gamma = \beta,$
\begin{itemize}
    \item Pick a circle vertex $\circle{u} \in V(\gamma)$ to be the neighbor of the end vertices.
    \item Pick a square vertex $\square{i} \in V(\gamma)$ to be the collapse of the first end vertex. ``Uncollapse'' it by adding a new square to $U_{\gamma}$ with a single edge to $\circle{u}$ with label $1$. Flip the value of $U_{\gamma}(\square{i})$. Modify the label of $\{\square{i}, \circle{u}\}$ to any number up to $\abs{E(\alpha)}$.
    \item Pick a square vertex $\square{j} \in V(\gamma)$ to be the second end vertex. Optionally uncollapse it by adding a new square to $\gamma$ in the same way as above.
\end{itemize}
The process can be carried out in at most $\abs{V(\alpha)}^3\abs{E(\alpha)}(\abs{E(\alpha)}+1) \leq 2\abs{V(\alpha)}^3\abs{E(\alpha)}^2$ ways. We multiply by 2 to accommodate right spiders.
\end{proof}

Let us label each parent-child edge $(\gamma, \beta)$ as either a ``type 1'' edge if $\beta \in {\mathcal I}_{\gamma}^{(1)}$ or a ``type 2'' edge if $\beta \in {\mathcal I}_{\gamma}^{(2)}$.


\begin{proposition}\label{prop:web-derivation-number}
  Let $p$ be a path in $W(\alpha)$ with $\#_1(p)$ type 1 edges and $\#_2(p)$ type 2 edges. Then $\#_1(p)~\leq~\abs{E(\alpha)}~+~2\#_2(p)$.
\end{proposition}

\begin{proof}
	For a shape $\gamma$, let ${\mathcal S}_{\gamma}$ be the set of square vertices in $\gamma$. Then, ${\mathcal S}_{\gamma} \cap W_{\gamma}$ will be the set of middle vertices of $\gamma$ which are squares.
  We claim that the quantity $\abs{{\mathcal S}_\gamma \cap W_\gamma} + \abs{U_\gamma
    \setminus (U_\gamma \cap V_\gamma)} + \abs{V_\gamma \setminus
    (U_\gamma \cap V_\gamma)}$ decreases during a collapse.


Fix a pair of consecutive shapes $(\gamma, \beta)$ which form a type
1 edge. Looking at the definition of ${\mathcal I}_\gamma^{(1)}$, each end vertex either
  collapses with (1) nothing, or (2) a vertex of $W_\gamma$, or (3) a vertex from
  $V_\gamma \setminus U_\gamma$ (if $\gamma$ is a left spider;
  for a right spider, $U_\gamma \setminus V_\gamma$).
  Furthermore, case (2) or (3) must occur for at least one of the end vertices and also, they do not collapse together.

  If case (2) occurs, then $\abs{{\mathcal S}_\beta \cap W_\beta} < \abs{{\mathcal S}_\gamma \cap W_\gamma}$ while $\abs{U_\beta
  \setminus (U_\beta \cap V_\beta)} = \abs{U_\gamma \setminus (U_\gamma \cap
  V_\gamma)}$ and $\abs{V_\beta \setminus
    (U_\beta \cap V_\beta)} = \abs{V_\gamma \setminus
    (U_\gamma \cap V_\gamma)}$.
    On the other hand, if case (3) occurs, then $W_\beta = W_\gamma$ while
  $\abs{U_\beta \setminus(U_\beta \cap V_\beta)}<\abs{U_\gamma
  \setminus(U_\gamma \cap V_\gamma)}$ and $\abs{V_\beta \setminus(U_\beta \cap V_\beta)}<\abs{V_\gamma
    \setminus(U_\gamma \cap V_\gamma)}$.
    In all cases, $\abs{{\mathcal S}_\beta \cap W_\beta} + \abs{U_\beta
  	\setminus (U_\beta \cap V_\beta)} + \abs{V_\beta \setminus
  	(U_\beta \cap V_\beta)} < \abs{{\mathcal S}_\gamma \cap W_\gamma} + \abs{U_\gamma
  	\setminus (U_\gamma \cap V_\gamma)} + \abs{V_\gamma \setminus
  	(U_\gamma \cap V_\gamma)}$ as desired.

  Now we bound this expression for $\alpha$. From the definition of ${\mathcal L}$, \cref{def:calL_valid_shapes},
  for spiders appearing in the pseudocalibration,
  the square vertices in $W_{\alpha}$, $U_\alpha \setminus (U_\alpha \cap
  V_\alpha)$ and $V_\alpha \setminus (U_\alpha \cap
  V_\alpha)$ have degree at least $1$ and can only be connected to circle vertices.
  Therefore their number is bounded by $\abs{E(\alpha)}$. Hence, initially
  $\abs{{\mathcal S}_\alpha \cap W_\alpha} + \abs{U_\alpha \setminus (U_\alpha \cap V_\alpha)} + \abs{V_\alpha \setminus (U_\alpha \cap V_\alpha)} \leq \abs{E(\alpha)}$.

  Finally, each type 2 edge in $p$ can only increase the quantity $\abs{{\mathcal S}_\gamma \cap W_\gamma} + \abs{U_\gamma \setminus (U_\gamma \cap V_\gamma)} + \abs{V_\gamma \setminus (U_\gamma \cap V_\gamma)}$
  by at most 2. Therefore, we have the desired inequality $\#_1(p) \leq \abs{E(\alpha)} + 2\#_2(p)$.
\end{proof}
\begin{corollary}\label{cor:web-derivation-number}
    $\#_2(p) \geq \frac{\abs{p}}{3} - \frac{\abs{E(\alpha)}}{3}$.
\end{corollary}
\begin{proof}
  Plug in $\abs{p} = \#_1(p) + \#_2(p)$ and rearrange.
\end{proof}

Finally, we can bound the accumulation on each non-spider by a term which only depends on the parameters of the spider $\alpha$.
\begin{lemma}\label{lem:web-leaves}
There are absolute constants $C_1, C_2$ so that for all leaves $\gamma$ of $W(\alpha)$,
\[ \abs{v_\gamma} \leq (C_1 \cdot \abs{V(\alpha)} \cdot \abs{E(\alpha)})^{C_2 \abs{E(\alpha)}}.\]
\end{lemma}

\begin{proof}
  To bound $\abs{v_\gamma}$ we will sum the contributions of all paths $p = (\beta_0=\alpha,\dots,\beta_r=\gamma)$ in $W(\alpha)$
  starting from $\alpha$ and ending at $\gamma$. This path contributes a product of coefficients $c_\beta$ towards $v_\gamma$.

  \begin{remark}
  Here it is important that type 2 edges have stronger bounds on their coefficients $\abs{c_\beta} \leq  C\cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{O(1)}/n \ll 1$.
  \end{remark}

  Before we proceed with the proof we establish some convenient notation and recall some facts.
  %
  For consecutive shapes $\beta_{i-1},\beta_{i}$ (i.e.,\xspace $\beta_{i}$ is a child of $\beta_{i-1}$),
  we denote by $c_{\beta_i}$ the coefficient from~\cref{cor:spider-killing} applied on $\beta_{i - 1}$.
  %
  By~\cref{prop:web-parents}, the in-degree of $W(\alpha)$ can be bounded as $B_1~\cdot~(\abs{V(\alpha)}\abs{E(\alpha)})^{B_2}$ for some constants $B_1, B_2$. Thus,
  the number of paths of length $r$ ending at $\gamma$ is at most $(B_1\abs{V(\alpha)}\abs{E(\alpha)})^{B_2 r}$. Using \cref{cor:spider-killing}, set $B_1, B_2$ large enough so that $c_{\beta_i}$ is at most $B_1 \cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{B_2}$ for a type $1$ edge (resp. $B_1 \cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{B_2} / n$ for a type $2$ edge).
  {\footnotesize

  \begin{align*}
      \abs{v_\gamma} &\le \sum_{r=0}^\infty \sum_{\substack{p = (\beta_0=\alpha,\dots,\beta_r=\gamma) \\ \textup{path from $\alpha$ to $\gamma$ in } W(\alpha)}} \prod_{i=1}^r \abs{c_{\beta_i}}\\
      &\le \sum_{r=0}^\infty\sum_{\substack{p = (\beta_0=\alpha,\dots,\beta_r=\gamma) \\ \textup{path from $\alpha$ to $\gamma$ in } W(\alpha)}} \left(B_1 \cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{B_2} \right)^{\#_1(p)} \left(B_1 \cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{B_2}/n \right)^{\#_2(p)} \\
      &\le \sum_{r=0}^\infty\sum_{\substack{p = (\beta_0=\alpha,\dots,\beta_r=\gamma) \\ \textup{path from $\alpha$ to $\gamma$ in } W(\alpha)}} \left(B_1 \cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{B_2} \right)^{\abs{E(\alpha)} + 2\#_2(p)} \left(B_1 \cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{B_2}/n \right)^{\#_2(p)} \\
      & = \sum_{r=0}^\infty\sum_{\substack{p = (\beta_0=\alpha,\dots,\beta_r=\gamma) \\ \textup{path from $\alpha$ to $\gamma$ in } W(\alpha)}} \left(B_1 \cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{B_2} \right)^{\abs{E(\alpha)}} \left(B_1' \cdot (\abs{V(\alpha)}\abs{E(\alpha)})^{B_2'}/n \right)^{\#_2(p)}
  \end{align*}
  }
for some constants $B_1', B_2'$,
where the second inequality follows from \cref{cor:spider-killing} and the third inequality follows from \cref{prop:web-derivation-number}.
We split the above sum into two sums, $r \le 3|E(\alpha)|$ and $r > 3|E(\alpha)|$. For $r \leq 3\abs{E(\alpha)}$, upper bounding the $\#_2(p)$ term by 1 and upper bounding
  the number of paths by $(B_1\abs{V(\alpha)}\abs{E(\alpha)})^{B_2 r}$ gives a
  bound of $(B_1''\abs{V(\alpha)}\abs{E(\alpha)})^{B_2'' \abs{E(\alpha)}}$ for some constants $B_1'', B_2''$.
  For larger $r$, we lower bound
  $\#_2(p) \geq r/3 - \abs{E(\alpha)}/3 \geq r/9$ using~\cref{cor:web-derivation-number} together with $\abs{E(\alpha)} < r/3$. Applying the same bound on the number of paths,
  the total contribution of the terms corresponding to larger $r$ is bounded by
  1 using the power of $n$ in the denominator (assuming $\delta, \tau$ are
  small enough).
\end{proof}

We define the result of all this spider killing to be a new matrix ${\mathcal M}^+$.
\begin{definition}
    Define the matrix ${\mathcal M}^+$ as the result of killing all the spiders,
    \[{\mathcal M}^+ := {\mathcal M} - \displaystyle\sum_{\text{spiders }\alpha} \lambda_\alpha \left( M_\alpha - \sum_{\text{leaves }\gamma \text{ of }W(\alpha)} v_\gamma M_\gamma \right)\]
\end{definition}



\section{Chapter Notes}

The technical results in this chapter are adapted from \cite{sklowerbounds}, joint work with Mrinalkanti Ghosh, Fernando Granha Jeronimo, Chris Jones and Aaron Potechin.

\input{sherrington_kirkpatrick/open_problems}
\subsection{Norm Bounds}\label{app:norm_bounds}

The precise norm bounds we use come from applying the trace power method
in~\cite{ahn2016graph}, but qualitatively, the bounds from \cref{chap: efron_stein} also work. The paper~\cite{ahn2016graph} uses a slightly different
definition of matrix index. They define a \textit{matrix index piece}
as a tuple of distinct elements from either ${\mathcal C}_m$ or ${\mathcal S}_n$
along with a fixed integer denoting multiplicity. A matrix index is
then a set of matrix index pieces. Our graph matrix $M_\alpha$ appears
as a submatrix of those matrices: for a given set of square vertices,
order the squares in increasing order in a tuple, and assign it
multiplicity 1. Hence the same norm bounds apply.


Boolean norm bounds:
\begin{lemma}\label{lem:norm-bounds}
Let $V_{rel}(\alpha) := V(\alpha) \setminus (U_\alpha \cap V_\alpha)$. There is a universal constant $C$ such that the following norm bound holds for all proper shapes $\alpha$ w.h.p.:
\[\norm{M_\alpha} \leq 2\cdot\left(\abs{V(\alpha)} \cdot \log(n)\right)^{C\cdot \abs{V_{rel}(\alpha)}} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}} \]
\end{lemma}
\begin{proof}
From Corollary 8.13 of~\cite{ahn2016graph}, with probability at least $1-\varepsilon$ for a fixed shape $\alpha$,
\[\norm{M_\alpha} \leq 2 \abs{V(\alpha)}^{\abs{V_{rel}(\alpha)}}\cdot \left( 6e \ceil{\frac{\log\left(\frac{n^{w(S_{\min})}}{\varepsilon}\right)}{6\abs{V_{rel}(\alpha)}}}\right)^{\abs{V_{rel}(\alpha)}} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}\]
Letting $N_k$ be the number of distinct shapes on $k$ vertices (either
circles or squares), we apply the corollary with $\varepsilon = 1/(mn
N_{\abs{V(\alpha)}})$. Union bounding, the failure probability across
all shapes of size $k$ is at most $1/mn$, and since the number of
vertices in a shape is at most $m + n \leq 2m$, we have a bound that
holds with high probability for all shapes. It remains to simplify the
exact bound.

\begin{proposition}\label{prop:boolean-shape-counting}
$N_k \leq 8^k 2^{k^2}$
\begin{proof}
The following process forms all shapes on $k$ vertices: starting from $k$ formal variables, assign each variable to be either a circle or a square, decide whether each variable is in $U_\alpha$ and/or $V_\alpha$, then among the $k^2$ variable pairs put any number of edges.
\end{proof}
\end{proposition}
We also bound $n^{w(S_{\min})} \leq (mn)^{\abs{V(\alpha)}}$.
{\footnotesize
\begin{align*}
    \norm{M_\alpha} & \leq 2 \abs{V(\alpha)}^{\abs{V_{rel}(\alpha)}}\cdot \left( 6e \ceil{\frac{\log\left(n^{w(S_{\min})} \cdot mn N_{\abs{V(\alpha)}}\right)}{6\abs{V_{rel}(\alpha)}}}\right)^{\abs{V_{rel}(\alpha)}} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}} \\
    & \leq 2 \abs{V(\alpha)}^{\abs{V_{rel}(\alpha)}}\cdot \left( 12e \log\left(n^{w(S_{\min})} \cdot mn N_{\abs{V(\alpha)}}\right)\right)^{\abs{V_{rel}(\alpha)}} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}} \\
    & \leq 2 \abs{V(\alpha)}^{\abs{V_{rel}(\alpha)}}\cdot \left( 12e \log\left((mn)^{\abs{V(\alpha)}} \cdot mn\cdot 8^{\abs{V(\alpha)}} 2^{\abs{V(\alpha)}^2}\right)\right)^{\abs{V_{rel}(\alpha)}} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}\\
    &  \leq 2 \abs{V(\alpha)}^{\abs{V_{rel}(\alpha)}}\cdot \left( 100e \abs{V(\alpha)}^2 \log\left(mn\right)\right)^{\abs{V_{rel}(\alpha)}} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}\\
    & \leq 2\cdot\left(\abs{V(\alpha)} \cdot \log(mn)\right)^{3\cdot \abs{V_{rel}(\alpha)}} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}
\end{align*}
}%
Note that we now assume $m \leq n^2$.
\end{proof}

We have the following norm bound for Hermite shapes. For a Hermite shape $\alpha$, define the \textit{total size} to be $\abs{U_\alpha} + \abs{V_\alpha} + \abs{W_\alpha} + \abs{E(\alpha)}$.
\begin{lemma}\label{lem:gaussian-norm-bounds}
Let $V_{rel}(\alpha) := V(\alpha) \setminus (U_\alpha \cap V_\alpha)$ as sets. There is a universal constant $C$ such that the following norm bound holds for all proper shapes $\alpha$ with total size at most $n$ w.h.p.:
\[ \norm{M_\alpha} \leq 2\cdot\left(\abs{V(\alpha)} \cdot (1+\abs{E(\alpha)}) \cdot \log(n)\right)^{C\cdot (\abs{V_{rel}(\alpha)} + \abs{E(\alpha)})} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}\]
\end{lemma}

The proof performs the same calculation starting from~\cite[Corollary 8.15]{ahn2016graph}. Note that in our notation, $l(\alpha) = \abs{E(\alpha)}$. There is a further difference which is that~\cite{ahn2016graph} uses normalized Hermite polynomials whereas we use unnormalized Hermite polynomials; this contributes the additional term $\prod_{e \in E(\alpha)} l(e)! \leq (1+\abs{E(\alpha)})^{\abs{E(\alpha)}}$. We must replace Proposition~\ref{prop:boolean-shape-counting} with the following:
\begin{proposition}\label{prop:gaussian-shape-counting}
The number of Hermite shapes with total size $k$ is at most $k2^k(k+1)^{2k+k^2}$.
\begin{proof}
Such a shape has at most $k$ distinct variable vertices. Each of these is either a circle or a square. Each variable can be in $U_\alpha$ with multiplicity between 0 and (at most) $k$, and also in $V_\alpha$ with multiplicity between 0 and $k$. The $k^2$ possible pairs of vertices can have edge multiplicity in $E(\alpha)$ between 0 and $k$.
\end{proof}
\end{proposition}

\section{Satisfying the Constraints Exactly}
\label{sec:exact-constraints}

The PAP constraints ``$\ip{v}{d_u}^2 = 1$'' are not exactly
satisfied by the pseudocalibration,
but they are satisfied up to truncation error $\tilde{\EE}[\ip{v}{d_u}^2 -1] = n^{-\Omega(n^\tau)}$. This is enough to produce a Sherrington-Kirkpatrick solution that is \textit{almost} boolean, meaning $\tilde{\EE}[x_i^2] = 1 \pm n^{-\Omega(n^\tau)}$ where the pseudocalibration is truncated to degree $n^\tau$. To satisfy the constraints exactly, and produce an SK solution which is \textit{exactly} boolean, we can project the pseudocalibration operator. The goal of this section is to prove the following lemma for the PAP problem,

\begin{lemma}\label{lem:pseudoexpectation-rounding}
W.h.p. for the PAP problem there is $\tilde{\EE}' \in {\mathbb R}^{\binom{[n]}{\leq D}}$ such that $\norm{\tilde{\EE} - \tilde{\EE}'}_2 \leq \frac{1}{n^{\Omega(n^\tau)}}$ and $\tilde{\EE}'$ exactly satisfies the constraints ``$\ip{v}{d_u}^2 = 1$''.
\end{lemma}
\begin{remark}
Note that $\tilde{\EE}'$ is syntactically guaranteed to still satisfy the constraints ``$v_i^2 = \frac{1}{n}$''.
\end{remark}

\begin{corollary}\label{lem:constraint-fixing}
There is an $\binom{[n]}{\leq D/2} \times \binom{[n]}{\leq D/2}$ matrix ${\mathcal E}$ with $\norm{{\mathcal E}} \leq \frac{1}{n^{\Omega(n^\tau)}}$ such that the matrix $M_{fix}~:=~M~+~{\mathcal E}$ is SoS-symmetric and exactly satisfies the constraints ``$\ip{v}{d_u}^2 = 1$''.
\end{corollary}

We view the operators $\tilde{\EE}$ as vectors in ${\mathbb R}^{\binom{[n]}{\leq D}}$. The approach we take is to define a ``check matrix'' $Q$ such that $\tilde{\EE}$ satisfies the necessary constraints iff $\tilde{\EE} \in \nullspace(Q)$. If the constraints were functions of $v$ only, the matrix $Q$ would be filled with constants. Since the constraints depend on the inputs $d_u$, the matrix $Q$ is also a function of the $d_u$. This allows us to deconstruct it as a sum of graph matrices---and in fact it is made out of graph matrices which we have seen already.

\begin{definition}
We let $Q$ be the matrix
\[Q := \displaystyle\sum_{k=2}^{D}L_k^\intercal \]
where the matrices $L_k$ are defined in~\cref{sec:psd}.
\end{definition}

\begin{lemma}
$Q\tilde{\EE} = 0$ if and only if $\tilde{\EE}$ exactly satisfies the constraints ``$\ip{v}{d_u}^2 = 1$''.
\end{lemma}
\begin{proof}
One can see in the proof of~\cref{lem:completed-left-side} that the entries of $Q\tilde{\EE}$ measure exactly the error in the constraints.
\end{proof}

The natural choice of $\tilde{\EE}'$ is therefore the projection of $\tilde{\EE}$ to the nullspace. This is defined by
\[\tilde{\EE}' := \tilde{\EE} - Q^\intercal(QQ^\intercal)^+Q\tilde{\EE} \]
where we take the pseudo-inverse of $QQ^\intercal$ as it will turn out not to be invertible. 

To prove~\cref{lem:pseudoexpectation-rounding}, we must decompose the second 
term in terms of graph matrices and show it has small norm.

As a warm-up, we end this outline by 
showing a simpler projection argument in the Planted Boolean Vector domain is 
sufficient if one just wants to satisfy the boolean constraints in the Planted Boolean Vector problem rather than the constraints of the PAP problem.\footnote{Using the translation between the two problems in~\cref{sec:sk}, this would allow us to exactly satisfy ``$\ip{v}{d_u}^2 =1$'' for the PAP problem. Unfortunately, the constraints ``$v_i^2 =\frac{1}{n}$'' might be broken.}


Let $\tilde{\EE}_{\text{PBV}}$ be a candidate, not-yet-boolean, degree-$D$ pseudoexpectation operator for the Planted Boolean Vector problem, $D = 2\cdot n^\delta$. $\tilde{\EE}_{\text{PBV}}$ has an entry for each monomial $b^\alpha$, therefore it 
is $\multiset{n}{\leq D}$-dimensional. Let $Q_{bool}$ be the ``check  
matrix'' for the boolean constraints. $Q_{bool}$ has $n~\cdot~\multiset{n}{\leq D-2}$ rows. The $(i, \alpha)$ row checks $\tilde{\EE}[b^\alpha \cdot b_i^2] = \tilde{\EE}[b^\alpha]$. It has entry 1 in column $\alpha$ and entry $-1$ in column $\alpha \cup \{i, i\}$. 

\begin{lemma}
Assume that $\tilde{\EE}_{\text{PBV}}$ approximately satisfies the boolean constraints:
\[\abs{\tilde{\EE}_{\text{PBV}} [b^\alpha \cdot (b_i^2 - 1)]} \leq n^{-\Omega(n^\tau)}\]
for any $b^\alpha$ with degree at most $D-2$. Then letting $\tilde{\EE}_{\text{PBV}}'$ be the projection to $\nullspace(Q_{bool})$, we have
\[\norm{\tilde{\EE}_{\text{PBV}} - \tilde{\EE}_{\text{PBV}}'}_2 \leq n^{-\Omega(n^\tau)}.\]
\end{lemma}
\begin{proof}
The effect of projecting $\tilde{\EE}$ to $\nullspace(Q_{bool})$ is to symmetrize $\tilde{\EE}[b^{\alpha + 2\beta}]$ across all $\beta$; average all entries $\tilde{\EE}[1], \tilde{\EE}[b_1^2], \tilde{\EE}[b_2^2], \tilde{\EE}[b_1^6b_7^4b_{10}^2]$ etc, average $\tilde{\EE}[b_1], \tilde{\EE}[b_1b_3^2], \tilde{\EE}[b_1b_3^4b_4^4]$ etc, and so on. One can see this because this is a linear map which fixes $\nullspace(Q_{bool})$ and takes all vectors into $\nullspace(Q_{bool})$.

By assumption, there is additive error $n^{-\Omega(n^\tau)}$ between $\tilde{\EE}_{\text{PBV}}[b^\alpha]$ and 
$\tilde{\EE}_{\text{PBV}}[b^\alpha \cdot b_i^2]$. As the size of $\beta$ is at most $D \ll n^\tau$, we still easily have $\tilde{\EE}_{\text{PBV}}[b^{\alpha + 2\beta}] = \tilde{\EE}_{\text{PBV}}[b^\alpha] \pm n^{-\Omega(n^\tau)}$ for all $\beta$. 
Therefore 
averaging these entries changes each of them by at most 
$n^{-\Omega(n^\tau)}$. Thus,
\begin{align*}
\norm{\tilde{\EE}_{\text{PBV}} - \tilde{\EE}_{\text{PBV}}'}_2 &\leq \multichoose{n}{\leq D} \cdot \norm{\tilde{\EE}_{\text{PBV}} - \tilde{\EE}_{\text{PBV}}'}_\infty\\
&\leq n^{O(n^{\delta})} \cdot n^{-\Omega(n^\tau)} = n^{-\Omega(n^\tau)}
\end{align*}
\end{proof}

\subsection{Truncation error in the pseudocalibration}

The constraint ``$\ip{v}{d_u}^2 = 1$'' isn't exactly satisfied, but a general property of pseudocalibration is that it's satisfied up to truncation error, which is small w.h.p. We show a quantitative version of this bound.

We introduce the notation
\[\mu_{I, \alpha} := {\mathbb E}_{\text{pl}} [v^I \chi_{\alpha}(d)] \]
where $\chi_\alpha(d) = h_\alpha(d)$ in the Gaussian case and $\chi_\alpha(d) = d^\alpha$ in the boolean case.

\begin{lemma}\label{lem:boolean-approximate-constraints}
    Let $p(d,v)$ be a polynomial that is identically zero on the planted distribution. Let $\deg_d(p) = D$. For any $I \subseteq [n]$, the only nonzero Fourier coefficients of $\tilde{\EE}[v^I p]$ are those with size in the range $n^\tau \pm D$.

    Furthermore, the nonzero coefficients are bounded in absolute value by
    \[ M \cdot L \cdot 2^D  e^{mn}\cdot \max_I\max_{\abs{\alpha} \in n^\tau \pm 2D} \abs{\mu_{I, \alpha}} \]
    where $M$ is the number of nonzero monomials of $p$ and $L$ is the largest coefficient of $p$ (in absolute value).
\end{lemma}

\begin{proof} We divide the calculations into boolean and Gaussian cases. For each case we compute that Fourier coefficients below the truncation threshold neatly cancel and bound the coefficients at the threshold.

\noindent\textbf{(Boolean case)} 
Expand $p(d,v) = \displaystyle\sum_{\abs{J} \leq D} d^J p_J(v)$. By linearity,
\[\tilde{\EE}[v^Ip] =  \displaystyle\sum_{\abs{J} \leq D} d^J \tilde{\EE}[v^Ip_J(v)].\]
The $\alpha$-th Fourier coefficient gets a contribution from the $J$-th term equal to the $(\alpha \oplus J)$-th Fourier coefficient of $\tilde{\EE}[v^Ip_J(v)]$. Expand the polynomial $p_J$ in the $J$-th term,
\[ \tilde{\EE}[v^Ip_J(v)] = \sum_{K} c_{J,K} \tilde{\EE}[v^Iv^K]\]
The $(\alpha \oplus J)$-th coefficient of $\tilde{\EE}[v^Iv^K]$ is defined by pseudocalibration to be
\begin{equation}\label{eq:boolean-fourier-cases}
\left\{
\begin{array}{lr}
    \mu_{I+K,\alpha\oplus J} & \abs{\alpha \oplus J} \leq n^\tau\\
    0 & \abs{\alpha \oplus J} > n^\tau
\end{array}\right.
\end{equation}
For $\abs{\alpha} \leq n^\tau - D$ we are guaranteed to be in the first case. For this case the total $\alpha$-th Fourier coefficient is
\begin{align*}
    \displaystyle\sum_{\abs{J} \leq D} \sum_K c_{J,K}\mu_{I+K, \alpha \oplus J} &= \sum_{\abs{J} \leq D} \sum_K c_{J,K}{\mathbb E}_{\text{pl}}[v^{I}v^K d^{J \oplus \alpha}]\\
    &= \sum_{\abs{J} \leq D} \sum_K c_{J,K}{\mathbb E}_{\text{pl}}[v^{I}v^K d^\alpha d^{J}]\\
    &= {\mathbb E}_{\text{pl}}[v^{I}d^{\alpha}p(d,v)]\\
    & = 0.
\end{align*}
For $\abs{\alpha} > n^\tau + D$, we are guaranteed to be in the second case of~\cref{eq:boolean-fourier-cases}, in which case the total Fourier coefficient will also be zero. For $\abs{\alpha}$ within $D$ of the truncation parameter, some terms $J$ will not contribute their coefficients towards cancellation. We bound the Fourier coefficient for these $\alpha$,
\begin{align*}
    \abs{\displaystyle\sum_{\substack{J:\abs{J} \leq D,\\ \abs{\alpha \oplus J} \leq n^\tau}} \sum_K c_{J,K}\cdot\mu_{I+K, \alpha \oplus J}} &\leq \displaystyle\sum_{\abs{J} \leq D} \sum_K \abs{c_{J,K}\cdot\mu_{I+K, \alpha \oplus J}}\\
    &\leq M \cdot L \cdot \max_{I} \max_{\abs{\alpha} \in n^\tau \pm 2D} \abs{\mu_{I,\alpha}}.
\end{align*}

\noindent\textbf{(Gaussian case)} Expand $p(d, v) = \displaystyle\sum_{\abs{\beta} \leq D}h_\beta(d) p_\beta(v) = \sum_{\abs{\beta} \leq D}h_\beta(d) \sum_K c_{\beta, K} v^K$. The pseudoexpectation is
\begin{align*}
\tilde{\EE}[v^I p(d, v)] &= \displaystyle\sum_{\abs{\beta} \leq D}h_\beta(d) \tilde{\EE}[v^Ip_\beta(v)]\\
&= \sum_{\abs{\beta} \leq D}h_\beta(d) \sum_{K}c_{\beta,K}\tilde{\EE}[v^Iv^K]\\
&= \sum_{\abs{\beta} \leq D}h_\beta(d) \sum_K c_{\beta, K}\sum_{\abs{\alpha} \leq n^\tau} \mu_{I+K, \alpha}\frac{h_\alpha(d)}{\alpha!}.
\end{align*}
Let $l_{\alpha,\beta,\gamma}$ be the coefficient of $h_\gamma$ in the Hermite product $h_\alpha \cdot h_\beta$.
\begin{align*}
   \tilde{\EE}[v^I p(d, v)] &= \displaystyle\sum_{\abs{\beta} \leq D} \sum_K c_{\beta, K}\sum_{\abs{\alpha} \leq n^\tau} \mu_{I+K, \alpha}\sum_{\gamma} l_{\alpha,\beta,\gamma}\frac{h_\gamma(d)}{\alpha!} 
\end{align*}
In the case $\abs{\gamma} > n^\tau + D$, the coefficient of $h_\gamma(d)$ is zero because the max degree of a Hermite polynomial appearing in $h_{\alpha}\cdot h_{\beta}$ is at most $\abs{\alpha}+\abs{\beta} \leq n^\tau + D$. We show cancellations occur when $\abs{\gamma} \leq n^\tau - D$.
Moving the summations around, the coefficient of $h_\gamma$ is,
\begin{align*}
    &\displaystyle\sum_{\abs{\beta} \leq D} \sum_K c_{\beta, K}\sum_{\abs{\alpha} \leq n^\tau} \mu_{I+K, \alpha} \cdot  l_{\alpha,\beta,\gamma}\frac{1}{\alpha!}\\
    = & \sum_{\abs{\beta} \leq D} \sum_K c_{\beta, K}\sum_{\abs{\alpha} \leq n^\tau} {\mathbb E}_{\text{pl}}[v^Iv^K h_{\alpha}(d)] \cdot l_{\alpha,\beta,\gamma}\frac{1}{\alpha!}\\
    = & {\mathbb E}_{\text{pl}}v^I\sum_{\abs{\beta} \leq D} \sum_K c_{\beta, K}v^K\sum_{\abs{\alpha} \leq n^\tau} l_{\alpha,\beta,\gamma}\frac{h_{\alpha}(d) }{\alpha!}.
\end{align*}

We need an explicit formula for $l_{\alpha, \beta, \gamma}$ from~\cite[p.~92]{roman2005umbral},

\begin{proposition}\label{prop:multiply-coefficients}
\[l_{\alpha,\beta,\alpha+\beta - 2\delta} = \displaystyle\prod_{u,i}\binom{\alpha_{ui}}{\delta_{ui}}\binom{\beta_{ui}}{\delta_{ui}} \delta_{ui}! \]
\end{proposition}

\begin{proposition}\label{prop:hermite-product}
\[\sum_{\alpha} l_{\alpha,\beta,\gamma}\frac{h_{\alpha}(d) }{\alpha!} = h_{\beta}(d)\cdot \frac{h_\gamma(d)}{\gamma!}\]
\end{proposition}
\begin{proof}
Compute using~\cref{prop:multiply-coefficients}.
\end{proof}

In~\cref{prop:hermite-product}, the summation is actually finite. The largest $\alpha$ with $l_{\alpha,\beta,\gamma}$ nonzero has $\abs{\alpha}~\leq~\abs{\beta}~+~\abs{\gamma}$. Since we have $\abs{\beta}\leq D$ (the constraint only has degree $D$), as long as $\abs{\gamma} \leq n^\tau - D$, the above equality applies, in which case continuing the calculation for this case,
\begin{align*}
   \tilde{\EE}[v^Ip] &= {\mathbb E}_{\text{pl}}v^I\sum_{\abs{\beta} \leq D} \sum_K c_{\beta, K}v^K \cdot h_{\beta}(d)\cdot \frac{h_\gamma(d)}{\gamma!}\\
   &=  {\mathbb E}_{\text{pl}}v^I \cdot \frac{h_\gamma(d)}{\gamma!} \cdot \sum_{\abs{\beta} \leq D} \sum_K c_{\beta, K}v^K \cdot h_{\beta}(d)\\
   &= {\mathbb E}_{\text{pl}}v^I \cdot \frac{h_\gamma(d)}{\gamma!} \cdot p(d,v)\\
   &= 0.
\end{align*}
We now bound the coefficients that appear in the remaining case when $n^\tau - D < \abs{\gamma} \leq n^\tau + D$.
\begin{align*}
    \abs{\displaystyle\sum_{\abs{\beta} \leq D} \sum_K c_{\beta, K}\sum_{\abs{\alpha} \leq n^\tau} \mu_{I+K, \alpha} \cdot  l_{\alpha,\beta,\gamma}\frac{1}{\alpha!}} & \leq \displaystyle\sum_{\abs{\beta} \leq D} \sum_K \abs{c_{\beta, K}}\sum_{\abs{\alpha} \leq n^\tau} \abs{\mu_{I+K, \alpha}} \cdot  l_{\alpha,\beta,\gamma}\frac{1}{\alpha!}
\end{align*}
If $l_{\alpha,\beta,\gamma}> 0$ then we must have $\abs{\alpha} \geq \abs{\gamma} - \abs{\beta} \geq n^\tau - 2D$.
\begin{align*}
    &\leq \sum_{\abs{\beta} \leq D} \sum_K \abs{c_{\beta, K}} \cdot \left(\max_I \max_{\abs{\alpha} \in n^\tau \pm 2D}\abs{\mu_{I, \alpha}} \right) \sum_{\alpha}   l_{\alpha,\beta,\gamma}\frac{1}{\alpha!}
\end{align*}

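\begin{proposition}
   \[\sum_{\alpha} l_{\alpha,\beta,\gamma}\frac{1}{\alpha!} \leq e^{mn} \prod_{u,i}\max_{k_{ui}}\binom{\beta_{ui}}{k_{ui}}\]
\end{proposition}
\begin{proof}
Compute using~\cref{prop:multiply-coefficients}: writing $\delta = (\alpha + \beta - \gamma)/2$, each summand equals $\prod_{u,i}\binom{\beta_{ui}}{\delta_{ui}}\frac{1}{(\alpha_{ui} - \delta_{ui})!}$. Distinct values of $\alpha_{ui}$ give distinct values of $\alpha_{ui} - \delta_{ui}$, so bounding each binomial coefficient by its maximum and using $\sum_{j \geq 0} \frac{1}{j!} = e$ in each of the $mn$ coordinates gives the claim.
\end{proof}
Using the proposition,
\[\leq \sum_{\abs{\beta} \leq D} \sum_K \abs{c_{\beta, K}} \cdot \left(\max_I \max_{\abs{\alpha} \in n^\tau \pm 2D}\abs{\mu_{I, \alpha}} \right)  e^{mn}\prod_{u,i}\max_{k_{ui}}\binom{\beta_{ui}}{k_{ui}}  \]
We can bound 
\[\prod_{u,i}\max_{k_{ui}}\binom{\beta_{ui}}{k_{ui}} \leq \prod_{u,i} 2^{\beta_{ui}} = 2^{\abs{\beta}} \leq 2^D.\]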
In total, letting $M$ be the number of nonzero coefficients in the constraint $p$ and $L$ be the largest coefficient, this Fourier coefficient is at most,
\[ M \cdot L \cdot 2^D  e^{mn}\cdot \max_I\max_{\abs{\alpha}\in n^\tau \pm 2D} \abs{\mu_{I,\alpha}} .\]
\end{proof}


\begin{lemma}\label{lem:approximate-constraints}
W.h.p. $\norm{Q\tilde{\EE}} \leq \frac{1}{n^{\Omega(n^\tau)}}$
\end{lemma}
\begin{proof}
    By~\cref{lem:boolean-approximate-constraints}, the only nonzero Fourier characters that appear in $Q\tilde{\EE}$ are those of size $n^\tau \pm 2$. By the same lemma, each of their coefficients is at most 
    \begin{align*}
        & C \cdot  e^{mn}\cdot \max_I\max_{\abs{\alpha}\in n^\tau \pm 4} \abs{\mu_{I,\alpha}}\\
        \leq & C \cdot e^{mn} \cdot \frac{(n^{\tau} - 4)^{3(n^\tau - 4)}}{n^{(n^\tau - 4)/2}} && (\text{\cref{prop:coefficient-bound}})\\
        \leq & \frac{n^{3\tau n^\tau}}{n^{(\frac{1}{2} + o(1)) n^\tau }}
    \end{align*}
    Therefore we can express $Q\tilde{\EE}$ as a sum of graph matrices\footnote{Graph vectors, since $Q\tilde{\EE}$ is a vector.} of this size, with coefficients bounded by the above quantity. Now we bound the total norm by summing over all graphs.
    
    The number of graph matrices of this size is at most $n^{O(\tau) \cdot n^\tau}$.
    
    The norm of each term can be bounded using norm bounds by $n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}$ w.h.p. Note that there is no minimum vertex separator since $V = \emptyset$, and there are $O(1)$ isolated vertices when multiplying graphs in $Q$ with graphs in the pseudocalibration (which have no isolated vertices). The number of circle vertices can be bounded by $\frac{1}{4}\abs{E(\alpha)} \leq \frac{1}{4}n^\tau$. The number of square vertices can be bounded by $O(n^\delta) + \frac{1}{2}\abs{E(\alpha)} \leq 0.52n^\tau$. Therefore the norms are at most $m^{\frac{1}{8}n^\tau}~n^{0.26n^\tau + O(1)}~\leq~n^{0.49n^\tau}$. Notably this is significantly less than the denominator of the graph matrix coefficient, which is $n^{(0.5+o(1))n^\tau}$. Assuming $\delta$ and $\tau$ are small enough, the denominator is enough to overpower all terms multiplied together.
\end{proof}


\subsection{Analyzing $Q{Q^T}$}
The main theorem of this subsection is that the minimum nonzero eigenvalue of $QQ^T$ is large.

\begin{theorem}\label{lem:pseudoinverse-lower-bound}
The minimum nonzero eigenvalue of $QQ^T$ is $\frac{n^2}{2} - \tilde{O}(n\sqrt{m})$. 
\end{theorem}

\begin{proof}[Proof of~\cref{lem:pseudoexpectation-rounding} assuming~\cref{lem:pseudoinverse-lower-bound}]
\begin{align*}
    \norm{\tilde{\EE} - \tilde{\EE}'} & = \norm{Q^\intercal (QQ^\intercal)^+ Q\tilde{\EE}} \leq \norm{Q} \cdot \norm{(QQ^\intercal)^+} \cdot\norm{Q\tilde{\EE}}\\
    & \leq n^{O(1)} \cdot \frac{1}{n^2} \cdot \frac{1}{n^{\Omega(n^\tau)}}\\
    & = \frac{1}{n^{\Omega(n^\tau)}}
\end{align*}
\end{proof}



Recall that $Q = \sum_{k}{L^T_k}$. Let us refer to the five shapes in~\cref{def:lk} as $\alpha_1$ through $\alpha_5$, and their coefficients as $c_{\alpha_i}$. Observe that the dominant part of $L_k$ is $2M_{\alpha_1}$ which has norm $\tilde{O}(n)$. The norm bounds for the other components of $L_k$ are as follows:
\begin{enumerate}
\item $\norm{c_{\alpha_2}M_{\alpha_2}}$ is $\tilde{O}\left(\frac{1}{n} \cdot \sqrt{mn}\right) = \tilde{O}\left(\frac{\sqrt{m}}{\sqrt{n}}\right)$
\item $\norm{c_{\alpha_3}M_{\alpha_3}}$ is $\tilde{O}\left(\frac{1}{n^2} \cdot n\sqrt{m}\right) = \tilde{O}\left(\frac{\sqrt{m}}{n}\right)$
\item $\norm{c_{\alpha_4}M_{\alpha_4}}$ is $\tilde{O}\left(\frac{1}{n} \cdot \sqrt{mn}\right) = \tilde{O}\left(\frac{\sqrt{m}}{\sqrt{n}}\right)$
\item $\norm{c_{\alpha_5}M_{\alpha_5}}$ is $\tilde{O}\left(\frac{1}{n} \cdot \sqrt{m}\right) = \tilde{O}\left(\frac{\sqrt{m}}{n}\right)$
\end{enumerate}

We start by analyzing $QQ^T = \sum_{k}{L^T_k{L_k}}$.

From the above, taking $\alpha = \alpha_1$, the dominant term of $L_k$ is $2M_{\alpha}$ where $U_{\alpha} = \{j_1,\ldots,j_{k}\}$, $V_{\alpha} = \{j_3,\ldots,j_{k}\} \cup \{u\}$, and $E(\alpha) = \{(j_1,u),(j_2,u)\}$. Since $\norm{M_{\alpha}}$ is $\tilde{O}(n)$ and $\norm{L_k - 2M_{\alpha}}$ is $\tilde{O}(\frac{\sqrt{m}}{\sqrt{n}})$, this implies that for each $k$, $\norm{L_{k}^T{L_k} - 4M^T_{\alpha}M_{\alpha}}$ is $\tilde{O}(\sqrt{mn})$. Thus, it is sufficient to analyze $M^T_{\alpha}M_{\alpha}$.

\begin{lemma}\label{QQTdecompositionlemma}
Taking $\alpha$ to be the shape such that $U_{\alpha} \setminus V_{\alpha} = \{j_1,j_2\}$, $U_{\alpha} \cap V_{\alpha} = \{j_3,j_4,\ldots,j_{k}\}$, $V_{\alpha} \setminus U_{\alpha} = \{i_{circ}\}$, and $E(\alpha) = \{(j_1,i),(j_2,i)\}$,
\[
M^{T}_{\alpha}M_{\alpha} = M_{\alpha_1} + M_{\alpha_2} + M_{\alpha_3} + M_{\alpha_4} + M_{\alpha_5} + M_{\alpha_6}
\] where $\alpha_1,\alpha_2,\alpha_3,\alpha_4,\alpha_5,\alpha_6$ are the following shapes. Note that $\alpha_1$ and $\alpha_2$ are improper shapes.
\begin{enumerate}
\item $U_{\alpha_1} \setminus V_{\alpha_1} = \emptyset$, $U_{\alpha_1} \cap V_{\alpha_1} = \{j_3,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\alpha_1} \setminus U_{\alpha_1} = \emptyset$, $V(\alpha_1) \setminus (U_{\alpha_1} \cup V_{\alpha_1}) = \{j_1,j_2\}$, and $E(\alpha_1) = \{(i,j_1),(i,j_1),(i,j_2),(i,j_2)\}$.
\item $U_{\alpha_2} \setminus V_{\alpha_2} = \{j_1\}$, $U_{\alpha_2} \cap V_{\alpha_2} = \{j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\alpha_2} \setminus U_{\alpha_2} = \{j'_1\}$, $V(\alpha_2) \setminus (U_{\alpha_2} \cup V_{\alpha_2}) = \{j_2\}$, and $E(\alpha_2) = \{(i,j_1),(i,j'_1),(i,j_2),(i,j_2)\}$.
\item $U_{\alpha_3} \setminus V_{\alpha_3} = \{j_1,j_2\}$, $U_{\alpha_3} \cap V_{\alpha_3} = \{j_5,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\alpha_3} \setminus U_{\alpha_3} = \{j'_1,j'_2\}$, and $E(\alpha_3) = \{(i,j_1),(i,j_2),(i,j'_1),(i,j'_2)\}$.
\item $U_{\alpha_4} \setminus V_{\alpha_4} = \{i_{circ}\}$, $U_{\alpha_4} \cap V_{\alpha_4} = \{j_3,j_4,\ldots,j_{k}\}$, $V_{\alpha_4} \setminus U_{\alpha_4} = \{i'_{circ}\}$, $V(\alpha_4) \setminus (U_{\alpha_4} \cup V_{\alpha_4}) = \{j_1,j_2\}$, and $E(\alpha_4) = \{(i,j_1),(i,j_2),(i',j_1),(i',j_2)\}$.
\item $U_{\alpha_5} \setminus V_{\alpha_5} = \{j_1\} \cup \{i_{circ}\}$, $U_{\alpha_5} \cap V_{\alpha_5} = \{j_4,\ldots,j_{k}\}$, $V_{\alpha_5} \setminus U_{\alpha_5} = \{j'_1\} \cup \{i'_{circ}\}$, $V(\alpha_5) \setminus (U_{\alpha_5} \cup V_{\alpha_5}) = \{j_2\}$, and $E(\alpha_5) = \{(i,j'_1),(i,j_2),(i',j_1),(i',j_2)\}$.
\item $U_{\alpha_6} \setminus V_{\alpha_6} = \{j_1,j_2\} \cup \{i_{circ}\}$, $U_{\alpha_6} \cap V_{\alpha_6} = \{j_5,\ldots,j_{k}\}$, $V_{\alpha_6} \setminus U_{\alpha_6} = \{j'_1,j'_2\}  \cup \{i'_{circ}\}$, and $E(\alpha_6) = \{(i',j_1),(i',j_2),(i,j'_1),(i,j'_2)\}$.
\end{enumerate}
\end{lemma}
For pictures of these shapes, see~\cref{fig:QQTdominantfigure} below.
\begin{figure}[ht]
\centerline{\includegraphics[height=10cm]{sherrington_kirkpatrick/QQTFull}}
\caption{This figure shows the decomposition of $M^T_{\alpha}M_{\alpha}$.}
\label{fig:QQTdominantfigure}
\end{figure}
\begin{remark}
For $k = 2$, only shapes $\alpha_1$ and $\alpha_4$ are present and for $k = 3$, only shapes $\alpha_1,\alpha_2,\alpha_4,\alpha_5$ are present.
\end{remark}
\begin{proof}[Proof of Lemma \ref{QQTdecompositionlemma}]
We compute $M_{\alpha^T}M_{\alpha}$ by considering the ribbons which appear in $M_{\alpha^T}M_{\alpha}$.
\begin{enumerate}
\item Each ribbon $R$ with $A_R \setminus B_R = \emptyset$, $A_R \cap B_R = \{j_3,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R \setminus A_R = \emptyset$, $V(R) \setminus (A_R \cup B_R) = \{j_1,j_2\}$, and $E(R) = \{(i,j_1),(i,j_1),(i,j_2),(i,j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_3,j_4,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j_1,j_2\}$, and $E(R_1) = \{(i,j_1),(i,j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j_3,j_4,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i_{circ}\}$, and $E(R_2) = \{(i,j_1),(i,j_2)\}$.
\end{enumerate}
\item Each ribbon $R$ with $A_R \setminus B_R = \{j_1\}$, $A_R \cap B_R = \{j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R \setminus A_R = \{j'_1\}$, $V(R) \setminus (A_R \cup B_R) = \{j_2\}$, and $E(R) = \{(i,j_1),(i,j'_1),(i,j_2),(i,j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_1,j_4,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j'_1,j_2\}$, and $E(R_1) = \{(i,j'_1),(i,j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j'_1,j_4,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i_{circ}\}$, and $E(R_2) = \{(i,j_1),(i,j_2)\}$.
\end{enumerate}
\item Each ribbon $R$ with $A_R \setminus B_R = \{j_1,j_2\}$, $A_R \cap B_R = \{j_5,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R \setminus A_R = \{j'_1,j'_2\}$, $V(R) \setminus (A_R \cup B_R) = \emptyset$, and $E(R) = \{(i,j_1),(i,j'_1),(i,j_2),(i,j'_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_1,j_2,j_5,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j'_1,j'_2\}$, and $E(R_1) = \{(i,j'_1),(i,j'_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j'_1,j'_2,j_5,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i_{circ}\}$, and $E(R_2) = \{(i,j_1),(i,j_2)\}$.
\end{enumerate}
\item Each ribbon $R$ with $A_R \setminus B_R = \{j_1,j_2\} \cup  \{i_{circ}\}$, $A_R \cap B_R = \{j_5,\ldots,j_{k}\}$, $B_R \setminus A_R = \{j'_1,j'_2\}  \cup  \{i'_{circ}\}$, $V(R) \setminus (A_R \cup B_R) = \emptyset$, and $E(R) = \{(i,j'_1),(i,j'_2),(i',j_1),(i',j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_1,j_2,j_5,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j'_1,j'_2\}$, and $E(R_1) = \{(i,j'_1),(i,j'_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j'_1,j'_2,j_5,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i'_{circ}\}$, and $E(R_2) = \{(i',j_1),(i',j_2)\}$.
\end{enumerate}
\item Each ribbon $R$ with $A_R \setminus B_R = \{i_{circ}\}$, $A_R \cap B_R = \{j_3,j_4,\ldots,j_{k}\}$, $B_R \setminus A_R = \{i'_{circ}\}$, $V(R) \setminus (A_R \cup B_R) = \{j_1,j_2\}$, and $E(R) = \{(i,j_1),(i,j_2),(i',j_1),(i',j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_3,j_4,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j_1,j_2\}$, and $E(R_1) = \{(i,j_1),(i,j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j_3,j_4,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i'_{circ}\}$, and $E(R_2) = \{(i',j_1),(i',j_2)\}$.
\end{enumerate}
\item Each ribbon $R$ with $A_R \setminus B_R = \{j_1\} \cup \{i_{circ}\}$, $A_R \cap B_R = \{j_4,\ldots,j_{k}\}$, $B_R \setminus A_R = \{j'_1\} \cup \{i'_{circ}\}$, $V(R) \setminus (A_R \cup B_R) = \{j_2\}$, and $E(R) = \{(i,j'_1),(i,j_2),(i',j_1),(i',j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_1,j_4,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j'_1,j_2\}$, and $E(R_1) = \{(i,j'_1),(i,j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j'_1,j_4,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i'_{circ}\}$, and $E(R_2) = \{(i',j_1),(i',j_2)\}$.
\end{enumerate}
\end{enumerate}
Based on these cases, we have that $M_{\alpha^T}M_{\alpha} = M_{\alpha_1} + M_{\alpha_2} + M_{\alpha_3} + M_{\alpha_4} + M_{\alpha_5} + M_{\alpha_6}$.
\end{proof}
We now analyze each of the matrices $M_{\alpha_1}, M_{\alpha_2}, M_{\alpha_3}, M_{\alpha_4}, M_{\alpha_5}, M_{\alpha_6}$.
\begin{lemma}
Taking $Id_{k-2,1}$ to be the shape where $U_{Id_{k-2,1}} \setminus V_{Id_{k-2,1}} = \emptyset$, $U_{Id_{k-2,1}} \cap V_{Id_{k-2,1}} = \{j_3,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{Id_{k-2,1}} \setminus U_{Id_{k-2,1}} = \emptyset$, and $E(Id_{k-2,1}) = \emptyset$, $\norm{M_{\alpha_1} - \binom{n-k+2}{2}M_{Id_{k-2,1}}}$ is $\tilde{O}\left(n^{\frac{3}{2}}\right)$.
\end{lemma}
\begin{proof}
To convert an improper shape $\alpha_1$ to a sum of proper shapes, we take each ribbon of shape $\alpha_1$ and decompose it into a sum of proper ribbons. Decomposing $M_{\alpha_1}$ in this way, each ribbon $R$ with $A_R \setminus B_R = \emptyset$, $A_R \cap B_R = \{j_3,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R \setminus A_R = \emptyset$, and $E(R) = \emptyset$ appears $\binom{n-k+2}{2}$ times, once for each pair $j_1,j_2$ such that $j_1 < j_2$ and $j_1,j_2 \notin \{j_3,\ldots,j_{k}\}$. The other ribbons which arise all have an edge with label $2$ incident with $j_1$ or $j_2$ and thus the resulting terms have norm $\tilde{O}\left(n^{\frac{3}{2}}\right)$.
\end{proof}
\begin{definition}
Define $\beta_2$ to be the shape such that $U_{\beta_2} \setminus V_{\beta_2} = \{j_1\}$, $U_{\beta_2} \cap V_{\beta_2} = \{j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\beta_2} \setminus U_{\beta_2} = \emptyset$, and $E(\beta_2) = \{(i,j_1)\}$.
\end{definition}
\begin{lemma}
$\norm{M_{\alpha_2} - (n-k+1)M_{\beta_2}M^T_{\beta_2}}$ is $\tilde{O}\left(n^{\frac{3}{2}}\right)$.
\end{lemma}
\begin{proof}
Again, to convert an improper shape $\alpha_2$ to a sum of proper shapes, we take each ribbon of shape $\alpha_2$ and decompose it into a sum of proper ribbons. Decomposing $M_{\alpha_2}$ in this way, each ribbon $R$ with $A_R \setminus B_R = \{j_1\}$, $A_R \cap B_R = \{j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R \setminus A_R = \{j'_1\}$, and $E(R) = \{(i,j_1),(i,j'_1)\}$ appears $(n-k+1)$ times, once for each $j_2 \in [n] \setminus \{j_1,j'_1,j_4,\ldots,j_k\}$. The other ribbons which arise have an edge with label $2$ incident with $j_2$ and thus the resulting terms have norm $\tilde{O}\left(n^{\frac{3}{2}}\right)$. This implies that if we take $\alpha'_2$ to be the shape where $U_{\alpha'_2} \setminus V_{\alpha'_2} = \{j_1\}$, $U_{\alpha'_2} \cap V_{\alpha'_2} = \{j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\alpha'_2} \setminus U_{\alpha'_2} = \{j'_1\}$, and $E(\alpha'_2) = \{(i,j_1),(i,j'_1)\}$ then $\norm{M_{\alpha_2} - (n-k+1)M_{\alpha'_2}}$ is $\tilde{O}\left(n^{\frac{3}{2}}\right)$.

$\norm{M_{\alpha'_2}}$ is $\tilde{O}(n)$, so this term cannot be ignored. To handle this, we observe that $M_{\alpha'_2}$ is approximately equal to a PSD matrix. More precisely, $\norm{M_{\alpha'_2} - M_{\beta_2}M^T_{\beta_2}}$ is $\tilde{O}(1)$. To see this, note that when we expand out $M_{\beta_2}M^T_{\beta_2}$, the ribbons which result when there are no collisions give $M_{\alpha'_2}$ and for each ribbon $R$ which results from a collision, $A_R \setminus B_R = B_R \setminus A_R = \emptyset$ so the resulting terms have norm $\tilde{O}(1)$. 
\end{proof}
\begin{figure}[ht]
\centerline{\includegraphics[height=3cm]{sherrington_kirkpatrick/BetaTwo}}
\caption{This figure shows $\alpha'_2$ and $\beta_2$ for $k = 4$.}
\label{BetaTwoFigure}
\end{figure}
$\norm{M_{\alpha_3}}$ is $\tilde{O}(n^2)$, so this term cannot be ignored. To handle this, we observe that $M_{\alpha_3}$ is approximately equal to a PSD matrix. More precisely, we have the following lemma.
\begin{definition}
Define $\beta_3$ to be the shape such that $U_{\beta_3} \setminus V_{\beta_3} = \{j_1,j_2\}$, $U_{\beta_3} \cap V_{\beta_3} = \{j_5,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\beta_3} \setminus U_{\beta_3} = \emptyset$, and $E(\beta_3) = \{(i,j_1),(i,j_2)\}$.
\end{definition}
\begin{lemma}
$\norm{M_{\alpha_3} - M_{\beta_3}M^T_{\beta_3}}$ is $\tilde{O}(n)$.
\end{lemma}
\begin{proof}
To see this, note that when we expand out $M_{\beta_3}M^T_{\beta_3}$, the ribbons which result when there are no collisions give $M_{\alpha_3}$ and for each ribbon $R$ which results from a collision, $|A_R \setminus B_R| = |B_R \setminus A_R| \leq 1$ so the resulting terms have norm $\tilde{O}(n)$. 
\end{proof}
\begin{figure}[ht]
\centerline{\includegraphics[height=3cm]{sherrington_kirkpatrick/BetaThree}}
\caption{This figure shows $\alpha_3$ and $\beta_3$ for $k = 4$.}
\label{BetaThreeFigure}
\end{figure}
We now consider the norms of $M_{\alpha_4}$, $M_{\alpha_5}$, and $M_{\alpha_6}$.
\begin{enumerate}
\item $\norm{M_{\alpha_4}}$ is $\tilde{O}(n\sqrt{m})$.
\item $\norm{M_{\alpha_5}}$ is $\tilde{O}(n\sqrt{m})$.
\item $\norm{M_{\alpha_6}}$ is $\tilde{O}(n^2)$.
\end{enumerate}
This means that $M_{\alpha_4}$ and $M_{\alpha_5}$ can be ignored but $M_{\alpha_6}$ cannot be ignored. In fact, there is a very good reason for this. In particular, for $k \geq 4$, $L_k$ has a non-trivial nullspace $N_k$, so we cannot show that the minimum nonzero eigenvalue of ${L_k^T}{L_k}$ is large without taking this nullspace into account. We handle this nullspace $N_k$ in the next two subsubsections.

Putting everything together, we have the following corollary:
\begin{corollary}\label{QQTapproximationcorollary} \ 
\begin{enumerate}
\item For $k = 2$, $\norm{{L_k^T}L_k - 2{n^2}M_{Id_{k-2,1}}}$ is $\tilde{O}(n\sqrt{m})$
\item For $k = 3$, $\norm{{L_k^T}L_k - 2{n^2}M_{Id_{k-2,1}} - 4nM_{\beta_2}M^T_{\beta_2}}$ is $\tilde{O}(n\sqrt{m})$
\item For $k \geq 4$, $\norm{{L_k^T}L_k - 2{n^2}M_{Id_{k-2,1}} - 4nM_{\beta_2}M^T_{\beta_2} - 4M_{\beta_3}M^T_{\beta_3} - 4M_{\alpha_6}}$ is $\tilde{O}(n\sqrt{m})$
\end{enumerate}
\end{corollary}
\begin{remark}
We replaced $\binom{n-k+2}{2}$ with $\frac{n^2}{2}$ as $\norm{M_{Id_{k-2,1}}} = 1$ and $|\frac{n^2}{2} - \binom{n-k+2}{2}|$ is $\tilde{O}(n)$. Similarly, we replaced $(n-k+1)$ with $n$ as $\norm{M_{\alpha'_2}}$ is $\tilde{O}(n)$ and $|n - (n-k+1)|$ is $\tilde{O}(1)$.
\end{remark}
\subsubsection{The Null Space $N_k$}
We now construct a matrix $N_k$ for each $k \geq 4$ such that ${L_k}{N_k} = 0$ and the columns of ${N_k}$ span the nullspace of $L_k$. To do this, we construct $N_k$ so that each column of $N_k$ is indexed by a subset $S = \{j_3,\ldots,j_{k}\} \subseteq [n]$ and an ordered tuple of circle indices $(i,i')$ where $i < i'$. We then want that if we view $\tilde{E}$ as a vector,
\[
({\tilde{E}^T}{L_k}N_k)_{S,(i,i')} = \tilde{E}\left[v^{S}\left({\ip{v}{d_i}}^2 - 1\right)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right] - \tilde{E}\left[v^{S}\left({\ip{v}{d_{i'}}}^2 - 1\right)\left({\ip{v}{d_i}}^2 - 1\right)\right] = 0
\]
\begin{lemma}
$N_k = c_{\alpha_1}(M_{\alpha_1^+} - M_{\alpha_1^-}) + c_{\alpha_2}(M_{\alpha_2^+} - M_{\alpha_2^-}) + c_{\alpha_3}(M_{\alpha_3^+} - M_{\alpha_3^-}) + c_{\alpha_4}(M_{\alpha_4^+} - M_{\alpha_4^-}) + c_{\alpha_5}(M_{\alpha_5^+} - M_{\alpha_5^-})$ for the following shapes $\alpha_1^{+},\ldots,\alpha_5^{+}, \alpha_1^{-},\ldots,\alpha_5^{-}$ and coefficients $c_{\alpha_1},\ldots,c_{\alpha_5}$. Unless stated otherwise, all of these shapes have no middle vertices.
\begin{enumerate}
\item $U_{\alpha_1^+} \setminus V_{\alpha_1^+} = \{j_1,j_2\}$, $U_{\alpha_1^+} \cap V_{\alpha_1^+} = \{j_3,j_4,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $V_{\alpha_1^+} \setminus U_{\alpha_1^+} = \{i_{circ}\}$, $E(\alpha_1^+) = \{(j_1,i),(j_2,i)\}$, and $c_{\alpha_1} = 2$.
\item $U_{\alpha_2^+} \setminus V_{\alpha_2^+} = \{j_2\}$, $U_{\alpha_2^+} \cap V_{\alpha_2^+} = \{j_4,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $V_{\alpha_2^+} \setminus U_{\alpha_2^+} = \{j_3\} \cup \{i_{circ}\}$, $E(\alpha_2^+) = \{(j_3,i),(j_2,i)\}$, and $c_{\alpha_2} = \frac{2}{n}$.
\item $U_{\alpha_3^+} \setminus V_{\alpha_3^+} = \emptyset$, $U_{\alpha_3^+} \cap V_{\alpha_3^+} = \{j_5,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $V_{\alpha_3^+} \setminus U_{\alpha_3^+} = \{j_3,j_4\} \cup \{i_{circ}\}$, $E(\alpha_3^+) = \{(j_3,i),(j_4,i)\}$, and $c_{\alpha_3} = \frac{2}{n^2}$.
\item $U_{\alpha_4^+} \setminus V_{\alpha_4^+} = \emptyset$, $U_{\alpha_4^+} \cap V_{\alpha_4^+} = \{j_3,j_4,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $V_{\alpha_4^+} \setminus U_{\alpha_4^+} = \{i_{circ}\}$, $V(\alpha_4^+) \setminus (U_{\alpha_4^+} \cup V_{\alpha_4^+}) = \{j_1\}$, $E(\alpha_4^+) = \{(j_1,i)_2\}$, and $c_{\alpha_4} = \frac{1}{n}$.
\item $U_{\alpha_5^+} \setminus V_{\alpha_5^+} = \emptyset$, $U_{\alpha_5^+} \cap V_{\alpha_5^+} = \{j_3,j_4,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $V_{\alpha_5^+} \setminus U_{\alpha_5^+} = \{i_{circ}\}$, $E(\alpha_5^+) = \{(j_3,i)_2\}$, and $c_{\alpha_5} = \frac{1}{n}$.
\end{enumerate}
where for all of these shapes, $(i_{circ},i'_{circ})$ is a tuple in the right side and $i < i'$. $\alpha_1^{-},\ldots,\alpha_5^{-}$ are the same as $\alpha_1^{+},\ldots,\alpha_5^{+}$ except that $i$ and $i'$ are swapped.
\end{lemma}
\begin{remark}
Note that $\alpha_1^{+},\ldots,\alpha_5^{+}$ are the same shapes which appear in the decomposition of $L_k$ except that the intersection of $U$ and $V$ now contains $i'_{circ}$ and we require that $i < i'$.
\end{remark}
For pictures of these shapes, see Figure \ref{NDecompositionFigure} below.
\begin{figure}[ht]
\centerline{\includegraphics[height=10cm]{sherrington_kirkpatrick/NFull081620}}
\caption{This figure shows the decomposition of $N_k$ for $k = 4$. Here we always have that $i < i'$. If $i$ and $i'$ are swapped then this flips the signs but these parts are not shown to save space.}
\label{NDecompositionFigure}
\end{figure}
\begin{proof}
To determine $N_k$, we analyze the ribbons which $N_k$ is composed of. Let $S = \{j_3,j_4,\ldots,j_k\}$.
\begin{enumerate}
\item If we take a ribbon $R$ with $A_R = \{j_1,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_1,i),(j_2,i)\}$ where $j_1 \neq j_2$ and $j_1,j_2 \notin S$ then 
\[
({\tilde{E}^T}{L_k}M_R)_{S,i,i'} = \tilde{E}\left[v^{S}v_{j_1}v_{j_2}(d_{i})_{j_1}(d_i)_{j_2}\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]
\]
Each such term appears with a coefficient of $2$ in $\tilde{E}\left[v^{S}\left({\ip{v}{d_i}}^2 - 1\right)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]$, so we want each such ribbon $R$ to appear with a coefficient of $2$ in $N_k$.

Similarly, we want each ribbon $R$ with $A_R = \{j_1,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_1,i'),(j_2,i')\}$ where $j_1 \neq j_2$ and $j_1,j_2 \notin S$ to appear with a coefficient of $-2$ in $N_k$.
\item If we take a ribbon $R$ with $A_R = (\{j_1,\ldots,j_{k}\} \setminus \{j_1,j_3\}) \cup \{i'_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_3,i),(j_2,i)\}$ where $j_1 = j_3 \in S$ and $j_2 \notin S$ then 
\begin{align*}
({\tilde{E}^T}{L_k}M_R)_{S,i,i'} &= \tilde{E}\left[v^{S \setminus \{j_1,j_3\}}v_{j_2}(d_{i})_{j_3}(d_i)_{j_2}\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]\\
&=n\tilde{E}\left[v^{S}v_{j_1}v_{j_2}(d_{i})_{j_1}(d_i)_{j_2}\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]
\end{align*}
Each such term appears with a coefficient of $\frac{2}{n}$ in $\tilde{E}\left[v^{S}\left({\ip{v}{d_i}}^2 - 1\right)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]$, so we want each such ribbon $R$ to appear with a coefficient of $\frac{2}{n}$ in $N_k$.

Similarly, we want each ribbon $R$ with $A_R = (\{j_1,\ldots,j_{k}\} \setminus \{j_1,j_3\}) \cup \{i_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_3,i'),(j_2,i')\}$ where $j_1 = j_3 \in S$ and $j_2 \notin S$ to appear with a coefficient of $-\frac{2}{n}$ in $N_k$.
\item If we take a ribbon $R$ with $A_R = (\{j_1,\ldots,j_{k}\} \setminus \{j_1,j_2,j_3,j_4\}) \cup \{i'_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_3,i),(j_4,i)\}$ where $j_1 = j_3 \in S$ and $j_2 = j_4 \in S$ then 
\begin{align*}
({\tilde{E}^T}{L_k}M_R)_{S,i,i'} &= \tilde{E}\left[v^{S \setminus \{j_1,j_2,j_3,j_4\}}(d_{i})_{j_3}(d_i)_{j_4}\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]\\
&=n^2\tilde{E}\left[v^{S}v_{j_1}v_{j_2}(d_{i})_{j_1}(d_i)_{j_2}\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]
\end{align*}
Each such term appears with a coefficient of $\frac{2}{n^2}$ in $\tilde{E}\left[v^{S}\left({\ip{v}{d_i}}^2 - 1\right)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]$, so we want each such ribbon $R$ to appear with a coefficient of $\frac{2}{n^2}$ in $N_k$.

Similarly, we want each ribbon $R$ with $A_R = (\{j_1,\ldots,j_{k}\} \setminus \{j_1,j_2,j_3,j_4\}) \cup \{i_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_3,i'),(j_4,i')\}$ where $j_1 = j_3 \in S$ and $j_2 = j_4 \in S$ to appear with a coefficient of $-\frac{2}{n^2}$ in $N_k$.
\item If we take a ribbon $R$ with $A_R = (\{j_1,\ldots,j_{k}\} \setminus \{j_1,j_2\})  \cup \{i'_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_1,i)_2\}$ where $j_1 = j_2 \notin S$ then 
\begin{align*}
({\tilde{E}^T}{L_k}M_R)_{S,i,i'} &= \tilde{E}\left[v^{S \setminus \{j_1,j_2\}}((d_{i})^2_{j_1} - 1)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]\\
&=n\tilde{E}\left[v^{S}v_{j_1}^2((d_{i})^2_{j_1} - 1)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]
\end{align*}
Each such term appears with a coefficient of $\frac{1}{n}$ in $\tilde{E}\left[v^{S}\left({\ip{v}{d_i}}^2 - 1\right)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]$ so we want each such ribbon $R$ to appear with a coefficient of $\frac{1}{n}$ in $N_k$.

Similarly, we want each ribbon $R$ with $A_R = (\{j_1,\ldots,j_{k}\} \setminus \{j_1,j_2\})  \cup \{i_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_1,i')_2\}$ where $j_1 = j_2 \notin S$ to appear with a coefficient of $-\frac{1}{n}$ in $N_k$.
\item If we take a ribbon $R$ with $A_R = (\{j_1,\ldots,j_{k}\} \setminus \{j_1,j_2\}) \cup \{i'_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_3,i)_2\}$ where $j_1 = j_2 = j_3 \in S$ then 
\begin{align*}
({\tilde{E}^T}{L_k}M_R)_{S,i,i'} &= \tilde{E}\left[v^{S \setminus \{j_1,j_2\}}((d_{i})^2_{j_3} - 1)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]\\
&=n\tilde{E}\left[v^{S}v_{j_1}^2((d_{i})^2_{j_1} - 1)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]
\end{align*}
Each such term appears with a coefficient of $\frac{1}{n}$ in $\tilde{E}\left[v^{S}\left({\ip{v}{d_i}}^2 - 1\right)\left({\ip{v}{d_{i'}}}^2 - 1\right)\right]$ so we want each such ribbon $R$ to appear with a coefficient of $\frac{1}{n}$ in $N_k$.

Similarly, we want each ribbon $R$ with $A_R = (\{j_1,\ldots,j_{k}\} \setminus \{j_1,j_2\}) \cup \{i_{circ}\}$, $B_R = \{j_3,\ldots,j_{k}\} \cup (i_{circ},i'_{circ})$, and $E(R) = \{(j_3,i')_2\}$ where $j_1 = j_2 = j_3 \in S$ to appear with a coefficient of $-\frac{1}{n}$ in $N_k$.
\end{enumerate}
\end{proof}
Observe that the dominant part of $N_k$ is $2(M_{\alpha_1^+} - M_{\alpha_1^-})$ which has norm $\tilde{O}(n)$. The norm bounds for the other components of $N_k$ are as follows:
\begin{enumerate}
\item $\norm{c_{\alpha_2}(M_{\alpha_2^+} - M_{\alpha_2^-})}$ is $\tilde{O}\left(\frac{1}{n} \cdot \sqrt{mn}\right) = \tilde{O}\left(\frac{\sqrt{m}}{\sqrt{n}}\right)$
\item $\norm{c_{\alpha_3}(M_{\alpha_3^+} - M_{\alpha_3^-})}$ is $\tilde{O}\left(\frac{1}{n^2} \cdot n\sqrt{m}\right) = \tilde{O}\left(\frac{\sqrt{m}}{n}\right)$
\item $\norm{c_{\alpha_4}(M_{\alpha_4^+} - M_{\alpha_4^-})}$ is $\tilde{O}\left(\frac{1}{n} \cdot \sqrt{mn}\right) = \tilde{O}\left(\frac{\sqrt{m}}{\sqrt{n}}\right)$
\item $\norm{c_{\alpha_5}(M_{\alpha_5^+} - M_{\alpha_5^-})}$ is $\tilde{O}\left(\frac{1}{n} \cdot \sqrt{m}\right) = \tilde{O}\left(\frac{\sqrt{m}}{n}\right)$
\end{enumerate}
\subsubsection{Analyzing $N_k{N^T_k}$}
The dominant terms of $N_k$ are $2M_{\alpha^{+}} - 2M_{\alpha^{-}}$ where 
\begin{enumerate}
\item $U_{\alpha^{+}} \setminus V_{\alpha^{+}} = \{j_1,j_2\}$, $U_{\alpha^{+}} \cap V_{\alpha^{+}} = \{j_3,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $V_{\alpha^{+}} \setminus U_{\alpha^{+}} = \{i_{circ}\}$, and $E(\alpha^{+}) = \{(j_1,i),(j_2,i)\}$. Note that here $i < i'$ and $(i_{circ},i'_{circ})$ appears as a tuple in $V_{\alpha^{+}}$.
\item $U_{\alpha^{-}} \setminus V_{\alpha^{-}} = \{j_1,j_2\}$, $U_{\alpha^{-}} \cap V_{\alpha^{-}} = \{j_3,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\alpha^{-}} \setminus U_{\alpha^{-}} = \{i'_{circ}\}$, and $E(\alpha^{-}) = \{(j_1,i'),(j_2,i')\}$. Note that here $i < i'$ and $(i'_{circ},i_{circ})$ appears as a tuple in $V_{\alpha^{-}}$.
\end{enumerate}
\begin{lemma}\label{NNTPluslemma}
$M_{\alpha^{+}}M_{\alpha^{+}}^T + M_{\alpha^{-}}M_{\alpha^{-}}^T = M_{\alpha_1} + M_{\alpha_2} + M_{\alpha_3}$ where $\alpha_1,\alpha_2,\alpha_3$ are the following shapes.
\begin{enumerate}
\item $U_{\alpha_1} \setminus V_{\alpha_1} = \emptyset$, $U_{\alpha_1} \cap V_{\alpha_1} = \{j_1,j_2,j_3,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\alpha_1} \setminus U_{\alpha_1} = \emptyset$, $V(\alpha_1) \setminus (U_{\alpha_1} \cup V_{\alpha_1}) = \{i'_{circ}\}$, and $E(\alpha_1) = \{(i',j_1),(i',j_1),(i',j_2),(i',j_2)\}$.
\item $U_{\alpha_2} \setminus V_{\alpha_2} = \{j_1\}$, $U_{\alpha_2} \cap V_{\alpha_2} = \{j_2,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\alpha_2} \setminus U_{\alpha_2} = \{j'_1\}$, $V(\alpha_2) \setminus (U_{\alpha_2} \cup V_{\alpha_2}) = \{i'_{circ}\}$, and $E(\alpha_2) = \{(i',j_1),(i',j'_1),(i',j_2),(i',j_2)\}$.
\item $U_{\alpha_3} \setminus V_{\alpha_3} = \{j_1,j_2\}$, $U_{\alpha_3} \cap V_{\alpha_3} = \{j_5,\ldots,j_{k}\} \cup \{i_{circ}\}$, $V_{\alpha_3} \setminus U_{\alpha_3} = \{j'_1,j'_2\}$, $V(\alpha_3) \setminus (U_{\alpha_3} \cup V_{\alpha_3}) = \{i'_{circ}\}$, and $E(\alpha_3) = \{(i',j_1),(i',j_2),(i',j'_1),(i',j'_2)\}$.
\end{enumerate}
Note that for these shapes, we do not assume that $i < i'$. Also note that $\alpha_1$ and $\alpha_2$ are improper shapes, though this does not matter for us.
\end{lemma}
\begin{remark}
Actually, we do not need to do this computation as we will just use that $M_{\alpha^{+}}M_{\alpha^{+}}^T + M_{\alpha^{-}}M_{\alpha^{-}}^T \succeq 0$, but we include it anyways to show the similarity with the decomposition of ${L_k^T}L_k$.
\end{remark}
For pictures of these shapes, see Figure \ref{NNTPlusfigure} below.
\begin{figure}[ht]
\centerline{\includegraphics[height=6cm]{sherrington_kirkpatrick/NNTPlus}}
\caption{This figure shows the decomposition of $M_{\alpha^{+}}M_{\alpha^{+}}^T + M_{\alpha^{-}}M_{\alpha^{-}}^T$.}
\label{NNTPlusfigure}
\end{figure}
\begin{proof}[Proof of Lemma \ref{NNTPluslemma}]
We compute $M_{\alpha^{+}}M_{\alpha^{+}}^T + M_{\alpha^{-}}M_{\alpha^{-}}^T$ by considering the ribbons which appear in $M_{\alpha^{+}}M_{\alpha^{+}}^T + M_{\alpha^{-}}M_{\alpha^{-}}^T$. For these ribbons, we do not assume that $i < i'$.
\begin{enumerate}
\item Each ribbon $R$ with $A_R \setminus B_R = \emptyset$, $A_R \cap B_R = \{j_1,j_2,j_3,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R \setminus A_R = \emptyset$, $V(R) \setminus (A_R \cup B_R) = \{i'_{circ}\}$, and $E(R) = \{(i',j_1),(i',j_1),(i',j_2),(i',j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_3,j_4,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j_1,j_2\}$, and $E(R_1) = \{(i,j_1),(i,j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j_3,j_4,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i_{circ}\}$, and $E(R_2) = \{(i,j_1),(i,j_2)\}$.
\end{enumerate}
Whether $i < i'$ or $i' < i$ only affects whether this ribbon appears in $M_{\alpha^{+}}M_{\alpha^{+}}^T$ or $M_{\alpha^{-}}M_{\alpha^{-}}^T$.
\item Each ribbon $R$ with $A_R \setminus B_R = \{j_1\}$, $A_R \cap B_R = \{j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R \setminus A_R = \{j'_1\}$, $V(R) \setminus (A_R \cup B_R) = \{i'_{circ}\}$, and $E(R) = \{(i,j_1),(i,j'_1),(i,j_2),(i,j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_1,j_4,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j'_1,j_2\}$, and $E(R_1) = \{(i,j'_1),(i,j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j'_1,j_4,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i_{circ}\}$, and $E(R_2) = \{(i,j_1),(i,j_2)\}$.
\end{enumerate}
Whether $i < i'$ or $i' < i$ only affects whether this ribbon appears in $M_{\alpha^{+}}M_{\alpha^{+}}^T$ or $M_{\alpha^{-}}M_{\alpha^{-}}^T$.
\item Each ribbon $R$ with $A_R \setminus B_R = \{j_1,j_2\}$, $A_R \cap B_R = \{j_5,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_R \setminus A_R = \{j'_1,j'_2\}$, $V(R) \setminus (A_R \cup B_R) = \{i'_{circ}\}$, and $E(R) = \{(i,j_1),(i,j'_1),(i,j_2),(i,j'_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{i_{circ}\}$, $A_{R_1} \cap B_{R_1} = \{j_1,j_2,j_5,\ldots,j_{k}\}$, $B_{R_1} \setminus A_{R_1} = \{j'_1,j'_2\}$, and $E(R_1) = \{(i,j'_1),(i,j'_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{j_1,j_2\}$, $A_{R_2} \cap B_{R_2} = \{j'_1,j'_2,j_5,\ldots,j_{k}\}$, $B_{R_2} \setminus A_{R_2} = \{i_{circ}\}$, and $E(R_2) = \{(i,j_1),(i,j_2)\}$.
\end{enumerate}
Whether $i < i'$ or $i' < i$ only affects whether this ribbon appears in $M_{\alpha^{+}}M_{\alpha^{+}}^T$ or $M_{\alpha^{-}}M_{\alpha^{-}}^T$.
\end{enumerate}
\end{proof}
\begin{lemma}\label{NNTMinuslemma}
$M_{\alpha^{+}}M_{\alpha^{-}}^T + M_{\alpha^{-}}M_{\alpha^{+}}^T = M_{\alpha_4} + M_{\alpha_5} + M_{\alpha_6}$ where $\alpha_4,\alpha_5,\alpha_6$ are the following shapes.
\begin{enumerate}
\item $U_{\alpha_4} \setminus V_{\alpha_4} = \{i_{circ}\}$, $U_{\alpha_4} \cap V_{\alpha_4} = \{j_1,j_2,j_3,j_4,\ldots,j_{k}\}$, $V_{\alpha_4} \setminus U_{\alpha_4} = \{i'_{circ}\}$, and $E(\alpha_4) = \{(i,j_1),(i,j_2),$ $(i',j_1),(i',j_2)\}$.
\item $U_{\alpha_5} \setminus V_{\alpha_5} = \{j_1\} \cup \{i_{circ}\}$, $U_{\alpha_5} \cap V_{\alpha_5} = \{j_2,j_4,\ldots,j_{k}\}$, $V_{\alpha_5} \setminus U_{\alpha_5} = \{j'_1\} \cup \{i'_{circ}\}$, and $E(\alpha_5) = \{(i,j'_1),(i,j_2),(i',j_1),(i',j_2)\}$.
\item $U_{\alpha_6} \setminus V_{\alpha_6} = \{j_1,j_2\} \cup \{i_{circ}\}$, $U_{\alpha_6} \cap V_{\alpha_6} = \{j_5,\ldots,j_{k}\}$, $V_{\alpha_6} \setminus U_{\alpha_6} = \{j'_1,j'_2\}  \cup \{i'_{circ}\}$, and $E(\alpha_6) = \{(i',j_1),(i',j_2),(i,j'_1),(i,j'_2)\}$.
\end{enumerate}
\end{lemma}
For pictures of these shapes, see Figure \ref{NNTMinusfigure} below.
\begin{figure}[ht]
\centerline{\includegraphics[height=6cm]{sherrington_kirkpatrick/NNTMinus}}
\caption{This figure shows the decomposition of $M_{\alpha^{+}}M_{\alpha^{-}}^T + M_{\alpha^{-}}M_{\alpha^{+}}^T$.}
\label{NNTMinusfigure}
\end{figure}
\begin{proof}[Proof of Lemma \ref{NNTMinuslemma}]
We compute $M_{\alpha^{+}}M_{\alpha^{-}}^T + M_{\alpha^{-}}M_{\alpha^{+}}^T$ by considering the ribbons which appear in $M_{\alpha^{+}}M_{\alpha^{-}}^T + M_{\alpha^{-}}M_{\alpha^{+}}^T$. For these ribbons, we do not assume that $i < i'$.
\begin{enumerate}
\item Each ribbon $R$ with $A_R \setminus B_R = \{i_{circ}\}$, $A_R \cap B_R = \{j_1,j_2,j_3,j_4,\ldots,j_{k}\}$, $B_R \setminus A_R = \{i'_{circ}\}$, $V(R) \setminus (A_R \cup B_R) = \emptyset$, and $E(R) = \{(i,j_1),(i,j_2),(i',j_1),(i',j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{j_1,j_2\}$, $A_{R_1} \cap B_{R_1} = \{j_3,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_{R_1} \setminus A_{R_1} = \{i'_{circ}\}$, and $E(R_1) = \{(i',j_1),(i',j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} =  \{i_{circ}\}$, $A_{R_2} \cap B_{R_2} = \{j_3,j_4,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $B_{R_2} \setminus A_{R_2} = \{j_1,j_2\}$, and $E(R_2) = \{(i,j_1),(i,j_2)\}$.
\end{enumerate}
Whether $i < i'$ or $i' < i$ only affects whether this ribbon appears in $M_{\alpha^{+}}M_{\alpha^{-}}^T$ or $M_{\alpha^{-}}M_{\alpha^{+}}^T$.
\item Each ribbon $R$ with $A_R \setminus B_R = \{j_1\} \cup \{i_{circ}\}$, $A_R \cap B_R = \{j_2,j_4,\ldots,j_{k}\}$, $B_R \setminus A_R = \{j'_1\} \cup \{i'_{circ}\}$, $V(R) \setminus (A_R \cup B_R) = \emptyset$, and $E(R) = \{(i,j'_1),(i,j_2),(i',j_1),(i',j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{j_1,j_2\}$, $A_{R_1} \cap B_{R_1} = \{j'_1,j_4,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_{R_1} \setminus A_{R_1} = \{i'_{circ}\}$, and $E(R_1) = \{(i',j_1),(i',j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} =  \{i_{circ}\}$, $A_{R_2} \cap B_{R_2} = \{j_1,j_4,\ldots,j_{k}\} \cup \{i'_{circ}\}$, $B_{R_2} \setminus A_{R_2} = \{j'_1,j_2\}$, and $E(R_2) = \{(i,j'_1),(i,j_2)\}$.
\end{enumerate}
Whether $i < i'$ or $i' < i$ only affects whether this ribbon appears in $M_{\alpha^{+}}M_{\alpha^{-}}^T$ or $M_{\alpha^{-}}M_{\alpha^{+}}^T$.
\item Each ribbon $R$ with $A_R \setminus B_R = \{j_1,j_2\} \cup  \{i_{circ}\}$, $A_R \cap B_R = \{j_5,\ldots,j_{k}\}$, $B_R \setminus A_R = \{j'_1,j'_2\}  \cup  \{i'_{circ}\}$, $V(R) \setminus (A_R \cup B_R) = \emptyset$, and $E(R) = \{(i,j'_1),(i,j'_2),(i',j_1),(i',j_2)\}$ appears in exactly one way as the composition of the ribbons $R_1$ and $R_2$ where 
\begin{enumerate}
\item $A_{R_1} \setminus B_{R_1} = \{j_1,j_2\}$, $A_{R_1} \cap B_{R_1} = \{j'_1,j'_2,j_5,\ldots,j_{k}\} \cup \{i_{circ}\}$, $B_{R_1} \setminus A_{R_1} = \{i'_{circ}\}$, and $E(R_1) = \{(i',j_1),(i',j_2)\}$.
\item $A_{R_2} \setminus B_{R_2} = \{i_{circ}\}$, $A_{R_2} \cap B_{R_2} = \{j_1,j_2,j_5,\ldots,j_{k}\}  \cup \{i'_{circ}\}$, $B_{R_2} \setminus A_{R_2} = \{j'_1,j'_2\}$, and $E(R_2) = \{(i,j'_1),(i,j'_2)\}$.
\end{enumerate}
Whether $i < i'$ or $i' < i$ only affects whether this ribbon appears in $M_{\alpha^{+}}M_{\alpha^{-}}^T$ or $M_{\alpha^{-}}M_{\alpha^{+}}^T$.
\end{enumerate}
\end{proof}
We now consider the norm bounds for these terms:
\begin{enumerate}
\item $\norm{M_{\alpha_4}}$ is $\tilde{O}(m)$.
\item $\norm{M_{\alpha_5}}$ is $\tilde{O}(m)$.
\item $\norm{M_{\alpha_6}}$ is $\tilde{O}(n^2)$.
\end{enumerate}
This means that $M_{\alpha_6}$ cannot be ignored, but this is fine. In fact, the $M_{\alpha_6}$ here will cancel with the $M_{\alpha_6}$ that appears in the decomposition of ${L_k^T}L_k$.
\begin{corollary}\label{NNTapproximationcorollary}
For all $k \geq 4$, $\norm{{N_k}{N_k}^T - 4M_{\alpha^{+}}M_{\alpha^{+}}^T - 4M_{\alpha^{-}}M_{\alpha^{-}}^T + 4M_{\alpha_6}}$ is $\tilde{O}(m)$.
\end{corollary}
\subsubsection{Putting Everything Together}
We now put everything together to prove that for all $k$, the minimum nonzero eigenvalue of ${L_k^T}{L_k}$ is $2n^2 - \tilde{O}(n\sqrt{m})$.
\begin{enumerate}
\item For $k = 2$, by Corollary \ref{QQTapproximationcorollary}, $\norm{{L_k^T}L_k - 2{n^2}M_{Id_{k-2,1}}}$ is $\tilde{O}(n\sqrt{m})$ so the minimum eigenvalue of ${L_k^T}{L_k}$ is $2n^2 - \tilde{O}(n\sqrt{m})$.
\item For $k = 3$, by Corollary \ref{QQTapproximationcorollary}, $\norm{{L_k^T}L_k - 2{n^2}M_{Id_{k-2,1}} - 4nM_{\beta_2}M^T_{\beta_2}}$ is $\tilde{O}(n\sqrt{m})$ so the minimum eigenvalue of ${L_k^T}{L_k}$ is $2n^2 - \tilde{O}(n\sqrt{m})$.
\item For $k \geq 4$, by Corollary \ref{QQTapproximationcorollary}, $\norm{{L_k^T}L_k - 2{n^2}M_{Id_{k-2,1}} - 4nM_{\beta_2}M^T_{\beta_2} - 4M_{\beta_3}M^T_{\beta_3} - 4M_{\alpha_6}}$ is $\tilde{O}(n\sqrt{m})$. By Corollary \ref{NNTapproximationcorollary}, $\norm{{N_k}{N_k}^T - 4M_{\alpha^{+}}M_{\alpha^{+}}^T - 4M_{\alpha^{-}}M_{\alpha^{-}}^T + 4M_{\alpha_6}}$ is $\tilde{O}(m)$. Combining these equations, 
\[
\norm{{L_k^T}L_k + {N_k}{N_k}^T - 2{n^2}M_{Id_{k-2,1}} - 4nM_{\beta_2}M^T_{\beta_2} - 4M_{\beta_3}M^T_{\beta_3} - 4M_{\alpha^{+}}M_{\alpha^{+}}^T - 4M_{\alpha^{-}}M_{\alpha^{-}}^T}
\] is $\tilde{O}(n\sqrt{m})$ so the minimum eigenvalue of ${L_k^T}L_k + {N_k}{N_k}^T$ is $2n^2 - \tilde{O}(n\sqrt{m})$. Since the minimum nonzero eigenvalue of ${L_k^T}L_k$ is at least as large as the minimum eigenvalue of ${L_k^T}L_k + {N_k}{N_k}^T$, the minimum nonzero eigenvalue of ${L_k^T}{L_k}$ is $2n^2 - \tilde{O}(n\sqrt{m})$.
\end{enumerate}
This implies that the minimum nonzero eigenvalue of $QQ^T$ is $2n^2 - \tilde{O}(n\sqrt{m})$, as needed.

\subsection{Importance of Scaling}\label{app:scaling}
We remark that somewhat surprisingly, the scaling of the problem is important for our arguments. The reason this is somewhat surprising is that for the purpose of determining whether or not a matrix $M$ is PSD, the scaling of the rows and columns of $M$ doesn't matter. More precisely, we have the following proposition.
\begin{proposition}
For any symmetric $N \times N$ matrix $M$ and any $N \times N$ diagonal matrix $D$ such that $\forall i \in [N], D_{ii} \neq 0$, $M \succeq 0$ if and only if $DMD \succeq 0$.
\end{proposition}
However, for our techniques, we also use the fact that if $x$ is in the nullspace of $M$ then for the purposes of determining whether $M$ is PSD, we can freely add a non-negative multiple of $xx^T$ to $M$.
\begin{proposition}\label{addingnullspaceprop}
For any symmetric $N \times N$ matrix $M$, any vector $x$ such that $Mx = 0$, and any constant $c \geq 0$, $M \succeq 0$ if and only if $M+cxx^T \succeq 0$.
\end{proposition}
As shown by the following example, the set of matrices that can be obtained using Proposition \ref{addingnullspaceprop} depends on the scaling of $M$.

If $M = \begin{pmatrix}
1 & 1 & 2 \\
1 & 2 & 3 \\
2 & 3 & 5
\end{pmatrix}$, $x = \begin{pmatrix}
1\\
1\\
-1
\end{pmatrix}$, and $D = \begin{pmatrix}
1 & 0 & 0 \\
0 & 1 & 0 \\
0 & 0 & \lambda
\end{pmatrix}$
then $DMD = \begin{pmatrix}
1 & 1 & 2\lambda \\
1 & 2 & 3\lambda \\
2\lambda & 3\lambda & 5{\lambda}^2
\end{pmatrix}$ and 
\[
DMD + cD^{-1}x{x^T}D^{-1} = \begin{pmatrix}
1 + c & 1 + c & 2\lambda - \frac{c}{\lambda} \\
1 + c & 2 + c & 3\lambda - \frac{c}{\lambda} \\
2\lambda - \frac{c}{\lambda} & 3\lambda - \frac{c}{\lambda} & 5{\lambda}^2 + \frac{c}{{\lambda}^2}
\end{pmatrix}
\]
Scaling this so that the diagonal entries are $1$ gives the matrix 
\[
\begin{pmatrix}
1 & \frac{\sqrt{1+c}}{\sqrt{2+c}} & \frac{2\lambda - \frac{c}{\lambda}}{\sqrt{(1+c)(5{\lambda}^2 + \frac{c}{{\lambda}^2})}} \\
\frac{\sqrt{1+c}}{\sqrt{2+c}} & 1 & \frac{3\lambda - \frac{c}{\lambda}}{\sqrt{(2+c)(5{\lambda}^2 + \frac{c}{{\lambda}^2})}} \\
\frac{2\lambda - \frac{c}{\lambda}}{\sqrt{(1+c)(5{\lambda}^2 + \frac{c}{{\lambda}^2})}} & \frac{3\lambda - \frac{c}{\lambda}}{\sqrt{(2+c)(5{\lambda}^2 + \frac{c}{{\lambda}^2})}} & 1
\end{pmatrix}
\]
Note that the entries in the upper left $2 \times 2$ block only depend on $c$ and are different for each $c$ while the other off-diagonal entries also depend on $\lambda$. Thus, different $\lambda$ give different sets of matrices.


\section{Open Problems}\label{sec:open-problems}

We conjecture that for the Planted Affine Planes problem, the problem remains difficult even with the number of vectors increased to $m = n^{2 - \varepsilon}$.

\begin{conjecture}
	\cref{theo:sos-bounds} holds with the bound on the number of sampled vectors $m$ loosened to $m \leq n^{2-\varepsilon}$.
\end{conjecture}

The reason for the upper bound comes from \cref{rmk:pe-one}. Analyzing $\tilde{\EE}[1]$ is an established way to hypothesize about the power of SoS in hypothesis testing problems (see \cite{hop17}, \cite{hop18}).

Dual to the Planted Affine Planes problem, we conjecture a similar
bound for the Planted Boolean Vector problem whenever $p \geq
n^{1/2+\varepsilon}$.

\begin{conjecture}
	\cref{theo:boolean-subspace} holds with the bound on the dimension $p$ of a random subspace
  loosened to $p \geq n^{1/2+\varepsilon}$.
\end{conjecture}

We conjecture that the Planted Boolean Vector problem/Planted Affine
Planes problem is still hard for SoS if the input is no longer i.i.d.
Gaussian or boolean entries, but is drawn from a ``random enough''
distribution. For example, if in the random instance of PAP the vectors
$d_u$ are i.i.d. samples from $S^n$, or a random orthonormal system,
degree $n^\delta$ SoS should still believe the instance is satisfiable
(after appropriate normalization of $v$). Or, taking the view of Planted
Boolean Vector, if the subspace is the eigenspace of the bottom
eigenvectors of a random adjacency matrix, the instance should still be
difficult. This last setting arises in Maximum Cut, for which we conjecture the following.

\begin{conjecture}
    Let $d \geq 3$, and let $G$ be a random $d$-regular graph on $n$ vertices. For some $\delta > 0$, w.h.p. there is a degree-$n^\delta$ pseudoexpectation operator $\tilde{\EE}$ on boolean variables $x_i$ with maximum cut value at least
    \[ \frac{1}{2} + \frac{\sqrt{d-1}}{d}(1 - \operatorname{o}_{d,n}(1)) \]
\end{conjecture}

The above expression is w.h.p. the value of the spectral relaxation for Maximum Cut, therefore qualitatively this conjecture expresses that degree $n^\delta$ SoS cannot significantly tighten the basic spectral relaxation.

We should remark that, with respect to the goal of showing SoS cannot significantly outperform the Goemans-Williamson relaxation, random instances are not integrality gap instances. The main difficulty in comparing (even degree 4) SoS to the Goemans-Williamson algorithm seems to be the lack of a candidate hard input distribution.

Evidence for this conjecture comes from the fact that the only property
required of the random inputs $d_1, \dots, d_m$ was that norm bounds hold for
the graph matrix with Hermite polynomial entries. When the variables
$\{d_{u,i}\}$ are i.i.d from some other distribution, if we use graph matrices
for the orthonormal polynomials under the distribution and assuming suitable
bounds on the moments of the distribution, the same norm bounds
hold~\cite{AMP20}.
When $d_u \in_{\text{R}} S^n$ or another distribution for which the coordinates are not
i.i.d, it seems likely that if we use e.g. the spherical harmonics then
similar norm bounds hold, but this is not proven.



\subsection{Problem statements}

We introduce the Planted Affine Planes problem over a distribution ${\mathcal D}$.
\begin{definition}[Planted Affine Planes (PAP) problem]\label{def:prob:pap}
  Given $d_1, \dots, d_m \sim {\mathcal D}$ where each $d_u$ is a vector in ${\mathbb R}^n$,
  determine whether there exists $v \in \set{\pm \frac{1}{\sqrt{n}}}^n$ such that
  \[
  \ip{v}{d_u}^2 = 1,
  \]
  for every $u \in [m]$.
\end{definition}
Our results hold for the Gaussian setting $\mathcal{D} = {\mathcal N}(0, I)$ and the boolean setting where ${\mathcal D}$ is uniformly sampled from $\{\pm 1\}^n$, though we conjecture (\cref{sec:open-problems}) that similar SoS bounds hold under more general conditions on ${\mathcal D}$.

Observe that in both settings the solution vector $v$ is restricted to be Boolean (in the sense that the entries are either $\frac{1}{\sqrt{n}}$ or $\frac{-1}{\sqrt{n}}$) and an SoS lower bound for this restricted version of the problem is
stronger than when $v$ can be an arbitrary vector from ${\mathbb R}^n$.

The Sherrington--Kirkpatrick (SK) problem comes from the spin-glass model
in statistical physics~\cite{SK76}.

\begin{definition}[Sherrington-Kirkpatrick problem]\label{def:prob:sk}
  Given $W \sim \GOE(n)$, compute
  \[
    \OPT(W) := \max_{x \in \{\pm 1\}^n} x^\intercal W x.
  \]
\end{definition}

The Planted Boolean Vector problem was introduced by
Mohanty--Raghavendra--Xu~\cite{MRX20}, where it was called the
``Boolean Vector in a Random Subspace''.

\begin{definition}[Planted Boolean Vector problem]\label{def:prob:pbv}
  Given a uniformly random $p$-dimensional subspace $V$ of $\mathbb{R}^n$ in the form of
  a projector $\Pi_V$ onto $V$, compute
  \[
  \OPT(V) :=  \frac{1}{n}\max_{b \in \{\pm 1\}^n} b^\intercal \Pi_V b.
  \]
\end{definition}


\subsection{Sum-of-Squares solutions}

We will work with two equivalent definitions of a degree-$D$ SoS
solution: a pseudoexpectation operator and a moment matrix. We tailor these
definitions to our setting of feasibility of systems of polynomial
equality constraints given by the common zero set of a collection of
polynomials ${\mathcal P}$ on $\pm \frac{1}{\sqrt{n}}$ Boolean variables
$v_1,\dots,v_n$.  For a degree-$D$ solution to be well defined, we
need $D$ to be at least the maximum degree of a polynomial in
${\mathcal P}$. Let ${\mathbb R}^{\le D}(v_1,\dots,v_n)$ be the subset of
polynomials of degree at most $D$ from the polynomial ring
${\mathbb R}(v_1,\dots,v_n)$. We denote the degree of a polynomial
$f \in {\mathbb R}(v_1,\dots,v_n)$ by $\deg(f)$.

\subsubsection{Pseudoexpectation operator}

We formally define the pseudoexpectation operators used in our setting.
\begin{definition}[Pseudoexpectation]\label{def:pseudoexpectation}
  Given a finite collection of ``constraint'' polynomials ${\mathcal P}$ of degree at most $D$ on $\pm \frac{1}{\sqrt{n}}$ Boolean variables $v_1,\dots,v_n$,
  a degree-$D$ pseudoexpectation operator $\tilde{\EE}$ is an operator
  $\tilde{\EE} \colon {\mathbb R}^{\le D}(v_1,\dots,v_n) \rightarrow \mathbb{R}$ satisfying:
  \begin{enumerate}
    \item $\tilde{\EE}[1] = 1$, \label{pe:normalized}
    \item $\tilde{\EE}$ is an ${\mathbb R}$-linear operator, i.e., $\tilde{\EE}[f+g] = \tilde{\EE}[f] + \tilde{\EE}[g]$ for every $f,g \in {\mathbb R}^{\le D}(v_1,\dots,v_n)$, \label{pe:linear}
    \item $\tilde{\EE}[f^2] \ge 0$ for every $f \in {\mathbb R}^{\le D}(v_1,\dots,v_n)$ with $\deg(f^2) \le D$. \label{pe:psdness}
    \item $\tilde{\EE}[(v_i^2-\frac{1}{n}) \cdot f] = 0$ for all $i \in [n]$ and for every $f \in {\mathbb R}^{\le D}(v_1,\dots,v_n)$ with $\deg(f) \le D-2$, and \label{pe:boolean}   
    \item $\tilde{\EE}[g \cdot f] = 0$ for every $g \in {\mathcal P}, f \in {\mathbb R}^{\le D}(v_1,\dots,v_n)$ with $\deg(f \cdot g) \le D$. \label{pe:feasible}
  \end{enumerate}
\end{definition}
Note that $\tilde{\EE}$ behaves similarly to an expectation operator
restricted to ${\mathbb R}^{\le D}(v_1,\dots,v_n)$ with the caveat that $\tilde{\EE}$
is only guaranteed to be non-negative on sum-of-squares polynomials.

The degree-$D$ SoS algorithm checks feasibility of a polynomial system by 
checking whether or not a degree-$D$ pseudoexpectation operator exists. To 
show an SoS lower bound, one must construct a pseudoexpectation 
operator.

\subsubsection{Moment matrix}\label{sec:moment-mtx}

We define
the moment matrix associated with a degree-$D$
pseudoexpectation $\tilde{\EE}$.
\begin{definition}[Moment Matrix of $\tilde{\EE}$]
  The moment matrix ${\mathcal M}={\mathcal M}(\tilde{\EE})$ associated to a pseudoexpectation $\tilde{\EE}$ is a
  $\binom{[n]}{\leq D/2} \times \binom{[n]}{\leq D/2}$ matrix with rows and columns indexed
  by subsets $I, J \subseteq [n]$ of size at most $D/2$ and defined as
  \[
  {\mathcal M}[I, J] := \tilde{\EE}\left[ v^I \cdot v^J \right].
  \]
\end{definition}

To show that a candidate pseudoexpectation satisfies~\cref{pe:psdness} in~\cref{def:pseudoexpectation}, we will rely on the following standard fact.
\begin{fact}
  In the definition of pseudoexpectation, \cref{def:pseudoexpectation}, the condition in \cref{pe:psdness} is equivalent to ${\mathcal M} \succeq 0$.
\end{fact}

\subsection{Graph matrices}
To study ${\mathcal M}$, we decompose it using the framework of \textit{graph matrices}. Originally developed in the context of the planted clique problem, graph matrices are random matrices whose entries are symmetric functions of an underlying random object -- in our case, the set of vectors $d_1, \dots, d_m$. We take the general presentation and results from~\cite{AMP20}. For our purposes, the following definitions are sufficient.


The graphs that we study have two types of vertices, circles $\circle{}$ and squares $\square{}$. We let ${\mathcal C}_m$ be a set of $m$ circles labeled 1 through $m$, which we denote by $\circle{1}, \circle{2}, \dots, \circle{m}$, and let ${\mathcal S}_n$ be a set of $n$ squares labeled 1 through $n$, which we denote by $\square{1}, \square{2}, \dots, \square{n}$. We will work with bipartite graphs with edges between circles and squares, which have positive integer labels on the edges. When there are no multiedges (the graph is simple), such graphs are in one-to-one correspondence with Fourier characters on the vectors $d_u$. An edge between $\circle{u}$ and $\square{i}$ with label $l$ represents $h_{l}(d_{u,i})$ where $\{h_k\}$ is the Fourier basis (e.g. Hermite polynomials).

\[ \text{simple graph with labeled edges} \qquad \Longleftrightarrow \qquad \displaystyle\prod_{\substack{\circle{u} \in {\mathcal C}_m,\\ \square{i} \in {\mathcal S}_n}} h_{l(\circle{u}, \square{i})}(d_{u,i}) \]

An example of a Fourier polynomial as a graph with labeled edges is given in~\cref{fig:fourier_graph}. Unlabeled edges are implicitly labeled 1.
\begin{figure}[h!]
\centering      
\begin{tikzpicture}[scale=0.5,every node/.style={scale=0.5}]
\draw  (-2,3) rectangle node {\huge $i_1$}(-0.5,1.5) node (v5) {};
\draw  (3,1) ellipse (1 and 1) node {\huge $u$};
\draw  (6.5,3) rectangle node (v10) {\huge $j_1$} (8,1.5);
\draw  (-2,0) rectangle node {\huge $i_2$} (-0.5,-1.5);
\draw  (6.5,0) rectangle node {\huge $j_2$} (8,-1.5);
\node (v1) at (-0.5,3) {};
\node (v4) at (-0.5,2.25) {};
\node (v6) at (6.5,2.25) {};
\node (v8) at (-0.5,-0.75) {};
\node (v9) at (6.5,-0.75) {};
\node at (-0.5,0) {};
\node (v2) at (2,1) {};
\node at (2,1.5) {};
\node (v7) at (6.5,1.5) {};
\node (v3) at (4,1) {};
\draw  plot[smooth, tension=.7] coordinates {(v3)};
\draw  plot[smooth, tension=.7] coordinates {(v3)};
\draw  plot[smooth, tension=.7] coordinates {(v2) (0.5,2) (v4)};
\node at (1,2.2) {\huge $3$};
\draw  plot[smooth, tension=.7] coordinates {(v3) (5,2) (v6)};
\draw  plot[smooth, tension=.7] coordinates {(v3)};
\draw  plot[smooth, tension=.7] coordinates {(v2) (1,-0.5) (v8)};
\draw  plot[smooth, tension=.7] coordinates {(v3) (5,-0.5) (v9)};
\node at (6.5,3) {};
\draw  plot[smooth, tension=.7] coordinates {(v3)};
\draw  plot[smooth, tension=.7] coordinates {(v3)};
\draw  plot[smooth, tension=.7] coordinates {(v10)};
\node at (6.5,3) {};
\draw  (2,5.5) rectangle node {\huge $w_1$} (3.5,4);
\node (v11) at (2.5,4) {};
\node (v13) at (3,4) {};
\node (v12) at (3,2) {};
\draw  plot[smooth, tension=.7] coordinates {(v12) (3,3) (3,4)};
\node at (2.5,3) {\huge $2$};
\end{tikzpicture}
\caption{The Fourier polynomial $h_3(d_{u,i_1})h_1(d_{u,i_2})h_2(d_{u,w_1})h_1(d_{u,j_1})h_1(d_{u,j_2})$ represented as a graph.}
\label{fig:fourier_graph}
\end{figure}


Define the degree of a vertex $v$,  denoted $\deg(v)$, to be the sum of the labels incident to $v$, and $\abs{E}$ to be the sum of all labels. For 
intuition it is mostly enough to work with simple graphs, in which case these quantities make sense as the edge multiplicities in an implicit multigraph.

\begin{definition}[Proper]
We say an edge-labeled graph is \textit{proper} if it has no multiedges.
\end{definition}
The definitions allow for ``improper'' edge-labeled multigraphs which simplify multiplying graph matrices (\cref{sec:single-spider} and \cref{sec:exact-constraints}).

\begin{definition}[Matrix indices]
A \textit{matrix index} is a set $A$ of elements from ${\mathcal C}_m \cup {\mathcal S}_n$.
\end{definition}
We let $A(\square{i})$ or $A(\circle{u})$ be 0 or 1 to indicate if the vertex is in $A$.

\begin{definition}[Ribbons]\label{def:ribbon}
A \textit{ribbon} is an undirected, edge-labeled graph $R = (V(R), E(R), A_R, B_R)$, where $V(R) \subseteq {\mathcal C}_m\cup {\mathcal S}_n$ and $A_R, B_R$ are two matrix indices (possibly not disjoint) with $A_R, B_R \subseteq V(R)$, representing two distinguished sets of vertices. Furthermore, all edges in $E(R)$ go between squares and circles.
\end{definition}
We think of $A_R$ and $B_R$ as being the ``left'' and ``right'' sides of $R$, respectively. We also define the set of ``middle vertices'' $C_R := V(R) \setminus (A_R \cup B_R)$. If $e \not\in E(R)$, then we define its label $l(e) = 0$. We also abuse notation and write $l(\square{i}, \circle{u})$ instead of $l(\{\square{i}, \circle{u}\})$.


Akin to the picture above, each ribbon corresponds to a Fourier polynomial.
This Fourier polynomial lives inside a single entry of the matrix $M_R$.
In the definition below, the $h_k(x)$ are the Fourier basis corresponding to the respective setting. In the Gaussian case, they are the (unnormalized) Hermite polynomials, and in the boolean case, they are just the parity function, represented by
\[h_0(x) = 1, \qquad h_1(x) = x, \qquad h_k(x) = 0 \;\; (k \geq 2) \]

\begin{definition}[Matrix for a ribbon]\label{def:ribbon-matrix}
The matrix $M_R$ has rows and columns indexed by subsets of ${\mathcal C}_m~\cup~{\mathcal S}_n$, with a single nonzero entry defined by
\[M_R[I, J] = \left\{\begin{array}{lr}
    \displaystyle\prod_{\substack{e \in E(R), \\ e = \{\square{i}, \circle{u}\}}} h_{l(e)}(d_{u,i}) &  I = A_R, J = B_R\\
    0 & \text{Otherwise}
\end{array}\right. \]
\end{definition}

Next we describe the shape of a ribbon, which is essentially the ribbon when we have forgotten all the vertex labels and retained only the graph structure and the distinguished sets of vertices.
\begin{definition}[Index shapes]
An \textit{index shape} is a set $U$ of formal variables. Furthermore, each variable is labeled as either a ``circle'' or a ``square''.
\end{definition}
We let $U(\square{i})$ and $U(\circle{u})$ be either 0 or 1 for whether $\square{i}$ or $\circle{u}$, respectively, is in $U$.

\begin{definition}[Shapes]\label{def:shape}
A \textit{shape} is an undirected, edge-labeled graph $\alpha = (V(\alpha), E(\alpha), U_\alpha, V_\alpha)$ where $V(\alpha)$ is a set of formal variables, each of which is labeled as either a ``circle'' or a ``square''. $U_\alpha$ and $V_\alpha$ are index shapes (possibly with variables in common) such that $U_\alpha, V_\alpha \subseteq V(\alpha)$. The edge set $E(\alpha)$ must only contain edges between the circle variables and the square variables.
\end{definition}

We'll also use $W_\alpha := V(\alpha) \setminus (U_\alpha \cup V_\alpha)$ to denote the ``middle vertices'' of the shape.

\begin{remk}
	We will abuse notation and use $\square{i}, \square{j}, \circle{u}, \circle{v}, \ldots$ for both the vertices of ribbons and the vertices of shapes. If they are ribbon vertices, then the vertices are elements of ${\mathcal C}_m\cup{\mathcal S}_n$ and if they are shape vertices, then they correspond to formal variables with the appropriate type.
\end{remk}

\begin{definition}[Trivial shape]
	Define a shape $\alpha$ to be trivial if $U_\alpha = V_\alpha$, $W_\alpha = \emptyset$ and $E(\alpha) = \emptyset$.
\end{definition}

\begin{definition}[Transpose of a shape]
  The transpose of a shape  $\alpha = (V(\alpha), E(\alpha), U_\alpha, V_\alpha)$ is defined
  to be the shape $\alpha^{\intercal} = (V(\alpha), E(\alpha), V_\alpha, U_\alpha)$.
\end{definition}

For a shape $\alpha$ and an injective map $\sigma : 
V(\alpha) \rightarrow {\mathcal C}_m \cup {\mathcal S}_n$, we define the 
realization $\sigma(\alpha)$ as a ribbon in the natural
way, by labeling all the variables using the map 
$\sigma$. We also require $\sigma$ to be 
type-preserving i.e. it takes square variables to ${\mathcal S}_n$ and circle variables to ${\mathcal C}_m$. 
The ribbons that result are referred to as \textit{ribbons of shape $\alpha$}; notice that this partitions the set of all ribbons according to their shape\footnote{Partitions up to equality of shapes, where two shapes are equal if there is a type-preserving bijection between their variables that converts one shape to the other. When we operate on sets of shapes below, we implicitly use each distinct shape only once.}\footnote{Note that in our definition two realizations of a shape may give the same ribbon.}.

Finally, given a shape $\alpha$, the graph matrix $M_\alpha$ consists of all Fourier characters for ribbons of shape $\alpha$.
\begin{definition}[Graph matrices]\label{def:graph-matrix}
Given a shape $\alpha = (V(\alpha), E(\alpha), U_\alpha, V_\alpha)$, the \textit{graph matrix} $M_\alpha$ is
\[M_\alpha = \displaystyle\sum_{R \text{ is a ribbon of shape }\alpha} M_R\]
\end{definition}

The moment matrix for PAP will turn out to be defined using graph matrices $M_\alpha$ whose left and right sides only have square vertices, and no circles. However, in the course of the analysis we will factor and multiply graph matrices with circle vertices in the left or right.


\subsection{Norm bounds}
The spectral norm of a graph matrix is determined, up to logarithmic factors, by relatively simple combinatorial properties of the graph. For a subset $S \subseteq {\mathcal C}_m \cup {\mathcal S}_n$, we define the weight $w(S)~:=~(\#\text{ circles in }S)\cdot \log_n(m)+ (\#\text{ squares in }S)$. Observe that $n^{w(S)} = m^{\# \text{ circles in }S}\cdot n^{\#\text{ squares in }S}$.

\begin{definition}[Minimum vertex separator]
For a shape $\alpha$, a set $S_{\min}$ is a minimum vertex separator if all paths from $U_\alpha$ to $V_\alpha$ pass through $S_{\min}$ and $w(S_{\min})$ is minimized over all such separating sets.
\end{definition}

Let $W_{iso}$ denote the set of isolated vertices in $W_\alpha$. Then essentially the following norm bound holds for all shapes $\alpha$ with high probability (a formal statement can be found in~\cref{app:norm_bounds}):
\[\norm{M_\alpha} \leq  \widetilde\operatorname{O}\left(n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}\right)\]

In fact, the only probabilistic property required of the inputs $d_1, \dots, d_m$ by our proof is that the above norm bounds hold for all shapes that arise in the analysis.
We henceforth assume that the norm bounds in~\cref{lem:gaussian-norm-bounds} (for the Gaussian case) and~\cref{lem:norm-bounds} (for the boolean case) hold.
\subsection{Non-spiders are negligible}

For non-spiders, we will now show that their norm is small. We point out that this norm bound on non-spiders critically relies on the assumption $m \leq n^{3/2 - \varepsilon}$.

\begin{lemma}\label{lem:charging}
	If $\alpha \in {\mathcal L}$ is not a trivial shape and not a spider, then
	\[\frac{1}{n^{|E(\alpha)|/2}} n^{\frac{w(V(\alpha)) - w(S_{\min})}{2}} \le \frac{1}{n^{\Omega(\varepsilon |E(\alpha)|)}}\]
	where $S_{min}$ is the minimum vertex separator of $\alpha$.
\end{lemma}

\begin{proof}
The idea behind the proof is as follows. Each square vertex which is not in the minimum vertex separator contributes $\sqrt{n}$ to the norm bound while each circle vertex which is not in the minimum vertex separator contributes $\sqrt{m}$. To compensate for this, we will try and take the factor of $\frac{1}{\sqrt{n}}$ from each edge and distribute it among its two endpoints so that each square vertex which is not in the minimum vertex separator is assigned a factor of $\frac{1}{\sqrt{n}}$ or smaller and each circle vertex which is not in the minimum vertex separator is assigned a factor of $\frac{1}{\sqrt{m}}$ or smaller.
\begin{remark}
Instead of using the minimum vertex separator, we will actually use a set $S$ of square vertices such that $w(S) \leq w(S_{\min})$. For details, see the actual distribution scheme below.
\end{remark}
To motivate the distribution scheme which we use, we first give two attempts which don't quite work. For simplicity, for these first two attempts we assume that $U_{\alpha} \cap V_{\alpha} = \emptyset$ as vertices in $U_{\alpha} \cap V_{\alpha}$ can essentially be ignored.
\begin{enumerate}
\item[] Attempt 1: Take each edge and assign a factor of $\frac{1}{\sqrt[4]{n}}$ to its square endpoint and a factor of $\frac{1}{\sqrt[8]{m}}$ to its circle endpoint.

With this distribution scheme, since each circle vertex has degree at least $4$, each circle vertex is assigned a factor of $\frac{1}{\sqrt{m}}$ or smaller. Since each square vertex in $W_{\alpha}$ has degree at least $2$, each square vertex in $W_{\alpha}$ is assigned a factor of $\frac{1}{\sqrt{n}}$ or smaller. However, square vertices in $U_{\alpha} \cup V_{\alpha}$ may only have degree $1$ in which case they are assigned a factor of $\frac{1}{\sqrt[4]{n}}$ which is not small enough.

To fix this issue, we can have all of the edges which are incident to a square vertex in $U_{\alpha} \cup V_{\alpha}$ give their entire factor of $\frac{1}{\sqrt{n}}$ to the square vertex.
\begin{remark}\label{rmk:pe-one}
For analyzing $\tilde{\EE}[1]$, this first attempt works as $U_{\alpha} = V_{\alpha} = \emptyset$. Thus, as long as $m \leq n^{2 - \varepsilon}$, with high probability $\tilde{\EE}[1] = 1 \pm \operatorname{o}_n(1)$.
\end{remark}
\item[] Attempt 2: For each edge which is between a square vertex in $U_{\alpha} \cup V_{\alpha}$ and a circle vertex, we assign a factor of $\frac{1}{\sqrt{n}}$ to the square vertex and nothing to the circle vertex. For all other edges, we assign a factor of $\frac{1}{\sqrt[4]{n}}$ to its square endpoint and a factor of $\frac{1}{\sqrt[6]{m}}$ to its circle endpoint (which we can do because $m \leq n^{\frac{3}{2} - \varepsilon}$).

With this distribution scheme, each square vertex is assigned a factor of $\frac{1}{\sqrt{n}}$. Since $\alpha$ is not a spider, no circle vertex is adjacent to two vertices in $U_{\alpha}$ or $V_{\alpha}$. Thus, any circle vertex which is not adjacent to both a square vertex in $U_{\alpha}$ and a square vertex in $V_{\alpha}$ must be adjacent to at least $3$ square vertices in $W_{\alpha}$ and is thus assigned a factor of $\frac{1}{\sqrt{m}}$ or smaller. However, we can have circle vertices which are adjacent to both a square vertex in $U_{\alpha}$ and a square vertex in $V_{\alpha}$. These circle vertices may be assigned a factor of $\frac{1}{\sqrt[3]{m}}$, which is not small enough.

To fix this, observe that whenever we have a circle vertex which is adjacent to both a square vertex in $U_{\alpha}$ and a square vertex in $V_{\alpha}$, this gives a path of length $2$ from $U_{\alpha}$ to $V_{\alpha}$. Any vertex separator must contain one of the vertices in this path, so we can put one of these two square vertices in $S$ and not assign it a factor of $\frac{1}{\sqrt{n}}$.
\item[] Actual distribution scheme: Based on these observations, we use the following distribution scheme. Here we are no longer assuming that $U_{\alpha} \cap V_{\alpha}$ is empty.
\begin{enumerate}
\item[1.] Choose a set of square vertices $S \subseteq U_{\alpha} \cup V_{\alpha}$ as follows. Start with $S = U_{\alpha} \cap V_{\alpha}$. Whenever we have a circle vertex which is adjacent to both a square vertex in $U_{\alpha} \setminus V_{\alpha}$ and a square vertex in $V_{\alpha} \setminus U_{\alpha}$, put one of these two square vertices in $S$ (this choice is arbitrary). Observe that $w(S) \leq w(S_{\min})$.
\item[2.] For each edge which is incident to a square vertex in $S$, assign a factor of $\frac{1}{\sqrt[3]{m}}$ to its circle endpoint and nothing to this square.
\item[3.] For each edge which is incident to a square vertex in $(U_{\alpha} \cup V_{\alpha}) \setminus S$, assign a factor of $\frac{1}{\sqrt{n}}$ to the square vertex and nothing to the circle vertex.
\item[4.] For all other edges, assign a factor of $\frac{1}{\sqrt[4]{n}}$ to its square endpoint and a factor of $\frac{1}{\sqrt[6]{m}}$ to its circle endpoint.
\end{enumerate}
Now each square vertex which is not in $S$ is assigned a factor of $\frac{1}{\sqrt{n}}$ and since $\alpha$ is not a spider, all circle vertices are assigned a factor of $\frac{1}{\sqrt{m}}$ or smaller.
\end{enumerate}
We now make this argument formal.

	Let ${\mathcal C}_{\alpha}$ and ${\mathcal S}_{\alpha}$ be the set of circle vertices and the set of square vertices in $\alpha$ respectively. We have $ n^{\frac{w(V(\alpha)) - w(S_{\min})}{2}} \leq n^{0.5|{\mathcal S}_{\alpha} \setminus S_{min}| + (0.75 - \frac{\varepsilon}{2})|{\mathcal C}_{\alpha} \setminus S_{min}|}$. So, it suffices to prove that
	\[|E(\alpha)| - |{\mathcal S}_{\alpha} \setminus S_{min}| - (1.5 - \varepsilon)|{\mathcal C}_{\alpha} \setminus S_{min}| \ge \Omega(\varepsilon |E(\alpha)|)\]

	Let $Q = U_{\alpha} \cap V_{\alpha}, P = (U_{\alpha} \cup V_{\alpha}) \setminus Q$ and let $P'$ be the set of vertices of $P$ that have degree $1$ and are not in $S_{min}$. Let $E_1$ be the set of edges incident to $P'$ and let $E_2 = E(\alpha) \setminus E_1$.

	For each vertex $\square{i}$ (resp. $\circle{u}$), let the number of edges of $E_2$ incident to it be $\deg'(\square{i})$ (resp. $\deg'(\circle{u})$). Since $\alpha$ is bipartite, we have that $|E_2| = \sum_{\square{i} \in {\mathcal S}_{\alpha}} \deg'(\square{i}) = \sum_{\circle{u} \in {\mathcal C}_{\alpha}} \deg'(\circle{u})$. We get that
	\[|E(\alpha)| = |E_1| + |E_2| = |P'| + \frac{1}{2}(\sum_{\square{i} \in {\mathcal S}_{\alpha}} \deg'(\square{i}) + \sum_{\circle{u} \in {\mathcal C}_{\alpha}} \deg'(\circle{u}))\]

	We also have $|{\mathcal S}_{\alpha} \setminus S_{min}| \le |P'| + |{\mathcal S}_{\alpha} \cap W_{\alpha}| + |{\mathcal S}_{\alpha} \cap (P \setminus P')| \le |P'| + \frac{1}{2} \sum_{\square{i} \in {\mathcal S}_{\alpha}} \deg'(\square{i})$ because each square vertex outside $P' \cup Q$ has degree at least $2$ and is not incident to any edge in $E_1$. So, it suffices to prove
	\[\frac{1}{2}\sum_{\circle{u} \in {\mathcal C}_{\alpha}} \deg'(\circle{u}) - (1.5 - \varepsilon)|{\mathcal C}_{\alpha} \setminus S_{min}| \ge \Omega(\varepsilon |E(\alpha)|)\]

	Now, observe that each $\circle{u} \in {\mathcal C}_{\alpha}$ is incident to at most two edges in $E_1$. This is because if it were incident to at least $3$ edges in $E_1$, then either $\circle{u}$ is adjacent to at least two vertices of degree $1$ in $U_{\alpha}$ or $\circle{u}$ is adjacent to at least two vertices of degree $1$ in $V_{\alpha}$. However, this cannot happen since $\alpha$ is not a spider. This implies that $\deg'(\circle{u}) \ge \deg(\circle{u}) - 2$.

	Note moreover that if $\circle{u} \in {\mathcal C}_{\alpha} \setminus S_{min}$, we have that $\deg'(\circle{u}) \ge \deg(\circle{u}) - 1$. This is because, building on the preceding argument, $\deg'(\circle{u}) = \deg(\circle{u}) - 2$ can only happen if there exist $\square{i} \in U_{\alpha}, \square{j} \in V_{\alpha}$ such that $(\square{i}, \circle{u}), (\square{j}, \circle{u}) \in E_1$. But then, note that we have $\square{i}, \square{j} \not\in S_{min}$ by definition of $P'$ and also, $\circle{u} \not\in S_{min}$ by assumption. This means that there is a path from $U_{\alpha}$ to $V_{\alpha}$ which does not pass through $S_{min}$, which is a contradiction.

	Finally, we set $\varepsilon$ small enough such that the following inequalities are true, both of which follow from the fact that $\deg(\circle{u}) \ge 4$ for all $\circle{u} \in {\mathcal C}_{\alpha}$.
	\begin{enumerate}
		\item For any $\circle{u} \in {\mathcal C}_{\alpha} \cap S_{min}$, we have $\frac{\deg(\circle{u}) - 2}{2} \ge \frac{\varepsilon}{10}\deg(\circle{u})$.
		\item For any $\circle{u} \in {\mathcal C}_{\alpha} \setminus S_{min}$, we have $\frac{\deg(\circle{u}) - 1}{2} - 1.5 + \varepsilon \ge \frac{\varepsilon}{10}\deg(\circle{u})$.
	\end{enumerate}
	Using this, we get
	\begin{align*}
	\frac{1}{2}\sum_{\circle{u} \in {\mathcal C}_{\alpha}} \deg'(\circle{u})& - (1.5 - \varepsilon)|{\mathcal C}_{\alpha} \setminus S_{min}| \\
    &\ge \sum_{\circle{u} \in {\mathcal C}_{\alpha} \cap S_{min}}
	\frac{\deg(\circle{u}) - 2}{2} + \sum_{\circle{u} \in {\mathcal C}_{\alpha} \setminus S_{min}}
	\frac{\deg(\circle{u}) - 1}{2} - (1.5 - \varepsilon)|{\mathcal C}_{\alpha} \setminus S_{min}|\\
	&\ge \sum_{\circle{u} \in {\mathcal C}_{\alpha} \cap S_{min}}
	\frac{\varepsilon}{10}\deg(\circle{u}) + \sum_{\circle{u} \in {\mathcal C}_{\alpha} \setminus S_{min}} \left(\frac{\deg(\circle{u}) - 1}{2} - 1.5 + \varepsilon\right)\\
	&\ge \sum_{\circle{u} \in {\mathcal C}_{\alpha} \cap S_{min}}
	\frac{\varepsilon}{10}\deg(\circle{u}) + \sum_{\circle{u} \in {\mathcal C}_{\alpha} \setminus S_{min}}
	\frac{\varepsilon}{10}\deg(\circle{u})\\
	&= \sum_{\circle{u} \in {\mathcal C}_{\alpha}} \frac{\varepsilon}{10}\deg(\circle{u}) = \Omega(\varepsilon|E(\alpha)|)
	\end{align*}
\end{proof}

Since ${\mathcal L}_{bool} \subseteq {\mathcal L}$, the above result extends to non-trivial non spider shapes in ${\mathcal L}_{bool}$ too.

\begin{corollary}
	If $\alpha \in {\mathcal L}_{bool}$ is not a trivial shape and not a spider, then
	\[\frac{1}{n^{|E(\alpha)|/2}} n^{\frac{w(V(\alpha)) - w(S_{\min})}{2}} \le \frac{1}{n^{\Omega(\varepsilon |E(\alpha)|)}}\]
\end{corollary}

\begin{corollary}\label{cor:non_spider_killing}
	If $\alpha \in {\mathcal L}$ is not a trivial shape and not a spider, then w.h.p. \[\frac{1}{n^{|E(\alpha)|/2}}\norm{M_{\alpha}} \le \frac{1}{n^{\Omega(\varepsilon |E(\alpha)|)}}\]
\end{corollary}

\begin{proof}
	Using the norm bounds in~\cref{lem:gaussian-norm-bounds}, we have
    {\footnotesize\begin{align*}
	\norm{M_\alpha} \leq 2\cdot\left(\abs{V(\alpha)} \cdot (1+\abs{E(\alpha)}) \cdot \log(n)\right)^{C\cdot (\abs{V_{rel}(\alpha)} + \abs{E(\alpha)})} \cdot n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}
    \end{align*}}
	We have $W_{iso} = \emptyset$. Observe that since there are no degree $0$ vertices in $V_{rel}(\alpha)$, we have that $|V_{rel}(\alpha)| \le 2|E(\alpha)|$ and since we also have $|V(\alpha)|\cdot (1+\abs{E(\alpha)})\cdot \log n \le n^{O(\tau)}$, the factor $2\cdot(\abs{V(\alpha)} \cdot (1+\abs{E(\alpha)}) \cdot \log(n))^{C\cdot (\abs{V_{rel}(\alpha)} + \abs{E(\alpha)})}$ can be absorbed into $\frac{1}{n^{\Omega(\varepsilon |E(\alpha)|)}}$. The result follows from~\cref{lem:charging}.
\end{proof}

This says that nontrivial non-spider shapes have $\operatorname{o}_n(1)$ norm (ignoring the extra factor $\eta$ for the moment). We now demonstrate how to use this norm bound to control the total norm of all non-spiders in a block of ${\mathcal M}$,~\cref{cor:non-spider-sum}. We will first need a couple propositions which will also be of use to us later after we kill the spiders.

\begin{proposition}\label{prop:edge-shape-count}
	The number of proper shapes with at most $L$ vertices and exactly $k$ edges is at most $L^{8(k+1)}$.
\end{proposition}

\begin{proof}
    The following process captures all shapes (though many will be constructed multiple times):
  \begin{itemize}
      \item Choose the number of square and circle variables in each of the four sets $U \cap V, U \setminus (U \cap V), V \setminus (U \cap V), W$. This contributes a factor of $L^{8}$.
      \item Place each edge between two of the vertices. This contributes a factor of $L^{2 k}$.
  \end{itemize}
\end{proof}

\begin{proposition}\label{prop:coefficient-bound}
$\abs{\lambda_\alpha} \leq \eta^{\abs{U_\alpha} + \abs{V_\alpha}} \cdot  \frac{\abs{E(\alpha)}^{3\cdot \abs{E(\alpha)}}}{n^{\abs{E(\alpha)}/2}}$ where we assume by convention that $0^0 = 1$.
\end{proposition}
\begin{proof}
\noindent\textbf{(Gaussian setting)} Recall that the coefficients $\lambda_\alpha$ are either zero or are defined by the formula
\[\lambda_\alpha  = \eta^{\abs{U_\alpha} + \abs{V_\alpha}}\cdot \left( \prod_{\circle{u}\in V(\alpha)} h_{\deg(\circle{u})}(1)\right)
	\cdot \frac{1}{ n^{\abs{E(\alpha)}/2}}
	\cdot \frac{1}{\alpha!}\]

	The sequence $h_k(1)$ satisfies the recurrence $h_0(1) = h_1(1) = 1, h_{k + 1}(1) = h_k(1) - kh_{k - 1}(1)$. We can prove by induction that $\abs{h_k(1)} \le k^k$ and hence,
	\[\prod_{\circle{u}\in V(\alpha)} \abs{h_{\deg(\circle{u})}(1)} \le \prod_{\circle{u}\in V(\alpha)} (\deg(\circle{u}))^{\deg(\circle{u})} \le \abs{E(\alpha)}^{\abs{E(\alpha)}}.\]

\noindent\textbf{(Boolean setting)} In the boolean setting the coefficients $\lambda_\alpha$ are defined by
    \[\lambda_\alpha =  \eta^{\abs{U_\alpha} + \abs{V_\alpha}} \cdot \left(\prod_{\circle{u} \in V(\alpha)} e(\deg(\circle{u})) \right)\]
    Using~\cref{cor:bound_on_coeff_e_k}, we have that $\abs{e(k)} \le k^{3k} \cdot n^{-k/2}$. Thus,
    \[
    \abs{\lambda_\alpha} =  \eta^{\abs{U_\alpha} + \abs{V_\alpha}} \cdot \prod_{\circle{u} \in V(\alpha)} \abs{e(\deg(\circle{u}))} \le  \eta^{\abs{U_\alpha} + \abs{V_\alpha}} \cdot \frac{\abs{E(\alpha)}^{3\abs{E(\alpha)}}}{n^{\abs{E(\alpha)}/2}}.
    \]
\end{proof}

\begin{corollary}\label{cor:non-spider-sum}
For $k, l \in \{0, 1, \dots , D/2\}$, let ${\mathcal B}_{k,l} \subseteq {\mathcal L}$ denote the set of nontrivial, non-spiders $\alpha \in {\mathcal L}$ on the $(k,l)$ block i.e. $\abs{U_\alpha} = k, \abs{V_\alpha} = l$. The total norm of the non-spiders in ${\mathcal B}_{k, l}$ satisfies
\[\sum_{\alpha \in {\mathcal B}_{k, l}} \abs{\lambda_\alpha} \norm{M_\alpha} = \eta^{k + l} \cdot \frac{1}{n^{\Omega(\varepsilon)}} \]
\end{corollary}
\begin{proof}
\begin{align*}
    \sum_{\alpha \in {\mathcal B}_{k, l}} \abs{\lambda_\alpha} \norm{M_\alpha} & \leq \sum_{\alpha \in {\mathcal B}_{k, l}}\eta^{k+l} \cdot \frac{\abs{E(\alpha)}^{3\abs{E(\alpha)}}}{n^{\abs{E(\alpha)}/2}} \norm{M_\alpha} && \text{(\cref{prop:coefficient-bound})}\\
    & \leq \eta^{k+l} \cdot\sum_{\alpha \in {\mathcal B}_{k, l}}\left(\frac{\abs{E(\alpha)}^3}{n^{\Omega(\varepsilon)}}\right)^{\abs{E(\alpha)}} && \text{(\cref{cor:non_spider_killing})}\\
    & \leq\eta^{k+l} \cdot \sum_{\alpha \in {\mathcal B}_{k, l}}\left(\frac{n^{3\tau}}{n^{\Omega(\varepsilon)}}\right)^{\abs{E(\alpha)}} && (\alpha \in {\mathcal L})\\
    & \leq \eta^{k+l} \cdot \sum_{\alpha \in {\mathcal B}_{k, l}}\frac{1}{n^{\Omega(\varepsilon\abs{E(\alpha)})}}\\
    & \leq \eta^{k+l} \cdot\sum_{i=1}^\infty \frac{n^{O(\tau i)}}{n^{\Omega(\varepsilon i)}}\\
    & = \eta^{k+l} \cdot \frac{1}{n^{\Omega(\varepsilon)}}
\end{align*}
where the last inequality used \cref{prop:edge-shape-count} and  the fact $|E(\alpha)| \ge 1\text{ for }\alpha \in {\mathcal B}_{k, l}$.
\end{proof}


\subsection{Properties of $e(k)$}

In this section, we establish some properties of the $e(k)$ used in the
analysis. Recall that $e(k) = {\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1\dots x_k\right]$ where
$\mathcal{S}(\sqrt{n}) \coloneqq \set{x \in \set{\pm
1}^n \mid \sum_{i=1}^n x_i = \sqrt{n}}$.

\begin{claim}\label{claim:e2}
  $e(2)=0$.
\end{claim}


\begin{proof}
  Fix $y \in \mathcal{S}(\sqrt{n})$. Note that $(\sum_{i=1}^n y_i)^2 = n$ implying
  $\sum_{i < j} y_i y_j = 0$. Using this fact, we get
  $$
  {\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1 x_2\right] = {\mathbb E}_{\sigma \in S_n} y_{\sigma(1)} y_{\sigma(2)} = 0,
  $$
  concluding the proof.
\end{proof}

\begin{definition}
   We say that a tuple $\lambda = (\lambda_1,\dots,\lambda_k)$ of non-negative integers is a partition of $k$
   provided $\sum_{i=1}^k \lambda_i = k$ and $\lambda_1 \ge \cdots \ge \lambda_k$. We use the notation $\lambda \vdash k$
   to denote a partition of $k$. We refer to $\lambda_i$ as a row/part of $\lambda$.
\end{definition}

In the following, we will be dealing with polynomials that can be indexed by integer partitions.
For this reason, we now fix a notation for partitions and some associated objects.

\begin{definition}
  The transpose of partition $\lambda = (\lambda_1,\dots,\lambda_k)$ is denoted $\lambda^t$ and defined as
  $\lambda^t_i = \abs{\set{j \in [k] \mid \lambda_j \ge i}}$.
\end{definition}

\begin{remark}
  For a partition $\lambda \vdash k$, $\lambda^t_1$ is the number of rows/parts of $\lambda$.
\end{remark}

\begin{definition}
  The automorphism group of a partition $\aut(\lambda) \leq S_{\lambda^t_1}$ is the group generated by transpositions $(i,j)$
  of rows $\lambda_i = \lambda_j$.
\end{definition}

\begin{remark}
  Let $\lambda \vdash k$ and $p_1(\lambda),\dots,p_k(\lambda)$ be such that $p_i(\lambda) = \abs{\set{j \in [\lambda_1^t] \mid \lambda_j = i}}$.
  Then $\aut(\lambda) \simeq S_{p_1} \times \cdots \times S_{p_k}$.
\end{remark}


\begin{lemma}\label{lem:slice_inv_exact}
  We have
  \[
  \sum_{\lambda \vdash k} \frac{\lambda!}{\lambda_1!\cdots \lambda_k!} \cdot \frac{(n)_{\lambda^t_1}}{\abs{\aut(\lambda)}} \cdot {\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right] = n^{k/2}.
  \]
\end{lemma}

\begin{proof}
  For $x \in \mathcal{S}(\sqrt{n})$, we have $(\sum_{i=1}^n x_i)^k =
  n^{k/2}$. Then expanding $(\sum_{i=1}^n x_i)^k$ in the previous equation and
  taking the expectation over $\mathcal{S}(\sqrt{n})$ on both sides
  yields the result of the lemma (after appropriately collecting
  terms).
\end{proof}


\begin{claim}\label{claim:bound_prod_exp_ff}
  Let $\lambda \vdash k$. We have
  $$
  (n)_{\lambda^t_1} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]} \le 3^{k^3} \cdot n^{k/2}.
  $$
\end{claim}

\begin{proof}
  We induct on $k$. For $k=1$, we have $n \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1\right]} = \sqrt{n} \le 3 \cdot n^{1/2}$.
  Now, suppose $k \ge 2$. We consider three cases:
  \begin{enumerate}
    \item Case $\lambda_1 \ge 3$: Let $\lambda'$ be the partition obtained from $\lambda$ by removing two boxes from $\lambda_1$.
          Note that $\lambda_1^t = (\lambda')^t_1 \le k-2$ and
          ${\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1'} \dots x_{k-2}^{\lambda_{k-2}'} \right] = {\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_{k-2}^{\lambda_{k-2}} \right]$.
          By the induction hypothesis, we have $(n)_{(\lambda')^t_1} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1'} \dots x_{k-2}^{\lambda_{k-2}'} \right]} \le 3^{(k-2)^3} \cdot n^{(k-2)/2}$.
    \item Case $\lambda_1 = 2$: Let $\lambda'$ be the partition obtained from $\lambda$ by removing $\lambda_1$.
          Note that $\lambda_1^t = (\lambda')^t_1 + 1 \le k-1$. By the induction hypothesis, we have
          \begin{align*}
         (n)_{\lambda^t_1} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_{k-2}^{\lambda_{k-2}} \right]} &\le n \cdot (n)_{(\lambda')^t_1} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1'} \dots x_{k-2}^{\lambda_{k-2}'} \right]}\\
         &\le 3^{(k-2)^3} \cdot n^{k/2}.
          \end{align*}
    \item Case  $\lambda_1 = 1$: To bound $(n)_k \cdot {\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]$, we use~\cref{lem:slice_inv_exact}
          and the two preceding cases. Let $p(k)$ be the partition function, i.e., $p(k) = \abs{\set{\lambda \vdash k}}$. We  deduce that
{\footnotesize
            \begin{align*}
             (n)_k \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k}\right]}  & \le n^{k/2} + \sum_{\lambda \vdash k \colon \lambda_1 \ge 2} \frac{\lambda!}{\lambda_1!\cdots \lambda_k!} \cdot \frac{(n)_{\lambda^t_1}}{\abs{\aut(\lambda)}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]} \\
             & \le n^{k/2} + k!  \sum_{\lambda \vdash k \colon \lambda_1 \ge 2} (n)_{\lambda^t_1} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]} \\
             & \le n^{k/2} + k! \sum_{\lambda \vdash k \colon \lambda_1 \ge 3} (n)_{\lambda^t_1} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]}  + \\
             & \qquad \qquad k !\sum_{\lambda \vdash k \colon \lambda_1 = 2} (n)_{\lambda^t_1} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]}\\
             & \le 3^{(k-2)^3} \cdot k! \cdot (1 + p(k) + k) \cdot n^{k/2} \le 3^{k^3} \cdot n^{k/2},
          \end{align*}
}%
          as desired.
  \end{enumerate}
\end{proof}

\begin{claim}\label{claim:crude_bound_e}
  Suppose $k < \sqrt{n}/2$. We have
  $$
  \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1\dots x_k\right]} \le 2 \cdot 3^{k^3} \cdot n^{-k/2}.
  $$
\end{claim}

\begin{proof}
  Follows from~\cref{claim:bound_prod_exp_ff} and the bound on $k$.
\end{proof}

\begin{remark}
  In~\cref{claim:crude_bound_e}, the factor $3^{k^3}$ is too lossy to
  allow a meaningful bound with $k = n^{\varepsilon}$, where $\varepsilon >
  0$ is a constant.
\end{remark}

Refining the ideas of~\cref{claim:bound_prod_exp_ff}, we prove a
stronger lemma below which will imply a tighter bound on $e(k)$
sufficient for our application.
\begin{lemma}\label{lem:slice_inv_exp}
   There exists a universal constant $C \ge 1$ such that
   \begin{equation}\label{eq:abs_e_k_sum}
     \sum_{\lambda \vdash k} \frac{\lambda!}{\lambda_1!\cdots \lambda_k!} \cdot \frac{(n)_{\lambda^t_1}}{\abs{\aut(\lambda)}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]} \le  k^{C \cdot k} \cdot n^{k/2}.
  \end{equation}
  In particular, for $n \ge 6$,~\cref{eq:abs_e_k_sum} holds with $C=2$.
\end{lemma}

\begin{proof}
  We induct on $k$. For $k = 1$, we have $n \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1\right]} \le \sqrt{n}$ as desired.
  Using $e(2) = 0$ from~\cref{claim:e2} and the case $k=1$ of~\cref{eq:abs_e_k_sum}, we get that~\cref{lem:slice_inv_exp}
  also holds for $k=2$. Now, consider $k \ge 3$. Let $\Lambda_1 = \set{\lambda \vdash k \mid \lambda_1 = 1}$,
  $\Lambda_2 = \set{\lambda \vdash k \mid \lambda_1 = 2}$ and $\Lambda_{\ge 3} = \set{\lambda \vdash k \mid \lambda_1 \ge 3}$.
  Note that $\Lambda_1 \sqcup \Lambda_2 \sqcup \Lambda_{\ge 3} = \set{\lambda \vdash k}$ and $\abs{\Lambda_1} = 1$.

  For convenience define $a_{\lambda}$ to be the term associated to $\lambda \vdash k$ on the LHS of~\cref{eq:abs_e_k_sum}, i.e.,
  $$
  a_{\lambda}
  = \frac{\lambda!}{\lambda_1!\cdots \lambda_k!} \cdot \frac{(n)_{\lambda^t_1}}{\abs{\aut(\lambda)}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots
  x_k^{\lambda_k} \right]}.
  $$

  First we bound the contribution of the terms associated to
  partitions from $\Lambda_{\ge 3}$ in the LHS
  of~\cref{eq:abs_e_k_sum}. Let $\lambda'$ be the partition obtained
  from $\lambda$ by removing two boxes from $\lambda_1$.  Note that
  $\lambda_1^t = (\lambda')^t_1 \le k-2$ and
  ${\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1'} \dots
  x_{k-2}^{\lambda_{k-2}'} \right]
  = {\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots
  x_{k-2}^{\lambda_{k-2}} \right]$. Thus,
  \begin{align*}
  &a_{\lambda} = \frac{\lambda!}{\lambda_1!\cdots \lambda_k!} \cdot \frac{(n)_{\lambda^t_1}}{\abs{\aut(\lambda)}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]}  \\
  &  \qquad   = \frac{k(k-1)}{\lambda_1 (\lambda_1-1)} \cdot \frac{\abs{\aut(\lambda')}}{\abs{\aut(\lambda)}} \frac{\lambda'!}{\lambda_1'!\cdots \lambda_k'!} \cdot \frac{(n)_{(\lambda')^t_1}}{\abs{\aut(\lambda')}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1'} \dots x_{k-2}^{\lambda_{k-2}'} \right]}  \\
  & \qquad \le k^2 \cdot \frac{\abs{\aut(\lambda')}}{\abs{\aut(\lambda)}} \cdot a_{\lambda'} \le k^3 \cdot a_{\lambda'},
  \end{align*}
  since $\abs{\aut(\lambda')}/\abs{\aut(\lambda)} \le k-2 \le k$.
  For each $\lambda' \vdash k-2$, we can form a partition $\lambda \vdash k$ in $k-2 \le k$
  ways by adding two blocks to a single row of $\lambda'$. Hence, we have
  \begin{equation}\label{eq:lamb_ge_3_contrib}
    \sum_{\lambda \in \Lambda_{\ge 3}} a_{\lambda} \le k \cdot \sum_{\lambda' \vdash k-2} k^3 \cdot a_{\lambda'} \le k^4 \cdot k^{C \cdot (k-2)} \cdot n^{(k-2)/2},
  \end{equation}
  where the last inequality follows from the induction hypothesis.

  Now we bound the contribution of the terms $a_{\lambda}$ associated to partitions $\lambda$
  from $\Lambda_{2}$ in the LHS of~\cref{eq:abs_e_k_sum}. Let $i \ge 1$
  be the number of parts of size two of $\lambda$
  and let $\lambda'$ be the partition obtained
  from $\lambda$ by removing these $i$ parts of size two.  Note that $\lambda_1^t =
  (\lambda')^t_1 + i \le k-1$. We have
  \begin{align*}
  &a_{\lambda} = \frac{\lambda!}{\lambda_1!\cdots \lambda_k!} \cdot \frac{(n)_{\lambda^t_1}}{\abs{\aut(\lambda)}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]}  \\
  &  \qquad   \le n^i \cdot \frac{(k)_i}{2^i} \cdot \frac{\abs{\aut(\lambda')}}{\abs{\aut(\lambda)}} \cdot \frac{\lambda'!}{\lambda_1'!\cdots \lambda_k'!} \cdot \frac{(n)_{(\lambda')^t_1}}{\abs{\aut(\lambda')}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1 \dots x_{k-2i} \right]} \\
  &  \qquad   = n^i \cdot \frac{(k)_i}{2^i} \cdot \frac{1}{i!} \cdot \frac{\lambda'!}{\lambda_1'!\cdots \lambda_k'!} \cdot \frac{(n)_{(\lambda')^t_1}}{\abs{\aut(\lambda')}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1 \dots x_{k-2i} \right]},
  \end{align*}
  where in the last equality we used $\abs{\aut(\lambda')}/\abs{\aut(\lambda)} = 1/(i!)$.
  Since $\lambda \in \Lambda_2$ is uniquely specified by its number of parts of size two, applying the induction hypothesis we have
  \begin{align*}
    \sum_{\lambda \in \Lambda_2} a_{\lambda} & \le \sum_{i=1}^{\lfloor k/2 \rfloor} n^i \cdot \frac{(k)_i}{2^i} \cdot \frac{1}{i!} \cdot \left(\frac{\lambda'!}{\lambda_1'!\cdots \lambda_k'!} \cdot \frac{(n)_{(\lambda')^t_1}}{\abs{\aut(\lambda')}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1 \dots x_{k-2i} \right]} \right)\\
    &\le \sum_{i=1}^{\lfloor k/2 \rfloor} n^i \cdot \frac{(k)_i}{2^i} \cdot \frac{1}{i!} \cdot k^{C\cdot(k-2i)} \cdot n^{(k-2i)/2}\\
    &\le  k^{C \cdot (k-1) } \cdot n^{k/2} \cdot \sum_{i=0}^{\infty} k^{- C \cdot i} \le  \frac{3}{2} \cdot k^{C \cdot (k-1) } \cdot n^{k/2},
  \end{align*}
  where in the last inequality we used $k \ge 3$ and $C \ge 1$.

  Finally, we consider the case $\lambda_1 = 1$. To bound $a_{\lambda}$, we use~\cref{lem:slice_inv_exact}
  and the two preceding cases. We deduce that

  \begin{align*}
    a_{\lambda} & \le n^{k/2} + \sum_{\mu \in \Lambda_2} a_{\mu} + \sum_{\mu \in \Lambda_{\ge 3}} a_{\mu} \le n^{k/2} + k^4 \cdot k^{C \cdot (k-2)} \cdot n^{(k-2)/2} + \frac{3}{2} \cdot k^{C \cdot (k-1) } \cdot n^{k/2}\\
              & = k^{C \cdot k} \cdot n^{k/2} \left( \frac{1}{k^{C \cdot k}} + \frac{k^4}{n \cdot k^{2 \cdot C}} + \frac{3}{2 \cdot k^{C}} \right).
  \end{align*}
   We can bound the LHS of~\cref{eq:abs_e_k_sum} as
  \begin{align*}
    \sum_{\mu \in \Lambda_1} a_{\mu} +  \sum_{\mu \in \Lambda_2} a_{\mu} + \sum_{\mu \in \Lambda_{\ge 3}} a_{\mu} &\le
               k^{C \cdot k} \cdot n^{k/2} \left( \frac{1}{k^{C \cdot k}} + \frac{2 \cdot k^4}{n \cdot k^{2 \cdot C}} + \frac{3}{k^{C}} \right)\\
               &\le k^{C \cdot k} \cdot n^{k/2},
  \end{align*}
  provided $C > 0$ is a sufficiently large constant. In particular, the constant $C$ can be taken to be $2$ for $n \ge 6$.
\end{proof}

\begin{corollary}\label{cor:bound_on_coeff_e_k}
  We have
  $$
  \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1\dots x_k\right]} \le k^{3\cdot k} \cdot n^{-k/2}.
  $$
\end{corollary}

\begin{proof}
   Suppose $k \le \sqrt{n}$.
   Note that~\cref{lem:slice_inv_exp} implies that for $\lambda \vdash k$ with $\lambda_1 = 1$
   there exists a constant $C > 0$ such that
   \begin{align*}
     \frac{\lambda!}{\lambda_1!\cdots \lambda_k!} \cdot \frac{(n)_{\lambda^t_1}}{\abs{\aut(\lambda)}} \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1^{\lambda_1} \dots x_k^{\lambda_k} \right]} &= (n)_k \cdot \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1\dots x_k\right]}\\
      &\le  k^{C \cdot k} \cdot n^{k/2}.
   \end{align*}
   Simplifying and using the assumption $k \le \sqrt{n}$, we obtain
   $$
   \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1\dots x_k\right]} \le \frac{k^{C \cdot k} \cdot n^{-k/2}}{\prod_{i=1}^{k-1} \left(1 - \frac{i}{n}\right)} \le 2 \cdot k^{C \cdot k} \cdot n^{-k/2}.
   $$
   Furthermore, for $n \ge 6$,~\cref{lem:slice_inv_exp} allows us to choose $C=2$.
   Since $\abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1 \right]} = 1/\sqrt{n}$, the simpler
   bound applies for all values of $k$
   $$
   \abs{{\mathbb E}_{x \in \mathcal{S}(\sqrt{n})}\left[x_1\dots x_k\right]} \le  k^{3 \cdot k} \cdot n^{-k/2}.
   $$
   Now the assumption $n \ge 6$ can be removed since, for $k \ge 2$, we have $(k^{3}/\sqrt{n})^{k} \ge 1$,
   where $1$ is the trivial bound. Similarly, our initial assumption of $k \le \sqrt{n}$ can also be removed
   as the bound also becomes trivial in the regime $k > \sqrt{n}$.
\end{proof}


\section{Technical preliminaries}

In this section we record formal problem statements, then define and discuss one of the main objects in our SoS
lower bound: graph matrices.

For a vector or variable
$v \in {\mathbb R}^n$, and $I \subseteq [n]$, we use the notation
$v^I := \prod_{i \in I}v_i$. When a statement holds with high
probability (w.h.p.), it means it holds with probability $1 - o_n(1)$. In
particular, there is no requirement for small $n$.

\subsection{Problem statements}

We introduce the Planted Affine Planes problem over a distribution ${\mathcal D}$.
\begin{definition}[Planted Affine Planes (PAP) problem]\label{def:prob:pap}
	Given $d_1, \dots, d_m \sim {\mathcal D}$ where each $d_u$ is a vector in ${\mathbb R}^n$,
	determine whether there exists $v \in \set{\pm \frac{1}{\sqrt{n}}}^n$ such that
	\[
	\ip{v}{d_u}^2 = 1,
	\]
	for every $u \in [m]$.
\end{definition}
Our results hold for the Gaussian setting $\mathcal{D} = {\mathcal N}(0, I)$ and the boolean setting where ${\mathcal D}$ is uniformly sampled from $\{\pm 1\}^n$, though we conjecture in \cref{sec:open-problems} that similar SoS bounds hold under more general conditions on ${\mathcal D}$.

Observe that in both settings the solution vector $v$ is restricted to be Boolean (in the sense that the entries are either $\frac{1}{\sqrt{n}}$ or $\frac{-1}{\sqrt{n}}$) and an SoS lower bound for this restricted version of the problem is
stronger than when $v$ can be an arbitrary vector from ${\mathbb R}^n$.

As we saw in \cref{chap: main_results}, the Sherrington--Kirkpatrick (SK) problem comes from the spin-glass model
in statistical physics~\cite{SK76}.

\begin{definition}[Sherrington-Kirkpatrick problem]\label{def:prob:sk}
	Given $W \sim \GOE(n)$, compute
	\[
	\OPT(W) := \max_{x \in \{\pm 1\}^n} x^\intercal W x.
	\]
\end{definition}

The Planted Boolean Vector problem was introduced by
Mohanty--Raghavendra--Xu \cite{mohanty2020lifting}, where it was called the
``Boolean Vector in a Random Subspace''.

\begin{definition}[Planted Boolean Vector problem]\label{def:prob:pbv}
	Given as input a uniformly random $p$-dimensional subspace $V$ of $\mathbb{R}^n$ in the form of
	a projector $\Pi_V$ onto $V$, compute
	\[
	\OPT(V) :=  \frac{1}{n}\max_{b \in \{\pm 1\}^n} b^\intercal \Pi_V b.
	\]
\end{definition}

\subsection{Graph matrices}
To study ${\mathcal M}$, we decompose it using the framework of \textit{graph matrices}. Originally developed in the context of the planted clique problem, graph matrices are random matrices whose entries are symmetric functions of an underlying random object -- in our case, the set of vectors $d_1, \dots, d_m$. We take the general presentation and results from~\cite{ahn2016graph}. For our purposes, the following definitions are sufficient.


The graphs that we study have two types of vertices, circles $\circle{}$ and squares $\square{}$. We let ${\mathcal C}_m$ be a set of $m$ circles labeled 1 through $m$, which we denote by $\circle{1}, \circle{2}, \dots, \circle{m}$, and let ${\mathcal S}_n$ be a set of $n$ squares labeled 1 through $n$, which we denote by $\square{1}, \square{2}, \dots, \square{n}$. We will work with bipartite graphs with edges between circles and squares, which have positive integer labels on the edges. When there are no multiedges (the graph is simple), such graphs are in one-to-one correspondence with Fourier characters on the vectors $d_u$. An edge between $\circle{u}$ and $\square{i}$ with label $l$ represents $h_{l}(d_{u,i})$ where $\{h_k\}$ is the Fourier basis (e.g. Hermite polynomials).

\[ \text{simple graph with labeled edges} \qquad \Longleftrightarrow \qquad \displaystyle\prod_{\substack{\circle{u} \in {\mathcal C}_m,\\ \square{i} \in {\mathcal S}_n}} h_{l(\circle{u}, \square{i})}(d_{u,i}) \]

An example of a Fourier polynomial as a graph with labeled edges is given in~\cref{fig:fourier_graph}. Unlabeled edges are implicitly labeled 1.
\begin{figure}[h!]
	\centering
	\begin{tikzpicture}[scale=0.5,every node/.style={scale=0.5}]
		\draw  (-2,3) rectangle node {\huge $i_1$}(-0.5,1.5) node (v5) {};
		\draw  (3,1) ellipse (1 and 1) node {\huge $u$};
		\draw  (6.5,3) rectangle node (v10) {\huge $j_1$} (8,1.5);
		\draw  (-2,0) rectangle node {\huge $i_2$} (-0.5,-1.5);
		\draw  (6.5,0) rectangle node {\huge $j_2$} (8,-1.5);
		\node (v1) at (-0.5,3) {};
		\node (v4) at (-0.5,2.25) {};
		\node (v6) at (6.5,2.25) {};
		\node (v8) at (-0.5,-0.75) {};
		\node (v9) at (6.5,-0.75) {};
		\node at (-0.5,0) {};
		\node (v2) at (2,1) {};
		\node at (2,1.5) {};
		\node (v7) at (6.5,1.5) {};
		\node (v3) at (4,1) {};
	
		\draw  plot[smooth, tension=.7] coordinates {(v3)};
		\draw  plot[smooth, tension=.7] coordinates {(v3)};
		\draw  plot[smooth, tension=.7] coordinates {(v2) (0.5,2) (v4)};
		\node at (1,2.2) {\huge $3$};
	
		\draw  plot[smooth, tension=.7] coordinates {(v3) (5,2) (v6)};
		\draw  plot[smooth, tension=.7] coordinates {(v3)};
		\draw  plot[smooth, tension=.7] coordinates {(v2) (1,-0.5) (v8)};
		\draw  plot[smooth, tension=.7] coordinates {(v3) (5,-0.5) (v9)};
		\node at (6.5,3) {};
		\draw  plot[smooth, tension=.7] coordinates {(v3)};
		\draw  plot[smooth, tension=.7] coordinates {(v3)};
		\draw  plot[smooth, tension=.7] coordinates {(v10)};
		\node at (6.5,3) {};
		\draw  (2,5.5) rectangle node {\huge $w_1$} (3.5,4);
		\node (v11) at (2.5,4) {};
		\node (v13) at (3,4) {};
		\node (v12) at (3,2) {};
	
		\draw  plot[smooth, tension=.7] coordinates {(v12) (3,3) (3,4)};
		\node at (2.5,3) {\huge $2$};
	\end{tikzpicture}
	\caption{The Fourier polynomial $h_3(d_{u,i_1})h_1(d_{u,i_2})h_2(d_{u,w_1})h_1(d_{u,j_1})h_1(d_{u,j_2})$ represented as a graph.}
	\label{fig:fourier_graph}
\end{figure}


Define the degree of a vertex $v$,  denoted $\deg(v)$, to be the sum of the labels incident to $v$, and $\abs{E}$ to be the sum of all labels. For
intuition it is mostly enough to work with simple graphs, in which case these quantities make sense as the edge multiplicities in an implicit multigraph.

\begin{definition}[Proper]
	We say an edge-labeled graph is \textit{proper} if it has no multiedges.
\end{definition}
The definitions allow for ``improper'' edge-labeled multigraphs which simplify multiplying graph matrices (\cref{sec:single-spider}).

\begin{definition}[Matrix indices]
	A \textit{matrix index} is a set $A$ of elements from ${\mathcal C}_m \cup {\mathcal S}_n$.
\end{definition}
We let $A(\square{i})$ or $A(\circle{u})$ be 0 or 1 to indicate if the vertex is in $A$.

\begin{definition}[Ribbons]\label{def:ribbon}
	A \textit{ribbon} is an undirected, edge-labeled graph $R$ given by $R = (V(R), E(R), A_R, B_R)$, where $V(R) \subseteq {\mathcal C}_m\cup {\mathcal S}_n$ and $A_R, B_R$ are two matrix indices (possibly not disjoint) with $A_R, B_R \subseteq V(R)$, representing two distinguished sets of vertices. Furthermore, all edges in $E(R)$ go between squares and circles.
\end{definition}
We think of $A_R$ and $B_R$ as being the ``left'' and ``right'' sides of $R$, respectively. We also define the set of ``middle vertices'' $C_R := V(R) \setminus (A_R \cup B_R)$. If $e \not\in E(R)$, then we define its label $l(e) = 0$. We also abuse notation and write $l(\square{i}, \circle{u})$ instead of $l(\{\square{i}, \circle{u}\})$.


Akin to the picture above, each ribbon corresponds to a Fourier polynomial.
This Fourier polynomial lives inside a single entry of the matrix $M_R$.
In the definition below, the $h_k(x)$ are the Fourier basis corresponding to the respective setting. In the Gaussian case, they are the (unnormalized) Hermite polynomials, and in the boolean case, they are just the parity function, represented by
\[h_0(x) = 1, \qquad h_1(x) = x, \qquad h_k(x) = 0 \;\; (k \geq 2) \]

\begin{definition}[Matrix for a ribbon]\label{def:ribbon-matrix}
	The matrix $M_R$ has rows and columns indexed by subsets of ${\mathcal C}_m~\cup~{\mathcal S}_n$, with a single nonzero entry defined by
	\[M_R[I, J] = \left\{\begin{array}{lr}
		\displaystyle\prod_{\substack{e \in E(R), \\ e = \{\square{i}, \circle{u}\}}} h_{l(e)}(d_{u,i}) &  I = A_R, J = B_R\\
		0 & \text{Otherwise}
	\end{array}\right. \]
\end{definition}

Next we describe the shape of a ribbon, which is essentially the ribbon when we have forgotten all the vertex labels and retained only the graph structure and the distinguished sets of vertices.
\begin{definition}[Index shapes]
	An \textit{index shape} is a set $U$ of formal variables. Furthermore, each variable is labeled as either a ``circle'' or a ``square''.
\end{definition}
We let $U(\square{i})$ and $U(\circle{u})$ be either 0 or 1 for whether $\square{i}$ or $\circle{u}$, respectively, is in $U$.

\begin{definition}[Shapes]\label{def:shape}
	A \textit{shape} is an undirected, edge-labeled graph $\alpha$ given by $\alpha = (V(\alpha), E(\alpha), U_\alpha, V_\alpha)$ where $V(\alpha)$ is a set of formal variables, each of which is labeled as either a ``circle'' or a ``square''. $U_\alpha$ and $V_\alpha$ are index shapes (possibly with variables in common) such that $U_\alpha, V_\alpha \subseteq V(\alpha)$. The edge set $E(\alpha)$ must only contain edges between the circle variables and the square variables.
\end{definition}

We'll also use $W_\alpha := V(\alpha) \setminus (U_\alpha \cup V_\alpha)$ to denote the ``middle vertices'' of the shape.

\begin{remk}
	We will abuse notation and use $\square{i}, \square{j}, \circle{u}, \circle{v}, \ldots$ for both the vertices of ribbons and the vertices of shapes. If they are ribbon vertices, then the vertices are elements of ${\mathcal C}_m\cup{\mathcal S}_n$ and if they are shape vertices, then they correspond to formal variables with the appropriate type.
\end{remk}

\begin{definition}[Trivial shape]
	Define a shape $\alpha$ to be trivial if $U_\alpha = V_\alpha$, $W_\alpha = \emptyset$ and $E(\alpha) = \emptyset$.
\end{definition}

\begin{definition}[Transpose of a shape]
	For a shape $\alpha = (V(\alpha), E(\alpha), U_\alpha, V_\alpha)$, its transpose is defined
	to be the shape $\alpha^{\intercal} = (V(\alpha), E(\alpha), V_\alpha, U_\alpha)$.
\end{definition}

For a shape $\alpha$ and an injective map $\sigma :
V(\alpha) \rightarrow {\mathcal C}_m \cup {\mathcal S}_n$, we define the
realization $\sigma(\alpha)$ as a ribbon in the natural
way, by labeling all the variables using the map
$\sigma$. We also require $\sigma$ to be
type-preserving i.e. it takes square variables to ${\mathcal S}_n$ and circle variables to ${\mathcal C}_m$.
The ribbons that result are referred to as \textit{ribbons of shape $\alpha$}; notice that this partitions the set of all ribbons according to their shape\footnote{Partitions up to equality of shapes, where two shapes are equal if there is a type-preserving bijection between their variables that converts one shape to the other. When we operate on sets of shapes below, we implicitly use each distinct shape only once.}\footnote{Note that in our definition two realizations of a shape may give the same ribbon.}.

Finally, given a shape $\alpha$, the graph matrix $M_\alpha$ consists of all Fourier characters for ribbons of shape $\alpha$.
\begin{definition}[Graph matrices]\label{def:graph-matrix}
	Given a shape $\alpha = (V(\alpha), E(\alpha), U_\alpha, V_\alpha)$, the \textit{graph matrix} $M_\alpha$ is
	\[M_\alpha = \displaystyle\sum_{R \text{ is a ribbon of shape }\alpha} M_R\]
\end{definition}

The moment matrix for PAP will turn out to be defined using graph matrices $M_\alpha$ whose left and right sides only have square vertices, and no circles. However, in the course of the analysis we will factor and multiply graph matrices with circle vertices in the left or right.


\subsection{Norm bounds}
Similar to the norm bounds for graph matrices with only a single type of vertex (see \cref{chap: efron_stein}), the spectral norm of a graph matrix in our setting is determined, up to logarithmic factors, by relatively simple combinatorial properties of the graph. For a subset $S \subseteq {\mathcal C}_m \cup {\mathcal S}_n$, we define the weight $w(S)~:=~(\#\text{ circles in }S)\cdot \log_n(m)+ (\#\text{ squares in }S)$. Observe that $n^{w(S)} = m^{\# \text{ circles in }S}\cdot n^{\#\text{ squares in }S}$.

\begin{definition}[Minimum vertex separator]
	For a shape $\alpha$, a set $S_{\min}$ is a minimum vertex separator if all paths from $U_\alpha$ to $V_\alpha$ pass through $S_{\min}$ and $w(S_{\min})$ is minimized over all such separating sets.
\end{definition}

Let $W_{iso}$ denote the set of isolated vertices in $W_\alpha$. Then essentially the following norm bound holds for all shapes $\alpha$ with high probability (a formal statement can be found in~\cref{app:norm_bounds}):
\[\norm{M_\alpha} \leq  \widetilde{\operatorname{O}}\left(n^{\frac{w(V(\alpha)) - w(S_{\min}) + w(W_{iso})}{2}}\right)\]

In fact, the only probabilistic property required of the inputs $d_1, \dots, d_m$ by our proof is that the above norm bounds hold for all shapes that arise in the analysis.
We henceforth assume that the norm bounds in~\cref{lem:gaussian-norm-bounds} (for the Gaussian case) and~\cref{lem:norm-bounds} (for the boolean case) hold.

\section{Proof Strategy}\label{sec:strategy}

\input{sherrington_kirkpatrick/strategy}

\section{Pseudocalibration}\label{sec:pseudo_calib}

\input{sherrington_kirkpatrick/pseudo_calib}

\section{Proving PSD-ness}\label{sec:psd}

\input{sherrington_kirkpatrick/no_spider}

\input{sherrington_kirkpatrick/spider_killing}

\input{sherrington_kirkpatrick/finishing_the_proof}

\section{Sherrington-Kirkpatrick Lower Bounds}\label{sec:sk}

\input{sherrington_kirkpatrick/sk}




\section{Omitted technical details}

\input{sherrington_kirkpatrick/norm_bounds}

\input{sherrington_kirkpatrick/mon_on_slice}




\section{The Sum of Squares hierarchy}

We start by defining convex relaxations for polynomial optimization problems. The SoS hierarchy will then be a special family of convex relaxations. For a more detailed treatment, see e.g. \cite{sos_course, BS14:ICM, FKP19}.

\subsection{Polynomial optimization and convex relaxations}

In polynomial optimization, we are given multivariate polynomials $p, g_1, \ldots, g_m$ on $n$ variables $x_1, \ldots, x_n$ taking real values, denoted collectively by $x$, and the task is to:
\[\text{maximize } p(x)\text{ such that }g_1(x) = 0, \ldots, g_m(x) = 0\]

In general, we could also allow inequality constraints, e.g., $g_i(x) \ge 0$. For technical convenience in our setup, we work only with equality constraints but much of the theory generalizes, with some modifications, when we have inequality constraints instead. An alternate approach is to replace each inequality $g_i(x) \ge 0$ by $g_i(x) = y^2$ where $y$ is a new variable that we can introduce.

In this formulation, many optimization problems can be formulated as polynomial optimization problems.

\begin{example}[Maximum Cut]
    Given a graph $G = (V, E)$, we would like to partition the set of vertices into two subsets such that the number of edges with endpoints in different subsets is maximized. To formulate this as a polynomial optimization problem, let the graph have $n$ vertices and let $x_1, \ldots, x_n$ be variables, one for each vertex. We wish to enforce $x_i \in \{-1, 1\}$ where all vertices $i$ with $x_i = -1$ form one subset and the rest form the other subset. We can enforce this set containment constraint via the polynomial constraint $x_i^2 = 1$. For any edge $(i, j) \in E$, it is cut if and only if $x_ix_j = -1$. Therefore, the total number of edges cut is $\sum_{(i, j) \in E} \frac{1}{2}(1 - x_ix_j)$. The polynomial formulation therefore becomes
    \begin{align*}
        \max_{x \in {\mathbb R}^n} \sum_{(i, j) \in E} \frac{1}{2}&(1 - x_ix_j) \text{ such that }\\
        x_i^2 &= 1 \text{ for all }i \le n
    \end{align*}
\end{example}

\begin{example}[Maximum Clique]\label{ex: max_clique}
    Given a graph $G = (V, E)$, we would like to find the maximum size subset of vertices that form a clique. Again, let $x_1, \ldots, x_n$ be variables, one for each vertex. This time, we wish to enforce $x_i \in \{0, 1\}$, which we can easily do using the polynomial constraint $x_i^2 = x_i$, with the intent being that all vertices $i$ with $x_i = 1$ form a clique. To enforce this clique constraint, we can add the polynomial constraint $x_ix_j = 0$ for all non-edges $(i, j) \not\in E$. Finally, to maximize the size of the subset, we simply maximize $\sum_{i \le n} x_i$. Therefore, the polynomial optimization is
    \begin{align*}
        \max_{x \in {\mathbb R}^n} \sum_{i \le n} &x_i \text{ such that}\\
        x_ix_j &= 0 \text{ for all }(i, j) \not\in E\\
        x_i^2 &= x_i \text{ for all } i \le n
    \end{align*}
\end{example}

There can be other equivalent formulations for these problems. In general, many optimization problems can be stated in this manner, therefore generic polynomial optimization contains a large class of fundamental problems that appear in computer science.

Since exactly solving maximum cut or maximum clique is NP-hard \cite{karp1972reducibility}, exactly solving these polynomial optimization problems is also NP-hard. Therefore, we turn to convex relaxations.

A convex relaxation of a polynomial optimization problem widens the search space of solution vectors $x$ into a larger space that one can efficiently optimize over. We will describe one way to do this. We identify a convex space ${\mathcal C}$ that contains the space ${\mathcal S}= \{g_1(x) = 0, \ldots, g_m(x) = 0\}$ up to a map, that is, for each $x \in {\mathcal S}$, there exists a corresponding $y \in {\mathcal C}$ such that $y$ is a representative of $x$. We also identify a convex function $\tilde{p}(y)$ such that if $y$ is a representative of $x$, then $\tilde{p}(y) = p(x)$. Then, we simply optimize $\tilde{p}(y)$ over ${\mathcal C}$. There has been significant work on efficiently optimizing a convex function over a convex body, which is possible under reasonable assumptions (see e.g. \cite{PS82}). It's clear that from the above properties, the solution we get is at least as large as the optimal solution (in the case of maximization), but it comes with the advantage that it is efficiently computable. It is desirable to design convex relaxations for problems that yield good approximations. The SoS hierarchy is a family of such convex relaxations.

\subsection{Sum of Squares relaxations}

The SoS hierarchy, sometimes referred to as the Lasserre hierarchy, was first independently studied by \cite{parrilo2000structured, lasserre2001global, shor1987approach} and has been studied in other contexts by \cite{nesterov2000squared, grigoriev2001complexity, Grigoriev01}.
It is a family of convex relaxations for polynomial optimization, parameterized by an integer known as its degree. As we increase the degree, we get progressively tighter relaxations, but they require more time to optimize over.

We now formally describe the Sum of Squares hierarchy, via the so-called pseudoexpectation operator view.

\begin{definition}[Pseudo-expectation values]\label{def: pseudoexpectation}
    Given multivariate polynomial constraints $g_1 = 0$,\ldots,$g_m = 0$ on $n$ variables $x_1, \ldots, x_n$, degree $d$ pseudo-expectation values are a linear map $\tilde{\EE}$ from polynomials of $x_1, \ldots, x_n$ of degree at most $d$ to $\mathbb{R}$ satisfying the following conditions:
    \begin{enumerate}
        \item $\tilde{\EE}[1] = 1$, \label{pe:normalized}
       
        \item $\tilde{\EE}[f \cdot g_i] = 0$ for every $i \in [m]$ and polynomial $f$ such that $\deg(f \cdot g_i) \leq d$. \label{pe:feasible}
        \item $\tilde{\EE}[f^2] \geq 0$ for every polynomial $f$ such that $\deg(f^2) \le d$. \label{pe:psdness}
    \end{enumerate}
\end{definition}

Any linear map $\tilde{\EE}$ satisfying the above properties is known as a degree $d$ pseudoexpectation operator satisfying the constraints $g_1 = 0, \ldots, g_m = 0$.

\begin{definition}[Degree $d$ SoS]
    The degree $d$ SoS relaxation for the polynomial optimization problem
    \[\text{maximize } p(x)\text{ such that }g_1(x) = 0, \ldots, g_m(x) = 0\]
    is the program that maximizes $\tilde{\EE}[p(x)]$ over all degree $d$ pseudoexpectation operators $\tilde{\EE}$ satisfying the constraints $g_1 = 0, \ldots, g_m = 0$.
\end{definition}

The intuition behind pseudo-expectation values is that the conditions on the pseudo-expectation values are conditions that would be satisfied by any actual expected values over a distribution of solutions, so optimizing over pseudo-expectation values gives a relaxation of the problem.

The main observation is that the SoS relaxation can be efficiently solved! This is because the conditions on pseudo-expectation values can be captured by a semidefinite program. In particular, \cref{pe:psdness} in \cref{def: pseudoexpectation} can be reexpressed in terms of a matrix called the moment matrix.

\begin{definition}[Moment Matrix of $\tilde{\EE}$]
    Given a degree $d$ pseudo-expectation operator $\tilde{\EE}$, define the associated moment matrix $\Lambda$ to be a matrix with rows and columns indexed by monomials $p$ and $q$ such that the entry corresponding to row $p$ and column $q$ is
    \[
    \Lambda[p, q] := \tilde{\EE}\left[pq\right].
    \]
\end{definition}

It is easy to verify that \cref{pe:psdness} in~\cref{def: pseudoexpectation} is equivalent to $\Lambda \succeq 0$. Therefore, solving the degree $d$ SoS relaxation can be done via semidefinite programming, see e.g. \cite{vandenberghe1996semidefinite}.
In general, for degree-$d$ SoS, we can solve it in $n^{O(d)}$ time\footnote{This is not completely accurate due to issues of bit complexity \cite{o2017sos} but this doesn't occur for most problems of interest \cite{RW17:sos}}. Therefore, constant degree SoS can be solved in polynomial time.

\subsubsection{Analyzing degree $2$ SoS for maximum clique}

To illustrate the use of this technique, let's analyze the degree $2$ SoS relaxation for the maximum clique problem on Erd\H{o}s\xspace-R\'enyi\xspace random graphs $G_{n, 1/2}$. We use the program from \cref{ex: max_clique}.

Let $A$ be the adjacency matrix of a graph $G$ sampled from $G_{n, 1/2}$ and let $J$ be the matrix with all $1$s. Then, with high probability over the choice of $G$, from random matrix theory, we have $\lambda_{max}(A- J/2) = O(\sqrt{n})$ where $\lambda_{max}(\cdot)$ denotes the maximum singular value. Now, suppose a set $S$ of $k$ vertices forms a clique and let $\mathbf 1_S$ denote the indicator vector of the set $S$; then
\begin{align*}
    \frac{k(k - 2)}{2} &= \ip{\mathbf 1_S}{(A - J/2)\mathbf 1_S}\\
    &\le \norm{\mathbf 1_S}^2\cdot \lambda_{max}(A - J / 2)\\
    &\le k \cdot O(\sqrt{n})
\end{align*}
which shows $k \le O(\sqrt{n})$.

The crux of this simple argument is that this is a \textit{low-degree proof}, more specifically a degree $2$ proof, that SoS can capture. That is, if we solve the degree $2$ SoS relaxation, we will be able to show that $\tilde{\EE}[\sum x_i] = O(\sqrt{n})$ whp.

To see this formally, we start with the following inequality: $O(\sqrt{n})I - (A - J / 2) \succeq 0$ whp. This implies
\[x^\intercal(O(\sqrt{n})I - (A - J / 2))x = \sum p_i(x)^2\]
is a sum of squares of polynomials of degree at most $1$.
A simple computation yields
\[x^\intercal(A - J/2)x = \frac{1}{2}(\sum_{i = 1}^n x_i)^2 - \sum_{i, j} x_ix_j \mathbf 1_{(i, j) \not\in E(G)}\]

For our program variables $x$, we have $x_i^2 = x_i$ and $x_ix_j \mathbf 1_{(i, j) \not\in E(G)} = 0$. Therefore,
\[\sum p_i(x)^2 = O(\sqrt{n}) (\sum_{i = 1}^n x_i) - \frac{1}{2}(\sum_{i = 1}^n x_i)^2\]
Apply $\tilde{\EE}$ to both sides. We finally use the fact that for a polynomial $p(x)$, we have $\tilde{\EE}[p(x)^2] \ge \tilde{\EE}[p(x)]^2$, which is true because this rearranges to $\tilde{\EE}[(p(x)- \tilde{\EE}[p(x)])^2] \ge 0$, which is true because the left hand side is the pseudo-expectation of a square polynomial, which is nonnegative by definition. This simple fact is essentially saying that the pseudo-variance is nonnegative. Using the linearity of $\tilde{\EE}$, we finally get

\begin{align*}
    O(\sqrt{n}) \tilde{\EE}[\sum_{i = 1}^n x_i] - \frac{1}{2}(\tilde{\EE}[\sum_{i = 1}^n x_i])^2 &\ge O(\sqrt{n}) \tilde{\EE}[\sum_{i = 1}^n x_i] - \frac{1}{2}\tilde{\EE}[(\sum_{i = 1}^n x_i)^2]\\
    &= \tilde{\EE}[O(\sqrt{n}) (\sum_{i = 1}^n x_i) - \frac{1}{2}(\sum_{i = 1}^n x_i)^2]\\
    &= \tilde{\EE}[\sum p_i(x)^2]\\
    &= \sum \tilde{\EE}[p_i(x)^2]\\
    & \ge 0
\end{align*}

Therefore, $\tilde{\EE}[\sum_{i = 1}^n x_i] = O(\sqrt{n})$ like we wanted to show.

This shows that the degree $2$ SoS relaxation certifies an upper bound of $O(\sqrt{n})$ whp on the size of the maximum clique of an Erd\H{o}s\xspace-R\'enyi\xspace random graph. In contrast, the size of the true maximum clique is $(2 + o(1)) \log n$ \cite{matula1976largest}. Despite intense effort, polynomial time algorithms can only detect a planted $k$-clique when $k = \Omega(\sqrt{n})$. Therefore, SoS already achieves the best known guarantees for this problem up to constant factors. It was shown in \cite{BHKKMP16} that higher degree SoS (up to degree $O(\log n)$) doesn't necessarily do much better, which is a SoS lower bound of the type we will study in this work.

\subsubsection{Alternate viewpoints of SoS}

In the polynomial optimization problem of maximizing $p(x)$ subject to the constraints $g_1(x) = 0, \ldots, g_m(x) = 0$, if there does not exist any degree $d$ pseudo-expectation operator $\tilde{\EE}$ satisfying $g_1 = 0, \ldots, g_m = 0$ such that $\tilde{\EE}[p] > c$, then we say that degree $d$ SoS certifies that $\tilde{\EE}[p(x)] \le c$.

A degree $d$ SoS proof that $p(x) \le c$ given $g_1(x) = 0, \ldots, g_m(x) = 0$ is an expression of the form
\[-1 = \sum_{i \le m} g_i(x) q_i(x) + \sum_{i \le a} s_i(x)^2 + (p(x) - c) \sum_{i \le b} t_i(x)^2\]
where $q_1, \ldots, q_m, s_1, \ldots, s_a, t_1, \ldots, t_b$ are polynomials in $x$ such that each term on the right hand side of the above expression has degree at most $d$. Indeed, the existence of such an expression automatically implies that $p(x) \le c$ whenever $g_1(x) = 0, \ldots, g_m(x) = 0$.

When degree $d$ SoS certifies that $\tilde{\EE}[p(x)] \le c$, by duality, this will imply that there exists a degree $d$ SoS proof that $p(x) \leq c$ given $g_1(x) = 0, \ldots, g_m(x) = 0$. The Positivstellensatz of Krivine and Stengle \cite{krivine1964anneaux, stengle1974nullstellensatz} says that for any $c$, either there exists $x$ such that $p(x) > c, g_1(x) = 0, \ldots, g_m(x) = 0$, or there is an SoS proof that $p(x) \le c$ given $g_1(x) = 0, \ldots, g_m(x) = 0$.

For a fixed $d$, degree $d$ SoS can indeed be construed as finding the best $c$ so that there is a degree $d$ SoS proof of $\tilde{\EE}[p(x)] \le c$. This also intuitively explains why higher degree SoS gives tighter relaxations. For most programs stemming from combinatorial optimization problems, degree $n$ SoS usually finds the optimal bound, where $n$ is the number of variables. So, for instance, degree $n$ SoS exactly outputs the size of the maximum clique of a graph. For efficient algorithms, we usually want constant degree SoS.
Therefore, for sum of squares lower bounds, the higher the degree, the stronger the lower bound.
In this work, all our lower bounds are for degree $n^{\varepsilon}$ SoS, which corresponds to subexponential time!

The viewpoint we have studied here is the dual view aka the search for simple proofs, which will suit our purposes. There is also the primal viewpoint where SoS can be viewed directly as a semi-definite programming relaxation of the program. This is sometimes useful for algorithm design.

Similar to the maximum clique application shown above, the SoS hierarchy has been shown formally to obtain the state-of-the-art approximation guarantees for many fundamental problems both in the worst case and the average case setting. This includes constraint satisfaction problems \cite{Raghavendra08}, maximum cut \cite{GW94}, sparsest cut \cite{AroraRV04}, tensor PCA \cite{hopkins2015tensor}, etc.
Therefore, it's natural to study the limits of SoS by studying SoS lower bounds.

Before we discuss SoS lower bounds, we introduce the hypothesis testing framework in more detail, suited to our purposes.

\section{Hypothesis testing}

Let $\Omega$ be a sample space. Let $\nu, \mu$ be probability distributions on $\Omega^n$. The hypothesis testing problem is the problem of distinguishing $\nu, \mu$ given access to a sample. Formally, the input $x \in \Omega^n$ is sampled from either
\begin{itemize}
    \item $H_0$: $x \sim \mu$
    \item $H_1$: $x \sim \nu$.
\end{itemize}
Our objective is to determine which distribution it came from, with high probability. This is the hypothesis testing problem in general, where traditionally, $H_0$ is known as the null hypothesis and $H_1$ the alternate hypothesis. We abuse notation and use $H_0, H_1$ to also denote the probability distributions $\mu, \nu$ respectively.

For example, $H_0$ could be the distribution of Erd\H{o}s\xspace-R\'enyi\xspace random graphs and $H_1$ could be the distribution of Erd\H{o}s\xspace-R\'enyi\xspace random graphs with a large planted clique. Given the graph, we would like to determine which of the two distributions it came from, or in other words, whether it contains a large clique.

A hypothesis test $f$ is a function $f: \Omega^n \rightarrow \{0, 1\}$. Given the input $x$, if $f(x) = 0$, then we report that $x$ came from the null distribution $H_0$ otherwise we report that $x$ came from the alternate distribution $H_1$.

A successful hypothesis test is a test $f$ such that when $b$ is chosen uniformly at random from $\{0, 1\}$ and $x$ is sampled from $H_b$, we have $\mathbb{E}_b \mathrm{Pr}_{x \sim H_b}[f(x) \neq b] \le o(1)$. That is, test $f$ has success probability $1 - o(1)$. Here, for simplicity, we don't distinguish type $1$ and type $2$ errors.

Indeed, for a test to be useful, it should be computable efficiently. When computational efficiency is disregarded, the famous Neyman-Pearson lemma  precisely characterizes the best hypothesis test. To define this test, we need the following standard definition.

\begin{definition}[Likelihood ratio]
    For a given hypothesis testing problem, define the likelihood ratio of an input $x$ to be $LR(x) = \frac{\mathrm{Pr}_{H_1}(x)}{\mathrm{Pr}_{H_0}(x)}$.
\end{definition}

\begin{lemma}[Neyman-Pearson Lemma]
    For a given hypothesis testing problem, the test $f$ that minimizes $\mathbb{E}_b\mathrm{Pr}_{x \sim H_b} [f(x) \neq b]$ is the likelihood ratio test
    \[f(x) = \begin{dcases}
        1 & \text{ if $LR(x) > 1$}\\
        0 & \text{ o.w.}
    \end{dcases}\]
\end{lemma}

In this work, our focus will be on efficiently computable tests $f$.

\subsection{Low degree likelihood ratio}\label{subsec: ldlr}

Consider a given hypothesis testing problem. We focus on a special class of efficiently computable hypothesis tests involving low degree multivariate polynomials. These are termed low-degree distinguishers. We give a brief treatment in this section and refer the readers to \cite{hop18, kunisky2021spectral} for a more detailed treatment.

In this section, for polynomials to be well-defined, assume $\Omega \subseteq {\mathbb R}$. Moreover, assume $H_0$ has finite moments. We will consider distinguishers that arise from multivariate polynomials $f: {\mathbb R}^n \rightarrow {\mathbb R}$. We say that the distinguisher has degree $D$ if the degree of $f$ is at most $D$. Since the output of a polynomial need not be boolean, we need an alternate definition of the success of this distinguisher. We use the following definition from \cite{hop18}.

\begin{definition}[Degree $D$ distinguisher]
    For a hypothesis testing problem, the multivariate polynomial $f$ is a successful degree $D$ distinguisher if
    \begin{itemize}
        \item (Low degree) $f$ is a multivariate polynomial of degree at most $D$.
        \item (Normalization) $\mathbb{E}_{x \sim H_0}[f(x)] = 0, \mathbb{E}_{x \sim H_0} [f(x)^2] = 1$
        \item (Distinguishability) $\mathbb{E}_{x \sim H_1} [f(x)] \rightarrow \infty$ as $n \rightarrow \infty$.
    \end{itemize}
\end{definition}

The normalization ensures appropriate scaling for the polynomial. Note that the normalization is over the null distribution. Informally, normalized $f$ is a successful distinguisher if it attains unbounded values on the alternate distribution in the limit. Indeed, in applications, a hypothesis test may be obtained by appropriately thresholding on the value of the polynomial.

The limit on the degree imposes the kind of computational restrictions we wish to impose on our distinguishing algorithm.
Trying to understand the power of such low-degree distinguishers for hypothesis testing problems is an active area of research.
For instance, we could ask: If degree $O(\log n)$ distinguishers fail for a hypothesis testing problem with input size $n^{O(1)}$, is the problem hard for all polynomial time algorithms?

The first natural question is to ask what's the best degree $D$ distinguisher for a given hypothesis testing problem. This has been answered in prior works and is simply the projection of the likelihood ratio $LR(x) = \frac{\mathrm{Pr}_{H_1}(x)}{\mathrm{Pr}_{H_0}(x)}$ to degree $D$ polynomials.

To make this precise, for $f, g: {\mathbb R}^n \rightarrow {\mathbb R}$, define the inner product $\ip{f}{g} = \mathbb{E}_{x \sim H_0} f(x)g(x)$. Then, we can canonically define the projection $f^{\le D}$ of a function $f$ to degree $D$ polynomials via this inner product. Take an orthonormal basis $\chi_0 = 1, \chi_1, \ldots, \chi_t$ of multivariate polynomials of degree at most $D$ where $\chi_0 = 1$ is the constant function. Then, $f^{\le D}(x) = \sum_{i \le t} \ip{f}{\chi_i}\chi_i(x)$.

The following lemma is implicit in prior works (e.g. \cite{hop17efficient, hopkins2018integrality}). We include a proof for completeness.

\begin{lemma}\label{lem: best_distinguisher}
    For a hypothesis testing problem, the optimal degree $D$ test $f$ that maximizes $\mathbb{E}_{x \sim H_1} f(x)$ is the normalized low-degree likelihood ratio $\frac{LR^{\le D} - 1}{\norm{LR^{\le D} - 1}}$. Moreover, its value is $\mathbb{E}_{x \sim H_1}[f(x)] = \norm{LR^{\le D} - 1}$.
\end{lemma}

\begin{proof}
    Let $f$ be a normalized degree $D$ polynomial with $f = \sum_{i = 0}^t c_i\chi_i$. Then, $c_0 = \mathbb{E}[f] = 0$ and $\sum c_i^2 = \mathbb{E}[f^2] = 1$. Then, \[\mathbb{E}_{x \sim H_1} f(x) = \sum_{1 \le i \le t} c_i \mathbb{E}_{x \sim H_1} \chi_i \le \sqrt{(\sum_{1 \le i \le t} c_i^2)(\sum_{1 \le i \le t} (\mathbb{E}_{x \sim H_1}\chi_i)^2)} = \sqrt{\sum_{1 \le i \le t} (\mathbb{E}_{x \sim H_1}\chi_i)^2}\]
    On the other hand, equality is attained by the polynomial $g = \frac{LR^{\le D} - 1}{\norm{LR^{\le D} - 1}}$. Indeed, we have $\mathbb{E}_{x \sim H_0}[g] = 0$ because $\mathbb{E}_{x \sim H_0}[LR^{\le D}(x)] = \mathbb{E}_{x \sim H_0}[LR(x)] = 1$ and trivially, we have $\mathbb{E}_{x \sim H_0}[g(x)^2] = 1$ since we scaled by the norm. Finally,
    \[\mathbb{E}_{x \sim H_1} g(x) = \frac{1}{\norm{LR^{\le D} - 1}} \sum_{1 \le i \le t} \ip{LR(x)}{\chi_i}^2\]
    We complete the proof by observing that $\ip{LR(x)}{\chi_i} = \mathbb{E}_{x \sim H_0} [LR(x)\chi_i(x)] = \mathbb{E}_{x \sim H_1}[\chi_i(x)]$. Computing the value is straightforward.
\end{proof}

The low-degree likelihood ratio hypothesis \cite{hop17, hop18, kunisky19notes} hypothesizes that if $H_0, H_1$ are \textit{sufficiently nice} distributions, then there is a successful hypothesis test with running time $n^{O(D)}$ if and only if there exists a successful degree $D$ distinguisher. In particular, based on the above discussion, if $\norm{LR^{\le D} - 1} = O(1)$, then we expect that there is no $n^{O(D)}$ time successful hypothesis test.

A main contribution of this work is to provide strong evidence that this conjecture is true for many fundamental problems, by exhibiting strong SoS lower bounds. To see this connection a bit more formally, we will introduce pseudo-calibration and connect it with low-degree distinguishers.

\section{Pseudo-calibration}\label{subsec: pseudocalibration}

Consider an optimization problem we are trying to show SoS lower bounds for.
To obtain SoS integrality gaps on random instances, we need to construct valid pseudo-expectation values for a random input instance of the problem. Naturally, these pseudo-expectation values will depend on the input.

Pseudo-calibration is a heuristic introduced by \cite{BHKKMP16} to construct such candidate pseudo-expectation values almost mechanically by considering a planted distribution supported on instances of the problem with large objective value and using this planted distribution as a guide to construct the pseudo-expectation values. Note here that, for historic reasons, we use the term random distribution instead of null distribution and the term planted distribution instead of alternative distribution.

Unfortunately, pseudo-calibration doesn't guarantee feasibility of these candidate pseudo-expectation values and the corresponding moment matrix and this has to be verified separately for different problems. This verification of feasibility is relatively easy except for the PSDness condition. This is where the main contribution of this work lies, where we analyze the behavior of the constructed random moment matrix.

Indeed for our applications, pseudo-calibration is used to obtain a candidate pseudoexpectation operator $\tilde{\EE}$ and a corresponding moment matrix $\Lambda$
from the random vs planted problem. This will be the starting point for all our applications. Pseudo-calibration gives lower bounds for many problems, such as the ones considered in the works \cite{Grigoriev01, Schoenebeck08, KothariMOW17, chlamtavc2018sherali, mohanty2020lifting}, making it an intriguing but poorly understood technique.

Here, we do not attempt to motivate and describe pseudo-calibration in great detail. Instead, we will briefly describe the heuristic, the intuition behind it and show an example of how to use it. A detailed treatment can be found in \cite{BHKKMP16}.

Let $\nu$ denote the random distribution and $\mu$ denote the planted distribution. Let $v$ denote the input and $x$ denote the variables for our SoS relaxation. The main idea is that, for an input $v$ sampled from $\nu$ and any polynomial $f(x)$ of degree at most the SoS degree, pseudo-calibration proposes that for any low-degree test $g(v)$, the correlation of $\tilde{\EE}[f]$ should match in the planted and random distributions. That is,
\[\mathbb{E}_{v \sim \nu}[\tilde{\EE}[f(x)]g(v)] = \mathbb{E}_{(x, v) \sim \mu}[f(x)g(v)]\]

Here, the notation $(x, v) \sim \mu$ means that in the planted distribution $\mu$, the input is $v$ and $x$ denotes the planted structure in that instance. For example, in planted clique, $x$ would be the indicator vector of the clique. If there are multiple, pick an arbitrary one.

Let ${\mathcal F}$ denote the Fourier basis of polynomials for the input $v$. By choosing different basis functions from ${\mathcal F}$ as choices for $g$ such that the degree is at most some truncation parameter $D$, we get all lower order Fourier coefficients for $\tilde{\EE}[f(x)]$ when considered as a function of $v$. Furthermore, the higher order coefficients are set to be $0$ so that the candidate pseudoexpectation operator can be written as
\[\tilde{\EE} f(x) = \sum_{\substack{g \in {\mathcal F}\\ \deg(g) \le n^{\varepsilon}}} \mathbb{E}_{v \sim \nu}[\tilde{\EE}[f(x)]g(v)] g(v) = \sum_{\substack{g \in {\mathcal F}\\ \deg(g) \le n^{\varepsilon}}} \mathbb{E}_{(x, v) \sim \mu}[[f(x)]g(v)] g(v)\]

The coefficients $\mathbb{E}_{(x, v) \sim \mu}[[f(x)]g(v)]$ can be explicitly computed in many settings, which therefore gives an explicit pseudoexpectation operator $\tilde{\EE}$.

One intuition for pseudo-calibration is as follows. The planted distribution is usually chosen to be a maximum entropy distribution which still has the planted structure. This conforms to the philosophy that random instances are hard for SoS, such as the uniform Bernoulli distribution for planted clique or the Gaussian distribution for Tensor PCA. By conditioning on the lower order moments matching such a planted distribution, pseudo-calibration can be interpreted as sort of interpolating between the random and planted distributions by only looking at lower order Fourier characters. This intuition has proven to be successful, since pseudo-calibration has been successfully exploited to construct SoS lower bounds for a wide variety of dense as well as sparse problems.

An advantage of pseudo-calibration is that this construction automatically satisfies some nice properties that the pseudoexpectation $\tilde{\EE}$ should satisfy. It's linear in $f$ by construction. For all polynomial equalities of the form $f(x) = 0$ that are satisfied in the planted distribution, it's true that $\tilde{\EE}[f(x)] = 0$. For other polynomial equalities of the form $f(x, v) = 0$ that are satisfied in the planted distribution, the equality $\tilde{\EE}[f(x, v)] = 0$ is approximately satisfied. In most cases, $\tilde{\EE}$ can be mildly adjusted to satisfy these exactly.

The condition $\tilde{\EE}[1] = 1$ is not automatically satisfied but in most applications, we usually require that $\tilde{\EE}[1] = 1 \pm \operatorname{o}(1)$. Indeed, this has been the case for all known successful applications of pseudo-calibration. Once we have this, we simply set our final pseudoexpectation operator to be $\tilde{\EE}'$ defined as $\tilde{\EE}'[f(x)] = \tilde{\EE}[f(x)] / \tilde{\EE}[1]$.

We remark that the condition $\tilde{\EE}[1] = 1 \pm \operatorname{o}(1)$ has been quite successful in predicting the right thresholds between approximability and inapproximability~\cite{hop17, hop18, kunisky19notes}. This will be crucial when we connect pseudo-calibration to low degree distinguishers.

\paragraph{Example: Planted Clique}
As a warmup, we review the pseudo-calibration calculation for planted clique. Here, the random distribution $\nu$ is $G(n, \frac{1}{2})$.

The planted distribution $\mu$ is as follows. For a given integer $k$, first sample $G'$ from $G(n, \frac{1}{2})$, then choose a random subset $S$ of the vertices where each vertex is picked independently with probability $\frac{k}{n}$. For all pairs $i, j$ of distinct vertices in $S$, add the edge $(i, j)$ to the graph if not already present. Set $G$ to be the resulting graph.

The input is given by $G \in \{-1, 1\}^{\binom{[n]}{2}}$ where $G_{i, j}$ is $1$ if the edge $(i, j)$ is present and $-1$ otherwise. Let $x_1, \ldots, x_n$ be the boolean variables for our SoS program such that $x_i$ indicates if $i$ is in the clique.

Given a set of vertices $V \subseteq [n]$, define $x_V = \prod_{v \in V}{x_v}$. Given a set of possible edges $E \subseteq \binom{[n]}{2}$, define $\chi_E = (-1)^{|E \setminus E(G)|} = \prod_{(i, j) \in E}G_{i, j}$.

Pseudo-calibration says that for all small $V$ and $E$,
\[
\mathbb{E}_{G \sim \nu}\left[\tilde{\EE}[x_V]\chi_E\right] = \mathbb{E}_{\mu}\left[x_V{\chi_E}\right]
\]
Using standard Fourier analysis, this implies that if we take
\[
c_E = \mathbb{E}_{\mu}\left[x_V{\chi_E}\right] = \left(\frac{k}{n}\right)^{|V \cup V(E)|}
\]
where $V(E)$ is the set of the endpoints of the edges in $E$, then for all small $V$,
\[
\tilde{\EE}[x_V] = \sum_{E:E \text{ is small}}{{c_E}\chi_E} = \sum_{E:E \text{ is small}}{\left(\frac{k}{n}\right)^{|V \cup V(E)|}\chi_E}
\]

Since the values of $\tilde{\EE}[x_V]$ are known, by multi-linearity, this can be naturally extended to obtain values $\tilde{\EE}[f(x)]$ for any polynomial $f$ of degree at most the SoS degree.

Here, we only set the Fourier coefficients for small $E$ and set the other larger Fourier coefficients to $0$. Usually, the choice of the truncation parameter is problem specific but there are some basic requirements \cite{hop17}. We now outline our general strategy to show SoS lower bounds. We employ this in all our results.

\subsection{Strategy to show SoS lower bounds}\label{sec: strategy_for_sos_lower_bounds}

In this work, the general strategy to show SoS lower bounds can be summarized as follows.

\begin{itemize}
	\item Given a random distribution, identify a suitable planted distribution
	\item Pseudocalibrate with respect to the two distributions and obtain a candidate pseudoexpectation operator
    \item Show that the moment matrix satisfies the constraints
\end{itemize}

The most technically challenging part of this approach usually is to show that the moment matrix is positive semidefinite. Much of our contribution lies in this step, where we analyze the behavior of the random moment matrix thus obtained. Now, we connect pseudo-calibration to low-degree distinguishers.

\subsection{Connection to Low-degree distinguishers}

We are ready to connect pseudo-calibration to low-degree tests. Recall that in pseudo-calibration, we set the higher order Fourier coefficients to $0$. This is known as truncation. In particular, we truncate so that the resulting pseudoexpectation has degree at most $D$ in the input. By construction, $\mathbb{E}[\tilde{\EE}[1]] = 1$ and we would like to understand how much $\tilde{\EE}[1]$ deviates from $1$. The following lemma says that the variance of $\tilde{\EE}[1]$ behaves like the squared value of the optimal degree-$D$ distinguisher.

\begin{lemma}\label{lem: pcal_to_ldlr}
    The pseudo-calibrated pseudo-expectation $\tilde{\EE}$, truncated to degree $D$, satisfies
    \[\mathrm{var}(\tilde{\EE}[1]) = \norm{LR^{\le D} - 1}^2\]
\end{lemma}

\begin{proof}
    Pseudocalibration sets $\mathbb{E}_{x \sim H_0}[\tilde{\EE}[1] \chi_i] = \mathbb{E}_{x \sim H_1} [\chi_i]$ for all $i \le t$. Therefore, $\tilde{\EE}[1] = 1 + \sum_{1 \le i \le t} \mathbb{E}_{x \sim H_1} [\chi_i] \chi_i$ giving $\mathrm{var}(\tilde{\EE}[1]) = \sum_{1 \le i \le t} (\mathbb{E}_{x \sim H_1}\chi_i)^2 = \norm{LR^{\le D} - 1}^2$.
\end{proof}

One of the essential steps in our SoS lower bound proofs is to verify, after pseudo-calibration, that $\tilde{\EE}[1]$ is well-behaved. In particular, for strong SoS lower bounds, we expect $\tilde{\EE}[1] = 1 + o(1)$. Although this is not formally necessary, it has often been the case in our applications and we expect it to be necessary for obtaining strong SoS lower bounds via this approach.


But when this is indeed the case and we exhibit SoS lower bounds, note that this is already strong evidence towards the low-degree likelihood ratio hypothesis. In more detail, because of \cref{lem: best_distinguisher} and \cref{lem: pcal_to_ldlr}, the best degree-$D$ distinguisher does not distinguish the two distributions $\mu, \nu$. Our lower bounds affirm that the powerful SoS hierarchy cannot distinguish the two distributions either, which is an important step towards the general hypothesis.

It is an important open problem in this field to prove that for sufficiently nice distributions $\mu, \nu$, after pseudo-calibrating, $\tilde{\EE}[1] = 1 + o(1)$ implies the existence of strong SoS lower bounds.

\section{Why Sum of Squares?}

We briefly remark on the successes of SoS in the last decade, especially in robust machine learning, a branch of machine learning where the underlying dataset is noisy, with the noise being either random or adversarial.
Robust machine learning has gotten a lot of attention in recent years because of its wide variety of use cases in machine learning and other downstream applications, including safety-critical ones like autonomous driving. For example, there has been a high volume of practical works in computer vision \cite{szegedy2013intriguing, goodfellow2014explaining, xie2019feature, hendrycks2021natural, sebe2013robust, xie2020adversarial, fischer2017adversarial, kurakin2016adversarial} and speech recognition \cite{hsu2021robust, wang2022wav2vec, rajendran2022analyzing, ravanelli2020multi, li2015robust, alzantot2018did, neekhara2019universal, olivier2022recent}.
In this important field, SoS has recently led to breakthrough algorithms for
long-standing open problems \cite{bakshi2020robustly, liu2021settling, hopkins2020mean, klivans2018efficient, FKP19, kothari2017outlier, bakshi2020outlier, bakshi2020list, schramm2017fast}. Highlights include
\begin{itemize}
    \item Robustly learning mixtures of high dimensional Gaussians. This is an extremely important problem that has been subjected to intense scrutiny, with a long line of work culminating in \cite{bakshi2020robustly, liu2021settling}.
    \item Efficient algorithms for the fundamental problems of regression \cite{klivans2018efficient}, moment estimation \cite{kothari2017outlier}, clustering \cite{bakshi2020outlier} and subspace recovery \cite{bakshi2020list} in the presence of outliers.
\end{itemize}

Moreover, SoS algorithms are believed to be optimal robust algorithms for many statistical problems. In a different direction, SoS algorithms have led to the design of fast algorithms for problems such as tensor decomposition \cite{hopkins2016fast, schramm2017fast}.

Broadly speaking, due to its ability to capture a wide variety of algorithmic techniques, SoS has become a fundamental tool in algorithms and optimization. It was and still remains an extremely versatile tool for combinatorial optimization \cite{GW94, AroraRV04, GuruswamiS11, raghavendra2017strongly}, but as we saw above, it is also being extensively used in statistics and machine learning (apart from the references above, see also \cite{BarakBHKSZ12, bks15, HopSS15, pot17}). This sets the stage for the rest of this work where we analyze it for various problems of interest stemming from statistics and statistical physics.