\section{Generating a Spectral Sparsifier via TensorSketch}
\label{sec:tensorsketch}

Tensor-type sketching has been widely used in a variety of problems~\cite{swz19_tensor,dssw18,djs+19,akk+20,swyz21,szz21,sxz22,z22,syz23_sdp}. Section~\ref{sec:tensorsketch:ose} presents the definition of oblivious subspace embedding. In Section~\ref{sub:tensorsketch:TensorSRHT}, we give an overview of $\mathsf{TensorSRHT}$ and introduce its basic property. In Section~\ref{sub:tensorsketch:TensorSparse}, we present the definition and the basic property of $\mathsf{TensorSparse}$. In Section~\ref{sub:tensorsketch:Sketching}, we introduce the fast approximation of the Hessian via sketching.

\subsection{Oblivious Subspace Embedding}
\label{sec:tensorsketch:ose}

We first recall the definition of an oblivious subspace embedding.
\begin{definition}[Oblivious subspace embedding, \cite{s06}]
We define an $(\epsilon,\delta,d,n)$-oblivious subspace embedding (\textsf{OSE}) as follows. Let $\Pi$ be a distribution on $m \times n$ matrices $S$, where $m$ is a function of $n, d, \epsilon$, and $\delta$. We say that $\Pi$ is an $(\epsilon,\delta,d,n)$-\textsf{OSE} if, for
any fixed $n \times d$ orthonormal basis $U$, with probability at least $1 - \delta$, a matrix $S$ drawn from the distribution $\Pi$ has the property that the singular values of $SU$ lie in the range $[1-\epsilon,1+\epsilon]$.
\end{definition}




\subsection{TensorSRHT}
\label{sub:tensorsketch:TensorSRHT}

We define a well-known family of sketching matrices called {\sf TensorSRHT}~\cite{ldfu13,akk+20}. It has been used in many optimization works~\cite{swyz21,szz21,sxz22}.
\begin{definition}[Tensor subsampled randomized Hadamard transform (\textsf{TensorSRHT}) \cite{akk+20,swyz21}]\label{def:tensor_srht}
The $\mathsf{TensorSRHT}$ $S: \R^n \times \R^n \to \R^m$ is defined as 
\begin{align*}
S := \frac{1}{\sqrt{m}} P \cdot (HD_1 \otimes HD_2),
\end{align*}
where each row of $P \in \{0, 1\}^{m \times n^2}$ contains only one $1$ at a random coordinate, so one can view $P$ as a sampling matrix. $H$ is an $n \times n$ Hadamard matrix, and $D_1$, $D_2$ are two independent $n \times n$ diagonal matrices whose diagonal entries are independent Rademacher random variables (uniform in $\{-1, 1\}$).
\end{definition}

It is known~\cite{akk+20} that {\sf TensorSRHT} matrices satisfy the {\sf OSE} property.
\begin{lemma}[\cite{akk+20,swyz21}; see, for example, Lemma 2.12 in~\cite{swyz21}]\label{lem:tensor_srht}
    Let $S$ be a {\sf TensorSRHT} matrix defined in Definition~\ref{def:tensor_srht}. If 
    \begin{align*}
        m=O(\epsilon^{-2}d^2 \log^3(nd / (\epsilon\delta)) ),
    \end{align*}
    then $S$ is an $(\epsilon, \delta, d^2, n^2)$-{\sf OSE} for degree-$2$ tensors. 

    Further, for matrices $A_1,A_2 \in \R^{n \times d}$, $S(A_1 \otimes A_2)$ can be computed in $\wt{O}(nd + md^2)$ time.
\end{lemma}

\subsection{TensorSparse}
\label{sub:tensorsketch:TensorSparse}


\cite{sxz22} define {\sf TensorSparse} by composing the sparse embedding~\cite{nn13,c16} with the tensor operation~\cite{p13}.
\begin{definition}[{\sf TensorSparse}, see Definition~7.6 in \cite{sxz22}]\label{def:tensor_sparse}
Let $h_1,h_2:[n] \times [s]\rightarrow [m/s]$ be $O(\log 1/\delta)$-wise independent hash functions and let $\sigma_1,\sigma_2:[n ]\times [s]\rightarrow \{\pm 1\}$ be $O(\log 1/\delta)$-wise independent random sign functions. Then, the degree two tensor sparse transform, $S:\R^n \times \R^n \rightarrow \R^m$ is given as: 
\begin{align*}
    S_{r,(i,j)} = & ~ \exists k\in [s]: \sigma_1(i,k)\sigma_2(j,k)/\sqrt{s}\cdot {\bf 1}[ ((h_1(i,k)+h_2(j,k))~\text{mod}~(m/s))+(k-1)m/s=r].
\end{align*}
\end{definition}

\begin{lemma}[Theorem 7.10 in \cite{sxz22}]\label{lem:tensor_sparse}
Let $\epsilon\in (0,1)$ be the precision parameter and $\delta\in (0,1)$ be the failure probability. Let $S \in \R^{m \times n^2}$ be a ${\sf TensorSparse}$ matrix (Definition~\ref{def:tensor_sparse}). If $m=\Omega(\epsilon^{-2} d^2 \log(n/\delta))$ and $s=\epsilon^{-1} \log(n/\delta)$, then {\sf TensorSparse} provides an $(\epsilon,\delta,d^2,n^2)$-{\sf OSE}.

Further, for matrices $A_1,A_2 \in \R^{n \times d}$, $S(A_1 \otimes A_2)$ can be computed in $O( (\nnz(A_1) + \nnz(A_2) ) s + m d^2)$ time.
\end{lemma}

\subsection{Fast Approximation for Hessian via Sketching}
\label{sub:tensorsketch:Sketching}

In this section, we present the fast approximation of the Hessian via sketching.

\begin{lemma}\label{lem:compute_hessian_approximate}
If the following conditions hold
\begin{itemize}
    \item Let $A_1 \in \R^{n \times d}$, let $A_2 \in \R^{n \times d}$
    \item Let $\A = (A_1 \otimes A_2) \in \R^{n^2 \times d^2}$
    \item Let $W\in \R^{n \times n}$ denote a positive diagonal matrix 
    \item Let $\ov{A}_1 = W A_1$
    \item Let $\ov{\A} = (\ov{A}_1 \otimes A_2) \in \R^{n^2 \times d^2}$
\end{itemize}
Then, we have
\begin{itemize}
    \item {\bf Part 1.} 
    \begin{align*}
    \A^\top (W^2 \otimes I_n) \A = \ov{\A}^\top \ov{\A}
    \end{align*}
    \item {\bf Part 2.} For any constant $\epsilon \in (0,0.1)$ and any $\delta \in (0,1)$, there is an algorithm that runs in $\wt{O}(nd + d^4)$ time to compute $S \ov{\A}$ such that
    \begin{align*}
        (1-\epsilon) \cdot \ov{\A}^\top \ov{\A} \preceq \ov{\A}^\top S^\top S \ov{\A} \preceq (1+\epsilon) \cdot \ov{\A}^\top \ov{\A}
    \end{align*}
    holds with probability $1-\delta$.
    \item {\bf Part 3.} For any $\epsilon \in (0,0.1)$ and any $\delta \in (0,1)$, there is an algorithm that runs in $\wt{O}(\nnz(A_1) + \nnz(A_2) + d^4)$ time to compute $S \ov{\A}$ such that
    \begin{align*}
        (1-\epsilon) \cdot \ov{\A}^\top \ov{\A} \preceq \ov{\A}^\top S^\top S \ov{\A} \preceq (1+\epsilon) \cdot \ov{\A}^\top \ov{\A}
    \end{align*}
    holds with probability $1-\delta$.
\end{itemize}
\end{lemma}
\begin{proof}

{\bf Proof of Part 1.}

We can show
\begin{align*}
    \A^\top (W^2 \otimes I_n) \A 
    = & ~ \A^\top (W \otimes I_n) \cdot (W \otimes I_n) \A \\
    = & ~  ((W \otimes I_n) (A_1 \otimes A_2) )^\top  \cdot ((W \otimes I_n) (A_1 \otimes A_2)  ) \\
    = & ~ (\ov{A}_1 \otimes A_2 )^\top ( \ov{A}_1 \otimes A_2 ) \\
    = & ~ \ov{\A}^\top \ov{\A}
\end{align*}
where the first step follows from $(W^2 \otimes I_n) = (W \otimes I_n) \cdot (W \otimes I_n)$ (by the mixed-product property of $\otimes$), the second step follows from the definition of $\A$ and the fact that $W \otimes I_n$ is symmetric (since $W$ is a diagonal matrix),
the third step follows from the mixed-product property $(W \otimes I_n)(A_1 \otimes A_2) = (W A_1) \otimes A_2$ and the definition of $\ov{A}_1$, and the last step follows from the definition of $\ov{\A}$.







{\bf Proof of Part 2.}

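It follows from applying Lemma~\ref{lem:tensor_srht} to the matrices $\ov{A}_1$ and $A_2$ with $m = \wt{O}(d^2)$ (recall that $\epsilon$ is a constant), so that $S \ov{\A} = S(\ov{A}_1 \otimes A_2)$ can be computed in $\wt{O}(nd + md^2) = \wt{O}(nd + d^4)$ time.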

{\bf Proof of Part 3.}

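It follows from applying Lemma~\ref{lem:tensor_sparse} to the matrices $\ov{A}_1$ and $A_2$, using the fact that $\nnz(\ov{A}_1) = \nnz(A_1)$ since $W$ is a positive diagonal matrix.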
\end{proof}