
% \begin{thm}(Weyl). 
%     \label{thm: Weyl}
%     $\sigma_{\min }=\sigma_1 \leq \sigma_2 \leq \cdots \leq \sigma_{n-1} \leq \sigma_n=\sigma_{\max }$
%     Let $A, B \in M_n$ be Hermitian and let the respective eigenvalues of $A, B$, and $A+B$ be $\left\{\sigma_i(A)\right\}_{i=1}^n,\left\{\sigma_i(B)\right\}_{i=1}^n$, and $\left\{\sigma_i(A+B)\right\}_{i=1}^n$, each algebraically ordered as in (4.2.1). Then

%     $$
%     \sigma_i(A+B) \leq \sigma_{i+j}(A)+\sigma_{n-j}(B), \quad j=0,1, \ldots, n-i
%     $$
    
%     for each $i=1, \ldots, n$, with equality for some pair $i, j$ if and only if there is a nonzero vector $x$ such that $A x=\sigma_{i+j}(A) x, B x=\sigma_{n-j}(B) x$, and $(A+B) x=\sigma_i(A+B) x$. Also,
    
%     $$
%     \sigma_{i-j+1}(A)+\sigma_j(B) \leq \sigma_i(A+B), \quad j=1, \ldots, i
%     $$
    
%     for each $i=1, \ldots, n$, with equality for some pair $i, j$ if and only if there is a nonzero vector $x$ such that $A x=\sigma_{i-j+1}(A) x, B x=\sigma_j(B) x$, and $(A+B) x=\sigma_i(A+B) x$. If $A$ and $B$ have no common eigenvector, then every inequality in (4.3.2a,b) is a strict inequality.
% \end{thm}

% This means 
% $$
% \sigma_r(A) \leq \sigma_r(B) - \sigma_{n}(A-B)
% $$
% $$
% \sigma_r(A) \geq \sigma_r(B) - \sigma_{n}(B-A)
% $$
% $$
% |\sigma_r(A) - \sigma_r(B)| \leq \|A - B \|_2
% $$
% By consider $M = [0, A;A, 0]$, whose eigenvalues are $\sigma_1(A), \sigma_2(A) \dots \sigma_r(A), 0, \dots, 0, \dots -\sigma_r(A),\dots -\sigma_1(A)$ we have:
% $$
% |\sigma_r(A) - \sigma_r(B)| \leq \| [0, A-B;A-B, 0]  \|_2 \leq \|A - B \|_2
% $$
% \section{Sample Complexity}
% We compare the sample complexities of different algorithms in the provided table.
% \begin{table*}[htbp]
% \centering
% \caption{Sample and Iteration Complexity for Gaussian Low-Rank Matrix Recovery ($d_1=d_2$)}
% \begin{tabular}{c|c|c}
% Method & Sample Complexity & Iteration Complexity \\ 
% \hline\hline
% \makecell{Nuclear Norm Minimization\\\citep{recht_guaranteed_2010}}    & $d_1r$              & —                   \\ 
% \hline
% \makecell{NIHT\\\citep{tanner2013normalized}}    & $d_1r$              & $\log(1/\varepsilon)$                   \\ 
% \hline
% \makecell{Factorized GD\\\citep{stoger_non-convex_2024}}    & $d_1r\kappa^2$      & $\kappa\log(1/\varepsilon)$ \\ 
% \hline
% \makecell{Scaled GD, RGD\\\citep{tong_accelerating_nodate,RGD}} & $d_1r^2\kappa^2$ & $\log(1/\varepsilon)$ \\ 
% \hline
% \makecell{RGD\\(this paper)}    & $d_1r\kappa^2$      & $\log(1/\varepsilon)$ \\ 
% \hline
% \end{tabular}
% \end{table*}
\section{Preliminary Theorems and Lemmas}
In this section, we present some preliminary theorems and lemmas, which are fundamental and will be frequently used in our proofs.  

\subsection{Supporting Theorems}
We begin with Weyl's inequality, which is useful for estimating the singular values of a perturbed matrix.

\begin{thm}[Weyl's inequality]
     \label{thm: Weyl}
Let $\bm{A}, \bm{B} \in \mathbb{R}^{d_1 \times d_2}$ be two matrices with singular values  $\sigma_1(\bm{A}) \geq \sigma_2(\bm{A}) \geq \cdots \geq \sigma_{\min\{d_1,d_2\}}(\bm{A})$ and $\sigma_1(\bm{B}) \geq \sigma_2(\bm{B}) \geq \cdots \geq \sigma_{\min\{d_1,d_2\}}(\bm{B})$. Then for any $i \in [\min\{d_1,d_2\}]$ it holds that:
$$
    |\sigma_i(\bm{A}) - \sigma_i(\bm{B})| \leq \|\bm{A} - \bm{B} \|_2.
$$
\end{thm}

%\begin{thm} [Eckart-Young Theorem]
%    Consider a matrix $\bm{A} \in \mathbb{R}^{d_1 \times d_2}$ with singular value decomposition $\bm{A}=\sum_{i=1}^{\min\{d_1,d_2\}} \sigma_i \bm{u}_i \bm{v}_i^T$, where $\sigma_i$ is the singular values of $\bm{A}$ in descending order. Let  $\widehat{\bm{A}}(k)=\sum_{i=1}^k \sigma_i \bm{u}_i \bm{v}_i^{T}$, it holds that
%    $$
%    \begin{aligned}
%    &\widehat{\bm{A}}(k)  =\underset{\mathrm{rank}(\bm{B})=k}{\arg\min}\|\bm{A}-\bm{B}\|_2 = \underset{\mathrm{rank}(\bm{B})=k}{\arg\min}\|\bm{A}-\bm{B}\|_F\qquad\mbox{and}
%    \qquad\|\bm{A}-\widehat{\bm{A}}(k)\|_2  =\sigma_{k+1}.
%    \end{aligned}
%    $$
 


% \label{Eckart-Young Theorem}
%\end{thm}
The following Bernstein inequality helps control the tail probabilities of certain random events.
\begin{thm}[{\citep[Theorem 2.8.1]{vershynin2018high}}, Bernstein's inequality]\label{thm:Bernstein subexp} Let $X_1, \ldots, X_N$ be independent, mean-zero, sub-exponential random variables. Then, for every $t \geq 0$, we have
    \begin{equation}
    \mathbb{P}\left\{\left|\sum_{i=1}^N X_i\right| \geq t\right\} \leq 2 \exp \left[-c \min \left(\frac{t^2}{\sum_{i=1}^N\left\|X_i\right\|_{\psi_1}^2}, \frac{t}{\max _i\left\|X_i\right\|_{\psi_1}}\right)\right],
    \label{eq: Berstein-compare}
    \end{equation}
    where  $\|\cdot \|_{\psi_1}$ is the sub-exponential norm and $c>0$ is an absolute constant.
\end{thm}

\subsection{Perturbation Bounds for Eigenspace}
For a matrix $\mathbf{Z} \in \domain$ with SVD $\mathbf{Z} = \mathbf{U}_{\mathbf{Z}} \mathbf{\Sigma}_{\mathbf{Z}} \mathbf{V}_{\mathbf{Z}}^{\top}$, we let $\mathbf{U}_{\mathbf{Z}, r} \in \mathbb{R}^{d_1 \times r}$ be the matrix consisting of the first $r$ columns of $\mathbf{U}_{\mathbf{Z}}$, and $\mathbf{U}_{\mathbf{Z}, r, \perp} \in \mathbb{R}^{d_1 \times (d_1-r)}$ be the matrix consisting of the remaining $d_1-r$ columns. The matrices $\mathbf{V}_{\mathbf{Z}, r}$ and $\mathbf{V}_{\mathbf{Z}, r, \perp}$ are defined similarly. The matrix $\mathbf{\Sigma}_{\bm{Z},r}$ is the $r \times r$ diagonal matrix whose diagonal entries are the first $r$ singular values of $\mathbf{Z}$, and $\mathbf{\Sigma}_{\bm{Z},r,\perp}$ is the block of $\mathbf{\Sigma}_{\bm{Z}}$ containing the remaining singular values. The singular values of $\mathbf{Z}$ are arranged in non-increasing order, i.e., $\sigma_1(\mathbf{Z}) \geq \sigma_2(\mathbf{Z}) \geq \ldots \geq \sigma_{\min\{d_1,d_2\}}(\mathbf{Z})$.
For simplicity, we use $\bm{U}_1$ to denote $\bm{U}_{\bm{Z}_1}$, $\bm{U}_{1,r}$ to denote $\bm{U}_{\bm{Z}_1,r}$, and $\bm{U}_{2,r}$ to denote $\bm{U}_{\bm{Z}_2,r}$. Other notations are simplified similarly.

The following lemma bounds the perturbation of the subspace spanned by the first $r$ singular vectors of $\mathbf{Z}_1$ in terms of the spectral gap of $\mathbf{Z}_1$ and the perturbation on the matrix itself:

\begin{lem}[\citep{wedin_perturbation_1972}, Non-symmetric version of Davis-Kahan inequality]
Let $\mathbf{Z}_1$ and $\mathbf{Z}_2 \in \domain$ be two matrices with singular value decompositions
\[
\mathbf{Z}_1 = \begin{bmatrix}
\mathbf{U}_{1,r} & \mathbf{U}_{1,r,\perp}
\end{bmatrix}
\begin{bmatrix}
\mathbf{\Sigma}_{1,r} & \mathbf{0} \\
\mathbf{0} & \mathbf{\Sigma}_{1,r,\perp}
\end{bmatrix}
\begin{bmatrix}
\mathbf{V}_{1,r}^T \\
\mathbf{V}_{1,r,\perp}^T
\end{bmatrix},
\]
and
\[
\mathbf{Z}_2 = \mathbf{Z}_1 + \mathbf{\Delta} = \begin{bmatrix}
\mathbf{U}_{2,r} & \mathbf{U}_{2,r,\perp}
\end{bmatrix}
\begin{bmatrix}
\mathbf{\Sigma}_{2,r} & \mathbf{0} \\
\mathbf{0} & \mathbf{\Sigma}_{2,r,\perp}
\end{bmatrix}
\begin{bmatrix}
\mathbf{V}_{2,r}^T \\
\mathbf{V}_{2,r,\perp}^T
\end{bmatrix},
\]
respectively. If $\sigma_r(\mathbf{Z}_1) > \sigma_{r+1}(\mathbf{Z}_1)$ and 
\begin{equation}
\|\mathbf{Z}_1 - \mathbf{Z}_2\|_2 \leq \left(1 - \frac{1}{\sqrt{2}}\right) \left(\sigma_r(\mathbf{Z}_1) - \sigma_{r+1}(\mathbf{Z}_1)\right),
\label{eq: pertubation 2 norm close}
\end{equation}
then
\[
\max\left\{\left\| \mathbf{U}_{2, r, \perp}^{\top} \mathbf{U}_{1, r} \right\|_F, \left\|\mathbf{V}_{2, r, \perp}^{\top} \mathbf{V}_{1, r} \right\|_F\right\} \leq \frac{\sqrt{2}\left(\left\|\mathbf{U}_{1,r}^T \mathbf{\Delta}\right\|_F + \left\|\mathbf{\Delta} \mathbf{V}_{1,r}\right\|_F\right)}{\sigma_r(\mathbf{Z}_1) - \sigma_{r+1}(\mathbf{Z}_1)}.
\]
\label{lem: Wedin}
\end{lem}

The following lemma bounds the distance between two matrices after applying the thresholding operator $\mathcal{H}_r$, assuming they are sufficiently close. To use this result, we first provide a lower bound on the spectral gap of $\mathbf{Z}_1$ and show that it is large enough compared to both $\|\mathbf{Z}_1 - \mathbf{Z}_2\|_2$ and $\sigma_r(\mathbf{Z}_1)$. 
% This lemma is fundamental and widely applicable. While its proof may exist elsewhere, we prove it by ourselves here.
\begin{lem}
\label{lem: thresholding control}
Let $\bm{Z}_1$ and $\bm{Z}_2$ satisfy the same conditions as in \cref{lem: Wedin}. 
%Let
%\[
%\mathbf{Z}_{1, r} := \mathcal{H}_r(\mathbf{Z}_1) \quad \text{and} \quad \mathbf{Z}_{2, r} := \mathcal{H}_r(\mathbf{Z}_2).
%\]
Assume further that the spectral gap is large enough such that 
\[
s := \sigma_r(\mathbf{Z}_1) - \sigma_{r+1}(\mathbf{Z}_1) \geq \frac{1}{c_0} \sigma_{r+1}(\mathbf{Z}_1)
\]
for some constant $c_0 > 0$. Then, there exist constants $C_1$ and $C_2$ depending only on $c_0$ and satisfying $C_1 \leq C_2 \leq 6c_0 + 10$ such that
\[
\|\mathcal{H}_r(\mathbf{Z}_1) - \mathcal{H}_r(\mathbf{Z}_2)\|_2 \leq C_1 \left(\sigma_r(\mathbf{Z}_1) - \sigma_{r+1}(\mathbf{Z}_1)\right),
\]
and
$$
\begin{aligned}
\|\mathcal{H}_r(\mathbf{Z}_1) - \mathcal{H}_r(\mathbf{Z}_2)\|_F &\leq C_2 \left(\left\|\left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \mathbf{V}_{1, r}\right\|_F + \left\| \mathbf{U}_{1, r}^T \left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \right\|_F\right)\\
&
\leq 2C_2 \left\| \mathbf{Z}_1 - \mathbf{Z}_2 \right\|_F.    
\end{aligned}
$$
%\[\|\mathbf{Z}_{1,r} - \mathbf{Z}_{2, r}\|_F \leq C_2 \left(\left\|\left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \mathbf{V}_{1, r}\right\|_F + \left\| \mathbf{U}_{1, r}^T \left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \right\|_F\right)\leq 2C_2 \left\| \mathbf{Z}_1 - \mathbf{Z}_2 \right\|_F.\]
\end{lem}
\begin{proof}
    Recall that we have defined
    \[
    s = \sigma_r(\mathbf{Z}_1) - \sigma_{r+1}(\mathbf{Z}_1).
    \]
    \label{sec: proof of thresholding control}
    By Weyl's inequality (see \cref{thm: Weyl}) and \cref{eq: pertubation 2 norm close}, it follows that
    \[
    \sigma_r\left(\mathbf{Z}_2\right) - \sigma_{r+1}\left(\mathbf{Z}_2\right) \geq \sigma_r\left(\mathbf{Z}_1\right) - \sigma_{r+1}\left(\mathbf{Z}_1\right) - 2\|\bm{Z}_1 - \bm{Z}_2 \|_2 \geq (\sqrt{2} - 1) s.
    \]
    Therefore, for \( i = 1, 2 \), the rank-\( r \) approximation \(\bm{Z}_{i,r} = \mathcal{H}_r(\bm{Z}_i)\) is uniquely defined, as \(\sigma_r(\bm{Z}_{i}) > \sigma_{r+1}(\bm{Z}_{i})\). Moreover, by Weyl's inequality and \cref{eq: pertubation 2 norm close}, we have
    \[
    |\sigma_{r+1}(\bm{Z}_2)| \leq |\sigma_{r+1}(\bm{Z}_1)| + (1 - 1/\sqrt{2}) s \leq (c_0 + 1 - 1 /\sqrt{2}) s.
    \]
    Let \( c := c_0 + 1 - 1 /\sqrt{2} \), noting that \( c > c_0 \). We then derive the following estimate:
    \begin{equation}
    \begin{aligned}
    \|\bm{Z}_{1,r} - \mathbf{Z}_{2, r}\|_2 &\leq 
    \|\bm{Z}_{1,r} - \mathbf{Z}_{1}\|_2 + \|\bm{Z}_{1} - \mathbf{Z}_{2}\|_2 + \|\bm{Z}_{2} - \mathbf{Z}_{2, r}\|_2 \\
    &\leq |\sigma_{r+1}(\bm{Z}_1)| + |\sigma_{r+1}(\bm{Z}_2)| + 
    (1-1 / \sqrt{2})\left(\left|\sigma_r\left(\mathbf{Z}_1\right)\right| - \left|\sigma_{r+1}\left(\mathbf{Z}_1\right)\right|\right) \\
    &\leq \underbrace{(2c + 1 - 1 / \sqrt{2})}_{C_1} s,
    \end{aligned}
    \label{eq: thresholding control 2}
    \end{equation}
    where the constant \( C_1 \) satisfies \( C_1 \leq 2c_0 + 3 \).

    Let \(\mathbf{Z}_{1, r} = \mathbf{U}_{1,r} \mathbf{\Sigma}_{1,r} \mathbf{V}_{1,r}^{T}\) and \(\mathbf{Z}_{2, r} = \mathbf{U}_{2,r} \mathbf{\Sigma}_{2,r} \mathbf{V}_{2,r}^{T}\) be the compact SVDs of \(\bm{Z}_{1,r}\) and \(\bm{Z}_{2,r}\), respectively. Since
    \[
    \|\bm{Z}_{1,r} - \mathbf{Z}_{2, r}\|_F^2 \leq \|(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\bm{V}_1\|_F^2 = \|(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r}\|_F^2 + \|(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r,\perp}\|_F^2,
    \]
    taking the square root of both sides and using \(\sqrt{a^2 + b^2} \leq |a| + |b|\), we obtain
    \begin{equation}
    \label{eq: (in lem: thresholding control) -- decompose fro norm}
    \|\bm{Z}_{1,r} - \mathbf{Z}_{2, r}\|_F \leq 
    \|(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r}\|_F + \|(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r,\perp}\|_F.
    \end{equation}

    We now estimate the two terms on the right-hand side separately.

    \begin{itemize}
        \item For the first term, we have:
        \[
        \begin{aligned}
        \|(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r}\|_F &= \|(\bm{Z}_{1} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r}\|_F \\
        &\leq \|(\bm{Z}_{1} - \mathbf{Z}_{2})\mathbf{V}_{1,r}\|_F + \|(\bm{Z}_{2} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r}\|_F \\
        &= \|(\bm{Z}_{1} - \mathbf{Z}_{2})\mathbf{V}_{1,r}\|_F + \|\mathbf{U}_{2,r,\perp} \mathbf{\Sigma}_{2,r,\perp} \mathbf{V}_{2,r,\perp}^{T}\mathbf{V}_{1,r}\|_F \\
        &\stackrel{\note{a}}{\leq} \left(1 + \frac{\sqrt{2}|\sigma_{r+1}(\bm{Z}_2)|}{\left|\sigma_r\left(\mathbf{Z}_1\right)\right| - \left|\sigma_{r+1}\left(\mathbf{Z}_1\right)\right|} \right) \left\| \left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \mathbf{V}_{1, r} \right\|_F \\
        &\quad + \frac{\sqrt{2}|\sigma_{r+1}(\bm{Z}_2)|}{\left|\sigma_r\left(\mathbf{Z}_1\right)\right| - \left|\sigma_{r+1}\left(\mathbf{Z}_1\right)\right|} \left\| \mathbf{U}_{1, r}^T \left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \right\|_F \\
        &\leq (1 + \sqrt{2}c) \left\| \left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \mathbf{V}_{1, r} \right\|_F + \sqrt{2}c \left\| \mathbf{U}_{1, r}^T \left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \right\|_F,
        \end{aligned}
        \]
        where step (a) follows from
        \[
        \|\mathbf{U}_{2,r,\perp} \mathbf{\Sigma}_{2,r,\perp} \mathbf{V}_{2,r,\perp}^{T}\mathbf{V}_{1,r}\|_F \leq |\sigma_{r+1}(\bm{Z}_2)| \|\mathbf{V}_{2,r,\perp}^{T}\mathbf{V}_{1,r}\|_F
        \]
        and \cref{lem: Wedin}.

        \item For the second term, we further split it into two parts:
        \[
        \begin{aligned}
        \|(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r,\perp}\|_F &\leq \|\bm{U}_{1,r}^T (\bm{Z}_{1,r} - \mathbf{Z}_{2,r})\bm{V}_{1,r,\perp}\|_F + \|\bm{U}_{1,r,\perp}^T (\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\bm{V}_{1,r,\perp}\|_F \\
        &\leq \|\mathbf{U}_{1,r}^T(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\|_F + \|\bm{U}_{1,r,\perp}^T \mathbf{Z}_{2, r}\bm{V}_{1,r,\perp}\|_F.
        \end{aligned}
        \]
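        The first term on the right-hand side is handled exactly as in the previous item, with the roles of $\mathbf{U}$ and $\mathbf{V}$ exchanged, which gives
        \[
        \|\mathbf{U}_{1,r}^T(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\|_F \leq (1 + \sqrt{2}c) \left\| \mathbf{U}_{1, r}^T \left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \right\|_F + \sqrt{2}c \left\| \left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \mathbf{V}_{1, r} \right\|_F.
        \]
        The last term is estimated as: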
        \[
        \begin{aligned}
        \|\bm{U}_{1,r,\perp}^T \mathbf{Z}_{2, r}\bm{V}_{1,r,\perp}\|_F &= \|\bm{U}_{1,r,\perp}^T \mathbf{U}_{2,r} \mathbf{\Sigma}_{2,r} \mathbf{V}_{2,r}^{T}\bm{V}_{1,r,\perp}\|_F \\
        &\leq \| \bm{U}_{1,r,\perp}^T \mathbf{U}_{2,r} \mathbf{\Sigma}_{2,r} \|_2 \|\mathbf{V}_{2,r}^{T}\bm{V}_{1,r,\perp}\|_F \\
        &= \| \bm{U}_{1,r,\perp}^T \mathbf{U}_{2,r} \mathbf{\Sigma}_{2,r} \mathbf{V}_{2,r}^T\|_2 \|\mathbf{V}_{2,r}^{T}\bm{V}_{1,r,\perp}\|_F \\
        &\stackrel{\note{a}}{=} \|\bm{U}_{1,r,\perp}^T \bm{Z}_{2,r} \|_2 \|\mathbf{V}_{2,r,\perp}^{T}\bm{V}_{1,r}\|_F \\
        &= \|\bm{U}_{1,r,\perp}^T (\bm{Z}_{2,r} - \bm{Z}_{1,r}) \|_2 \|\mathbf{V}_{2,r,\perp}^{T}\bm{V}_{1,r}\|_F \\
        &\leq \|\bm{Z}_{2,r} - \bm{Z}_{1,r} \|_2 \|\mathbf{V}_{2,r,\perp}^{T}\bm{V}_{1,r}\|_F \\
        &\stackrel{\note{b}}{\leq} \frac{\sqrt{2}(2c + 1 - 1 / \sqrt{2})s}{s} \left( \left\|\left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \mathbf{V}_{1, r} \right\|_F + \left\| \mathbf{U}_{1, r}^T\left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \right\|_F \right) \\
        &= \sqrt{2}(2c + 1 - 1 / \sqrt{2}) \left( \left\|\left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \mathbf{V}_{1, r} \right\|_F + \left\| \mathbf{U}_{1, r}^T\left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \right\|_F \right),
        \end{aligned}
        \]
        where step (a) uses \(\|\mathbf{V}_{2,r}^{T}\bm{V}_{1,r,\perp}\|_F = \|\mathbf{V}_{2,r,\perp}^{T}\bm{V}_{1,r}\|_F\) \citep[Lemma 2.5]{chen2021spectral}, and step (b) follows from \cref{eq: thresholding control 2} together with \cref{lem: Wedin}.
    \end{itemize}

    Combining these estimates, we obtain
    \[
    \|\bm{Z}_{1,r} - \mathbf{Z}_{2, r}\|_F \leq C_2 \left( \left\|\left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \mathbf{V}_{1, r} \right\|_F + \left\| \mathbf{U}_{1, r}^T\left(\mathbf{Z}_1 - \mathbf{Z}_2\right) \right\|_F \right),
    \]
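    where \( C_2 = (1 + 2\sqrt{2}c) + \sqrt{2}(2c + 1 - 1 / \sqrt{2}) = 4\sqrt{2}c + \sqrt{2} \leq 6c_0 + 10 \) is a constant. Here the contribution \( 1 + 2\sqrt{2}c \) collects the bound on \(\|(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\mathbf{V}_{1,r}\|_F\) and the bound on \(\|\mathbf{U}_{1,r}^T(\bm{Z}_{1,r} - \mathbf{Z}_{2, r})\|_F\), the latter being obtained by the same argument with the roles of \(\mathbf{U}\) and \(\mathbf{V}\) exchanged.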
\end{proof}

\subsection{Bounds on the Distance Between Projections}

\label{proof: lemmas RGD}
We introduce key lemmas used in the convergence analysis of the RGD algorithm, which have been stated and proved in \citep{RGD}. %The proof is given in \cref{proof: lemmas RGD}. 
The following result bounds the projection distance between the singular vector subspaces of two matrices:
\begin{lem}\label{lem: projection distance}
    Let $\bm{X}_t$ and $\bm{X}$ be two rank-$r$ matrices with compact SVDs $\bm{X}_t = \bm{U}_t \bm{\Sigma}_t \bm{V}_t^T$ and $\bm{X} = \bm{U} \bm{\Sigma} \bm{V}^T$, respectively.
    \begin{enumerate}
        \item The distance between the projection matrices of their singular vector subspaces satisfies the following bounds:
        \[
        \begin{gathered}
        \left\|\bm{U}_t \bm{U}_t^T - \bm{U} \bm{U}^T\right\|_2 \leq \frac{\left\|\bm{X}_t - \bm{X}\right\|_2}{\sigma_{\min}(\bm{X})}, \quad \left\|\bm{V}_t \bm{V}_t^T - \bm{V} \bm{V}^T\right\|_2 \leq \frac{\left\|\bm{X}_t - \bm{X}\right\|_2}{\sigma_{\min}(\bm{X})}; \\
        \left\|\bm{U}_t \bm{U}_t^T - \bm{U} \bm{U}^T\right\|_F \leq \frac{\sqrt{2}\left\|\bm{X}_t - \bm{X}\right\|_F}{\sigma_{\min}(\bm{X})}, \quad \left\|\bm{V}_t \bm{V}_t^T - \bm{V} \bm{V}^T\right\|_F \leq \frac{\sqrt{2}\left\|\bm{X}_t - \bm{X}\right\|_F}{\sigma_{\min}(\bm{X})}.
        \end{gathered}
        \]
        
        \item Let $\mathcal{P}_{\mathbb{T}_t}$ and $\mathcal{P}_{\mathbb{T}}$ be the projection operators onto the tangent spaces of the rank-$r$ matrix manifold at $\bm{X}_t$ and $\bm{X}$, respectively. Then, the following bounds hold:
        \[
        \sup_{\|\bm{Z}\|_2 = 1} \left\|\left(\mathcal{P}_{\mathbb{T}_t} - \mathcal{P}_{\mathbb{T}}\right) \bm{Z} \right\|_2 \leq \frac{2 \left\|\bm{X}_t - \bm{X}\right\|_2}{\sigma_{\min}(\bm{X})}
        \quad
        \text{and}
        \quad
        \sup_{\|\bm{Z}\|_2 = 1} \left\|\left(\mathcal{P}_{\mathbb{T}_t} - \mathcal{P}_{\mathbb{T}}\right) \bm{Z} \right\|_F \leq \frac{2 \sqrt{2} \left\|\bm{X}_t - \bm{X}\right\|_F}{\sigma_{\min}(\bm{X})}.
        \]
    \end{enumerate}
\end{lem}
\begin{proof}
    We prove only the second assertion, as the first assertion is identical to \citep[Lemma 4.2]{RGD}.

    By the definition of $\mathcal{P}_{\mathbb{T}_t}$ and $\mathcal{P}_{\mathbb{T}}$, we have
    \[
    \begin{aligned}
    \left(\mathcal{P}_{\mathbb{T}_t} - \mathcal{P}_{\mathbb{T}}\right) \bm{Z} &= \left(\bm{U}_t \bm{U}_t^T \bm{Z} + \bm{Z} \bm{V}_t \bm{V}_t^T - \bm{U}_t \bm{U}_t^T \bm{Z} \bm{V}_t \bm{V}_t^T\right)- \left(\bm{U} \bm{U}^T \bm{Z} + \bm{Z} \bm{V} \bm{V}^T - \bm{U} \bm{U}^T \bm{Z} \bm{V} \bm{V}^T\right) \\
    &= \left(\bm{U}_t \bm{U}_t^T - \bm{U} \bm{U}^T\right) \bm{Z}\left(\bm{I} - \bm{V}_t \bm{V}_t^T\right)+ \left(\bm{I} - \bm{U} \bm{U}^T\right) \bm{Z}\left(\bm{V}_t \bm{V}_t^T - \bm{V} \bm{V}^T\right).
    \end{aligned}
    \]
    
    Taking the spectral norm on both sides yields:
    \[
    \sup_{\|\bm{Z}\|_2 = 1} \left\|\left(\mathcal{P}_{\mathbb{T}_t} - \mathcal{P}_{\mathbb{T}}\right) \bm{Z} \right\|_2 \leq \|\bm{U}_t \bm{U}_t^T - \bm{U} \bm{U}^T\|_2 + \|\bm{V}_t \bm{V}_t^T - \bm{V} \bm{V}^T\|_2 \leq \frac{2 \left\|\bm{X}_t - \bm{X}\right\|_2}{\sigma_{\min}(\bm{X})}.
    \]
    
    Similarly, taking the Frobenius norm on both sides gives:
    \[
    \sup_{\|\bm{Z}\|_2 = 1} \left\|\left(\mathcal{P}_{\mathbb{T}_t} - \mathcal{P}_{\mathbb{T}}\right) \bm{Z} \right\|_F \leq \|\bm{U}_t \bm{U}_t^T - \bm{U} \bm{U}^T\|_F + \|\bm{V}_t \bm{V}_t^T - \bm{V} \bm{V}^T\|_F \leq \frac{2 \sqrt{2} \left\|\bm{X}_t - \bm{X}\right\|_F}{\sigma_{\min}(\bm{X})}.
    \]
\end{proof}


The following lemma provides second-order information about $\mathbb{M}_r$, the smooth manifold of all rank-$r$ matrices.

\begin{lem}[\citep{RGD}, Lemma 4.1]
    Let $\bm{X}_t\in\mathbb{M}_r$ with compact SVD $\bm{X}_t = \bm{U}_t \bm{\Sigma}_t \bm{V}_t^T$, and let $\mathbb{T}_t$ denote the tangent space of $\mathbb{M}_r$ at $\bm{X}_t$. Let $\bm{X}\in\mathbb{M}_r$ be another rank-$r$ matrix. Then, the following inequalities hold:
    \[
    \left\|\left(\mathcal{I} - \mathcal{P}_{\mathbb{T}_t}\right) \bm{X}\right\|_F \leq \frac{1}{\sigma_{\min}(\bm{X})} \left\|\bm{X}_t - \bm{X}\right\|_2 \left\|\bm{X}_t - \bm{X}\right\|_F \leq \frac{1}{\sigma_{\min}(\bm{X})} \left\|\bm{X}_t - \bm{X}\right\|_F^2,
    \]
    \[
    \left\|\left(\mathcal{I} - \mathcal{P}_{\mathbb{T}_t}\right) \bm{X}\right\|_2 \leq \frac{1}{\sigma_{\min}(\bm{X})} \left\|\bm{X}_t - \bm{X}\right\|_2^2.
    \]
    \label{lem: second order} 
\end{lem}
\begin{proof}
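    Let $\bm{X} = \bm{U} \bm{\Sigma} \bm{V}^T$ be the compact SVD of $\bm{X}$, and let $\mathbb{T}$ denote the tangent space of $\mathbb{M}_r$ at $\bm{X}$, so that $\mathcal{P}_{\mathbb{T}} \bm{X} = \bm{X}$. By the definition of the projection operators $\mathcal{P}_{\mathbb{T}_t}$ and $\mathcal{P}_{\mathbb{T}}$, together with the identities $\left(\bm{I} - \bm{U} \bm{U}^T\right) \bm{X} = \bm{0}$ and $\bm{X}_t \left(\bm{I} - \bm{V}_t \bm{V}_t^T\right) = \bm{0}$, we have: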
    \[
    \begin{aligned}
    \left(\mathcal{I} - \mathcal{P}_{\mathbb{T}_t}\right) \bm{X} &= \left(\mathcal{P}_{\mathbb{T}} - \mathcal{P}_{\mathbb{T}_t}\right) \bm{X} \\
    &= \left(\bm{U} \bm{U}^T - \bm{U}_t \bm{U}_t^T\right) \bm{X} \left(\bm{I} - \bm{V}_t \bm{V}_t^T\right) + \left(\bm{I} - \bm{U} \bm{U}^T\right) \bm{X} \left(\bm{V} \bm{V}^T - \bm{V}_t \bm{V}_t^T\right) \\
    &= \left(\bm{U} \bm{U}^T - \bm{U}_t \bm{U}_t^T\right) \bm{X} \left(\bm{I} - \bm{V}_t \bm{V}_t^T\right) \\
    &= \left(\bm{U} \bm{U}^T - \bm{U}_t \bm{U}_t^T\right) \left(\bm{X} - \bm{X}_t\right) \left(\bm{I} - \bm{V}_t \bm{V}_t^T\right).
    \end{aligned}
    \]
    Taking the spectral and Frobenius norms on both sides and applying \cref{lem: projection distance} completes the proof.
\end{proof}



\section{Proof in Restricted Isometry Property}
For completeness, we provide the proof and relevant references regarding the properties of the Restricted Isometry Property (RIP) in this section.
\begin{proof}[Proof of \cref{lem: RIP ort}]
\label{proof: RIP}
    Assertions 1, 2, and 4 follow directly from a non-symmetric version of \citep[Lemma 2.4]{stoger_non-convex_2024} and \citep[Lemma 4.4]{RGD}. 

    We now prove Assertion 3. Consider the following chain of inequalities:
    \[
    \begin{aligned}
    \sup_{\|\bm{Z}\|_F=1} \left\| \left(\mathcal{P}_{\mathbb{T}_{\bm{X}}} - \mathcal{P}_{\mathbb{T}_{\bm{X}}} \mathcal{A}^* \mathcal{A} \mathcal{P}_{\mathbb{T}_{\bm{X}}} \right)(\bm{Z}) \right\|_F 
    &\stackrel{\note{a}}{=} \sup_{\|\bm{Z}\|_F=1} \left| \left\langle \left(\mathcal{P}_{\mathbb{T}_{\bm{X}}} - \mathcal{P}_{\mathbb{T}_{\bm{X}}} \mathcal{A}^* \mathcal{A} \mathcal{P}_{\mathbb{T}_{\bm{X}}} \right)(\bm{Z}), \bm{Z} \right\rangle \right| \\
    &= \sup_{\|\bm{Z}\|_F=1} \left| \left\| \mathcal{P}_{\mathbb{T}_{\bm{X}}}(\bm{Z}) \right\|_F^2 - \left\| \mathcal{A} \mathcal{P}_{\mathbb{T}_{\bm{X}}}(\bm{Z}) \right\|_2^2 \right| \\
    &\stackrel{\note{b}}{\leq} \sup_{\|\bm{Z}\|_F=1} \delta_{2r} \left\| \mathcal{P}_{\mathbb{T}_{\bm{X}}}(\bm{Z}) \right\|_F^2 \leq \delta_{2r},
    \end{aligned}
    \]
    where:
    \begin{itemize}
        \item Step (a) follows because $\mathcal{P}_{\mathbb{T}_{\bm{X}}} - \mathcal{P}_{\mathbb{T}_{\bm{X}}} \mathcal{A}^* \mathcal{A} \mathcal{P}_{\mathbb{T}_{\bm{X}}}$ is a self-adjoint operator, and the operator norm is expressed in its variational form.
        \item Step (b) holds because $\mathcal{P}_{\mathbb{T}_{\bm{X}}}(\bm{Z})$ has rank at most $2r$, and RIP applies.
    \end{itemize}
    This completes the proof of Assertion 3.
\end{proof}



% \subsection{Proof of lemmas in \cref{sec: key lemmas}}

%\label{sec: key lemmas}


\section{Proofs in Decoupling Technique}
\label{sec: proof of decoupling Technique}


The following lemma describes the properties of the operator $\mathcal{A}_{\tuple}$ and its relationship with $\mathcal{A}$. It follows directly from the definition of $\mathcal{A}_{\tuple}$.

\begin{lem}
    For any matrix $\mathbf{Z} \in \domain$, the following properties hold:
    \begin{equation}
        \begin{aligned}
            &\big(\mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\big)\left(\mathcal{P}_{\net}(\mathbf{Z})\right) = \mathcal{P}_{\net}(\mathbf{Z}), \\
            &\big(\mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\big)\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right) 
            = \left(\mathcal{A}^* \mathcal{A}\right)\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right) - \left\langle \mathcal{A}\left(\net\right), \mathcal{A}\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right) \right\rangle \net, \\
            &\big(\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\big)(\bm{Z}) =
            (\mathcal{A}^* \mathcal{A} - \mathcal{I})\mathcal{P}_{\bm{x}\bm{y}^T}(\bm{Z}) + \left\langle \bm{x}\bm{y}^T, \mathcal{A}^* \mathcal{A}\left(\mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}(\bm{Z})\right) \right\rangle \bm{x}\bm{y}^T.
        \end{aligned}
        \label{eq: AA-A_wA_w}
    \end{equation}
    \label{lem: compute decoupling}
\end{lem}
\begin{proof}
    We prove each assertion separately.

    \textbf{First assertion:} By the definition of $\bm{A}_{i, \tuple}$, we have $\left\langle\bm{A}_{i, \tuple}, \mathcal{P}_{\net}(\mathbf{Z})\right\rangle = 0$. Consequently,
    \[
    \begin{aligned}
        &\left(\mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\right)\left(\mathcal{P}_{\net}(\mathbf{Z})\right) 
        = \frac{1}{m} \sum_{i=1}^m \left\langle\bm{A}_{i, \tuple}, \mathcal{P}_{\net}(\mathbf{Z})\right\rangle \bm{A}_{i, \tuple} + \left\langle\net, \mathbf{Z}\right\rangle \net
        = \left\langle\net, \mathbf{Z}\right\rangle \net.
    \end{aligned}
    \]
    Since $\mathcal{P}_{\net}(\mathbf{Z}) = \left\langle\net, \mathbf{Z}\right\rangle \net$, this establishes the first assertion.

    \textbf{Second assertion:} For the orthogonal projection $\mathcal{P}_{\net}^{\perp}(\mathbf{Z})$, we observe that
    \[
    \begin{aligned}
        \left(\mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\right)\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right)
        &= \frac{1}{m} \sum_{i=1}^m \left\langle\bm{A}_{i, \tuple}, \mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right\rangle \bm{A}_{i, \tuple} + \left\langle\net, \mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right\rangle \net \\
        &= \frac{1}{m} \sum_{i=1}^m \left\langle\bm{A}_{i, \tuple}, \mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right\rangle \bm{A}_{i, \tuple} 
        = \frac{1}{m} \sum_{i=1}^m \left\langle\mathbf{A}_i, \mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right\rangle \bm{A}_{i, \tuple} \\
        &= \frac{1}{m} \sum_{i=1}^m \left\langle\mathbf{A}_i, \mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right\rangle \mathbf{A}_i - \frac{1}{m} \sum_{i=1}^m \left\langle\mathbf{A}_i, \mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right\rangle \left\langle\net, \mathbf{A}_i\right\rangle \net \\
        &= \left(\mathcal{A}^* \mathcal{A}\right)\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right) - \left\langle\mathcal{A}\left(\net\right), \mathcal{A}\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right)\right\rangle \net.
    \end{aligned}
    \]
    This proves the second assertion.

    \textbf{Third assertion:} For the difference $\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}$, we decompose $\bm{Z}$ as $\bm{Z} = \mathcal{P}_{\bm{x}\bm{y}^T}\bm{Z} + \mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}\bm{Z}$. Then,
    \[
    \begin{aligned}
        &(\mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple} - \mathcal{I})\bm{Z} = (\mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple} - \mathcal{I})(\mathcal{P}_{\bm{x}\bm{y}^T}\bm{Z} + \mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}\bm{Z}) \\
        &\stackrel{\note{a}}{=} (\mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple} - \mathcal{I})\mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}\bm{Z} \\
        &\stackrel{\note{b}}{=} (\mathcal{A}^*\mathcal{A} - \mathcal{I})(\mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}\bm{Z}) - \left\langle\mathcal{A}\left(\bm{x}\bm{y}^T\right), \mathcal{A}\left(\mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}(\bm{Z})\right)\right\rangle \bm{x}\bm{y}^T \\
        &= (\mathcal{A}^*\mathcal{A} - \mathcal{I})(\bm{Z} - \mathcal{P}_{\bm{x}\bm{y}^T}\bm{Z}) - \left\langle\mathcal{A}\left(\bm{x}\bm{y}^T\right), \mathcal{A}\left(\mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}(\bm{Z})\right)\right\rangle \bm{x}\bm{y}^T,
    \end{aligned}
    \]
    where (a) follows from the first assertion and (b) follows from the second assertion. Rearranging terms completes the proof of the third assertion.
\end{proof}

Now we can prove \cref{prop: spectral control}, which bounds $\left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\mathbf{X}_{\star} - \bm{X}_t\right)\right\|_2$ using the virtual sequence $\{ \mathbf{X}_{t}^{\tuple} \}_{t \in \mathbb{N}}$.

\begin{proof}[Proof of \cref{prop: spectral control}]
    Let $\bm{\Delta}_t := \target - \bm{X}_t$ and $\Dtxy := \target - \Xtxy$. From the construction of the net $\mathcal{N}$, we have
    \[
    \begin{aligned}
        \left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\bm{\Delta}_t\right)\right\|_2 &= \sup_{\tuple \in \ball} \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\bm{\Delta}_t\right)\right\rangle\right|
        \leq 2 \sup_{\tuple \in \mathcal{N}} \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\bm{\Delta}_t\right)\right\rangle\right|.
    \end{aligned}
    \]

    For every $\tuple \in \mathcal{N}$, applying the triangle inequality yields
    \[
    \begin{aligned}
        &\left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\bm{\Delta}_t\right)\right\rangle\right| \\
        &\leq \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\Dtxy\right)\right\rangle\right| + \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\Dtxy - \bm{\Delta}_t\right)\right\rangle\right| \\
        &\leq \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\Dtxy\right)\right\rangle\right| + \left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\Dtxy - \bm{\Delta}_t\right)\right\|_2 \\
        &\stackrel{\note{a}}{\leq} \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\Dtxy\right)\right\rangle\right| + \delta \left\|\bm{\Delta}_t - \Dtxy\right\|_F \\
        &\leq \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\mathcal{P}_{\net}^{\perp}\left(\Dtxy\right)\right)\right\rangle\right| + \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\mathcal{P}_{\net}\left(\Dtxy\right)\right)\right\rangle\right| + \delta \left\|\bm{\Delta}_t - \Dtxy\right\|_F,
    \end{aligned}
    \]
    where (a) follows from \cref{eq: RIP 2norm} in \cref{lem: RIP ort}, which is a consequence of RIP of $\mathcal{A}$. We now estimate the first two terms in the last line.

    \begin{itemize}
        \item \textbf{Second term:} The second term can be bounded as
        \[
        \begin{aligned}
            &\left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\mathcal{P}_{\net}\left(\Dtxy\right)\right)\right\rangle\right| \\
            & = \left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\net\right\rangle \left\langle\net, \Dtxy\right\rangle\right|
             = \left|\left(\left\|\mathcal{A}\left(\net\right)\right\|_2^2 - 1\right) \left\langle\net, \Dtxy\right\rangle\right| \\
            &\stackrel{\note{a}}{\leq} \delta \left|\left\langle\net, \Dtxy\right\rangle\right|
            \leq \delta \|\Dtxy\|_2
            \leq \delta \sup_{\tuple \in \mathcal{N}} \left\|\bm{\Delta}_t - \Dtxy\right\|_F + \delta \left\|\bm{\Delta}_t\right\|_2,
        \end{aligned}
        \]
        where (a) follows from the definition of the RIP.

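        \item \textbf{First term:} Since $\left\langle\net, \mathcal{P}_{\net}^{\perp}\left(\Dtxy\right)\right\rangle = 0$, the identity part of $\mathcal{A}^* \mathcal{A} - \mathcal{I}$ does not contribute, so it suffices to bound the $\mathcal{A}^* \mathcal{A}$ part. Under the assumption that \cref{eq: A ort uniform} holds, this can be estimated as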
        \[
        \begin{aligned}
            &\left|\left\langle\net, \left(\mathcal{A}^* \mathcal{A}\right)\left(\mathcal{P}_{\net}^{\perp}\left(\Dtxy\right)\right)\right\rangle\right| \\
            &\leq 4 \sqrt{\frac{\complexity}{m}} \left\|\mathcal{A}\left(\mathcal{P}_{\net}^{\perp}\left(\Dtxy\right)\right)\right\|_2 
            \stackrel{\note{a}}{\leq} 8 \sqrt{\frac{\complexity}{m}} \left\|\mathcal{P}_{\net}^{\perp}\left(\Dtxy\right)\right\|_F 
            \leq 8 \sqrt{\frac{\complexity}{m}} \left\|\Dtxy\right\|_F \\
            &\leq 8 \sqrt{\frac{\complexity}{m}} \left\|\bm{\Delta}_t\right\|_F + 8 \sqrt{\frac{\complexity}{m}} \sup_{\tuple \in \mathcal{N}} \left\|\bm{\Delta}_t - \Dtxy\right\|_F \\
            &\stackrel{\note{b}}{\leq} 8 \sqrt{\frac{2 r (\complexity)}{m}} \left\|\bm{\Delta}_t\right\|_2 + 8 \sqrt{\frac{\complexity}{m}} \sup_{\tuple \in \mathcal{N}} \left\|\bm{\Delta}_t - \Dtxy\right\|_F,
        \end{aligned}
        \]
        where (a) follows from the RIP property of $\mathcal{A}$, $\mathrm{rank}(\mathcal{P}_{\net}^{\perp}(\Dtxy)) \leq 2r + 2$, and $1 + \delta_{2r+2} \leq 2$, and (b) follows from $\operatorname{rank}(\bm{\Delta}_t) \leq 2r$.
    \end{itemize}

    Combining all the estimated terms and taking the supremum over $\tuple\in\mathcal{N}$, we obtain the final bound:
    \[
    \begin{aligned}
        \left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\bm{\Delta}_t\right)\right\|_2 &\leq \left(2 \delta + 16 \sqrt{\frac{2 r (\complexity)}{m}}\right) \left\|\bm{\Delta}_t\right\|_2
        + \left(2\delta + 2\delta + 16 \sqrt{\frac{\complexity}{m}}\right) \sup_{\tuple \in \mathcal{N}} \left\|\bm{\Delta}_t - \Dtxy\right\|_F.
    \end{aligned}
    \]
    This completes the proof.
\end{proof}




\section{Proof of Initialization}
\label{proof: initialization}
The proof of \cref{lem: initialization} follows a structure similar to that of \citep[Lemma 4.1]{stoger_non-convex_2024}.

\begin{proof}[Proof of \cref{lem: initialization}]
To prove this lemma, we establish the following two inequalities:
\begin{equation}
\left\|\mathbf{X}_{\star} - \bm{X}_0\right\|_2 \leq \frac{1}{2}\RegionConstant \Std,    
\label{eq: SpectalInit}
\end{equation}
and
\begin{equation}
\left\|\bm{X}_0 - \mathbf{X}_{0}^{\tuple}\right\|_F \leq \frac{1}{2}\RegionConstant \Std, \quad \forall~\tuple \in \mathcal{N}.
\label{eq: x0-xwF}
\end{equation}

First, from \cref{lem: RIP}, with probability at least $1 - \exp(-(\complexity))$, the operator $\mathcal{A}$ satisfies RIP of rank $6r$ with $\delta_{6r} = \delta$ when $m \geq c \delta^{-2} r (\complexity)$, where $c$ is a universal constant. This implies that, with the same probability, $\mathcal{A}$ satisfies RIP of rank $6r$ with constant $\delta = \sqrt{\frac{cr(\complexity)}{m}}$. We choose $m > c r(\complexity)$ to ensure that $\delta < 1$.

%Let $\widetilde{\mathcal{N}} := \mathcal{N}_1 \times \mathcal{N}_2$, where $\mathcal{N}_1$ is an $\varepsilon$-net on $\mathbb{S}^{d_1-1}$ and $\mathcal{N}_2$ is an $\varepsilon$-net on $\mathbb{S}^{d_2-1}$. When $\varepsilon = \frac{1}{4}$, $\widetilde{\mathcal{N}} = \mathcal{N}$ and it has a size of at most $12^{\complexity}$. 
Then, we have
\[
\begin{aligned}
\left\|\left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right) - \mathbf{X}_{\star}\right\|_2 & \leq 2 \sup_{\tuple \in \mathcal{N}} \frac{1}{m} \sum_{i=1}^m \mathbf{x}^{T}\left(\left\langle\mathbf{A}_i, \mathbf{X}_{\star}\right\rangle \mathbf{A}_i - \mathbf{X}_{\star}\right) \mathbf{y} \\
& = 2 \sup_{\tuple \in \mathcal{N}} \frac{1}{m} \sum_{i=1}^m\left(\left\langle\mathbf{A}_i, \mathbf{X}_{\star}\right\rangle \mathbf{x}^{T} \mathbf{A}_i \mathbf{y} - \mathbf{x}^{T} \mathbf{X}_{\star} \mathbf{y}\right).
\end{aligned}
\]
The expectation can be computed as $\mathbb{E}\langle\mathbf{A}_i, \mathbf{X}_{\star}\rangle \mathbf{x}^{T} \mathbf{A}_i \mathbf{y} = \mathbf{x}^{T} \mathbf{X}_{\star} \mathbf{y}$. From \citep{vershynin2018high}, we have
\[
\| \langle\mathbf{A}_i, \mathbf{X}_{\star}\rangle \mathbf{x}^{T} \mathbf{A}_i \mathbf{y} \|_{\psi_1} \leq \|\langle\mathbf{A}_i, \mathbf{X}_{\star}\rangle\|_{\psi_2} \| \mathbf{x}^{T} \mathbf{A}_i \mathbf{y} \|_{\psi_2} \leq K \|\target \|_F,
\]
where $K$ is a universal constant, and therefore the centered version satisfies
\[
\| \langle\mathbf{A}_i, \mathbf{X}_{\star}\rangle \mathbf{x}^{T} \mathbf{A}_i \mathbf{y} - \mathbf{x}^{T} \mathbf{X}_{\star} \mathbf{y} \|_{\psi_1} \leq K \| \mathbf{X}_{\star} \|_{F}.
\]
Applying Bernstein's inequality, we obtain:
\[
\mathbb{P}\left(\left|\frac{1}{m} \sum_{i=1}^m\left(\left\langle\mathbf{A}_i, \mathbf{X}_{\star}\right\rangle \mathbf{x}^{T} \mathbf{A}_i \mathbf{y} - \mathbf{x}^{T} \mathbf{X}_{\star} \mathbf{y}\right)\right| \geq t\right) \leq 2 \exp \left(-C^{\prime} \min \left\{\frac{m t^2}{\left\|\mathbf{X}_{\star}\right\|_F^2}, \frac{m t}{\left\|\mathbf{X}_{\star}\right\|_F}\right\}\right).
\]
Setting $t = C''(\sqrt{\frac{\complexity}{m}} + \frac{\complexity}{m}) \| \mathbf{X}_{\star}\|_F$, the probability is less than $2\exp(-C''C'(\complexity))$ for a fixed pair $(\bm{x},\bm{y})$. Taking a union bound over all $\tuple \in \mathcal{N}$, we obtain
\[
\sup_{\tuple \in \mathcal{N}} \left| \frac{1}{m} \sum_{i=1}^m\left(\left\langle\mathbf{A}_i, \mathbf{X}_{\star}\right\rangle \mathbf{x}^{T} \mathbf{A}_i \mathbf{y} - \mathbf{x}^{T} \mathbf{X}_{\star} \mathbf{y}\right) \right| \leq C''(\sqrt{\frac{\complexity}{m}} + \frac{\complexity}{m}) \| \mathbf{X}_{\star}\|_F
\]
with probability at least $1 - 2\exp((\ln12 - C''C')(\complexity))$. We choose $C''$ sufficiently large such that $\ln12 - C''C' < -4$, ensuring a high success probability. Consequently, with probability at least $1 - 2\exp{(-4(\complexity))}$,
\[
\begin{aligned}
\left\|\left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right) - \mathbf{X}_{\star}\right\|_2 &\leq 2 C''(\sqrt{\frac{\complexity}{m}} + \frac{\complexity}{m}) \| \mathbf{X}_{\star}\|_F
\leq 2 C''(\sqrt{\frac{\complexity}{m}} + \frac{\complexity}{m}) \sqrt{r} \kappa \Std.
\end{aligned}
\]
We choose a proper constant $C_1$ and let $m \geq C_1 \kappa^2 r(\complexity)$ to make the constant before $\Std$ less than or equal to $\min\{\frac{1}{4}c_1, \frac{1}{10}\}$, and then we obtain 
\begin{equation}
\left\|\left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right) - \mathbf{X}_{\star}\right\|_2 \leq \min\big\{\frac{1}{4}c_1, \frac{1}{10}\big\} \Std.
\label{eq: AAX-X}
\end{equation}
This, together with Weyl's inequality, implies that the spectral gap for $\left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right)$ satisfies:
\begin{equation}
s_1 := \sigma_{r}\left(\left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right)\right) - \sigma_{r+1}\left(\left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right)\right) \geq \frac{4}{5} \Std > 0.
\label{eq: gapforAA}
\end{equation}

As a result, $\bm{X}_0 = \mathcal{H}_r(\mathcal{A}^* \mathcal{A}(\mathbf{X}_{\star}))$ is uniquely defined. Using the best rank-$r$ approximation property of $\bm{X}_0$, we obtain
\[
\begin{aligned}
\left\|\mathbf{X}_{\star} - \bm{X}_0\right\|_2 &\leq \left\|\mathbf{X}_{\star} - \left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right)\right\|_2 + \left\|\left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right) - \bm{X}_0\right\|_2 \\
&\leq 2 \left\|\mathbf{X}_{\star} - \left(\mathcal{A}^* \mathcal{A}\right)\left(\mathbf{X}_{\star}\right)\right\|_2.
\end{aligned}
\]
Thus, combining it with \eqref{eq: AAX-X}, we obtain \eqref{eq: SpectalInit}.

From \cref{lem: compute decoupling}, we have 
\begin{equation}
\left(\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\right)\left(\mathbf{X}_{\star}\right) 
= \left\langle\net, \mathbf{X}_{\star}\right\rangle\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\net\right) + \left\langle \bm{x}\bm{y}^T, \mathcal{A}^* \mathcal{A}\left(\mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}(\mathbf{X}_{\star})\right) \right\rangle \bm{x}\bm{y}^T.
\label{eq: AA - AxyAxy in intial}
\end{equation}
Therefore,
\begin{equation}
\begin{aligned}
\left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\right)\left(\mathbf{X}_{\star}\right) \right\|_2 &\leq 
\|\target\|_2 \| \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\net\right)\|_2 
+ \left|\left\langle\mathcal{A}\left(\net\right), \mathcal{A}\left(\mathcal{P}_{\net}^{\perp}\left(\mathbf{X}_{\star}\right)\right)\right\rangle\right| \\
&:= I_1 + I_2.
\end{aligned}
\end{equation}
From \cref{eq: RIP 2norm} in \cref{lem: RIP ort}, it follows that 
\[
I_1 \leq \|\target \|_2 \cdot \delta \leq \kappa \Std \sqrt{\frac{cr(\complexity)}{m}}.
\]
To estimate $I_2$, we use
\[
\left\langle\mathcal{A}\left(\net\right), \mathcal{A}\left(\mathcal{P}_{\net}^{\perp}\left(\mathbf{X}_{\star}\right)\right)\right\rangle = \frac{1}{m} \sum_{i=1}^m\left\langle\net, \mathbf{A}_i\right\rangle\left\langle\mathbf{A}_i, \mathcal{P}_{\net}^{\perp}\left(\mathbf{X}_{\star}\right)\right\rangle.
\]
Here, $\sum_{i=1}^m\left\langle\net, \mathbf{A}_i\right\rangle\left\langle\mathbf{A}_i, \mathcal{P}_{\net}^{\perp}\left(\mathbf{X}_{\star}\right)\right\rangle$ is a sum of $m$ independent sub-exponential random variables with mean zero due to the rotation invariance of the Gaussian measure. Each term has sub-exponential norm at most $K\left\|\mathbf{X}_{\star}\right\|_F$ for some universal constant $K$. Applying Bernstein's inequality, we obtain that for each fixed $\tuple$, with probability at least $1 - \exp (-4 (\complexity))$,
\[
I_2 = \left|\left\langle\mathcal{A}\left(\net\right), \mathcal{A}\left(\mathcal{P}_{\net}^{\perp}\left(\mathbf{X}_{\star}\right)\right)\right\rangle\right| \leq c_2 \kappa \sigma_{\min }\left(\mathbf{X}_{\star}\right) \sqrt{r}\left(\sqrt{\frac{\complexity}{m}} + \frac{\complexity}{m}\right),
\]
where $c_2$ is a constant depending only on $K$.
Taking a union bound over all $\tuple \in \mathcal{N}$ and combining $I_1$ and $I_2$, we obtain that, with probability at least $1 - \exp (- (\complexity))$, 
\begin{equation}
\left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\right)\left(\mathbf{X}_{\star}\right)\right\|_2 \leq c_3 \kappa \sigma_{\min }\left(\mathbf{X}_{\star}\right) \sqrt{r}\left(\sqrt{\frac{\complexity}{m}} + \frac{\complexity}{m}\right), \quad \forall~\tuple \in \mathcal{N}.
\label{eq: dist AA-AAxy}
\end{equation}
By choosing a proper $C_2$ and letting $m \geq C_2 \kappa^2 r (\complexity)$, \eqref{eq: dist AA-AAxy} implies
\begin{equation}
\left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\right)\left(\mathbf{X}_{\star}\right)\right\|_2 \leq \frac{4(1 - 1/\sqrt{2})}{5} \Std \leq (1 - 1/\sqrt{2}) s_1,
\label{eq: original dist}
\end{equation}
where in the last inequality we have used \eqref{eq: gapforAA}. Furthermore, by using \eqref{eq: gapforAA} and \eqref{eq: AAX-X}, we obtain
\[
c_0 := \frac{\sigma_{r+1}(\mathcal{A}^* \mathcal{A}(\target))}{s_1} \leq \frac{\frac{1}{10}}{\frac{4}{5}} \leq 1.
\]
Applying \cref{lem: thresholding control} to $\bm{Z}_1 := \mathcal{A}^* \mathcal{A}(\target)$ and $\bm{Z}_2 := \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}(\target)$ and noticing that $\bm{X}_0 = \mathcal{H}_r(\bm{Z}_1)$ and $\mathbf{X}_{0}^{\tuple} = \mathcal{H}_r(\bm{Z}_2)$, we obtain 
\begin{equation}
\begin{aligned}
\| \bm{X}_0 - \mathbf{X}_{0}^{\tuple} \|_2 &\leq 16 \left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\right)\left(\mathbf{X}_{\star}\right)\right\|_2 \\
&\leq c_3' \kappa \sigma_{\min }\left(\mathbf{X}_{\star}\right) \sqrt{r}\left(\sqrt{\frac{\complexity}{m}} + \frac{\complexity}{m}\right),
\end{aligned}
\end{equation}
where we have used \eqref{eq: dist AA-AAxy} in the last inequality, and
\begin{equation}
\begin{aligned}
&\| \bm{X}_0 - \mathbf{X}_{0}^{\tuple} \|_F \\
& \leq 16 \left( \|(\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple})\left(\mathbf{X}_{\star}\right) \bm{V}_{1,r} \|_F + \|\bm{U}_{1,r}^T (\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple})(\mathbf{X}_{\star}) \|_F \right) \\
& \stackrel{\note{a}}{\leq} 16 \left|\left\langle\net, \mathbf{X}_{\star}\right\rangle \right| \left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\net\right)\bm{V}_{1,r}\right\|_F 
+ 16 \left|\left\langle\net, \mathbf{X}_{\star}\right\rangle\right| \left\|\bm{U}_{1,r}^T \left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\net\right)\right\|_F \\
&\qquad + 16 \left|\left\langle\mathcal{A}\left(\net\right), \mathcal{A}\left(\mathcal{P}_{\net}^{\perp}\left(\mathbf{X}_{\star}\right)\right)\right\rangle\right| \|\net\bm{V}_{1,r} \|_F + 16 \left|\left\langle\mathcal{A}\left(\net\right), \mathcal{A}\left(\mathcal{P}_{\net}^{\perp}\left(\mathbf{X}_{\star}\right)\right)\right\rangle\right| \|\bm{U}_{1,r}^T\net \|_F \\
&\stackrel{\note{b}}{\leq} 64 c \kappa \sigma_{\min }\left(\mathbf{X}_{\star}\right) \sqrt{r}\left(\sqrt{\frac{\complexity}{m}} + \frac{\complexity}{m}\right),
\end{aligned}
\label{eq: X - Xxy F}
\end{equation}
where (a) follows from \cref{eq: AA - AxyAxy in intial}, and (b) follows from \cref{eq: RIP V} and \cref{eq: RIP w ort} in \cref{lem: RIP ort}, $\|\bm{U}_{1,r}\|_2 \leq 1$, and $\|\bm{V}_{1,r}\|_2 \leq 1$.
We choose a proper constant $C_2' > C_2$ and let $m \geq C_2' \kappa^2 r(\complexity)$ to ensure
that the last term in \cref{eq: X - Xxy F} is not greater than $\frac{1}{2}\RegionConstant\Std$, and thus \cref{eq: x0-xwF} holds.

Throughout the proof, we have imposed several lower bounds on $m$. We then take their maximum, i.e., $m \geq C \kappa^2 r(\complexity)$ with $C = \max \{c, C_1, C_2' \}$, to complete the proof.
\end{proof}


\section{Proofs in Convergence Analysis}
This section presents the proof of \cref{lem: first T steps}, a key result in our analysis. Unlike the corresponding argument for factorized gradient descent in \citep{stoger_non-convex_2024}, our proof requires analyzing the projection of the gradient onto the tangent space of $\bm{X}_t$, which relies on \cref{lem: projection distance} and \cref{lem: second order}. 
Additionally, the use of a hard-thresholding operator after the gradient step introduces errors that are bounded using \cref{lem: thresholding control}. 

\label{proof: convergence}
\begin{proof}[Proof of \cref{lem: first T steps}]
From the assumption of this lemma,
\begin{equation}
\RegionConstant < \frac{1}{1000},
\label{eq: c1}
\end{equation}
and $\mathcal{A}$ satisfies the RIP of rank $6r$ with
\begin{equation}
 \delta = \delta_{6r} \leq   \frac{1}{24}\RegionConstant < 1.  
 \label{eq: delta 6r}
\end{equation}
Besides, \cref{eq: spectral control} holds with this $\delta$ for $t \leq T \leq \Maxiter$.

We prove this lemma by induction. The assumption \cref{eq: E0} of this lemma gives $E_0\leq c_1\Std$. Assume that
\[
E_0\leq c_1\Std, \quad E_1\leq (1000 c_1)c_1\Std,\quad\cdots,\quad E_t\leq (1000 c_1)^t c_1\Std.
\]
We will need to show that $E_{t+1}\leq (1000 c_1)^{t+1}c_1\Std$, i.e.,
\[
\| \bm{X}_{t+1} - \target \|_2 + \sup_{\tuple \in \mathcal{N}} \| \bm{X}_{t+1} - \mathbf{X}_{t+1}^{\tuple} \|_F\leq c_1 (1000c_1)^{t+1}\Std.
\]

For this purpose, we estimate
%$$\| \bm{X}_t - \target \|_2\leq c_1 \Std\qquad\mbox{and}\qquad\sup_{\tuple \in \mathcal{N}}\| \bm{X}_t - \Xtxy \|_F \leq c_1 \Std.$$ Then we control 
$\| \bm{X}_{t+1} - \target \|_2$ and $\sup_{\tuple \in \mathcal{N}}\|\bm{X}_{t+1} - \mathbf{X}_{t+1}^{\tuple} \|_F$, respectively. Notice that the inductive assumption $E_t\leq (1000 c_1)^t c_1\Std$ implies
\begin{equation}\label{eq: inductive assumpution of Et}
\| \bm{X}_{t} - \target \|_2\leq c_1 \Std\qquad\mbox{and}\qquad \sup_{\tuple \in \mathcal{N}}\|\bm{X}_{t} - \Xtxy\|_F\leq c_1 \Std.   
\end{equation}


\textbf{Estimate $\|\bm{X}_{t+1} - \target\|_2$.}
We first compute $\| \bm{W}_t-\target\|_2$. By decomposing $\target-\bm{X}_t$ onto $\mathbb{T}_t$ and $\mathbb{T}_t^{\perp}$, we obtain
\[
\begin{aligned}  &\| \bm{W}_t-\target\|_2 \\
&= \left\|\left(\mathcal{I}-\mathcal{P}_{\mathbb{T}_t} \mathcal{A}^{*} \mathcal{A}\right)\left(\target-\bm{X}_t\right)\right\|_2 \\
& \leq  \left\|\left(\mathcal{I}-\mathcal{P}_{\mathbb{T}_t}\right)\left(\target-\bm{X}_t\right)\right\|_2 + \left\|\mathcal{P}_{\mathbb{T}_t}\left(\mathcal{I} - \mathcal{A}^{*} \mathcal{A} \right)\left(\target-\bm{X}_t\right)\right\|_2 \\
&  \stackrel{\note{a}}{\leq} \frac{1}{\sigma_{\min }(\target)}\left\|\bm{X}_t- \target\right\|_2^2 + \left\|\mathcal{P}_{\mathbb{T}_t}\left(\mathcal{I} - \mathcal{A}^{*} \mathcal{A} \right)\left(\target-\bm{X}_t\right)\right\|_2  \\
& \stackrel{\note{b}}{\leq} \left( \RegionConstant + 3\left(16 \sqrt{\frac{2 r (\complexity)}{m}} +2 \delta\right) \right)
\left\|\bm{X}_t- \target\right\|_2 
 + 12\left(\delta+4 \sqrt{\frac{\complexity}{m}}\right) \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_t - \Xtxy\right\|_F \\
& \leq  \left( \frac{3}{2}\RegionConstant + 48 \sqrt{\frac{2 r (\complexity)}{m}}  \right)
\left\|\bm{X}_t- \target\right\|_2  
 + \left( \frac{1}{2}\RegionConstant + 48 \sqrt{\frac{\complexity}{m}}\right) \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_t - \Xtxy\right\|_F , 
\end{aligned}
\]
where (a) follows from \cref{lem: second order}, and (b) follows from \cref{eq: spectral control} in \cref{prop: spectral control}, the first inequality in \cref{eq: inductive assumpution of Et}, and $\sup_{\|\bm{Z} \|_2 = 1} \left\|\mathcal{P}_{\mathbb{T}_t} \bm{Z} \right\|_2 \leq 3$. We choose a proper constant $C'$ and let $m \geq C' \kappa^2 r(\complexity)$ so that the coefficients of $\|\bm{X}_t- \target\|_2$ and $\sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_t - \Xtxy\right\|_F$ above are both smaller than $2 \RegionConstant$. Then we have:
\begin{equation}\label{eq: Wt-X* in spectral norm}
    \left\|\target-\bm{W}_t\right\|_2 \leq 2 \RegionConstant \left( \left\|\bm{X}_{t}-\target\right\|_2
    +  \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_t - \Xtxy\right\|_F  \right) \leq \RegionConstant  \sigma_{\min }(\target),
\end{equation}
where in the last inequality we used the fact that $2\RegionConstant < 1$ and the inductive assumption.
This, together with Weyl's inequality, implies that $\sigma_r(\bm{W}_t) \geq (1 - \RegionConstant) \Std > \RegionConstant \Std \geq  \sigma_{r+1}(\bm{W}_t)$ and 
\begin{equation}
    s:= \sigma_r(\bm{W}_t) - \sigma_{r+1}(\bm{W}_t) \geq (1 - 2 \RegionConstant) \Std > 0,
    \label{eq: spectral gap} 
\end{equation}
i.e., the spectral gap of $\bm{W}_t$ is positive.
Then, $\bm{X}_{t+1} = \mathcal{H}_r(\bm{W}_t)$ is uniquely defined, which is the best rank-$r$ approximation to $\bm{W}_t$. Therefore,
\begin{equation}
    \begin{aligned}
\left\|\bm{X}_{t+1}-\target\right\|_2& 
\leq\left\|\bm{X}_{t+1}-\bm{W}_t\right\|_2+\left\|\bm{W}_t-\target\right\|_2
 \leq  2 \left\|\bm{W}_t-\target\right\|_2\\
& \leq 4 \RegionConstant \left( \left\|\bm{X}_{t}-\target\right\|_2
+  \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_t - \Xtxy\right\|_F  \right),
    \end{aligned}
\label{eq: X_t+1 - X*}
\end{equation}
where in the last inequality we have used \eqref{eq: Wt-X* in spectral norm}.

\textbf{Estimate $\|\bm{X}_{t+1} - \mathbf{X}_{t+1}^{\tuple}\|_F$.} 
Since $\bm{X}_{t+1}=\mathcal{H}_r(\bm{W}_t)$ and $\bm{X}_{t+1}^{\tuple}=\mathcal{H}_r(\bm{W}_t^{\tuple})$, applying \cref{lem: thresholding control},  we can upper bound $\| \bm{X}_{t+1} -  \mathbf{X}_{t+1}^{\tuple}  \|_F$ by $\|\bm{W}_t - \Wtxy\|_F$. We first bound $\|\bm{W}_t - \Wtxy\|_F$ by
\begin{equation}
\begin{aligned}
    &\| \bm{W}_{t} -  \Wtxy  \|_F \\
    &= 
    \| (\bm{X}_t - \mathcal{P}_{\mathbb{T}_{t}}\mathcal{A}^*\mathcal{A}(\bm{X}_t - \target))  - (\Xtxy - \PTtxy\mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple}(\Xtxy - \target))                          \|_F \\
& \leq \|(\mathcal{I} - \mathcal{P}_{\mathbb{T}_{t}}) \Xtxy \|_F +  
\|\mathcal{P}_{\mathbb{T}_{t}}\left(\bm{X}_t - \mathcal{A}^*\mathcal{A}(\bm{X}_t - \target) - \Xtxy + \mathcal{A}^*\mathcal{A}(\Xtxy - \target) \right) \|_F \\
&\qquad + \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{A}^*\mathcal{A} - \mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple})(\Xtxy - \target) \|_F
+ \|(\mathcal{P}_{\mathbb{T}_{t}} - \PTtxy) \mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple}(\Xtxy - \target)  \|_F\\
&:= I_1 + I_2 + I_3 + I_4.
\end{aligned}
\label{eq: error for W_t - W_tw}
\end{equation}
We estimate the four terms respectively.
\begin{itemize}
\item Bounding $I_1$. $I_1$ is a second-order term about $\|\bm{X}_t- \Xtxy\|_F$. Indeed, \cref{lem: second order} implies
\begin{equation}\label{eq: Lemma 12 implies I1}
I_1 \leq \frac{1}{\sigma_{\min }(\Xtxy)}\|\bm{X}_t- \Xtxy\|_F^2.    
\end{equation}

We need to derive a lower bound for $\sigma_{\min }(\bm{X}_{t})$ and $\sigma_{\min }(\Xtxy)$ respectively. From Weyl's inequality and the inductive assumption \eqref{eq: inductive assumpution of Et}, we have
\[
\sigma_{\min }(\bm{X}_{t}) \geq \Std - \left\|\bm{X}_{t} - \target\right\|_2 \geq \Std - c_1 \Std \geq (1 - \RegionConstant) \Std
\]
and
\begin{equation}
\sigma_{\min }(\Xtxy) \geq \sigma_{\min }(\bm{X}_{t}) - \left\|\bm{X}_{t} - \Xtxy\right\|_F \geq   (1 - 2\RegionConstant) \Std.
\label{eq: lowerbound for stdxy}
\end{equation}
Plugging it in \eqref{eq: Lemma 12 implies I1} gives
\begin{equation}
\begin{aligned}
    I_1 &\leq \frac{1}{(1 - 2\RegionConstant) \Std}\left\|\bm{X}_t- \Xtxy\right\|_F^2 \leq \frac{\RegionConstant}{1 - 2 \RegionConstant} \| \bm{X}_t - \Xtxy \|_F \leq
    2 \RegionConstant \| \bm{X}_t - \Xtxy \|_F,
    \label{eq: I1}
\end{aligned}
\end{equation}
where we have used the inductive assumption \eqref{eq: inductive assumpution of Et} in the second inequality and \eqref{eq: c1} in the last inequality.

\item Bounding $I_2$. We estimate $I_2$ by projecting $\bm{X}_t- \Xtxy$ onto $\mathbb{T}_t$ and $\mathbb{T}_t^{\perp}$ respectively as follows:
\begin{equation}
\begin{aligned}
I_2 &= \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{I} - \mathcal{A}^*\mathcal{A})(\bm{X}_t- \Xtxy) \|_F \\
& \leq \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{I} - \mathcal{A}^*\mathcal{A})\mathcal{P}_{\mathbb{T}_{t}}(\bm{X}_t- \Xtxy) \|_F + \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{I} - \mathcal{A}^*\mathcal{A})(\mathcal{I} - \mathcal{P}_{\mathbb{T}_{t}})(\bm{X}_t- \Xtxy) \|_F \\
& \stackrel{\note{a}}{\leq} \delta_{2r}
\| \bm{X}_t- \Xtxy \|_F + \delta_{3r}\|(\mathcal{I} - \mathcal{P}_{\mathbb{T}_{t}}) (\bm{X}_t- \Xtxy )\|_F\\
& \stackrel{\note{b}}{\leq} \delta_{2r} \| \bm{X}_t- \Xtxy \|_F + \delta_{3r} \frac{1}{\sigma_{\min }(\Xtxy)}\left\|\bm{X}_t- \Xtxy\right\|_F^2 \\
& \stackrel{\note{c}}{\leq} \left(\delta_{2r} + \frac{\RegionConstant\delta_{3r}}{1 - 2 \RegionConstant}  \right) \| \bm{X}_t- \Xtxy \|_F \\
& \stackrel{\note{d}}{\leq}  2 \RegionConstant \| \bm{X}_t- \Xtxy \|_F,
\end{aligned}
\label{eq: I2}
\end{equation}
where:
\begin{itemize}
\item step (a) follows from the properties 3 and 4 of RIP in \cref{lem: RIP ort},
\item step (b) follows from 
\cref{lem: second order},
\item step (c) follows from
the inductive assumption \cref{eq: inductive assumpution of Et} and \cref{eq: lowerbound for stdxy},
\item step (d) follows from $\RegionConstant < \frac{1}{1000}$ and $\delta < \frac{1}{24}\RegionConstant$.
\end{itemize}


\item Bounding $I_3$. For $I_3$, we denote $\bm{\Delta}_t := \bm{X}_t - \target$ and  $\Dtxy := \Xtxy - \target$. Then, $I_3$ is estimated as follows:
\begin{equation}
\label{eq: I3}
\begin{aligned}
I_3 &= \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{A}^*\mathcal{A} - \mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple})(\Xtxy - \target) \|_F \\
& \stackrel{\note{a}}{\leq} \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{A}^*\mathcal{A} - \mathcal{I}) \langle \bm{x}\bm{y}^T,\Dtxy \rangle \bm{x}\bm{y}^T  \|_F + \|\langle\mathcal{A}\left(\bm{x}\bm{y}^T\right), \mathcal{A}(\mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}(\Dtxy)) \rangle \mathcal{P}_{\mathbb{T}_{t}}(\bm{x}\bm{y}^T) \|_F\\
& \leq \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{A}^*\mathcal{A} - \mathcal{I}) \langle \bm{x}\bm{y}^T,\Dtxy \rangle \bm{x}\bm{y}^T  \|_F + \big| \langle\mathcal{A}\left(\bm{x}\bm{y}^T\right), \mathcal{A}(\mathcal{P}_{\bm{x}\bm{y}^T}^{\perp}(\Dtxy)) \rangle \big|\\
& \stackrel{\note{b}}{\leq} \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{A}^*\mathcal{A} - \mathcal{I}) \langle \bm{x}\bm{y}^T,\Dtxy \rangle \bm{x}\bm{y}^T  \|_F \\
& + \left(8 \sqrt{\frac{2 r (\complexity)}{m}}\left\|\bm{\Delta}_t\right\|_2+8 \sqrt{\frac{\complexity}{m}} \sup _{\tuple \in \mathcal{N}}\left\|\bm{\Delta}_t-\bm{\Delta}_{t}^{\tuple}\right\|_F\right)\\
& \leq \| \Dtxy \|_2  \left(   \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{A}^*\mathcal{A} - \mathcal{I})\mathcal{P}_{\mathbb{T}_{t}}\bm{x}\bm{y}^T \|_F +  \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{A}^*\mathcal{A})(\mathcal{I} -\mathcal{P}_{\mathbb{T}_{t}})\bm{x}\bm{y}^T \|_F \right) \\ &+ 
 \left(8 \sqrt{\frac{2 r (\complexity)}{m}}\left\|\bm{\Delta}_t\right\|_2 +8 \sqrt{\frac{\complexity}{m}} \sup _{\tuple \in \mathcal{N}}\left\|\bm{\Delta}_t-\bm{\Delta}_{t}^{\tuple}\right\|_F\right) \\
& \stackrel{\note{c}}{\leq} \left(8 \sqrt{\frac{2 r (\complexity)}{m}}+ \delta_{2r} + \delta_{3r}  \right)\left\|\bm{\Delta}_t\right\|_2 
\\
&+\left(8 \sqrt{\frac{\complexity}{m}} + \delta_{2r} + \delta_{3r} \right) \sup _{\tuple \in \mathcal{N}}\left\|\bm{\Delta}_t-\bm{\Delta}_{t}^{\tuple}\right\|_F \\
&\stackrel{\note{d}}{\leq}\left(8 \sqrt{\frac{2 r (\complexity)}{m}}+ \frac{1}{12} \RegionConstant \right)\left\|\bm{\Delta}_t\right\|_2 
\\
&+\left(8 \sqrt{\frac{\complexity}{m}} + \frac{1}{12} \RegionConstant \right) \sup _{\tuple \in \mathcal{N}}\left\|\bm{\Delta}_t-\bm{\Delta}_{t}^{\tuple}\right\|_F,
\end{aligned}
\end{equation}
where:
\begin{itemize}
    \item step (a) follows from \eqref{eq: AA-A_wA_w} in \cref{lem: compute decoupling},
    \item step (b) follows from \eqref{eq: A ort uniform} in \cref{lem: orthogonality preserve without rip uniform},
    \item step (c) from $\| \Dtxy \|_2 \leq \| \bm{\Delta}_{t} \|_2 + \| \bm{\Delta}_{t} - \Dtxy \|_F$ and \cref{lem: RIP},
    \item step (d) from $\RIPnorm\leq\RIPort\leq\delta\leq\frac{1}{24}c_1$ by assumption \eqref{eq: delta 6r}.
\end{itemize}



We further denote the upper bound for $I_3$ in the last inequality as $I_3'$, that is
\[
I_3 \leq I_3' := \left(8 \sqrt{\frac{2 r (\complexity)}{m}}+ \frac{1}{12} \RegionConstant \right)\left\|\bm{\Delta}_t\right\|_2 
+\left(8 \sqrt{\frac{\complexity}{m}} + \frac{1}{12} \RegionConstant \right) \sup _{\tuple \in \mathcal{N}}\left\|\bm{\Delta}_t-\bm{\Delta}_{t}^{\tuple}\right\|_F.
\]


\item Bounding $I_4$. We estimate $I_4$ as follows:
\begin{equation}
\begin{aligned}
I_4 & \leq \| (\mathcal{P}_{\mathbb{T}_{t}} - \PTtxy) \mathcal{A}^*\mathcal{A}(\Xtxy - \target)  \|_F +  \|\mathcal{P}_{\mathbb{T}_{t}}(\mathcal{A}^*\mathcal{A} - \mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple})(\Xtxy - \target) \|_F \\
&+ \|\PTtxy(\mathcal{A}^*\mathcal{A} - \mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple})(\Xtxy - \target) \|_F \\
& \stackrel{\note{a}}{\leq} \| (\mathcal{P}_{\mathbb{T}_{t}} - \PTtxy) \mathcal{A}^*\mathcal{A}(\Xtxy - \target)  \|_F + 2I_3'\\
& \leq \| (\mathcal{P}_{\mathbb{T}_{t}} - \PTtxy) (\mathcal{A}^*\mathcal{A}-\mathcal{I}) (\Xtxy - \target)  \|_F + \| (\mathcal{P}_{\mathbb{T}_{t}} - \PTtxy)  (\Xtxy - \target)  \|_F  + 2I_3'\\
& \stackrel{\note{b}}{\leq} \frac{ 4\sqrt{2}}{\sigma_{\min }(\bm{X}_t)}\left\|\bm{X}_t- \Xtxy\right\|_F  \left( \| (\mathcal{A}^*\mathcal{A}-\mathcal{I})(\Dtxy)  \|_2 
+   \| \Dtxy \|_2  \right) + 2I_3' \\ 
& \stackrel{\note{c}}{\leq} \frac{4\sqrt{2} \RegionConstant}{1 - 2\RegionConstant} \left( \| (\mathcal{A}^*\mathcal{A}-\mathcal{I})(\Dtxy)  \|_2 
+   \| \Dtxy \|_2  \right) + 2I_3'\\
& \leq \frac{4\sqrt{2} \RegionConstant}{1 - 2\RegionConstant}
\left( \| \bm{\Delta}_{t}\|_2 + \| \bm{\Delta}_{t} - \Dtxy\|_F + \| (\mathcal{A}^*\mathcal{A}-\mathcal{I})\bm{\Delta}_t  \|_2 + \| (\mathcal{A}^*\mathcal{A}-\mathcal{I})(\bm{\Delta}_t - \Dtxy)  \|_2   \right) + 2I_3' \\
&\stackrel{\note{d}}{\leq} \frac{4\sqrt{2} \RegionConstant}{1 - 2\RegionConstant}  \left(1 + 16 \sqrt{\frac{2 r (\complexity)}{m}} + 2 \delta \right)\| \bm{\Delta}_{t}\|_2 \\
&+ \frac{4\sqrt{2} \RegionConstant}{1 - 2\RegionConstant}\left(1 + 4\delta+16 \sqrt{\frac{\complexity}{m}} + \RIPop \right)\|\bm{\Delta}_t - \Dtxy\|_F + 2I_3' \\
& \stackrel{\note{e}}{\leq}
 4\sqrt{2}  \times \frac{500}{499} \times  \left(1 + 16 \sqrt{\frac{2 r (\complexity)}{m}} + \frac{1}{500} \right) \RegionConstant\| \bm{\Delta}_{t}\|_2 \\
&+ 4\sqrt{2}  \times \frac{500}{499} \times\left(1 + 16 \sqrt{\frac{\complexity}{m}} + \frac{1}{200} \right) \RegionConstant\|\bm{\Delta}_t - \Dtxy\|_F + 2I_3',
\end{aligned}
\label{eq: I4}
\end{equation}
where:
\begin{itemize}
    \item step (a) holds because $\|\PTtxy(\mathcal{A}^*\mathcal{A} - \mathcal{A}_{\tuple}^*\mathcal{A}_{\tuple})(\Xtxy - \target) \|_F$ can be estimated in the same way as $I_3$ in \cref{eq: I3}, so that each of the last two terms is bounded by $I_3'$,
    \item step (b) follows from \cref{lem: projection distance},
    \item step (c) follows from \cref{eq: inductive assumpution of Et} and \cref{eq: lowerbound for stdxy},
    \item step (d) follows from \cref{prop: spectral control} and $\| (\mathcal{A}^*\mathcal{A}-\mathcal{I})(\bm{\Delta}_t - \Dtxy)  \|_2 \leq \delta_{r+2} \|\bm{\Delta}_t - \Dtxy \|_F$,
    \item step (e) follows from $\delta< \RegionConstant < \frac{1}{1000}$.
\end{itemize}
\end{itemize}

We choose a proper constant $C''$ and let $m \geq C'' \kappa^2 r(\complexity)$ so that the coefficients of $\|\bm{\Delta}_t - \Dtxy\|_F$ and $\|\bm{\Delta}_t\|_2$ in the last line of \cref{eq: I3} are both smaller than $2 \RegionConstant$, and those in the last line (excluding $2I_3'$) of \cref{eq: I4} are both smaller than $8 \RegionConstant$. As a result, $I_3  \leq I_3' \leq 2\RegionConstant E_t$, and $I_4 \leq 8 \RegionConstant E_t + 2I_3' \leq 12 \RegionConstant E_t$. Besides, we have $I_1 \leq 2 \RegionConstant E_t$ by \cref{eq: I1} and $I_2 \leq 2 \RegionConstant E_t$ by \cref{eq: I2}. Substituting all these in \cref{eq: error for W_t - W_tw} gives:
\begin{equation}
    \begin{aligned}
    \| \bm{W}_{t} -  \Wtxy  \|_F &\leq (2 \cdot 3+ 12) \RegionConstant \left( \left\|\bm{X}_{t}-\target\right\|_2 + \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t} - \Xtxy\right\|_F  \right) \leq \RegionConstant \Std,
    \end{aligned}
    \label{eq: error for W_t - W_twCombined}
\end{equation}
where we use $18 \RegionConstant <1$ and the inductive assumption \cref{eq: inductive assumpution of Et} in the last inequality.

To estimate $\| \bm{X}_{t+1} -  \mathbf{X}_{t+1}^{\tuple}  \|_F$, we check the validity of \cref{lem: thresholding control}.
First, from \cref{eq: spectral gap} and \cref{eq: error for W_t - W_twCombined}, 
\[
\|\bm{W}_t - \Wtxy\|_2 \leq  \|\bm{W}_t - \Wtxy\|_F \leq \frac{\RegionConstant}{1 - 2\RegionConstant} \left(\sigma_r(\bm{W}_t) - \sigma_{r+1}(\bm{W}_t) \right) \stackrel{\cref{eq: c1}}{<} (1 - 1/\sqrt{2})s. 
\]
Second, we define $c_0 := \frac{\RegionConstant}{1 - 2 \RegionConstant}$, and we have $\sigma_{r+1}(\bm{W}_t) \leq c_0 s$ by \cref{eq: spectral gap}. Then all conditions in \cref{lem: thresholding control} are met, and therefore it implies that: there exists a constant $C_2 $ that is only related to $\RegionConstant$ such that
\begin{equation}
\begin{aligned}
    \| \bm{X}_{t+1} -  \mathbf{X}_{t+1}^{\tuple}  \|_F &\leq  2C_2 \|\bm{W}_t - \Wtxy\|_F \leq 2(\frac{6\RegionConstant}{1 - 2 \RegionConstant} + 10) \|\bm{W}_t - \Wtxy\|_F\\
    &%\stackrel{32 \times 18 < 996}{\leq} 
     \leq 996 \RegionConstant \left( \left\|\bm{X}_{t}-\target\right\|_2 + \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t} - \Xtxy\right\|_F  \right),
\end{aligned}
\label{eq: error for X_t+1 - X_t+1_w}
\end{equation}
where we use \cref{eq: error for W_t - W_twCombined} in the last inequality. Summing up \cref{eq: error for X_t+1 - X_t+1_w} and \cref{eq: X_t+1 - X*} gives
\begin{equation}
\begin{aligned}
    \| \bm{X}_{t+1} - \target \|_2 + \sup_{\tuple \in \mathcal{N}} \| \bm{X}_{t+1} - \mathbf{X}_{t+1}^{\tuple} \|_F
    %\leq (1000 \RegionConstant)^t c_1 \Std .  
&\leq 1000 \RegionConstant (\| \bm{X}_{t} - \target \|_2 + \sup_{\tuple \in \mathcal{N}} \| \bm{X}_{t} - \Xtxy \|_F )\\
&\leq \RegionConstant (1000 \RegionConstant)^{t+1} \Std,
\label{eq: contraction t to t+1}
\end{aligned}
\end{equation}
where in the last inequality we have used the inductive hypothesis $E_t \leq (1000 \RegionConstant)^{t} \RegionConstant \Std$.


Throughout the proof, we have imposed two lower bounds on $m$. We then take their maximum, i.e., $m \geq C \kappa^2 r(\complexity)$ with $C = \max \{C', C''\}$, to complete the proof.
\end{proof}



% \section{Proof in Convergence Analysis}
% From the assumption of this lemma,
% \begin{equation}
% \RegionConstant < \frac{1}{1000},
% \label{eq: c1}
% \end{equation}
% and we have that $\mathcal{A}$ satisfies the Restricted Isometry Property (RIP) of rank $6r$ with 
% \begin{equation}
%  \delta = \delta_{6r} \leq \frac{1}{24}\RegionConstant < 1.  
%  \label{eq: delta 6r}
% \end{equation}
% Additionally, \cref{eq: spectral control} holds with this $\delta$ for $t \leq T \leq \Maxiter$.

% We prove this theorem by induction. The assumption \cref{eq: E0} of this lemma gives $E_0 \leq c_1\Std$. Assume that
% \[
% E_0 \leq c_1\Std, \quad E_1 \leq (1000 c_1)c_1\Std, \quad \cdots, \quad E_t \leq (1000 c_1)^t c_1\Std.
% \]
% We need to show that $E_{t+1} \leq (1000 c_1)^{t+1}c_1\Std$, i.e.,
% \[
% \| \bm{X}_{t+1} - \target \|_2 + \sup_{\tuple \in \mathcal{N}} \| \bm{X}_{t+1} - \mathbf{X}_{t+1}^{\tuple} \|_F \leq c_1 (1000c_1)^{t+1}\Std.
% \]

% To this end, we estimate $\| \bm{X}_{t+1} - \target \|_2$ and $\sup_{\tuple \in \mathcal{N}}\|\bm{X}_{t+1} - \mathbf{X}_{t+1}^{\tuple} \|_F$ separately. Notice that the inductive assumption $E_t \leq (1000 c_1)^t c_1\Std$ implies
% \begin{equation}\label{eq: inductive assumption of Et}
% \| \bm{X}_{t} - \target \|_2 \leq c_1 \Std \quad \text{and} \quad \sup_{\tuple \in \mathcal{N}}\|\bm{X}_{t} - \Xtxy\|_F \leq c_1 \Std.   
% \end{equation}


For completeness, we also include the proof of \cref{lem: afterT}, which was established in prior work given the initialization $\mathcal{H}_r(\mathcal{A}^*(\bm{b}))$ \citep[Theorem 2.2]{RGD}.
%but is reproduced here for self-containment. 
We slightly modify the proof and show that whenever $\|\bm{X}_T-\target\|_F$ is sufficiently small, RGD will converge linearly to $\target$.

\begin{proof}[Proof of \cref{lem: afterT}]
The proof follows the same structure as that of \citet[Theorem 2.2]{RGD}. Since $\bm{X}_{t+1} = \mathcal{H}_r(\bm{W}_t)$ is the best rank-$r$ approximation to $\bm{W}_t$ and $\target$ has rank at most $r$, we have
\[
\| \bm{X}_{t+1} - \target\|_F \leq \left\|\bm{X}_{t+1} - \bm{W}_t\right\|_F + \left\|\bm{W}_t - \target\right\|_F \leq 2 \left\|\bm{W}_t - \target\right\|_F.
\]
Substituting $\bm{W}_t = \bm{X}_t + \mathcal{P}_{\mathbb{T}_t} \mathcal{A}^{*} \mathcal{A}\left(\target - \bm{X}_t\right)$ into the above inequality yields:
\[
\begin{aligned} 
    \| \bm{X}_{t+1} - \target\|_F
    &\leq 2 \left\|\left(\mathcal{I} - \mathcal{P}_{\mathbb{T}_t} \mathcal{A}^{*} \mathcal{A}\right)\left(\target - \bm{X}_t\right)\right\|_F \\
    &\leq 2 \left\|\left(\mathcal{I} - \mathcal{P}_{\mathbb{T}_t}\right)\left(\target - \bm{X}_t\right)\right\|_F + 2 \left\|\mathcal{P}_{\mathbb{T}_t}\left(\mathcal{I} - \mathcal{A}^{*} \mathcal{A}\right)\left(\target - \bm{X}_t\right)\right\|_F \\
    &\stackrel{\note{a}}{\leq} \frac{2}{\sigma_{\min}(\target)}\left\|\bm{X}_t - \target\right\|_F^2 + 2 \left\|\mathcal{P}_{\mathbb{T}_t}\left(\mathcal{I} - \mathcal{A}^{*} \mathcal{A}\right) \mathcal{P}_{\mathbb{T}_t} \left(\target - \bm{X}_t\right)\right\|_F \\
    &\quad + 2 \left\|\mathcal{P}_{\mathbb{T}_t}\left(\mathcal{I} - \mathcal{A}^{*} \mathcal{A}\right)(\mathcal{I} - \mathcal{P}_{\mathbb{T}_t}) \left(\target - \bm{X}_t\right)\right\|_F \\
    &\stackrel{\note{b}}{\leq} \frac{2}{\sigma_{\min}(\target)}\left\|\bm{X}_t - \target\right\|_F^2 + 2 \RIPnorm \| \target - \bm{X}_t \|_F + 2 \RIPort \left\|\target - \bm{X}_t\right\|_F \\
    &\leq \left(2 \dfrac{\|\target - \bm{X}_t\|_F}{\Std} + 4c_2\right) \left\|\target - \bm{X}_t\right\|_F,
\end{aligned} 
\]
where (a) follows from \cref{lem: second order}, and (b) follows from \cref{lem: RIP ort} and the inequalities $\RIPnorm \leq \RIPort \leq \delta_{6r} \leq c_2$.

Define 
\[
\gamma_t = 2 \dfrac{\|\target - \bm{X}_t\|_F}{\Std} + 4c_2.
\]
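By the condition \cref{eq: init_F_norm}, we have $\gamma_T \leq 6c_2 < 1$. The remainder of the proof proceeds by induction. Assume $\gamma_k \leq 6c_2$ for $k = T, T+1, \dots, t$. Then, applying $\| \bm{X}_{k+1} - \target\|_F \leq \gamma_k \| \bm{X}_{k} - \target\|_F$ for $k = T, T+1, \dots, t$, we have
\[
\| \bm{X}_{t+1} - \target\|_F \leq (6c_2)^{t+1-T} \|\bm{X}_T - \target\|_F \leq \|\bm{X}_T - \target\|_F \leq c_2 \Std.
\]
Therefore, $\gamma_{t+1} \leq 6c_2$. By induction, we conclude that $\gamma_t \leq 6c_2$ for all $t \geq T$, and hence $\| \bm{X}_{t+1} - \target\|_F \leq 6c_2 \| \bm{X}_{t} - \target\|_F$ for all $t \geq T$, i.e., the iterates converge linearly to $\target$.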
\end{proof}



