


\section{Theoretical Analysis}
\label{sec: Proof main}

In this section, we prove \cref{thm: RGDmain}. We begin by introducing the Restricted Isometry Property (RIP), which is commonly used in prior analyses. Next, we highlight the primary theoretical challenge that introduces the $r^2$ term in the sample complexity. To address this issue, we present the key decoupling technique, inspired by \citep{stoger_non-convex_2024}. Following this, we provide the necessary supporting lemmas and conclude with the proof of the main theorem based on these results.
%\subsection{}
\subsection{Restricted Isometry Property}
The Restricted Isometry Property (RIP) is a fundamental tool in the analysis of low-rank matrix recovery problems, particularly under random Gaussian measurements. This property ensures that a measurement operator approximately preserves the geometry of low-rank matrices, which is crucial for analyzing the performance of various recovery algorithms. We introduce the definition and properties of the Restricted Isometry Property (RIP), which plays a crucial role in our analysis.

\begin{defn}
    The linear measurement operator $\mathcal{A}: \domain \rightarrow \mathbb{R}^m$ satisfies the Restricted Isometry Property (RIP) of rank $r$ with RIP-constant $\delta_r\in(0,1)$ if it holds that
    \[
    \begin{split}
    \left(1 - \delta_r\right)\|\mathbf{Z}\|_F^2\leq \|\mathcal{A}(\mathbf{Z})&\|_2^2 \leq \left(1 + \delta_r\right)\|\mathbf{Z}\|_F^2,\cr
    &\forall~\mathbf{Z} \in \domain~:~\mathrm{rank}(\mathbf{Z}) \leq r.
    \end{split}
    \]
\end{defn}

The RIP is a uniform result, as it holds for all low-rank matrices rather than just specific matrices of interest, such as $\mathbf{X}_t - \mathbf{X}_{\star}$. The RIP is widely used in the theoretical analysis of matrix sensing problems. If $m = \Omega(r (d_1 + d_2))$, then the measurement operator $\mathcal{A}$ satisfies the RIP of order $r$ with high probability. 
The results from {\citep[Lemma 3.1]{candes_tight_nodate_RIP_nonsymmetric} and \citep[Lemma 2.2]{stoger_non-convex_2024}} directly extend to rectangular matrices:
% \begin{lem}[{\citep[Theorem 2.3]{candes_tight_nodate_RIP_nonsymmetric}}]
\begin{lem}
    \label{lem: RIP}
    Let $\mathcal{A}: \domain \rightarrow \mathbb{R}^m$ be a Gaussian measurement operator as described above. Then, $\mathcal{A}$ satisfies the RIP of rank $r$ with constant $\delta_r = \delta \leq 1$ with probability at least $1 - \varepsilon$ when
    \[
    m \geq C \delta^{-2} \left(r (d_1 + d_2) + \log \left(2 \varepsilon^{-1}\right)\right),
    \]
    where $C > 0$ is a universal constant. In particular, with probability at least $1 - \ProbRIP$, $\mathcal{A}$ satisfies the RIP of rank $r$ and constant $\delta$ provided $m \geq C \delta^{-2} r (d_1 + d_2)$.
\end{lem}
% A direct property of RIP constants $\delta_r$ is that $\delta_{r'} \leq \delta_{r}$ 
The following properties of the RIP will be used throughout our proofs. The mapping $\mathcal{I}: \domain \to \domain$ represents the identity.

\begin{lem}\label{lem: RIP ort}
    Let $\mathcal{A}: \domain \rightarrow \mathbb{R}^m$ be a linear measurement operator satisfying the RIP of rank $r_0$ with RIP constant $\delta_{r_0}$ for every $r_0\leq 3r$. Then, the following statements hold:
    \begin{enumerate}
        \item Let $\mathbf{V} \in \mathbb{R}^{d_2 \times r^{\prime}}$ be any matrix with orthonormal columns, i.e., $\mathbf{V}^{\top} \mathbf{V} = \mathbf{I}$. Then, for any matrix $\mathbf{Z} \in \domain$ satisfying $\mathrm{rank}(\mathbf{Z}) \leq r$, it holds that
        \begin{equation}
            \left\|\left(\mathcal{I} - \mathcal{A}^* \mathcal{A}\right)(\mathbf{Z}) \mathbf{V}\right\|_F \leq \delta_{r + 2 r^{\prime}} \|\mathbf{Z}\|_F.
            \label{eq: RIP V}
        \end{equation}
        In particular, if we take $r' = 1$, then we have
        \begin{equation}
            \left\|\left(\mathcal{I} - \mathcal{A}^* \mathcal{A}\right)(\mathbf{Z})\right\|_2 \leq \delta_{r + 2} \|\mathbf{Z}\|_F.
            \label{eq: RIP 2norm}
        \end{equation}

        \item Let $\mathbf{x} \in \mathbb{R}^{d_1}$ be such that $\|\mathbf{x}\|_2 = 1$, and let $\mathbf{y} \in \mathbb{R}^{d_2}$ be such that $\|\mathbf{y}\|_2 = 1$. Define the orthogonal projection operators
        \[
        \begin{aligned}
            \mathcal{P}_{\net}(\mathbf{Z}) &:= \left\langle \net, \mathbf{Z} \right\rangle \net, \\
            \mathcal{P}_{\net}^{\perp}(\mathbf{Z}) &:= \mathbf{Z} - \left\langle \net, \mathbf{Z} \right\rangle \net.
        \end{aligned}
        \]
        Then, for any matrix $\mathbf{Z} \in \domain$ satisfying $\mathrm{rank}(\mathbf{Z}) \leq r$, we have
        \begin{equation}
            \big|\big\langle \mathcal{A}\big(\net\big), \mathcal{A}\big(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\big) \big\rangle\big| \leq \delta_{r + 2} \|\mathbf{Z}\|_F.
            \label{eq: RIP w ort}
        \end{equation}

        \item Let $\mathbf{X} \in \domain$ be a matrix of rank $r$. Then, it holds that
        \begin{equation}
            \sup_{\|\mathbf{Z}\|_F = 1} \left\| \left(\mathcal{P}_{\mathbb{T}_{\mathbf{X}}} - \mathcal{P}_{\mathbb{T}_{\mathbf{X}}} \mathcal{A}^* \mathcal{A} \mathcal{P}_{\mathbb{T}_{\mathbf{X}}}\right)(\mathbf{Z}) \right\|_F \leq \delta_{2 r}.
            \label{eq: RIP norm}
        \end{equation}
        
        \item Let $\mathbf{X}$ be as in the previous item, and let $\mathbf{Z} \in \domain$ be a matrix of rank at most $r$. Then,
         \begin{equation}
         \begin{split}
            \| \mathcal{P}_{\mathbb{T}_{\mathbf{X}}} \mathcal{A}^* \mathcal{A} (\mathcal{I} &- \mathcal{P}_{\mathbb{T}_{\mathbf{X}}})(\mathbf{Z})\|_F \\
            &\leq \delta_{3 r} \left\| \left(\mathcal{I} - \mathcal{P}_{\mathbb{T}_{\mathbf{X}}}\right)(\mathbf{Z}) \right\|_F.
        \end{split}
        \end{equation}
    \end{enumerate}
\end{lem}

The proof is in \cref{proof: RIP}.


\subsection{Limitations of RIP-based Analysis}
Before presenting our proof, we first highlight why uniform results based solely on the RIP are insufficient for achieving optimal sample complexity. A standard RIP-based analysis \citep{RGD} typically yields a sample complexity that scales as $r^2$ rather than $r$.  They show that a sufficiently small yet $O(1)$ RIP constant, requiring $m = \Omega(r(\complexity))$, ensures 
\begin{equation}
\left\|\mathcal{P}_{\mathbb{T}_t}\left(\mathcal{I}-\mathcal{A}^{*} \mathcal{A}\right)\left(\target-\bm{X}_t\right)\right\|_F \ll \|\target-\bm{X}_t\|_F.
\label{eq: F norm I-AA*}
\end{equation}
 This inequality guarantees linear convergence of $\{\bm{X}_t\}_{t\geq T}$ to $\target$ in Frobenius norm for some $T \in \mathbb{N}$ whenever $\bm{X}_T$ satisfies
\begin{equation}
\| \bm{X}_T - \target \|_F \ll \Std.    
\label{eq: uniform region F}
\end{equation} 
To achieve this, they simply take $T=0$ and use spectral initialization, which only achieves $\| \bm{X}_T - \target \|_2 \ll \Std$ with $m = \Omega(r(\complexity)\kappa^2)$. 
They use $\| \bm{X}_T - \target \|_F \leq \sqrt{2r} \| \bm{X}_T - \target \|_2
$ and require a RIP constant scaling as $O(1 / \sqrt{r})$ to ensure \cref{eq: uniform region F}, which in turn necessitates a sample complexity of $\Omega(r^2)$.

Alternatively, one could analyze convergence in the $2$-norm, which would require a $2$-norm counterpart of \eqref{eq: F norm I-AA*}:
\begin{equation}
\left\|\mathcal{P}_{\mathbb{T}_t}\left(\mathcal{I}- \mathcal{A}^{*} \mathcal{A}\right)\left(\target-\bm{X}_t\right)\right\|_2 \ll \|\target-\bm{X}_t\|_2.
   \label{eq: deviation-2}
\end{equation}
However, deriving \eqref{eq: deviation-2} is challenging. Attempting to prove \eqref{eq: deviation-2}, we may consider proving a uniform result such as 
$\left\|\mathcal{P}_{\mathbb{T}_t}\left(\mathcal{I}- \mathcal{A}^{*} \mathcal{A}\right)\left( \bm{\Delta}_t\right)\right\|_2 \ll \|\bm{\Delta}_t\|_2
$ for all matrices $\bm{\Delta}_t$ of rank at most $2r$, but such a bound is highly likely to fail when the sample complexity scales only linearly in $r$. Indeed, \citep{stoger_non-convex_2024} provides a related negative result: 
\[
    \sup _{\mathrm{rank}(\mathbf{Z})\leq r,\ \mathbf{Z} \neq \mathbf{0}}\frac{\left\|\left(\mathcal{I} - \mathcal{A}^* \mathcal{A}\right)(\mathbf{Z})\right\|_2}{\|\mathbf{Z} \|_2} \geq \frac{1}{16} \sqrt{\frac{r^2 d_1}{m}}.
\]
Although their setting differs slightly from ours, this result underscores the difficulty of establishing uniform $2$-norm bounds analogous to RIP.

Instead of relying on the uniform results, we leverage the fact that $\left\{\mathbf{X}_{t}\right\}_{t \in \mathbb{N}}$ is a discrete sequence and approach \eqref{eq: deviation-2} directly. However, since $\left\{\mathbf{X}_{t}\right\}_{t \in \mathbb{N}}$ is generated by $\mathcal{A}$ and is thus dependent on it, the absence of a uniform result necessitates techniques to decouple them. One common way is resampling \citep{candes_phase_2015_CDP}, but it increases the sample complexity. Inspired by \citep{stoger_non-convex_2024}, we used a delicate decoupling technique, which will be elaborated in the following section.


\subsection{Key Decoupling Technique}
%Most of the proofs in this section are provided in \cref{sec: proof of decoupling Technique}. 
%Recall that $\left\{\mathbf{X}_{t}\right\}_{t \in \mathbb{N}}$ is the sequence generated by \cref{alg: RGD} with Gaussian measurement operator $\mathcal{A}$ with $m$ measurements. 
 Define $\boldsymbol{\Delta}_t:=\target - \bm{X}_t$. As illustrated in the previous section, the key is to control $\left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\boldsymbol{\Delta}_t\right)\right\|_2$. We first recall a typical method to control the $2$-norm of a general random matrix $\bm{M} \in \domain$ \citep{vershynin2018high}. Define $\mathbb{S}^{d-1} := \left\{\mathbf{x} \in \mathbb{R}^d : \|\bm{x}\|_2=1 \right\}$ and $\mathbb{S} := \ball$. 
 We can construct an $\varepsilon$-net $\mathcal{N}_1 \subset \mathbb{S}^{d_1-1}$ and an $\varepsilon$-net $\mathcal{N}_2 \subset \mathbb{S}^{d_2-1}$ with $\varepsilon = \frac{1}{4}$, and let
\begin{equation}
\mathcal{N} := \mathcal{N}_1 \times \mathcal{N}_2 \subset \mathbb{S}.
\label{eq: N}
\end{equation}
It is well known that the size of $\varepsilon$-net for $\mathbb{S}^{d-1}$ can be smaller than $(\frac{3}{\varepsilon})^{d}$, so $|\mathcal{N}| \leq \Maxiter$.  Then we have:
\begin{equation}
\begin{aligned}
\left\|\bm{M}\right\|_2 
&= \sup _{\tuple \in \mathbb{S}} \left|\left\langle\net,\bm{M}\right\rangle\right| \\
&\leq  \sup _{\tuple \in \mathcal{N}}\left|\left\langle\net,\bm{M} \right\rangle \right| + \frac{2}{4}\sup _{\tuple \in \mathbb{S}} \left|\left\langle\net,\bm{M}\right\rangle\right|, 
\end{aligned}
\label{eq: N net}
\end{equation}
which implies
$$
\left\|\bm{M}\right\|_2  \leq 2 \sup _{\tuple \in \mathcal{N}}\left|\left\langle\net,\bm{M} \right\rangle \right|.
$$
Substituting $\bm{M} = \left(\mathcal{A}^* \mathcal{A}-\mathcal{I}\right)(\bm{\Delta}_t)$, we turn to estimate $\sup _{\tuple \in \mathcal{N}}\left|\left\langle\net,\left(\mathcal{A}^* \mathcal{A}-\mathcal{I}\right)\left(\boldsymbol{\Delta}_t\right)\right\rangle\right|.$ 

For any  $\tuple \in \mathcal{N}$,  we have 
$$
\begin{aligned}
 &|\left\langle\net,\left(\mathcal{A}^* \mathcal{A}-\mathcal{I}\right)\left(\boldsymbol{\Delta}_t\right)\right\rangle | \\
 &\leq  |\left\langle\net,\left(\mathcal{A}^* \mathcal{A}-\mathcal{I}\right)\left( \mathcal{P}_{\net}\boldsymbol{\Delta}_t \right)\right\rangle| \\
 &+ \left|\left\langle\net,\left(\mathcal{A}^* \mathcal{A}-\mathcal{I}\right)\left( \Pxyort \boldsymbol{\Delta}_t\right)\right\rangle \right| .
\end{aligned}
$$
The first term on the right-hand side is bounded by $O(\sqrt{\frac{r(\complexity)}{m}}) \|\boldsymbol{\Delta}_t\|_2$ by \cref{eq: RIP 2norm} in \cref{lem: RIP ort} if the RIP is satisfied, and the second one equals 
\begin{equation}
I := \Big|\frac{1}{m} \sum_{i=1}^m\langle \bm{A}_i,\net\rangle \langle \mathcal{P}_{\net}^{\perp}(\bm{A}_i), \boldsymbol{\Delta}_t\rangle\Big|.
\label{eq: error I}
\end{equation}
We define 
\begin{equation}
\mathbf{A}_{i}^{\tuple} := \mathcal{P}_{\net}^{\perp}(\bm{A}_i) = \mathbf{A}_i - \left\langle \net, \mathbf{A}_i \right\rangle \net.
\label{eq: modified measurements}
\end{equation}
Using the rotation invariance property of Gaussian random variables, $\big\{\mathbf{A}_{i}^{\tuple}\big\}_{i=1}^m$ are stochastically independent of $\big\{\big\langle \mathbf{A}_i, \net \big\rangle\big\}_{i=1}^m$.
If $\boldsymbol{\Delta}_t$ were independent of $\left\{\left\langle \mathbf{A}_i, \net \right\rangle\right\}_{i=1}^m$, the term $I$ could be controlled by a standard Gaussian concentration argument, as the following lemma shows.

\begin{lem}\label{lem: orthogonality preserve without rip}
    %Let $\mathcal{N}$ be \eqref{eq: N}. 
    %Let $T \in \mathbb{N}$ be such that $T \leq 12^{\complexity}$. 
    For any $\tuple \in \mathcal{N}$ and $\bm{Z}$ independent of $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$, it holds with probability at least $1 - 2\exp(-8(\complexity))$ that
    %it holds for all $\tuple \in \mathcal{N}$ and all $1 \leq t \leq T$ that
    \begin{equation}
    \begin{aligned}
        &\big|\big\langle \net, \left(\mathcal{A}^* \mathcal{A}\right)\big(\mathcal{P}_{\net}^{\perp}\left(\bm{Z}\right)\big) \big\rangle\big| \\
        & \leq 4 \sqrt{\frac{\complexity}{m}} \Big\| \mathcal{A}\big(\mathcal{P}_{\net}^{\perp}\big(\bm{Z}\big)\big) \Big\|_2
        \label{eq: A ort}
     \end{aligned}
    \end{equation}
\end{lem}
\begin{proof}
    % We denote $\Delta_{t}^{\tuple} := \mathbf{X}_{\star} - \mathbf{X}_{t}^{\tuple}$, which is independent of $\left\{\left\langle \net, \mathbf{A}_i \right\rangle\right\}_{i=1}^m$. 
    % % First, $\left\{\bm{A}_{i, \tuple}\right\}_{i=1}^m$ and $\left\{\left\langle \net, \mathbf{A}_i \right\rangle\right\}_{i=1}^m$ are independent. Next, $\left\{\left\langle \net, \mathbf{A}_i \right\rangle\right\}_{i=1}^m$ is independent of $\bm{\Delta}_{t}^{\tuple}$ because $\bm{\Delta}_{t}^{\tuple}$ is derived from $\bm{A}_{i, \tuple}$. 
    % $\left\{\left\langle \net, \mathbf{A}_i \right\rangle\right\}_{i=1}^m$ are independent of $\big\{\big\langle \mathbf{A}_i, \mathcal{P}_{\net}^{\perp}\big(\bm{\Delta}_{t}^{\tuple}\big) \big\rangle\big\}_{i=1}^m$ from $\big\langle \mathbf{A}_i, \mathcal{P}_{\net}^{\perp}\big(\bm{\Delta}_{t}^{\tuple}\big) \big\rangle = \big\langle \mathbf{A}_{i}^{\tuple}, \bm{\Delta}_{t}^{\tuple} \big\rangle$. 
    Under the assumption, $\left\{\left\langle \mathbf{A}_i, \net \right\rangle\right\}_{i=1}^m$ are independent of $\left\{\left\langle \mathcal{P}_{\net}^{\perp}(\mathbf{A}_i), \bm{Z} \right\rangle\right\}_{i=1}^m$. Then, for all $x > 0$, with probability at least $1 - 2 \exp\left(-x^2 / 2\right)$,
    \[
        \begin{aligned}
            &\big|\big\langle \net, \left(\mathcal{A}^* \mathcal{A}\right)\big(\mathcal{P}_{\net}^{\perp}\big(\bm{Z}\big)\big) \big\rangle\big| \\
            &= \Big|\frac{1}{m} \sum_{i=1}^m \left\langle \net, \mathbf{A}_i \right\rangle \big\langle \mathcal{P}_{\net}^{\perp}(\mathbf{A}_i), \bm{Z} \big\rangle \Big| \\
            &\leq \frac{x}{m} \sqrt{\sum_{i=1}^m \big\langle \mathcal{P}_{\net}^{\perp}(\mathbf{A}_i), \bm{Z} \big\rangle^2} \\
            &= \frac{x}{\sqrt{m}} \Big\| \mathcal{A}\big(\mathcal{P}_{\net}^{\perp}\big(\bm{Z}\big)\big) \Big\|_2.
        \end{aligned}
    \]
    The inequality follows from the fact that, conditioning on $\{\langle \mathcal{P}_{\net}^{\perp}(\mathbf{A}_i), \bm{Z} \rangle\}_{i=1}^m$, $\sum_{i=1}^m \left\langle \net, \mathbf{A}_i \right\rangle \big\langle \mathcal{P}_{\net}^{\perp}(\mathbf{A}_i), \bm{Z} \big\rangle$ is a Gaussian variable with mean $0$ and variance $\sum_{i=1}^m \big\langle \mathcal{P}_{\net}^{\perp}(\mathbf{A}_i), \bm{Z} \big\rangle^2$. 
    Then it holds directly from the tail probability of Gaussian random variables.
    Choose $x = 4\sqrt{\complexity}$, and the failure probability is at most $2 \exp\left(-8(\complexity)\right)$. %|\mathcal{N}| T \leq 2 \exp\left(-8(\complexity) + 2\ln(12)(\complexity)\right) \leq 2 \exp\left(-2(\complexity)\right)$.
\end{proof}
 We assume that $\bm{Z}$ has a rank less than $2r$ here, since all the matrices we care about in this section have rank less than $2r$. If we rely solely on RIP, we can bound this term as $O(\sqrt{\frac{r^2(\complexity)}{m}}) \|\bm{Z}\|_2$ using \cref{eq: RIP w ort}. In contrast, this lemma converts it into the right-hand side of \eqref{eq: A ort}, and we can eliminate the factor $r$ and bound the term as $O(\sqrt{\frac{r(\complexity)}{m}}) \|\bm{Z}\|_2$ using \cref{eq: RIP 2norm}. 

However, we cannot take $\bm{Z} = \bm{\Delta}_t$, since the sequence $\left\{\mathbf{X}_{t}\right\}_{t \in \mathbb{N}}$ is generated by $\mathcal{A}$ and thus depends on $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$. To remove the statistical dependence between $\{\bm{X}_t\}_{t\in\mathbb{N}}$ and $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$, the central idea is to introduce 
a virtual sequence $\big\{\mathbf{X}_{t}^{\tuple}\big\}_{t \in \mathbb{N}}$ that is independent of $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$ to approximate  the real sequence $\{ \mathbf{X}_{t} \}_{t \in \mathbb{N}}$ .
%To define it, we will first define a virtual measurement operator $\mathcal{A}_{{\tuple}}$ independent of $\left\{\left\langle\mathbf{A}_i,\net\right\rangle\right\}_{i=1}^m$, then generate our virtue sequence through it.
%Moreover, to ensure that good properties of the virtual sequence can transfer to the real sequence,  $\{ \mathbf{X}_{t}^{\tuple} \}_{t \in \mathbb{N}}$ and  $\left\{\mathbf{X}_t\right\}_{t \in \mathbb{N}}$ should remain close, which will be illustrated as follows.

To this end, we construct a modified measurement operator $\mathcal{A}_{\tuple}: \domain \rightarrow \mathbb{R}^{m+1}$ that is statistically independent of $\{\langle\bm{A}_i,\net\rangle\}$ to approximate $\mathcal{A}$ as follows:
\[
\left[\mathcal{A}_{\tuple}(\mathbf{Z})\right]_i :=\begin{cases}
\frac{1}{\sqrt{m}} \big\langle \mathbf{A}_{i}^{\tuple}, \mathbf{Z} \big\rangle, &\mbox{for~}i\in[m],\cr
\big\langle \net, \mathbf{Z} \big\rangle, &\mbox{for }i=m+1.
\end{cases}
\]
%i.e, $\|\mathbf{X}_{t}^{\tuple} - \mathbf{X}_{t} \|_F$ should stay small
%We define $\bm{\Delta}_{t,\tuple} := \target - \Xtxy$, then we can substitute $\bm{\Delta}_t$ by $ \bm{\Delta}_{t,\tuple} +(\bm{\Delta}_t - \bm{\Delta}_{t,\tuple}) $ and analyze their effects in $I$ \cref{eq: error I}. The error introduced by $\bm{\Delta}_{t,\tuple}$ can be well-controlled by taking $\bm{Z} = \bm{\Delta}_{t,\tuple}$ in \cref{lem: A keeps ort in A-norm} since .  To guarantee that the second term introduces a small error in $I$,  $\|\bm{\Delta}_t - \bm{\Delta}_{t,\tuple}\|_F = \|\mathbf{X}_{t}^{\tuple} - \mathbf{X}_{t} \|_F$  should stay small. 
%With these two points in mind, 
The first $m$ terms are Gaussian random measurements of 
$\mathcal{P}_{\net}^{\perp}(\bm{Z})$ and independent of $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$ by \cref{eq: modified measurements}, 
%$\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$. 
%\langle\mathbf{A}_i^{\tuple},\bm{Z}\rangle  = \langle\mathbf{A}_i,  \mathcal{P}_{\net}^{\perp}\bm{Z} \rangle$. 
and the $(m+1)$-th term is introduced to collect the information of $\mathcal{P}_{\net}(\bm{Z})$ deterministically. From this construction, $\mathbb{E}\mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple} = \mathbb{E}(\mathcal{P}_{\net} + \mathcal{P}_{\net}^{\perp} \mathcal{A}^*\mathcal{A} \mathcal{P}_{\net}^{\perp}) = \mathcal{I} = \mathbb{E}\mathcal{A}^*\mathcal{A}$, which means that $\mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}$ agrees with $\mathcal{A}^*\mathcal{A}$ in expectation. For more properties of $\mathcal{A}_{\tuple}$ and its relationship with $\mathcal{A}$, see \cref{lem: compute decoupling} in the Appendix.

Finally we define the virtual sequence $\{ \mathbf{X}_{t}^{\tuple} \}_{t \in \mathbb{N}}$ to be the sequence generated by \cref{alg: RGD} with input data $\mathcal{A}_{\tuple}$ and $\mathcal{A}_{\tuple}(\target)$ as follows: for $t=0$,
$$    
\bm{X}_{0}^{\tuple}  = \mathcal{H}_r \big( \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\left(\mathbf{X}_{\star}\right) \big),
$$
and, for $t\geq 0$, 
$$
\begin{aligned}
&\mathbf{W}_{t}^{\tuple}  = \mathbf{X}_{t}^{\tuple}  \\
 & \qquad\qquad- \mu \mathcal{P}_{\mathbb{T}_{t}^{\tuple}} \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple} \big(\mathbf{X}_{t}^{\tuple} - \target\big),\\
&\mathbf{X}_{t+1}^{\tuple} = \mathcal{H}_r(\mathbf{W}_{t}^{\tuple}),
\end{aligned}
$$
where $\mathbb{T}_{t}^{\tuple}$ is the tangent space of the manifold $\mathbb{M}_r$ at $\Xtxy$.
% $\left\{\mathbf{A}_{i}^{\tuple}\right\}_{i=1}^m$ are independent of $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$, and the $m+1$ one is deterministic. $\mathcal{A}_{\tuple}$ is thus stochastically independent of $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$. The $m+1$ term is introduced to make $\mathcal{A}_{\tuple}$ close the original $\mathcal{A}$, which can me seen in the following lemma:
% The first $m$ modified measurement matrices are Gaussian random and orthogonal to $\net$, and the $m+1$ one is deterministic. 
% Using the rotation invariance property of Gaussian random variables, $\left\{\mathbf{A}_{i}^{\tuple}\right\}_{i=1}^m$ are stochastically independent of $\left\{\left\langle \mathbf{A}_i, \net \right\rangle\right\}_{i=1}^m$. 
% Therefore, $\mathcal{A}_{\tuple}$ is stochastically independent of $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$. 

%The following lemma follows directly from the definition of the virtual sequence, which describes the properties of $\mathcal{A}_{\tuple}$ and its relationship with $\mathcal{A}$. Its proof is in \cref{sec: proof of decoupling Technique}.
% \begin{lem}
%     For any matrix $\mathbf{Z} \in \domain$, it holds that
%     \begin{equation}
%         \begin{aligned}
%             &\big(\mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\big)\left(\mathcal{P}_{\net}(\mathbf{Z})\right) = \mathcal{P}_{\net}(\mathbf{Z}), \\
%             &\big(\mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\big)\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right) \\
%             &= \left(\mathcal{A}^* \mathcal{A}\right)\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right) - \left\langle \mathcal{A}\left(\net\right), \mathcal{A}\left(\mathcal{P}_{\net}^{\perp}(\mathbf{Z})\right) \right\rangle \net, \\
%             &\big(\mathcal{A}^* \mathcal{A} - \mathcal{A}_{\tuple}^* \mathcal{A}_{\tuple}\big)(\bm{Z}) \\
%             &= (\mathcal{A}^* \mathcal{A} - I)\mathcal{P}_{\bm{x}\bm{y}^T}(\bm{Z}) + \left\langle \bm{x}\bm{y}^T, \mathcal{A}^* \mathcal{A}\left(\mathcal{P}_{\bm{x}\bm{y}^T, \perp}(\bm{Z})\right) \right\rangle \bm{x}\bm{y}^T.
%         \end{aligned}
%         \label{eq: AA-A_wA_w}
%     \end{equation}
%     \label{lem: compute decoupling}
% \end{lem}
% For the last term on the right-hand side of \cref{eq: AA-A_wA_w}, which represents the difference between   $\mathcal{A}$ and  $\mathcal{A}_{\tuple}$ and thus affects $\|\mathbf{X}_{t}^{\tuple} - \bm{X}_t\|_F$

% Now we define the virtual sequence $\{ \mathbf{X}_{t}^{\tuple} \}_{t \in \mathbb{N}}$ to be the sequence generated by \cref{alg: RGD} with input data $\mathcal{A}_{\tuple}$ and $\mathcal{A}_{\tuple}(\target)$ as follows:

 %$\left\{\mathbf{X}_{t}^{\tuple}\right\}_{t \in \mathbb{N}}$ are derived from $\left\{\mathbf{A}_{i}^{\tuple}\right\}_{i=1}^m$ and also independent of $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$. 
Consequently, $\big\{\mathbf{X}_{t}^{\tuple}\big\}_{t \in \mathbb{N}}$ is independent of $\left\{\left\langle\mathbf{A}_i, \net\right\rangle\right\}_{i=1}^m$ and approximates $\left\{\mathbf{X}_t\right\}_{t \in \mathbb{N}}$.  
The stochastic independence properties and approximation properties inherent in the construction of the virtual sequence significantly benefit the analysis. A straightforward analysis yields a corollary of \cref{lem: orthogonality preserve without rip} specified for our virtual sequence $\{\Xtxy\}_{t \in \mathbb{N}}$. For simplicity, we denote $[m]=\{1, \dots, m\}$ and $[m]-1=\{0, \dots, m-1\} $.
 
\begin{lem} \label{lem: orthogonality preserve without rip uniform}
With probability at least $1 - \ProbAort$, it holds that 
\begin{equation} \label{eq: A ort uniform}
\begin{aligned}
&\left|\left\langle\net,\left(\mathcal{A}^* \mathcal{A}\right)\left(\mathcal{P}_{\net}^{\perp}\left(\mathbf{X}_{\star}-\Xtxy\right)\right)\right\rangle\right| \\
&\leq 4 \sqrt{\frac{\complexity}{m}}\left\|\mathcal{A}\left(\Pxyort \left(\mathbf{X}_{\star}- \Xtxy\right)\right)\right\|_2, \\
&\qquad\qquad \forall ~ \tuple \in \mathcal{N} \text{ in \cref{eq: N} and } t \in [\Maxiter]-1. 
\end{aligned}
\end{equation}
\end{lem}
\begin{proof} %[Proof of \cref{lem: orthogonality preserve without rip uniform}]
Notice that $\target - \Xtxy$ is independent of $\{\langle\bm{A}_i,\net\rangle \}_{i=1}^m$, so we can take $\bm{Z} =\target - \Xtxy$ for any $t$ and $\tuple$ in \cref{lem: orthogonality preserve without rip}. Taking a union bound over all $\tuple \in \mathcal{N}$ and all $t \in [\Maxiter]-1$, and using $|\mathcal{N}| \leq \Maxiter$, we conclude that \cref{eq: A ort uniform} holds with probability at least $1 - 2|\mathcal{N}| \cdot \Maxiter \cdot \exp(-8(\complexity)) \geq 1 - 2\exp(-2(\complexity))$. 
\end{proof}


 
 Using \cref{lem: orthogonality preserve without rip uniform}, we can finally get an estimation of  $\left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\mathbf{X}_{\star} - \bm{X}_t\right)\right\|_2$: 
%For the last term on the right-hand side of \cref{eq: AA-A_wA_w}, which represents the difference between   $\mathcal{A}$ and  $\mathcal{A}_{\tuple}$ and thus affects $\|\mathbf{X}_{t}^{\tuple} - \bm{X}_t\|_F$, we can derive a delicate estimate by letting $\bm{Z} = \mathbf{X}_{\star} - \mathbf{X}_{t}^{\tuple}$.


%If we rely solely on RIP, we can bound this term as $O(\sqrt{\frac{r(\complexity)}{m}}) \|\mathbf{X}_{\star} - \mathbf{X}_{t}^{\tuple}\|_F$ using \cref{eq: RIP w ort}. However, this lemma converts it into the right-hand side, and we can eliminate the factor $r$ and bound the term as $O(\sqrt{\frac{\complexity}{m}}) \|\mathbf{X}_{\star} - \mathbf{X}_{t}^{\tuple}\|_F$ using \cref{eq: RIP 2norm}.  

%Based on \cref{lem: orthogonality preserve without rip}, we can derive a key technical lemma that bounds $\left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\mathbf{X}_{\star} - \bm{X}_t\right)\right\|_2$, the $2$-norm of the deviation term. To control it, a typical method is to find a representative finite set $\mathcal{N}$ in $\ball$, such that
% \begin{equation}
% \begin{aligned}
% &\left\|\left(\mathcal{A}^* \mathcal{A}-\mathcal{I}\right)\left(\boldsymbol{\Delta}_t\right)\right\|_2 \\
% &= \sup _{\tuple \in \ball} \left|\left\langle\net,\left(\mathcal{A}^* \mathcal{A}-\mathcal{I}\right)\left(\boldsymbol{\Delta}_t\right)\right\rangle\right| \\
% &\leq 2 \sup _{\tuple \in \mathcal{N}}\left|\left\langle\net,\left(\mathcal{A}^* \mathcal{A}-\mathcal{I}\right)\left(\boldsymbol{\Delta}_t\right)\right\rangle\right|,
% \label{eq: N net}
% \end{aligned}
% \end{equation} 
% where $\boldsymbol{\Delta}_t:=\target - \bm{X}_t$ and $|\mathcal{N}| \leq \Maxiter$. More details about this technique $\varepsilon$-net are refereed to \citep{vershynin2018high}.

%This motivates us to identify a representative finite set $\mathcal{N}$ in $\ball$. Before defining it, we recall the notion of an $\varepsilon$-net.
% 
%In the following, we focus on the virtual sequence $\{ \mathbf{X}_{t}^{\tuple} \}_t$ for $\tuple \in \mathcal{N}$.


%Besides, the good properties of virtual sequence transfer to the real sequence inevitably introduce small cost related to the distance between the virtual and real sequence  $\{ \mathbf{X}_{t}^{\tuple} \}_{t \in \mathbb{N}}$ and  $\left\{\mathbf{X}_t\right\}_{t \in \mathbb{N}}$, i.e, $\|\mathbf{X}_{t}^{\tuple} - \mathbf{X}_{t} \|_F$, 
\begin{lem}
    Let $\mathcal{N}$ be as in \cref{eq: N}. Let $\{ \mathbf{X}_{t}^{\tuple} \}_{t\in\mathbb{N}}$ be the virtual sequence constructed for $\tuple \in \mathcal{N}$. 
      Assume that $\mathcal{A}$ satisfies RIP of rank $6r$, and let $\delta=\delta_{6r}\leq 1$. Assume that \cref{eq: A ort uniform} holds. Then we have
    \begin{equation}   \label{eq: spectral control}
        \begin{aligned}
            &\forall t\in[\Maxiter]-1,\quad \left\|\left(\mathcal{A}^* \mathcal{A} - \mathcal{I}\right)\left(\mathbf{X}_{\star} - \bm{X}_t\right)\right\|_2 \\
            &\leq \sigma_1 \left\|\mathbf{X}_{\star} - \bm{X}_t\right\|_2 + \sigma_2 \sup_{\tuple \in \mathcal{N}} \left\| \bm{X}_t - \mathbf{X}_{t}^{\tuple} \right\|_F,
        \end{aligned}
    \end{equation}
    where $\sigma_1 = 16 \sqrt{\frac{2r (\complexity)}{m}} + 2 \delta$ and $\sigma_2 = 4\delta + 16 \sqrt{\frac{\complexity}{m}}$.
    \label{prop: spectral control}
\end{lem}

Its proof is deferred to \cref{sec: proof of decoupling Technique}. When $\mathcal{A}$ is a Gaussian measurement operator, $\delta = O\big(\sqrt{\frac{r(\complexity)}{m}}\big)$ with high probability by \cref{lem: RIP}, so $\sigma_1$ and $\sigma_2$ can be made arbitrarily close to $0$ by increasing $m$. This result approaches \cref{eq: deviation-2}, with an additional error term arising from the distance between the real and virtual sequences. Consequently, we are going to control both the distances from $\bm{X}_t$ to $\target$ and from $\bm{X}_t$ to $\mathbf{X}_{t}^{\tuple}$ at initialization and demonstrate that these distances contract during the iterations. Although \cref{eq: spectral control} is not uniform and holds only for the first $\Maxiter$ steps, it suffices to reach \cref{eq: uniform region F} within $T = O(\ln r)$ steps, which makes the convergence analysis in \citep{RGD} applicable with $m = \Omega(\kappa^2 r(\complexity))$.


\subsection{Proof of the Main Theorem}
In this section, we provide the proof of \cref{thm: RGDmain}. The proof is divided into three phases: the initialization, the first $T$ steps to meet $\| \bm{X}_T - \target \|_F \ll \Std$ in \cref{eq: uniform region F}, and the subsequent steps where the linear convergence in Frobenius norm is guaranteed \citep{RGD}. For simplicity, we denote for $t \in \mathbb{N}$:
\begin{equation}
E_t := \left\|\bm{X}_{t}-\target\right\|_2 + \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t} - \Xtxy\right\|_F.
\label{eq: def of E}
\end{equation}
\paragraph{Phase I: Initialization.}
We show in \cref{lem: initialization} (whose proof is in \cref{proof: initialization}) that $E_0$ can be small with high probability provided $m = \Omega(\kappa^2 r(\complexity))$ . This is a non-PSD version of \citep[Lemma 4.1]{stoger_non-convex_2024}.%which means $\bm{X}_0$ and $\mathbf{X}_{0}^{\tuple}$ enter an attraction region around $\target$ with sufficiently large $m$. 
\begin{lem}
    \label{lem: initialization}
    Let $\RegionConstant>0$ be arbitrarily given. Then there exists a constant $C>0$ such that when $m \geq C \kappa^2 r (\complexity)$, with probability at least $1 - \Probinit$, it holds that:
\begin{equation} \label{eq: E0}
    E_0 \leq \RegionConstant \Std,
\end{equation}
where $E_0$ is defined in \cref{eq: def of E}.
\end{lem}






%\subsection{Some key lemmas}


\paragraph{Phase II: Contraction in $2$-norm in the first $T$ steps.}
Using \cref{eq: A ort uniform} and \cref{eq: spectral control}, we estimate $E_t$ by induction starting from \cref{eq: E0}.
%like $\| \bm{X}_{t+1} - \target \|_2  \leq \sigma_1 \| \bm{X}_{t} - \target \|_2 + \sigma_2 \sup_{\tuple \in \mathcal{N}} \| \bm{X}_{t} - \Xtxy \|_F $  and \\ $
%\sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t+1} - \mathbf{X}_{t+1}^{\tuple}\right\|_F \leq \mu_1 \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t} - \Xtxy\right\|_F + \mu_2 \left\|\bm{X}_{t} - \target\right\|_2
%$, with $\sigma_1, \sigma_2, \mu_1, \mu_2$ are all $O(\sqrt{\frac{r(\complexity)}{m}})$. Denote $E_t = \left\|\bm{X}_{t}-\target\right\|_2 + \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t} - \Xtxy\right\|_F$. If we have $ \gamma = \max \left\{\sigma_1 , \mu_1, \sigma_2 , \mu_2 \right\} < \frac{1}{2}$, then we can get $E_{t+1} \leq (2\gamma) E_t$,
%which implies the linear convergence of $E_t$ and also $\left\|\bm{X}_{t}-\target\right\|_2$. This also guarantees that $\bm{X}_t$ will iterate within this attraction region, and the induction goes on. This is the following lemma:
\newcommand{\mConstant}{C}
\begin{lem}
    \label{lem: first T steps}
% An induction lemma
% for iteration 0 ... T
% At level t, 
% X_t - X^*, X_t - X_{t,w}, (sum) small enough (local)$
% At level t+1
% W_t - X^* is small (2-norm) -> X_{t+1} - X^* is small (2-norm)
% W_t is good (spectral gap greater than lambda_min) 
% W_{t,w} - W_{t} is small (F-norm) , then of course 2-norm
% Hardthresholding is stable -> X_{t,w} - X_{t} is small (F-norm)
%need m \geq C \kappa^2 r (\complexity)
%Assume the \cref{eq: spectral control} holds. 
%Let $T \in \mathbb{N}$ be such that $T \leq 12^{\complexity}$. 
Let $\RegionConstant$ be an absolute constant such that $\RegionConstant \in (0, 0.001)$. Assume that $\mathcal{A}$ satisfies RIP of rank $6r$, and let $\delta=\delta_{6r} < \frac{1}{24} \RegionConstant$.
Assume that \cref{eq: A ort uniform}  and \cref{eq: E0} hold.
%Assume that
%\begin{equation}
%    \| \bm{X}_0 - \target \|_2 + \sup_{\tuple \in \mathcal{N}} \| \bm{X}_0 - \mathbf{X}_{0}^{\tuple} \|_F \leq c_1 \Std .
%    \label{eq: initial}
%\end{equation}
Then there exists a constant $C>0$ depending on $\RegionConstant$ only  such that when $m \geq C \kappa^2 r (\complexity)$, %the following holds with probability at least $1 - \ProbAort - \ProbRIP$: 
\begin{equation}
\begin{aligned}
    E_t \leq (1000 \RegionConstant)^t \RegionConstant \Std, \qquad \forall ~~t \in [\Maxiter].
\end{aligned} 
%\leq 1000 \RegionConstant (\| \bm{X}_{t} - \target \|_2 + \sup_{\tuple \in \mathcal{N}} \| \bm{X}_{t} - \Xtxy \|_F ).
\label{eq: contraction}
\end{equation}





\end{lem}
This lemma is critical, and its proof differs significantly from the parallel one for factorized gradient descent in \citep{stoger_non-convex_2024}. Specifically, the gradient is projected onto the tangent space of $\bm{X}_t$, requiring careful analysis of the projection operator, as detailed in \cref{lem: projection distance} in the Appendix. Additionally, our algorithm incorporates a hard-thresholding operator after the gradient descent step, for which \cref{lem: thresholding control} is necessary to bound the error introduced by thresholding. The detailed proof is provided in \cref{proof: convergence}.

% \newcommand{hc}[1]{%
% {\textcolor{red}{#1}}
% }
By choosing $\RegionConstant$ sufficiently small and  $T = O(\ln r)$, \cref{lem: first T steps} implies $\left\|\bm{X}_{T}-\target\right\|_F \leq \sqrt{2r}\left\|\bm{X}_{T}-\target\right\|_2 \leq  \sqrt{2r}E_T  \ll \Std$. 
\paragraph{Phase III: Contraction in Frobenius norm in the subsequent steps.}
%Third, we will show that after $\left\|\bm{X}_{T}-\target\right\|_F \ll \Std$ , $\bm{X}_t$ will enter a uniform convergence region without the need of \cref{eq: spectral control} for $t \geq T$. This is the following lemma (whose proof is in \cref{proof: convergence}):
With $\left\|\bm{X}_{T}-\target\right\|_F \ll \Std$, we can directly apply the result from \citep{RGD} to establish the convergence of $\bm{X}_t$ to $\target$ in Frobenius norm with $m = \Omega(\kappa^2r(\complexity))$. For completeness, we introduce the following lemma (whose proof is in \cref{proof: convergence}).
\begin{lem}[\citep{RGD}]
    \label{lem: afterT}
    Let $\RegionConstantafterT$ be an arbitrary constant that satisfies $0 < 6 \RegionConstantafterT < 1$. Assume that the measurement operator $\mathcal{A}$ satisfies the RIP of rank $6 r$ with constant $\delta_{6 r}< \RegionConstantafterT$. Assume that 
    \begin{equation}
    \| \bm{X}_T -  \target \|_F \leq \RegionConstantafterT \sigma_{\min }\left(\mathbf{X}_{\star}\right)
    \label{eq: init_F_norm}
    \end{equation} 
    for some $T \in \mathbb{N}$. Then it holds for all $t \geq T$ that
    $$
    \| \bm{X}_t -  \target \|_F \leq (6 \RegionConstantafterT) ^{t-T} \| \bm{X}_T -  \target \|_F.
    $$
\end{lem}
Combining these three phases, we can give the proof of \cref{thm: RGDmain}.
\begin{proof}[Proof of \cref{thm: RGDmain}]%\noindent \textit{Proof of \cref{thm: RGDmain}}:
\label{proof: main thm}
    Recall that $\rho \in (0,1)$ is the target convergence rate, and we have denoted 
    $E_0 = \initError$. We define the constants $\RegionConstantafterT = \ConvergenceRate/6$, $\RegionConstant = \min\{\frac{\rho}{1000}, \frac{1}{2000},e^{\frac{\ln2 - \frac{1}{2}}{2}}\RegionConstantafterT \}<1$, and $\delta  = \min \{\RegionConstantafterT, \frac{1}{24}\RegionConstant\}$.
    
    The proof relies on the following events:
    \begin{itemize}    
    %\The $\delta = \delta_{6r} \leq \min \{\RegionConstantafterT, \frac{1}{24}\RegionConstant\}$ 
    \item $\mathcal{A}$ satisfies RIP of rank $6r$ with $\delta_{6r} < \delta$. By \cref{lem: RIP}, this event holds with probability at least $1 - \ProbRIP$ provided that $m \geq C'\kappa^2 r(\complexity)$.
    \item The inequality \cref{eq: A ort uniform} holds. By \cref{lem: orthogonality preserve without rip uniform}, this occurs with probability at least $1 - \ProbAort$. %, which directly implies \cref{eq: spectral control} with this $\delta$.
    %We analyze the number of steps needed to satisfy \cref{eq: init_F_norm}. 
    \item The initial error satisfies $E_0 \leq \RegionConstant \Std$, i.e., \eqref{eq: E0} holds. By \cref{lem: initialization}, this is true with probability at least $1 - \Probinit$ when $m \geq C'' \kappa^2 r(\complexity)$.
     \end{itemize}
     Applying a union bound, the probability that all three events occur simultaneously is at least $1 - 7\exp(-(\complexity))$.
     
    %From \cref{lem: orthogonality preserve without rip} we know the conclusion of \cref{prop: spectral control} holds for $\Maxiter$ steps with probability $1 - \ProbAort$. 
    
     %As a result, the following holds with probability larger than $1 - (\ProbAort + \Probinit + \ProbRIP) > 1 - 7\exp(-(\complexity))$ when $m$
    Assuming these events hold, we proceed with the proof. Combining the RIP, \cref{eq: A ort uniform}, and \eqref{eq: E0}, it follows from \cref{lem: first T steps} that for all $t \in [\Maxiter]$,
    \begin{equation}
       \begin{aligned}
       &\|\bm{X}_t - \target  \|_F  \leq \sqrt{2r} \|\bm{X}_t - \target  \|_2 
       \leq \sqrt{2r}  \rho_1^t E_0 \\
       &\leq \sqrt{2r}  \rho_1 ^t  \RegionConstant \Std \leq \sqrt{2r}  \rho ^t   \Std,
        \end{aligned}
       \label{eq: Phase1}
     \end{equation}  
     where $\rho_1 = 1000\RegionConstant\leq \rho <1$, and the number of measurements satisfies $m \geq C''' \kappa^2 r(\complexity)$.

     Let $T = \ln(2r) \leq \Maxiter$. A straightforward calculation shows that
    $$
    \frac{\frac{1}{2}\ln{2r} + \ln{\frac{\RegionConstant}{\RegionConstantafterT}}}{\ln{2r}} \leq
    %\stackrel{\Maxiter \geq T > \ln{(\complexity)} \geq \ln{2r}}{\leq} 
    \frac{1}{2} + 2\ln{\frac{\RegionConstant}{\RegionConstantafterT}} \stackrel{\RegionConstant < e^{\frac{\ln2 - \frac{1}{2}}{2}} \RegionConstantafterT}{ < } \ln 2<\ln{\frac{1}{1000 \RegionConstant}}.
    $$ 
    This implies $\sqrt{2r}  \rho_1^T \RegionConstant \Std < \RegionConstantafterT \Std$, which ensures that \cref{eq: init_F_norm} holds. Using this result and the RIP, \cref{lem: afterT} guarantees that for $t \geq T$,
    \begin{equation}
    \|\bm{X}_t - \target  \|_F \leq \rho^{t-T} \|\bm{X}_T - \target  \|_F.
    \label{eq: phase2}
    \end{equation}

    Combining \eqref{eq: Phase1} for $t\in[T]$ and \eqref{eq: phase2} for $t\geq T$, we obtain the convergence result \eqref{eq: mainconv}.
    %So we need only $\ln(2r)$ steps to enter \cref{eq: init_F_norm}, which is smaller than $\Maxiter$. We denote $T$ as the first $t$ such that $\|\bm{X}_t - \target  \|_F$ satisfies \cref{eq: init_F_norm}, that is $T := \min \{ 0\leq t \leq \Maxiter: \|\bm{X}_t - \target  \|_F \leq \RegionConstantafterT \Std  \} $. 

    To conclude, we determine the number of measurements required by taking the maximum of the conditions on $m$ throughout the proof:
    $$
    m \geq C\kappa^2r(\complexity),
    $$ 
    where $C = \max\{C',C'',C'''\}$. 
\end{proof}

% \noindent \textbf{Sketch of the Proof of \cref{thm: RGDmain}}:

% Second, with \cref{prop: spectral control} holds for $\Maxiter$ steps, \cref{lem: first T steps} derives an induction relationship in this attraction region like $\| \bm{X}_{t+1} - \target \|_2  \leq \sigma_1 \| \bm{X}_{t} - \target \|_2 + \sigma_2 \sup_{\tuple \in \mathcal{N}} \| \bm{X}_{t} - \Xtxy \|_F $  and $
% \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t+1} - \mathbf{X}_{t+1}^{\tuple}\right\|_F \leq \mu_1 \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t} - \Xtxy\right\|_F + \mu_2 \left\|\bm{X}_{t} - \target\right\|_2
% $, with $\sigma_1, \sigma_2, \mu_1, \mu_2$ are all $O(\sqrt{\frac{r(\complexity)}{m}})$. Denote $E_t = \left\|\bm{X}_{t}-\target\right\|_2 + \sup _{\tuple \in \mathcal{N}}\left\|\bm{X}_{t} - \Xtxy\right\|_F$. If we have $ \gamma = \max \left\{\sigma_1 , \mu_1, \sigma_2 , \mu_2 \right\} < \frac{1}{2}$, then we can get $E_{t+1} \leq (2\gamma) E_t$, which implies the linear convergence of $E_t$ and also $\left\|\bm{X}_{t}-\target\right\|_2$. This also means $\bm{X}_t$ will iterate within this attraction region, which guarantees that the induction goes on.

% Third, we show in \cref{lem: afterT} after $T \leq \Maxiter$ steps, $\left\|\bm{X}_{T}-\target\right\|_F \ll \Std$ , and $\bm{X}_t$ will enter a uniform convergence region without the need of \cref{prop: spectral control}. 



