%\documentclass{uai2024} % for initial submission
\documentclass[accepted]{uai2024} % after acceptance, for a revised version; 
% also before submission to see how the non-anonymous paper would look like 
                        
%% There is a class option to choose the math font
% \documentclass[mathfont=ptmx]{uai2024} % ptmx math instead of Computer
                                         % Modern (has noticeable issues)
% \documentclass[mathfont=newtx]{uai2024} % newtx fonts (improves upon
                                          % ptmx; less tested, no support)
% NOTE: Only keep *one* line above as appropriate, as it will be replaced
%       automatically for papers to be published. Do not make any other
%       change above this note for an accepted version.

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

\input{math_commands.tex}
%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz} % nice language for creating drawings and diagrams
\usepackage{star}
\usepackage{enumitem}
\usepackage{url}
\usepackage{hyperref}

\hypersetup{
 	colorlinks=true,
 	linkcolor=black,
 	%filecolor=red,
 	urlcolor=black,
 	citecolor=black,
 }
%% Provided macros
% \smaller: Because the class footnote size is essentially LaTeX's \small,
%           redefining \footnotesize, we provide the original \footnotesize
%           using this macro.
%           (Use only sparingly, e.g., in drawings, as it is quite small.)

%% Self-defined macros
\newcommand{\swap}[3][-]{#3#1#2} % just an example



%%%comment
\setlength\marginparwidth{10cm}
%\usepackage[marginpar=2cm]{geometry}
\newcommand{\scolor}[1]{{\color{magenta}#1}}
\newcommand{\sscomment}[2]{\scolor{ #1}\marginpar{\tiny\scolor{S:\ #2}}}
\newcommand{\scomment}[1]{\scolor{$\dagger$}\reversemarginpar{\footnotesize\scolor{S:\ #1}}}



\title{Gradient descent in matrix factorization: 
Understanding large initialization}

% The standard author block has changed for UAI 2024 to provide
% more space for long author lists and allow for complex affiliations
%
% All author information is automatically removed by the class for the
% anonymous submission version of your paper, so you can already add your
% information below.
%
% Add authors
\author[1]{Hengchao Chen}
\author[2]{Xin Chen}
\author[1]{Mohamad Elmasri}
\author[1,3]{\href{mailto:qsunstats@gmail.com}{Qiang Sun}}
% Add affiliations after the authors
\affil[1]{%
    University of Toronto
}
\affil[2]{%
    Princeton University
}

\affil[3]{%
    MBZUAI
}
  


\begin{document}

\maketitle

\begin{abstract}


Gradient Descent (GD) has been proven effective in solving various matrix factorization problems. However, its optimization behavior with large initial values remains less understood. To address this gap, this paper presents a novel theoretical framework for examining the convergence trajectory of GD with a large initialization. The framework is grounded in signal-to-noise ratio concepts and inductive arguments. The results uncover an implicit incremental learning phenomenon in GD and offer a deeper understanding of its performance in large initialization scenarios.
     
\end{abstract}

\section{Introduction}\label{sec:1}



%Despite their intriguing performance guarantees in sample complexity and stability against noise, these methods typically require a huge computational cost, which may be cubically dependent on the size of the matrix. 


Low-rank optimization is pivotal in various  applications, including matrix completion~\citep{candes2012exact}, phase retrieval~\citep{shechtman2015phase}, matrix sensing~\citep{recht2010guaranteed}, and others. One common approach to addressing low-rank problems is through convex relaxation techniques such as nuclear norm regularization \citep{recht2010guaranteed}.  While these methods offer robust statistical performance guarantees, they often incur substantial computational costs, potentially scaling cubically with matrix size \citep{chi2019nonconvex}.  
In response, recent studies \citep{chen2019gradient,ma2018implicit,chi2019nonconvex} have proposed employing matrix factorization, which naturally encodes low rankness and significantly reduces the computational expenses per iteration when using gradient descent. However, a primary concern remains: the non-convex nature of the resulting objective necessitates a thorough investigation into the convergence properties of gradient descent (GD).

%This greatly reduces the per-iteration computational costs when gradient descent is used. The remaining concern is that the resulting objective is non-convex and one needs to examine the convergence theory of gradient descent (GD). 


Recent research has demonstrated that GD effectively solves a variety of low-rank problems. \cite{ma2018implicit} show that with a benign initialization\footnote{A benign initialization refers to starting the algorithm near the global minima.}, GD converges linearly to the global minima in applications such as matrix completion, phase retrieval, and blind deconvolution. Similarly, \cite{zhu2021global} explore the optimization landscape of matrix sensing, finding that GD, when starting with a benign initialization,  converges linearly to the global minima.
These findings highlight the intriguing optimization properties of GD in non-convex settings  and have catalyzed many studies in low-rank matrix optimization \citep{chi2019nonconvex}.



Unlike in convex optimization, initialization is critical in non-convex optimization. The aforementioned works establish global convergence  under the assumption of a benign initialization, necessitating carefully designed starting points. This prompts a critical question: Is random initialization\footnote{Random initialization selects an initial point randomly, typically far from the global minima.} sufficient to ensure fast global convergence? For certain problems, the answer is affirmative. \cite{chen2019gradient} show that with a random initialization, GD quickly converges globally in phase retrieval.  \cite{stoger2021small} show that GD with a small random initialization\footnote{Small or large initialization refers to whether the norm of the initial point is near zero or substantially larger, respectively.} also achieves rapid global convergence in matrix sensing. Subsequent studies have further examined GD with small random initialization in matrix sensing, addressing different aspects such as the asymmetric case \citep{jiang2023algorithmic} and the incremental learning phenomenon \citep{jin2023understanding}. These investigations, except \citet{chen2019gradient}, all assume a small initialization, which we  discuss below in detail.  

%Besides these works, there are few results on the fast global convergence of randomly initialized GD.     



\begin{figure*}[t]
    \centering
    \includegraphics[width = \textwidth]{MF.pdf}
    \caption{The top left panel shows the errors $\|\mSigma_r-\mX_t\mX_t^\top\|_{\rm F}$ over iterations. The remaining panels show the heat maps of the top three rows and columns of $\mX_t\mX_t^\top$ at iterations $t=0,37,80,140$, and 300, corresponding to the red points in the top left panel.}
    \label{fig:1}
\end{figure*}

In this paper, we investigate the convergence properties of GD with a large initialization.  Using a large initialization often helps reduce the training time, and is widely adopted in neural network training \citep{sun2020optimization}. For simplicity, we limit our analysis to a population version\footnote{A brief introduction to the matrix sensing problem can be found  in  Appendix \ref{sec:apd:preliminary}. The difference between matrix sensing and its population version will also be discussed.} of the matrix sensing problem. Specifically, we aim to study the convergence properties  of GD when solving the following symmetric matrix factorization problem:
\begin{align}\label{equ:1}
    \mX^*=\argmin_{\mX\in\sR^{d\times r}}\|\mSigma-\mX\mX^\top\|_{\rm F}^2,
\end{align}
where $\mSigma\in\sR^{d\times d}$ is a positive semi-definite matrix of rank at least $r$. %While the solutions are known to be   $\mX^*\mX^{*\top}=\mSigma_r$ with $\mSigma_r$ being the best rank $r$ approximation of $\mSigma$,
While it is straightforward to show that $\mX^*\mX^{*\top} = \mSigma_r$, with $\mSigma_r$ being the best rank-$r$ approximation of $\mSigma$, establishing the global convergence theory of GD in this problem is non-trivial. Let the initialization be: 
\$
\mX_0=\varpi\mN_0\in\RR^{d\times r},\ \textnormal{entries of}~\mN_0~\textnormal{being i.i.d.}~ \cN(0, {1}/{d}).
\$

Let $\kappa=\lambda_1/\Delta\geq 1$ be the condition number, with  $\lambda_1$ being the largest eigenvalue of $\bSigma$ and $\Delta$ denoting  the eigengap.
Prior works typically assume a small initialization, 
i.e., $\varpi=d^{-\iota(\kappa)}$ for some positive  increasing function  $\iota(\cdot)$\footnote{We call $\bX_0$  a small initialization when $\varpi$ tends to zero as $d$ increases. This is because a standard statistical analysis shows that $\norm{\bX_0}_{\rm F} = \Theta(\varpi)$, where $\Theta(\cdot)$ hides constants that are independent of $d$.}.  For instance, \cite{stoger2021small} require 
\#\label{equ:small}
\varpi\lesssim  d^{-3\kappa^2}, 
\#
and \cite{jin2023understanding} need an even smaller $\varpi$. Such $\varpi$ decays rapidly to zero as $d$  or $\kappa$ increases. In our study, we do not impose such conditions and only require $\varpi=O(1)$. We show that this requirement is necessary since GD will diverge when $\varpi$ is larger than this order. We refer to an initialization with $\varpi=O(1)$ as a large initialization, reflecting its relative magnitude in comparison to typical values considered in the field.  
Notably, our theories are applicable to the small initialization scenario as well, which is a special case of $\varpi = O(1)$.

\subsection{Our contribution}\label{sec:1.1}

%Our main result is the following theorem. It is proven using a novel signal-to-noise ratio (SNR) argument along with an inductive argument. These analysis will be explained below.

Our main result is outlined in the informal theorem below, with its rigorous counterparts collected later. This result is established through a novel signal-to-noise ratio (SNR) based approach, combined with an inductive argument. 

\begin{theorem}[Informal] 
   Assume that $\bSigma$ is a positive semi-definite matrix with its top $r+1$ eigenvalues being distinct and arranged in descending order.  Let $\bX_t$ be the GD sequence for problem \eqref{equ:1} with  $\bX_0=\varpi\bN_0$, where $\varpi$ is a positive constant independent of $d$ and $\bN_0\in\RR^{d\times r}$ has independent $\cN(0,{1}/{d})$ entries. Then 
    \begin{enumerate}[leftmargin=*]
    \item The GD sequence converges to the global minima almost surely~\citep{lee2016gradient,zhu2021global};
    \item A comprehensive trajectory analysis of GD is provided, indicating that eigenvectors associated with larger eigenvalues are learned first;
    \item Under an additional assumption, GD achieves $\epsilon$-accuracy in $\cO(\log(1/\epsilon)+\log(d))$ steps. 
    \end{enumerate}
\end{theorem}


Property 1 is a direct consequence of prior works. It guarantees that our trajectory analysis is valid almost surely. Property 2 shows that the top eigenvectors can be learned first,  unaffected by the behavior of later  signals. This point will be clarified later. Property 3 describes the fast global convergence of GD under an additional assumption concerning the saddle point escaping property. The verification of this assumption requires new theoretical techniques, which we defer to future research.


Now we elucidate Property 2 and our theoretical analysis through  a simple yet representative example concerning  rank-two matrix approximation. The experiment is conducted with parameters: $d=4000$, $r=2$, and $\mSigma={\rm diag}(1,0.5, \ve)$, where $\ve\in\sR^{d-r}$ forms an arithmetic sequence decreasing from $0.3$  to $0$. The initialization matrix is set to  $\mX_0=0.5\mN_0$ with entries of $\mN_0$ independently sampled from $\gN(0,\frac{1}{d})$.  The GD iterations $\bX_t$ are computed using  a step size of 0.1. We evaluate errors using $\|\mSigma_r-\mX_t\mX_t^\top\|_{\rm F}$ with $\mSigma_r={\rm diag}(1,0.5,0,\ldots,0)$ representing  the best rank-$r$ matrix approximation to $\mSigma$. In Figure \ref{fig:1}, the error trajectory is plotted, highlighting key inflection points and featuring heat maps of the first three rows and columns of $\mX_t\mX_t^\top$ at these steps.  Observations indicate that GD undergoes an incremental learning process, characterized by error curves that exhibit both flat and steep segments. %Observations reveal that GD exhibits an incremental learning phenomenon and the error curve has two types of shapes: flat and steep. 
%Giving mathematical explanations for these phenomena is the main objective of this paper. 



%Rather than presenting the theorem upfront, we offer a step-by-step trajectory analysis that corresponds with the error curve depicted in Figure \ref{fig:1}. 
 %We define $\mX_0$ as the initial point and $\mX_t$ as the GD sequence given by \eqref{equ:9} with a learning rate of $\eta$. 
 %Contrary to Section \ref{sec:3.1}, where the first two rows of $\mX_t$ are analyzed concurrently, here, we examine these rows one after the other. 


To interpret the error trajectory  in Figure \ref{fig:1}, we  analyze the first $r$ rows of $\bX_t$ individually. Specifically, we examine the dynamics of the quantities $\sigma_1(\vu_{k,t})$ and $\sigma_1(\vu_{k,t}\mK_{k,t}^\top)$, where $\vu_{k,t}$ is the $k$-th row of $\mX_t$ and $\mK_{k,t}$ is the $(k+1)$-to-$d$-th rows of $\mX_t$. 
These quantities correspond to the diagonal and off-diagonal elements in the heat map of $\mX_t\mX_t^\top$. Hence, our mathematical analysis directly explains the dynamics observed in the heat maps of Figure \ref{fig:1}. Our analysis on the SNR $\sigma_1^2(\vu_{k,t})/\sigma_1(\bu_{k,t}\bK_{k,t}^\top)$ reveals that the off-diagonal elements decrease at a geometric rate once the signal strength $\sigma_1^2(\vu_{k,t})$ reaches a specific threshold. This observation motivates us to employ an inductive argument to analyze the whole convergence trajectory. 



%Our analysis can also be extended to give a new proof for the global linear convergence of GD with benign initialization in matrix sensing problems.
Lastly, we highlight an ancillary contribution regarding the convergence of GD with a benign initialization. Prior works, such as~\cite{zhu2021global},  only prove the  global linear convergence of GD when $\bSigma$ is exactly of rank $r$. In contrast, our work extends these results by proving global linear convergence of GD for all matrix approximation problems, allowing for  $\bSigma$ whose rank exceeds $r$. Our analysis is  based on an SNR argument, which distinguishes it from the prior landscape-based analysis. Additionally, this analysis can be further applied to offer a new proof for the global linear convergence of benignly initialized GD  in matrix sensing problems.



\subsection{Related work}

Matrix-factorization-based low-rank matrix optimization has received significant attention in recent years \citep{chi2019nonconvex}. A primary challenge involves  analyzing the optimization properties. Previous studies have approached  this issue from various angles, including examining the optimization landscape \citep{sun2016complete,sun2018geometric,zhu2021global} and directly conducting  convergence analyses \citep{ma2018implicit,chen2019gradient,stoger2021small}. %General non-convex optimization theories have also shed significant light on the low-rank problems. Notably, 
\citet{lee2016gradient} show  that GD escapes saddle points almost surely under the strict saddle point condition, implying global convergence in scenarios where all local minima are also global minima, and all saddle points are strict.


%The implicit regularization property \scomment{I think implicit regularization is not equal to global convergence. I delete this due to weak connection.} of GD ~\citep{gunasekar2017implicit,li2018algorithmic} has intrigued the deep learning community, particularly since asymmetric matrix factorization can be viewed as a special case of a two-layer linear neural network \citep{baldi1989neural,zhu2020global}. These studies  have further motivated  research on the implicit bias of GD in deep matrix factorization \citep{chou2024gradient} and deep learning \citep{vardi2023implicit}.

In our paper, we refer to the implicit incremental learning phenomenon as the prioritized learning of the top eigenstructure. This phenomenon has been investigated from different perspectives. \citet{li2020towards} show that in matrix factorization, gradient flow with infinitesimal initialization is mathematically equivalent to greedy low-rank learning under specific assumptions. \cite{jin2023understanding} show that in matrix sensing, GD with a small initialization exhibits an incremental learning phenomenon. \citet{simon2023stepwise} observe that a similar incremental learning phenomenon exists in self-supervised learning when a small initialization is employed. However, none of these works explore the large initialization regime, which we will investigate for the first time in this paper.

\subsection{Paper overview}

The rest of this paper proceeds as follows. Section \ref{sec:2} reviews the usage of SNR analysis for rank-one matrix approximation. Section \ref{sec:3.1} uses the SNR analysis to prove the local linear convergence of GD in general rank problems. In Section \ref{sec:RI}, we examine the random initialization. Specifically, Section \ref{sec:3.2} reviews small initialization while Section \ref{sec:large} considers large initialization and presents  our main result. A sketch of proof is provided in Section \ref{sec:proof}.  
Concluding discussions are presented in Section \ref{sec:5} and all detailed proofs are collected in the Appendix.





\section{A warm-up}\label{sec:2_total}

In this section, we first introduce the  SNR analysis for rank-one matrix approximation problems as developed by \cite{chen2019gradient}. We then discuss the challenges involved in extending this analysis to the case of general rank. Despite these challenges, it is possible to extend the analysis with the use of a benign initialization. We provide such an extension in Theorem \ref{thm:1}, which generalizes  the previous results.


\subsection{Rank-one matrix approximation}\label{sec:2}
We begin with a review of rank-one matrix approximation. In this context, \citet{chen2019gradient} demonstrate  that GD with a large random initialization exhibits linear convergence to the global minima by leveraging an SNR argument. Specifically, consider problem \eqref{equ:1} with $r=1$ and assume\footnote{There is no loss of generality in assuming that $\bSigma$ is diagonal, as the analysis of  GD  is invariant to orthogonal rotations.  For a detailed explanation, please refer to Appendix \ref{sec:a2-apd}.} $\mSigma={\rm diag}(\lambda_1,\ldots,\lambda_d)$ is diagonal with decreasing diagonal elements and $\lambda_1>\lambda_2$. Let the initialization  vector  $\vx_0\in\sR^{d}$ be such that its first entry is  non-zero  and its  norm $\|\vx_0\|$ is less than $2\lambda_1$. Then  $\vx_t\vx_t^\top$ rapidly converges to ${\rm diag}(\lambda_1,0,\ldots,0)$, the best rank-one approximation of $\mSigma$.  The vector  $\vx_t$ is updated according to the GD rule:
\begin{align}
    \vx_t=\vx_{t-1}+\eta (\mSigma-\vx_{t-1}\vx_{t-1}^\top)\vx_{t-1},
\end{align}
where $\eta$ is the learning rate.



\citet{chen2019gradient} first decompose $\vx_t$ as $\vx_{t}=(\eva_t,\vb_t)^\top$, where $\eva_t\in\sR$ and $\vb_t\in\sR^{d-1}$. The GD rule is then rewritten  as
\begin{align}
    \eva_t&=a_{t-1}+\eta\lambda_1a_{t-1}-\eta (a_{t-1}^2+\|\vb_{t-1}\|^2)a_{t-1},\\
    \vb_t&=\vb_{t-1}+\eta \mSigma_{\rm res}\vb_{t-1}-\eta (a_{t-1}^2+\|\vb_{t-1}\|^2)\vb_{t-1},
\end{align}
where $\mSigma_{\rm res}={\rm diag}(\lambda_2,\ldots,\lambda_d)$. Let $\alpha_t=|a_t|$ and $\beta_t=\|\vb_t\|$, and assume $\eta\lambda_1$ is smaller than some constant, say $\frac{1}{12}$. Then we have
\begin{align}
    \alpha_t&=(1+\eta\lambda_1-\eta\alpha_{t-1}^2-\eta\beta_{t-1}^2)\alpha_{t-1},\label{equ:5}\\
    \beta_t&\leq (1+\eta\lambda_2-\eta\alpha_{t-1}^2-\eta\beta_{t-1}^2)\beta_{t-1}.\label{equ:6}
\end{align}
By dividing \eqref{equ:6} by \eqref{equ:5}, it follows that
\begin{align}
    \frac{\beta_t}{\alpha_t}&\leq \frac{1+\eta\lambda_2-\eta \alpha_{t-1}^2-\eta\beta_{t-1}^2}{1+\eta\lambda_1-\eta \alpha_{t-1}^2-\eta\beta_{t-1}^2}\cdot\frac{\beta_{t-1}}{\alpha_{t-1}}\notag \\
    &\leq (1-\frac{\eta \Delta}{3})\cdot \frac{\beta_{t-1}}{\alpha_{t-1}},\label{equ:7}
\end{align}
where $\Delta=\lambda_1-\lambda_2$ is the eigengap and the second inequality uses the fact that for all $s\in[-1/2,1/2]$,
\begin{align}
    h(s)=\frac{1-\eta\Delta/2+s}{1+\eta\Delta/2+s}\leq h({1}/{2})\leq 1-\frac{\eta\Delta}{3}.
\end{align}
Inequality \eqref{equ:7} indicates that the ratio ${\beta_t}/{\alpha_t}$ decreases to zero geometrically fast. Using this result, \citet{chen2019gradient} establish that $\beta_t$ and $\alpha_t^2$ rapidly converge to zero and $\lambda_1$,  respectively. In our paper, we refer to this argument as an SNR analysis, designating $\alpha_t$ as the signal strength and $\beta_t$ as the noise strength. 



\subsection{The general rank case: Challenges and a solution}\label{sec:3.1}

Generalizing the SNR argument to general rank problems introduces significant challenges. For example, the global minima  cannot simply be characterized by the two real numbers $\alpha_t$ and $\beta_t$. Identifying other effective measures that characterize  the GD sequence, and providing a dynamic analysis akin to that  in equations \eqref{equ:5} and \eqref{equ:6}, is notably difficult. Fundamentally, this difficulty arises from the heterogeneity across different dimensions or, more formally, from the non-commutativity of matrix multiplication.


%\subsection{Local linear convergence}
%It is easier to tackle this challenge when using benign initialization with a high initial SNR. 
%In this section, we demonstrate how to use SNR analysis to establish the local linear convergence of GD in general rank problems.   


One way to address this challenge involves using  a benign initialization with a high initial SNR. This strategy facilitates the extension  of the  SNR analysis to general rank problems and enables the establishment of  the local linear convergence for GD.  Consider problem \eqref{equ:1} with a general $r$ and assume $\mSigma={\rm diag}(\lambda_1,\ldots,\lambda_d)$ is diagonal with decreasing diagonal elements, where the eigengap $\Delta\coloneqq\lambda_r-\lambda_{r+1}>0$. Let $\mX_0\in\sR^{d\times r}$ be an initialization  point. Then the GD update rule is: 
\begin{align}
    \mX_{t}=\mX_{t-1}+\eta(\mSigma-\mX_{t-1}\mX_{t-1}^\top)\mX_{t-1},\label{equ:9}
\end{align}
where $\eta$ is the learning rate. 


For the SNR argument, we decompose $\mX_t$ into $(\mU_t^\top,\mJ_t^\top)^\top$, where $\mU_t$ consists of  the first $r$ rows of $\mX_t$ and $\mJ_t$ includes the remaining  $d-r$ rows. Analogous to the rank-one case,  $\mU_t$ is considered the signal, being non-zero at the global minima,  while $\mJ_t$ is considered the noise, being zero there.  Adopting a benign initialization means that $\sigma_r(\mU_0)$ is large and  $\sigma_1(\mJ_0)$ is small. More precisely, we define the set $\gR$ as: 
\begin{align}\label{equ:R}
    \gR=\bigg\{\mX=\begin{pmatrix}\mU\\\mJ\end{pmatrix}\mid\  &\sigma_1^2(\mX)\leq 2\lambda_1, \sigma_r^2(\mU)\geq \Delta/4, \notag \\
    &~~~\sigma_1^2(\mJ)\leq \lambda_r-\Delta/2 \bigg\}.
\end{align}
This set contains all the global minima of problem~\eqref{equ:1}. Moreover, the SNR ${\sigma_r^2(\mU)}/{\sigma_1^2(\mJ)}$ is larger than the constant ${\Delta}/{(4\lambda_1)}$ for any $\mX$ in $\gR$. In the Appendix, we demonstrate that if GD is initialized within $\gR$,  the sequence $\mX_t$ will remain in $\gR$ and the SNR will rapidly increase  towards infinity. Consequently, we can establish the local linear convergence of GD as in Theorem \ref{thm:1}, which is instrumental in analyzing  random initialization. Later, we will show that, for any initialization point  $\bX_0\notin\cR$, the convergence of GD consists of two phases: the first phase, where the sequence enters $\cR$, followed by  the final global convergence phase. With Theorem \ref{thm:1}, we only need to analyze the first phase. 

%%%%%Theorem%%%%%
\begin{theorem}\label{thm:1}
    Suppose $\eta\leq \frac{\Delta^2}{36\lambda_1^3}$, $\mX_0\in\gR$, and $\mX_t$ is the GD sequence  given by \eqref{equ:9}. Then, for any small $\epsilon>0$, we have $\|\mSigma_r-\mX_t\mX_t^\top\|\leq \epsilon$ in $\gO(\frac{6}{\eta\Delta}\ln\frac{200r\lambda_1^3}{\eta\Delta^2\epsilon})$ iterations, where $\mSigma_r={\rm diag}(\lambda_1,\ldots,\lambda_r,0,\ldots,0)$. 
\end{theorem}

%While our paper aims to understand large initialization in later sections, Theorem \ref{thm:1} is still an additional contribution of the paper. Prior works on local linear convergence either study the rank-one case~\citep{chen2019gradient} or require $\mSigma$ to be exact of rank $r$~\citep{zhu2021global}. Their arguments cannot be directly used to prove Theorem \ref{thm:1}.  In contrast, by employing an SNR argument, we can establish the local linear convergence for general cases. Our SNR analysis relies on a lower bound for the signal $\sigma_{r}^2(\bU_{t+1})$ and an upper bound for the noise $\sigma_1^2(\bJ_{t+1})$. These two bounds need to be related so that the ratio of SNR$_{t+1}$ by SNR$_t$ can be analyzed. This is the challenging part of the SNR analysis. Finally, we note that although we assume $\mSigma$ is positive semi-definite for simplicity, our proof can be easily extended to general symmetric $\mSigma$. Also, it can be modified to establish the local linear convergence of GD for matrix sensing~\citep{zhu2021global}.


%While our paper primarily focuses on understanding large initializations in subsequent sections, Theorem \ref{thm:1} is an additional contribution.

Prior studies on local linear convergence have either focused on the rank-one scenario \citep{chen2019gradient} or assumed that $\mSigma$ is exactly of rank $r$ \citep{zhu2021global}, and their proof arguments do not directly apply to Theorem \ref{thm:1}. In contrast, by utilizing an SNR argument, we establish local linear convergence for general cases. Our SNR analysis hinges on establishing a lower bound for the signal, $\sigma_{r}^2(\bU_{t+1})$, and an upper bound for the noise, $\sigma_1^2(\bJ_{t+1})$. These bounds must be precisely related to facilitate the analysis of the ratio between SNR$_{t+1}$  and SNR$_{t}$,  which poses significant  challenges.

Moreover, while we assume 
$\mSigma$ is positive semi-definite for simplicity, our proof  can easily be adapted to the general symmetric case. It can also be modified to establish the local linear convergence of GD in matrix sensing scenarios \citep{zhu2021global}.




%%%%%%%%Random initialization%%%%%%%%%

\section{Random initialization}\label{sec:RI}

Benign initialization, while conceptually valuable, has limited practical utility because it often requires oracle information. This limitation is particularly pronounced in matrix sensing problems, where $\mSigma$ is observed only through random measurements \citep{stoger2021small}. Consequently, researchers have shifted their focus towards random initialization. According to Theorem \ref{thm:1}, the convergence analysis of GD simplifies to determining the duration required for the sequence to enter the set $\cR$. 
Once within  $\cR$, the sequence is guaranteed to converge to the global minima exponentially fast.



\subsection{Small random initialization}\label{sec:3.2}

Existing research, with the exception of the rank-one case,  predominantly focuses on  small random initialization. This approach assumes $\bX_0=\varpi\bN_0$, where $\bN_0\in\RR^{d\times r}$ has independent $\cN(0,\frac{1}{d})$ entries and $\varpi$ is notably  small. Concentration arguments indicate  the norm $\norm{\bX_0}$ is of order $\cO(\varpi)$. When $\varpi$ is sufficiently small, the higher-order term $\mX_{\cdot}\mX_{\cdot}^\top\mX_{\cdot}$ in \eqref{equ:9} becomes negligible in the early iterations. Consequently, during these early iterations, the GD iteration behaves like a power method: %\scomment{give a ref for power method.}
\begin{align}
    \mX_t\approx\mX_{t-1}+\eta\mSigma\mX_{t-1}.
\end{align}
The eigenvectors associated with larger eigenvalues will be learned faster. Using the same $\bU,\bJ$ notation as in Section~\ref{sec:3.1}, we obtain that  ${\sigma_{r}(\bU_{t+1})}/{\sigma_{r}(\bU_t)}$ is greater than ${\sigma_1(\bJ_{t+1})}/{\sigma_1(\bJ_t)}$ for small $t$, indicating  that the signal strength increases faster than the noise strength. By picking  a sufficiently small $\varpi$, we can show that after $\cO(\log(d))$ rounds, $\sigma_r^2(\bU_{t})$ will rise above $\Delta/4$ while $\sigma_1(\bJ_t)$ remains negligible. %This implies that the sequence $\bX_t$ will enter the set $\cR$ quickly, and combined with a local linear convergence result, \cite{stoger2021small} demonstrate the global linear convergence of GD. 
This rapid entry of the sequence $\bX_t$ into the region 
$\cR$ facilitates global linear convergence, as shown by \cite{stoger2021small}.  Additionally, \cite{jin2023understanding} explore  the incremental learning behavior of GD under small $\varpi$ conditions. Other studies, such as those by \cite{ma2022behind} and \cite{soltanolkotabi2023implicit}, also examine GD with small initialization.

%These work typically require $\varpi=d^{-\iota(\kappa)}$ for some positive, increasing function $\iota(\cdot)$,  where $\kappa=\lambda_1/\Delta\geq 1$ is the conditional number. For instance, 


%then the GD sequence will exhibit a certain implicit bias~\citep{stoger2021small,jin2023understanding,li2018algorithmic}. In particular, 

%and their studies reveal that GD with sufficiently small initialization exhibits a certain implicit bias~\citep{stoger2021small,jin2023understanding,li2018algorithmic}. Specifically, \citet{stoger2021small} highlights that when the initialization is sufficiently small, the higher-order term $\mX_{t-1}\mX_{t-1}^\top\mX_{t-1}$ in \eqref{equ:9} becomes negligible during the early stage. Consequently, in the early stage, the GD iteration behaves like a spectral method:
%\begin{align}
%    \mX_t\approx\mX_{t-1}+\eta\mSigma\mX_{t-1}.
%\end{align}
%Based on this observation, one can prove for sufficiently small random initialization that GD exhibits linear convergence to the global minima~\citep{stoger2021small}. As demonstrated in \citet{jin2023understanding}, this convergence is also in an incremental way. The elements corresponding to larger eigenvalues of $\mSigma$ tend to increase faster. For more precise statements of the results, one can refer to these two papers. 

\subsection{Large random initialization}\label{sec:large}

In practice, however, a large initialization is often employed, where $\mX_0=\varpi\mN_0$ with  $\varpi$ being  a constant \textbf{independent of $d$}. In such cases, the arguments in Section \ref{sec:3.1} or Section \ref{sec:3.2} are inadequate. Specifically, the initial SNR is too low for  the arguments in Section~\ref{sec:3.1} to be applicable, and the initial magnitude $\norm{\mX_0}$ is too high to use the arguments in Section \ref{sec:3.2}. Thus, to examine large initialization, we  need a more delicate analysis, as presented in this section. 


%The core idea of the analysis is given by Figure \ref{fig:1} and related discussions in Section~\ref{sec:1}. 
%Our results will be deterministic and applicable to random initialization.



%To proceed, we first introduce some notations.
%\footnote{This is because GD analysis is invariant to  rotations.}

Our analysis builds upon the discussions presented   in Section \ref{sec:1.1}. Specifically, we consider problem \eqref{equ:1} with a general rank parameter $r$ and assume without loss of generality that $\mSigma={\rm diag}(\lambda_1,\ldots,\lambda_d)$ is diagonal with decreasing diagonal elements. Suppose the leading $r+1$ eigenvalues of $\mSigma$ are strictly decreasing and let $\Delta=\min_{i\leq r}\{\lambda_i-\lambda_{i+1}\}>0$ be the eigengap. Our goal is to characterize the convergence  of the GD sequence $\bX_t$ as defined in equation \eqref{equ:9}. 


%Suppose $\bX_t$ is the GD sequence in \eqref{equ:9} and $\bX_0$ is the initial point. The aim of this section is to analyze the convergence of $\bX_t$.

%whose relationships to Figure \ref{fig:1} are presented in Section \ref{sec:1.1}. 

Following the discussion in Section \ref{sec:1.1}, we define $\bu_{k,t}$ as the $k$-th row of $\bX_t$ and $\bK_{k,t}$ as the $(k+1)$-to-$d$-th rows of $\bX_t$. The relationships between these definitions and the visualizations in Figure \ref{fig:1} are introduced  in Section \ref{sec:1.1}.
Let
\begin{align}\label{equ:S}
    \gS=\{\mX\in\sR^{d\times r}\mid \ & \sigma_1^2(\mX)\leq 2\lambda_1,\notag\\
    &\sigma_1^2(\mK_{k})\leq \lambda_k-\frac{3\Delta}{4},\forall k\leq r\},
\end{align}
be a subset of $\RR^{d\times r}$ where $\bX$ and $\bK_k$, the $(k+1)$-to-$d$~rows of $\mX$, both have bounded norms. We will show that $\bX_t$ quickly enters the set $\cS$. The duration until this entry occurs is denoted by:  %and we denote its duration by
\$
  t_{{\rm init},1}=\min\{t\geq 0\mid \bX_t\in\cS\}.
  \$ 
To streamline our presentation,  we introduce two constants $t^*$ and $t^{\sharp}$:
\#
t^*&=\log\left(\frac{\Delta^2}{8\lambda_1^3+144r^2\lambda_1}\right)/\log(1-\eta\Delta/6), \label{equ:t*}\\
 t^{\sharp}&=\log\left({\Delta}/{(4r)}\right)/\log(1-\eta\Delta/6).
\#  
Subsequently, we define the quantities 
\$
\{T_{\bu_k},t_k,t_k^*,t_{{\rm init},k+1}\}
\$ 
in a successive manner up to $T_{\bu_r}$:
\begin{itemize}[leftmargin=*]
    \item Define $T_{\bu_k}$, counted from $t_{{\rm init},k}+1$, as the earliest time when the strength of the $k$-th signal,  $\sigma_1^2(\bu_{k,t})$,  first  surpasses $\Delta/2$:
    \$
    T_{\bu_k}=\min \{t\geq 0\mid \sigma_1^2(\bu_{k,\,t+t_{{\rm init},k}})\geq\Delta/2\};
    \$
    \item $t_k=t_{{\rm init},k}+T_{\bu_k}+t^*$;
    \item $t_k^*$ is the smallest integer for which the following inequality holds, indicating that the $(k+1)$-th signal strength no longer falls below a geometrically decaying sequence: %smaller than a geometrically decaying sequence. 
    \#
    &r(1-\eta\Delta/6)^{t_{k}^*} \nonumber\\
    &\qquad\leq\sqrt{\frac{\Delta}{8}} \min\left\{\sigma_1(\bu_{k+1,t_k+t_k^*}),\sqrt{\frac{\Delta}{2}}\right\}; \label{equ:assumption}
    \#
    \item $t_{{\rm init},k+1}=t_k+t_k^*$.
\end{itemize}

%These quantities characterize the convergence of GD, and our main result is to upper bound these quantities. To that end, we first observe that $t_k^*<\infty,\forall k\leq r$ almost surely when we employ random initialization. This result, stated in Proposition \ref{prop:asp}, follows from the theory of \citet{lee2016gradient} and the landscape analysis of \citet{zhu2021global}. The rigorous proof is postponed to the Appendix. 


These quantities are instrumental in characterizing the convergence of GD, and our primary objective is to upper bound these quantities. We first find that  $t_k^*<\infty,\forall k\leq r$  almost surely when random initialization is utilized. This finding, articulated in Proposition \ref{prop:asp}, is supported by the theory of \citet{lee2016gradient} and the landscape analysis of  \citet{zhu2021global}. A detailed proof of this result is deferred to the Appendix.

%Our first observation is that $t_k^*<\infty,\forall k\leq r$ almost surely when we use random initialization. 

\begin{proposition}\label{prop:asp}
    Let $\eta\leq\frac{\Delta}{100\lambda_1^2}$ and $\bX_t$ be the GD sequence initialized with  $\bX\in\RR^{d\times r}$. Then the following set 
    \$
    \{\bX\in\RR^{d\times r}|  \sigma_1(\bX)\leq {1}/{\sqrt{3\eta}},\,  t_k^*=\infty\textnormal{ for some }k\leq r\}
    \$
    is of Lebesgue measure zero.
\end{proposition}


%In Theorem \ref{lma:r'}, we provide upper bounds for these quantities and characterize the behavior of GD in specific time. Our result is deterministic and applicable to the case of large random initialization. First, we present the following mild assumption. 
%Our result is deterministic and thus applicable to the case of random initialization. 


%Motivated by this proposition, we shall make the following assumption in our main theorem.
%Our theorem considers deterministic initialization, and is applicable to the case of random initialization. 


Motivated by this proposition, we introduce the following assumption and then present  our main theorem. While our theorem is formulated under deterministic initialization, it remains applicable to scenarios involving random initialization.


\begin{assumption}\label{asp:1}
    Assume that $t_k^*<\infty$ for all $k\leq r$. 
\end{assumption}

%Assumption \ref{asp:1} almost surely holds if we use random initialization. 
 %This proves the following proposition. 



%Now we present the main theorem.

\begin{theorem}\label{lma:r'}

    Suppose $\eta\leq\frac{\Delta^2}{100\lambda_1^{3}}$, $\sigma_1(\bX_0)\leq 1/{\sqrt{3\eta}}$, $\bX_t$ is the GD sequence, and Assumption \ref{asp:1} holds. Then we have
    \begin{enumerate}
        \item[(1)] $\bX_t\in\cR$ for all $t\geq t_{\cR}\coloneqq t_{{\rm init},r}+T_{\bu_r}+t^*+t^{\sharp},$ where 
        \$
        t_{{\rm init},1}&=\gO\left(\frac{1}{\eta\lambda_1}\log\frac{1}{6\eta\lambda_1}\right)+\gO \left(\frac{1}{\eta\Delta}\log\frac{8\lambda_1}{\Delta} \right),\\
        T_{\bu_k}&=\cO\left(\frac{4}{\eta\Delta}\log\frac{\Delta}{2\sigma_1^2(\bu_{k,t_{{\rm init},k}})}\right),\quad\forall k\leq r.
        \$
        
        
        \item[(2)]GD achieves $\epsilon$-accuracy, i.e., $\norm{\bSigma_r-\bX_t\bX_t^\top}_{\rF}\leq \epsilon$, in 
        \#\label{equ:global}
        t_{\cR}+\cO\left(\frac{6}{\eta\Delta}\ln\frac{200r\lambda_1^3}{\eta\Delta^2\epsilon}\right)
        \# 
        iterations.  
        
        \item[(3)] For all $k< r$ and $t\geq t_k$, both $\sigma_1(\bu_{k,t}\bK_{k,t}^\top)$ and $p_{k,t}$ converge to zero linearly fast:
    \#
    &\sigma_1(\bu_{k,t}\bK_{k,t}^\top)\leq(1-\eta\Delta/6)^{t-t_{k}},\notag\\
    &|p_{k,t}|\leq (2\lambda_1+\frac{24r}{\eta\Delta})\cdot(1-\eta\Delta/8)^{t-t_k},\label{equ:pkt}
    \#
    where $p_{k,t}=\lambda_k-\sigma_1^2(\bu_{k,t})$. This reveals the implicit incremental learning of GD.   
    \end{enumerate}
    %Here $t_{{\rm init},k}, T_{\bu_k},t_k,t_k^*$ are defined above the theorem. 
    %, where
    %The linear convergence of $\bX_t$ to the global minima after $\bX_t$ entering $\cR$ is stated in Theorem \ref{thm:1}.
\end{theorem}



Theorem \ref{lma:r'} imposes relatively  mild conditions. First, the condition that $\sigma_1(\bX_0)\leq {1}/{\sqrt{3\eta}}$ holds with high probability when we pick $\bX_0=\varpi\bN_0$ with $\varpi\lesssim 1/{\sqrt{\eta}}$, using the same $\bN_0$ as previously discussed. This order is maximal, as the GD sequence may diverge when $\sigma_1(\bX_0)$ exceeds this order. For example, consider $\bSigma=\zero$ and $\eta\sigma_1^2(\bX_0)\geq3$. By employing an inductive argument for the GD iteration \eqref{equ:9}, we can show that
\$
\sigma_1(\bX_{t+1})\geq (\eta\sigma_1^2(\bX_t)-1)\cdot \sigma_1(\bX_t)\geq 2\sigma_1(\bX_t),\quad \forall t.
\$
This  result implies that the GD sequence diverges when $\varpi$ is a large constant. Consequently,  it  establishes that the maximal order of $\varpi$ for convergence is $\cO(1/\sqrt{\eta})$. The only possible improvement could be a constant factor. Additionally, this rate is independent of the dimension $d$, in stark contrast to the condition in the small initialization scenario \eqref{equ:small} where $\varpi$ decays to zero exponentially fast as $d$ increases. This relaxed assumption makes our theorem applicable to large initialization. Second, Assumption \ref{asp:1} is considered mild as demonstrated in Proposition \ref{prop:asp}. Therefore,  Theorem \ref{lma:r'} is applicable across a wide range of contexts.   



The conclusions of Theorem \ref{lma:r'} are threefold. First, we upper bound all quantities except $t_k^*$ by logarithmic terms. 
These bounds partially explain the fast convergence of GD in Figure \ref{fig:1}. 
Next, by combining property (1) with Theorem \ref{thm:1}, we obtain the global convergence rate in \eqref{equ:global}. %All quantities in the rate \eqref{equ:global} are shown to be logarithmic terms except $t_k^*$, which sheds insights on the fast convergence of GD. 
Third, we show
that the $k$-th signal strength converges to the target value exponentially fast following  the $t_k$-th step. Crucially, this convergence is independent of $t_{j}^*$ for all $j\geq k$.
 This indicates that the convergence of the $k$-th signal is not affected by the behavior of subsequent signals ($(k+1)$-to-$r$-th), exemplifying an implicit incremental learning phenomenon in GD.


%In other words, the $k$-th signal will converge fast to the target value independent of the behavior of the latter ($(k+1)$-to-$r$-th) signals. This is referred to as an implicit incremental learning phenomenon of GD.  


Finally, if we make an additional assumption, we can obtain the fast global convergence of GD in Theorem \ref{thm:8}.

\begin{assumption}\label{asp:2}
    Assume $t_k^*=\cO(\log(d))$ for all $k\leq r$.
\end{assumption}

\begin{theorem}\label{thm:8}
    Assume that conditions in Theorem \ref{lma:r'}  and Assumption \ref{asp:2} hold. Then GD achieves $\epsilon$-accuracy in $\cO(\log(d)+\log(1/\epsilon))$ iterations.
\end{theorem}


%While it is challenging to verify this assumption, it is still an informative assumption as explained in Section~\ref{sec:4.2.3}.
 
Assumption \ref{asp:2} is  particularly nuanced as  it upper bounds the quantity $t_k^*$. We call it a transition assumption because it 
 facilitates the analytical progression from the analysis of the $k$-th row to the $(k+1)$-th row, positing that the transition time is $\cO(\log(d))$. Verifying this assumption is challenging, and we leave it to future work. 
 
 
 %Without Assumption \ref{asp:2}, the results in Theorem \ref{lma:r'} still hold. 
%Without Assumption \ref{asp:2}, the results in Theorem \ref{lma:r'} still hold

%allows us to transit the analysis from the $k$-th row to the $(k+1)$-th row and it assumes the transition time is $\cO(\log(d))$.
%Without Assumption \ref{asp:2}, the results in Theorem \ref{lma:r'} still hold. 



\section{Proof sketch}\label{sec:proof}
 
%In this section, we will provide a sketch of proof. We will start with rank-two matrix approximation and then extend it to general rank problems. The only difference between rank-two problem and general rank problems lies in how many rounds of inductive arguments are needed. 


In this section, we provide a sketch of the proof. Initially focusing on rank-two matrix approximation, we subsequently extend the analysis to general rank problems. The primary distinction between the rank-two scenario and general rank problems lies in the number of rounds of inductive arguments required.


\subsection{Rank-two matrix approximation}\label{sec:3}

To start with, we first show that when $\sigma_1(\bX_0)\leq {1}/{\sqrt{3\eta}}$, the GD sequence will quickly enter the region $\cS$  defined in \eqref{equ:S}, and the sequence will remain in $\cS$ afterwards. This proves the first property in Theorem \ref{lma:r'}. Recall that $t_{{\rm init},1}=\min\{t\geq 0\mid \bX_t\in\cS\}$ and $\bX_t$ is the GD sequence given by \eqref{equ:9}.

%To present our analysis, we define the following set
%Define
%\begin{align}\label{equ:S}
 %   \gS=\{\mX\in\sR^{d\times r}\mid \sigma_1^2(\mX)\leq 2\lambda_1,\sigma_1^2(\mK_{k})\leq \lambda_k-\frac{3\Delta}{4},\forall k\leq r\},
%\end{align}
%where $\mK_k$ stands for the $(k+1)$-to-$d$ rows of $\mX$, meaning that the magnitudes of $\bX_0$ and $\bK_{k,0}$ are upper bounded by certain constants.  %This is achievable if we use random initialization $\mX_0=\alpha\mN_0$ with a reasonably small constant $\alpha$, say $\lambda_{r}-\frac{3\Delta}{4}$. Lemma \ref{lma:1} shows that if $\mX_0\in\gS$, then the GD sequence $\mX_t$ will remain in $\gS$, given a reasonably small learning rate. Hence, the magnitudes of $\mX_t$ and $\mK_{k,t}$ are always restrained by certain constants, forming the basis for the ensuing analysis.

\begin{lemma}\label{lma:1}
    Suppose $\eta\leq\frac{1}{12\lambda_1}$ and $\sigma_1(\mX_0)\leq \frac{1}{\sqrt{3\eta}}$. Then $\mX_t\in\gS$ for all $t\geq t_{{\rm init},1}$, where 
    \begin{align*}
        t_{{\rm init},1}=\gO\left(\frac{1}{\eta\lambda_1}\log\frac{1}{6\eta\lambda_1}\right)+\gO\left(\frac{1}{\eta\Delta}\log\frac{8\lambda_1}{\Delta}\right).
        %T_1&=\gO\left(\frac{1}{\eta\lambda_1}\log\frac{1}{6\eta\lambda_1}\right),\textnormal{ or 0 if }\sigma_1^2(\mX_0)\leq2\lambda_1,\\
        %T_{\mK}&=\gO\left(\frac{1}{\eta\Delta}\log\frac{8\lambda_1}{\Delta}\right), \textnormal{ or 0 if }\mX_0\in\gS.
    \end{align*}
\end{lemma}

Lemma \ref{lma:1} establishes  that $\cS$ is an absorbing set of GD, indicating  that once the sequence enters this set, it will remain there indefinitely. This characteristic allows us to assume $\bX_t\in\cS$ in subsequent analysis. 


\subsubsection{$\sigma_1^2({\mathbf{u}}_{1,t})$ rapidly increases  above $\Delta/2$}\label{sec:4.2.1}


Our next step is to analyze the first row $\bu_{1,t}$ of $\mX_t$. This is in sharp contrast to the results in Sections~\ref{sec:3.1} and~\ref{sec:3.2}, where the first $r$ rows of $\bX_t$ are analyzed together. Although using large initialization makes previous analysis infeasible, it is still manageable to examine only the first row of $\bX_t$. In Lemma~\ref{lma:2}, we show that $\sigma_1^2(\bu_{1,t})$ rapidly increases to be larger than or equal to $\Delta/2$, and it remains larger than or equal to this threshold afterwards. 
This implies that the first signal strength will become larger than or equal to  $\Delta/2$ after logarithmic steps. It allows us to employ an SNR argument in the subsequent analysis.
%In addition, this lemma aligns with the first stage of the GD dynamics as displayed in Figure \ref{fig:1}, offering theoretical insights about the GD dynamics.  
Furthermore, this lemma aligns with the initial phase of the GD dynamics as illustrated in Figure \ref{fig:1}, providing valuable theoretical insights into the behavior of GD.


\begin{lemma}\label{lma:2}
    Let $\eta\leq\frac{1}{12\lambda_1}$ and  $\sigma_1(\bX_0)\leq1/{\sqrt{3\eta}}$. Assume $\sigma_1(\bu_{1,t_{{\rm init},1}})>0$. Then 
    \$
    \sigma_1^2(\bu_{1,t})\geq \Delta/2,~~~\forall~t\geq t_{{\rm init},1}+ T_{\bu_1}, 
    \$ 
 where
    \begin{align*}
        T_{\bu_1}=\cO\left(\frac{4}{\eta\Delta}\log\frac{\Delta}{2\sigma_1^2(\bu_{1,{t_{{\rm init},1}}})}\right).
    \end{align*}
\end{lemma}




\subsubsection{SNR converges linearly towards infinity and $\sigma_1^2({\mathbf{u}}_{1,t})$ converges}



%, where the numerator and  denominator are considered as the signal and  noise respectively. 
%It then follows that the noise strength diminishes to zero fast at a linear rate.


Once $\sigma_1^2(\bu_{1,t})$ exceeds $\Delta/{2}$, then we can employ an SNR argument similar to \eqref{equ:7}. Specifically, we pick the SNR as
\$
\frac{\sigma_1^2(\bu_{1,t})}{\sigma_1(\bu_{1,t}\bK_{1,t}^\top)},
\$ 
and show  that it converges linearly towards infinity, where $\bK_{1,t}$ consists of the $2$-to-$d$-th rows of $\bX_t$. Since $\sigma_1^2(\bu_{1,t})$ is in the interval $[\Delta/2,2\lambda_1]$ by Lemmas~\ref{lma:1} and~\ref{lma:2}, we can show that the noise strength $\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$ diminishes to zero  fast. In particular, 
if $\bu_{1,t}\bK_{1,t}^\top=\zero$, then the dynamics of $\bu_{1,t}$ becomes
\$
\bu_{1,t+1}=\bu_{1,t}+\eta\lambda_1\bu_{1,t}-\eta\sigma_1^2(\bu_{1,t})\bu_{1,t}.
\$
This update rule implies the fast convergence of $\sigma_1^2(\bu_{1,t})$ to $\lambda_1$. Generally, when the term $\bu_{1,t}\bK_{1,t}^\top$ is close to zero, the dynamics of $\bu_{1,t}$ will mimic the above iteration. Following this, we can establish the fast convergence of $\sigma_1^2(\bu_{1,t})$ to $\lambda_1$. These results, established in Lemma \ref{lma:3}, relate to Property 3 in Theorem \ref{lma:r'}. 


\begin{lemma}\label{lma:3}
    Let $\eta\leq\frac{\Delta}{100\lambda_1^2}$ and assume $\sigma_1(\bX_0)\leq 1/{\sqrt{3\eta}}$ and $\sigma_1(\bu_{1,0})>0$. Then for all $t\geq t_1$, we have
    \begin{align*}
        \sigma_1(\bu_{1,t}\bK_{1,t}^\top)\leq (1-\eta\Delta/6)^{t-t_1}
    \end{align*}
    where $t_1=t_{{\rm init},1}+T_{\bu_1}+t^*$, $T_{\bu_1}$ is given in Lemma \ref{lma:2}, and  $t^*$ is a constant defined in \eqref{equ:t*}. 
    In addition, let 
    \$
    p_{1,t}=\lambda_1-\sigma_1^2(\bu_{1,t})
    \$
    be the error term. Then for all $t\geq t_1$, we have
    \$
    |p_{1,t}|\leq (2\lambda_1+\frac{24r}{\eta\Delta})\cdot (1-\eta\Delta/8)^{t-t_1}.
    \$
\end{lemma}


The above result implies  the rapid convergence of the first signal. This convergence is independent of the behavior of subsequent signals, a phenomenon known as implicit incremental learning. Furthermore, this result corresponds to the second phase of the GD dynamics, as depicted in Figure \ref{fig:1}, offering a substantial theoretical explanation.


%It implies the fast convergence of the first signal strength. 
%In particular, the convergence of the first signal is independent of the behavior of the other signals, which is referred to as an implicit incremental learning phenomenon. This result corresponds to the second phase of the GD dynamics as depicted in Figure \ref{fig:1}, providing a substantive theoretical explanation.




\subsubsection{Transition assumption and induction}\label{sec:4.2.3}

Lemma \ref{lma:3} shows that the magnitude $\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$ diminishes linearly to zero. This motivates us to decouple the original matrix factorization problem into two sub-problems. For the first sub-problem, we study the convergence of the first row of $\mX_t$, which has been presented in the previous section. In the second sub-problem, we examine $\bK_{1,t}$, the $2$-to-$d$-th rows of $\mX_t$. Such decoupling is exact when $\bu_{1,t}\mK_{1,t}^\top=0$, and under this condition, the update rule of $\mK_{1,t}$ becomes
\$
\mK_{1,t}=\mK_{1,t-1}+\eta(\bGamma_1-\mK_{1,t-1}\mK_{1,t-1}^\top)\mK_{1,t-1},
\$
where $\bGamma_1=\diag(\lambda_2,\ldots,\lambda_{d})$. This is congruent with the GD update rule of $\mX_t$ as in \eqref{equ:9}, and hence an inductive argument could be applied. 


Generally, when the noise term $\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$ only decreases fast but does not reach zero, one should check whether $\bu_{1,t}\bK_{1,t}^\top$ is negligible (in the analysis of $\bu_{2,t}$). Specifically, if $\sigma_1(\bu_{2,t})$ is not always decreasing at the same speed as $\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$, then we can apply the above inductive argument. To formalize  this intuition, we introduce a variable $t_1^*$, which is defined as the smallest integer such that 
\#
r(1-\eta\Delta/6)^{t_{1}^*}\leq\sqrt{\frac{\Delta}{8}}\min\{\sigma_1(\bu_{2,t_1+t_1^*}),\sqrt{\frac{\Delta}{2}}\},\label{equ:t1*}
\#
where $t_1$ is defined in Lemma \ref{lma:3}. Recall that for all $t\geq t_1$, 
\$
\sigma_1(\bu_{1,t}\bK_{1,t}^\top)\leq (1-\eta\Delta/6)^{t-t_1}.
\$
Thus, \eqref{equ:t1*} essentially compares the second signal strength $\sigma_1(\bu_{2,\cdot})$ with an upper bound on the noise  $\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$. When \eqref{equ:t1*} holds, we find that the noise term is negligible, and thus a similar result as Lemma \ref{lma:2} can be established for the second signal $\sigma_1(\bu_{2,\cdot})$, leading to Lemma \ref{lma:10}. 
%It is also related to the second and third property in Theorem \ref{lma:r'} (for $k=2$). 


\begin{lemma}\label{lma:10}
    Suppose the conditions of Lemma \ref{lma:3} hold. Let $t_{{\rm init},2}=t_1+t_1^*$, where $t_1$ is given by Lemma \ref{lma:3} and $t_1^*$ is given by \eqref{equ:t1*}. Suppose $t_1^*<\infty$. Then 
    \$
    \sigma_1^2(\bu_{2,t})\geq {\Delta}/{2},~~~\forall~t\geq t_{{\rm init},2}+T_{\bu_2}, 
    \$ 
     where
    \$
    T_{\bu_2}=\cO\left(\frac{4}{\eta\Delta}\log\frac{\Delta}{2\sigma_1^2(\bu_{2,t_{{\rm init},2}})}\right).
    \$
\end{lemma}

In Lemma \ref{lma:10}, we assume $t_1^*<\infty$, which relates to Assumption \ref{asp:1}. If we  assume $t_1^*=\cO(\log(d))$ as in Assumption \ref{asp:2}, then we can show that $T_{\bu_2}=\cO(\log(d))$ as well. While we have not theoretically characterized the quantity $t_1^*$, our theory is still insightful in the following sense. 
\begin{itemize}[leftmargin=*]
    \item First, the term $\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$ is shown to decay to zero linearly fast while $\sigma_1^2(\bu_{2,t})$ does not seem to possess similar theories. Hence, we may expect that the time point $t_1^*$ is not large. 
    \item Second, $t_1^*$ characterizes the time when the GD sequence escapes from the saddle points\footnote{Any stationary point with $\bu_{2}=0$ is a saddle point. Hence, if the GD sequence $\mX_t$ converges with $t_1^*=\infty$, then it must converge to a saddle point.}. This time is unavoidable for any GD sequence converging to the global minima. Even though we do not provide an upper bound on $t_1^*$, we know the convergence behavior of GD during this time. Notably, during this time, both $\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$ and $\sigma_1^2(\bu_{2,t})$ converge to zero fast. 
    \item Third, during the time from $t_1$ to $t_1+t_1^*$, while $\sigma_1^2(\bu_{2,t})$ converges to zero fast, the first signal $\sigma_1^2(\bu_{1,t})$ still converges to $\lambda_1$, as shown in Lemma \ref{lma:3}. This means the convergence of the first signal is not affected by the behavior of the remaining signals, which supports the incremental learning phenomenon -- {\it leading signals converge first even when the rest are stuck at saddle points}. 
    \item Finally, the time $t_1$ to $t_1+t_1^*$ aligns with the third stage of the GD dynamics as displayed in Figure \ref{fig:1}. The experiment shows that the time $t_1^*$ is not too long. 
\end{itemize}
Despite these arguments, there is still a need to examine the duration $t_1^*$ in future research, which might involve investigating specific initialization mechanisms. 


%Once $\sigma_1(\bu_{2,t})$ is not as small as the geometrically decreasing sequence $(1-\eta\Delta/6)^t$, the quantity $\sigma_1^2(\bu_{2,t})$ will rise exponentially fast above $\Delta/2$. This is analogous to the discussion in Section \ref{sec:4.2.1}, and a precise statement is given in the following lemma. 



%\begin{remark}
%    Similar to Lemma \ref{lma:3}, we can show that the (second) SNR $\frac{\sigma_1^2(\bu_{2,t})}{\sigma_1(\bu_{2,t}\bK_{2,t}^\top)}$ also ascends to infinity fast after $t_1+t_1^*+T_{\bu_2}$ iterations. This allows us to successively repeat the argument in general rank case; see Section \ref{sec:4} for more details. Here, since $r=2$, we can directly use local linear convergence to conclude the trajectory analysis. This is the content of next subsection.
%\end{remark}

\subsubsection{Final convergence}

Previous analyses indicate that the strengths of both the first and second signals exceed $\Delta/2$, and the corresponding noise components decay geometrically. A simple verification shows that the GD sequence $\mX_t$ will quickly enter the region $\gR$, which is defined in \eqref{equ:R}. Then by the local linear convergence of GD in Theorem \ref{thm:1}, we shall complete the characterization of the GD sequence's convergence to the global minima. This final stage aligns with the fourth stage of the GD dynamics as illustrated in Figure \ref{fig:1}.


%\subsubsection{Summary}

%Now we briefly summarize the four-stage trajectory analysis in this section. 
%\begin{itemize}
%    \item At stage I, the first signal strength will increase fast above a positive threshold, say $\Delta/2$.
%    \item At stage II, the SNR $\sigma_1^2(\bu_{1,t})/\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$ increases to infinity quickly. This implies that $\sigma_1(\bu_{1,t}\bK_{1,t}^\top)$ converges to zero fast. Simultaneously,  $\sigma_1^2(\bu_{1,t})$ converges fast to $\lambda_1$. 
%    \item At stage III, the second signal strength $\sigma_1(\bu_{2,t})$ will first keep smaller than a geometrically decaying sequence. After a period of time, $\sigma_1(\bu_{2,t})$ will not be as small as this sequence. Then the quantity $\sigma_1^2(\bu_{2,t})$ will increase above $\Delta/2$ at a linear rate.
%    \item The final stage is the local linear convergence as studied in Section \ref{sec:3.1}. 
%\end{itemize}

\subsection{General rank matrix approximation}\label{sec:4}

It is straightforward to extend the rank-two case to the general rank case. The key point is to repeat the inductive argument $(r-1)$ times rather than once. 


Similar to the rank-two case, we will now successively show that $\sigma_1^2(\bu_{k,t})$ surpasses $\Delta/2$ and $\sigma_1(\bu_{k,t}\bK_{k,t}^\top)$ diminishes linearly to zero  for all $k\leq r$. Moreover, we will show that $\sigma_1^2(\bu_{k,t})$ converges to $\lambda_k$ after certain iterations. Once the first $r$ rows of $\bX_t$ are all analyzed, we can show that the sequence $\bX_t$ quickly enters the region $\cR$ defined in \eqref{equ:R}. By invoking the local linear convergence theorem, we will conclude the proof. 

Our analysis consistently uses the SNR argument, although the specific SNR definitions vary. 
\begin{itemize}[leftmargin=*]
    \item For analyzing the $k$-th signal strength in Theorem \ref{lma:r'}, we examine the SNR given by
    \$
    \frac{\sigma_1^2(\bu_{k,t})}{\sigma_1(\bu_{k,t}\bK_{k,t}^\top)}.
    \$
    We will prove both the diminishing of $\sigma_1(\bu_{k,t}\bK_{k,t}^\top)$ to zero and the convergence of $\sigma_1^2(\bu_{k,t})$ to $\lambda_k$.
    \item For analyzing the local linear convergence in Theorem \ref{thm:1}, we utilize the SNR  defined  as 
    \$
    {\sigma_r^2(\bU_t)}/{\sigma_1^2(\bJ_t)},
    \$
    where $\bU,\bJ$ are defined in Section \ref{sec:3.1}. We will prove the linear convergence of $\bJ$ to zero, as well as the fast convergence of $\bU_{t}$ to the target matrix.


\end{itemize}



\section{Concluding remarks}\label{sec:5}

This paper presents a comprehensive analysis of the trajectory of GD for matrix factorization problems, with a particular focus on large initialization. By employing both an SNR argument and inductive reasoning, we deepen the investigation and uncover that even with large initialization, GD still exhibits an incremental learning phenomenon. We anticipate that these insights will stimulate further research in related domains.


%A significant challenge identified is the model's ability to escape saddle points efficiently.
%The analysis employs both an SNR argument and an induction argument to bolster the investigation's depth and insight. Our finding is that even with large initialization, GD may still exhibit an incremental learning phenomenon, and the main challenging convergence issue is to escape from the saddle points. We hope such findings can inspire other researchers in related fields.



%\item Our analysis assumes strictly decreasing top eigenvalues. Extending the findings to matrices with possibly equal eigenvalues  requires additional research.

%Second, our analysis focuses on the simplest matrix factorization setting. It is intriguing to study similar results in other settings, such as matrix sensing, where $\bSigma$ is only accessible via linear measurements. Our delicate dynamic analysis is sensitive to the noise introduced by the measurement mechanism. Hence, new theoretical tools are needed.

%Third, it is interesting to examine GD in solving deep matrix factorization. It is unknown how large initialization affects the GD trajectory in that case.


This study presents several limitations that naturally suggest avenues for future research.
\begin{itemize}[leftmargin = *]
    \item  First, we have not established an upper bound for $t_k^*$ defined in \eqref{equ:assumption}. Determining an effective upper bound is crucial, and exploring potential negative results in this context could also be insightful.
    \item Second, our analysis assumes strictly decreasing top eigenvalues. Extending the findings to matrices with possibly equal eigenvalues  requires additional research.
    \item  Third, our analysis is confined to the simplest matrix factorization scenario. Exploring these results in more complex settings, such as matrix sensing where $\bSigma$  is accessible only through linear measurements, would be particularly compelling. Given that our dynamic analysis is sensitive to noise,  this generalization may be challenging. 
    \item  Last, investigating  GD in deep matrix factorization also presents a significant research opportunity. It remains unclear how large initialization impacts the GD trajectory in such a complex case.
\end{itemize}

\scolor{



}



\section*{Acknowledgement}

Hengchao Chen and Qiang Sun are supported in part by an NSERC Discovery Grant (RGPIN-2018-06484), a Data Sciences Institute Catalyst Grant, and a computing grant from Compute Canada. 



%\bibliography{references}
%\bibliographystyle{iclr2024_conference}


% References
\bibliography{uai2024-template}

\newpage

\onecolumn

\title{Gradient descent in matrix factorization: 
Understanding large initialization\\(Supplementary Material)}
\maketitle


\appendix
\section*{\Large Appendix}

\section{Preliminary}\label{sec:apd:preliminary}

In this section, we present preliminaries for the formulation studied in this paper. 

\subsection{Matrix sensing and its population version}

In this section, we review matrix sensing problems \citep{recht2010guaranteed,li2018algorithmic} and derive its population version as a matrix factorization problem \citep{chi2019nonconvex}. In matrix sensing problems, we aim to recover an unknown  symmetric positive semidefinite matrix $\bSigma\in\RR^{d\times d}$ from a set of linear measurements
\$
y_i=\langle{\bSigma},{\bA_i}\rangle,\quad i=1,\ldots, m.
\$
Here $\{\bA_i\}\subseteq\RR^{d\times d}$ are sensing matrices known {\it a priori}. A standard assumption is to require that $\bSigma$ is of rank $r\ll d$ and $\bA_i$ has independent $\cN(0,1)$ entries. Under such assumptions, one common strategy is to employ matrix factorization, and solve the following least-squares minimization problem:
\$
\hat\bSigma=\hat \bX\hat \bX^\top,\quad \hat \bX=\argmin_{\bX\in\mathbb{R}^{d\times r}}f(\bX)\overset{\rm def}{=}\frac{1}{4m}\sum_{i=1}^m(y_i-\langle\bA_i,\bX\bX^\top\rangle)^2.
\$
Starting from an initial point $\bX_0\in\mathbb{R}^{d\times r}$, the gradient descent algorithm will update $\bX_t$ as follows
\$
\bX_t=\bX_{t-1}-\eta \nabla f(\bX_{t-1}),
\$
where $\nabla f(\bX)$ is the gradient of $f$ given by
\$
 \nabla f(\bX)=-\frac{1}{m}\sum_{i=1}^m(y_i-\left<\bA_i,\bX\bX^\top\right>)\bA_i\bX=-\frac{1}{m}\sum_{i=1}^m\left<\bA_i,\bSigma-\bX\bX^\top\right>\cdot \bA_i\bX.
\$
Using statistical concentration analysis \citep{recht2010guaranteed}, we know that the gradient $\nabla f(\bX)$ is approximately
\$
\nabla f(\bX)\approx\mathbb{E}\nabla f(\bX)=-(\bSigma-\bX\bX^\top)\bX.
\$
Thus, the matrix sensing is related to the following population version:
\$
\bX_t=\bX_{t-1}+\eta(\bSigma-\bX_{t-1}\bX_{t-1}^\top)\bX_{t-1},
\$
which is the GD update rule \eqref{equ:9} for matrix factorization problems. Many works \citep{recht2010guaranteed,li2018algorithmic,8357489,zhu2021global} build on the intuition that for sufficiently large $m$, matrix sensing approximately reduces to matrix factorization. These works rely on the restricted isometry property (RIP) established in \citet{recht2010guaranteed}. We can use this property and our analysis in Section \ref{sec:3.1} to derive the local linear convergence in the matrix sensing problem. However, for the case of large initialization, RIP does not seem to be sufficient. Therefore, extending matrix factorization to matrix sensing requires further statistical analysis. 



\subsection{Invariance to orthogonal rotations}\label{sec:a2-apd}

In this section, we
elaborate why there is no loss of generality to assume that $\bSigma$ is a diagonal matrix. Specifically, when $\bSigma$ is a general symmetric positive semidefinite matrix, we can write $\bSigma=\bU\bLambda \bU^\top$ by the eigen-decomposition, where $\bU\in\mathbb{R}^{d\times d}$ is an orthogonal matrix and $\bLambda$ is a diagonal matrix. Suppose the GD sequence is given by
\$
\bX_t=\bX_{t-1}+\eta(\bSigma-\bX_{t-1}\bX_{t-1}^\top)\bX_{t-1}.
\$
Then we consider the transformation $\bY_{t}=\bU^\top\bX_t$, which leads to
\$
\bY_t=\bY_{t-1}+\eta(\bLambda-\bY_{t-1}\bY_{t-1}^\top)\bY_{t-1}.
\$
This reduces to the case where $\bLambda$ is diagonal. In addition, the convergences of $\bY_t$ and $\bX_t$ are associated, and the random initialization of $\bY_0$ and $\bX_0$ are associated up to an orthogonal matrix. Consequently, there is no loss of generality to assume that $\bSigma$ is diagonal.

\section{Proof of Theorem \ref{thm:1}}

Our proof of Theorem \ref{thm:1} consists of three steps. 
\begin{itemize}
    \item First, we show that $\cR$ is an absorbing region for GD. Here a set is regarded as an absorbing set if the GD sequence remains within the set after its first entrance. 
    \item Next, we show that $\sigma_1(\bJ_t)$ converges to zero at a linear rate, employing an SNR argument. 
    \item Finally, we establish the linear convergence to the global minima.
\end{itemize}
Before diving deeper, we first write down the update rules for $\bU_t$ and $\bJ_t$. By \eqref{equ:9}, we have 
\#
\bU_{t+1}&=\bU_t+\eta\bLambda_r\bU_t-\eta\bU_t\bX_t^\top\bX_t,\label{equ:U}\\
\bJ_{t+1}&=\bJ_t+\eta\bLambda_{\rm res}\bJ_t-\eta\bJ_t\bX_t^\top\bX_t,\label{equ:J}
\#
where $\bLambda_r=\diag(\lambda_1,\ldots,\lambda_r)$ and $\bLambda_{\rm res}=\diag(\lambda_{r+1},\ldots,\lambda_{d})$. Note that $\bSigma_r=\diag(\bLambda_r,\zero)$. 


\subsection{The GD sequence remains in $\gR$}

Lemma \ref{lma:2.2} shows that $\cR$ is an absorbing region for GD.

\begin{lemma}\label{lma:2.2}
Suppose $\eta\leq \frac{\Delta^2}{36\lambda_1^{3}}$ and $\bX_t\in\cR$. Then $\bX_{t'}\in\cR$ for all $t'\geq t$.
\end{lemma}

\begin{proof}
    This lemma is proved by induction. Suppose $\bX_t\in\cR$. 
    \begin{itemize}
        \item By Lemma \ref{lma:inv1} and $\sigma_1^2(\bX_t)\leq2\lambda_1$, we get $\sigma_1^2(\bX_{t+1})\leq2\lambda_1$.
        \item By Lemma \ref{lma:inv2}, $\sigma_1^2(\bX_{t})\leq 2\lambda_1$, and $\sigma_1^2(\bJ_t)\leq \lambda_r-\Delta/2$, we get $\sigma_1^2(\bJ_{t+1})\leq\lambda_r-\Delta/2$.
        \item By Lemma \ref{lma:inv3} and $\bX_t\in\cR$, we get $\sigma_r^2(\bU_{t+1})\geq\Delta/4$ and thus $\bX_{t+1}\in\cR$.
    \end{itemize}   
    By induction, we conclude that $\bX_{t'}\in\cR$ for all $t'\geq t.$
\end{proof}

\subsubsection{Technical lemmas}

In this section, we summarize technical lemmas used in the proof of Lemma \ref{lma:2.2}. 

Lemma \ref{lma:inv1} delineates the first category of absorbing sets for GD, denoted as 
\$
\cS_1=\{\bX\in\RR^{d\times r}\mid\sigma_1(\bX)\leq a\},
\$
valid for any $a\in[\sqrt{\lambda_1},1/\sqrt{3\eta}]$. 

\begin{lemma}\label{lma:inv1}
 
    Suppose $\eta\leq\frac{1}{3\lambda_1}$ and $a\in[\sqrt{\lambda_1},1/\sqrt{3\eta}]$. If $\sigma_1(\bX_t)\leq a$, then $\sigma_1(\bX_{t'})\leq a$, $\forall$ $t'\geq t$. 
\end{lemma}

\begin{proof}
	    Lemma \ref{lma:3.2} states that if $\sigma_1(\bX_t)\leq1/\sqrt{3\eta}$, then the following inequality holds 
	    \$
	    \sigma_1(\bX_{t+1})\leq(1+\eta\lambda_1-\eta\sigma_1^2(\bX_t))\cdot\sigma_1(\bX_t).
	    \$
     \begin{itemize}
         \item If $\sqrt{\lambda_1}\leq\sigma_1(\bX_t)\leq a$, the above inequality implies that $\sigma_1(\bX_{t+1})\leq\sigma_1(\bX_t)\leq a$.
         \item If $\sigma_1(\bX_t)\leq\sqrt{\lambda_1}\leq a$, it follows that
		\$
		\sigma_1(\bX_{t+1})\leq (1+\eta\lambda_1-\eta\lambda_1)\sqrt{\lambda_1}\leq a.
		\$ 
        This uses the fact that $g_1(s)=(1+\eta\lambda_1-\eta s^2)s$ is increasing on $[0,1/\sqrt{3\eta}]$. 
     \end{itemize}
         By induction, we have $\sigma_1(\bX_{t'})\leq a$ for all $t'\geq t$.
	\end{proof}

Lemma \ref{lma:inv2} demonstrates that if $\sigma_1(\bX_t)\leq\sqrt{2\lambda_1}$, $\sigma_1^2(\bJ_t)\leq a$, and $a\geq\lambda_{r+1}$, then $\sigma_1^2(\bJ_{t+1})\leq a$. Combining with Lemma \ref{lma:inv1}, it implies that
\$
\cS_2=\{\bX=\begin{pmatrix}
    \bU\\
    \bJ
\end{pmatrix}\in\RR^{d\times r}\mid\sigma_1(\bX)\leq\sqrt{2\lambda_1},\sigma_1^2(\bJ)\leq a\}
\$
is an absorbing set for GD, provided that $a\geq\lambda_{r+1}$ and $\eta\leq\frac{1}{12\lambda_1}$. Here $\bU$ and $\bJ$ are the top $r$ rows and the $(r+1)$-to-$d$-th rows of $\bX$ respectively. 

\begin{lemma}\label{lma:inv2}
   
    Suppose $\eta\leq\frac{1}{12\lambda_1}$, $\sigma_1^2(\bX_t)\leq2\lambda_1$, and $a\geq \lambda_{r+1}$. If $\sigma_1^2(\bJ_t)\leq a$, then $\sigma_1^2(\bJ_{t+1})\leq a$.
    
\end{lemma}

\begin{proof}
	    By Lemma \ref{lma:3.5}, we have
	    \$
	    \sigma_1(\bJ_{t+1})\leq(1+\eta(\lambda_{r+1}-\sigma_1^2(\bJ_t)) )\cdot\sigma_1(\bJ_t).
	    \$
     \begin{itemize}
         \item If $\lambda_{r+1}<\sigma_1^2(\bJ_{t})\leq a$, then it follows that $\sigma_1^2(\bJ_{t+1})\leq\sigma_1^2(\bJ_t)\leq a$.
         \item If $\sigma_1^2(\bJ_{t})\leq\lambda_{r+1}\leq a$, then
	    \$
	    \sigma_1^2(\bJ_{t+1})\leq(1+\eta(\lambda_{r+1}-\lambda_{r+1}))^2\lambda_{r+1}\leq a.
	    \$
     This uses the observation that $g_2(s)=(1+\eta(\lambda_{r+1}-s^2))s$ is increasing on $[0,1/\sqrt{3\eta}]$.
     \end{itemize}
	This concludes the proof.
	\end{proof}
	
Lemma \ref{lma:inv3} is the last piece needed to show that region $\cR$ is an absorbing set for GD.

\begin{lemma}\label{lma:inv3}
{
    Suppose $\eta\leq\frac{\Delta^2}{32\lambda_1^{3}}$, $\sigma_1(\bX_t)\leq\sqrt{2\lambda_1}$, and $\sigma_1^2(\bJ_t)\leq\lambda_r-\Delta/2$. If $\sigma_r^2(\bU_t)\geq \Delta/4$, then $\sigma_r^2(\bU_{t+1})\geq\Delta/4$.
}
\end{lemma}

\begin{proof}
	    Since $\eta\leq\frac{1}{32\lambda_1}$ and $\sigma_1^2(\bJ_t)\leq\lambda_{r}-\Delta/2$, by Lemma~\ref{lma:3.8a}, we have
	    \$
	    \sigma_r^2(\bU_{t+1})\geq(1+\eta\Delta-2\eta\sigma_r^2(\bU_t))\cdot \sigma_r^2(\bU_t)-4\eta^2\lambda_1^{3}.
	    \$
	    Since $g_3(s)=(1+\eta\Delta-2\eta s)s$ is increasing on $(-\infty,\frac{1}{4\eta}]$ and $\frac{\Delta}{4}\leq\sigma_r^2(\bU_t)\leq2\lambda_1\leq\frac{1}{4\eta}$, we have
	    \$
	    \sigma_{r}^2(\bU_{t+1})&\geq(1+\frac{\eta\Delta}{2})\cdot \frac{\Delta}{4}-4\eta^2\lambda_1^{3} \geq\frac{\Delta}{4},
	    \$
	    where the last inequality uses $\eta\leq\frac{\Delta^2}{32\lambda_1^{3}}$.
	\end{proof}
	

The following lemmas provide the singular value analysis used in the preceding lemmas and in the subsequent analysis. Lemma \ref{lma:3.2} establishes an upper bound for $\sigma_1(\bX_{t+1})$. 

\begin{lemma}
	\label{lma:3.2}
	    { If $\sigma_1(\bX_t)\leq 1/\sqrt{3\eta}$, then we have
	    \$
	    \sigma_1(\bX_{t+1})\leq(1+\eta\lambda_1-\eta\sigma_1^2(\bX_t))\cdot\sigma_1(\bX_t).%\label{equ:X1}
	    \$
	    }
	\end{lemma}

    \begin{proof}
	    By the singular value inequality and \eqref{equ:9}, 
	    \#
	    \sigma_1(\bX_{t+1})&\leq\sigma_1(\bX_t(\bI_r-\eta\bX_t^\top\bX_t))+\eta\sigma_1(\bSigma\bX_t)\notag\\
	    &\leq\sigma_1(\bX_t(\bI_r-\eta\bX_t^\top\bX_t))+\eta\lambda_1\sigma_1(\bX_t),\label{equ:3.8}
	    \#
	    where we use $\sigma_1(\bSigma)=\lambda_1$.
		Observe that all $r$ singular values of $\bX_t(\bI_r-\eta\bX_t^\top\bX_t)$ are given by
		\$
		(1-\eta\sigma_i^2(\bX_t))\cdot\sigma_i(\bX_t),\ i=1,\ldots,r,
		\$ 
		since $\eta\sigma_1^2(\bX_t)\leq 1$. The function $g_4(s)=(1-\eta s^2)s$ is increasing on $[0,1/\sqrt{3\eta}]$. Hence, the fact $0\leq\sigma_i(\bX_t)\leq\sigma_1(\bX_t)\leq1/\sqrt{3\eta}$ implies that
		\$
		\sigma_1(\bX_t(\bI_r-\eta\bX_t^\top\bX_t))=(1-\eta\sigma_1^2(\bX_t))\cdot\sigma_1(\bX_t).
		\$
        Substituting this equality into \eqref{equ:3.8}, we conclude the proof.
	\end{proof}

Lemma \ref{lma:3.5} gives an upper bound for $\sigma_1(\bJ_{t+1})$.

 \begin{lemma}
	\label{lma:3.5}
	    { Suppose $\eta \leq \frac{1}{12\lambda_1}$ and $\sigma_1(\bX_t)\leq\sqrt{2\lambda_1}$, then we have
	    \$
	    \sigma_1(\bJ_{t+1})\leq(1+\eta(\lambda_{r+1}-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t)) )\cdot\sigma_1(\bJ_t).
	    \$}
	\end{lemma}

 \begin{proof}
	   The update rule \eqref{equ:J} of $\bJ_{t+1}$ can be decomposed as follows:
	   \$
	   \bJ_{t+1}=\underbrace{\frac{1}{2}\bJ_t-\eta\bJ_t\bJ_t^\top\bJ_t}_{\bB}+\underbrace{(\frac{1}{4}\bI_{d-r}+\eta\bLambda_{\rm res})\bJ_t}_{\bC}+\underbrace{\bJ_t(\frac{1}{4}\bI_r-\eta\bU_t^\top\bU_t)}_{\bD}.
	   \$
	   By the singular value inequality,
	   \$
	   \sigma_1(\bJ_{t+1})\leq\sigma_1(\bB)+\sigma_1(\bC)+\sigma_1(\bD). 
	   \$
	   Observe that all singular values of $\bB$ are given by
	   \$
	   \sigma_i(\bJ_t)/2-\eta\sigma_i^3(\bJ_t),\quad i=1,\ldots,d-r.
	   \$
	   Since $g_5(s)=s/2-\eta s^3$ is increasing on $[0,1/\sqrt{6\eta}]$, the condition $\sigma_i(\bJ_t)\leq\sigma_1(\bJ_t)\leq\sqrt{2\lambda_1}\leq1/\sqrt{6\eta}$ implies that
	   \$
	   \sigma_1(\bB)=\sigma_1(\bJ_t)/2-\eta\sigma_1^3(\bJ_t).
	   \$
	   For the second term $\bC$, it follows from the singular value inequality that
	   \$
	   \sigma_1(\bC)\leq\sigma_1(\frac{1}{4}\bI_{d-r}+\eta\bLambda_{\rm res})\sigma_1(\bJ_t)\leq(1/4+\eta\lambda_{r+1})\sigma_1(\bJ_t),
	   \$
	   where the second inequality uses $\eta\sigma_1(\bLambda_{\rm res})\leq\eta\lambda_1\leq1/4.$ For the third term $\bD$, since $\eta\sigma_1^2(\bU_t)\leq2\eta\lambda_1\leq1/4$, we have
	   \$
	   \sigma_1(\bD)\leq(1/4-\eta\sigma_r^2(\bU_t))\sigma_1(\bJ_t).
	   \$
	   Finally, we conclude the proof by combining the analysis of $\bB,\bC,$ and $\bD$.
	\end{proof}

Lemma \ref{lma:3.8a} provides a lower bound for $\sigma_r^2(\bU_{t+1})$. 
 
	\begin{lemma}\label{lma:3.8a}
	    { 
     Suppose $\eta\leq\frac{1}{32\lambda_1}$ and $\sigma_1(\bX_t)\leq\sqrt{2\lambda_1}$, then we have
	    \$
	    \sigma_r^2(\bU_{t+1})\geq(1+2\eta(\lambda_r-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t)) )\cdot\sigma_r^2(\bU_t)-4\eta^2\lambda_1^{3}.
	    \$
	    }
\end{lemma}

\begin{proof}
	    Substituting the update rule \eqref{equ:U} of $\bU_{t+1}$ into $\bU_{t+1}\bU_{t+1}^\top$, we get 
	    \$
	    \bU_{t+1}\bU_{t+1}^\top&=(\bU_t-\eta\bU_t\bX_t^\top\bX_t+\eta\bLambda_r\bU_t)\cdot(\bU_t-\eta\bU_t\bX_t^\top\bX_t+\eta\bLambda_r\bU_t)^\top\\
	    &=\bB+\bC-\eta^2\bR_1+\eta^2\bR
	    \$
	    where
	    \$
	    \bB&=\bU_t(\frac{1}{2}\bI_r-2\eta\bX_t^\top\bX_t)\bU_t^\top,\\
	    \bC&=(\frac{1}{\sqrt{2}}\bI_r+\sqrt{2}\eta\bLambda_r)\bU_t\bU_t^\top(\frac{1}{\sqrt{2}}\bI_r+\sqrt{2}\eta\bLambda_r),\\
	    \bR_1&=2\bLambda_r\bU_t\bU_t^\top\bLambda_r,\\
	    \bR&=(\bLambda_r\bU_t-\bU_t\bX_t^\top\bX_t)(\bLambda_r\bU_t-\bU_t\bX_t^\top\bX_t)^\top.
	    \$
	    Here $\bB$ is positive semi-definite (PSD) since $2\eta\sigma_1^2(\bX_t)\leq4\eta\lambda_1\leq1/2$ and $\bC,\bR_1,\bR$ are all PSD. By the eigenvalue inequality and the equivalence between eigenvalues and singular values of a PSD matrix, we have
	    \#
	    \sigma_r^2(\bU_{t+1})&\geq\sigma_r(\bB)+\sigma_r(\bC)-\eta^2\sigma_1(\bR_1)+\eta^2\sigma_r(\bR)\notag\\
	    &\geq\sigma_r(\bB)+\sigma_r(\bC)-\eta^2\sigma_1(\bR_1)\label{equ:3.9}.
	    \#
	    For the first term $\bB$, we decompose it into two terms:
	    \$
	    \bB=\underbrace{\bU_t((\frac{1}{2}-2\eta\sigma_1^2(\bJ_t))\cdot\bI_r-2\eta\bU_t^\top\bU_t)\bU_t^\top}_{\bB_1}+2\eta\cdot\underbrace{\bU_t(\sigma_1^2(\bJ_t)\cdot\bI_r-\bJ_t^\top\bJ_t)\bU_t^\top}_{\bB_2}.
	    \$
	    The inequality $2\eta(\sigma_1^2(\bJ_t)+\sigma_1^2(\bU_t))\leq8\eta\lambda_1\leq1/2$ implies that $\bB_1$ is PSD. Since $\bB_2$ is also PSD, we have $\sigma_r(\bB)\geq\sigma_r(\bB_1)$.
	    To determine $\sigma_r(\bB_1)$, we write the singular values of $\bB_1$ as
	    \$
	    (\frac{1}{2}-2\eta\sigma_1^2(\bJ_t))\cdot\sigma_i^2(\bU_t)-2\eta\sigma_i^4(\bU_t),\ i=1,\ldots,r.
	    \$
	    Since $1/2-2\eta\sigma_1^2(\bJ_t)\geq1/4$, the function $g_6(s)=(1/2-2\eta\sigma_1^2(\bJ_t))s-2\eta s^2$ is increasing on $(-\infty,\frac{1}{16\eta}]$. Then the inequality $\sigma_i^2(\bU_t)\leq\sigma_1^2(\bU_t)\leq 2\lambda_1\leq\frac{1}{16\eta}$ implies that
	    \$
	    \sigma_r(\bB_1)=(\frac{1}{2}-2\eta(\sigma_1^2(\bJ_t)+\sigma_r^2(\bU_t)))\cdot\sigma_r^2(\bU_t).
	    \$
	    For the second term $\bC$, we have
	    \$
	    \sigma_r(\bC)\geq\sigma_r^2(\frac{1}{\sqrt{2}}\bI_r+\sqrt{2}\eta\bLambda_r)\sigma_r^2(\bU_t)\geq(\frac{1}{2}+2\eta\lambda_r)\sigma_r^2(\bU_t).
	    \$
	    For the third term $\bR_1$, since $\sigma_1^2(\bX_t)\leq 2\lambda_1$, we have
	    \$
	    \sigma_1(\bR_1)\leq4\lambda_1^{3}.
	    \$
	    Finally, substituting the analysis of $\bB,\bC,\bR_1$ into \eqref{equ:3.9} gives the desired result.
	\end{proof}
	
\subsection{$\sigma_1(\mathbf{J}_t)$ converges to zero linearly via an SNR argument}\label{sec:J}

Lemma \ref{lma:2.3} shows that if $\bX_0\in\cR$, then $\sigma_1(\bJ_t)$ will diminish to zero at a geometric rate. A key step of the analysis is to examine the SNR $\frac{\sigma_r^2(\bU_{t})}{\sigma_{1}^2(\bJ_t)}$. Our analysis extends the rank-one case in Section \ref{sec:2} to a general rank scenario.

\begin{lemma}\label{lma:2.3}
    Suppose $\eta\leq {\Delta^2}/({32\lambda_1^{3}})$ and $\bX_0\in\cR$. Then, for all $t\geq 0$, we have
    \$%\label{equ:J-converge}
	\frac{\sigma_1^2(\bJ_{t+1})}{\sigma_r^2(\bU_{t+1})}\leq(1-\eta\Delta/3) \cdot \frac{\sigma_1^2(\bJ_t)}{\sigma_r^2(\bU_t)}.
    \$
    Hence, $\sigma_1^2(\bJ_{t})\leq  {8\lambda_1^{2}}  (1-\eta\Delta/3)^t /{\Delta}$ for all $t$ and $\sigma_1^2(\bJ_t)<\epsilon$ after
    \$
    T_{\bJ}^\epsilon=\cO\left(\frac{3}{\eta\Delta}\log\frac{8\lambda_1^{2}}{\epsilon\Delta}\right) ~~\text{iterations.}
    \$
\end{lemma}

\begin{proof}
    By Lemma \ref{lma:2.2}, we have $\bX_t\in\cR$ for all $t\geq0$.
    Then by Lemma \ref{lma:3.5}, 
        \$
        \sigma_1^2(\bJ_{t+1})&\leq(1+2\eta(\lambda_{r+1}-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t))+16\eta^2\lambda_1^{2} )\cdot\sigma_1^2(\bJ_t)\\
        &\leq(1-\eta\Delta/2+2\eta(\lambda_r-\Delta/2-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t)))\cdot\sigma_1^2(\bJ_t),
        %&\leq(1-\eta\Delta+2\eta(\lambda_{r}^*-\Delta/2-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t))+10\eta^2\lambda_1^{*2})\cdot\sigma_1^2(\bJ_t)\\
        %&\leq(1-\eta\Delta/2+2\eta(\lambda_{r}^*-\Delta/2-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t)))\cdot\sigma_1^2(\bJ_t),
        \$
        where the second inequality follows from $\eta\leq\frac{\Delta}{32\lambda_1^{2}}$.
        By Lemma \ref{lma:3.8a}, 
        \$
        \sigma_r^2(\bU_{t+1})&\geq(1+\eta\Delta+2\eta(\lambda_{r}-\Delta/2-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t)))\cdot\sigma_r^2(\bU_t)-4\eta^2\lambda_1^{3}\\
        &\geq(1+\eta\Delta/2+2\eta(\lambda_{r}-\Delta/2-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t)))\cdot\sigma_r^2(\bU_t),
        \$
        where we use $\sigma_r^2(\bU_t)\geq\Delta/4$ and $\eta\leq\frac{\Delta^2}{32\lambda_1^{3}}$ in the second inequality. A combination of the above two inequalities gives that
        \$
        \frac{\sigma_1^2(\bJ_{t+1})}{\sigma_r^2(\bU_{t+1})}\leq\frac{1-\eta\Delta/2+2\eta(\lambda_{r}-\Delta/2-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t))}{1+\eta\Delta/2+2\eta(\lambda_{r}-\Delta/2-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t))}\cdot\frac{\sigma_1^2(\bJ_{t})}{\sigma_r^2(\bU_{t})}.
        \$
        Since the function $g_7(s)=\frac{1-\eta\Delta/2+s}{1+\eta\Delta/2+s}$ is increasing on $[-1/2,1/2]$, the condition $-1/2\leq2\eta(\lambda_{r}-\Delta/2-\sigma_1^2(\bJ_t)-\sigma_r^2(\bU_t))\leq1/2$ implies that
        \$
        \frac{\sigma_1^2(\bJ_{t+1})}{\sigma_r^2(\bU_{t+1})}\leq\frac{3/2-\eta\Delta/2}{3/2+\eta\Delta/2}\cdot\frac{\sigma_1^2(\bJ_{t})}{\sigma_r^2(\bU_{t})}\leq(1-\eta\Delta/3)\cdot\frac{\sigma_1^2(\bJ_{t})}{\sigma_r^2(\bU_{t})}.
        \$
        By induction, we have
        \$
        \sigma_1^2(\bJ_{t})&\leq(1-\eta\Delta/3)^t\cdot\sigma_r^2(\bU_{t})\frac{\sigma_1^2(\bJ_0)}{\sigma_r^2(\bU_0)}\leq(1-\eta\Delta/3)^t\cdot\frac{8\lambda_1^{2}}{\Delta},
        \$
        where the second inequality follows from $\sigma_r^2(\bU_t)\leq2\lambda_1$, $\sigma_1^2(\bJ_0)\leq\lambda_1$, and $\sigma_r^2(\bU_0)\geq\Delta/4$. Therefore, for any $\epsilon>0$, it takes at most $T_{\bJ}^{\epsilon}=\cO(\frac{3}{\eta\Delta}\log\frac{8\lambda_1^{2}}{\epsilon\Delta})$ iterations to have $\sigma_1^2(\bJ_t)\leq \epsilon$.
\end{proof}
    

\subsection{Final convergence}

For the convergence of $\bX_t\bX_t^\top$ to $\bSigma_r$, it remains to show that $\bU_t\bU_t^\top$ converges to $\bLambda_r$ fast, where $\bLambda_r=\diag(\lambda_1,\ldots,\lambda_r)$. Equivalently, it suffices to show that $\sigma_1(\bP_t)$ converges to zero linearly, where $\bP_t=\bLambda_r-\bU_t\bU_t^\top$. This is established in Lemma \ref{lma:2.4}. 

\begin{lemma}\label{lma:2.4}
    Suppose $\eta\leq {\Delta^2}/({36\lambda_1^{3}})$ and $\bX_0\in\cR$. Then, for all $t\geq0$, we have
	\$
	    \sigma_1(\bP_{t+1})\leq  \frac{100\lambda_1^{2}}{\eta\Delta^2}    (1-\eta\Delta/4)^{t+1}.
	\$
Hence, for any $\epsilon>0$, it takes $T_{\bP}^{\epsilon}=\cO\left(\frac{4}{\eta\Delta}\log\frac{100\lambda_1^{2}}{\eta\Delta^2\epsilon}\right)$ iterations to reach $\sigma_1(\bP_{t})\leq \epsilon$.
\end{lemma}

\begin{proof}
 By Lemma \ref{lma:2.2}, $\bX_t\in\cR$ for all $t\geq 0$.
	    Using the notation of $\bP_t$, \eqref{equ:U} can be rewritten as
	    \$
	    \bU_{t+1}=\bU_t+\eta\bP_t\bU_t-\eta\bU_t\bJ_t^\top\bJ_t.
	    \$
	    By direct calculation, we have
	    \$
	    \bP_{t+1}=(\bI_r-\eta\bU_t\bU_t^\top)\bP_t(\bI_r-\eta\bU_t\bU_t^\top)-\eta^2(\bP_t\bU_t\bU_t^\top\bP_t+\bU_t\bU_t^\top\bP_t\bU_t\bU_t^\top)+\bR_t,
	    \$
	    where
	    \$
	    \bR_t=\eta(\bI_r+\eta\bP_t)\bU_t\bJ_t^\top\bJ_t\bU_t^\top+\eta\bU_t\bJ_t^\top\bJ_t\bU_t^\top(\bI_r+\eta\bP_t)-\eta^2\bU_t(\bJ_t^\top\bJ_t)^2\bU_t^\top.
	    \$
	    By the singular value inequality, 
	    \$
	    \sigma_1(\bP_{t+1}) &\leq((1-\eta\Delta/4)^2+8\eta^2\lambda_1^{2})\cdot \sigma_1(\bP_t)+\sigma_1(\bR_t)\\
	    &\leq(1-\eta\Delta/4)\cdot \sigma_1(\bP_t)+\sigma_1(\bR_t),
	    \$
	    where we use $\Delta/4\leq\sigma_r^2(\bU_t)\leq\sigma_1^2(\bU_t)\leq2\lambda_1$ in the first inequality and $\eta\leq\frac{\Delta}{36\lambda_1^{2}}$ in the second inequality. For the remainder term $\bR_t$, by the singular value inequality and the condition $\eta\leq\frac{\Delta^2}{36\lambda_1^{3}}$, we have
	    \$
	    \sigma_1(\bR_t)\leq\sigma_1^2(\bJ_t)\leq(1-\eta\Delta/3)^t\cdot\frac{8\lambda_1^{2}}{\Delta},
	    \$
	    where the last inequality follows from Lemma \ref{lma:2.3}.
	 Then by induction, we have
  \$
  \frac{\sigma_1(\bP_{t+1})}{(1-\eta\Delta/4)^{t+1}}&\leq\frac{\sigma_1(\bP_t)}{(1-\eta\Delta/4)^t}+\left(\frac{1-\eta\Delta/3}{1-\eta\Delta/4}\right)^{t}\frac{8\lambda_1^{2}}{(1-\eta\Delta/4)\Delta}\\
  &\leq \sigma_1(\bP_0) + \sum_{i=1}^t\left(\frac{1-\eta\Delta/3}{1-\eta\Delta/4}\right)^{i}\frac{8\lambda_1^{2}}{(1-\eta\Delta/4)\Delta}\\
  &\leq \sigma_1(\bP_0)+\frac{96\lambda_1^{2}}{\eta\Delta^2}\leq \frac{100\lambda_1^{2}}{\eta\Delta^2},
  \$
  where the last inequality follows from $\sigma_1(\bP_0)\leq2\lambda_1$. Therefore, it takes $T_{\bP}^{\epsilon}=\cO(\frac{4}{\eta\Delta}\log\frac{100\lambda_1^{2}}{\eta\Delta^2\epsilon})$ iterations to achieve $\sigma_1(\bP_t)\leq \epsilon$.
\end{proof}

\subsection{Proof of Theorem \ref{thm:1}}

By combining Lemma \ref{lma:2.3} and Lemma \ref{lma:2.4}, we can prove Theorem \ref{thm:1}. 

\begin{proof}
    Observe that
\$
\norm{\bSigma_r-\bX_t\bX_t^\top}_{\rF}\leq\norm{\bP_t}_{\rF}+2\norm{\bJ_t\bX_t^\top}_{\rF}\leq r\sigma_1(\bP_t)+2r\sqrt{2\lambda_1}\sigma_1(\bJ_t),\quad \forall \bX_t\in\cR,
%\label{equ:error}
\$
where we use the fact that $\norm{\bA}_{\mathrm{F}}\leq r\sigma_1(\bA)$ for any rank-$r$ matrix $\bA$. Let
\$
T^\epsilon=\max\left\{T_{\bJ}^{\epsilon^2/(32r^2\lambda_1)},T_{\bP}^{\epsilon/(2r)}\right\}.
\$ 
Then, $\norm{\bSigma_r-\bX_t\bX_t^\top}_{\rF}\leq\epsilon$ for all $t\geq T^\epsilon$. Theorem \ref{thm:1} follows from $T^\epsilon=\cO(\frac{6}{\eta\Delta}\log\frac{200r\lambda_1^{2}}{\eta\Delta^2\epsilon})$.
\end{proof}

\section{Proof of Proposition \ref{prop:asp}}

\begin{proof}
    Consider $\bX$ with $\sigma_1(\bX)\leq\frac{1}{\sqrt{3\eta}}$. Let $\bX_t$ be the GD sequence initialized by $\bX$. By Corollary 2 of \citet{lee2019first}, we know that the GD sequence almost surely avoids the strict saddle points. By \citet{zhu2021global}, we know that all the saddle points are strict and all the local minima are global minima. Therefore, we conclude that the GD sequence converges to the global minima almost surely.

    Now it remains to show that Assumption \ref{asp:1} must hold if the GD sequence converges to the global minima. Indeed, if we suppose Assumption \ref{asp:1} does not hold, then the GD sequence will converge with $\lim_{t\to\infty}\sigma_1(\bu_{k,t})=0$ for some $k\leq r$. This means the GD sequence converges to a saddle point, since any stationary point with some $\bu_{k,t}=\zero$ ($k\leq r$) is a saddle point, rather than a global minimum. This leads to a contradiction.
\end{proof}


\section{Analysis of large initialization}

In this section, we will prove Theorem \ref{lma:r'} as well as the results in Section \ref{sec:proof}. Before delving further, we first write down the update rules of $\bu_{k,t}$ and $\bK_{k,t}$. Recall that $\bu_{k,t}$ and $\bK_{k,t}$ are the $k$-th and $(k+1)$-to-$d$-th rows of $\bX_t$. The update rules are given by
\#
\bu_{k,t+1}&=\bu_{k,t}+\eta \lambda_k\bu_{k,t}-\eta \bu_{k,t}\bX_t^\top\bX_t,\label{equ:u} \\
\bK_{k,t+1}&=\bK_{k,t}+\eta\bGamma_k\bK_{k,t}-\eta\bK_{k,t}\bX_t^\top\bX_t,\label{equ:K}
\#
where $\bGamma_{k}=\diag(\lambda_{k+1},\ldots,\lambda_d)$. We also remind readers that $\bu_{k,t}\in\RR^{1\times r}$ is a row vector. Moreover, we let $\bPi_{\bu_k,t}$ denote the projection matrix associated with $\bu_{k,t}$, that is,
\$
\bPi_{\bu_k,t}=\bu_{k,t}^\top(\bu_{k,t}\bu_{k,t}^\top)^{-1}\bu_{k,t}\in\RR^{r\times r}.
\$
Also, we let $\bG_{k,t}$ denote the first $k$ rows of $\bX_t$.


\subsection{Proofs for Section \ref{sec:proof}} 

In this section, we collect proofs related to the rank-two matrix approximation. 

\subsubsection{Proof of Lemma \ref{lma:1}}

\begin{proof}
Note that $t_{{\rm init},1}\leq T_1+T_{\bK}$, where 
\$
T_1=\min\{t\geq 0\mid \sigma_1^2(\bX_t)\leq 2\lambda_1\}
\$ 
is the first time when $\sigma_1^2(\bX_t)$ is smaller than $2\lambda_1$, and
\$
     T_{\bK}&=\min\{t\geq 0\mid \sigma_1^2(\bK_{k,t+T_1})\leq \lambda_k-\frac{3\Delta}{4},\forall k\leq r\}.
     \$
     To prove the lemma, it suffices to analyze $T_1$ and $T_{\bK}$ separately. 

First, we analyze $T_1$ as follows.
\begin{itemize}
    \item If $\sigma_1^2(\bX_0)\leq 2\lambda_1$, then $T_1=0$.
    \item If $2\lambda_1<\sigma_1^2(\bX_0)<1/(3\eta)$, then by Lemma \ref{lma:inv1}, $\sigma_1^2(\bX_t)\leq 1/(3\eta)$ for all $t$. Furthermore, it follows from Lemma \ref{lma:3.2} that
    \$
	    \sigma_1(\bX_{t+1})&\leq (1+\eta\lambda_1-\eta\sigma^2_1(\bX_t))\cdot\sigma_1(\bX_t)\\
	    &\leq(1-\eta\lambda_1)\cdot\sigma_1(\bX_t),\quad \forall t<T_1,
	    \$
     where the second inequality uses $\sigma_1^2(\bX_t)>2\lambda_1$ for all $t<T_1$. It implies that 
     \$
     \sigma_1(\bX_{t})\leq(1-\eta\lambda_1)^{t}\cdot\sigma_1(\bX_0)
     \$ 
     for all $t\leq T_1$ and 
     \$
     T_{1}=\cO\left(\frac{1}{\eta\lambda_1}\log\frac{\sigma_1(\bX_0)}{\sqrt{2\lambda_1}}\right).
     \$
\end{itemize}
 
     
     By Lemma \ref{lma:inv1}, we have $\sigma_1^2(\bX_t)\leq2\lambda_1$ for all $t\geq T_1$.
     
     Next, we analyze $T_{\bK}$ and the following quantities
     \$
     T_{\bK_{k}}&=\min\{t\geq 0\mid\sigma_1^2(\bK_{k,t+T_1})\leq\lambda_k-\frac{3\Delta}{4}\}.
     \$
     Recall that $\bK_{k,t}$ is the $(k+1)$-to-$d$-th rows of $\bX_t$. 
     Then by \eqref{equ:K}, we have
     %we show that $\sigma_1^2(\bK_{k})$ drops below $\lambda_k-\frac{3\Delta}{4}$ fast for $k\leq r$.
    %in additional $T_{\bK}=\cO\left(\frac{2}{\eta\Delta}\log\frac{8\lambda_1^*}{\Delta}\right)$ iterations for $k<r'$ or $k=r$. 
    %By \eqref{equ:K},
    \#
	    \bK_{k,t+1}&=\bK_{k,t}+\eta\bGamma_k\bK_{k,t}-\eta\bK_{k,t}\bX_t^\top\bX_t\notag\\
     &=\underbrace{\frac{1}{2}\bK_{k,t}-\eta\bK_{k,t}\bK_{k,t}^\top\bK_{k,t}}_{\bB}+\underbrace{(\frac{1}{4}\bI_{d-k}+\eta\bGamma_k)\bK_{k,t}}_{\bC}+\underbrace{\bK_{k,t}(\frac{1}{4}\bI_{r}-\eta\bG_{k,t}^\top\bG_{k,t})}_{\bD},\notag
	    \#
     where $\bGamma_k=\diag(\lambda_{k+1},\ldots,\lambda_d)$ and $\bG_{k,t}\in\RR^{k\times r}$ is the first $k$ rows of $\bX_t$. By the singular value inequality, we obtain
    \$
	    \sigma_1(\bK_{k,t+1})\leq\sigma_1(\bB)+\sigma_1(\bC)+\sigma_1(\bD).
	    \$
    For the first term $\bB$, similar to Lemma \ref{lma:3.5}, we can show that
	    \$
	    \sigma_1(\bB)=\sigma_1(\bK_{k,t})/2-\eta\sigma_1^3(\bK_{k,t}),\quad \forall t\geq T_1.
	    \$
For the second term $\bC$, by the singular value inequality,
	    \$
	    \sigma_1(\bC)\leq(\frac{1}{4}+\eta\lambda_{k+1})\cdot\sigma_1(\bK_{k,t}).
	    \$
     For the third term $\bD$, since $\bG_{k,t}^\top\bG_{k,t}$ is PSD and $\eta\sigma_1^2(\bG_{k,t})\leq \frac{1}{4}$ for all $t\geq T_1$, we have
	    \$
	    \sigma_1(\bD)\leq\sigma_1(\bK_{k,t})/4,\quad \forall t\geq T_1.
	    \$
     Combining,
     \#\label{equ:a4.1}
	    \sigma_1(\bK_{k,t+1})\leq(1+\eta\lambda_{k+1}-\eta\sigma_1^2(\bK_{k,t}))\cdot\sigma_1(\bK_{k,t}),\quad \forall t\geq T_1,\quad \forall k\leq r.
	\#
      Since $\lambda_{k+1}\leq \lambda_k -\Delta$ for $ k\leq r$, \eqref{equ:a4.1} implies that 
     \$
	    \sigma_1(\bK_{k,t+T_1+1})\leq(1-\eta\Delta/4)\cdot\sigma_1(\bK_{k,t+T_1}),\quad\forall t <T_{\bK_k},\quad \forall k\leq r.
	    \$
     Hence, $\sigma_1(\bK_{k,t+T_1})\leq(1-\eta\Delta/4)^{t}\cdot\sigma_1(\bK_{k,T_1})$ for all $t\leq T_{\bK_k}$. In particular, 
     \$
     T_{\bK_k}=\cO\left(\frac{2}{\eta\Delta}\log\frac{\sigma_1^2(\bK_{k,T_1})}{\lambda_k-\frac{3\Delta}{4}}\right)\textnormal{ and } T_{\bK}=\cO\left(\frac{2}{\eta\Delta}\log\frac{8\lambda_1}{\Delta}\right),
     \$
     where we use $\sigma_1^2(\bK_{k,T_1})\leq2\lambda_1$ and $\lambda_k-\frac{3\Delta}{4}\geq \frac{\Delta}{4}$. 
     
     Finally, similar to Lemmas \ref{lma:inv1} and \ref{lma:inv2}, for any $a\geq \lambda_{k+1}$, if $\sigma_1^2(\bK_{k,t+T_1})\leq a$, then $\sigma_1^2(\bK_{k,t'+T_1})\leq a$ for all $t'\geq t$. This implies that $\sigma_1^2(\bK_{k,t+T_1})\leq\lambda_k-\frac{3\Delta}{4}$ for all $t\geq T_{\bK}$ and all $k\leq r$.  
\end{proof}

\subsubsection{Proof of Lemma \ref{lma:2}}

\begin{proof}
    This lemma is a special case of Lemma \ref{lma:d1}, where we take $k=1$ and $t_{\rm init}=t_{{\rm init},1}$. Notice that $\bG_{0,t}=\zero$ and $\bX_t\in\cS$ for all $t\geq t_{{\rm init},1}$ by Lemma \ref{lma:1}. Thus, the conditions in Lemma \ref{lma:d1} trivially hold. Then Lemma~\ref{lma:2} immediately follows from Lemma \ref{lma:d1}.
\end{proof}

\subsubsection{Proof of Lemma \ref{lma:3}}

\begin{proof}
    The lemma is a special case of Lemma \ref{lma:d2} and Lemma \ref{lma:25}. In Lemma \ref{lma:d2}, we take $k=1$ and $t_{\rm init}=t_{{\rm init},1}+T_{\bu_1}$. In Lemma \ref{lma:25}, we take $k=1$ and $t_{\rm init}=t_{{\rm init},1}+T_{\bu_1}+t^*$.
\end{proof}

\subsubsection{Proof of Lemma \ref{lma:10}}

\begin{proof}
    This lemma is a special case of Lemma \ref{lma:d1}, where we take $k=2$ and $t_{\rm init}=t_1+t_1^*$. 
\end{proof}

%\subsection{Proofs for Section \ref{sec:4}: general rank matrix approximation}

%In this section, we present proofs for general rank matrix approximation.

 \subsection{Proof of Theorem \ref{lma:r'}}

 \begin{proof}
    %The first property follows from Lemma \ref{lma:1}. 
    To prove this theorem, we will use an inductive argument. Our induction hypotheses are listed below:
     
        \begin{itemize}
            
	        
	        \item H($k,1$). $\sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)\leq \sqrt{\frac{\Delta}{8}}\min\{\sigma_1(\bu_{k,t_{{\rm init},k}}),\sqrt{\frac{\Delta}{2}}\}\cdot (1-\eta\Delta/6)^{t-t_{{\rm init},k}}$ for all $t\geq t_{{\rm init},k}$.
	        
	        \item H($k,2$).  $T_{\bu_k}=\cO\left(\frac{4}{\eta\Delta}\log\frac{\Delta}{2\sigma_1^2(\bu_{k,t_{{\rm init},k}})}\right)$ and $\sigma_1^2(\bu_{k,t})\geq \frac{\Delta}{2}$ for all $t\geq t_{{\rm init},k}+T_{\bu_k}$.
         
	       \item H($k,3$). $\sigma_1(\bu_{k,t}\bK_{k,t}^\top)\leq(1-\eta\Delta/6)^{t-t_{k}}$ for all $t\geq t_{k}$.
	   \end{itemize}
    Note that H(1,1) trivially holds because $\bG_{0,t}=\zero$. Then we prove H($k,1$), H($k,2$), H($k,3$), H($k+1,1$) successively until H($r,3$). 

\begin{itemize}
    \item $\{\textnormal{H}(j,\cdot)\}_{j<k}$ + H($k,1$) $\to$ H($k,2$) 
    
\vspace{6pt}

    This follows from Lemma~\ref{lma:d1}, where we take $t_{\rm init}=t_{{\rm init},k}$.

    \vspace{6pt}
    
\item $\{\textnormal{H}(j,\cdot)\}_{j<k}$ + H($k,1$) + H($k,2$) $\to$ H($k,3$) 

\vspace{6pt}

This follows from Lemma \ref{lma:d2}, where we take $t_{\rm init}=t_{{\rm init},k}+T_{\bu_k}$. 

\vspace{6pt}

\item $\{\textnormal{H}(j,\cdot)\}_{j\leq k}$ $\to$ H($k+1,1$) 

\vspace{6pt}

By $\{\textnormal{H}(j,3)\}_{j\leq k}$,
\$
\sigma_1(\bu_{k+1,t}\bG_{k,t}^\top)\leq \sum_{j\leq k}\sigma_1(\bu_{j,t}\bK_{j,t}^\top)\leq r(1-\eta\Delta/6)^{t-t_{k}},
\$
for all $t\geq t_{k}$. By definition of $t_{k}^*$, we have
\$
r(1-\eta\Delta/6)^{t_{k}^*}\leq \sqrt{\frac{\Delta}{8}}\min\{\sigma_1(\bu_{k+1,t_{k}+t_{k}^*}),\sqrt{\frac{\Delta}{2}}\}.
\$ 
Then H($k+1,1$) follows from the definition $t_{{\rm init},k+1}=t_{k}+t_{k}^*$. 
\end{itemize}

By induction, H$(k,\cdot)$ holds for all $k\leq  r$.

For all $t\geq t_k$, \eqref{equ:pkt} follows from Lemma \ref{lma:25}, where $t_{{\rm init}}$ is taken as $t_k$.

For all $t\geq t_{{\rm init},r}+T_{\bu_r}$, we have $\sigma_1^2(\bu_{k,t})\geq \frac{\Delta}{2}$ for all $k\leq r$. Simultaneously,  
\$
\sum_{j\leq r}\sigma_1(\bu_{j,t}\bK_{j,t}^\top)\leq r(1-\eta\Delta/6)^{t-(t_{{\rm init},r}+T_{\bu_r}+t^*)}
\$
holds for all $t\geq t_{{\rm init},r}+T_{\bu_r}+t^*$.
Let $\bU_{t}$ be the first $r$ rows of $\bX_t$. Viewing $\bU_t\bU_t^\top$ as the sum of diagonal elements and off-diagonal elements, we find that 
\$
\sigma_r^2(\bU_t)\geq \Delta/2-r(1-\eta\Delta/6)^{t-(t_{{\rm init},r}+T_{\bu_r}+t^*)}
\$
for all $t\geq t_{{\rm init},r}+T_{\bu_r}+t^*$. Hence, $\sigma_r^2(\bU_t)\geq \Delta/4$ for all $t\geq t_{{\rm init},r}+T_{\bu_r}+t^* + t^{\sharp}$, where
\$
t^{\sharp}=\frac{\log(\Delta/(4r))}{\log(1-\eta\Delta/6)}.
\$
This implies that $\bX_t\in\cR$ for $t\geq t_{\cR}\coloneqq t_{{\rm init},r}+T_{\bu_r}+t^* + t^{\sharp}$.
%The sixth property is merely an application of Theorem \ref{thm:1}.

Property (2) is merely an application of Theorem \ref{thm:1}.
\end{proof}

\subsection{Proof of Theorem \ref{thm:8}}

\begin{proof}
    This property immediately follows from Theorem \ref{lma:r'}.
\end{proof}
 
\subsection{Technical lemmas}

This section collects technical lemmas that are used in previous sections. Let us recall that $\bu_{k,t}$ and $\bK_{k,t}$ are the $k$-th and the $(k+1)$-to-$d$-th rows of $\bX_t$ respectively. The projection matrix associated with $\bu_{k,t}$ is denoted by 
\$
\bPi_{\bu_k,t}=\bu_{k,t}^\top(\bu_{k,t}\bu_{k,t}^\top)^{-1}\bu_{k,t}.
\$  
The first $k$ rows of $\bX_t$ are denoted by $\bG_{k,t}$, and $\bG_{0,t}=\zero$ by definition. 

\subsubsection{Dynamics}

This subsection contains lemmas describing the dynamics of the GD sequence. 

Lemma \ref{lma:d1} shows that when $\sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)$ is sufficiently small, the signal term $\sigma_1^2(\bu_{k,t+1})$ can rise above $\Delta/2$ quickly. Moreover, as shown in Lemma \ref{lma:d1}, the term $\sigma_1^2(\bu_{k,t+1})$ will remain larger than $\Delta/2$. 
	
	\begin{lemma}\label{lma:d1}
  
    Suppose $\eta\leq\frac{1}{12\lambda_1}$, $\bX_{t}\in\gS$, and for some $t_{\rm init}\geq 0$ and $k\leq r$, the condition
    \$
    \sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)\leq \sqrt{\frac{\Delta}{8}}\min\{\sigma_1(\bu_{k,t_{\rm init}}),\sqrt{\frac{\Delta}{2}}\}\cdot (1-\eta\Delta/6)^{t-t_{\rm init}}
    \$ 
    holds for all $t\geq t_{\rm init}$. Then $\sigma_1^2(\bu_{k,t})\geq \frac{\Delta}{2}$ for all $t\geq 
t_{\rm init}+ T_{\bu_{k}}$, where
\$
T_{\bu_k}=\cO\left(\frac{4}{\eta\Delta}\log\frac{\Delta}{2\sigma_1^2(\bu_{k,t_{\rm init}})}\right).
\$
In addition, for all $t\geq t_{\rm init}$, we have
    \#\label{equ:d1}
    \sigma_1^2(\bu_{k,t+1})\geq(1+2\eta\lambda_k-\eta\Delta/4-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t}))\cdot\sigma_1^2(\bu_{k,t}),
    \#
    where $\bPi_{\bu_k,t}=\bu_{k,t}^\top(\bu_{k,t}\bu_{k,t}^\top)^{-1}\bu_{k,t}$ is the projection matrix associated with $\bu_{k,t}$.
\end{lemma}
\begin{proof}
    %\xccomment{In the previous line, we define $\bu_{k,t}$ a row vector? (yes)}
    First, we show that $\sigma_1^2(\bu_{k,t})\geq \min\{\sigma_1^2(\bu_{k,t_{\rm init}}),\frac{\Delta}{2}\}$ for all $t\geq t_{\rm init}$ by induction.
    
    This is true when $t=t_{\rm init}$. Now suppose $\sigma_1^2(\bu_{k,t})\geq\min\{\sigma_1^2(\bu_{k,t_{\rm init}}),\frac{\Delta}{2}\}$ for some $t\geq t_{\rm init}$. By assumption, $\sigma_1^2(\bu_{k,t}\bG_{k-1,t}^\top)\leq\frac{\Delta}{8}\min\{\sigma_1^2(\bu_{k,t_{\rm init}}),\frac{\Delta}{2}\}\leq \frac{\Delta}{8}\sigma_1^2(\bu_{k,t})$. Then by Lemma~\ref{lma:4.2} and $\bX_t\in\gS$, we have
    \#
    \sigma_1^2(\bu_{k,t+1})&\geq(1+2\eta\lambda_k-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t}))\cdot\sigma_1^2(\bu_{k,t})-\frac{\eta\Delta}{4} \sigma_1^2(\bu_{k,t})\label{equ:H2}\\
    &\geq(1+5\eta\Delta/4-2\eta\sigma_1^2(\bu_{k,t}))\cdot\sigma_1^2(\bu_{k,t}).\label{equ:H3}
    \#
    Then we consider two cases.
    \begin{itemize}
        \item If $\sigma_1^2(\bu_{k,t})\leq\frac{5\Delta}{8}$, then $\sigma_1^2(\bu_{k,t+1})\geq\sigma_1^2(\bu_{k,t})\geq\min\{\sigma_1^2(\bu_{k,t_{\rm init}}),\frac{\Delta}{2}\}$. 
        \item If $\sigma_1^2(\bu_{k,t})\geq\frac{5\Delta}{8}$, then 
        \$
        \sigma_1^2(\bu_{k,t+1})&\geq (1+\frac{5\eta\Delta}{4}-\frac{5\eta\Delta}{4})\cdot\frac{5\Delta}{8}=\frac{5\Delta}{8}\\
        &\geq\min\{\sigma_1^2(\bu_{k,t_{\rm init}}),\frac{\Delta}{2}\},
        \$
        where the first inequality uses the fact that $g_{8}(s)=(1+\frac{5\eta\Delta}{4}-2\eta s)s$ is increasing on $(-\infty,1/(4\eta)]$.
    \end{itemize}
    In both cases, we have $\sigma_1^2(\bu_{k,t+1})\geq \min\{\sigma^2_1(\bu_{k,t_{\rm init}}),\frac{\Delta}{2}\}$. The claim then follows by induction. 
    
    Furthermore, the above analysis shows that inequalities \eqref{equ:H2} and \eqref{equ:H3} hold for all $t\geq t_{\rm init}$, which leads to the inequality \eqref{equ:d1}.
    
    Let 
    \$
    T_{\bu_k}=\min\{t\geq 0\mid\sigma_1^2(\bu_{k,t+t_{\rm init}})\geq \frac{\Delta}{2}\}.
    \$
    Then for $t<T_{\bu_k}$, we have $\sigma_1^2(\bu_{k,t+t_{\rm init}})<\frac{\Delta}{2}$ and by inequality \eqref{equ:H3},
    \$
    \sigma_1^2(\bu_{k,t+1+t_{\rm init}})\geq(1+\eta\Delta/4)\cdot\sigma_1^2(\bu_{k,t+t_{\rm init}}).
    \$
    Hence, for all $t\leq T_{\bu_{k}}$, we have
    \$
    \sigma_1^2(\bu_{k,t+t_{\rm init}})\geq (1+\eta\Delta/4)^t\cdot\sigma_1^2(\bu_{k,t_{\rm init}}),
    \$  
    and 
    \$
    T_{\bu_k}=\cO\left(\frac{4}{\eta\Delta}\log\frac{\Delta}{2\sigma_1^2(\bu_{k,t_{\rm init}})}\right).
    \$
    Finally, by inequality \eqref{equ:H3}, we have for any $a\leq \frac{5\Delta}{8}$, if $\sigma_1^2(\bu_{k,t})\geq a$, then $\sigma_1^2(\bu_{k,t+1})\geq a$. Thus, by induction, $\sigma_1^2(\bu_{k,t})\geq\frac{\Delta}{2}$ for all $t\geq t_{\rm init}+T_{\bu_{k}}$.
\end{proof}

Lemma \ref{lma:d2} shows that when the noise terms $\sigma_1(\bu_{j,t}\bK_{j,t}^\top)$ converge linearly to zero for all $j<k$ and the $k$-th signal term $\sigma_1^2(\bu_{k,t})\geq\frac{\Delta}{2}$, the noise term $\sigma_1(\bu_{k,t}\bK_{k,t}^\top)$ will also converge linearly to zero. The key component is to analyze the SNR 
\$
\frac{\sigma_1^2(\bu_{k,t})}{\sigma_1(\bu_{k,t}\bK_{k,t}^\top)}.
\$

\begin{lemma}\label{lma:d2}

    Suppose $\eta\leq\frac{\Delta}{100\lambda_1^{2}}$, $\bX_t\in\gS$, and for some $t_{\rm init}\geq 0$ and $k\leq r$, the conditions
    \#
    \sigma_1(\bu_{j,t}\bK_{j,t}^\top)&\leq(1-\eta\Delta/6)^{t-t_{\rm init}}, \quad \forall j<k,\label{cond:uk}\\
    \sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)&\leq\frac{\Delta}{4}(1-\eta\Delta/6)^{t-t_{\rm init}},\label{cond:ug}\\
    \sigma_1^2(\bu_{k,t})&\geq\frac{\Delta}{2}\label{cond:u}
    \#
    hold for all $t\geq t_{\rm init}$.
    %Then we have
    %\$
    %\frac{\sigma_1(\bu_{k,t}\bK_{k,t}^\top)}{\sigma_1^2(\bu_{k,t})}
    %\leq(1-\eta\Delta/6)^{t}\frac{144r^2}{\Delta^2},\quad \forall t\geq 0.
    %\$
    %Let $t^{*}$ be the smallest integer such that $(1-\eta\Delta/6)^{t^*}\leq\frac{\Delta^2}{144r^2}$.
    Then we have 
    \$
    \sigma_1(\bu_{k,t}\bK_{k,t}^\top)\leq(1-\eta\Delta/6)^{t-t_{\rm init}-t^*}
    \$
    for all $t\geq t_{\rm init}+t^*$, where 
    \$
    t^*=\log\left(\frac{\Delta^2}{8\lambda_1^3+144r^2\lambda_1}\right)/\log(1-\eta\Delta/6).
    \$
\end{lemma}

	\begin{proof}
By condition \eqref{cond:ug}, we can apply Lemma \ref{lma:d1} to obtain
	\$
    \sigma_1^2(\bu_{k,t+1})\geq(1+2\eta\lambda_k-\eta\Delta/4-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t}))\cdot\sigma_1^2(\bu_{k,t})
    \$
    for all $t\geq t_{\rm init}$.
    By Lemma \ref{lma:4.3}, we have
	\$
    &\ \sigma_1(\bu_{k,t+1}\bK_{k,t+1}^\top)\\
    \leq\ &(1+\eta\lambda_k+\eta\lambda_{k+1}-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t})+25\eta^2\lambda_1^{2})\cdot\sigma_1(\bu_{k,t}\bK_{k,t}^\top)\\
    &+3\eta\sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)\sigma_1(\bK_{k,t}\bG_{k-1,t}^\top)
    \$
    for all $t\geq t_{\rm init}$. 
    Divide both sides of the inequality by $\sigma_1^2(\bu_{k,t+1})$. By Lemma \ref{lma:d3} and $\sigma_1^2(\bu_{k,t+1})\geq\frac{\Delta}{2}$,  we have
    \#
    \frac{\sigma_1(\bu_{k,t+1}\bK_{k,t+1}^\top)}{\sigma_1^2(\bu_{k,t+1})}&\leq(1-\eta\Delta/6)\frac{\sigma_1(\bu_{k,t}\bK_{k,t}^\top)}{\sigma_1^2(\bu_{k,t})}+\frac{6\eta}{\Delta}\sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)\sigma_1(\bK_{k,t}\bG_{k-1,t}^\top)\label{equ:H4}
    \#
    for all $t\geq t_{\rm init}$.
    Observe that by condition \eqref{cond:uk} and definitions of $\bu_{k,t},\bK_{k,t},$ and $\bG_{k-1,t}$, we have
    \#
    \max\{\sigma_1(\bu_{k,t}\bG_{k-1,t}^\top),\sigma_1(\bK_{k,t}\bG_{k-1,t}^\top)\}\leq\sum_{j<k}\sigma_1(\bu_{j,t}\bK_{j,t}^\top)\leq r(1-\eta\Delta/6)^{t-t_{\rm init}}\label{equ:H5}
    \#
    for all $t\geq t_{\rm init}$. 
    Combining \eqref{equ:H4} and \eqref{equ:H5},
    \$
     \frac{\sigma_1(\bu_{k,t+1}\bK_{k,t+1}^\top)}{\sigma_1^2(\bu_{k,t+1})}
    \leq(1-\eta\Delta/6)\frac{\sigma_1(\bu_{k,t}\bK_{k,t}^\top)}{\sigma_1^2(\bu_{k,t})}+\frac{6\eta r^2}{\Delta}(1-\eta\Delta/6)^{2(t-t_{\rm init})}
    \$
    for all $t\geq t_{\rm init}$.
    Therefore, for all $t\geq t_{\rm init}$,
    \$
    {\rm Q}_{t+1}\leq (1-\eta\Delta/6)\cdot {\rm Q}_{t},
    \$
    where the quantity ${\rm Q}_t$ is given by
    \$ 
    {\rm Q}_t=\frac{\sigma_1(\bu_{k,t}\bK_{k,t}^\top)}{\sigma_1^2(\bu_{k,t})}+\frac{36r^2}{\Delta^2}(1-\eta\Delta/6)^{2(t-t_{\rm init})-1}.
    \$
    By induction, we have
    \$
    \frac{\sigma_1(\bu_{k,t}\bK_{k,t}^\top)}{\sigma_1^2(\bu_{k,t})}
    \leq(1-\eta\Delta/6)^{t-t_{\rm init}}\left(\frac{\sigma_1(\bu_{k,t_{\rm init}}\bK_{k,t_{\rm init}}^\top)}{\sigma_1^2(\bu_{k,t_{\rm init}})}+\frac{36r^2}{\Delta^2}(1-\eta\Delta/6)^{-1}\right).
    \$
    This implies that
    \$
    \sigma_1(\bu_{k,t}\bK_{k,t}^\top)\leq\frac{8\lambda_1^3+144r^2\lambda_1}{\Delta^2}\cdot(1-\eta\Delta/6)^{t-t_{\rm init}},
    \$
    where we use $1-\eta\Delta/6\geq 1/2$, $\sigma_1^2(\bX_t)\leq2\lambda_1$, and $\sigma_1^2(\bu_{k,t})\geq \frac{\Delta}{2}$ for all $t\geq t_{\rm init}$.
    By definition of $t^*$, we have 
    \$
    (1-\eta\Delta/6)^{t^*}\leq\frac{\Delta^2}{8\lambda_1^3+144r^2\lambda_1}.
    \$
    Thus, for all $t\geq t_{\rm init}+t^*$, we have 
    \$
    \sigma_1(\bu_{k,t}\bK_{k,t}^\top)\leq
    (1-\eta\Delta/6)^{t-t_{\rm init} -t^*},
    \$
    which concludes the proof.   
\end{proof}

Let $p_{k,t}=\lambda_k-\sigma_1^2(\bu_{k,t})$ be the error term associated with the $k$-th signal. Lemma \ref{lma:25} shows that when the noise terms $\sigma_1(\bu_{j,t}\bK_{j,t}^\top)$ converge linearly to zero for all $j\leq k$ and the $k$-th signal term $\sigma_1^2(\bu_{k,t})\geq \frac{\Delta}{2}$, this signal term will converge fast to $\lambda_k$. Specifically, the error term $|p_{k,t}|$ will converge to zero at a linear rate. The analysis is similar to Lemma \ref{lma:2.4}. 

\begin{lemma}\label{lma:25}
    Suppose $\eta\leq \frac{\Delta}{100\lambda_1^2}$, $\bX_t\in\cS$, and for some $t_{\rm init}\geq 0$ and $k\leq r$, the conditions
    \#
    \sigma_1(\bu_{j,t}\bK_{j,t}^\top)&\leq (1-\eta\Delta/6)^{t-t_{\rm init}},\quad \forall j\leq k,\label{cond:uk-2}\\
    \sigma_1^2(\bu_{k,t})&\geq\frac{\Delta}{2}
    \#
    hold for all $t\geq t_{\rm init}$. Then for all $t\geq t_{\rm init}$, we have
    \$
    |p_{k,t}|\leq (2\lambda_1+\frac{24r}{\eta\Delta})\cdot(1-\eta\Delta/8)^{t-t_{\rm init}},
    \$
    where $p_{k,t}=\lambda_k-\sigma_1^2(\bu_{k,t})$.
\end{lemma}

\begin{proof}
    Using the notation of $p_{k,t}$, \eqref{equ:u} can be rewritten as
    \$
    \bu_{k,t+1}=\bu_{k,t}+\eta p_{k,t}\bu_{k,t}-\eta\bu_{k,t}\bW_t,
    \$
    where
    \$
    \bW_t=\bG_{k-1,t}^\top\bG_{k-1,t}+\bK_{k,t}^\top\bK_{k,t}.
    \$
    By direct calculation, we have
    \$
    p_{k,t+1}=p_{k,t}\cdot((1-\eta\sigma_1^2(\bu_{k,t}))^2+\eta^2\lambda_k\sigma_1^2(\bu_{k,t})) + {\ \rm res}_{t}
    \$
    where
    \$
    {\rm res}_t=2\eta(1+\eta p_{k,t})\bu_{k,t}\bW_t\bu_{k,t}^\top-\eta^2\bu_{k,t}\bW_t^2\bu_{k,t}^\top.
    \$
    By the singular value inequality, for all $t\geq t_{\rm init}$, we have
    \#
    |p_{k,t+1}|&\leq|p_{k,t}|\cdot((1-\eta\sigma_1^2(\bu_{k,t}))^2+\eta^2\lambda_k\sigma_1^2(\bu_{k,t}))+|{\rm res}_t|\notag\\
    &\leq |p_{k,t}|\cdot((1-\eta\Delta/2)^2+2\eta^2\lambda_1^2)+|{\rm res}_t|\notag\\
    &\leq |p_{k,t}|\cdot(1-\eta\Delta/2)+|{\rm res}_t|,\label{equ:p}
    \#
    where the second inequality uses $\Delta/2\leq \sigma_1^2(\bu_{k,t})\leq 2\lambda_1$ and the third inequality uses $\eta\leq \frac{\Delta}{100\lambda_1^2}$. Using $\eta\leq\frac{\Delta}{100\lambda_1^2}$ and $\sigma_1^2(\bX_t)\leq 2\lambda_1$, we have
    \$
    |{\rm res}_t|\leq \sum_{j\leq k}\sigma_1(\bu_{j,t}\bK_{j,t}^\top)\leq r(1-\eta\Delta/6)^{t-t_{\rm init}}
    \$
    for all $t\geq t_{\rm init}$. Substituting this into \eqref{equ:p}, we obtain
    \$
    |p_{k,t+1}|\leq |p_{k,t}|\cdot (1-\eta\Delta/2)+r(1-\eta\Delta/6)^{t-t_{\rm init}}\\
    \leq |p_{k,t}|\cdot (1-\eta\Delta/8)+r(1-\eta\Delta/6)^{t-t_{\rm init}}.
    \$
    This implies that for all $t\geq t_{\rm init}$, 
    \$
    Q_{t+1}\leq Q_t+\frac{r}{1-\eta\Delta/8}\left(\frac{1-\eta\Delta/6}{1-\eta\Delta/8}\right)^{t-t_{\rm init}},
    \$
    where 
    \$
    Q_t=\frac{|p_{k,t}|}{(1-\eta\Delta/8)^{t-t_{\rm init}}}.
    \$
    By induction, for all $t\geq t_{\rm init}$, we have
    \$
    Q_{t}&\leq Q_{t_{\rm init}}+\frac{r}{1-\eta\Delta/8}\sum_{i=0}^{t-1-t_{\rm init}}\left(\frac{1-\eta\Delta/6}{1-\eta\Delta/8}\right)^{i}\\
    &\leq |p_{k,t_{\rm init}}|+\frac{24r}{\eta\Delta}\\
    &\leq 2\lambda_1+\frac{24r}{\eta\Delta}.
    \$
    Hence, for all $t\geq t_{\rm init}$, we have 
    \$
    |p_{k,t}|\leq (2\lambda_1+\frac{24r}{\eta\Delta})\cdot (1-\eta\Delta/8)^{t-t_{\rm init}},
    \$
    which concludes the proof.
\end{proof}

\subsubsection{Technical calculations}

The following lemmas provide calculations related to an SNR argument, where the SNR refers to the ratio 
\$
\frac{\sigma_1^2(\bu_{k,t})}{\sigma_1(\bu_{k,t}\bK_{k,t}^\top)}.
\$
Recall that $\bu_{k,t}$ is the $k$-th row of $\bX_t$ and $\bK_{k,t}$ represents the $(k+1)$-to-$d$-th rows of $\bX_t$. Moreover, we recall that
\$
\bPi_{\bu_k,t}=\bu_{k,t}^\top(\bu_{k,t}\bu_{k,t}^\top)^{-1}\bu_{k,t}
\$ 
is the projection matrix associated with $\bu_{k,t}$.  $\bG_{k,t}$ collects the first $k$ rows of $\bX_t$.


Lemma \ref{lma:4.2} provides a lower bound on $\sigma_1^2(\bu_{k,t+1})$ in terms of the preceding iteration. 

\begin{lemma}\label{lma:4.2}
 
    For any $k$ and $t\geq 0$, we have
    \$
    \sigma_1^2(\bu_{k,t+1})\geq(1+2\eta\lambda_k-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t}))\cdot\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bu_{k,t}\bG_{k-1,t}^\top).\label{equ:uu}
    \$
    
\end{lemma}


	\begin{proof}
	    Substituting \eqref{equ:u} into $\sigma_1^2(\bu_{k,t+1})$ gives that
	    \$
	    \sigma_1^2(\bu_{k,t+1}) 
     =\ & \bu_{k,t+1}\bu_{k,t+1}^\top\\
     =\ &\bu_{k,t}(\bI_r+\eta\lambda_k\bI_r-\eta\bX_t^\top\bX_t)^2\bu_{k,t}^{\top}\\
	    =\ &\bu_{k,t}(\bI_r+2\eta\lambda_k\bI_r-2\eta\bX_t^\top\bX_t)\bu_{k,t}^\top+\eta^2\bR_{k,t}\\
	    =\ &\bu_{k,t}(\bI_r+2\eta\lambda_k\bI_r-2\eta\sigma_1^2(\bu_{k,t})\bI_r-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t})\bI_r-2\eta\bG_{k-1,t}^\top\bG_{k-1,t})\bu_{k,t}^\top\\
	    &+2\eta\bR_{k,t}'+\eta^2\bR_{k,t},
	    %&=(1+2\eta\lambda_k^*-2\eta\sigma_1^2(\bX_t\bPi_{\bu_k,t}))\bu_{k,t}\bu_{k,t}^\top+\eta\bR_t'+\eta^2\bR_t,
	    \$
	    where $\bR_{k,t}$ and $\bR_{k,t}'$ are non-negative real numbers given by
	    \$
	    \bR_{k,t}&=\bu_{k,t}(\lambda_k\bI_r-\bX_t^\top\bX_t)^2\bu_{k,t}^\top,\\
	    \bR_{k,t}'&=\bu_{k,t}(\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t})\bI_r-\bPi_{\bu_k,t}\bK_{k,t}^\top\bK_{k,t}\bPi_{\bu_k,t})\bu_{k,t}^\top.
	    \$
	    It then follows that
	    \$
	    \sigma_1^2(\bu_{k,t+1})\geq(1+2\eta\lambda_k-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t}))\cdot\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bu_{k,t}\bG_{k-1,t}^\top),
	    \$
	    which concludes the proof.
	\end{proof}
	
	Lemma \ref{lma:4.3} provides an upper bound on $\sigma_1(\bu_{k,t+1}\bK_{k,t+1}^\top)$ in terms of the preceding iteration. 


\begin{lemma}\label{lma:4.3}

    Suppose $\eta\leq\frac{1}{12\lambda_1}$ and $\sigma_1^2(\bX_t)\leq 2\lambda_1$. For any $k\leq r$, if $\sigma_1^2(\bu_{k,t})>0$, then we have
    \$
    &\ \sigma_1(\bu_{k,t+1}\bK_{k,t+1}^\top)\\
    \leq\ &\left(1+\eta\lambda_k+\eta\lambda_{k+1}-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t})+25\eta^2\lambda_1^{2}\right)\cdot\sigma_1(\bu_{k,t}\bK_{k,t}^\top)\notag\\
    &+3\eta\sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)\sigma_1(\bK_{k,t}\bG_{k-1,t}^\top).%\label{equ:uk}
    \$
\end{lemma}


	\begin{proof}
	    Substituting \eqref{equ:u} and \eqref{equ:K} into $\bu_{k,t+1}\bK_{k,t+1}^\top$ gives that
	    \$
	    \bu_{k,t+1}\bK_{k,t+1}^\top&=\bu_{k,t}\bK_{k,t}^\top+\eta\lambda_k\bu_{k,t}\bK_{k,t}^\top+\eta\bu_{k,t}\bK_{k,t}^\top\bGamma_k-2\eta\bu_{k,t}\bX_t^\top\bX_t\bK_{k,t}^\top+\eta^2\bE,\\
	    &=\bB+\bC-2\eta\bD+\eta^2\bE,
	    \$
	    where
	    \$
	    \bB&=\bu_{k,t}\bK_{k,t}^\top\left(\frac{1}{2}\bI_{d-k}-2\eta\bK_{k,t}\bPi_{\bu_k,t}\bK_{k,t}^\top\right),\\
	    \bC&=\bu_{k,t}\bK_{k,t}^\top\left(\frac{1}{2}\bI_{d-k}+\eta\lambda_k\bI_{d-k}-2\eta\sigma_1^2(\bu_{k,t})\bI_{d-k}+\eta\bGamma_k-2\eta\bK_{k,t}(\bI_r-\bPi_{\bu_k,t})\bK_{k,t}^\top\right),\\
	    \bD&=\bu_{k,t}\bG_{k-1,t}^\top\bG_{k-1,t}\bK_{k,t}^\top,\\
	    \bE&=\lambda_k\bu_{k,t}\bK_{k,t}^\top\bGamma_k-\bu_{k,t}\bX_t^\top\bX_t\bK_{k,t}^\top\bGamma_k-\lambda_k\bu_{k,t}\bX_t^\top\bX_t\bK_{k,t}^\top+\bu_{k,t}(\bX_t^\top\bX_t)^2\bK_{k,t}^\top.
	    \$
	    By the singular value inequality,
	    \$
	    \sigma_1(\bu_{k,t+1}\bK_{k,t+1}^\top)\leq\sigma_1(\bB)+\sigma_1(\bC)+2\eta\sigma_1(\bD)+\eta^2\sigma_1(\bE).
	    \$
	    For the first term $\bB$, observe that
	    \$
	    (\bu_{k,t}\bu_{k,t}^\top)^{-1/2}\bB&=(\bu_{k,t}\bu_{k,t}^\top)^{-1/2}\bu_{k,t}\bK_{k,t}^\top\left(\frac{1}{2}\bI_{d-k}-2\eta\bK_{k,t}\bPi_{\bu_k,t}\bK_{k,t}^\top\right)\\
        &=\left(1/2-2\eta\sigma_1^2\left((\bu_{k,t}\bu_{k,t}^\top)^{-1/2}\bu_{k,t}\bK_{k,t}^\top\right)\right)\cdot (\bu_{k,t}\bu_{k,t}^\top)^{-1/2}\bu_{k,t}\bK_{k,t}^\top\\
	    &=\left(1/2-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t})\right)\cdot (\bu_{k,t}\bu_{k,t}^\top)^{-1/2}\bu_{k,t}\bK_{k,t}^\top,
	    \$
	    where we use the equality $\sigma_1(\bK_{k,t}\bPi_{\bu_k,t})=\sigma_1((\bu_{k,t}\bu_{k,t}^\top)^{-1/2}\bu_{k,t}\bK_{k,t}^\top)$.
	    Thus,
	    \$
	    \sigma_1(\bB)=(1/2-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t}))\cdot\sigma_1(\bu_{k,t}\bK_{k,t}^\top).
	    \$
	    For the second term $\bC$, by the singular value inequality,
	    \$
	    &\sigma_1(\bC)\\
     \leq\ &\sigma_1\left(\frac{1}{2}\bI_{d-k}+\eta\lambda_k\bI_{d-k}-2\eta\sigma_1^2(\bu_{k,t})\bI_{d-k}+\eta\bGamma_k-2\eta\bK_{k,t}(\bI_r-\bPi_{\bu_k,t})\bK_{k,t}^\top\right)\cdot\sigma_1(\bu_{k,t}\bK_{k,t}^\top)\\
	    \leq\ &(1/2+\eta\lambda_k-2\eta\sigma_1^2(\bu_{k,t})+\eta\lambda_{k+1})\cdot\sigma_1(\bu_{k,t}\bK_{k,t}^\top).
	    \$
	    For the third term $\bD$,  $\sigma_1(\bD)\leq\sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)\sigma_1(\bK_{k,t}\bG_{k-1,t}^\top)$.
	    For the fourth term $\bE$, since $\sigma_1^2(\bX_t)\leq2\lambda_1$, we have
	    \$
	    \sigma_1(\bE)\leq25\lambda_1^{2}\sigma_1(\bu_{k,t}\bK_{k,t}^\top)+8\lambda_1 \sigma_1(\bu_{k,t}\bG_{k-1,t}^\top)\sigma_1(\bK_{k,t}\bG_{k-1,t}^\top).
	    \$
	    Combining, we prove the lemma.
	\end{proof}
	
Lemma \ref{lma:d3} provides an upper bound on a specific ratio, which is used in the proof of Lemma \ref{lma:d2}. It serves as a new variant of the SNR argument. 

\begin{lemma}\label{lma:d3}

    Suppose $\eta\leq\frac{\Delta}{100\lambda_1^{2}}$, $\sigma_1^2(\bX_t)\leq2\lambda_1$, and $\lambda_{k+1}\leq \lambda_k-\Delta$. Let
    \$
    {\rm ratio}\coloneqq\frac{1+\eta\lambda_k+\eta\lambda_{k+1}-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t})+25\eta^2\lambda_1^{2}}{1+2\eta\lambda_k-\eta\Delta/4-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t})}.
    \$
    Then ${\rm ratio}\leq 1-\eta\Delta/6$.
\end{lemma}
	\begin{proof}
 Since $\eta\leq\frac{\Delta}{100\lambda_1^{2}}$ and $\lambda_{k+1}\leq\lambda_k-\Delta$, we have 
 \$
 {\rm ratio}\leq \frac{1-\eta\Delta/4+s_0}{1+\eta\Delta/4+s_0},
 \$
 where 
 \$
 s_0=2\eta\lambda_k-\eta\Delta/2-2\eta\sigma_1^2(\bu_{k,t})-2\eta\sigma_1^2(\bK_{k,t}\bPi_{\bu_k,t})\in[-1/2,1/2].
 \$ 
 Since the function $g_9(s)=\frac{1-\eta\Delta/4+s}{1+\eta\Delta/4+s}$ is increasing on $[-1/2,1/2]$, we have
	    \$
	    {\rm ratio}\leq\frac{1-\eta\Delta/4+1/2}{1+\eta\Delta/4+1/2}\leq1-\eta\Delta/6,
	    \$
     which concludes the proof.
\end{proof}

%{\color{purple}

\section{Additional experiments}

In this section, we provide additional experiments to support and illustrate our theoretical results. 

\subsection{Rank-two matrix approximation}\label{sec:e1}

Our first extended experiment examines rank-two matrix approximation with varying dimension $d$ and initial magnitude $\varpi$. Specifically, we will choose $d$ from the set $\{1000, 2000, 4000\}$ and choose $\varpi$ from the set $\{0.001,0.5,2\}$. For each $d$, we set $\bSigma=\diag(\ba,\be)$, where $\ba\in\RR^r$ is a decreasing arithmetic sequence starting from 1 to 0.5 and $\be\in\RR^{d-r}$ is an arithmetic sequence transitioning from 0.3 to zero. Let $\bX_0=\varpi\bN_0$ with the entries of $\bN_0$ independently drawn from $\cN(0,\frac{1}{d})$. We compute the GD sequence $\bX_t$ with a step size of 0.1 and evaluate the errors $\norm{\bSigma_r-\bX_t\bX_t^\top}_{\rF}$, where $\bSigma_r=\diag(\ba,\zero)$ is the best rank-$r$ approximation to $\bSigma$. The error curves of GD for different settings are displayed in Figure \ref{fig:2}. 

Figure \ref{fig:2} demonstrates that all the error curves exhibit similar behavior. The only differences lie in the first stage. 
\begin{itemize}
    \item When we use a small $\varpi=0.001$, the error does not rapidly change at the beginning. This is because $\norm{\bX_t}_{\rF}$ is close to zero and the error $\norm{\bSigma_r-\bX_t\bX_t^\top}_{\rF}$ is approximately $\norm{\bSigma_r}_{\rF}$. This period of time corresponds to the second property of Theorem 6. 
    \item When we use $\varpi=2$, we find the error first drops rapidly from a large value to $\norm{\bSigma_r}_{\rF}$. This corresponds to Lemma \ref{lma:1} and the first property in Theorem \ref{lma:r'}. 
    \item When we use $\varpi=0.5$, the first stage nearly disappears. This means that $T_{\bu_1}$ in Theorem \ref{lma:r'} is small, especially compared with the case where $\varpi=0.001$. 
\end{itemize} 
In addition, we want to mention that if we use $\varpi=10$ to initialize the algorithm and keep other settings unchanged, then the GD sequence will diverge. This serves as a supplement to the above experimental results. 

\begin{figure}[t]
    \centering
    \includegraphics[width = 0.7 \textwidth]{MF_2.pdf}
    \caption{Error curves of GD, measured by $\norm{\bSigma_r-\bX_t\bX_t^\top}_{\rF}$, for rank-two matrix approximation. The columns represent different initial magnitudes $\varpi=0.001,0.5,2$. The rows represent different dimensions $d=1000,2000,4000$.}
    \label{fig:2}
\end{figure}

\begin{figure}[t]
    \centering
    \includegraphics[width = 0.7\textwidth]{MF_3.pdf}
    \caption{Error curves of GD, measured by $\norm{\bSigma_r-\bX_t\bX_t^\top}_{\rF}$, for general rank matrix approximation. The dimension $d$ is set as 1000. Different rows represent different rank $r$. Different columns represent different initial magnitudes $\varpi$.}
    \label{fig:3}
\end{figure}

\subsection{General rank matrix approximation}

Our second experiment examines general rank matrix approximation, where we fix dimension $d=1000$ and vary the rank $r$ across $\{2,6,10\}$. In addition, for each setting, we examine different initial magnitudes $\varpi\in\{0.001,0.5,2\}$. Our setting for $\bSigma$ is the same as before, that is, $\bSigma=\diag(\ba,\be)$ with $\ba\in\RR^{r}$ and $\be\in\RR^{d-r}$ being two arithmetic sequences. We initialize GD using $\bX_0=\varpi\bN_0$ and we compute the GD sequence and the errors $\norm{\bSigma_r-\bX_t\bX_t^\top}_{\rF}$. The results are displayed in Figure~\ref{fig:3}. 

As the results demonstrate, the effect of $\varpi$ is similar to that in Section \ref{sec:e1}. Moreover, we observe another interesting phenomenon that may need additional explanation. Figure \ref{fig:3} shows that the error curve for larger rank $r$ is smoother than the one for smaller rank $r$. Our explanation is that for larger rank $r$, the differences between successive eigenvalues are smaller. Thus, it is harder to distinguish the associated eigenvectors, and all the eigenvectors may be learned together. As a result, the error curve remains decreasing along the iterations.





\end{document}
