\section{Experiments}
In this section, we evaluate the performance of the Riemannian Gradient Descent (RGD) algorithm, as described in \cref{alg: RGD}, on Gaussian matrix sensing problems. We present phase transition diagrams to illustrate the relationship between sample complexity $m$ and the rank $r$ or condition number $\kappa$ of $\target$. Furthermore, we compare the efficiency of RGD with factorized gradient descent (GD) methods in ill-conditioned settings.

\paragraph{Phase Transition Diagram}
We study the phase transition behavior of RGD by systematically varying the rank $r$ and the number of measurements $m$ in Gaussian matrix sensing problems, with fixed dimensions ($d_1 = 60$, $d_2 = 80$) and condition number $\kappa = 2$. For each $(r, m)$ pair, we perform 20 independent trials. 
A trial is considered successful if the relative error satisfies $\|\bm{X}_{N} - \target\|_F / \|\target\|_F \leq 10^{-2}$ after $N = 100$ iterations.
This setup allows us to empirically estimate the success rate as a function of $m$ and $r$. 
\Cref{fig:phase_transition} (left) reveals a sharp phase transition, where the minimal sample complexity $m$ required for successful recovery increases linearly with the rank $r$.
\begin{figure*}[t]
    \centering
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figs/r_m.pdf}
        % \caption{
        %     Empirical recovery probability for each $(r, m)$ pair. The color gradient from black (failure) to white (success) highlights a sharp phase transition, indicating that the sample complexity $m$ increases linearly with the rank $r$.
        % }
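        % NOTE(review): the \caption for this minipage is commented out, so this label does not resolve to a (sub)figure number; restore a caption before referencing it.
        \label{fig:phase_transition_r}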
    \end{minipage}
    \hfill
    \begin{minipage}[b]{0.48\textwidth}
        \centering
        \includegraphics[width=\textwidth]{figs/kappa_m.pdf}
        % \caption{
        %     Empirical recovery probability for each $(\kappa, m)$ pair. The nearly horizontal boundary indicates that increasing the condition number $\kappa$ has little effect on the sample complexity $m$ required for recovery.
        % }
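        % NOTE(review): the \caption for this minipage is commented out, so this label does not resolve to a (sub)figure number; restore a caption before referencing it.
        \label{fig:phase_transition_kappa}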
    \end{minipage}
    \centering
        \caption{
            Phase transition diagrams for Gaussian matrix sensing: (left) $m$ vs.\ $r$; (right) $m$ vs.\ $\kappa$.
            Black indicates failure; white indicates success.
        }
    \label{fig:phase_transition}
\end{figure*}
We further examine how the condition number $\kappa$ affects the sample complexity $m$. Keeping the dimensions fixed as before and setting the rank $r = 10$, we vary $\kappa$ from $1$ to $280$. 
The nearly horizontal boundary in \cref{fig:phase_transition} (right) indicates that increasing the condition number $\kappa$ has little effect on the sample complexity $m$ required for successful recovery.
Explaining this empirical insensitivity may require new theoretical insights.


% \paragraph{$\kappa-$dependence}
\paragraph{Comparison with Factorized GD}
We also compare the convergence speed of RGD and factorized GD in ill-conditioned settings. We use square matrices ($d_1 = d_2 = 80$) with rank $r = 15$, $m = 13200$ measurements, and condition number $\kappa = 20$. The stepsize is set to $\mu = 1$ for RGD; for GD, we test both $\mu = 0.9$ (empirically optimal) and $\mu = 1$.
The ground truth $\target$ is positive semidefinite (PSD), following \citet{stoger_non-convex_2024}. As shown in \cref{fig:error_time}, RGD is stable and converges rapidly, while GD becomes unstable at the larger stepsize $\mu = 1$.

\begin{figure}
    \centering
    \includegraphics[width=0.48\textwidth]{figs/curve 2.png}
    \caption{
        Error versus time for RGD and factorized GD. GD with a large stepsize ($\mu = 1$, blue) oscillates, while RGD (dashed) is stable and efficient. GD with the empirically optimal stepsize ($\mu = 0.9$, red) is also shown.
    }
    \label{fig:error_time}
\end{figure}

%
% Similar empirical results have been reported in Section 3.1 of [1], where RGD-type methods recover low-rank matrices with $m$ scaling linearly in $r$ for both Gaussian and matrix completion problems. However, prior theoretical guarantees required $m = \Omega(r^2)$ even in the Gaussian case. Our work advances the theory by reducing this requirement to $m = \Omega(r)$.

% \paragraph{2. Fast Convergence of RGD}
% We further assess the convergence speed of RGD and compare it to factorized GD [2] in ill-conditioned scenarios.
%
% \textbf{Setup:}
% \begin{itemize}
%     \item Dimensions: $d_1 = d_2 = 80$, rank $r = 15$, $\kappa = 20$
%     \item Stepsizes: RGD ($\mu=1$), GD (optimal $\mu=0.9$ found empirically and $\mu=1$)
%     \item SPD ground truth $X_*$ for compatibility with [2]
% \end{itemize}
%
% \textbf{Results:}
% The [error-time curves](https://limewire.com/d/ctBla#VsKmUd4E6w) show:
% \begin{itemize}
%     \item GD becomes unstable at larger stepsizes (e.g., $\mu = 1$) in ill-conditioned cases
% \end{itemize}

