\section{Problem Setup and Preliminaries}
\label{sec:prelim}

% \textbf{Notation}. We use $\mathbb{E}\bbb{.}$ to denote expectation and $[n]$ for $\{1, \dots, n\}$. The $\ell^{2}$ norm for vectors is $\norm{.}_{2}$ or $\norm{.}$, the operator norm for matrices is $\normop{.}$ or $\norm{.}$, and the Frobenius norm for matrices is $\norm{.}_{F}$. $I \in \mathbb{R}^{d \times d}$ is the identity matrix, with $i^{th}$ column $e_{i}$. $\inner{A}{B} := \Tr(A^T B)$ is the matrix inner product. $\widetilde{O}$ and $\widetilde{\Omega}$ represent order notations with logarithmic factors. $a \lesssim b$  denotes $a \leq Cb$ for a universal positive constant $C > 0$. $\diag{a_{1}, a_{2}, \cdots, a_{d}} \in \R^{d \times d}$ represents a diagonal matrix with entries $\left\{a_i\right\}_{i \in [d]}$ on the diagonal. For vector $v \in \R^{d}$ and set $S \subseteq [d]$ with $|S| = k$, $v[S] \in \R^{k}$ denotes the vector constructed by selecting elements of $v$ indexed by $S$. 

\textbf{Notation.} For any positive integer $n$, let $[n]=\{1,\dots,n\}$. For a vector $v$, $\|v\| = \|v\|_2$ denotes its $\ell_2$ norm. For a matrix $A$, $\|A\| = \normop{A}$ is the operator norm, $\normf{A}$ is the Frobenius norm, and $\norm{A}_p$ is the Schatten $p$-norm of $A$, i.e., the $\ell_{p}$ norm of the vector of singular values of $A$. We define the \emph{two-to-infinity} norm $\norm{A}_{2\to \infty} := \sup_{\norm{x}_{2}=1}\norm{Ax}_{\infty}$. For a random matrix $M$ and $p,q \ge 1$, we define the norm $\vertiii{M}_{p,q} \defeq \bb{\E\bbb{\norm{M}_p^q}}^{1/q}$. Let $I\in\R^{d\times d}$ be the identity matrix with $i^{\textsf{th}}$ column $e_i$. Define the inner product of matrices as $\langle A,B\rangle=\Tr(A^T B)$. We use $\widetilde{O}$ and $\widetilde{\Omega}$ for bounds up to logarithmic factors and use $a\lesssim b$ to mean $a\le Cb$ for some universal constant $C > 0$. We write $\diag{a_1,\dots,a_d}$ for the diagonal matrix with diagonal entries $a_1,\dots,a_d$. For a vector $v\in\R^d$ and $S\subseteq [d]$ with $|S|=k$, $v[S]\in\R^k$ is the ``sub-vector'' of $v$ with its coordinates indexed by $S$.

% We start by defining subgaussianity for multivariate distributions.
% \begin{definition}
%     \label{definition:subgaussian}
%      A random mean-zero vector $X \in \mathbb{R}^{d}$ with covariance matrix $\Sigma$ is a $\sigma-$subgaussian random vector ($\sigma > 0$) if for all vectors $v \in \mathbb{R}^{d}$, we have 
%     $\E\bbb{\exp\bb{v^{T}X}} \leq \exp\bb{ \sigma^{2}v^T\Sigma v/2}$. Equivalently, $\exists \; L > 0$, such that $\forall p \geq 2$, $\bb{\E\bbb{|v^{T}X|^{p}}}^{\frac{1}{p}} \leq L\sigma \sqrt{p}\sqrt{v^T\Sigma v}$. \footnote{The results developed in this work follow if instead of subgaussianity, the moment bound holds $\forall \; p \leq 8$.}
% \end{definition}
% This definition of subgaussianity has been used in contemporary works on PCA and covariance estimation (See for example \cite{mendelson2020robust, jambulapati2020robust, diakonikolas2023nearly, kumar2024oja} and Theorem 4.7.1 in \cite{vershynin2018high}).
% \begin{assumption}[Subgaussianity]
%     \label{assumption:subg}
%      $\left\{X_{i}\right\}_{i \in [n]}$ are of independent and identically distributed $\sigma$-subgaussian vectors in $\mathbb{R}^{d}$ with covariance matrix $\Sigma := \E\bbb{X_{i}X_{i}^{T}}$.
%     \vspace{-5pt}
% \end{assumption}
\textbf{Data.} Let $\left\{X_i\right\}_{i\in[n]}$ be independent and identically distributed ($\iid$) mean-zero vectors sampled from the distribution $\mathcal{P}$ over $\mathbb{R}^{d}$ with covariance matrix $\Sigma := \E\bbb{X_{i}X_{i}^{\top}}$. Let $A_i := X_iX_i^{\top}$. Let $v_{1}, v_{2},  \ldots, v_{d}$ 
denote the eigenvectors of $\Sigma$ with corresponding eigenvalues $\lambda_{1} > \lambda_{2} \geq \ldots \geq \lambda_{d}$. Let $\vp := \bbb{v_{2}, v_{3}, \ldots, v_{d}} \in \mathbb{R}^{d \times \bb{d-1}}$.

% and {\rd $\Lambda_{2} := \diag{\lambda_{2}, \lambda_{3}, \ldots, \lambda_{d}}$.}

We operate under the following assumptions unless otherwise specified.

\begin{assumption}\label{assumption:bounded_moments}
For $X_{i} \sim \mathcal{P}$ and $A_i = X_iX_i^{\top}$, we assume that the following moment bounds hold, where $\sqrt{\Nu} \le \Mtwo \le \Mfour$:
\begin{gather}
    \normop{\E\bbb{\bb{A_i - \Sigma}^{2}}} \leq \Nu, \label{eq:Nu_assumptions}\\
    \E\bbb{\normop{A_i - \Sigma}^{2}}^{\frac{1}{2}} \leq \Mtwo, \qquad \E\bbb{\normop{A_i - \Sigma}^{4}}^{\frac{1}{4}} \leq \Mfour. \label{eq:momentbound_assumptions}
\end{gather}
% For notational convenience, define $\bar{\Nu} := \Nu + \lambda_{1}^{2}$.
\end{assumption}

\begin{assumption}\label{assumption:sample_size}
There exists a universal constant $\kappa > 5$ such that $d = o\bb{n^{\kappa}}$ and  $\frac{n}{\log\bb{n}} \geq 2\max\left\{\kappa, \frac{\kappa^{2}\Mtwo^{4}\log\bb{d}}{\bb{\eigengap}^{4}} \right\}$. 
\end{assumption}

Assumption~\ref{assumption:bounded_moments} imposes moment bounds on the matrices $A_i$, and Assumption~\ref{assumption:sample_size} allows the dimension $d$ to grow polynomially with the sample size $n$, while requiring a mild minimum number of samples for convergence. We note that the constraint $\kappa > 5$ is arbitrary and our algorithm works as long as $d = \mathsf{poly}(n)$. These assumptions are commonly used in the streaming PCA literature (see, e.g.,~\cite{jain2016streaming}).

\textbf{Oja's Algorithm with constant learning rate.} With a constant learning rate $\eta_{n}$ and initial vector $u_{0}$, Oja's algorithm~\citep{oja1982simplified} (denoted as $\Oja\bb{\left\{X_{t}\right\}_{t \in [n]}, \eta_n, u_{0}}$) performs the updates in Eq.~\eqref{eq:ojaupdate}. For all $t \in [n]$, define
\ba{B_{t} := \prod_{i=0}^{t-1}\bb{I + \eta_{n} X_{t-i}X_{t-i}^{T}};\qquad %\bb{I + \eta_{n} X_{t-1}X_{t-1}^{T}}\cdots \bb{I + \eta_{n} X_{1}X_{1}^{T}}, \; 
B_{0} = I,
\label{definition:Bn}
}
so that $u_{t} = B_{t}u_0/\norm{B_t u_0}_{2}$.