\section{Main Results}
\label{sec:main_results}

Recall the definition of Oja's algorithm with a constant learning rate, as defined in Section~\ref{sec:prelim}. For $\iid$ data $\mathcal{D}_{n} := \left\{X_{i}; X_{i} \in \R^{d}\right\}_{i \in [n]}$,  the learning rate $\eta_{n}$ defined in Lemma~\ref{lemma:learning_rate_choice}, and a random initial vector $u_0 \defeq g/\norm{g}$ where $g \sim \mathcal{N}(0, \id_d)$, define the \textit{Oja vector}
\ba{
 \voja(\mathcal{D}_{n}) := \Oja(\mathcal{D}_{n}, \eta_{n}, u_0). \label{eq:voja_def}
}
This is a random vector, with randomness over the data $\mathcal{D}_{n}$ as well as the initial vector $u_0$. While there are a myriad of works on the sin-squared error $1-(v_1^T\voja)^2$, there is, to our knowledge, no existing analysis on the concentration of the elements of the recovered vector around their population counterparts. One exception is~\citet{kumarsarkar2024sparse}, who showed that for sparse PCA, the elements of the Oja vector in the support of the true eigenvector are large, whereas those outside are small. However, these guarantees do not show concentration in our setting. We start our analysis with the Hoeffding decomposition of the matrix product (also see~\cite{lunde2021bootstrapping, vandervaart-asymptotic}).
The Hoeffding decomposition is a powerful tool that allows one to write the \textit{residual} of the Oja vector as
\ba{\label{eq:hoeffding}
\roja := \voja - \bb{v_1^{\top}\voja}v_1 = \Psi_{n,1} + \Res_n
}
where $\Psi_{n,1}$ is $\eta_n$ times a sum of independent but non-identically distributed random vectors and the residual $\Res_n$ is negligible compared to $\Psi_{n,1}$ (see Lemma~\ref{lemma:oja_error_decomposition} for details).

%This was the basis of the analysis of the asymptotic distribution of the sin-squared error in previous work. 
First, we show that the covariance matrix $\E[\Psi_{n,1}\Psi_{n,1}^T]$ of the dominant term in the residual converges to $\V$ when suitably scaled. Later, in Proposition~\ref{prop:main:clt} we will show that the distribution of the entries of $\roja$ is asymptotically normal with covariance matrix $\E[\Psi_{n,1}\Psi_{n,1}^T]/(\eta_n\bb{\eigengap})$.

\begin{lemma}[Asymptotic variance]\label{lemma:second_moment_matrix}
    Let 
    \bas{\widetilde{M} &:= \E\bbb{\vp^\top \bb{A_{1}-\Sigma}v_{1}v_{1}^\top\bb{A_{1}-\Sigma}\vp}, \\ d_k &:= 1-\bb{\frac{\lambda_1-\lambda_{k+1}}{1+\eta_n\lambda_1}} \eta_n. } 
    Then, the matrix $R^{(n)} \in \R^{(d-1) \times (d-1)}$ with entries 
    \bas{
         R^{(n)}_{k,l} &:= \frac{\widetilde{M}_{kl}}{(1+\eta_{n}\lambda_{1})^2} \bb{\frac{1 - \bb{d_kd_l}^{n}}{1 - d_kd_l}}, 
    }
    satisfies $\E\bbb{\Psi_{n,1}\Psi_{n,1}^{\top}} = \eta_n^2\vp R^{(n)}\vp^{\top}$. 
    
    Define the matrices $R_0 \in \R^{(d-1) \times (d-1)}$ and $ \mathbb{V} \in \R^{d \times d}$ as
    \ba{\label{eq:asympvar}
     (R_{0})_{k,\ell} \defeq \frac{\widetilde{M}_{k\ell}}{2\lambda_1-\lambda_{k+1}-\lambda_{\ell+1}}; \;\; \mathbb{V} \defeq \frac{1}{\eigengap}\vp R_0\vp^T. 
    }
    Then,
    \ba{
    \norm{\frac{1}{\eta_n\bb{\eigengap}}\E[\Psi_{n,1}\Psi_{n,1}^T]-\mathbb{V}}_F\lesssim \frac{\eta_n \lambda_{1}\Mtwo^{2}}{\bb{\lambda_1-\lambda_2}^{2}}. \label{eq:variance_diff_bound}
    }
     %$\frac{\eta R^{(n)}_{kl}-R_0(k,l)}{R_0(k,l)}=O(\eta\lambda_1)$.
\end{lemma}



This shows that suitably scaled, $\E[\Psi_{n,1}\Psi_{n,1}^T]$ converges to the matrix $\mathbb{V}$. Note that the scaling factor $\eta_n \bb{\eigengap} = \frac{\alpha \log n}{n}$ is independent of model parameters for the choice of $\eta_{n}$ defined in Lemma~\ref{lemma:learning_rate_choice}.

The next result establishes a Central Limit Theorem (CLT) for the subset of elements in the residual vector $r_{\text{oja}}$ with sufficiently large limiting variance. 
% also converges in distribution to 


% \begin{proposition}[\label{prop:clt}Entrywise CLT]
% WLOG, assume $\lambda_1\ge 1$, and 
% $\frac{\Mtwo^4n\eta_n^2}{\lambda_1(\lambda_1-\lambda_2)}\leq 1$.  Let $\{X_i\}_{i=1}^n\in \mathbb{R}^d$ be a random mean-zero vector with covariance matrix \( \Sigma \) such that for all vectors \( v \in \mathbb{R}^d \), we have
% \[
% \mathbb{E} \left[ \exp \left( v^T X_1 \right) \right] \leq \exp \left( \frac{\sigma^2 v^T \Sigma v}{2} \right).
% \]
% Let $\roja=\voja - (v_{1}^{\top}\voja)v_{1}$.   Consider the set $J=\{j:  \V_{jj}\geq  b\}$ for some $b>0$. Let $p=|J|$. Let $\mathcal{A}^{\text{re}}$ denote the class of all hyperrectangles in $\mathbb{R}^p$. That is, $\mathcal{A}^{\text{re}}$ consists of all sets $A$ of the form:
% \begin{equation}
%     A = \{u \in \mathbb{R}^p : a_j \leq u_j \leq b_j \text{ for all } j = 1, \dots, p\}
% \end{equation}
% for some real values $a_j$ and $b_j$ satisfying $-\infty \leq a_j \leq b_j \leq \infty$ for each $j = 1, \dots, p$.      Let $H_i = \vp\lambp^{n-i}\vp^{\top}\bb{A_{i}-\Sigma}v_{1}$. Let $Y_i\in \mathbb{R}^p$ denote independent mean zero normal vectors such that $$\E[Y_iY_i^T]=\frac{n\eta_n}{\lambda_1}\E[H_i[J]H_i[J]^T].$$ %Define $\rho(\mathcal{A}_{re})=$
%     We have {\small
%     \bas{
%     &\sup_{A\in \mathcal{A}_{re}}\left|P\bb{\tfrac{\roja[J]}{\sqrt{\lambda_1\eta_n}}\in A}-P\bb{\tfrac{S_Y}{\sqrt{n}}\in A}\right| = 
%     \widetilde{O}\bb{ \bb{\tfrac{\Mtwo^4n\eta_n^2}{\lambda_1(\lambda_1-\lambda_2)}}^{\frac{1}{6}}}
%     }}
% \end{proposition}

\begin{proposition}[CLT for a suitable subset of entries]\label{prop:main:clt}
Let $\{X_i\}_{i=1}^n$ be independent mean-zero random vectors with covariance matrix $\Sigma$ such that $\mathbb{E}\bigl[\exp(v^\top X_1)\bigr]\le\exp\bigl(\tfrac{\sigma^2\,v^\top\Sigma\,v}{2}\bigr)$ for all $v\in\mathbb{R}^d$, where $\sigma > 0$ is some constant. 

For all $i \in [n]$, let
\[
H_i := \frac{\sign\bb{v_{1}^{\top}u_0}}{(1+\eta_n \lambda_1)}\vp\,\lambp^{\,n-i}\vp^\top\bigl(A_i-\Sigma\bigr)v_1.
\]
Let $b > 0$ be a constant, and let $J \subseteq [d]$ be the set of coordinates with $\V_{jj} \ge b$. Let $p \defeq |J|$. 

Let $Y_i\in\mathbb{R}^p$ be independent mean-zero Gaussian vectors with covariance matrix $$\mathbb{E}[Y_iY_i^\top]=\frac{n\eta_n}{\eigengap}\,\mathbb{E}[H_i[J]H_i[J]^\top],$$ and let $S_{Y} := \sum_{i=1}^{n}Y_i$.

Suppose the learning rate $\eta_n$, set according to Lemma~\ref{lemma:learning_rate_choice}, satisfies
$\frac{\Mtwo^{2} \lambda_1 \eta_n}{\bb{\eigengap}^2}\lesssim b$. Then,
\bas{
& \sup_{A\in\mathcal{A}^{\text{re}}}\bigg|\Prob\Bigl(\frac{\roja[J]}{\sqrt{\bb{\eigengap}\eta_n}}\in A\Bigr)-\Prob\Bigl(\frac{S_Y}{\sqrt{n}}\in A\Bigr)\bigg| \\ &=\tilde{O}\bb{ \bb{\frac{\Mfour}{\eigengap }}^{1/3}n^{-1/6} + \bb{\frac{\Mtwo}{\eigengap}}^{1/2}n^{-1/8}},
}
where $\mathcal{A}^{\text{re}}$ is the collection of all hyperrectangles in $\mathbb{R}^p$, i.e., sets of the form $A=\{u\in\mathbb{R}^p : a_j\le u_j\le b_j\text{ for }j=1,\dots,p\}$, where each $a_j$ and $b_j$ belongs to $\mathbb{R}\cup\{-\infty,\infty\}$. Here,
$\tilde{O}$ hides logarithmic factors in $n$, $d$, and polynomial factors in $b$ and in model parameters $ \lambda_1,\eigengap, \Mtwo, \Mfour$. 
\end{proposition}

\begin{remark}
    Note that the first $n^{-1/6}$ term in the convergence rate arises from the high-dimensional CLT result by~\cite{ChernoCLT2015} applied to $\Psi_{n,1}$. %This bound does not match the Berry-Esseen rate in the lower-dimensional settings. However, even if we apply a low-dimensional Berry-Esseen type bound from~\cite{raic2019multivariate} to the $p$ coordinates and use the techniques from~\cite{bentkus1986dependence} and~\cite{gotze1991rate}, our rate is still limited by the error of the remaining higher-order terms ($\Res_n$ in equation~\ref{eq:hoeffding}) in the residual vector. 
    The main bottleneck is the $n^{-1/8}$ term, resulting from the higher-order terms of the Hoeffding decomposition ($\Res_n$ in equation~\ref{eq:hoeffding}). We note that the second term may be tightened by using better concentration bounds. 
    We point the reader to Proposition~\ref{prop:clt_appendix} in the Appendix for a complete statement and proof. 
\end{remark}


Proposition~\ref{prop:main:clt} establishes a Gaussian approximation of suitably scaled $\roja[J]$, where $J$ is a set of elements with large enough asymptotic variance. Our proof uses results from~\cite{chernozhukov2017detailed} on the Hájek projection~\eqref{eq:hoeffding} and bounds the effect of the remainder term by using Nazarov's Lemma~\citep{nazarov2003maximal} (Theorem~\ref{thm:Nazarov}). We use this to derive concentration bounds for all coordinates. The lower bound on the variance is crucial and comes from Nazarov's inequality. It is also a condition of the results in~\cite{chernozhukov2017detailed}. A simple observation here is that when $b_k \defeq \norm{e_{k}^{\top}\vp}_{2}$ is zero, i.e., $\Abs{v_1(k)}=1$, then $\V_{kk}=0$. Here, the CLT may not hold since the Hájek projection is zero, and the perturbation arises from some of the smaller error terms in the error decomposition. 

\begin{theorem}\label{thm:main:entrywise_concentration_bound} Let the learning rate $\eta_n$ be set according to Lemma~\ref{lemma:learning_rate_choice}. Further, for $X_i \sim \mathcal{P}, A_i = X_iX_i^{\top}$, let $\normop{A_i - \Sigma} \leq \Mone$ almost surely. Then, with probability at least $3/4$, uniformly for all $k \in [d]$, 
% \rd The lower order terms have poly dependence on $1/\delta$? \bk
\bas{
    \frac{\Abs{e_k^{\top}\roja}}{\sqrt{\eta_n\bb{\eigengap}}} &\lesssim \sqrt{\mathbb{V}_{kk}\log\bb{d}} + C b_k\sqrt{\frac{\log n}{n}},
    %&\eta_{n}b_k\bb{\Mone\log\bb{\frac{d}{\delta}} + \Mtwo\sqrt{\frac{\lambda_{1}}{\eigengap}}\sqrt{\log\bb{\frac{d}{\delta}}}} \\
    %&\quad\quad + b_{k}\bb{\frac{\sqrt{s_n\log\bb{\frac{1}{\delta}}}}{\delta^{2}} + \frac{1}{\delta^{\frac{3}{2}}}}\bb{\sqrt{d}\exp\bb{-\eta_{n}n\bb{\lambda_{1}-\lambda_{2}}} + \frac{\sqrt{\eta_{n}^{3}n}\Mtwo^{2}\log\bb{d}}{\sqrt{\lambda_{1}-\lambda_{2}}}} \\
    %&\quad\quad + \frac{b_k \eta_{n}^2n \Mtwo^2 \log d} {\sqrt{\delta}} + \frac{b_k\sqrt{s_n}\eta_n\sqrt{n}\Mtwo\log\bb{d}}{\sqrt{\delta}} 
}
where $b_k \defeq \norm{e_{k}^{\top}\vp}_{2}$, $\mathbb{V}$ is defined in Eq.~\eqref{eq:asympvar}, and $C$ is a constant that depends on $\lambda_1, \eigengap, \Mtwo,$ and $\Mone$.
% where $b_k := \norm{\vp^{\top}e_k}_{2}$,  $\widetilde{M} := \E\bbb{\vp^{\top}\bb{A_j-\Sigma}v_1v_1^{\top}\bb{A_j-\Sigma}^{\top}\vp}$ and $R_0 \in \R^{(d-1) \times (d-1)}$ with entires
%     \bas{
%     R_0(k,l) = \frac{\widetilde{M}_{k\ell}}{2\lambda_1-\lambda_{k+1}-\lambda_{\ell+1}}, k, l \in [d-1]
%     }.
\end{theorem}

\begin{remark}
The limiting marginal variances $\V_{kk}$ also appear in the finite-sample bound for the elements of the residual vector. Estimating these variances enables us to quantify the uncertainty associated with each component of $\hat{v}_1$, even when the sample size is finite.

\end{remark}
In Appendix~\ref{appendix:entrywise_error_bounds}, we provide a complete result with arbitrary failure probability $\delta$ in Lemma~\ref{lemma:entrywise_concentration_bound}. The above guarantee can be boosted to a high-probability one using geometric aggregation (see, e.g., Alg.~3 in~\cite{kumarsarkar2024sparse}).
%\end{remark}

\subsection{Uncertainty estimation}
\label{ssec:uncertainty_estimation}
Proposition~\ref{prop:main:clt} shows that the asymptotic variance of elements of the residual $\roja(i)$ is governed by the variance of the entries $\E[(e_i^T\Psi_{n,1})^2]$ of $\Psi_{n,1}$. We cannot directly get to $\Psi_{n,1}$ since we only observe $\voja$. If we could estimate $\roja$, it would give us an idea of the error. However, we do not know $v_1$, and so cannot directly access $\roja$. We alleviate this difficulty by using the following high-accuracy estimate of $v_1$ constructed using $N$ samples,
\ba{\label{eq:vtilde}
\vmain \gets \Oja(\mathcal{D}_{N}, \eta_{N}, u_0),
} where $N$ satisfies the bounds of Theorem~\ref{thm:high_prob_error_bound}.

We now provide a subsampling-based approach (Alg.~\ref{alg:variance_estimation}) to estimate $\E[(e_i^T\Psi_{n,1})^2]$ with high probability, allowing us to provide confidence intervals around the eigenvector elements. 
%We , and then rescaling the estimate by $\eta_b/\eta_n$, as indicated by Lemma~\ref{lemma:second_moment_matrix}.%as indicated by
%equation~\eqref{eq:per_coord_bound}.
Algorithm~\ref{alg:variance_estimation} takes as input the data $\{X_{i}\in \mathbb{R}^{d}\}_{i \in [n]}$, a failure probability $\delta$, and the proxy unit vector $\vmain$.
The $n$ samples are split into $m_1$ batches with $n/m_1$ samples each. Then, the ${\ell}^{\text{th}}$ batch of $n/m_1$ samples is further split into $m_2$ batches of size $B \defeq n/(m_1m_2)$ each. Oja vectors $\left\{\hat{v}_{\ell, j}\right\}_{j \in [m_2]}$ are computed on each of these $m_2$ batches, and the variance of the $k^{\mathsf{th}}$ coordinate is estimated as
\ba{
    \hat{\sigma}^{2}_{k, \ell} := \sum_{j \in [m_2]} \dfrac{\bb{e_k^{\top} \bb{\hat{v}_{\ell, j} - (\vmain^\top \hat{v}_{\ell, j})\vmain}}^2}{m_2}. \label{eq:def_sigma_hat_ell_main}
}
We will show that with a constant success probability, $\hat{\sigma}^{2}_{k, \ell}$ is close to the true variance of the corresponding coordinate. This is essentially the variance of a smaller dataset with scale $\eta_B$. To obtain a bound over all coordinates with an arbitrary failure probability, we take a median of the $m_1$ variances. For the final estimate $\hat{\gamma}_k$ of the diagonal element $\V_{kk}$ of $\V$, the median is scaled by a factor $1/\bb{\eta_B \bb{\eigengap}}$. In Theorem~\ref{thm:high_prob_error_bound}, we show that $\hat{\gamma}_k$ concentrates around $\V_{kk}$ (see~\eqref{eq:main_error_bound_all}). For elements with large $\V_{kk}$, and for appropriate sample size $N$ and batch size $B$, Theorem~\ref{thm:high_prob_error_bound} also provides multiplicative error guarantees for the variance estimate (see~\eqref{eq:main_error_bound_some}).

\begin{remark}
We are using an estimate of $\E[(e_k^T\Psi_{n,1})^2]$ to provide the confidence interval around $\hat{v}_1(k)$. Algorithm~\ref{alg:variance_estimation} requires an estimate $\tilde{v}$ of $v_1$ for computing the estimates $\hat{\sigma}^{2}_{k, \ell}$ in Line 11, which is provided as an input to the algorithm and assumed to satisfy $\tilde{v} \gets \Oja\bb{\mathcal{D}_N, \eta_N, z/\norm{z}_2}$ for $z \sim \mathcal{N}\bb{0, I}$. For large $N$, the error of approximating $v_1$ by $\vmain$ is small. In our experiments, we choose $N = n$ and obtain $\vmain$ by running the algorithm on the entire data.
% For unit vectors $\hat{v}$ and $\tilde{v}$, let $\alpha := \tilde{v}^{\top}\tilde{v}$. We know that $a$ such that $|\alpha| \geq $. Then it is straightforward to verify, 
% \bas{
%     \norm{\hat{v} - \alpha\tilde{v}}_{2}^{2} 
%     &\leq 1.1\norm{\hat{v} - \sign\bb{\alpha}\tilde{v}}_{2}^{2} + 10\sin^{2}\bb{\tilde{v}, v_1}
% }
% This shows that although the centering procedure in Line 11 of Algorithm~\ref{alg:variance_estimation} to compute the estimates $\hat{\sigma}$ center the approximate eigenvectors $\hat{v}$ around $\alpha \tilde{v}$.
\end{remark}

\input{variance_estimation_algorithm}

%Theorem~\ref{thm:high_prob_error_bound} provides guarantees for Algorithm~\ref{alg:variance_estimation}.

\begin{theorem}\label{thm:high_prob_error_bound}
Let $K$ be the set of indices $k \in [d]$ that satisfy 
\begin{align}
    N &= \tilde{\Omega}\bb{B/c_k^2} ~~\text{ and} \label{eq:N_lower_bound_main}\\
    B &= \tilde{\Omega}\bb{\bb{\frac{b_k}{c_k}}^{2}\bb{\frac{\Mtwo}{\lambda_1 - \lambda_2}}^{2}} \notag \\ &\;\;+  \tilde{\Omega}\bb{\bb{\frac{b_k}{c_k}}^4 \bb{\frac{\Mfour}{\Mtwo}}^4 + \frac{\lambda_1}{c_k^2 \bb{\eigengap}}}, \label{eq:B_lower_bound_main}
\end{align}
where $b_k := \norm{e_k^{\top}\vp}$, $c_k := \sqrt{\tfrac{\E\bbb{\bb{e_k^{\top}\Eone{B}}^{2}}}{\eta_B}\tfrac{\eigengap}{\Mtwo^{2}}}$, and $B, N$ are respectively the batch size and the number of samples used for the proxy estimate $\tilde{v}$ in Algorithm~\ref{alg:variance_estimation}. 

Then, with probability at least $1-\delta$, the output $\left \{ \hat{\gamma}_k \right \}_{k \in [d]}$ of Algorithm~\ref{alg:variance_estimation} satisfies
\begin{gather}
\Abs{\hat{\gamma}_{k} - 
    \V_{kk}} \lesssim \frac{\V_{kk}}{\sqrt{m}} + \tilde{O}\bb{\frac{B}{N} + \frac{1}{B^{1/2}}}  ~~\forall k \in [d], \text{ and} \label{eq:main_error_bound_all} \\
\Abs{\hat{\gamma}_{k} - \V_{kk}} \lesssim \frac{\V_{kk}}{\sqrt{m}} ~~\forall k \in K. \label{eq:main_error_bound_some}
\end{gather}
\end{theorem}
\begin{remark}
    The output of Algorithm~\ref{alg:variance_estimation} rescales the median of the variances by the quantity $\eta_B \bb{\eigengap} = \frac{\alpha \log B}{B}$. This is consistent with the entrywise concentration bounds in Theorem~\ref{thm:main:entrywise_concentration_bound} (which shows that the error in the $k^{\text{th}}$ entry is $\sqrt{\eta_n\bb{\eigengap} \V_{kk}}$, up to logarithmic terms) for a sufficiently large sample size and with Proposition~\ref{prop:main:clt} and Lemma~\ref{lemma:second_moment_matrix} (which show that the limiting variance of suitable entries of $\roja$ is $\eta_n\bb{\eigengap} \V_{kk}$). 
    % This is reflected in algorithm~\ref{alg:variance_estimation}, where the output rescales the median of the variances by the quantity $\frac{1}{\eta_B \bb{\eigengap}} = \frac{\alpha \log B}{B}$. 
\end{remark}

\begin{remark}Theorem~\ref{thm:main:entrywise_concentration_bound} provides bounds on the entries of the leading eigenvector. We believe our techniques can be generalized to provide uncertainty estimates for entries of the top-$k$ eigenvectors using deflation-based approaches (see, e.g.,~\cite{pmlr-v247-jambulapati24a}).
\end{remark}

Equation~\eqref{eq:main_error_bound_all} holds for all coordinates $k \in [d]$ and we show in the Appendix (see Remark~\ref{remark:prop2_higher_order}) that for the choice of $B$ and $N$ in Theorem~\ref{thm:high_prob_error_bound}, the higher order terms are indeed $o\bb{\frac{1}{\sqrt{m}}}$. Moreover, for any coordinate $k$ for which equations~\eqref{eq:N_lower_bound_main} and~\eqref{eq:B_lower_bound_main} hold, the lower order terms of equation~\eqref{eq:main_error_bound_all} are $O(\V_{kk}/\sqrt{m})$. This implies an $O(1/\sqrt{\log n})$-multiplicative guarantee on the error of $\hat{\gamma}_k$ like equation~\eqref{eq:main_error_bound_some}.

%\rd TODO : Add theorem to estimate $\sin^{2}$ error and bound the variance of the small guys with that. \bk



% 1. CLT for elements of the residual (or a uniform result)

% 2. Show that for a value of $t$, the expectation matches

% 3. Concentration (by using Bobby-type result)

% 4. See if this holds for linear regression. I think it does.