%\guannan{Say one of the key technical innovation is the analysis of Stage 1, so we put the proof of Thm 5.1 here. }
One of the key innovations of this work is the SVD-based framework we use to decouple the unstable subspace from the rest of the system. Therefore, we prove \Cref{thm:projection} here. After the system runs for time $T$, we record the states in an $n \times T$ matrix $D$ whose $t$-th column is defined as:
\begin{equation*}
    D(t) = x_t = A x_{t-1} + \eta_{t}.
\end{equation*}
We decompose $A$ based on the $E_u \oplus E_s$ decomposition. Suppose $E_u$ and $E_s$ are represented by their orthonormal bases $Q_1 \in \mathbb{R}^{n \times k}$ and $Q_2 \in \mathbb{R}^{n \times (n-k)}$, respectively, i.e.,
    $E_u = \col(Q_1)$, $E_s = \col(Q_2)$.
Let $Q = [Q_1, Q_2]$ (which is invertible as $A$ is diagonalizable), and let $R = [R_1^*, R_2^*]^*:= Q^{-1}$. Since $E_u$ and $E_s$ are both invariant with regard to $A$, we know there exists $N_1 \in \mathbb{R}^{k \times k}$, $N_2 \in \mathbb{R}^{(n-k) \times (n-k)}$, s.t.
\begin{equation*}
    AQ = Q
    \begin{bmatrix}
    N_1 & \\ & N_2    
    \end{bmatrix}
    \Leftrightarrow
    N := \begin{bmatrix}
    N_1 & \\ & N_2    
    \end{bmatrix}
    = RAQ .
\end{equation*}

We are now ready to prove Theorem \ref{thm:projection}.

\begin{proof}
Let $D = U \Sigma V^*$ denote the compressed singular value decomposition of $D$ and $\sigma_1 > \dots > \sigma_n$ denote its singular values. In this case, we have $U \in \mathbb{R}^{n \times \min\{n,T\}}$, $\Sigma \in \mathbb{R}^{\min\{n,T\} \times \min\{n,T\}}$, and $V \in \mathbb{R}^{T \times \min\{n,T\}}$. Moreover, denote $U = [u_1,\dots,u_n]$ and $V = [v_1,\dots,v_n]$.

Furthermore, we have the following equalities
\begin{equation*}
    D = QRD = Q\begin{bmatrix}
        R_1 D \\ R_2 D
    \end{bmatrix} 
    = Q \begin{bmatrix}
        D_1 \\ D_2
    \end{bmatrix}
    = \begin{bmatrix}
    Q_1 & Q_2
\end{bmatrix}
\begin{bmatrix}
    D_1 \\ 0
\end{bmatrix}
+ 
\begin{bmatrix}
    Q_1 & Q_2
\end{bmatrix}
\begin{bmatrix}
    0 \\ D_2
\end{bmatrix}
= Q_1 D_1 + Q_2 D_2.
\end{equation*}

Let 
\begin{equation*}
    \mathcal{D} = \begin{bmatrix}
         0 & (Q_1 D_1)^* \\
         Q_1 D_1 & 0
    \end{bmatrix}
    , \quad 
    J = \begin{bmatrix}
         0 & (Q_2 D_2)^* \\
         Q_2 D_2 & 0
    \end{bmatrix}
    ,\quad 
    \mathcal{D} + J
    = \begin{bmatrix}
         0 & D^* \\
         D & 0 
    \end{bmatrix}.
\end{equation*}
We can decompose $\mathcal{D}+J$ in the following form
\begin{equation*}
    \mathcal{D} + J = \begin{bmatrix}
        0 & V \Sigma U^* \\
        U \Sigma V^* & 0
    \end{bmatrix}
    = \frac{1}{2}\left(\begin{bmatrix}
        V \\ U
    \end{bmatrix}\Sigma
    \begin{bmatrix}
        V \\ U
    \end{bmatrix}^* 
    -
    \begin{bmatrix}
        V \\ -U
    \end{bmatrix}\Sigma
    \begin{bmatrix}
        V \\ -U
    \end{bmatrix}^* \right).
\end{equation*}
Therefore, we see that the eigenvalues of $\mathcal{D}+J$ are exactly $\{\pm \sigma_i\}$ with eigenvectors $[v_i^*, \pm u_i^*]^*$, respectively. Correspondingly, the top $k$ largest eigenvalues of $\mathcal{D}+J$ are the top $k$ largest singular values of $D$, or the square root of top $k$ largest eigenvalues of $DD^*$. 

Similarly, we use the compressed singular value decomposition $D_1 = U_1 \Sigma_1 V_1^*$, where $U_1 \in \mathbb{R}^{k \times k}, \Sigma_1 \in \mathbb{R}^{k \times k}, V_1 \in \mathbb{R}^{T \times k}$, and decompose $\mathcal{D}$ as follows:
\begin{align*}
    \mathcal{D} =& \begin{bmatrix}
        0 & V_1 \Sigma_1 U_1^* Q_1^* \\
        Q_1 U_1 \Sigma_1 V_1^* & 0
    \end{bmatrix}
    \\
    =& \frac{1}{2}\bigg(\begin{bmatrix}
        V_1 \Sigma_1 V_1^* & V_1 \Sigma_1 U_1^* Q_1^* \\
        Q_1 U_1 \Sigma_1 V_1^* & Q_1 U_1 \Sigma_1 U_1^* Q_1^*
    \end{bmatrix}
    -
    \begin{bmatrix}
        V_1 \Sigma_1 V_1^* & -V_1 \Sigma_1 U_1^* Q_1^* \\
        -Q_1 U_1 \Sigma_1 V_1^* & Q_1 U_1 \Sigma_1 U_1^* Q_1^*
    \end{bmatrix}
    \bigg)
    \\
    =& \frac{1}{2}\bigg(\begin{bmatrix}
        V_1 \Sigma_1 \\ Q_1 U_1 \Sigma_1 
    \end{bmatrix}
    \begin{bmatrix}
        V_1^* & U_1^*Q_1^*
    \end{bmatrix}-
    \begin{bmatrix}
        V_1 \Sigma_1 \\ -Q_1 U_1 \Sigma_1 
    \end{bmatrix}
    \begin{bmatrix}
        V_1^* & -U_1^*Q_1^*
    \end{bmatrix}\bigg)
    \\
    =& \frac{1}{2}\left(\begin{bmatrix}
        V_1 \\ Q_1 U_1
    \end{bmatrix}\Sigma_1
    \begin{bmatrix}
       V_1 \\ Q_1 U_1
    \end{bmatrix}^* 
    -
    \begin{bmatrix}
         V_1 \\ -Q_1 U_1
    \end{bmatrix}\Sigma_1
    \begin{bmatrix}
        V_1 \\ -Q_1 U_1
    \end{bmatrix}^* \right).
\end{align*}
We see that the top $k$ largest eigenvalues of $\mathcal{D}$ are the top $k$ largest singular values of $D_1$, denoted as $\hat{\sigma}_1,\dots,\hat{\sigma}_k$. 

Let $U^{(k)}$ and $V^{(k)}$ denote the submatrices containing the first $k$ columns of $U$ and $V$, respectively. Let $\Pi$ and $\Pi'$ denote the projections onto the eigenspaces associated with the $k$ largest eigenvalues of $\mathcal{D}+J$ and $\mathcal{D}$, respectively.

It is clear that
\begin{align*}
    \Pi &= \frac{1}{2}
    \begin{bmatrix}
        V^{(k)} \\ U^{(k)}
    \end{bmatrix}
    \begin{bmatrix}
        (V^{(k)})^* & (U^{(k)})^*
    \end{bmatrix}
    = \frac{1}{2}
    \begin{bmatrix}
        V^{(k)} (V^{(k)})^* & V^{(k)} (U^{(k)})^*\\
        U^{(k)} (V^{(k)})^* & U^{(k)} (U^{(k)})^*
    \end{bmatrix},
    \\
    \Pi' &= \frac{1}{2}
    \begin{bmatrix}
        V_1 \\ Q_1 U_1
    \end{bmatrix}
    \begin{bmatrix}
        V_1^* & U_1^* Q_1^*
    \end{bmatrix}
    = \frac{1}{2}
    \begin{bmatrix}
        V_1 V_1^* & V_1 U_1^*Q_1^*\\
        Q_1 U_1 V_1^* & Q_1 U_1 U_1^* Q_1^*
    \end{bmatrix}
    = \frac{1}{2}
    \begin{bmatrix}
        V_1 V_1^* & V_1 U_1^*Q_1^*\\
        Q_1 U_1 V_1^* & Q_1 Q_1^*
    \end{bmatrix}.
\end{align*}

By Davis-Kahan Theorem (see \citet{Davis-Kahan} and Appendix~\ref{Appendix:DKT}), we have
\begin{equation*}
    \norm{\Pi-\Pi'} \leq \frac{1}{2}\frac{\sqrt{2k}\norm{J}_2}{\hat{\sigma}_k - \sigma_{k+1}} \leq \frac{\sqrt{2k}\norm{Q_2 D_2}}{\hat{\sigma}_k - \sigma_{k+1}} 
    \leq \frac{\sqrt{2k}\norm{Q_2}\norm{D_2}}{\hat{\sigma}_k - \sigma_{k+1}} = \frac{\sqrt{2k}\norm{D_2}}{\hat{\sigma}_k - \sigma_{k+1}}.
\end{equation*}
Since $
    \widehat{\Pi}_1 = U^{(k)} (U^{(k)})^*,  \Pi_1 = Q_1 Q_1^* $, we have
\begin{equation*}
    \norm{\widehat{\Pi}_1 - \Pi_1} \leq \norm{\Pi - \Pi'}\leq \frac{\sqrt{2k}\norm{D_2}}{\hat{\sigma}_k - \sigma_{k+1}}.
\end{equation*}
We next show that $\hat{\sigma}_k = \Omega(|\lambda_k|^T)$, $\sigma_{k+1} = O(T)$ and $\norm{D_2} = O(T)$, based on which $\norm{\widehat{\Pi}_1 - \Pi_1} \leq \frac{O(T)}{\Omega(|\lambda_{k}|^T - T)} \rightarrow 0$. More formally, we have the following.

%By Lemma A.5. of \cite{Simchowitz18}, we get
%\begin{equation*}
%    \widetilde{D}\widetilde{D}^* \preceq \begin{bmatrix}
%        2D_1 D_1^* & 0 \\ 0 & 2D_2 D_2^*
%    \end{bmatrix}
%\end{equation*}

%Thus, we need to bound $\norm{D_1 D_1^*}$ and $\norm{D_2 D_2^*}_2$. 

\begin{lemma}
    \label{lemm:D1_bound_final}
    If
    \begin{equation}
    \label{eqn:part_T}
    T > \Theta\left(\frac{\log k - 2 \log \left(\frac{\gap}{k^{\frac{k}{2}+3}}\right) - 3\log \theta }{\log |\lambda_k|}\right)
\end{equation}
    is satisfied, with probability at least $1-4\theta$,
    \begin{equation*}
        D_1 D_1^* \succeq \frac{\pi|\lambda_k|^{2T}\theta^2}{4} \frac{\gap^2}{k^{k+6}} \frac{|\lambda_1|^2}{|\lambda_1|^2-1} ,
    \end{equation*}
    where we recall $\gap = \left|\prod_{\substack{m_1 \neq m_2, \\ m_1, m_2 \in \{1,\dots,k\}}}(\lambda_{m_1}^{-1} - \lambda_{m_2}^{-1})\right|$.
\end{lemma}
The proof of Lemma \ref{lemm:D1_bound_final} is delayed to Appendix \ref{Appendix:D1}.

For $D_2$, we have the following inequalities
\begin{equation}
\label{eqn:D2_bound}
    \norm{D_2}_2 \leq \sqrt{T}\norm{D_2}_1 
        \leq \sqrt{T} \sum_{i = k+1}^n \left(\sum_{j = 1}^T |\lambda_i|^j C\right) 
        \leq \sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right) .
\end{equation}

By \Cref{lemm:D1_bound_final} and \eqref{eqn:D2_bound}, in order to have $\norm{\widehat{\Pi}_1 - \Pi_1} < \epsilon$, we need
\begin{align}
    & \norm{\widehat{\Pi}_1 - \Pi_1} < \epsilon \notag
    \\
    \Leftarrow&\frac{\sqrt{2k}\sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)}{\frac{\sqrt{\pi}|\lambda_k|^{T}\theta}{2} \frac{\gap}{k^{\frac{k}{2}+3}} \sqrt{\frac{|\lambda_1|^2}{|\lambda_1|^2-1}} - 2\sqrt{2k}\sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)} 
    < \epsilon
    \notag
    \\
    \Leftarrow& \frac{2\sqrt{2k}k^{\frac{k}{2}+3}\sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)}{\sqrt{\pi}|\lambda_k|^{T}\theta \gap - 4\sqrt{2k}k^{\frac{k}{2}+3}\sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)} 
    < \epsilon
    \notag
    \\
    \Leftarrow& \frac{2\sqrt{2}k^{\frac{k+7}{2}}\sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)}{\frac{1}{2}\sqrt{\pi}|\lambda_k|^{T}\theta \gap} 
    < \epsilon
    \label{eqn:insertion_exp}
    \\
    \Leftarrow & 4\sqrt{2}k^{\frac{k+7}{2}}\sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right) 
    < \sqrt{\pi}|\lambda_k|^{T}\theta \gap\epsilon \notag 
    \\
    \Leftarrow & \frac{1}{2} \log T + \log\bigg(4\sqrt{2}k^{\frac{k+7}{2}} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)\bigg) 
    < T \log |\lambda_k| + \log \big(\sqrt{\pi}\theta \gap\epsilon\big) \notag 
    \\
    \Leftarrow & \frac{1}{2} T \log |\lambda_k| > \log \bigg(\frac{4\sqrt{2}k^{\frac{k+7}{2}} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)}{\sqrt{\pi}\theta \gap\epsilon}\bigg)
    \label{eqn:insertion2}
    \\
    \Leftarrow & T > \frac{2\log \bigg(\frac{4\sqrt{2}k^{\frac{k+7}{2}} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)}{\sqrt{\pi}\theta \gap\epsilon}\bigg)}{\log |\lambda_k|}
    \label{eqn:projection_intermediate}
\end{align}
where in \eqref{eqn:insertion_exp}, we require
\begin{align}
    &4\sqrt{2k}k^{\frac{k}{2}+3}\sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right) < \frac{1}{2} \sqrt{\pi}|\lambda_k|^{T}\theta \gap 
    \notag
    \\
    \Leftarrow& \frac{1}{2}\log T + \log\bigg(4\sqrt{2}k^{\frac{k+7}{2}} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)\bigg) 
    < T \log |\lambda_k| + \log(\frac{1}{2}\sqrt{\pi} \theta \gap) 
    \notag
    \\
    \Leftarrow & \frac{1}{2}T \log|\lambda_k| > \log\bigg(4\sqrt{2}k^{\frac{k+7}{2}} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)\bigg) 
    - \log(\frac{1}{2}\sqrt{\pi} \theta \gap)
    \label{eqn:insertion3}
    \\
     \Leftarrow & T > \frac{2\log\bigg(\frac{8\sqrt{2}k^{\frac{k+7}{2}} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)}{\sqrt{\pi} \theta \gap }\bigg)}{\log|\lambda_k|}
    \label{eqn:projection_requirement}
\end{align}
where in \eqref{eqn:insertion2} and \eqref{eqn:insertion3}, we need $T \log |\lambda_k| > \log T$.
In order to have $T \log |\lambda_k| > \log T$, define
\[
    f(T) := T \log |\lambda_k| - \log T.
\]
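At $T = \log |\lambda_k|$, we have $f(\log |\lambda_k|) = \left(\log |\lambda_k|\right)^2 - \log \log |\lambda_k| > 0$, and $f'(T) = \log |\lambda_k| - \frac{1}{T} > 0$ whenever $T > \frac{1}{\log |\lambda_k|}$.

Therefore, when $T > \log |\lambda_k|$ and $\log |\lambda_k| \geq 1$, we have $T \log |\lambda_k| > \log T$. If $\log |\lambda_k| < 1$, the inequality $T \log |\lambda_k| > \log T$ still holds for all sufficiently large $T$ (with a threshold depending only on $|\lambda_k|$), which does not change the order of the bound below since the eigenvalues are treated as constants.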

Combining \eqref{eqn:projection_intermediate}, \eqref{eqn:projection_requirement}, and $T > \log |\lambda_k|$ required above, we get
\begin{equation}
\label{eqn:T_complete}
    \begin{split}
        T &> \max \Bigg\{\frac{2\log\bigg(\frac{8\sqrt{2}k^{\frac{k+7}{2}} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)}{\sqrt{\pi} \theta \gap }\bigg)}{\log|\lambda_k|}, \frac{2\log \bigg(\frac{4\sqrt{2}k^{\frac{k+7}{2}} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right)}{\sqrt{\pi}\theta \gap\epsilon}\bigg)}{\log |\lambda_k|},  \log |\lambda_k|\Bigg\}.
    \end{split}
\end{equation}
Treating the eigenvalue terms and $\theta$ as constants, as stated in the theorem, for $\norm{\widehat{\Pi}_1 - \Pi_1} < \epsilon$ to hold, we need
\begin{equation}
\label{eqn:final_T}
    T > \Theta\left(k \log k + \log(n-k) - \log \epsilon - \log \gap\right).
\end{equation}
This concludes the proof.
\end{proof}


%\guannan{CAdd a conclusion session, point out future directions. }\ziyi{how does this look?}
