
After the system runs for time $T$, we record the state trajectory in an $n \times T$ matrix $D$ whose $t$-th column is defined as follows:
\begin{equation*}
    D(t) = x_t = A x_{t-1} + \eta_{t}
\end{equation*}


Similar to~\cite{LTI}, we decompose $A$ according to the $E_u \oplus E_s$-decomposition. Suppose $E_u$ and $E_s$ are represented by their orthonormal bases $Q_1 \in \mathbb{R}^{n \times k}$ and $Q_2 \in \mathbb{R}^{n \times (n-k)}$, respectively, i.e.,
    $E_u = \col(Q_1)$ and $E_s = \col(Q_2)$.
Let $Q = [Q_1, Q_2]$ (which is invertible as $A$ is diagonalizable), and let $R = [R_1', R_2']':= Q^{-1}$. Since $E_u$ and $E_s$ are both invariant under $A$, there exist $N_1 \in \mathbb{R}^{k \times k}$ and $N_2 \in \mathbb{R}^{(n-k) \times (n-k)}$ such that

\begin{equation*}
    AQ = Q
    \begin{bmatrix}
    N_1 & \\ & N_2    
    \end{bmatrix}
    \Leftrightarrow
    N := \begin{bmatrix}
    N_1 & \\ & N_2    
    \end{bmatrix}
    = RAQ .
\end{equation*}

We are now ready to prove Theorem~\ref{thm:projection}.

\begin{proof}[Proof of Theorem~\ref{thm:projection}]
We partition matrix $D$ into two sub-matrices $D_1,D_2$ as follows:
\begin{equation*}
    D = Q R D 
    = Q \begin{bmatrix}
        R_1 D \\ R_2 D
    \end{bmatrix}
    := Q \begin{bmatrix}
        D_1 \\ D_2
    \end{bmatrix} 
    := Q \widetilde{D}.
\end{equation*}
Therefore, we have
\begin{equation*}
    \begin{split}
        &\left(D'D\right)^{-1} \\
        =& \left(\begin{bmatrix}
            D_1' & D_2'
        \end{bmatrix}
        \begin{bmatrix}
            Q_1' Q_1 & Q_1'Q_2 \\ Q_2'Q_1 & Q_2'Q_2
        \end{bmatrix}
        \begin{bmatrix}
            D_1 \\ D_2
        \end{bmatrix}
        \right)^{-1}
        \\
        =& \big(D_1'D_1 + D_2'D_2 + D_1' Q_1' Q_2 D_2 + D_2'Q_2' Q_1 D_1\big)^{-1}
        \\
        =& \left(D_1'\left(I + \Upsilon\right)D_1\right)^{-1}
        \\
        =& (D_{1})_{\text{right}}^{-1} \left(\sum_{t=0}^\infty (-1)^t \Upsilon^t  \right) (D_1')_{\text{left}}^{-1} ,
    \end{split}
\end{equation*}

where

\begin{equation*}
    \begin{split}
        \Upsilon =& \left(D_1'\right)_{\text{left}}^{-1} \left( D_2'D_2\right)(D_1)_{\text{right}}^{-1} + Q_1'Q_2 D_2 (D_1)_{\text{right}}^{-1}
        \\
        &+ \left(D_1'\right)_{\text{left}}^{-1} D_2' Q_2' Q_1 ,
    \end{split}
\end{equation*}
where $(G)_{\text{left}}^{-1} := (G'G)^{-1}G'$ and $(G)_{\text{right}}^{-1} := G'(GG')^{-1}$ denote the left and right inverses of $G$, respectively. \guannan{provide precise formula?} \ziyi{done} \guannan{Then $(D_1')_{right}^{-1} = D_1(D_1'D_1)^{-1}$. However $D_1$ is $k\times T$ and is a wide matrix; so $(D_1' D_1)$ may not be invertible? }

\ziyi{Consider the following setting:
\begin{align*}
    D &= U \Sigma V' 
    \\
    D'D &= V (\Sigma' \Sigma) V' 
    \\
    DD' &= U (\Sigma \Sigma') U'
    \\
    D(D'D)^{-1}D' & = U\Sigma V' (V (\Sigma' \Sigma)V')^{-1} V \Sigma' U'
    \\
    & = U \Sigma (\Sigma' \Sigma)^{-1} \Sigma' U'
\end{align*}
 $\Sigma (\Sigma' \Sigma)^{-1} \Sigma'$ seems to be what we want?
}

Now, we express the projection matrix as follows:
\begin{equation}
\label{eqn:decompose_D}
    \begin{split}
        &D(D'D)^{-1}D' =
        Q\begin{bmatrix}
        D_1 \\ D_2
        \end{bmatrix}
        \left(D'D\right)^{-1}
        \begin{bmatrix}
        D_1' & D_2'
        \end{bmatrix}Q'
        \\
        =& Q\begin{bmatrix}
        D_1\left(D'D\right)^{-1}D_1' & D_1\left(D'D\right)^{-1}D_2'\\
        D_2\left(D'D\right)^{-1}D_1' & D_2\left(D'D\right)^{-1}D_2'
        \end{bmatrix}Q' .
    \end{split}
\end{equation}
It suffices to show that the $2 \times 2$ block matrix above is close to the projection $\hat{\Pi}$ given below:
\begin{equation*}
    \hat{\Pi} = Q
    \begin{bmatrix}
D_1 \\ 0
\end{bmatrix}
\left(D'D\right)^{-1}
\begin{bmatrix}
D_1' & 0
\end{bmatrix}Q'
=
Q
\begin{bmatrix}
\mathrm{I} & 0 \\ 0 & 0
\end{bmatrix}
Q' .
\end{equation*}
We will finish the proof by bounding the error of all four terms below:
\small
\begin{equation}\label{eqn:upper_left}
    \begin{split}
        &\norm{D_1\left(D'D\right)^{-1}D_1'- \text{I}}= 
     \norm{\sum_{t=1}^\infty (-1)^t \Upsilon^t} ,
    \end{split}
\end{equation}

\begin{equation}\label{eqn:lower_left}
    \begin{split}
        &\norm{D_2\left(D'D\right)^{-1}D_1'}=
        \norm{D_2 (D_1)_{\text{right}}^{-1}\left(\sum_{t=0}^\infty (-1)^t \Upsilon^t\right)} ,
    \end{split}
\end{equation}
\begin{equation}\label{eqn:upper_right}
    \begin{split}
        &\norm{D_1\left(D'D\right)^{-1}D_2'}=
        \norm{\left(\sum_{t=0}^\infty (-1)^t \Upsilon^t \right)(D_1')_{\text{left}}^{-1} D_2' } ,
    \end{split}
\end{equation}
\begin{equation}\label{eqn:lower_right}
    \begin{split}
        &\norm{D_2\left(D'D\right)^{-1}D_2'}
        =
        \norm{D_2 (D_1)_{\text{right}}^{-1}\left(\sum_{t=0}^\infty (-1)^t \Upsilon^t\right) (D_1')_{\text{left}}^{-1}D_2' } .
    \end{split}
\end{equation}
\normalsize
Since we know $Q Q' = Q_1 Q_1' + Q_2 Q_2' \preccurlyeq 2 I$, by making the four terms above sum to less than $\epsilon/2$, we obtain $\norm{\hat{\Pi}_1 - \Pi_1} < \epsilon$. In order to bound \eqref{eqn:upper_left}--\eqref{eqn:lower_right}, it is clear that we need to bound $\norm{\Upsilon}$. Thus, we need to bound $\norm{(D_1')_{\text{left}}^{-1}}_2$, $\norm{(D_1)_{\text{right}}^{-1}}_2$, and $\norm{D_2}_2$.

\begin{lemma}
    \label{lemm:D1_bound_final}
    If \eqref{eqn:part_T} is satisfied, then with probability $1-4\theta$,
    \begin{equation*}
        D_1 D_1' \succeq \frac{\pi|\lambda_k|^{2T}\theta^2}{4} \frac{\gap^2}{k^{k+6}} \frac{|\lambda_1|^2}{|\lambda_1|^2-1} I ,
    \end{equation*}
    where $\gap := \left|\prod_{m_1 \neq m_2}(\lambda_{m_1}^{-1} - \lambda_{m_2}^{-1})\right|, m_1, m_2 \in \{1,\dots,k\}$.
\end{lemma}
The proof of Lemma~\ref{lemm:D1_bound_final} is deferred to Appendix~\ref{Appendix:D1}.

For $D_2$ defined above, we have the following inequalities
\begin{equation}
\label{eqn:D2_bound}
    \begin{split}
        \norm{D_2}_2 &\leq \sqrt{T}\norm{D_2}_1 
        \leq \sqrt{T} \sum_{i = k+1}^n \left(\sum_{j = 1}^T |\lambda_i|^j C\right)\\
        &\leq \sqrt{T} (n-k)\left(\frac{C}{1-|\lambda_{k+1}|}\right) .
    \end{split}
\end{equation}


Lastly, by Lemma A.1 of~\cite{LTI}, we have $\norm{Q_1'Q_2}, \norm{Q_2' Q_1} \leq \sqrt{2 \xi}$. We can thus bound $\Upsilon$ as follows:

\begin{equation*}
    \begin{split}
        \norm{\Upsilon} &\leq \norm{(D_1')_{\text{left}}^{-1}}_2 \norm{(D_1)_{\text{right}}^{-1}}_2 \norm{D_2' D_2}  
        \\
        & \quad + \sqrt{2 \xi} \norm{(D_1)_{\text{right}}^{-1}} \norm{D_2} + \sqrt{2 \xi} \norm{(D_1')_{\text{left}}^{-1}} \norm{D_2'}
        \\
        & < 4\left(\sqrt{T} (n-k) \frac{C}{1-|\lambda_{k+1}|}\right)
        \\
        & \quad \cdot\left(\frac{2}{\sqrt{\pi}|\lambda_k|^T\theta} \frac{k^{\frac{k}{2}+3}}{\gap} \sqrt{\frac{|\lambda_1|^2-1}{|\lambda_1|^2}} \right) < \epsilon' ,
    \end{split}
\end{equation*}
where we used $\norm{(D_1')_{\text{left}}^{-1}}_2 \norm{(D_1)_{\text{right}}^{-1}}_2 \norm{D_2' D_2} <  \norm{(D_1)_{\text{right}}^{-1}} \norm{D_2} $ and $\norm{(D_1')_{\text{left}}^{-1}}_2 \norm{(D_1)_{\text{right}}^{-1}}_2 \norm{D_2' D_2} < \norm{(D_1')_{\text{left}}^{-1}} \norm{D_2'}$, which hold because each term in $\Upsilon$ is less than $\epsilon' < 1$ and the left-hand side is effectively the square of the right-hand side.

Simplifying the above, we get
\begin{equation}
\label{eqn:lamb_over_T}
    \begin{split}
        &\frac{|\lambda_k|^{2T}}{T} > \frac{64 k^{k+6} (n-k)^2 (|\lambda_1|^2 -1)}{\pi(\epsilon')^2\theta^2 |\lambda_1|^2 \gap^{2}}
        \\
        &\cdot\left(\frac{C}{1-|\lambda_{k+1}|}\right)^2 := B(\epsilon', \theta).
    \end{split}
\end{equation}

With some algebra, if we have $T \log |\lambda_k| > \log T$, the chain of implications below leads to \eqref{eqn:lamb_over_T}:
\[
\displaystyle
\begin{array}{rcccl}
     &&\frac{1}{B(\epsilon', \theta)} &>& T\left(\frac{1}{|\lambda_k|^2}\right)^T \\
     &\Leftarrow&\log\frac{1}{B(\epsilon', \theta)} &>& \log T - 2T \log |\lambda_k|\\
     &\Leftarrow&\log \frac{1}{B(\epsilon', \theta)} &>& T\log |\lambda_k| - 2T \log |\lambda_k|\\
     &\Leftarrow& T &>& \frac{1}{\log |\lambda_k|} \log B(\epsilon', \theta).
\end{array}
\]
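In order to have $T \log |\lambda_k| > \log T$, define
\[
    f(T) := T  \log |\lambda_k| - \log T.
\]
% NOTE(review): f'(T) = \log|\lambda_k| - 1/T is positive only for T > 1/\log|\lambda_k|; confirm that \log|\lambda_k| \geq 1/\log|\lambda_k| is assumed, or add T > 1/\log|\lambda_k| to the condition.
At $T = \log |\lambda_k|$, we have $f(T) = \left(\log |\lambda_k|\right)^2 - \log \log |\lambda_k| > 0$, and for $T > \log |\lambda_k|$ we have $f'(T) = \log |\lambda_k| - \frac{1}{T} > 0$.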

Therefore, when $T > \log |\lambda_k|$, we have $T \log |\lambda_k| > \log T$.

Correspondingly, we pick 
\begin{equation}
\label{eqn:T_bound_in_B}
    T > \max\left(\frac{1}{\log |\lambda_k|} \log B(\epsilon', \theta), \log |\lambda_k|\right) .
\end{equation}

In order to have all four terms above sum to less than $\epsilon/2$, we need $8\sum_{t=1}^\infty \epsilon'^{t} < \epsilon$, i.e.,
\begin{equation*}
    \frac{8\epsilon'}{1 - \epsilon'} < \epsilon \quad \Leftarrow \quad \epsilon' < \frac{\epsilon}{8+8\epsilon} .
\end{equation*}
Substituting the above into \eqref{eqn:T_bound_in_B} yields
\begin{equation}
    T > \max\left(\frac{1}{\log |\lambda_k|} \log B(\frac{\epsilon}{8+8\epsilon}, \theta), \log |\lambda_k|\right),
\end{equation}
with probability $1-4\theta$.

\small
\begin{align*}
    T >& \max\left(\frac{1}{\log |\lambda_k|} \log B(\frac{\epsilon}{8+8\epsilon}, \theta), \log |\lambda_k|\right)\\
     =& \max\bigg(\frac{1}{\log |\lambda_k|} \log \frac{64 k^{k+6} (n-k)^2 (|\lambda_1|^2 -1)}{\pi\left(\frac{\epsilon}{8+8\epsilon}\right)^2\theta^2 |\lambda_1|^2 \gap^{2}} 
     \\
     & \quad \cdot \left(\frac{C}{1-|\lambda_{k+1}|}\right)^2,  \log |\lambda_k|\bigg)\\
     =& O\bigg(\max\big\{\log|\lambda_k|, \frac{1}{\log|\lambda_k|} \big( k\log k+ \log(n-k) \\
        & + \log |\lambda_1| - \log \theta - \log \gap + \log C 
        \\
        & \quad - \log \left(1-|\lambda_{k+1}|\right) - \log \epsilon \big)\big\}\bigg)
    \\
     =& O\bigg(\frac{1}{\log|\lambda_k|} \big( k\log k+ \log(n-k) + \log |\lambda_1| - \log \theta  \\
     & - \log \gap + \log C - \log \left(1-|\lambda_{k+1}|\right) - \log \epsilon\big)\bigg) ,
\end{align*}
\normalsize
where we assumed $n \gg k$ and $|\lambda_1| \gg |\lambda_k|$.

Combined with \eqref{eqn:part_T}, we get
\begin{equation*}
    \begin{split}
        T  >& O\Bigg(\frac{1}{\log|\lambda_k|}\Bigg(- \log \gap + k \log k - \log \theta + \log(n-k)
        \\
        & + \log |\lambda_1| + \log C - \log \left(1-|\lambda_{k+1}|\right) - \log \epsilon\Bigg)\Bigg) .
    \end{split}
\end{equation*}
If we assume the eigenvalue-related terms and $\theta$ to be constants, we get
\begin{equation}
\label{eqn:final_T}
    T > O \left(k \log k + \log(n-k) - \log \epsilon\right).
\end{equation}
\end{proof}
