Lemma~\ref{lemm:ls} gives the explicit form for the solution to the least squares problem in Algorithm~\ref{alg:LTS0}.

\begin{lemma}
\label{lemm:ls}
Given $D := [x_0, \cdots, x_T]$ and $\hat{\Pi}_1 = U^{(k)} (U^{(k)})^*$, the solution to 
\begin{equation*}
    \hat{M}_1 = \arg\min_{M_1} \sum_{t=0}^T \norm{(U^{(k)})^* x_{t+1} - M_1 (U^{(k)})^* x_t}^2
\end{equation*}
is uniquely given by $\hat{M}_1 = (U^{(k)})^* A U^{(k)} + \varpi$, where $\varpi = \left(\sum_t (U^{(k)})^* \eta_t x_t^* U^{(k)}\right)((\Sigma^{(k)})^2)^{-1}$.
\end{lemma}
\begin{proof}
    Since $M_1$ is a minimizer of $\mathcal{L}$, for any $\Delta$ in a neighborhood of $O$, we have
    \begin{align*}
        0 \leq& \mathcal{L}(M_1 + \Delta) - \mathcal{L}(M_1)\\
        =& \sum_t \norm{\hat{y}_{1,t+1} - M_1 \hat{y}_{1,t} - \Delta \hat{y}_{1,t}}^2 - \sum_t \norm{\hat{y}_{1,t+1} - M_1 \hat{y}_{1,t}}^2 
        \\
        =& -2\,\mathrm{Re} \sum_t \langle \Delta \hat{y}_{1,t}, \hat{y}_{1,t+1} - M_1 \hat{y}_{1,t} \rangle + O(\norm{\Delta}^2) 
        \\
        =& -2\,\mathrm{Re} \sum_t \text{tr}\left(\hat{y}_{1,t}^* \Delta^*(\hat{y}_{1,t+1}-M_1\hat{y}_{1,t})\right) + O(\norm{\Delta}^2)
        \\
        =& -2\,\mathrm{Re} \sum_t \text{tr} \left(\Delta^*(\hat{y}_{1,t+1}-M_1 \hat{y}_{1,t})\hat{y}_{1,t}^*\right) + O(\norm{\Delta}^2)
        \\
        =& -2\,\mathrm{Re}\, \text{tr}\left(\Delta^* \sum_t \left(\hat{y}_{1,t+1} - M_1\hat{y}_{1,t}\right)\hat{y}_{1,t}^*\right) + O(\norm{\Delta}^2).
    \end{align*}
    Since the above holds for all $\Delta$, we get
    \small
    \begin{equation*}
        \sum_t (\hat{y}_{1,t+1} - M_1 \hat{y}_{1,t})\hat{y}_{1,t}^* = 0 \Leftrightarrow M_1 \sum_t \hat{y}_{1,t} \hat{y}_{1,t}^* = \sum_t \hat{y}_{1,t+1} \hat{y}_{1,t}^*.
    \end{equation*}
    \normalsize
    Plugging in $\hat{y}_{1,t} = (U^{(k)})^* x_t$ and $\hat{y}_{1,t+1} = (U^{(k)})^* (Ax_t + \eta_t)$, we have 
    \begin{equation*}
        \begin{split}
            M_1 (U^{(k)})^* DD^* U^{(k)} &= M_1 \sum_t (U^{(k)})^* x_t x_t^* U^{(k)}
            \\
            &= \sum_t (U^{(k)})^* (Ax_t + \eta_t) x_t^* U^{(k)} 
            \\
            &= (U^{(k)})^* A DD^* U^{(k)} + \sum_t (U^{(k)})^* \eta_t x_t^* U^{(k)}.
        \end{split}
    \end{equation*}
    Since the columns of $U^{(k)}$ are the first $k$ left singular vectors of $D$, we have the following equalities:
    \begin{equation}
    \label{eqn:inverse_term}
        (U^{(k)})^* DD^* U^{(k)} = (U^{(k)})^* U \Sigma V^* V \Sigma^* U^* U^{(k)} = \begin{bmatrix}
                I^{(k)} & 0
            \end{bmatrix} \Sigma^2 \begin{bmatrix}
                I^{(k)} \\ 0
            \end{bmatrix} = (\Sigma^{(k)})^2,
    \end{equation}
    which is invertible, and $\hat{M}_1$ is explicitly given by
    \small
    \begin{equation}
    \label{eqn:M1hat_interm}
        \hat{M}_1 = \left((U^{(k)})^* A DD^* U^{(k)} + \sum_t (U^{(k)})^* \eta_t x_t^* U^{(k)}\right)(\Sigma^{(k)})^{-2}.
    \end{equation}
    \normalsize
    Moreover, we have
    \begin{align*}
        &U^{(k)} (U^{(k)})^* DD^*U^{(k)} =  U^{(k)} (\Sigma^{(k)})^2
        \\
        =& \begin{bmatrix}
            U^{(k)} & 0 
        \end{bmatrix}
        \begin{bmatrix}
            (\Sigma^{(k)})^2 \\ 0
        \end{bmatrix}
        = U \begin{bmatrix}
            (\Sigma^{(k)})^2 \\ 0
        \end{bmatrix}
        \\
        =& U \Sigma^2 \begin{bmatrix}
            I^{(k)} \\ 0
        \end{bmatrix}
        = U \Sigma^2 U^* U^{(k)} = D D^* U^{(k)},
    \end{align*}
    where the first equality is obtained by using \eqref{eqn:inverse_term}. Substituting the above in \eqref{eqn:M1hat_interm} yields
    \small
    \begin{equation*}
        \begin{split}
            \hat{M}_1 &= \left((U^{(k)})^* A(U^{(k)} (U^{(k)})^* DD^*)U^{(k)}\right)(\Sigma^{(k)})^{-2} + \varpi
            \\
            &=\left((U^{(k)})^* A U^{(k)} (U^{(k)})^*\right) \left( DD^* U^{(k)}\right)(\Sigma^{(k)})^{-2}+ \varpi
            \\
            &= (U^{(k)})^* A U^{(k)} + \varpi,
        \end{split}
    \end{equation*}
    \normalsize
    where $\varpi = \left(\sum_t (U^{(k)})^* \eta_t x_t^* U^{(k)}\right)(\Sigma^{(k)})^{-2}$.
\end{proof}

We want to show $(U^{(k)})^* A U^{(k)}$ is the dominating term of the above expression, as we will bound $\varpi$ in the following lemma.
\begin{lemma}
\label{lemm:52.2}
Under the premise of \Cref{thm:projection},
    \begin{equation*}
        \norm{M_1 - \hat{M}_1} < 3\norm{A} \delta
    \end{equation*}
    for any $\delta > 0$ whenever 
    \begin{equation*}
        T \geq \frac{\log \left(\frac{4C^2}{\pi \theta^2 \norm{A}^2\delta^2} \frac{k^{k+6}}{\gap^2}  \right)}{\log |\lambda_k|}.
    \end{equation*}
\end{lemma}
\begin{proof}
    First, we prove that $\norm{\varpi} \leq \norm{A}\delta$. 
    Let $H = [\eta_1,\dots,\eta_T]$, then we have %\guannan{$\Sigma^*$} \ziyi{done}
    \begin{align*}
        \varpi &= (U^{(k)})^* H D^* U^{(k)} (\Sigma^{(k)})^{-2}
        \\
        &= (U^{(k)})^* H V \Sigma^* U^* U^{(k)} (\Sigma^{(k)})^{-2}
        \\
        &= (U^{(k)})^* H V \Sigma^* \begin{bmatrix}
            I^{(k)} \\ 0
        \end{bmatrix} (\Sigma^{(k)})^{-2}
        \\
        &= (U^{(k)})^* H V \begin{bmatrix}
            \Sigma^{(k)} \\ 0
        \end{bmatrix} (\Sigma^{(k)})^{-2}
        \\
        &= (U^{(k)})^* H V \begin{bmatrix}
            (\Sigma^{(k)})^{-1} \\ 0
        \end{bmatrix}.
    \end{align*}
    Therefore, 
    \begin{align}
        &\norm{\varpi} \leq \norm{A}\delta \notag
        \\
        \Leftarrow& \norm{H} \norm{(\Sigma^{(k)})^{-1}} \leq \norm{A}\delta \notag
        \\
        \Leftarrow& \sqrt{T} C \frac{2}{\sqrt{\pi}|\lambda_k|^{T}\theta} \frac{k^{\frac{k}{2}+3}}{\gap} \sqrt{\frac{|\lambda_1|^2-1}{|\lambda_1|^2}} \leq \norm{A}\delta 
        \label{eqn:bibbers_ineq}
        \\
        \Leftarrow & \frac{|\lambda_k|^{T}}{\sqrt{T}} \geq \frac{2C}{\sqrt{\pi} \theta \norm{A}\delta} \frac{k^{\frac{k}{2}+3}}{\gap} 
        \notag
        \\
        \Leftarrow & T \log |\lambda_k| - \frac{1}{2}\log T \geq \log \left(\frac{2C}{\sqrt{\pi} \theta \norm{A}\delta} \frac{k^{\frac{k}{2}+3}}{\gap}  \right)
        \notag
        \\
        \Leftarrow & \frac{1}{2} T \log |\lambda_k| \geq \log \left(\frac{2C}{\sqrt{\pi} \theta \norm{A}\delta} \frac{k^{\frac{k}{2}+3}}{\gap}  \right)
        \label{eqn:repeat_insert}
        \\
        \Leftarrow & T \geq \frac{2\log \left(\frac{2C}{\sqrt{\pi} \theta \norm{A}\delta} \frac{k^{\frac{k}{2}+3}}{\gap}  \right)}{\log |\lambda_k|},
        \label{eqn:T_additional_criteria}
    \end{align}
    where \eqref{eqn:bibbers_ineq} used \Cref{lemm:D1_bound_final} and the fact that, for an $n \times T$ matrix $H$, $\norm{H}_2 \leq \sqrt{T} \norm{H}_1$, and \eqref{eqn:repeat_insert} requires $\log T < T \log |\lambda_k|$, which was already satisfied when we derived \eqref{eqn:insertion2} and \eqref{eqn:insertion3}. That \Cref{lemm:D1_bound_final} can be used to bound $\norm{(\Sigma^{(k)})^{-1}}$ is a direct result of the Cauchy interlacing theorem. We further observe that \eqref{eqn:T_additional_criteria} does not change the criterion obtained in \eqref{eqn:final_T}. 

    Recall that $U^{(k)} = \hat{P}_1$. We obtain
    \begin{align*}
        \norm{M_1 - \hat{M}_1} &= \norm{P_1^* A P_1 - \left((U^{(k)})^* A U^{(k)} + \varpi\right)}
        \\
        & \leq \norm{P_1^* A P_1 - P_1^* A \hat{P}_1} + \norm{P_1^* A \hat{P}_1 - \hat{P}_1^* A \hat{P}_1} + \norm{\varpi}
        \\
        & \leq \norm{A}\norm{P_1 - \hat{P}_1} + \norm{A}\norm{P_1 - \hat{P}_1} + \norm{\varpi}
        \\
        &\leq 3 \norm{A}\delta,
    \end{align*}
    where in the last inequality, we used \Cref{coro:52.1}.
\end{proof}

With \Cref{lemm:52.2}, we are ready to prove \Cref{prop:G2}.

\begin{proof}[Proof of \Cref{prop:G2}]
    By \Cref{lemm:52.2}, we get $\norm{M_1 - \hat{M}_1} < 3\norm{A} \delta$. Moreover, by Gelfand's formula, we have
    \begin{align*}
        \norm{M_1^t} &= \norm{P_1^* A^t P_1} \leq \norm{A^t} \leq \zeta_{\epsilon_1}(A)(|\lambda_1| + \epsilon_1)^t, 
        \\
        \norm{\hat{M}_1^t} &= \norm{\hat{P}_1^* A^t \hat{P}_1} \leq \norm{A^t} \leq \zeta_{\epsilon_1}(A)(|\lambda_1| + \epsilon_1)^t.
    \end{align*}
    Therefore, by telescoping, we get
    \begin{align*}
        \norm{M_1^{\tau} - \hat{M}_1^{\tau}} &= \norm{\sum_{i=1}^{\tau}(M_1^i \hat{M}_1^{\tau-i} - M_1^{i-1}\hat{M}_1^{\tau-i+1})}
        \\
        &\leq \sum_{i=1}^{\tau} \norm{M_1^{i-1}}\norm{\hat{M}_1^{\tau-i}}\norm{M_1 - \hat{M}_1} 
        \\
        &< \tau \cdot \zeta_{\epsilon_1}(A)^2 (|\lambda_1| + \epsilon_1)^{\tau-1}\cdot 3\norm{A}\delta
        \\
        &= 3 \tau \norm{A} \zeta_{\epsilon_1} (A)^2(|\lambda_1|+\epsilon_1)^{\tau-1} \delta.
    \end{align*}
\end{proof}

With \Cref{prop:G2}, the following corollary easily follows:
\begin{corollary}
    \label{coro:G2}
    Under the premise of \Cref{thm:main}, when $\delta < \frac{1}{\tau}$,
    \begin{equation*}
        \norm{\hat{M}_1^\tau} < \left(\zeta_{\epsilon_1}(A)(|\lambda_1| + \epsilon_1) + 3 \norm{A}\zeta_{\epsilon_1}(A)^2\right)(|\lambda_1| + \epsilon_1)^{\tau-1}.
    \end{equation*}
\end{corollary}
\begin{proof}
    By Gelfand's formula and \Cref{prop:G2}, 
    \begin{align*}
        \norm{\hat{M}_1^{\tau}} &\leq \norm{M_1^{\tau}} + \norm{\hat{M}_1^{\tau} - M_1^{\tau}} 
        \\
        &\leq \zeta_{\epsilon_1}(A)(|\lambda_1| + \epsilon_1)^{\tau} + 3 \tau \norm{A} \zeta_{\epsilon_1} (A)^2(|\lambda_1|+\epsilon_1)^{\tau-1} \delta
        \\
        &< \left(\zeta_{\epsilon_1}(A)(|\lambda_1| + \epsilon_1) + 3 \norm{A}\zeta_{\epsilon_1}(A)^2\right)(|\lambda_1| + \epsilon_1)^{\tau-1},
    \end{align*}
    where the last inequality requires $\delta < \frac{1}{\tau}$. 
\end{proof}
