\section{Appendix}\label{sec:appendix}
\subsection{Calculation steps for \eqref{relative_residual_approximation}}\label{relative_residual_approximation_calculation}
Here we are going to show how \textit{squared relative residual} implies \eqref{relative_residual} and then implies \eqref{relative_residual_approximation}. Consider $\phi=\mathbf{\Psi}\mathbf{v}=\sum_{i=1}^{N_K} \psi_i \mathbf{v}_i$ with $\|\phi\|_2=1$, then
\begin{align*}\label{relative_residual}   
    &\quad \frac{\int_{\Omega} |\mathcal{K}\phi(x) - \lambda \phi(x)|^2 d\mu(x)}{\int_{\Omega} |\phi(x)|^2 d\mu(x)} \\
    &= \int_{\Omega} |\mathcal{K}\phi(x) - \lambda \phi(x)|^2 d\mu(x) \\
    &= \langle \mathcal{K}\phi - \lambda \phi, \mathcal{K}\phi - \lambda \phi \rangle_{\mu} \\
    &= \langle \mathcal{K}\phi, \mathcal{K}\phi \rangle_{\mu} - \langle \lambda \phi, \mathcal{K}\phi \rangle_{\mu} - \langle \mathcal{K}\phi, \lambda \phi \rangle_{\mu} + \langle \lambda \phi, \lambda \phi \rangle_{\mu} \\    
    &= \langle \mathcal{K}\mathbf{\Psi}\mathbf{v}, \mathcal{K}\mathbf{\Psi}\mathbf{v} \rangle_{\mu} - \bar{\lambda} \langle \mathbf{\Psi}\mathbf{v}, \mathcal{K}\mathbf{\Psi}\mathbf{v} \rangle_{\mu} - \lambda \langle \mathcal{K}\mathbf{\Psi}\mathbf{v}, \mathbf{\Psi}\mathbf{v} \rangle_{\mu} + |\lambda|^2 \langle \mathbf{\Psi}\mathbf{v}, \mathbf{\Psi}\mathbf{v} \rangle_{\mu} \\
    &= \langle \sum_{i=1}^{N_K} \mathcal{K}\psi_i \mathbf{v}_i, \sum_{j=1}^{N_K} \mathcal{K}\psi_j \mathbf{v}_j \rangle_\mu - \bar{\lambda} \langle \sum_{i=1}^{N_K} \psi_i \mathbf{v}_i, \sum_{j=1}^{N_K} \mathcal{K}\psi_j \mathbf{v}_j \rangle_\mu - \lambda \langle \sum_{i=1}^{N_K} \mathcal{K}\psi_i\mathbf{v}_i, \sum_{j=1}^{N_K} \psi_j\mathbf{v}_j \rangle_\mu + |\lambda|^2 \langle \sum_{i=1}^{N_K} \psi_i \mathbf{v}_i, \sum_{j=1}^{N_K} \psi_j \mathbf{v}_j \rangle_\mu \\
    &= \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \langle \mathcal{K}\psi_i, \mathcal{K}\psi_j \rangle_\mu \mathbf{v}_j - \bar{\lambda} \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \langle \psi_i, \mathcal{K}\psi_j \rangle_\mu \mathbf{v}_j - \lambda \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \langle \mathcal{K}\psi_i, \psi_j \rangle_\mu \mathbf{v}_j + |\lambda|^2 \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \langle \psi_i,\psi_j \rangle_\mu \mathbf{v}_j \\
    &= \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \left[ \langle \mathcal{K}\psi_i, \mathcal{K}\psi_j \rangle_\mu  - \bar{\lambda} \langle \psi_i, \mathcal{K}\psi_j \rangle_\mu - \lambda \langle \mathcal{K}\psi_i, \psi_j \rangle_\mu + |\lambda|^2 \langle \psi_i,\psi_j \rangle_\mu \right] \mathbf{v}_j \quad \eqref{relative_residual} \\
    &\approx \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \left[ \frac{1}{m}[\Psi_Y^* \Psi_Y]_{ij} - \bar{\lambda} \frac{1}{m}[\Psi_X^* \Psi_Y]_{ij} - \lambda \frac{1}{m}[\Psi_Y^* \Psi_X]_{ij} + |\lambda|^2 \frac{1}{m}[\Psi_X^* \Psi_X]_{ij} \right] \mathbf{v}_j \\
    &= \frac{1}{m}\mathbf{v}^* \left[ \Psi_Y^* \Psi_Y - \lambda (\Psi_X^* \Psi_Y)^* - \bar{\lambda} \Psi_X^* \Psi_Y + |\lambda|^2 \Psi_X^* \Psi_X \right] \mathbf{v} \quad \eqref{relative_residual_approximation}
\end{align*}
\begin{remark}
    The inner product above is defined as $\langle f,g \rangle_{\mu} = \int_{\Omega} \overline{f(x)} \, g(x) \, d\mu(x)$, i.e., it is conjugate-linear in its first argument.
\end{remark}


\subsection{Details for deriving Theorem~\ref{thm:main}}\label{thmpf:main}

\begin{align*}
    J &= \sum_{i=1}^{N_K} \widehat{\textit{res}}(\lambda_i, \phi_i)^2  \\
    &= \sum_{i=1}^{N_K} \frac{1}{m} \mathbf{v}_i^* \left[ \Psi_Y^* \Psi_Y - \lambda_i (\Psi_X^* \Psi_Y)^* - \bar{\lambda}_i \Psi_X^* \Psi_Y + |\lambda_i|^2 \Psi_X^* \Psi_X \right] \mathbf{v}_i \\
    &= \sum_{i=1}^{N_K} \frac{1}{m} \left[ \mathbf{v}_i^*  (\Psi_Y^* \Psi_Y) \mathbf{v}_i- \mathbf{v}_i^* (\Psi_X^* \Psi_Y)^* \lambda_i \mathbf{v}_i - \mathbf{v}_i^* \bar{\lambda}_i (\Psi_X^* \Psi_Y) \mathbf{v}_i + \mathbf{v}_i^* \bar{\lambda}_i (\Psi_X^* \Psi_X) \lambda_i \mathbf{v}_i \right] \\
    &= \sum_{i=1}^{N_K} \frac{1}{m} \left[ \mathbf{v}_i^* (\Psi_Y^* \Psi_Y) \mathbf{v}_i- \mathbf{v}_i^* (\Psi_X^* \Psi_Y)^* K \mathbf{v}_i - \mathbf{v}_i^* K^* (\Psi_X^* \Psi_Y) \mathbf{v}_i + \mathbf{v}_i^* K^* (\Psi_X^* \Psi_X) K \mathbf{v}_i \right] \\
    &= \sum_{i=1}^{N_K} \frac{1}{m} \bigg( \langle \Psi_Y \mathbf{v}_i, \Psi_Y \mathbf{v}_i \rangle  - \langle \Psi_Y \mathbf{v}_i, \Psi_X K \mathbf{v}_i \rangle \\
    & \qquad - \langle\Psi_X K \mathbf{v}_i, \Psi_Y \mathbf{v}_i \rangle + \langle \Psi_X K \mathbf{v}_i, \Psi_X K \mathbf{v}_i \rangle \bigg) \\
    &= \sum_{i=1}^{N_K} \frac{1}{m} \langle \Psi_Y \mathbf{v}_i - \Psi_X K \mathbf{v}_i, \Psi_Y \mathbf{v}_i - \Psi_X K \mathbf{v}_i \rangle \\
    &= \sum_{i=1}^{N_K} \frac{1}{m} \| \Psi_Y \mathbf{v}_i - \Psi_X K \mathbf{v}_i \|_2^2 \\
    &= \frac{1}{m} \| (\Psi_Y  - \Psi_X K) V \|_{F}^2.
\end{align*}
Next, using \href{https://en.wikipedia.org/wiki/Matrix_calculus#Denominator-layout_notation}{matrix calculus with the denominator-layout convention}, we seek the minimizer of $J$ by setting its derivative with respect to $K$ to zero:
\begin{align*}
    0 = \frac{d J}{d K} &= \frac{d \tr(J)}{d K} \quad  (\text{since $J$ is a scalar})\\
    &= \frac{d}{d K}\tr\bigg( \frac{1}{m} \sum_{i=1}^{N_K} \mathbf{v}_i^* \bigg[ \Psi_Y^* \Psi_Y - (\Psi_X^* \Psi_Y)^* K \\
    &\qquad - K^* (\Psi_X^* \Psi_Y) + K^* (\Psi_X^* \Psi_X) K \bigg] \mathbf{v}_i \bigg) \\
    &= \frac{1}{m} \sum_{i=1}^{N_K} \frac{d}{d K} \tr\bigg( \mathbf{v}_i^* \bigg[ L - A^* K - K^* A + K^* G K \bigg] \mathbf{v}_i \bigg) \\
    &= \frac{1}{m} \sum_{i=1}^{N_K} \bigg[ \frac{d}{d K} \tr\left( \mathbf{v}_i^* L \mathbf{v}_i \right) - \frac{d}{d K} \tr\left( \mathbf{v}_i^* A^* K \mathbf{v}_i \right) - \frac{d}{d K} \tr\left( \mathbf{v}_i^* K^* A \mathbf{v}_i \right) + \frac{d}{d K} \tr\left( \mathbf{v}_i^* K^* G K \mathbf{v}_i \right) \bigg] \\
    &= \frac{1}{m} \sum_{i=1}^{N_K} \left[ -A\mathbf{v}_i \mathbf{v}_i^*-A\mathbf{v}_i \mathbf{v}_i^*+(G+G^*)K\mathbf{v}_i \mathbf{v}_i^* \right] \\
    &= \frac{1}{m} \sum_{i=1}^{N_K} (-2A+2GK)\mathbf{v}_i \mathbf{v}_i^* \quad (\text{$G$ is Hermitian})
\end{align*}
where $\tr(\cdot)$ denotes the trace of a matrix and $G = \Psi_X^* \Psi_X$, $A = \Psi_X^* \Psi_Y$, $L = \Psi_Y^* \Psi_Y$.

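Note that $\sum_{i=1}^{N_K} (-2A+2GK)\mathbf{v}_i \mathbf{v}_i^* = (-2A+2GK) V V^*$, where $V = [\mathbf{v}_1, \dots, \mathbf{v}_{N_K}]$ collects the eigenvectors. Since the eigenvectors $\mathbf{v}_i$ are nonzero and, when $K$ is diagonalizable, linearly independent, $V V^*$ is invertible. So
\[
-2A+2GK = 0 \Rightarrow K = G^{\dagger}A.
\]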
\begin{remark}
    To compute $\tfrac{d}{d K} \tr\left( \mathbf{v}_i^* K^* G K \mathbf{v}_i \right)$, we simply rewrite it as
    $$\tfrac{d}{d K} \tr\left( \mathbf{v}_i^* K^* G K \mathbf{v}_i \right) = \tfrac{d}{d K} \tr\left( (K \mathbf{v}_i)^* G (K \mathbf{v}_i) \right)$$
\end{remark}


\subsection{Discussion On Convergence}\label{convergence_discussion}
To understand how neural networks enhance NN-ResDMD, it is important to introduce Barron space \citep{Pinkus_1999,cybenko:hal-03753170,haykin2009neural,256500}. Barron space characterizes functions efficiently approximated by two-layer neural networks, which is central to NN-ResDMD. By leveraging networks that approximate functions within this space, NN-ResDMD can flexibly optimize the dictionary functions for Koopman operator approximation, making it highly effective for complex, high-dimensional systems.

A function \( f \) belongs to Barron space \( \mathcal{B} \) if it can be represented as:
\[
f(x) = \int_{\Omega} a \sigma(w^T x) \rho(da, dw),
\]
where \( \sigma \) is the activation function, \( w \) is a weight vector, \( a \) is a coefficient, and \( \rho \) is a probability distribution. The complexity of \( f \) is measured by the Barron norm \( \|f\|_\mathcal{B} \):
\[
\|f\|_\mathcal{B} = \inf_{\rho \in P_f} \left( \int_{\Omega} |a| \|w\|_1 \rho(da, dw) \right),
\]
where \( P_f \) is the set of distributions for which \( f \) can be represented. This framework provides a basis for analyzing approximation errors in neural networks.

The following theorem \citep{e2020mathematicalunderstandingneuralnetworkbased} discusses the approximation capabilities of two-layer neural networks within this context, establishing a foundation for the subsequent analysis.


\begin{theorem}[Direct Approximation Theorem, \(L^2\)-version]\label{weinan_theorem}
    For any \(f \in \mathcal{B}\) and \(r \in \mathbb{N}\), there exists a two-layer neural network \(f_r\) with \(r\) neurons \(\{(a_i, \mathbf{w}_i)\}\) such that
    \[
    \|f - f_r\|_{L^2} \lesssim \frac{\|f\|_{\mathcal{B}}}{\sqrt{r}}.
    \]
\end{theorem}
This result implies that the approximation error decreases at a rate of \( 1/\sqrt{r} \) as the number of neurons \( r \) increases, with the constant \( \|f\|_\mathcal{B} \) reflecting the complexity of the function \( f \) within the Barron space.

Now, consider a Barron space $ \mathcal{B} $ which is dense in $L^2(\Omega, \mu)$ and a projected Koopman operator $ \mathcal{K}_{N_K} : \mathcal{B}_{N_K} \to L^2(\Omega, \mu) $ where $\mathcal{B}_{N_K} \subseteq \mathcal{B}$ is an $N_K$-dimensional subspace spanned by some dictionary $\mathbf{\Psi} = \{\psi_i\}_{i=1}^{N_K}$. According to Theorem~\ref{weinan_theorem}, we can have a well-trained dictionary that almost spans $\mathcal{B}_{N_K}$, i.e., given $\epsilon>0$, we can always obtain a dictionary $\mathbf{\Psi}_r = \{\psi_{r,i}\}_{i=1}^{N_K}$ such that $\sum_{i=1}^{N_K} \Vert \psi_{r,i} - \psi_i \Vert^2_2 < \epsilon$.

\subsection{Highlights of NN-ResDMD compared with typical existing neural network-based Koopman framework} \label{sec:discussion_VAMP}

\textcolor{cyan}{Our NN-ResDMD method takes a fundamentally different approach from existing deep learning methods by building upon the residual-based framework of ResDMD rather than the different Koopman-approximating loss functions following the variational principles of VAMPnets \citep{TianWu+2021+635+659, wu2020variational, mardt2018vampnets} or the deep autoencoder structure of \citet{Lusch2017DeepLF}. By incorporating spectral residual measures into deep learning and introducing a structured representation that captures dependencies among eigenvalues, we achieve more compact and interpretable models for nonlinear systems with continuous spectra. This approach enables us to directly minimize Koopman spectral approximation errors while avoiding the high-dimensional representations or point-spectrum limitations of previous methods.}

\textcolor{cyan}{If we take the VAMP framework as an example, here are the connections and differences. The proposed loss function and the VAMP score share the goal of optimizing approximations of the Koopman operator's spectral properties, establishing a connection in their ultimate purpose. However, although they both depend on the covariance matrices (in our manuscript Equation 3.2), their methodologies differ significantly. Our residual-based method directly minimizes the spectral approximation error of the Koopman operator and accommodates both point and continuous spectra, while the VAMP score follows a variational framework, maximizing the sum of singular values to approximate the point spectrum, primarily for stochastic systems (though see an exception in \citet{TianWu+2021+635+659}). Moreover, while VAMP is specifically designed for Markov processes and requires the Koopman operator to be Hilbert-Schmidt, our approach focuses on deterministic systems and enables a more comprehensive spectral analysis that incorporates continuous spectra. This distinction in scope and methodology highlights how the two frameworks complement each other in addressing different aspects of spectral estimation.}

\subsection{Discussion of computation costs} \label{sec:comp_cost}
%%%% Original %%%%
% \textcolor{ForestGreen}{Despite the various advantages of the NN-ResDMD framework, one significant limitation is its higher computational cost compared to the original ResDMD and other classical methods.}

% \textbf{\textcolor{ForestGreen}{Theoretical Perspective:}}
% \textcolor{ForestGreen}{The NN-ResDMD algorithm's computational demands stem primarily from its iterative optimization process. Each iteration involves a gradient descent update with complexity scaling linearly with both system dimensionality and neural network parameters. Though individual gradient steps are computationally lightweight for standard network architectures, the algorithm's efficiency issue lies in its repeated least-squares optimizations. Compared to standard single least-squares computation as in most numerical algorithms, NN-ResDMD requires multiple iterations to achieve convergence, with stochastic gradient descent methods showing a theoretical $O(1/n)$ convergence rate (See \cite{bach2013non}). However, the method's nonlinear optimization nature also presents challenges for establishing concrete convergence bounds and error estimates. }

% \textbf{\textcolor{ForestGreen}{Empirical Perspective:}}
% \textcolor{ForestGreen}{In our experiments, without computing the pseudospectrum, the computational cost of ResDMD typically ranges from seconds to minutes. NN-ResDMD, on the other hand, can require tens of minutes to several hours, depending on factors such as data dimensionality, the number of snapshots, hidden layer configurations, dictionary sizes, and training convergence criteria.}

% \textbf{\textcolor{ForestGreen}{Trade-off Between Cost and Accuracy:}}
% \textcolor{ForestGreen}{While NN-ResDMD's additional computational steps introduce higher costs, they enhance the accuracy and robustness of Koopman eigenpair estimation by allowing automatic dictionary learning and minimizing spurious spectral components. This trade-off makes NN-ResDMD particularly valuable in applications where precision is critical. However, its computational demands render it less suitable for real-time or online Koopman model learning tasks.}
\textcolor{ForestGreen}{Despite the various advantages of the NN-ResDMD framework, one significant limitation is its higher computational cost compared to the original ResDMD and other classical methods.}

\textbf{\textcolor{ForestGreen}{Theoretical Perspective:}}
\textcolor{ForestGreen}{The NN-ResDMD algorithm's computational demands stem primarily from its iterative optimization process. Each iteration involves a gradient descent update with complexity scaling linearly with both system dimensionality and neural network parameters. Though individual gradient steps are computationally lightweight for standard network architectures, the algorithm's efficiency issue lies in its repeated least-squares optimizations. Compared to standard single least-squares computation as in most numerical algorithms, NN-ResDMD requires multiple iterations to achieve convergence, with stochastic gradient descent methods showing a theoretical $O(1/n)$ convergence rate (See \cite{bach2013non}). However, the method's nonlinear optimization nature also presents challenges for establishing concrete convergence bounds and error estimates.} 

\textbf{\textcolor{ForestGreen}{Comparison:}}
\textcolor{ForestGreen}{The computational costs of EDMD, ResDMD, EDMD-DL, Hankel-DMD, and NN-ResDMD vary significantly based on their core computational steps and specific configurations. For a dataset with $ m = 10^5 $ data points and $ N_K = 300 $ dictionary functions (for EDMD-based methods), the theoretical complexity and runtime differ across methods. \textbf{EDMD} involves least squares and eigenvalue decomposition, with a complexity of $ O(N_K^2 m + N_K^3) $, making it the fastest method, and typically requiring only seconds to minutes for computation. \textbf{ResDMD} extends EDMD by adding residual evaluation and pseudospectrum computation. The residual evaluation introduces an additional $ O(N_K^3) $, and pseudospectrum computation across $ n_z $ grid points incurs $ O(n_z N_K^3) $, resulting in a total complexity of $ O(N_K^2 m + n_z N_K^3) $. This leads to runtimes ranging from minutes to hours, depending on the resolution of the pseudospectrum grid. \textbf{EDMD-DL} incorporates dictionary learning through stochastic gradient descent (SGD), where each iteration involves matrix construction ($ O(N_K^2 m) $), Koopman matrix computation ($ O(N_K^3) $), and neural network forward/backward propagation ($ O(d|H|) $, with $ d|H| $ representing the total network parameter size). With $ k $ SGD iterations, the total complexity becomes $ O(k(N_K^2 m + N_K^3 + d|H|)) $, leading to runtimes also in the range of minutes to hours depending on $ k $. \textbf{NN-ResDMD}, which builds on EDMD-DL, shares the same complexity, $ O(k(N_K^2 m + N_K^3 + d|H|)) $, but includes the explicit use of Koopman matrix eigenvectors and optional pseudospectrum computation, making its runtime slightly longer than EDMD-DL for high-resolution spectral analysis. 
\textbf{Hankel-DMD}, using a time delay embedding dimension $ T $, constructs a Hankel matrix ($ O(T m) $), performs singular value decomposition (SVD) ($ O(T^2 m) $), and computes the eigenvalues of a reduced $ T \times T $ matrix ($ O(T^3) $). The total complexity is $ O(T m + T^2 m + T^3) $, and the runtime is heavily influenced by $ T $, typically ranging from minutes to hours. While EDMD is computationally the most efficient, ResDMD and Hankel-DMD provide higher precision and robustness in spectral analysis at the expense of increased runtime, and EDMD-DL and NN-ResDMD offer flexibility and accuracy through dictionary learning, with additional SGD iterations and optional pseudospectrum computation contributing to their computational burden.}

\textbf{\textcolor{ForestGreen}{Empirical Perspective:}}
\textcolor{ForestGreen}{In our experiments, without computing the pseudospectrum, the computational cost of ResDMD typically ranges from seconds to minutes. NN-ResDMD, on the other hand, can require tens of minutes to several hours, depending on factors such as data dimensionality, the number of snapshots, hidden layer configurations, dictionary sizes, and training convergence criteria.}

\textbf{\textcolor{ForestGreen}{Trade-off Between Cost and Accuracy:}}
\textcolor{ForestGreen}{While NN-ResDMD's additional computational steps introduce higher costs, they enhance the accuracy and robustness of Koopman eigenpair estimation by allowing automatic dictionary learning and minimizing spurious spectral components. This trade-off makes NN-ResDMD particularly valuable in applications where precision is critical. However, its computational demands render it less suitable for real-time or online Koopman model learning tasks.}

\subsection{Source code}
For reproducibility, the source code will be available at the following anonymous URL: \href{https://anonymous.4open.science/r/ICLR-7305-PROJ}{https://anonymous.4open.science/r/ICLR-7305-PROJ}. A full version of the codebase will be released upon acceptance of the paper.

\subsection{Hankel-DMD}

\subsubsection{Justification of Using Hankel-DMD as comparison in all experiments} \label{hankel_dmd_intro}
\textcolor{ForestGreen}{Hankel-DMD operates by constructing a Hankel matrix from time-delayed measurements of the system state, based on Takens' embedding theorem, which states that time-delayed coordinates can reconstruct the state space of dynamical systems. Hankel-DMD also falls within the framework of Extended Dynamic Mode Decomposition (EDMD), as it effectively uses time-delayed states as dictionary functions. This connection introduces convergence conditions specific to time-delay embeddings, differing from those associated with standard EDMD implementations. This makes Hankel-DMD a natural choice for comparison in the pendulum system. Specifically, the method enables a more detailed extraction of the system's modes and dynamics, with theoretical guarantees established in works such as \citet{arbabi2017ergodic}, which proved its convergence for ergodic systems.}

\textcolor{ForestGreen}{Practically, the approach involves constructing a large matrix of time-shifted copies of measured data, where the number of delays determines how many past states are considered. This theoretically grounded framework is particularly effective when the system states have good temporal resolution and has shown strong performance in analyzing high-dimensional dynamical systems. Consequently, we also apply Hankel-DMD to the turbulence and neural dynamics experiments to evaluate its effectiveness in these representative high-dimensional settings.}

\textcolor{ForestGreen}{As a result, although its performance rivals that of NN-ResDMD in the simple pendulum system, yielding eigenvalues near the unit circle together with some polluted eigenvalues that also lie close to the ground-truth unit circle, we would like to emphasize that it captures only the point spectrum and misses the full spectral information. Moreover, it fails to capture key dynamics in higher-dimensional systems, as seen in the later experiments (Section~\ref{sec:turbulence} and Section~\ref{sec:neural_dynamics}).}

\subsubsection{Application in Turbulence} \label{sec:hankel_turbulence}
\textcolor{ForestGreen}{Here we present the Koopman modes computed by Hankel-DMD for comparison with the NN-ResDMD results. As shown in Figure \ref{fig:turbulence_hankeldmd}, despite having small residuals, these modes fail to clearly capture the fundamental pressure field structure that was successfully identified by NN-ResDMD's first Koopman mode (see Figure \ref{fig:turbulence}). This comparison demonstrates the superior ability of NN-ResDMD to extract physically meaningful patterns from complex fluid systems.}
\begin{figure}[!htb]
    \centering    
    % Row 1
    \begin{subfigure}[b]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{images/koopman_mode_hankeldmd_3.png}
        % \caption{Optional caption for subfigure (a)}
        \label{fig:hankeldmd_78}
    \end{subfigure}%
    \hfill
    \begin{subfigure}[b]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{images/koopman_mode_hankeldmd_1.png}
        % \caption{Optional caption for subfigure (b)}
        \label{fig:hankeldmd_86}
    \end{subfigure}
    
    \vspace{0.5cm} % Adjust vertical space between rows as needed
    
    % Row 2
    \begin{subfigure}[b]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{images/koopman_mode_hankeldmd_6.png}
        % \caption{Optional caption for subfigure (c)}
        \label{fig:hankeldmd_89}
    \end{subfigure}%
    \hfill
    \begin{subfigure}[b]{0.45\textwidth}
        \centering
        \includegraphics[width=\textwidth]{images/koopman_mode_hankeldmd_2.png}
        % \caption{Optional caption for subfigure (d)}
        \label{fig:hankeldmd_93}
    \end{subfigure}
    
    \caption{The plots illustrate turbulence detection using the four Koopman modes computed by Hankel-DMD with the smallest residuals, ordered by increasing residual.}
    \label{fig:turbulence_hankeldmd}

\end{figure}


\subsection{Practical details for neural data analysis}
\subsubsection{Dataset Details and Experimental Setup}

The dataset utilized in this study is part of the open dataset provided for the `Sensorium 2023' competition \citep{turishcheva2023dynamic}. The dataset consists of calcium imaging recordings from the primary visual cortex of mice. During the experiments, the mice were presented with natural video stimuli while the activity of thousands of neurons was recorded. The objective of the competition is to predict large-scale neuronal population activity in response to different frames of the stimulus videos, based on the hypothesis that population dynamics in the primary visual cortex, driven by visual stimuli, encode significant information about the dynamics of the videos \citep{basole2003mapping, onat2011natural,henaff2021primary}.

\subsubsection{Task Definition and Rationale}
In contrast to the competition's prediction objective, our study focuses on the task of state partitioning of neural signals. While prediction remains feasible, we aim to demonstrate that state partitioning is sufficient to highlight the superiority of NN-ResDMD over a series of other methods in uncovering the latent dynamics of the system. Specifically, in each experiment, a set of six video stimuli was repeatedly presented to each mouse, creating ideal conditions for defining brain states. The recording setup remained consistent for each mouse, ensuring that the neural activities could be interpreted as originating from the same dynamical system, with the primary variable being the input stimulus.

We hypothesize that during repeated trials with identical visual stimuli, the underlying dynamics of the neural system remain consistent. Consequently, the recurrence of the same brain state is expected during these trials. This provides a reliable basis for testing the efficacy of Koopman decomposition methods in uncovering latent dynamics and distinguishing these states.

\subsubsection{Dataset Structure and Dimensionality}
The dataset includes neural recordings from five mice, with each mouse responding to six distinct video stimuli, presented in 9--10 repeated trials (resulting in approximately 60 trials in total). Each trial involves recordings of over 7000 neurons. The duration of each video stimulus is 10 seconds, with a sampling rate of 30 Hz, yielding 300 data points (299 snapshots) per trial. Thus, the data to be analyzed consists of a high-dimensional time series with 7000+ observables per snapshot.

\subsubsection{Implementations of NN-ResDMD and other classical methods} \label{sec:implement_appendix}

Here we compare four methods: the proposed NN-ResDMD and three classical Koopman decomposition methods for high-dimensional systems, namely Hankel-DMD, EDMD with an RBF basis, and Kernel ResDMD. We applied them to the five datasets, although with slightly different implementations and different dimensions of the approximated Koopman invariant subspace.

For NN-ResDMD, we train the dictionaries with all the snapshots recorded in each mouse such that the total snapshot number is the product of the snapshot number in one trial and the number of all trials. This is to avoid overfitting with the small snapshot numbers within a trial. The high-dimensional data is first reduced to 300 dimensions with Singular Value Decomposition. The dimension of the Koopman subspace is chosen to be 601, consisting of 300 trained bases and 301 pre-chosen ones (constant and the first-degree polynomials of the SVD-ed 300 dimensions). The first 501 eigenfunctions sorted by the modulus of eigenvalues are selected to avoid spurious eigenvalues estimation due to noise. One can find the decomposed eigenfunctions in Figure~\ref{fig:resDMD_HankelDMD_efuns}A(top), with a marker of the ground truth state separations based on stimulus identity. 

For Hankel-DMD, the Koopman eigenfunctions were approximated using the eigenvectors of the Hankel matrix. Specifically, the Hankel matrix was formed as in Equation 53 from \citet{arbabi2017ergodic}, using all the observables from one trial of each mouse with a delay of 50. Consequently, the snapshot size became 249 times the observable number, and the resulting number of eigenfunctions was 50, each with a length of 50. The Hankel-DMD eigenfunctions for each trial of data are shown in Figure~\ref{fig:resDMD_HankelDMD_efuns}A (bottom), alongside the ground truth trial identities for comparison.

For EDMD with RBF basis, the high-dimensional dataset is first reduced to 300 dimensions with SVD. Then RBF basis is calculated with 1000 RBF functions. The choice of the basis number is decided based on classical experiments of using RBF basis to estimate the Koopman operator of Duffing systems \citep{li2017extended}.  

For Kernel ResDMD, as it is a variant of Kernel EDMD \citep{kevrekidis2016kernel}, the dimension of the Koopman invariant subspace should correspond to the sample number (in time). Given the data size of 300, we have 299 snapshots, resulting in 299 Koopman bases. The detailed calculation is performed for each trial with the program provided in the original ResDMD papers \citep{colbrook2023residual, colbrook2024rigorous}. We chose the kernel function as the commonly used normalized Gaussian function in the calculation.

The Koopman eigenfunctions from both NN-ResDMD and other methods represent dynamical features corresponding to one of the six video stimuli. To evaluate how well the eigenfunctions capture the latent dynamics, we assess the similarity of the features for trials with the same stimulus and their dissimilarity from those corresponding to different stimuli. Effectively, this makes the problem a clustering task, where the separability of the Koopman eigenfunctions reflects how well they capture the key dynamic components related to the stimuli.

\subsection{Choice justification of dictionary sizes} \label{sec:basis_choice}
\textcolor{ForestGreen}{In this section, we provide justifications for the use of different dictionary sizes (i.e., the number of Koopman eigenfunctions) in the aforementioned four methods for the neural dynamics experiment.}

\textcolor{ForestGreen}{First, the high-dimensional data was pre-processed using SVD to reduce its dimensionality to 300. Then, for the four methods}:

\begin{enumerate} 
    \item \textcolor{ForestGreen}{For NN-ResDMD, we selected 300 trained basis functions together with 301 fixed basis functions (a constant and the 300 first-order monomials) as the dictionary for the 300 reduced observables, giving 601 dictionary functions in total. This choice ensures the dictionary is rich enough to span the Koopman invariant subspace. Hence, the size of the trained dictionary was set to be at least equal to the original observable size. Then, based on the ranking of the estimated Koopman eigenvalues, we select the dominant 501 eigenfunctions to avoid the eigenfunctions with zero eigenvalues.}
    
    \item \textcolor{ForestGreen}{For Hankel DMD, the number of delays (as dictionary size/number of eigenfunctions) is first constrained by the temporal sample size (i.e., snapshot size) because it cannot exceed the maximum snapshot size. Therefore, it is impossible to choose the same dictionary size as the NN-ResDMD example. Choosing the delay too small will result in an insufficient dictionary size to span the Koopman invariant subspace, and too large will reduce the actual snapshot size to estimate the covariance matrices in the estimation of the Koopman matrix. Therefore, we chose a compromise delay number of 50 that satisfies both needs.}
    
    \item \textcolor{ForestGreen}{For RBF basis, in principle, we can use the same dictionary size. However, our previous experience with a similar dataset and the results of using the RBF basis for the EDMD method all suggest that the performance will be better with more dictionary functions. Therefore, we chose 1000 RBF basis and the original 300 first-order monomial basis as a better condition compared to the same dictionary size with NN-ResDMD.}
    
    \item \textcolor{ForestGreen}{For Kernel ResDMD, the dictionary size is theoretically determined to be the number of snapshots. Therefore, we cannot make the dictionary size consistent with the NN-ResDMD example.}
\end{enumerate}
\textcolor{ForestGreen}{Based on the above justifications, we believe our choices of dictionary sizes are reasonable and ensure a fair comparison across the methods.}

\subsubsection{Visualization and Clustering Performance}
To visualize the clustering of high-dimensional Koopman eigenfunctions, we perform dimensionality reduction using Multi-dimensional Scaling (MDS). MDS is particularly useful for visualizing high-dimensional data by preserving pairwise similarities \citep{kruskal1964nonmetric} (here we use correlation as a measure of similarities). While UMAP \citep{mcinnes2018umap} and t-SNE \citep{van2008visualizing} are alternative visualization methods, with different emphasis on global-local relationships, we primarily use MDS in this study and provide UMAP and t-SNE results in the supplementary materials (see Appendix Figure~\ref{fig:umap_tsne}A, B, Appendix Figure~\ref{fig:edmd_rbf_all}C, D and Appendix Figure~\ref{fig:kernel_resdmd_all}C, D). UMAP in implementation is still correlation-based. For t-SNE estimation we use the perplexity of 15, as a value for optimal separation.

By applying MDS, the high-dimensional eigenfunction-based features are reduced to a low-dimensional space. For illustration, we present the results of reducing the feature space to two dimensions (Figure~\ref{fig:resDMD_HankelDMD_efuns}B-E). The NN-ResDMD reduced features for the six types of trials (corresponding to the six video stimuli) are well-separated for all five mice (Figure~\ref{fig:resDMD_HankelDMD_efuns}B). In contrast, the Hankel-DMD features show no clear clustering structure (Figure~\ref{fig:resDMD_HankelDMD_efuns}C). Similarly, the features produced by EDMD with an RBF basis and Kernel ResDMD do not show clear separability (Figure~\ref{fig:resDMD_HankelDMD_efuns}D-E, Appendix Figure~\ref{fig:edmd_rbf_all}B-D, Appendix Figure~\ref{fig:kernel_resdmd_all}B-D).


\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{UMAP_tSNE_2d_new}
\caption{State Partition performance of eigenfunctions for NN-ResDMD and Hankel-DMD in 2D space visualized with UMAP (A) and t-SNE (B).}
\label{fig:umap_tsne}
\end{figure}

\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{edmd_rbf_cluster_all.png}
\caption{Full results of EDMD with RBF basis. (A) 1301 Koopman eigenfunctions estimated by EDMD with RBF basis in 6 states characterized by 6 different video stimuli in an example mouse. Eigenfunctions in each trial of each state contain 300 data points (10\,s at a sampling rate of 30\,Hz).
(B) 2-D representation of Koopman eigenfunctions for each trial of all tested mice, calculated by EDMD with RBF basis and reduced by Multidimensional Scaling (MDS). No clear separation of states can be seen from the reduced representation. 
(C) Same as (B) but visualized with UMAP. No clear separation of states can be seen from the reduced representation. 
(D) Same as (C) but visualized with t-SNE. No clear separation of states can be seen from the reduced representation.}
\label{fig:edmd_rbf_all}
\end{figure}

\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{kernel_resdmd_cluster_all.png}
\caption{Same as Figure~\ref{fig:edmd_rbf_all} but estimated with Kernel ResDMD, with 299 basis of the Koopman subspace, thus 299 eigenfunctions.}
\label{fig:kernel_resdmd_all}
\end{figure}

\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{umap_tsne_dbindex.png}
\caption{Davies-Bouldin Indices evaluating the clustering performance of dynamical components learned by four methods (NN-ResDMD, Hankel DMD, EDMD+RBF, and Kernel ResDMD) across five mice. Comparisons are shown using UMAP (A) and t-SNE (B).}
\label{fig:umap_tsne_dbindex}
\end{figure}

\subsubsection{Clustering Quality Metrics}
We further quantified the clustering quality by calculating the Davies-Bouldin Index (DBI) for all four Koopman decomposition methods across all mice (Figure~\ref{fig:resDMD_HankelDMD_efuns}F). The DBI is designed to assess the compactness of clusters and the separability between them. A lower DBI indicates better clustering performance. NN-ResDMD features yield significantly lower DBI scores compared to the other methods, confirming that NN-ResDMD produces more clearly defined clusters corresponding to the ground truth trials. Similar clustering results are observed with UMAP and t-SNE (see Appendix Figure~\ref{fig:umap_tsne_dbindex}), further supporting the superior performance of NN-ResDMD in capturing the latent dynamic structure compared to the other classical methods.