\documentclass{article}[12] % For LaTeX2e
\usepackage{iclr2025_conference,times}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

\usepackage{hyperref}
\usepackage{url}
\usepackage[utf8]{inputenc}
\usepackage{amssymb}
\usepackage{amsmath}
\usepackage{amsthm}
\usepackage{mathtools}
\usepackage{float}
\usepackage{graphicx}
\usepackage{subcaption}
\usepackage{hyperref}
\usepackage[dvipsnames]{xcolor}
\usepackage{enumitem}
\usepackage{subcaption}
\usepackage{comment}




\DeclareMathOperator{\tr}{tr}
\numberwithin{equation}{section}
\newtheorem{theorem}{Theorem}[section]
\newtheorem{lemma}[theorem]{Lemma} % Define lemma environment with the same counter as theorem
\usepackage[linesnumbered,ruled,vlined]{algorithm2e}
\usepackage{algorithm2e}
\usepackage[colorinlistoftodos]{todonotes}
\usepackage{afterpage}
% \theoremstyle{remark}
\newtheorem*{remark}{Remark}
% \DeclareMathOperator*{\argmin}{arg\,min}

\title{NN-ResDMD: Learning Koopman Representations for Complex Dynamics with Spectral Residuals}

% Authors must not appear in the submitted version. They should be hidden
% as long as the \iclrfinalcopy macro remains commented out below.
% Non-anonymous submissions will be rejected without review.

\author{Yuanchao Xu$^{1}$\thanks{Equal Contributions.}\enspace, 
Kaidi Shao$^{2}$\footnotemark[1]\enspace, 
Nikos Logothetis$^{2}$,
Zhongwei Shen$^{1}$\thanks{Corresponding Authors. Email to: zhongwei@ualberta.ca}\enspace, \\ 
$^1$ Department of Mathematical and Statistical Sciences, University of Alberta\\
$^2$ International Center for Primate Brain Research, Chinese Academy of Sciences\\
% \texttt{zhongwei@ualberta.ca} \\
}

% \author{Yuanchao Xu \thanks{Equal contribution. Email to} \\
% % Department of Mathematical and Statistical Science\\
% University of Alberta\\
% % Edmonton, AB T6G2R3, Canada \\
% \texttt{yuanchao@ualberta.ca} \\
% \And
% Kaidi Shao\footnotemark[1] \\
% % International Center for Primate Brain Research \\
% Chinese Academy Science \\
% % Song Jiang District, China \\
% \texttt{kaidi.shao@icpbr.ac.cn} \\
% \And
% Matthew Colbrook \\
% % Department of Applied Mathematics and Theoretical Physics \\
% University of Cambridge\\
% \texttt{m.colbrook@damtp.cam.ac.uk} \\
% \And
% Nikos Logothetis \\
% % International Center for Primate Brain Research \\
% Chinese Academy Science \\
% % Song Jiang District, China \\
% \texttt{nikos.logothetis@icpbr.ac.cn} \\
% \And
% Zhongwei Shen \\
% % Department of Mathematical and Statistical Science \\
% University of Alberta\\
% % Edmonton, AB T6W5E7, Canada \\
% \texttt{zhongwei@ualberta.ca} \\
% }


% The \author macro works with any number of authors. There are two commands
% used to separate the names and addresses of multiple authors: \And and \AND.
%
% Using \And between authors leaves it to \LaTeX{} to determine where to break
% the lines. Using \AND forces a linebreak at that point. So, if \LaTeX{}
% puts 3 of 4 authors names on the first line, and the last on the second
% line, try using \AND instead of \And before the third author name.

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

% \iclrfinalcopy % Uncomment for camera-ready version, but NOT for submission.
\begin{document}


\maketitle

\begin{abstract}

Analyzing long-term behaviors in high-dimensional nonlinear dynamical systems remains a significant challenge. The Koopman operator framework has emerged as a powerful tool to address this issue by providing a globally linear perspective on nonlinear dynamics. However, existing methods for approximating the Koopman operator and its spectral components, particularly in large-scale systems, often lack robust theoretical guarantees.
Residual Dynamic Mode Decomposition (ResDMD) introduces a spectral residual measure to assess the convergence of the estimated Koopman spectrum, which helps filter out spurious spectral components. 
Nevertheless, it depends on pre-computed spectra, thereby inheriting their inaccuracies. 
To overcome its limitations, we introduce the Neural Network-ResDMD (NN-ResDMD), a method that directly estimates Koopman spectral components by minimizing the spectral residual. By leveraging neural networks, NN-ResDMD automatically identifies the optimal basis functions of the Koopman invariant subspace, eliminating the need for manual selection and improving the reliability of the analysis.
Experiments on physical and biological systems demonstrate that NN-ResDMD significantly improves both accuracy and scalability, making it an effective tool for analyzing complex dynamical systems.
\end{abstract}

\section{Introduction}
In the study of complex dynamical systems, a critical challenge lies in accurately extracting and analyzing long-term behavior in high-dimensional nonlinear systems.  Various data-driven methods \citep{Brunton_Kutz_2019,schetzen2006volterra,wiggins2003introduction,slotine1991applied,lan2013linearization,Mezic2005} have been developed to address this challenge, with the Koopman operator \citep{Koopman1931,koopman1932dynamical} framework emerging as a powerful tool due to its ability to globally linearize nonlinear systems. Unlike local linearization methods \citep{Hartman1960ALI,Grobman1959}, which approximate dynamics near fixed points, the Koopman operator transforms the entire system into a linear form within an infinite-dimensional space, which allows the use of spectral analysis techniques to study complex dynamics.

Despite its promise, practical computational challenges arise from the infinite-dimensional nature of the Koopman operator. Numerical methods such as Extended Dynamic Mode Decomposition (EDMD) \citep{Williams2015} have been developed to approximate the Koopman operator using a finite set of observables, making it possible to extract dynamic modes from data. However, EDMD lacks theoretical guarantees of convergence and may fail to capture the full Koopman spectrum accurately, particularly in large-scale, complex systems.

To address these limitations, the Residual Dynamic Mode Decomposition (ResDMD) method \citep{colbrook2024rigorous} was introduced, which offers convergence guarantees by using a spectral residual measure that quantifies the extent to which the estimated Koopman spectrum converges to the true spectrum of the system. By assessing the convergence, ResDMD can eliminate spurious spectral components—those that do not correspond to the true dynamics of the system—thereby enhancing the reliability and robustness of the spectral estimation. However, ResDMD primarily serves as a filtering tool for precomputed spectra rather than providing a direct and more accurate approximation of Koopman spectra. Consequently, it lacks the capacity to independently refine the spectral estimation.

In this paper, we propose Neural Network-ResDMD (NN-ResDMD), which overcomes this limitation by providing a method to directly compute Koopman eigenpairs by minimizing the spectral residual. Additionally, NN-ResDMD employs neural networks to automatically select basis functions, eliminating the need for manual intervention, a common challenge in EDMD-based methods. Through experiments on both toy models and real-world high-dimensional systems, we demonstrate that NN-ResDMD significantly improves accuracy and scalability, making it a practical and effective tool for analyzing complex dynamical systems.


\section{Preliminaries on the Koopman operator}
Consider a discrete-time dynamical system \((\Omega, \mu)\) governed by a map \(F: \Omega \to \Omega\), where \(\Omega \subseteq \mathbb{R}^d\) is the state space, and \(\mu\) is a probability measure. The evolution of the system is described by:
\[
x_{k+1} = F(x_k), \quad k \in \mathbb{Z}^+.
\]
The Koopman operator \(\mathcal{K}\) acts on observables \(g \in L^2(\Omega, \mu)\) as:
\[
\mathcal{K} g = g \circ F.
\]
Although \(F\) is nonlinear, the Koopman operator \(\mathcal{K}\) is linear, enabling spectral analysis of the system in an infinite-dimensional function space. 

A key aspect of modern Koopman operator theory is Koopman Mode Decomposition (KMD) \citep{Mezic2005}, which represents system dynamics through its spectral components, i.e. the eigenvalues, Koopman modes, and eigenfunctions. The discrete spectrum is particularly important for insights into long-term behavior, such as periodicity and stability. Our analysis emphasizes these spectral components derived from KMD. Specifically, we seek eigenpairs \((\lambda_i, \phi_i)\), where \(\lambda_i\) are eigenvalues and \(\phi_i\) are the corresponding Koopman eigenfunctions.

One of the most prominent numerical methods to approximate the Koopman operator and its spectral components is the Extended Dynamic Mode Decomposition (EDMD) method, introduced by \cite{Williams2015}. In EDMD, a set of observables (dictionary \textcolor{blue}{or basis} functions) \(\mathbf{\Psi} = [\psi_1, \dots, \psi_{N_K}]\) is selected, and the span of these observables defines the subspace \( V_{N_K} \coloneqq \text{span}\{\psi_i\}_{i=1}^{N_K} \). Snapshots of the system's state are then collected, and the method constructs a finite-dimensional approximation of the Koopman operator by solving a least-squares problem that relates the snapshots of observables. This enables the computation of eigenvalues, eigenfunctions, and Koopman modes. \textcolor{blue}{Note that while common choices of dictionary functions are polynomials, Fourier basis, RBF functions, etc., the optimal choice of basis functions is usually unknown a priori and depends heavily on the specific dynamical system. }

Given independent and identically distributed data snapshots \(\{(x_i, y_i)\}_{i=1}^m\) with \(y_i = F(x_i)\), two matrices \(\Psi_X\) and \(\Psi_Y\) are formed by evaluating the dictionary on the data snapshots:

\[
\Psi_X = \begin{bmatrix}
\psi_1(x_1) & \dots & \psi_{N_K}(x_1) \\
\vdots & \ddots & \vdots \\
\psi_1(x_m) & \dots & \psi_{N_K}(x_m)
\end{bmatrix}, \quad
\Psi_Y = \begin{bmatrix}
\psi_1(y_1) & \dots & \psi_{N_K}(y_1) \\
\vdots & \ddots & \vdots \\
\psi_1(y_m) & \dots & \psi_{N_K}(y_m)
\end{bmatrix}.
\]

EDMD computes the Koopman matrix approximation as \(K = \Psi_X^{\dagger} \Psi_Y\), where \(\Psi_X^{\dagger}\) is the pseudo-inverse of \(\Psi_X\). The eigenvalues of \(K\) provide approximations of the Koopman operator’s spectrum, and the Koopman eigenfunctions \(\phi_i\) are approximated as \(\phi_i = \mathbf{\Psi} \mathbf{v}_i\), where \(\mathbf{v}_i \in \mathbb{C}^{N_K}\) is the \(i\)-th eigenvector of \(K\).


\section{Koopman Operator Learning}
While EDMD effectively approximates the Koopman operator, it still suffers from issues like spectral pollution. As the dictionary size increases, spurious eigenvalues can accumulate, leading to an inaccurate or over-saturated spectrum that misrepresents the system’s true dynamics. This makes it difficult to distinguish between meaningful dynamic modes and noise, ultimately reducing the accuracy of the analysis. To address these limitations, Residual Dynamic Mode Decomposition (ResDMD) \citep{colbrook2024rigorous} filters out spurious eigenvalues by assessing their spectral residuals. However, ResDMD relies on precomputed eigenpairs, inheriting inaccuracies from methods like EDMD without directly improving the initial spectral estimation.

In contrast, we introduce the Neural Network-ResDMD (NN-ResDMD), a new method that provides a theoretically convergent way to approximate the Koopman operator and its spectral components by minimizing a ResDMD-specific loss function. Additionally, NN-ResDMD optimizes the dictionary functions for the Koopman invariant subspace using a Feedforward Neural Network (FNN), which eliminates the need for manual design of basis functions.


\subsection{ResDMD Review}
Now, suppose we have obtained an eigenpair \((\lambda, \phi)\) of \(\mathcal{K}\) from EDMD or other methods \citep{colbrook2023mpedmd,baddoo2021physics,10.1063/5.0073893,Schmid2010,Tu2014,li2017extended}, where $\lambda \in \mathbb{C}$ and the eigenfunction $\phi$ is expanded in terms of dictionary functions, i.e., $\phi = \mathbf{\Psi}\mathbf{v} = \sum_{i=1}^{N_K} \psi_i v_i \in V_{N_K}$ for some $\mathbf{v} \in \mathbb{C}^{N_K}$, where the $v_i$ are the expansion coefficients. Without loss of generality, we assume that $\phi$ has been normalized, i.e., $\|\phi\|_2=1$. The accuracy of this eigenpair approximation in the ResDMD framework can be measured by computing its \textit{squared relative residual} using the dictionary in the following way:
\begin{gather}\label{relative_residual}   
    \textit{res}(\lambda, \phi)^2 \coloneqq \frac{\int_{\Omega} |\mathcal{K}\phi(x) - \lambda \phi(x)|^2 d\mu(x)}{\int_{\Omega} |\phi(x)|^2 d\mu(x)} \notag \\
    = \sum_{i,j=1}^{N_K} \bar{v}_i \left[ \langle \mathcal{K} \psi_i, \mathcal{K} \psi_j \rangle_{\mu} - \lambda \langle \mathcal{K}\psi_i, \psi_j \rangle_{\mu} - \bar{\lambda} \langle \psi_i, \mathcal{K}\psi_j \rangle_{\mu} + |\lambda|^2 \langle \psi_i, \psi_j \rangle_{\mu} \right] v_j, 
\end{gather}
where $\bar{v}_i, \bar{\lambda}$ denote the complex conjugate of $v_i, \lambda$.

This \textit{squared relative residual} in \eqref{relative_residual} is the theoretical value that measures the distance between \(\phi\) and the eigenspace associated with \(\lambda\), especially under the assumption that \(\lambda\) is in the discrete spectrum of \(\mathcal{K}\). To approximate this residual in practice, we apply the Galerkin approximation \citep{boyd2013chebyshev}, which states that as the number of data points $m$ increases, the following limits hold:
\begin{equation}\label{eq:psi_inner_product_resdmd}
    \begin{aligned}
        \lim_{m \to \infty} \frac{1}{m}\left[\Psi_X^* \Psi_X\right]_{ij} &= \langle \psi_i, \psi_j \rangle_{\mu}, \\
        \lim_{m \to \infty} \frac{1}{m}\left[\Psi_X^* \Psi_Y\right]_{ij} &= \langle \psi_i, \mathcal{K}\psi_j \rangle_{\mu}, \\
        \lim_{m \to \infty} \frac{1}{m}\left[\Psi_Y^* \Psi_Y\right]_{ij} &= \langle \mathcal{K}\psi_i, \mathcal{K}\psi_j \rangle_{\mu} = \langle \psi_i, \mathcal{K}^*\mathcal{K}\psi_j \rangle_{\mu}, \\
    \end{aligned}
\end{equation}
where $*$ denotes the conjugate transpose. Using this approximation, the \textit{squared relative residual} from \eqref{relative_residual} is approximated as follows (see Appendix~\ref{relative_residual_approximation_calculation} for more details):
\begin{equation}\label{relative_residual_approximation}    
    \widehat{\textit{res}}(\lambda, \phi)^2 \coloneqq \frac{1}{m}\mathbf{v}^* \left[ \Psi_Y^* \Psi_Y - \lambda (\Psi_X^* \Psi_Y)^* - \bar{\lambda} \Psi_X^* \Psi_Y + |\lambda|^2 \Psi_X^* \Psi_X \right] \mathbf{v}.
\end{equation}
Here, $\widehat{\textit{res}}(\lambda, \phi)^2$ in \eqref{relative_residual_approximation} is the data-driven approximation of the theoretical value in \eqref{relative_residual}. 

\textcolor{Mulberry}{By definition in \eqref{relative_residual}, the residual quantifies the deviation from the spectral property, measuring how far the estimated eigenpair is from the true spectrum. In practice, \eqref{relative_residual_approximation} is calculated for all precomputed eigenpairs, retaining those with residuals below a threshold. However, while residuals help filter and select valid eigenpairs, they do not improve the accuracy of eigenpair estimation.}
%By definition in \eqref{relative_residual}, this residual quantifies the deviation from satisfying the spectral property, effectively measuring how far the estimated eigenpair deviates from the true spectrum. In practice, \eqref{relative_residual_approximation} can be calculated for all precomputed eigenpairs, and those with residuals below a certain threshold are retained. However, the key limitation is that while the residuals offer a way to filter and select valid eigenpairs, they do not provide a more accurate method for estimating the eigenpairs themselves.


\subsection{Neural Network-ResDMD}

\noindent \textbf{General framework }
In this section, we present the Neural Network-ResDMD (NN-ResDMD) framework, designed to compute the eigenpairs of the Koopman operator directly using ResDMD-based spectral residuals, as illustrated in Figure \ref{fig:flow_chart_2}. The method first determines the optimal dictionary functions by minimizing the \textit{total residual} \( J \coloneqq \sum_{i=1}^{N_K} \widehat{\textit{res}}(\lambda_i, \phi_i)^2 \), over all computed eigenpairs \(\{(\lambda_i, \phi_i)\}_{i=1}^{N_K}\). \textcolor{blue}{The spectral residual directly impacts the finite-dimensional projection of the Koopman operator and our method minimizes this residual to ensure the learned basis functions adequately capture the Koopman dynamics.} This approach allows the construction of the Koopman operator matrix \( \tilde{K} \) without relying on external methods or post-processing. Equation \eqref{thm:main} enables NN-ResDMD to compute eigenpairs directly, improving accuracy compared to ResDMD, which relies on filtering precomputed results from other methods.

In this framework, neural networks parameterize the dictionary functions \( \mathbf{\Psi}(x; \theta) \), where \( \theta \) represents the network parameters. By minimizing the spectral residual \( J \), this approach directly optimizes the dictionary functions \textcolor{red}{towards better approximation of Koopman spectral components, which ensures the learned operator captures the underlying spectral properties of the dynamical system}. This is fundamentally different from traditional methods like EDMD, which \textcolor{red}{focus on minimizing prediction errors in the observable space without explicitly considering spectral accuracy}. The neural network architecture serves as a flexible function approximator that allows the framework to adaptively learn the optimal dictionary that minimizes spectral residuals, \textcolor{red}{thereby producing more accurate and reliable Koopman spectral decompositions}. This \textcolor{red}{spectral-oriented optimization} improves the accuracy of eigenvalue approximations and enhances the quality of the computed eigenfunctions, which leads to better characterization of the system's dynamic behavior.
\begin{figure}[!htb]
    \centering
    \includegraphics[width=0.75\linewidth]{images/flow_chart_2.png}
            \vspace{-2em} % Adjust the value to reduce the gap
    \caption{(Left) The classical ResDMD method and (right) the neural-network-based ResDMD (NN-ResDMD) method.}

    \label{fig:flow_chart_2}
\end{figure}

\noindent \textbf{From Residual to NN }
This section explains how neural networks are integrated into the ResDMD framework. In ResDMD, the \textit{squared relative residual} approximation \eqref{relative_residual_approximation} measures how well a computed eigenpair fits the dataset. If the Koopman matrix \( K \) approximates the projected Koopman operator \( \mathcal{K}_{N_K} \) well, the \textit{total residual} \( J \) should approach zero as more data are provided. Thus, \( J \) can be used as a loss function, and the optimal Koopman matrix \( \tilde{K} \) is obtained by minimizing:
\begin{equation}\label{minimization_problem}
    J = \sum_{i=1}^{N_K} \widehat{\textit{res}}(\lambda_i, \phi_i)^2 .
\end{equation}
which is equivalent to minimizing the following (see Appendix~\ref{thmpf:main} for more details):
\begin{equation}\label{thm:main}
    J = \frac{1}{m} \Vert (\Psi_Y  - \Psi_X K) V \Vert _{F}^2
\end{equation}
where $V$ is a matrix in which each column is an eigenvector $\mathbf{v}_i$ of Koopman matrix $K$. Thus, with a fixed dictionary function $\mathbf{\Psi}$, the explicit form for the optimal Koopman matrix \( \tilde{K} \) can be directly computed as
\begin{equation}\label{K_approximation}
    \tilde{K} = G^{\dagger}A
\end{equation}
where $G = \frac{1}{m}\Psi_X^* \Psi_X , A = \frac{1}{m}\Psi_X^* \Psi_Y$.
\begin{remark}
    Typically a regularization term is needed to enhance stability. Here we add a small perturbation, i.e., $ \tilde{K} = \textcolor{blue}{(G+\sigma I)^{-1}}A$ for some small number $\sigma>0$.
\end{remark}
As shown in \eqref{K_approximation}, NN-ResDMD provides an explicit expression for \( \tilde{K} \) given the \textit{optimal} dictionary function  $\mathbf{\Psi}$, allowing for the direct computation of Koopman eigenpairs. The optimization problem in \eqref{thm:main} minimizes the error along the eigenbasis, in contrast to the optimization problem $\Vert \Psi_Y  - \Psi_X K \Vert _{F}^2$ for EDMD, thereby yielding a different \textit{optimal} $\mathbf{\Psi}$ compared to EDMD. \textcolor{red}{Therefore, although the $K$ update procedure appears identical to that of EDMD, the two originate from different theoretical foundations and serve different optimization purposes.} Additionally, NN-ResDMD automatically optimizes basis functions using neural networks, removing the need for manual selection. Since NN-ResDMD is based on the ResDMD framework, it also retains the theoretical convergence guarantees that EDMD lacks\textcolor{red}{: EDMD has convergence results under strong assumptions, such as requiring the Koopman operator to be bounded (Assumption 2 in \cite{korda2018convergence}), whereas ResDMD requires only that the operator is closed and densely defined. }

In NN-ResDMD, neural networks parameterize the dictionary functions \( \mathbf{\Psi}(x; \theta) \) to minimize the \textit{total residual} \( J(\theta) \), as defined in \eqref{minimization_problem}. The feedforward neural network generates the dictionary functions based on data snapshots, and the total residual is given by:
\begin{equation}\label{residual_theta}
    J(\theta) = \frac{1}{m} \| (\Psi_Y(\theta) - \Psi_X(\theta) K(\theta)) V(\theta) \|_F^2
\end{equation}
where \( K(\theta) \) and \( V(\theta) \) depend on \( \theta \). The Koopman matrix \( \tilde{K}(\theta) \) is computed as:
\begin{equation}\label{K_theta}
  \tilde{K}(\theta) = G(\theta)^\dagger A(\theta)
\end{equation}
with \( G(\theta) = \frac{1}{m} \Psi_X(\theta)^* \Psi_X(\theta) \) and \( A(\theta) = \frac{1}{m} \Psi_X(\theta)^* \Psi_Y(\theta) \).

The algorithm alternates between updating \( K(\theta) \) via least squares and optimizing \( \theta \) using gradient descent until \( J(\theta) \) converges, yielding the approximated Koopman spectrum and optimized dictionary functions. \textcolor{red}{While it is possible to optimize both \( K \) and \( \theta \) simultaneously, as done in \cite{takeishi2017learning} and \cite{otto2019linearly}, our separate procedure ensures computational efficiency and numerical stability compared to the coupled optimization case.}


\noindent \textbf{Computing Algorithm }In our neural network implementation, we include some non-trainable basis outputs to enhance the dictionary functions. Specifically, we add a vector of ones and the coordinates of the state space as non-trainable basis functions in the output layer, which helps avoid trivial solutions, i.e., $J = 0$ for some initial $\theta$. For the network architecture, \textcolor{ForestGreen}{we build a three-layer feedforward network where each hidden layer size can be specified during training. W}e use the hyperbolic tangent (tanh) function as the activation function for the hidden layers. In terms of optimization, we employ the Adam optimizer for updating the network parameters. Adam is particularly well-suited for this task due to its ability to adapt the learning rate for each parameter, which can lead to faster convergence in the alternating optimization process between the network parameters and the Koopman matrix. The computing steps are summarized in Algorithm~\ref{alg:1}.

\RestyleAlgo{ruled}
\begin{algorithm}[!hbt]
\caption{NN-ResDMD}\label{alg:1}
\SetKwInput{KwInput}{Input}   % Set the Input
\SetKwInput{KwOutput}{Output} % Set the Output
\KwInput{Dataset $X, Y$, number of observables $N_K$, learning step $\delta$, regularization parameter $\sigma$, loss function threshold $\epsilon>0$, \textcolor{Mulberry}{grid points $\{z_1, \ldots, z_{n_z}\}$, residual threshold $\varepsilon>0$}. 
% residual threshold ϵ2>0\epsilon_2>0, 
% a grid z1,…zk∈Cz_1, \dots z_k \in \mathbb{C}.
}
\BlankLine
Initialize $\theta$, thus initializing $\mathbf{\Psi}(\theta)$ \;
\textcolor{Mulberry}{Compute $\tilde{K}(\theta)$ and its eigenvector matrix $V(\theta)$ \;}
\While{$J(\theta) > \epsilon$}{
    Update $\theta = \theta - \delta \nabla_{\theta} J(\theta)$ \;
    Compute $G(\theta)=\frac{1}{m}\Psi_{X}^* \Psi_{X}, A(\theta)=\frac{1}{m} \Psi_{X}^* \Psi_{Y}$\;
    Update $\tilde{K}(\theta) = ( G(\theta) + \sigma I )^{-1} A(\theta) $ and $V(\theta)$ \;
}

% Solve eigenpairs $\{(\lambda_i, \phi_i=\mathbf{\Psi}\mathbf{v}_i)\}_{i=1}^{N_K}$ of $\tilde{K}(\theta)$ \;

\KwOutput{\textcolor{Mulberry}{$\tilde{K}(\theta)$, eigenpairs $\{(\lambda_i, \phi_i=\mathbf{\Psi}\mathbf{v}_i)\}_{i=1}^{N_K}$ and pseudospectrum $\{z_j: \tau_j<\varepsilon\}$.}}

% \For{each zjz_j}{
%     Compute τj=min\tau_j = \min_{v_i \in \mathbb{C}^{N_k}} \widehat{\textit{res}}(z_j, \mathbf{\Psi}(\theta) \mathbf{v}_i) (See (???)\eqref{relative_residual_approximation}(???)\eqref{relative_residual_approximation}) and the corresponding vectors \varphi_j = \mathbf{\Psi}(\theta) \mathbf{v}_i\varphi_j = \mathbf{\Psi}(\theta) \mathbf{v}_i\;
% }
% \KwOutput{Eigenpairs \{(\lambda_i, \phi_i)\}_{i=1}^{N_K}\{(\lambda_i, \phi_i)\}_{i=1}^{N_K}; estimate of the \epsilon\epsilon-pseudospectrum \{z_j : \tau_j < \epsilon_2\}\{z_j : \tau_j < \epsilon_2\} and approximate eigenfunctions \{\varphi_j : \tau_j < \epsilon_2\}\{\varphi_j : \tau_j < \epsilon_2\}.}
\end{algorithm}

\textcolor{ForestGreen}{While the practical advantages of NN-ResDMD are demonstrated through experiments, it is important to note its computational demands. The algorithm's computational complexity stems primarily from its iterative optimization process. Each iteration involves a gradient descent update whose complexity scales linearly with both the system dimensionality and the number of neural network parameters. Although individual gradient steps are computationally lightweight for standard network architectures, the algorithm's main cost lies in its repeated least-squares optimizations. Compared to the single least-squares computation used in most numerical algorithms, NN-ResDMD requires multiple iterations to achieve convergence, with stochastic gradient descent methods showing a theoretical $O(1/n)$ convergence rate in the number of iterations $n$. However, the method's nonlinear optimization nature also presents challenges for establishing concrete convergence bounds and error estimates.}

If the continuous spectrum of the Koopman operator is of interest, following the idea of ResDMD, we can scan candidate spectrum values within a grid in the complex plane using the residuals. Specifically, we compute \( \tau_j = \min_{\mathbf{v} \in \mathbb{C}^{N_K},\, \|\mathbf{\Psi}(\theta)\mathbf{v}\|_2 = 1} \widehat{\textit{res}}(z_j, \mathbf{\Psi}(\theta) \mathbf{v}) \), where \( \tau_j \) is the minimum residual for a grid point \( z_j \in \mathbb{C} \). The approximated whole spectrum containing the continuous spectrum is then given by \( \{z_j : \tau_j < \varepsilon\} \). \textcolor{Mulberry}{More details can be found in \citet{colbrook2024rigorous}.}

While the practical advantages of NN-ResDMD are demonstrated through experiments, it's also worth noting that the method has theoretical underpinnings \citep{haykin2009neural, weinan2019barron}  that support its convergence properties. A brief discussion on the convergence aspects of NN-ResDMD, leveraging existing results from approximation theory in Barron spaces, is provided in Appendix \ref{convergence_discussion}. This discussion offers insights into how the neural network component of NN-ResDMD contributes to its effectiveness in approximating complex dynamical systems.


%%%%%%%%%%%%%%%%% Examples %%%%%%%%%%%%%%%%%
\section{Application in physical and biological systems}
%In this chapter, we present three examples that demonstrate the effectiveness of the NN-ResDMD method. In each example, we focus on estimating the three key quantities of the Koopman Mode Decomposition (KMD): eigenvalues, eigenfunctions and Koopman modes. In the first (low-dimensional) example on the classical pendulum system, we will show that our method requires significantly fewer dictionary observables compared to a similar example in \citep[Section 4.3.1, Section 6.3]{colbrook2024rigorous} when computing the spectrum of the Koopman operator and performs better in approximating continuous spectra. In the second (high-dimensional) example on turbulence, we will show that our method can detect acoustic vibrations and distinguish the pressure field by computing the Koopman modes. In the third (real-world) example on a high-dimensional neural system, we compare our method with three other popular methods in the data-driven Koopman analysis field: Hankel-DMD \citep{arbabi2017ergodic}, EDMD with Radial Basis Function (RBF) basis, and kernelized-ResDMD (Kernel ResDMD) \citep{colbrook2024rigorous}, and demonstrate the superiority of our method in identifying and clustering latent dynamic structures. These examples illustrate how our method performs in various systems, and provides a comprehensive evaluation of its capabilities.
\textcolor{Mulberry}{In this section, we present three examples demonstrating the effectiveness of NN-ResDMD in estimating the key quantities of Koopman Mode Decomposition (KMD): spectrum, eigenfunctions, and Koopman modes. In the first low-dimensional example of a classical pendulum system, our method requires significantly fewer dictionary observables than \citep[Section 4.3.1, Section 6.3]{colbrook2024rigorous} to compute the Koopman spectrum. The second high-dimensional example on turbulence highlights our method's ability to detect acoustic vibrations and distinguish the pressure field through Koopman modes. The third example, a real-world high-dimensional neural system, compares NN-ResDMD with three popular methods—Hankel-DMD \citep{arbabi2017ergodic}, EDMD with RBF basis, and kernelized-ResDMD \citep{colbrook2024rigorous}—and demonstrates its superiority in identifying and clustering latent dynamic structures. Together, these examples showcase NN-ResDMD's performance across diverse systems and comprehensively evaluate its capabilities.} 

\textcolor{ForestGreen}{Specifically, in all three experiments, we compare NN-ResDMD with Hankel-DMD, a theoretically grounded method that analyzes dynamical systems using time-delayed state measurements (see Appendix \ref{hankel_dmd_intro} for a justification). Although its performance rivals NN-ResDMD in the simple pendulum system, it fails to capture key dynamics in higher-dimensional systems.}

\subsection{Pendulum}
\textcolor{blue}{The pendulum system is a measure-preserving system due to its Hamiltonian nature, which theoretically implies its whole spectrum lies on the unit circle. For its dynamical behaviors}, if the initial position of the pendulum is sufficiently far from the peak and the initial angular speed sufficiently small, the pendulum will oscillate; otherwise, the pendulum will pass the peak and rotate. In other words, this complex system exhibits two types of dynamical behaviors: rotation and oscillation. Here we simulate two cases with different numbers of initial points. We choose 90 and 240 initial points uniformly in the domain $[ -\pi, \pi]_{per} \times [-15, 15]$. Each point evolves 1000 steps with a step size of 0.5. Thus, the total data size in each set is approximately $9 \times 10^4$ and $2.4 \times 10^5$, respectively.

%\textcolor{ForestGreen}{Here, the ground truth spectrum is the entire unit circle ($ |\lambda| = 1 $), encompassing eigenvalues and the continuous spectrum.} 
%As we can see in Figure \ref{fig:pendulum_90}, we only need $N_K=300$ observables to calculate the whole spectrum approximation, which is significantly fewer than the number (nearly 1000) of observables required in \citep[Section 4.3.1]{colbrook2024rigorous} given the same data size. Moreover, even when the data size is largely increased, as seen in Figure \ref{fig:pendulum_240}, the number of necessary observables ($N_K=350$) remains relatively small, demonstrating the robustness of efficient observables over different data sizes.
\textcolor{Mulberry}{As shown in Figure \ref{fig:pendulum_90}, only $N_K=300$ observables are needed to approximate the full spectrum, significantly fewer than the nearly 1000 required in \citep[Section 4.3.1]{colbrook2024rigorous} for the same data size. Even with a much larger data size (Figure \ref{fig:pendulum_240}), the required observables remain small ($N_K=350$), demonstrating the robustness of efficient observables across data sizes.}

% %%%% Original %%%%
% As shown in Figure \ref{fig:pendulum_90_others}, we compare our method to four approaches: EDMD, EDMD with Dictionary Learning (EDMD-DL), Hankel-DMD, and ResDMD, on the dataset with 90 initial points to compute the Koopman matrix and its corresponding spectral information. The first three methods (EDMD \citep{Williams2015}, EDMD-DL \citep{li2017extended}, and Hankel-DMD are limited to computing eigenvalues associated with the point spectrum. In these experiments, both EDMD and ResDMD use the hyperbolic cross approximation with Hermite functions up to order 15 and Fourier functions up to order 20. Hankel-DMD uses a time delay of 150. Although Hankel-DMD yields accurate eigenvalues, \textcolor{ForestGreen}{it still suffers from spectral pollution} and requires careful tuning of the time delay parameter. With 300 basis functions, ResDMD is still unable to fully capture the whole spectrum, i.e., the unit circle, due to the insufficient number of basis functions. In the original ResDMD work, 964 basis functions using a hyperbolic cross approximation of order 100 were required to adequately cover the spectrum with a dataset of the same size \citep[Section 4.3.1]{colbrook2024rigorous}. This comparison demonstrates that NN-ResDMD, even with only 300 basis functions, outperforms all four classical methods in terms of capturing the complete spectrum with greater accuracy and fewer basis functions.

%%%% Modification 1 %%%%
As shown in Figure~\ref{fig:pendulum_90_others}, we compare our method to four approaches: EDMD, EDMD with Dictionary Learning (EDMD-DL), Hankel-DMD, and ResDMD, on the dataset with 90 initial points to compute the Koopman matrix and its corresponding spectral information. The first three methods (EDMD \citep{Williams2015}, EDMD-DL \citep{li2017extended}, and Hankel-DMD) are limited to computing eigenvalues associated with the point spectrum. In these experiments, both EDMD and ResDMD use the hyperbolic cross approximation with Hermite functions up to order 15 and Fourier functions up to order 20. Hankel-DMD uses a time delay of 150. \textcolor{ForestGreen}{While Hankel-DMD yields accurate eigenvalues and shows points near the unit circle, which is the ground truth for this Hamiltonian system, it only captures the point spectrum (eigenvalues) and misses the entire spectrum (eigenvalues + continuous spectrum). It also still suffers from spectral pollution and requires careful tuning of the time delay parameter.} With 300 basis functions, ResDMD is still unable to fully capture the entire spectrum, i.e., the unit circle, due to the insufficient number of basis functions. In the original ResDMD work, 964 basis functions using a hyperbolic cross approximation of order 100 were required to adequately cover the spectrum with a dataset of the same size \citep[Section 4.3.1]{colbrook2024rigorous}. \textcolor{ForestGreen}{Our method, in contrast, represents the results using shaded areas that show the pseudospectrum, which is a key feature that can capture the whole spectrum. While the shaded area may appear broad, this actually demonstrates our method's ability to detect the complete spectrum. The radius of the shaded region accounts for computational uncertainties, as exact spectrum computation is computationally impossible. 
Theoretically, ResDMD guarantees that as this error tolerance (radius of the shaded region) approaches zero, the pseudospectrum converges to the true spectrum (the unit circle in this case) without spectral pollution.} This comparison demonstrates that NN-ResDMD, even with only 300 basis functions, outperforms all four classical methods in terms of capturing the complete spectrum with greater accuracy and fewer basis functions.


%%%%%%%%
\begin{figure}[!htb]
\centering

\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_90_resdmd_25basis}
    \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize NN-ResDMD: $N_K = 25$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_90_resdmd_50basis}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize NN-ResDMD: $N_K = 50$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_90_resdmd_100basis}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize NN-ResDMD: $N_K = 100$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_90_resdmd_300basis}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize NN-ResDMD: $N_K = 300$}
\end{minipage}%

\caption{The four plots depict the spectrum of the Koopman operator, constructed using varying dictionary size $N_K$ of 25, 50, 100, and 300. Each plot utilizes 90 initial points to illustrate the impact of increasing the dictionary size on approximating the spectrum of the Koopman operator.}
\label{fig:pendulum_90}

% \vspace{1cm}

\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_240_resdmd_50basis}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize NN-ResDMD: $N_K = 50$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_240_resdmd_150basis}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize NN-ResDMD: $N_K = 150$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_240_resdmd_250basis}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize NN-ResDMD: $N_K = 250$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_240_resdmd_350basis}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize NN-ResDMD: $N_K = 350$}
\end{minipage}%

\caption{ \textcolor{Mulberry}{Same example as Figure~\ref{fig:pendulum_90} but with larger data size, using 240 initial points to show the effect of increasing dictionary size on approximating the Koopman operator spectrum.}}
\label{fig:pendulum_240}

\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_90_edmd.png}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize EDMD: $N_K = 300$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_90_edmddl.png}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize EDMD-DL: $N_K = 300$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_90_hankeldmd.png}
    \caption*{\scriptsize Hankel-DMD: $N_K = 300$}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.22\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pendulum_90_resdmd.png}
        \vspace{-2em} % Adjust the value to reduce the gap
    \caption*{\scriptsize ResDMD: $N_K = 300$}
\end{minipage}%

\caption{ \textcolor{Mulberry}{Comparison with classical methods. The four plots above represent the spectral information obtained from a $300\times300$ Koopman matrix, calculated using four methods: EDMD, EDMD with Dictionary Learning (EDMD-DL), Hankel-DMD, and ResDMD. The illustrated eigenvalue spectra of the Koopman operator highlight the differences in results produced by these methods.}}
\label{fig:pendulum_90_others}

\end{figure}


\subsection{Turbulence} \label{sec:turbulence}

\textcolor{ForestGreen}{Recovering spatial patterns is a typical goal of DMD-based methods, especially in fluid dynamics, where }Kernel ResDMD has been particularly successful in capturing such patterns and detecting acoustic vibrations \textcolor{ForestGreen}{in the turbulence system. However, Kernel ResDMD requires careful selection of kernel functions, while }NN-ResDMD bypasses this by using neural networks to train observables and compute Koopman modes. 
%Variants of DMD algorithms \citep{Colbrook2023TheMO,Tu2014,Williams2015,rowley2009spectral} have shown strong results in fluid dynamics. In \citep[Section 6.3]{colbrook2024rigorous}, it is demonstrated that Kernel ResDMD can capture key spatial patterns and detect acoustic vibrations but requires careful selection of kernel functions.

\textcolor{ForestGreen}{We demonstrate this by applying NN-ResDMD to the turbulence system u}sing the dataset from \citep[Section 6.3]{colbrook2024rigorous}. \textcolor{ForestGreen}{The ground truth in the first plot of Figure~\ref{fig:turbulence} represents a high-dimensional pressure field distribution (approximately 30,000 spatial dimensions) around an airfoil, with a clear distinction between the upper and lower surfaces. Technically, }we apply truncated Singular Value Decomposition (SVD), select 300 observables, compute Koopman modes, and project them back into the original state space.

\textcolor{ForestGreen}{In Figure~\ref{fig:turbulence}, the first Koopman mode estimated by NN-ResDMD with the smallest residual value successfully highlights a clear global spatial separation that aligns with the pattern observed in the original pressure field. }This advantage allows the first Koopman mode to directly distinguish spatial features present in the true pressure field, making it a powerful tool for interpreting complex fluid dynamics data. Subsequent Koopman modes also reveal strong acoustic waves that are critical in various aeronautical engineering fields. In contrast, Kernel ResDMD with a generic normalized Gaussian kernel function, as shown in the original work, is unable to produce a Koopman mode similar to the first Koopman mode from NN-ResDMD that clearly distinguishes the pressure field. For comparison, we also plot four Koopman modes computed by Hankel-DMD with a time delay of 5, corresponding to the four smallest residual values, which similarly do not reveal the pressure field patterns as in NN-ResDMD. These results are presented in Appendix Figure~\ref{fig:turbulence_hankeldmd} \textcolor{ForestGreen}{and Appendix~\ref{sec:hankel_turbulence}}.

%In Figure \ref{fig:turbulence}, the first Koopman mode estimated by NN-ResDMD, which corresponds to the constant eigenfunction, has the smallest residual value and successfully highlights a clear global spatial separation that aligns with patterns observed in the original pressure field. The small residual values in the figures associated with the Koopman modes confirm the estimation accuracy. This advantage allows the first Koopman mode to directly distinguish spatial features that are present in the true pressure field, which makes it a powerful tool for the interpretation of complex fluid dynamics data. Subsequent Koopman modes also reveal strong acoustic waves that are critical in various aeronautical engineering fields. In contrast, Kernel ResDMD with a generic normalized Gaussian kernel function, as shown in the original work, is unable to produce a Koopman mode similar to the first Koopman mode from NN-ResDMD that clearly distinguishes the pressure field. For comparison, we also plot four Koopman modes computed by Hankel-DMD with a time delay of 5, corresponding to the four smallest residual values, which similarly do not reveal the pressure field patterns as in NN-ResDMD. These results are presented in Appendix Figure~\ref{fig:turbulence_hankeldmd}
\begin{figure}[!htb]
\centering

\begin{minipage}[b]{0.25\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/pressure_field.png}
    % \caption*{original pressure field}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.25\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/koopman_mode_1.png}
    % \caption*{1st Koopman mode}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.25\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/koopman_mode_6.png}
    % \caption*{6th Koopman mode}
\end{minipage}%
\hfill
\begin{minipage}[b]{0.25\textwidth}
    \centering
    \includegraphics[width=\linewidth]{images/koopman_mode_7.png}
    % \caption*{7th Koopman mode}
\end{minipage}%

\caption{The plots illustrate turbulence detection with Koopman modes computed by 300 observables. The first plot shows a 2D scatter plot of the pressure field, while the other plots display various Koopman modes, each labeled with corresponding residuals. \textcolor{ForestGreen}{The small residual values in the figures associated with the Koopman modes confirm the estimation accuracy. }}
\label{fig:turbulence}

\end{figure}


\subsection{Identification of neural dynamics in mouse visual cortex} \label{sec:neural_dynamics}
Since NN-ResDMD directly minimizes the residuals based on eigenfunctions, its estimated evolution of eigenfunctions over time should ideally capture latent dynamics. To evaluate how effectively NN-ResDMD reveals latent temporal dynamics in real data, we apply it to a dataset of high-dimensional neural signals and demonstrate its advantages over a series of classical methods: the Hankel-DMD, EDMD (combined with RBF basis) and Kernel ResDMD. These methods are selected as representative approaches for handling high-dimensional data.

\textcolor{Mulberry}{The dataset is part of the open dataset on mice from the competition ``Sensorium 2023'' \citep{turishcheva2023dynamic, turishcheva2024retrospective}. In the experiments, mice viewed natural videos while their neural signals were recorded via calcium imaging in the primary visual cortex, reflecting the activity of thousands of neurons. Here, we focus on the state partitioning of neural signals. Specifically, in each mouse, six video stimuli were repeatedly shown, creating ideal conditions to define brain states. Neural activity during repeated trials with the same stimuli is assumed to reflect the same underlying dynamic system, enabling Koopman decomposition methods to uncover and separate these brain states.}
%The dataset is part of the open dataset on mice available for the competition 'Sensorium 2023'  \citep{turishcheva2023dynamic, turishcheva2024retrospective}. During the experiments, mice are required to look at natural videos while the neural signals are calcium imaging recordings in mice's primary visual cortex reflecting the activities of thousands of neurons. Here, 
%The competition aims to predict the large population of neuronal activities given different frames of the stimulus videos, based on the hypothesis that population dynamics in the primary visual cortex driven by the visual stimuli contains most information about the dynamics of the videos  \citep{basole2003mapping, onat2011natural,henaff2021primary}.
%Contrary to this aim, 
%we focus on a simple task of state partition of neural signals. 
%While it is also possible to perform predictions, we think this simpler task is sufficient to demonstrate the superiority of NN-ResDMD over Hankel-DMD. 
%Specifically, in each tested mouse, six video stimuli were repeatedly shown, creating ideal conditions for defining brain states. The experiment assumes that neural activity during repeated trials with the same stimuli reflects the same underlying dynamic system, allowing Koopman decomposition methods to be tested for reliably uncovering and separating these brain states.

The dataset consists of neural recordings from five mice, each exposed to 6 video stimuli, repeated 9--10 times for a total of around 60 trials. Each recording captures the activity of over 7,000 neurons, with each 10-second video sampled at 30 Hz, resulting in 300 data points per trial.

\textcolor{Mulberry}{We applied NN-ResDMD and three classical Koopman decomposition methods (Hankel-DMD, EDMD with RBF basis, and Kernel ResDMD) to these datasets, using different implementations and Koopman subspace dimensions. For NN-ResDMD, we trained dictionaries on all snapshots from each mouse to avoid overfitting, reduced the data to 300 dimensions via SVD, and selected 501 eigenfunctions. The decomposed eigenfunctions are shown in Figure~\ref{fig:resDMD_HankelDMD_efuns}A(top), with markers indicating ground truth state separations. For Hankel-DMD, we built a Hankel matrix with a delay of 50, producing 50 eigenfunctions per trial. In EDMD with RBF basis, we used the SVD-truncated 300 basis and 1000 RBF functions, resulting in 1301 eigenfunctions. For Kernel ResDMD, we used normalized Gaussians as kernel functions, setting the Koopman subspace dimension to 299 eigenfunctions based on \cite{colbrook2023residual}.} \textcolor{ForestGreen}{See Appendix~\ref{sec:implement_appendix} for method details and Appendix~\ref{sec:basis_choice} for dictionary size justification.} \textcolor{Mulberry}{These eigenfunctions, shown in Figure~\ref{fig:resDMD_HankelDMD_efuns}A(bottom), Appendix Figure~\ref{fig:edmd_rbf_all}A, and Appendix Figure~\ref{fig:kernel_resdmd_all}A, are compared to the ground truth trial identities.}

%We applied NN-ResDMD and three classical Koopman decomposition methods (Hankel-DMD, EDMD with RBF basis and Kernel ResDMD) to these datasets, utilizing varying implementations and Koopman subspace dimensions. For NN-ResDMD, we trained dictionaries using all snapshots from each mouse to avoid overfitting, reducing the data to 300 dimensions via SVD and selecting 501 eigenfunctions. The decomposed eigenfunctions are shown in Figure~\ref{fig:resDMD_HankelDMD_efuns}A(top), with markers indicating ground truth stimulus-based state separations. For Hankel-DMD, we built a Hankel matrix with a delay of 50, producing 50 eigenfunctions per trial. In the EDMD with RBF basis approach, we employed the SVD-truncated 300 basis and 1000 RBF basis functions, resulting in 1301 eigenfunctions as temporal features. For Kernel ResDMD, we chose normalized Gaussians as kernel functions. Based on \cite{colbrook2023residual}, the dimension of the Koopman invariant subspace was set to the number of temporal snapshots (i.e. 299 eigenfunctions). See Appendix~\ref{sec:implement_appendix} for implementation details of these methods \textcolor{ForestGreen}{and Appendix~\ref{sec:basis_choice} for a justification of dictionary size choices}. These eigenfunctions, plotted in Figure~\ref{fig:resDMD_HankelDMD_efuns}A(bottom), Appendix Figure~
%\ref{fig:edmd_rbf_all}A and Appendix Figure~\ref{fig:kernel_resdmd_all}A, are compared against the ground truth trial identities.

The Koopman eigenfunctions represent dynamical features corresponding to the video stimuli. To evaluate their effectiveness, we assess how well eigenfunctions of the same stimuli cluster together, distinguishing them from other states. If the eigenfunctions capture key dynamics related to the stimuli, those from trials with the same video should be separable from others. This turns the problem into a clustering task based on the separability of eigenfunctions across different stimuli. \textcolor{ForestGreen}{Note that averaged trial differences are even visibly clear for the NN-ResDMD case.}

%The dataset contains neural recordings of five mice, each responding to 6 video stimuli with a repetition of 9-10 times (i.e. altogether around 60 trials). Each neural recording contains the activities of over 7000 neurons. 
%Each video lasts 10 seconds with a sampling rate of 50 Hz, therefore the actual data to be analyzed in each trial consists 300 data points (thus 299 snapshots) with a high dimension of over 7000 (i.e. observable number).

%We applied NN-ResDMD and Hankel-DMD to the 5 datasets, although with slightly different implementations and different dimensions of approximated Koopman invariance subspace. 
%For NN-ResDMD, we train the dictionaries with all the snapshots recorded in each mouse such that the total snapshot number is the product of the snapshot number in one trial and the number of all trials. This is to avoid overfitting with the small snapshot numbers within a trial. The high-dimensional data is first reduced to 300 dimensions with Singular Value Decomposition. The dimension of the Koopman subspace is chosen to be 601, consisting of 300 trained bases and 301 pre-chosen ones (constant and the first-degree polynomials of the SVD-ed 300 dimensions). The first 501 eigenfunctions sorted by the modulus of eigenvalues are selected to avoid spurious eigenvalues estimation due to noise. One can find the decomposed eigenfunctions in Figure~\ref{images/ResDMD_HankelDMD_efuns}A(top), with a marker of the ground truth state separations based on stimulus identity. 

% Two figures are combined together
\begin{figure}[!htb]
\centering
\includegraphics[width=\textwidth]{exp3_mainfig.png}
\caption{
\textcolor{Mulberry}{NN-ResDMD outperforms Hankel-DMD in identifying latent dynamic structures in neural signals with a dictionary size of 501.
(A) (Top) 500 Koopman eigenfunctions estimated by NN-ResDMD across 6 states characterized by different video stimuli in an example mouse. Each trial contains 300 data points (10\,s at 30\,Hz). (Bottom) 50 Koopman eigenfunctions approximated by Hankel-DMD, each 50 points long, reflecting the dimension of the Hankel matrix.
(B) 2D representation of Koopman eigenfunctions for all tested mice, computed by NN-ResDMD and reduced via Multidimensional Scaling (MDS). Trials of the same state cluster well.
(C) Same as (B) but computed with Hankel-DMD, showing no clear state separation.
(D) 2D representation of Koopman eigenfunctions for the first mouse, computed by EDMD with an RBF basis. See Appendix Figure~\ref{fig:edmd_rbf_all} for full results.
(E) Same as (D) but computed with Kernel ResDMD. See Appendix Figure~\ref{fig:kernel_resdmd_all} for full results.
(F) Davies-Bouldin Indices (DBIs) evaluating clustering quality across four methods (NN-ResDMD, Hankel-DMD, EDMD+RBF, and Kernel ResDMD) for five mice. Lower DBI values for NN-ResDMD indicate better clustering.}}
\label{fig:resDMD_HankelDMD_efuns}
\end{figure}

\begin{comment}
\caption{NN-ResDMD outperforms Hankel-DMD in identifying latent dynamic structures in neural signals with a dictionary basis of size 501. 
(A) (Top) 500 Koopman eigenfunctions estimated by NN-ResDMD in 6 states characterized by 6 different video stimuli in an example mouse. Eigenfunctions in each trial of each state contain 300 data points (10s with a sampling rate of 50Hz. (Bottom) 50 Koopman eigenfunctions approximated by Hankel-DMD. In line with the dimension of the Hankel matrix, each eigenfunction is 50 points long. 
(B) 2-D representation of Koopman eigenfunctions for each trial of all tested mice, calculated by NN-ResDMD and reduced by Multidimensional Scaling (MDS). Trials of the same state are well-clustered. 
(C) Same as (B) but calculated with Hankel-DMD. No clear separation of states can be seen from the reduced representation. 
(D) 2D representation of Koopman eigenfunctions estimated using EDMD with RBF basis for each trial in the first mouse, shown as an example. See Appendix Figure~\ref{fig:edmd_rbf_all} for the full results.
(E) Same as (D) but estimated by Kernel ResDMD for the first mouse. See Appendix Figure~\ref{fig:kernel_resdmd_all} for full results.
(F) Davies-Bouldin Indices (DBIs) as a metric to evaluate the clustering quality for the two reduced representations learned by four methods (NN-ResDMD, Hankel-DMD, EDMD+RBF, and Kernel ResDMD) across five mice. Lower DBI values in the case of NN-ResDMD suggest better clustering compared to other classical methods.}
\end{comment}
%For Hankel-DMD, we approximate the Koopman eigenfunctions with the eigenvectors of the Hankel matrix. Specifically, we form the Hankel matrix (Equation 53 in  \citep{arbabi2017ergodic}) with all the observables in one trial of each mouse with a delay of 50. The snapshot size is then 249 times the observable number. Therefore the resulting number of eigenfunctions is 50, each with a length of 50. Similar to Figure 3A, we plot the eigenfunctions for each trial of data in Figure~\ref{images/ResDMD_HankelDMD_efuns}A(bottom) with a comparison with the ground truth trial identities.

%The eigenfunctions of both the NN-ResDMD and Hankel-DMD can be understood as dynamical features of a state corresponding to one of the 6 video stimuli. Therefore to see how well the Koopman eigenfunctions reveal the latent dynamic structures, it is critical to check how the features of the same state are similar to each other and how they are different from other states. In other words, if the decomposed Koopman eigenfunctions can capture the key dynamic components related to the videos, eigenfunctions of the trials with the same video stimuli should be separable from eigenfunctions of other trials, making this a clustering problem.
We use Multi-dimensional Scaling (MDS) to visualize how these eigenfunction-based features cluster according to ground truth states. MDS reduces data dimensionality based on similarities, making it ideal for visualizing clustering performance. While UMAP and t-SNE are alternative methods, we show MDS results in 2D space (Figure~\ref{fig:resDMD_HankelDMD_efuns}B-E), with similar results for UMAP and t-SNE in the supplementary materials (Appendix Figure~\ref{fig:umap_tsne}, Appendix Figure~\ref{fig:edmd_rbf_all}C,D and Appendix Figure~\ref{fig:kernel_resdmd_all}C,D).

The 2D MDS visualization reveals clear separation of features for all 5 mice using NN-ResDMD (Figure~\ref{fig:resDMD_HankelDMD_efuns}B), whereas no other method shows clear clustering (Figure~\ref{fig:resDMD_HankelDMD_efuns}C-E, Appendix Figure~\ref{fig:edmd_rbf_all}B, Appendix Figure~\ref{fig:kernel_resdmd_all}B). To quantify this clustering, we calculate the Davies-Bouldin index (DBI), a measure of clustering quality that assesses how compact and well-separated the clusters are. A lower DBI indicates more compact clusters that are farther apart from each other, which corresponds to better clustering. The DBI is significantly lower for NN-ResDMD (Figure~\ref{fig:resDMD_HankelDMD_efuns}F), suggesting that it captures the latent dynamic structure more effectively than all three other methods. Similar clustering patterns are confirmed with UMAP and t-SNE (Appendix Figure~\ref{fig:umap_tsne_dbindex}).

%Here we perform a dimension reduction with Multi-dimensional Scaling (MDS) to visualize how such high-dimensional features are clustered with ground truth states.
%MDS is a dimensional reduction method based on the similarities between high-dimensional data  \citep{kruskal1964nonmetric}, and thereby is useful for a low-dimensional visualization of clustering performance.
%UMAP  \citep{mcinnes2018umap} and t-SNE  \citep{van2008visualizing} are two alternative ways for such visualization with different emphasis on global-local relationships, for which we provide similar dimension reduction results in the supplementary materials. 
%By performing MDS, we reduce the high-dimensional eigenfunction-based features to a low-dimensional space (here the dimension to be reduced is the product of the number of eigenfunctions and the number of time points in a trial). For illustration purposes, we show here the reduction results in 2-D space, as seen in Figure~???\ref{images/ResDMD_HankelDMD_efuns}B(Left top, bottom). We can see that in the 2D representation after applying MDS, reduced features with NN-ResDMD for 6 types of trials (corresponding to 6 video stimuli) can be well-separated for all 5 mice(Figure~???\ref{images/ResDMD_HankelDMD_efuns}B(Left top)). On the contrary, no clear clustering structures can be seen with reduced features obtained from Hankel-DMD (Figure~???\ref{images/ResDMD_HankelDMD_efuns}B(Left bottom)). 
%We further quantify the clustering quality by calculating the Davies-Bouldin indices (DBIs) for both Koopman decomposition methods for all mice (Figure~???\ref{images/ResDMD_HankelDMD_efuns}B(Right)). DBIs are significantly larger for features of NN-ResDMD compared to Hankel-DMD. As this index is designed to evaluate how well different clusters are concentrated, it is clear that reduced features of NN-ResDMD are more clustered for ground truth trials than Hankel-DMD, suggesting that the Koopman eigenfunctions estimated with NN-ResDMD are more informative than Hankel-DMD in revealing the latent dynamic structure.
%Similar clustering results are obtained with UMAP and t-SNE (see Appendix Figure~???\ref{fig:umap_tsne}A, B), both supporting the superiority of NN-ResDMD to Hankel-DMD.


\section{Conclusion and future work}
Koopman spectral components (eigenpairs) are fundamental to understanding dynamical systems, as they reveal intrinsic patterns and structures underlying complex temporal behavior through a linear framework for analyzing nonlinear dynamics. In this paper, we introduced NN-ResDMD, a method for estimating eigenpairs by minimizing spectral residuals, eliminating ResDMD’s need to filter precomputed results. \textcolor{ForestGreen}{Despite higher computational costs, u}sing neural networks to learn eigenpairs provides a significant advantage by capturing patterns automatically and reducing manual intervention in basis selection. This flexibility is particularly beneficial for high-dimensional systems where traditional methods often struggle. Our experiments demonstrate that NN-ResDMD outperforms classical methods—including EDMD, Hankel-DMD, ResDMD, and their variants—in uncovering critical spatiotemporal characteristics of nonlinear dynamics.

\textcolor{ForestGreen}{
Despite the advantages, NN-ResDMD has several limitations and we discuss the major ones here. First, the neural network structure incurs higher computational costs compared to classical approaches, making it unsuitable for real-time learning tasks (see a brief discussion in Appendix~\ref{sec:comp_cost}). Second, the deterministic nature of the framework does not account for stochastic aspects of the system, such as those addressed by methods like VAMP \cite{mardt2018vampnets}, limiting its applicability to highly noisy data. Additionally, the performance of NN-ResDMD is sensitive to hyperparameter tuning, including network architecture, dictionary size, and training criteria, which can require significant effort to optimize. }
%Koopman spectral components (eigenpairs) are fundamental to understanding unknown dynamical systems, as they reveal the intrinsic patterns and structures underlying complex temporal behavior by providing a linear framework for analyzing nonlinear dynamics. In this paper, we introduced NN-ResDMD, a method for effectively estimating of these eigenpairs based on minimizing spectral residuals, which overcomes ResDMD’s limitation by eliminating the need to filter pre-computed results. \textcolor{ForestGreen}{Despite increased computation cost, t}he use of neural networks to learn these eigenpairs offers a significant advantage in capturing such patterns automatically, thus enhancing adaptability and reducing the need for manual intervention in basis selection. This flexibility is particularly beneficial for high-dimensional systems where traditional approaches may struggle to uncover the underlying dynamics effectively. In this line, our experiments clearly demonstrate that NN-ResDMD significantly outperforms classical methods—including EDMD, Hankel-DMD, ResDMD and their variants—in uncovering the critical spatiotemporal characteristics of nonlinear dynamics.

Koopman eigenpairs provide unique perspectives into the interpretation of nonlinear dynamical mechanisms, and feedforward neural networks (FNNs) represent an initial step in learning spectral properties directly from data. \textcolor{red}{In recent years, various deep neural network structures have been employed to learn the Koopman representations with different optimization targets other than the spectral residuals (e.g.,~\cite{Lusch2018, takeishi2017learning, mardt2018vampnets, yeung2019learning, otto2019linearly, azencot2020forecasting, alford2022deep, Iwata2020NeuralDM})} \textcolor{cyan}{(see Appendix section~\ref{sec:discussion_VAMP} for a comparison with the VAMP framework).} \textcolor{ForestGreen}{With our approach, we demonstrate that even basic architectures can achieve significant improvements in Koopman operator estimation by using the spectral residual loss.} \textcolor{red}{Therefore, future} work could focus on refining neural network architectures to enhance the accuracy and efficiency of Koopman eigenpair estimation. One promising direction is the incorporation of Physics-Informed Neural Networks (PINNs) and Physics-Informed Neural Operators (PINOs), which integrate physical laws directly into the learning process. This integration will ensure that the resulting Koopman eigenfunctions align with known physical constraints, help avoid overfitting, and facilitate generalization. Indeed, the integration of PINNs and PINOs with the Koopman framework has the potential to serve as a powerful bridge between data-driven and model-driven approaches, offering enhanced insights into complex systems and enabling more robust temporal evolution predictions.
%While ResDMD offers theoretical guarantees, NN-ResDMD advances practical applications by using direct eigenpair computation along with automatic basis optimization. Future work could focus on refining the neural network architecture and exploring more diverse datasets. An additional direction would be the integration of physics-informed neural networks (PINNs) or physics-informed neural operator (PINO), which would allow the framework to incorporate known physical laws, thus leading to potential improvements in accuracy and reducing data requirements in systems with well-understood dynamics.

%Koopman eigenfunctions are fundamental to understanding unknown dynamical systems, as they reveal the intrinsic patterns and structures underlying complex temporal behavior by providing a linear framework for analyzing nonlinear dynamics. The use of neural networks to learn these eigenfunctions, as seen in NN-ResDMD, offers a significant advantage in capturing such patterns automatically, enhancing adaptability and reducing the need for manual intervention in basis selection. This flexibility is particularly beneficial for high-dimensional systems where traditional approaches may struggle to uncover the underlying dynamics effectively. Incorporating Physics-Informed Neural Networks (PINNs) and Physics-Informed Neural Operators (PINOs) into this framework can push the boundaries further by embedding physical laws directly into the learning process. PINNs can incorporate constraints from differential equations into the training, leading to better generalization and interpretability of the learned Koopman eigenfunctions. Similarly, PINOs facilitate operator learning in functional spaces, potentially making it easier to learn complex mappings across various domains and time scales. By leveraging these advances, future work can develop more powerful and interpretable models for identifying latent structures in complex systems, improving predictive capabilities, and enabling robust analysis in areas ranging from fluid dynamics to neuroscience.

% In this paper, we introduced the NN-ResDMD method, which enhances ResDMD by automating the selection of optimal basis functions using neural networks. This approach eliminates the need for manual selection, leading to improved spectral decompositions, especially in high-dimensional systems. Our experiments demonstrate that NN-ResDMD outperforms traditional methods like Hankel-DMD, particularly in complex scenarios involving physical and biological systems.

% While the underlying ResDMD framework offers rigorous theoretical guarantees, our work focuses on practical improvements through the integration of neural networks. Future research could explore refining the neural networks architecture for specific applications and further testing NN-ResDMD on diverse datasets and complex systems.


% \subsubsection*{Author Contributions}
% If you'd like to, you may include  a section for author contributions as is done
% in many journals. This is optional and at the discretion of the authors.

%\subsubsection*{Acknowledgments}
%Authors are grateful to Matthew Colbrook for valuable guidance on ResDMD technical part and providing fluid data, to Bin Han for discussion on computational issues, to Michel Beserve and Rory Bufacchi for proofreading and to Turishcheva Polina for providing permission of using the Sensorium 2023 dataset.


\bibliography{iclr2025_conference}
\bibliographystyle{iclr2025_conference}

\newpage
\appendix
\input{supplementary}

% \section{Appendix}\label{sec:appendix}
% \subsection{Calculation steps for \eqref{relative_residual_approximation}}\label{relative_residual_approximation_calculation}
% Here we are going to show how the \textit{squared relative residual} implies \eqref{relative_residual} and then implies \eqref{relative_residual_approximation}. Consider $\phi=\mathbf{\Psi}\mathbf{v}=\sum_{i=1}^{N_K} \psi_i \mathbf{v}_i$ with $\|\phi\|_2=1$, then
% \begin{align*}\label{relative_residual}   
%     &\quad \frac{\int_{\Omega} |\mathcal{K}\phi(x) - \lambda \phi(x)|^2 d\mu(x)}{\int_{\Omega} |\phi(x)|^2 d\mu(x)} \\
%     &= \int_{\Omega} |\mathcal{K}\phi(x) - \lambda \phi(x)|^2 d\mu(x) \\
%     &= \langle \mathcal{K}\phi - \lambda \phi, \mathcal{K}\phi - \lambda \phi \rangle_{\mu} \\
%     &= \langle \mathcal{K}\phi, \mathcal{K}\phi \rangle_{\mu} - \langle \lambda \phi, \mathcal{K}\phi \rangle_{\mu} - \langle \mathcal{K}\phi, \lambda \phi \rangle_{\mu} + \langle \lambda \phi, \lambda \phi \rangle_{\mu} \\    
%     &= \langle \mathcal{K}\mathbf{\Psi}\mathbf{v}, \mathcal{K}\mathbf{\Psi}\mathbf{v} \rangle_{\mu} - \bar{\lambda} \langle \mathbf{\Psi}\mathbf{v}, \mathcal{K}\mathbf{\Psi}\mathbf{v} \rangle_{\mu} - \lambda \langle \mathcal{K}\mathbf{\Psi}\mathbf{v}, \mathbf{\Psi}\mathbf{v} \rangle_{\mu} + |\lambda|^2 \langle \mathbf{\Psi}\mathbf{v}, \mathbf{\Psi}\mathbf{v} \rangle_{\mu} \\
%     &= \langle \sum_{i=1}^{N_K} \mathcal{K}\psi_i \mathbf{v}_i, \sum_{j=1}^{N_K} \mathcal{K}\psi_j \mathbf{v}_j \rangle_\mu - \bar{\lambda} \langle \sum_{i=1}^{N_K} \psi_i \mathbf{v}_i, \sum_{j=1}^{N_K} \mathcal{K}\psi_j \mathbf{v}_j \rangle_\mu - \lambda \langle \sum_{i=1}^{N_K} \mathcal{K}\psi_i\mathbf{v}_i, \sum_{j=1}^{N_K} \psi_j\mathbf{v}_j \rangle_\mu + |\lambda|^2 \langle \sum_{i=1}^{N_K} \psi_i \mathbf{v}_i, \sum_{j=1}^{N_K} \psi_j \mathbf{v}_j \rangle_\mu \\
%     &= \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \langle \mathcal{K}\psi_i, \mathcal{K}\psi_j \rangle_\mu \mathbf{v}_j - \bar{\lambda} \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \langle \psi_i, \mathcal{K}\psi_j \rangle_\mu \mathbf{v}_j - \lambda \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \langle \mathcal{K}\psi_i, \psi_j \rangle_\mu \mathbf{v}_j + |\lambda|^2 \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \langle \psi_i,\psi_j \rangle_\mu \mathbf{v}_j \\
%     &= \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \left[ \langle \mathcal{K}\psi_i, \mathcal{K}\psi_j \rangle_\mu  - \bar{\lambda} \langle \psi_i, \mathcal{K}\psi_j \rangle_\mu - \lambda \langle \mathcal{K}\psi_i, \psi_j \rangle_\mu + |\lambda|^2 \langle \psi_i,\psi_j \rangle_\mu \right] \mathbf{v}_j \quad \eqref{relative_residual} \\
%     &\approx \sum_{i,j=1}^{N_K} \bar{\mathbf{v}}_i \left[ \frac{1}{m}[\Psi_Y^* \Psi_Y]_{ij} - \bar{\lambda} \frac{1}{m}[\Psi_X^* \Psi_Y]_{ij} - \lambda \frac{1}{m}[\Psi_Y^* \Psi_X]_{ij} + |\lambda|^2 \frac{1}{m}[\Psi_X^* \Psi_X]_{ij} \right] \mathbf{v}_j \\
%     &= \frac{1}{m}\mathbf{v}^* \left[ \Psi_Y^* \Psi_Y - \lambda (\Psi_X^* \Psi_Y)^* - \bar{\lambda} \Psi_X^* \Psi_Y + |\lambda|^2 \Psi_X^* \Psi_X \right] \mathbf{v} \quad \eqref{relative_residual_approximation}
% \end{align*}
% \begin{remark}
%     the inner product above is defined as $\langle f,g \rangle_{\mu} = \int_{\Omega} \, f^* g \, d\mu(x)$
% \end{remark}


% \subsection{Details for deriving \eqref{thm:main}}\label{thmpf:main}

% \begin{align*}
%     J &= \sum_{i=1}^{N_K} \widehat{\textit{res}}(\lambda_i, \phi_i)^2  \\
%     &= \sum_{i=1}^{N_K} \frac{1}{m} \mathbf{v}_i^* \left[ \Psi_Y^* \Psi_Y - \lambda_i (\Psi_X^* \Psi_Y)^* - \bar{\lambda}_i \Psi_X^* \Psi_Y + |\lambda_i|^2 \Psi_X^* \Psi_X \right] \mathbf{v}_i \\
%     &= \sum_{i=1}^{N_K} \frac{1}{m} \left[ \mathbf{v}_i^*  (\Psi_Y^* \Psi_Y) \mathbf{v}_i- \mathbf{v}_i^* (\Psi_X^* \Psi_Y)^* \lambda_i \mathbf{v}_i - \mathbf{v}_i^* \bar{\lambda}_i (\Psi_X^* \Psi_Y) \mathbf{v}_i + \mathbf{v}_i^* K^* (\Psi_X^* \Psi_X) K \mathbf{v}_i \right] \\
%     &= \sum_{i=1}^{N_K} \frac{1}{m} \left[ \mathbf{v}_i^* (\Psi_Y^* \Psi_Y) \mathbf{v}_i- \mathbf{v}_i^* (\Psi_X^* \Psi_Y)^* K \mathbf{v}_i - \mathbf{v}_i^* K^* (\Psi_X^* \Psi_Y) \mathbf{v}_i + \mathbf{v}_i^* K^* (\Psi_X^* \Psi_X) K \mathbf{v}_i \right] \\
%     &= \sum_{i=1}^{N_K} \frac{1}{m} \bigg( \langle \Psi_Y \mathbf{v}_i, \Psi_Y \mathbf{v}_i \rangle  - \langle \Psi_Y \mathbf{v}_i, \Psi_X K \mathbf{v}_i \rangle \\
%     & \qquad - \langle\Psi_X K \mathbf{v}_i, \Psi_Y \mathbf{v}_i \rangle + \langle \Psi_X K \mathbf{v}_i, \Psi_X K \mathbf{v}_i \rangle \bigg) \\
%     &= \sum_{i=1}^{N_K} \frac{1}{m} \langle \Psi_Y \mathbf{v}_i - \Psi_X K \mathbf{v}_i, \Psi_Y \mathbf{v}_i - \Psi_X K \mathbf{v}_i \rangle \\
%     &= \sum_{i=1}^{N_K} \frac{1}{m} \| \Psi_Y \mathbf{v}_i - \Psi_X K \mathbf{v}_i \|_2^2 \\
%     &= \frac{1}{m} \| (\Psi_Y  - \Psi_X K) V \|_{F}^2.
% \end{align*}
% Next, by \href{https://en.wikipedia.org/wiki/Matrix_calculus#Denominator-layout_notation}{matrix calculus with denominator layout convention}, we try to find minimal of $J$:
% \begin{align*}
%     0 = \frac{d J}{d K} &= \frac{d \tr(J)}{d K} \quad  (\text{since $J$ is a scalar})\\
%     &= \frac{d}{d K}\tr\bigg( \frac{1}{m} \sum_{i=1}^{N_K} \mathbf{v}_i^* \bigg[ \Psi_Y^* \Psi_Y - (\Psi_X^* \Psi_Y)^* K \\
%     &\qquad - K^* (\Psi_X^* \Psi_Y) + K^* (\Psi_X^* \Psi_X) K \bigg] \mathbf{v}_i \bigg) \\
%     &= \sum_{i=1}^{N_K} \frac{d}{d K} \tr\bigg( \mathbf{v}_i^* \bigg[ L - A^* K - K^* A + K^* G K \bigg] \mathbf{v}_i \bigg) \\
%     &= \sum_{i=1}^{N_K} \frac{d}{d K} \tr\left( \mathbf{v}_i^* L \mathbf{v}_i \right) + \frac{d}{d K} \tr\left( \mathbf{v}_i^* A^* K \mathbf{v}_i \right) + \frac{d}{d K} \tr\left( \mathbf{v}_i^* K^* A \mathbf{v}_i \right) + \frac{d}{d K} \tr\left( \mathbf{v}_i^* K^* G K \mathbf{v}_i \right) \\
%     &= \sum_{i=1}^{N_K} -A\mathbf{v}_i \mathbf{v}_i^*-A\mathbf{v}_i \mathbf{v}_i^*+(G+G^*)K\mathbf{v}_i \mathbf{v}_i^* \\
%     &= \sum_{i=1}^{N_K} (-2A+2GK)\mathbf{v}_i \mathbf{v}_i^* \quad (\text{$G$ is symmetric})
% \end{align*}
% where $\tr(\cdot)$ is the trace of a matrix and $G = \Psi_X^* \Psi_X$, $A = \Psi_X^* \Psi_Y$, $L = \Psi_Y^* \Psi_Y$.

% Since the eigenvector $v_i$ is not a zero vector, $v_i v_i^*$ is not a zero matrix. So
% \[ -2A+2GK = 0 \Rightarrow K = G^{\dagger}A. \]
% \begin{remark}
%     To solve $\tfrac{d}{d K} \tr\left( \mathbf{v}_i^* K^* G K \mathbf{v}_i \right)$, we simply rewrite it as
%     \[ \tfrac{d}{d K} \tr\left( \mathbf{v}_i^* K^* G K \mathbf{v}_i \right) = \tfrac{d}{d K} \tr\left( (K \mathbf{v}_i)^* G (K \mathbf{v}_i) \right). \]
% \end{remark}


% \subsection{Discussion On Convergence}\label{convergence_discussion}
% To understand how neural networks enhance NN-ResDMD, it is important to introduce Barron space \citep{Pinkus_1999,cybenko:hal-03753170,haykin2009neural,256500}. Barron space characterizes functions efficiently approximated by two-layer neural networks, which is central to NN-ResDMD. By leveraging networks that approximate functions within this space, NN-ResDMD can flexibly optimize the dictionary functions for Koopman operator approximation, making it highly effective for complex, high-dimensional systems.

% A function \( f \) belongs to Barron space \( \mathcal{B} \) if it can be represented as:
% \[
% f(x) = \int_{\Omega} a \sigma(w^T x) \rho(da, dw),
% \]
% where \( \sigma \) is the activation function, \( w \) is a weight vector, \( a \) is a coefficient, and \( \rho \) is a probability distribution. The complexity of \( f \) is measured by the Barron norm \( \|f\|_\mathcal{B} \):
% \[
% \|f\|_\mathcal{B} = \inf_{\rho \in P_f} \left( \int_{\Omega} |a| \|w\|_1 \rho(da, dw) \right),
% \]
% where \( P_f \) is the set of distributions for which \( f \) can be represented. This framework provides a basis for analyzing approximation errors in neural networks.

% The following theorem \citep{e2020mathematicalunderstandingneuralnetworkbased} discusses the approximation capabilities of two-layer neural networks within this context, establishing a foundation for the subsequent analysis.


% \begin{theorem}[Direct Approximation Theorem, \(L^2\)-version]\label{weinan_theorem}
%     For any \(f \in \mathcal{B}\) and \(r \in \mathbb{N}\), there exists a two-layer neural network \(f_r\) with \(r\) neurons \(\{(a_i, \mathbf{w}_i)\}\) such that
%     \[
%     \|f - f_r\|_{L^2} \lesssim \frac{\|f\|_{\mathcal{B}}}{\sqrt{r}}.
%     \]
% \end{theorem}
% This result implies that the approximation error decreases at a rate of \( 1/\sqrt{r} \) as the number of neurons \( r \) increases, with the constant \( \|f\|_\mathcal{B} \) reflecting the complexity of the function \( f \) within the Barron space.

% Now, consider a Barron space $\mathcal{B}$ which is dense in $L^2(\Omega, \mu)$ and a projected Koopman operator $\mathcal{K}_{N_K} : \mathcal{B}_{N_K} \to L^2(\Omega, \mu)$ where $\mathcal{B}_{N_K} \subseteq \mathcal{B}$ is an $N_K$-dimensional subspace spanned by some dictionary $\mathbf{\Psi} = \{\psi_i\}_{i=1}^{N_K}$. According to Theorem~\ref{weinan_theorem}, we can have a well-trained dictionary that almost spans $\mathcal{B}_{N_K}$, i.e., given $\epsilon>0$, we can always obtain a dictionary $\mathbf{\Psi}_r = \{\psi_{r,i}\}_{i=1}^{N_K}$ such that $\sum_{i=1}^{N_K} \Vert \psi_{r,i} - \psi_i \Vert^2_2 < \epsilon$.


% \subsection{Practical details for neural data analysis}
% \subsubsection{Dataset Details and Experimental Setup}

% The dataset utilized in this study is part of the open dataset provided for the 'Sensorium 2023' competition \citep{turishcheva2023dynamic}. The dataset consists of calcium imaging recordings from the primary visual cortex of mice. During the experiments, the mice were presented with natural video stimuli while the activity of thousands of neurons was recorded. The objective of the competition is to predict large-scale neuronal population activity in response to different frames of the stimulus videos, based on the hypothesis that population dynamics in the primary visual cortex, driven by visual stimuli, encode significant information about the dynamics of the videos \citep{basole2003mapping, onat2011natural,henaff2021primary}.

% \subsubsection{Task Definition and Rationale}
% In contrast to the competition's prediction objective, our study focuses on the task of state partitioning of neural signals. While prediction remains feasible, we aim to demonstrate that state partitioning is sufficient to highlight the superiority of NN-ResDMD over Hankel-DMD in uncovering the latent dynamics of the system. Specifically, in each experiment, a set of six video stimuli was repeatedly presented to each mouse, creating ideal conditions for defining brain states. The recording setup remained consistent for each mouse, ensuring that the neural activities could be interpreted as originating from the same dynamical system, with the primary variable being the input stimulus.

% We hypothesize that during repeated trials with identical visual stimuli, the underlying dynamics of the neural system remain consistent. Consequently, the recurrence of the same brain state is expected during these trials. This provides a reliable basis for testing the efficacy of Koopman decomposition methods in uncovering latent dynamics and distinguishing these states.

% \subsubsection{Dataset Structure and Dimensionality}
% The dataset includes neural recordings from five mice, with each mouse responding to six distinct video stimuli, presented in 9-10 repeated trials (resulting in approximately 60 trials in total). Each trial involves recordings of over 7000 neurons. The duration of each video stimulus is 10 seconds, with a sampling rate of 50 Hz, yielding 300 data points (299 snapshots) per trial. Thus, the data to be analyzed consists of high-dimensional time series with 7000+ observables per snapshot.

% \subsubsection{NN-ResDMD and Hankel-DMD Implementations}

% We applied NN-ResDMD and Hankel-DMD to the 5 datasets, although with slightly different implementations and different dimensions of approximated Koopman invariance subspace. 
% For NN-ResDMD, we train the dictionaries with all the snapshots recorded in each mouse such that the total snapshot number is the product of the snapshot number in one trial and the number of all trials. This is to avoid overfitting with the small snapshot numbers within a trial. The high-dimensional data is first reduced to 300 dimensions with Singular Value Decomposition. The dimension of the Koopman subspace is chosen to be 601, consisting of 300 trained bases and 301 pre-chosen ones (constant and the first-degree polynomials of the SVD-ed 300 dimensions). The first 501 eigenfunctions sorted by the modulus of eigenvalues are selected to avoid spurious eigenvalues estimation due to noise. One can find the decomposed eigenfunctions in Figure~\ref{fig:resDMD_HankelDMD_efuns}A (top), with a marker of the ground truth state separations based on stimulus identity.

% For Hankel-DMD, the Koopman eigenfunctions were approximated using the eigenvectors of the Hankel matrix. Specifically, the Hankel matrix was formed as in Equation 53 from \citep{arbabi2017ergodic}, using all the observables from one trial of each mouse with a delay of 50. Consequently, the snapshot size became 249 times the observable number, and the resulting number of eigenfunctions was 50, each with a length of 50. The Hankel-DMD eigenfunctions for each trial of data are shown in Figure~\ref{fig:resDMD_HankelDMD_efuns}A (bottom), alongside the ground truth trial identities for comparison.

% The Koopman eigenfunctions from both NN-ResDMD and Hankel-DMD represent dynamical features corresponding to one of the six video stimuli. To evaluate how well the eigenfunctions capture the latent dynamics, we assess the similarity of the features for trials with the same stimulus and their dissimilarity from those corresponding to different stimuli. Effectively, this makes the problem a clustering task, where the separability of the Koopman eigenfunctions reflects how well they capture the key dynamic components related to the stimuli.

% \subsubsection{Visualization and Clustering Performance}
% To visualize the clustering of high-dimensional Koopman eigenfunctions, we perform dimensionality reduction using Multi-dimensional Scaling (MDS). MDS is particularly useful for visualizing high-dimensional data by preserving pairwise similarities \citep{kruskal1964nonmetric}. While UMAP \citep{mcinnes2018umap} and t-SNE \citep{van2008visualizing} are alternative visualization methods, with different emphasis on global-local relationships, we primarily use MDS in this study and provide UMAP and t-SNE results in the supplementary materials (see Appendix Figure~\ref{fig:umap_tsne}A, B).

% By applying MDS, the high-dimensional eigenfunction-based features are reduced to a low-dimensional space. For illustration, we present the results of reducing the feature space to two dimensions (Figure~\ref{fig:resDMD_HankelDMD_efuns}B, Left top and bottom). The NN-ResDMD reduced features for the six types of trials (corresponding to the six video stimuli) are well-separated for all five mice (Figure~\ref{fig:resDMD_HankelDMD_efuns}B, Left top). In contrast, the Hankel-DMD features show no clear clustering structure (Figure~\ref{fig:umap_tsne}B, Left bottom).


% \begin{figure}[!htb]
% \centering
% \includegraphics[width=\textwidth]{UMAP_tSNE_2d_newfont}
% \caption{State Partition performance of eigenfunctions in 2D space visualized with UMAP (A) and t-SNE (B). The layout is similar to Figure~\ref{fig:resDMD_HankelDMD_efuns}B.}
% \label{fig:umap_tsne}
% \end{figure}

% \subsubsection{Clustering Quality Metrics}
% We further quantified the clustering quality by calculating the Davies-Bouldin Index (DBI) for both Koopman decomposition methods across all mice (Figure~\ref{fig:resDMD_HankelDMD_efuns}B, Right). The DBI is designed to assess the compactness of clusters and the separability between them. A lower DBI indicates better clustering performance. NN-ResDMD features yield significantly lower DBI scores compared to Hankel-DMD, confirming that NN-ResDMD produces more clearly defined clusters corresponding to the ground truth trials. Similar clustering results are observed with UMAP and t-SNE (see Appendix Figure~\ref{fig:umap_tsne}A, B), further supporting the superior performance of NN-ResDMD in capturing the latent dynamic structure compared to Hankel-DMD.














\end{document}
