\section{Our Result}\label{sec:result}
In Section~\ref{sec:elliptic regularity}, we first present a classical elliptic regularity lemma. In Section~\ref{sec:regularization_effect}, we then show how controlling the second-order loss ensures bounded estimation error in a stronger Sobolev norm, thereby revealing a key regularization effect. In Section~\ref{sec:excess_risk}, we derive an excess risk bound, demonstrating that our method generalizes well under finite-sample conditions. In Section~\ref{sec:discrete_propagation}, we further analyze a discrete propagation inequality under noise. In Section~\ref{sec:main_result}, we combine these insights in our main theorem, proving that the learned trajectory remains robust against noise and sampling limitations.

\subsection{Elliptic Regularity}\label{sec:elliptic regularity} In this section, we introduce the first result, which is a classical result that characterizes the relationship between different Sobolev norms for sufficiently smooth functions.


\begin{lemma}[Elliptic regularity in \cite{e10_pde}]\label{lem:pde_bound}
Let $\Omega \subset \mathbb{R}^d$ be a bounded domain with a sufficiently smooth boundary. Suppose $h:\Omega \to \mathbb{R}$ belongs to $L^2(\Omega)$, has weak derivatives up to second order in $L^2(\Omega)$ and satisfies appropriate boundary conditions. Then, there exists a constant $C_{\mathrm{reg}}>0$, depending only on $\Omega$ and the boundary conditions, such that
\begin{align*}
    \| h \|_{H^2(\Omega) } \leq C_{\mathrm{reg}} (\| \nabla h\|_{L^2(\Omega)} + \| h\|_{L^2(\Omega)}).
\end{align*}
\end{lemma}

The above result is fundamental in establishing norm equivalences in Sobolev spaces, which we will use to analyze the regularity of error terms in subsequent lemmas.

\subsection{Regularization Effect}\label{sec:regularization_effect} 
In this section, we now connect the second-order loss function with the Sobolev norm of the estimation error.


\begin{lemma}[Regularization effect]\label{lem:reg_effect}
Let $\{(x_0, x_1, t)\}$ denote the sampling start point, endpoint, and time in the training set, and suppose that the true trajectory satisfies $\ddot{x}_t \in H^2(\Omega)$. We consider
\begin{align*}
L_{2,2,\theta_1,\theta_2} = \E [ \| \ddot{x}_t^\True - u_{2,\theta_2}(u_{1,\theta_1}(x_t^\True, t), x_t^\True, t) \|^2_2 ],
\end{align*}
the second-order loss with respect to the true second derivative. In particular, when $L_{2,2,\theta_1,\theta_2}$ is sufficiently small, there exists a constant $C_{\mathrm{reg}} > 0$ such that
\begin{align*}
    & ~ \| \dot{x}^\Est_t - \dot{x}_t^\True \|_{H^2(\Omega)} \\
    \leq & ~  C_{\mathrm{reg}} (L_{2,2,\theta_1,\theta_2}^{1/2} + \| \dot{x}_t^{\Est} - \dot{x}_t^{\True} \|_{L^2(\Omega)}).
\end{align*}
\end{lemma}
\begin{proof}
    First, we let $h(\cdot) = \dot{x}_t^\Est (\cdot) - \dot{x}_t^\True (\cdot)$. Since the problem depends on both $t$ and $x$, we could denote it by $h(t,x)$. For clarity, we simply write $h(\cdot)$ and regard it as a function on $\Omega$. Generally, one assumes $\dot{x}_t^\Est, \dot{x}_t^\True \in H^2(\Omega)$ so that $h \in H^2(\Omega)$.

    Applying Lemma~\ref{lem:pde_bound} to $h(\cdot) = \dot{x}_t^\Est - \dot{x}_t^\True$, we have 
    \begin{align}\label{eq:lesssim_1}
       & ~ \| \dot{x}^\Est_t - \dot{x}_t^\True \|_{H^2(\Omega)} \notag\\
      \leq & ~ C_{\mathrm{reg}} (\| \nabla h\|_{L^2(\Omega)} + \| h\|_{L^2(\Omega)}).
    \end{align}
    By the definition of the loss function, a small $L_{2,2,\theta_1,\theta_2}$ implies
    \begin{align}\label{eq:lesssim_2}
        \| \nabla h \|_{L^2(\Omega)} \lesssim L^{1/2}_{2,2,\theta_1,\theta_2}.
    \end{align}
    Combining Eq.~\eqref{eq:lesssim_1} and Eq.~\eqref{eq:lesssim_2}, we have
    \begin{align*}
    & ~ \| \dot{x}^\Est_t - \dot{x}_t^\True \|_{H^2(\Omega)} \\ 
    \leq & ~ C_{\mathrm{reg}} (L_{2,2,\theta_1,\theta_2}^{1/2} + \| \dot{x}_t^{\Est} - \dot{x}_t^{\True} \|_{L^2(\Omega)}).
\end{align*}
Thus, we complete the proof.
\end{proof}

The above lemma highlights the importance of small second-order loss: it guarantees that the estimation error in the stronger Sobolev norm $H^2(\Omega)$ remains controlled.

\subsection{Excess Risk}\label{sec:excess_risk}

In this section, we introduce the following result which bounds the difference between the empirical and population loss, demonstrating that our method generalizes well under finite-sample conditions.

\begin{lemma}[Symmetrization bound]\label{lem:first_half_excess_risk}
Let $\{x_i\}_{i=1}^N$ and $\{x_i'\}_{i=1}^N$ be i.i.d. samples. For $\mathcal{G} = \{\ell_{\theta} : \theta \in \Theta \}$, we have:
\begin{align*}
    \sup_{g \in \mathcal{G}}  | \frac{1}{N} \sum_{i=1}^N  (g(x_i) - g(x_i') )  | \leq \frac{2}{N} \E_\sigma  [ \sup_{g \in \mathcal{G}} \sum_{i=1}^N \sigma_i g(x_i) ],
\end{align*}
where $\{\sigma_i\}_{i=1}^N$ are Rademacher random variables, $\sigma_i \in \{+1, -1\}$ with equal probability.
\end{lemma}
\begin{proof}
Since each $\sigma_i$ has a symmetric distribution, we have:
\begin{align*}
 |\sum_{i=1}^N  (g(x_i) - g(x_i') ) |
    \leq 
    \E_\sigma  [|\sum_{i=1}^N \sigma_i  (g(x_i) - g(x_i') ) |].
\end{align*}
Taking the supremum over $g \in \mathcal{G}$ and noting that $\{x_i\}$ and $\{x_i'\}$ have the same distribution, we can split the expression inside the absolute value:
\begin{align*}
    &~ \sup_{g \in \mathcal{G}}  |\sum_{i=1}^N  (g(x_i) - g(x_i') ) | \\
    \leq  &~
    \mathbb{E}_\sigma  [\sup_{g \in \mathcal{G}} |\sum_{i=1}^N 
    \sigma_i (g(x_i) - g(x_i') ) |].
\end{align*}
By the triangle inequality, we get:
\begin{align*}
     |\sum_{i=1}^N \sigma_i  (g(x_i) - g(x_i') ) |
      \leq 
     |\sum_{i=1}^N \sigma_i g(x_i) |
     + 
     |\sum_{i=1}^N \sigma_i g(x_i') |.
\end{align*}
Hence,
\begin{align*}
    &~ \sup_{g \in \mathcal{G}}  |\sum_{i=1}^N (g(x_i) - g(x_i')) |\\
      \leq  &~
    \mathbb{E}_\sigma  [\sup_{g \in \mathcal{G}} |\sum_{i=1}^N \sigma_i\,g(x_i) |
     + 
    \sup_{g \in \mathcal{G}} |\sum_{i=1}^N \sigma_i g(x_i') | ].
\end{align*}
Because $\{x_i'\}$ is drawn from the same distribution as $\{x_i\}$, the two supremum terms have the same expected value. Therefore, we can combine them as follows:
\begin{align*}
    \sup_{g \in \mathcal{G}}  | \frac{1}{N} \sum_{i=1}^N  (g(x_i) - g(x_i') )  | \leq \frac{2}{N} \E_\sigma  [ \sup_{g \in \mathcal{G}} \sum_{i=1}^N \sigma_i g(x_i) ],
\end{align*}
Thus, we complete the proof.
\end{proof}

\begin{lemma}[Theorem 6.11 in \cite{sb_14_book}]\label{lem:part2_excess_risk_bound}
As we defined in Definition~\ref{def:sobolev_space},~\ref{def:pp_loss} and~\ref{def:emp_loss}, if Assumption~\ref{ass:rademacher} holds, then for any $\beta \in (0,1)$, with probability at least $1-\beta$, for $\G = \{\ell_\theta : \theta \in \Theta \}$, we have
\begin{align*}
 \sup_{g \in \G}| \frac{1}{N} \sum_{i=1}^N g(x_i') - \E [g(x)]| 
\leq & ~ O(\sqrt{\ln (1/ \beta)/N})
\end{align*}
\end{lemma}

\begin{lemma}[Excess risk]\label{lem:learning_bound} As we defined in Definition~\ref{def:pp_loss}, we have 
\begin{align}\label{eq:emp_loss}
    & ~ \wt{L}_{2,\theta} \notag \\ 
    = & ~ \frac{1}{N}\sum_{i=1}^N ( \| \dot{x}_t^{\True,i} - u_{1,\theta_1}(\cdot) \|^2_2 + \| \ddot{x}_t^{\True, i} - u_{2,\theta_2}(\cdot) \|^2_2) 
\end{align}
and
\begin{align}\label{eq:true_part_emp_loss}
    L_{2,\theta} = & ~ \E [\| \dot{x}_t^\True - u_{1,\theta_1}(\cdot) \|^2_2 + \| \ddot{x}_t^\True - u_{2,\theta_2}(\cdot)\|^2_2] 
\end{align}
Suppose $\F_1$ and $\F_2$ have finite or at most polynomially growing complexities $\mathcal{C}(\F_1), \mathcal{C}(\F_2)$. Then for $\beta \in (0,1)$, with probability at least $1-\beta$, we have
\begin{align*}
    | \wt{L}_{2,\theta} - L_{2,\theta} | \leq O((\mathcal{C}(\F_1) + \mathcal{C}(\F_2) + \ln(1/\beta))/N)^{1/2}.
\end{align*}
\end{lemma}
\begin{proof}
    Let $\G = \{\ell_\theta : \theta \in \Theta \}$, and let $C(\G)$ denote the complexity of $\G$ (e.g., its Rademacher complexity or VC dimension). As we defined in Definition~\ref{def:second_order_flow_matching_loss_new},~\ref{def:emp_loss} and~\ref{def:pp_loss}, we calculate the empirical loss and population loss,
    \begin{align*}
        \wt{L}_{2,\theta} = & ~\frac{1}{N} \sum_{i=1}^N \ell_\theta (x_i) \\
        = & ~\frac{1}{N}\sum_{i=1}^N ( \| \dot{x}_t^{\True,i} - u_{1,\theta_1}(\cdot) \|^2_2 \\
        & ~ + \| \ddot{x}_t^{\True, i} - u_{2,\theta_2}(\cdot) \|^2_2) 
    \end{align*}
    and
    \begin{align*}
        L_{2,\theta}  =&~ \E [\ell_{\theta}(X)] \\
        = &~ \E [\| \dot{x}_t^\True - u_{1,\theta_1}(\cdot) \|^2_2 + \| \ddot{x}_t^\True - u_{2,\theta_2}(\cdot)\|^2_2] 
    \end{align*}
    Let $\{x_i'\}_{i=1}^N$ be an i.i.d.\ sample from the same distribution as $\{x_i\}_{i=1}^N$, 
    and let $\{\sigma_i\}_{i=1}^N$ be i.i.d.\ Rademacher random variables ($\sigma_i \in\{+1,-1\}$ with probability $1/2$ each).  Then, for any $g\in \G$, we have
    \begin{align}\label{eq:bound_sum_E}
    & \sup_{g \in \G}| \frac{1}{N} \sum_{i=1}^{N} g(x_i) - \E [g(x)] | \notag  \\
   \leq  & ~ \sup_{g \in \G}| \frac{1}{N} \sum_{i=1}^N (g(x_i) - g(x_i')) | \notag \\
   & ~ + \sup_{g \in \G}| \frac{1}{N} \sum_{i=1}^N g(x_i') - \E [g(x)]| 
\end{align}

We can upper bound the first term in Eq.~\eqref{eq:bound_sum_E},
\begin{align}\label{eq:bound_sum_E_part1}
 & ~ \sup_{g \in \G}| \frac{1}{N} \sum_{i=1}^N (g(x_i) - g(x_i')) | \notag \\
 \leq & ~ \frac{2}{N} \E_{\sigma} [\sup_{g \in \G} \sum_{i=1}^N \sigma_i g(x_i)] \notag \\
 = & ~ 2 \wt{ {\cal R} }_N( {\cal G} ) \notag \\
 \leq & ~2 \cdot  O ( \sqrt{ C( {\cal G}  ) / N  } ) \notag \\
 \leq  & ~ O(\sqrt{(C(\F_1) + C(\F_2) )/N}) 
\end{align}
where the first step follows from Lemma~\ref{lem:first_half_excess_risk}, the second step follows from the definition $\wt{\mathcal{R}}_N(\G) := \E_\sigma [\sup_{g \in \G} \frac{1}{N} \sum_{i=1}^N \sigma_i g(x_i)]$, the third step follows from Assumption~\ref{ass:rademacher}, and the fourth step follows from the definition of $\G$.

We can upper bound the second term in Eq.~\eqref{eq:bound_sum_E} by using Lemma~\ref{lem:part2_excess_risk_bound},
\begin{align}\label{eq:bound_sum_E_part2}
    \sup_{g \in \G}| \frac{1}{N} \sum_{i=1}^N g(x_i') - \E [g(x)]| \leq  O(\sqrt{\ln (1/ \beta)/N})
\end{align}


Combining Eq.~\eqref{eq:bound_sum_E_part1} and Eq.~\eqref{eq:bound_sum_E_part2}, we can obtain
\begin{align*}
    & ~ \sup_{g \in \G}| \frac{1}{N} \sum_{i=1}^{N} g(x_i) - \E [g(x)] | \\ 
    \leq & ~ O(\sqrt{(C(\F_1) + C(\F_2) +\ln(1 / \beta))/N})
\end{align*}
Thus, we complete the proof.
\end{proof}

\subsection{Discrete Propagation under Noise}\label{sec:discrete_propagation}

In this section, we show the lemma about discrete propagation under noise, which quantifies how noise in the trajectory affects the error propagation in a discrete setting.

\begin{lemma}[Discrete propagation under noise]\label{lem:prog_noise}
    Suppose $\eta_i$ satisfies $\| \eta_i \| \leq \delta$. Then there exists $C_{\mathrm{prop}} \in \R$ such that
    \begin{align*}
        e_{l+1} \leq (1+ \Delta t \cdot C_{\mathrm{prop}}) e_l + C_{\mathrm{prop}} \cdot \Delta t \cdot \delta \cdot \epsilon
    \end{align*}
    Unrolling for $l = 0,\cdots, L-1$, we have 
    \begin{align*}
        e_L \leq e_0 \exp (C_{\mathrm{prop}}) + \frac{\delta \cdot \epsilon}{C_{\mathrm{prop}}} (\exp (C_{\mathrm{prop}}) - 1)
    \end{align*}
\end{lemma}
\begin{proof}
    By Assumptions~\ref{ass:lipschitz} and~\ref{ass:discrete_time}, the discrete updates for both the estimated and true systems can be written as:
    \begin{align*}
        x_{l+1} = & ~ x_l + \Delta t \cdot u_{1,\theta_1}(x_l,t_l) \\
        & \quad + \frac{(\Delta t)^2}{2} u_{2,\theta_2} (u_{1,\theta_1}(x_l,t_l), x_l, t_l).
    \end{align*}
    Subtracting the true system update from the estimated one gives:
    \begin{align*}
        & ~ x_{l+1}^\Est - x_{l+1}^\True \\
        = & ~ (x_l^\Est - x_l^\True) + \Delta t \cdot (u_{1,\theta_1}(x_l^\Est, t_l) - \dot{x}_l^\True) \\
        & ~ + \frac{(\Delta t)^2}{2} \cdot (u_{2,\theta_2}(\cdot) - \ddot{x}^\True).
    \end{align*}
    Taking the $H^2$-norm, we have:
    \begin{align*}
        & ~ \| x_{l+1}^\Est - x_{l+1}^\True \|_{H^2(\Omega)} \\
        \leq & ~ \| x_l^\Est - x_l^\True \|_{H^2(\Omega)} \\
        & ~ + \Delta t \cdot \| u_{1,\theta_1}(x_l^\Est, t_l) - \dot{x}_l^\True \|_{H^2(\Omega)} \\
        & ~ + O((\Delta t)^2).
    \end{align*}
    Since $\dot{x}_l^\True \sim u_{1,\theta_1}(x_l^\True, t_l)$ and the deviation is controlled by $\| x_l^\Est - x_l^\True \|$ and noise $\delta \epsilon$, we can write:
    \begin{align*}
        e_{l+1} = & ~ \| x_{l+1}^\Est - x_{l+1}^\True \|_{H^2(\Omega)} \\
        \leq & ~ (1+ \Delta t \cdot C_{\mathrm{prop}}) e_l + C_{\mathrm{prop}} \Delta t \cdot \delta \cdot \epsilon,
    \end{align*}
    where $C_{\mathrm{prop}}$ depends on the Lipschitz constants of $u_{1,\theta_1}$ and $u_{2,\theta_2}$.
    Repeatedly applying this inequality from $l=0$ to $l=L-1$, we have:
    \begin{align*}
        e_L \leq & ~ e_0 \prod_{j=0}^{L-1} (1 + \Delta t \cdot C_{\mathrm{prop}}) \\
        & ~ + \sum_{l=0}^{L-1} (C_{\mathrm{prop}} \Delta t \cdot \delta \cdot \epsilon \prod_{j=l+1}^{L-1} (1+ \Delta t \cdot C_{\mathrm{prop}}) ).
    \end{align*}
    Recognizing that:
    %\begin{align*}
    $
        \prod_{j=0}^{L-1}(1+\Delta t \cdot C_{\mathrm{prop}}) \leq \exp(C_{\mathrm{prop}}),
    $
    %\end{align*}
    we simplify the summation term using the geometric series formula:
    \begin{align*}
        \sum_{l=0}^{L-1} \prod_{j=l+1}^{L-1}(1+ \Delta t \cdot C_{\mathrm{prop}})
        = \frac{\exp(C_{\mathrm{prop}}) - 1}{C_{\mathrm{prop}}}.
    \end{align*}
    Thus, we obtain:
    \begin{align*}
        e_L \leq e_0 \exp(C_{\mathrm{prop}}) + \frac{\delta \cdot \epsilon}{C_{\mathrm{prop}}} (\exp(C_{\mathrm{prop}}) - 1).
    \end{align*}
    This completes the proof.
\end{proof}
This result provides a discrete Gronwall-type inequality, quantifying the growth of error under bounded noise.


\subsection{Main Result}\label{sec:main_result}

In this section, we now state and prove our main result with the auxiliary lemmas in place, which establishes the robustness of the learned trajectory against noise and finite-sample effects.


\begin{theorem}[Noise robustness]
\label{thm:main}\footnote{We state the proof of Theorem~\ref{thm:main} in Section~\ref{sec:missing_proof} in our Appendix.}
Suppose Assumptions~\ref{ass:smooth},~\ref{ass:lipschitz},~\ref{ass:noise_bound} and~\ref{ass:discrete_time} hold. Let $\wt{\theta} = (\wt{\theta}_1, \wt{\theta}_2)$ be the approximately optimal solution. Then, for any $\beta \in (0,0.1)$, with probability at least $1-\beta$, the final-time ($t=1$) trajectory estimate satisfies 
\begin{align*}
     & ~ \| x_{t=1}^\Est - x_{t=1}^\True \|_{H^2(\Omega)} \\
    \leq & ~ C_1 \exp(C_2) \cdot (e_0 + \delta \cdot \epsilon) \\
    \quad & + C_3 \cdot ((\mathcal{C}(\F_1) + \mathcal{C}(\F_2) + \ln(1/\beta))/N)^{1/2},
\end{align*}
where $e_0 = \| x_0^\Est - x_0^\True \|$ is the initial error. The constants $C_1, C_2, C_3$ depend on the Lipschitz constant $L$, the dimension $d$, the Sobolev embedding constant, and $\Delta t$; $\exp(C_2)$ represents the discrete Gronwall factor for the time interval $[0,1]$.
\end{theorem}
