% \section*{Appendix}

% \begin{center}
%     \textbf{\LARGE Appendix }
% \end{center}

\textbf{Roadmap.} In Section~\ref{sec:app:pre}, we introduce some notations and basic concepts.
We state the proof of Theorem~\ref{thm:main} in Section~\ref{sec:missing_proof}. In Section~\ref{sec:app:3rd}, we extend our result to a third-order case. In Section~\ref{sec:app:k_order}, we extend our result to $k$-th order.
In Section~\ref{app:empirical_study}, we provide comprehensive experiments to evaluate our NRFlow under complex conditions. 

\section{Preliminary}\label{sec:app:pre}
In Section~\ref{sec:app:pre:notation}, we introduce some notations we use in the appendix. In Section~\ref{sec:app:pre:flow_matching}, we introduce some basic concepts about flow matching. In Section~\ref{sec:app:OT}, we introduce the background of optimal transport.


\subsection{Notations}\label{sec:app:pre:notation}
We use $\Pr[\cdot]$ to denote the probability. We use $\E[\cdot]$ to denote the expectation. We use $\var[\cdot]$ to denote the variance.
We use $\|x\|_p$ to denote the $\ell_p$ norm of a vector $x \in \R^n$, i.e. $\|x\|_1 := \sum_{i=1}^n |x_i|$, $\|x\|_2 := (\sum_{i=1}^n x_i^2)^{1/2}$, and $\|x\|_{\infty} := \max_{i \in [n]} |x_i|$. 
For variables $a,b$, we write $a \lesssim b$ to indicate that $a$ is bounded above by $b$ up to a multiplicative constant independent of the main parameters. We write $a \gtrsim b$ to indicate that $a$ is bounded below by $b$ up to a multiplicative constant independent of the main parameters. We denote by $\dot{x}^{(k)}$ the $k$-th order derivative field of $x$. We use $\mathrm{Dist}$ to denote the function that maps a given random variable or random vector to its probability distribution, i.e., to its corresponding measure on the probability space.
\subsection{Flow Matching}\label{sec:app:pre:flow_matching}
In this section, we restate and introduce some definitions of flow matching and the algorithm. We restate part of Definition~\ref{def:2nd} and introduce the loss function of flow matching.

\begin{definition}[Loss function]\label{def:1st_loss}
The loss function for the second-order method contains two parts. We define the first part, which uses $\dot{x}_t$ from Fact~\ref{fac:one_second_order} together with $x_t$ and $t$ to learn the function $u_{1,t}$; the corresponding loss is
\begin{align*}
    L_{\mathrm{1st}} := \| \dot{x}_t - u_{1,\theta_1}(x_t, t) \|_2^2.
\end{align*}
\end{definition}


Here we restate Definition~\ref{def:flow_matching}.
\begin{definition}[A variant of flow matching in \cite{flow_matching}]
\label{def:flow_matching_informal}
Given two distributions $\mu_0$ and $\pi_0$ on $\mathbb{R}^d$, flow matching aims to learn a time-dependent velocity field
\begin{align*}
   v_\theta : \mathbb{R}^d \times [0,1] \;\to\; \mathbb{R}^d
\end{align*}
such that for any trajectory $x_t$ transporting $x_0 \sim \mu_0$ to $x_1 \sim \pi_0$, we have 
\begin{align*}
   \dot{x}_t \sim v_\theta(x_t,\, t).
\end{align*}
\end{definition}

We present the training algorithm and inference algorithm of flow matching.
\begin{algorithm}[!ht]\caption{Training algorithm of flow matching}
\begin{algorithmic}[1]
\Procedure{1stOrderForward}{$ $}
    \For{each iteration} 
    \State Random sample $x_0$ and time $t$, with target $x_1$
    \State $x_t \gets \alpha_t \cdot x_0 + \sqrt{1-\alpha_t^2} \cdot x_1$
    \State Compute gradient with respect to $L_{\mathrm{1st}}$ \Comment{See Definition~\ref{def:1st_loss}}
    \EndFor 
    \State \Return $u_1$ \Comment{One network function}
\EndProcedure
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[!ht]\caption{Inference algorithm of flow matching}
\begin{algorithmic}[1]
\Procedure{1stOrderInference}{$u_1$}
    \State $x_0 \sim {\cal N}(0,1)$
    \State Initialize $x \gets x_0$
    \For{$t$ from $0$ to $1$ with step $\Delta t = 0.01$}
        \State $x \gets x+ \Delta t \cdot u_1(x,t)$
    \EndFor
    \State \Return $x$
\EndProcedure
\end{algorithmic}
\end{algorithm}
 
\subsection{Optimal Transport}\label{sec:app:OT}
In this section, we introduce some background of optimal transport.


The optimal transport (OT) problem, as originally framed by Monge~\cite{monge1781}, seeks to minimize a cost functional:
\begin{align*}
\inf_{\mathcal{T}} &~ \mathbb{E}[c(\mathcal{T}(x_0) - x_0)],\\
\mathrm{s.t.} &~ \mathrm{Dist}(\mathcal{T}(x_0)) = \pi_0, \quad \mathrm{Dist}(x_0) = \mu_0,
\end{align*}
where the optimization is over deterministic mappings $\mathcal{T}: \mathbb{R}^d \to \mathbb{R}^d$ that define a coupling $(x_0, x_1)$ with $x_1 = \mathcal{T}(x_0)$, minimizing the cost $c$~\cite{v09}.

Kantorovich~\cite{k58} extended Monge's problem by introducing the Monge-Kantorovich (MK) formulation, which allows for both deterministic and stochastic couplings $(x_0, x_1)$ with marginal distributions $\mu_0$ and $\pi_0$. Notably, when $\mu_0$ is absolutely continuous with respect to the Lebesgue measure, the optimal coupling remains deterministic, reducing the problem to the set of mappings $\mathcal{T}$. This equivalence facilitates a dynamic interpretation, where the aim is to identify a continuous-time trajectory $\{x_t\}_{t \in [0,1]}$ from a collection of smooth interpolants $\mathcal{X}$, such that $x_0 \sim \mu_0$ and $x_1 \sim \pi_0$. For a convex cost function $c$, Jensen's inequality implies:
\begin{align*}
\mathbb{E}[c(x_1 - x_0)] \geq \inf_{\{x_t\}_{t \in [0,1]} \in \mathcal{X}} \mathbb{E}\left[\int_0^1 c(\dot{x}_t)  \d t\right].
\end{align*}
The infimum is achieved when $x_t$ follows the displacement interpolant, $x_t = t x_1 + (1-t)x_0$, representing a geodesic in the Wasserstein space~\cite{m97}.

When the process is governed by ordinary differential equations (ODEs) of the form $ \d x_t = v_t(x_t) \d t$, the evolution of the Lebesgue density $\epsilon_t$ of $x_t$ satisfies the continuity equation:
\begin{align*}
\frac{\partial \epsilon_t}{\partial t} + \nabla \cdot (v_t \epsilon_t) = 0.
\end{align*}
The Monge problem can then be reformulated dynamically as:
\begin{align*}
\inf_{\{v_t\}_{t \in [0,1]}, \{x_t\}_{t \in [0,1]}} &~ \mathbb{E}\left[\int_0^1 c(v_t(x_t))  \d t\right],\\
\mathrm{s.t.} &~ \frac{\partial \epsilon_t}{\partial t} + \nabla \cdot (v_t \epsilon_t) = 0, \\
&~ \epsilon_0 = \frac{ \d \mu_0}{ \d \lambda}, \quad \epsilon_1 = \frac{ \d \pi_0}{ \d \lambda}.
\end{align*}
Although this dynamic formulation provides deeper insights, solving it is computationally challenging. For cost functions like the $\ell_2$ norm, this reduces to minimizing the kinetic energy of the flow, as shown by~\cite{scn23}, where displacement interpolants are energy-optimal and correspond to straight-line flow paths.

 

\section{Missing proof of Theorem~\ref{thm:main}}\label{sec:missing_proof}
Here, we state the proof of Theorem~\ref{thm:main}.
\begin{proof}
    Let $\wt{\theta} = (\wt{\theta}_1, \wt{\theta}_2)$ denote the approximate optimal solution for the estimated loss in Eq.~\eqref{eq:emp_loss}. By Lemma~\ref{lem:learning_bound}, we have 
    \begin{align*}
    | \wt{L}_{2,\theta} - L_{2,\theta} | \leq  O((\mathcal{C}(\F_1) + \mathcal{C}(\F_2) + \ln(1/\beta))/N)^{1/2}.
\end{align*}
Therefore, under the true distribution, $\dot{x}_t^\Est$ and $\ddot{x}_t^\Est$ approximate $\dot{x}_t^\True$ and $\ddot{x}_t^\True$ well in an $L^2$ sense.

Since $\ddot{x}_t^\True \in H^2(\Omega)$, we then apply Lemma~\ref{lem:reg_effect}, which leverages Assumption~\ref{ass:smooth}. If $L_{2,2,\theta_1,\theta_2}$ is small, there exists a constant $C_{\mathrm{reg}}$ such that 
\begin{align*}
    & ~ \| \dot{x}^\Est_t - \dot{x}_t^\True \|_{H^2(\Omega)} \\
    \leq & ~  C_{\mathrm{reg}} (L_{2,2,\theta_1,\theta_2}^{1/2} + \| \dot{x}_t^{\Est} - \dot{x}_t^{\True} \|_{L^2(\Omega)}).
\end{align*}
Since Lemma~\ref{lem:learning_bound} already guarantees that $\dot{x}_t^\Est$ and $\ddot{x}_t^\Est$ are close to the true $\dot{x}_t^\True$ and $\ddot{x}_t^\True$ in $L^2$, we conclude that the learned fields are also close in the stronger $H^2(\Omega)$ norm. As assumed in Assumptions~\ref{ass:noise_bound} and~\ref{ass:discrete_time}, for uniform time steps $\Delta t = 1/L$, the update for the estimate is 
\begin{align*}
    x_{l+1}^\Est = & ~ x_{l}^\Est + \Delta t \cdot u_{1,\wt{\theta}_1}(x_l^\Est, t_l) \\
    & ~ + \frac{(\Delta t)^2}{2} u_{2,\wt{\theta}_2} (u_{1,\wt{\theta}_1}(x_l^\Est, t_l), x_l^\Est, t_l).
\end{align*}
Similarly, the true trajectory $x_{l+1}^\True$ follows the same scheme but with the true velocity and acceleration (plus noise bounded by $\delta \epsilon$). Subtracting the two updates, taking the $H^2$-norm, and invoking the Lipschitz condition in Assumption~\ref{ass:lipschitz} as in Lemma~\ref{lem:prog_noise}, we have
\begin{align*}
         e_{l+1} = & ~ \| x_{l+1}^\Est - x_{l+1}^\True \|_{H^2(\Omega)} \\
         \leq & ~ (1+ \Delta t \cdot C_{\mathrm{prop}}) e_l + C_{\mathrm{prop}} \Delta t \cdot \delta \cdot \epsilon.
     \end{align*}
Here $C_{\mathrm{prop}}$ depends on Lipschitz constants and bounds on $\dot{x}$ and $\ddot{x}$. Iterating the above inequality and applying a discrete Gronwall argument shows
\begin{align*}
    e_L = & ~ \| x_L^\Est - x_L^\True \|_{H^2(\Omega)} \\
    \leq & ~ e_0 \exp (C_{\mathrm{prop}}) + \frac{\delta \cdot \epsilon}{C_{\mathrm{prop}}} (\exp (C_{\mathrm{prop}}) - 1).
\end{align*}
Since $\exp(C_{\mathrm{prop}})$ is just a constant factor, we denote it by $\exp(C_2)$. Combining all of these terms then yields the exact form of Theorem~\ref{thm:main}:
\begin{align*}
    &~  \| x_{t=1}^\Est - x_{t=1}^\True \|_{H^2(\Omega)} \\
    \leq & ~ C_1 \exp(C_2) \cdot (e_0 + \delta \cdot \epsilon) \\ 
    & ~ +  C_3 \cdot ((\mathcal{C}(\F_1) + \mathcal{C}(\F_2) + \ln(1/\beta))/N)^{1/2}.
\end{align*}
Thus, we complete the proof.
\end{proof}


\section{Extension on third-order Flow Matching}\label{sec:app:3rd}
In this section, we extend the second-order flow-matching framework in Section~\ref{sec:pre_flow_matching} to incorporate third-order information. We first introduce additional assumptions in Section~\ref{sec:3rd_preliminary} to ensure that the third derivative of the true trajectory is sufficiently smooth and bounded. In Section~\ref{sec:app:3rd_algorithm}, we introduce our third-order training algorithm and the inference algorithm. In Section~\ref{sec:app:3rd_elliptic_regularity}, we introduce the elliptic regularity for third-order cases. In Section~\ref{sec:app:3rd_regularization_effect}, we present the regularization effect for the third-order loss function. In Section~\ref{sec:app:3rd_excess_risk}, we show the excess risk of third-order cases. In Section~\ref{sec:app:3rd_discrete_propagation}, we show the lemma on discrete propagation under noise, which quantifies how noise in the trajectory affects the error propagation in a third-order discrete setting. In Section~\ref{sec:app:3rd_main_result}, we prove our third-order main result.


\subsection{Preliminary}\label{sec:3rd_preliminary}
In this section, we introduce some additional definitions and assumptions specific to the third-order extension.



\begin{assumption}[Smoothness in higher Sobolev spaces]\label{ass:smooth_3rd}
We assume $x_t^\True \in H^3(\Omega)$, i.e., its derivatives up to the third order lie in $L^2(\Omega)$ and satisfy suitable boundary conditions. Moreover, we assume
\begin{align*}
\|\dot{x}_t^\True\|_{H^3(\Omega)} \leq M_1, \quad 
\|\ddot{x}_t^\True\|_{H^3(\Omega)} \leq M_2, \quad
\|\dddot{x}_t^\True\|_{H^3(\Omega)} \leq M_3.
\end{align*}
In addition, we assume the third derivative $\dddot{x}_t^\True$ is continuous over $[0,1]$ and satisfies
\begin{align*}
\|\dddot{x}_t^\True\|_\infty \leq M_3.
\end{align*}
\end{assumption}

The assumption above is critical to ensure the trajectory has sufficient regularity for third-order analysis.


\begin{remark}
Assumption~\ref{ass:smooth_3rd} extends Assumption~\ref{ass:smooth} by requiring a bounded third derivative and ensuring the entire trajectory has appropriate regularity in Sobolev space $H^3(\Omega)$. This added smoothness is essential for deriving higher-order error bounds.
\end{remark}

We now define the discrete-time update rule for third-order systems.

\begin{assumption}[Time discretization for third-order update]\label{ass:3rd_discrete_time}
Let $\Delta t = 1/L$ be the uniform step size, and define discrete times $t_l = l \Delta t$ for $l = 0,1,\ldots, L$. The third-order discrete update for the estimated system is:
\begin{align*}
x_{l+1}^\Est  =  x_l^\Est  +  \Delta t   u_{1,\theta_1}(x_l^\Est,t_l) +  \frac{(\Delta t)^2}{2}   u_{2,\theta_2} (u_{1,\theta_1}(x_l^\Est,t_l), x_l^\Est,t_l ) +  \frac{(\Delta t)^3}{6}   u_{3,\theta_3} (u_{2,\theta_2}(\cdot), x_l^\Est, t_l ). 
\end{align*}
\end{assumption}

The discrete update incorporates terms up to the third derivative, capturing the dynamics more accurately.

\begin{assumption}
\label{ass:3rd_lipschitz}
The learned fields $u_{1,\theta_1}(x,t)$, $u_{2,\theta_2}(v,x,t)$, and $u_{3,\theta_3}(a,x,t)$ are $L$-Lipschitz continuous in spatial and temporal arguments.
Formally, there exists $L > 0$ such that for all $x,y \in \R^d$ and $t,s \in [0,1]$:
\begin{align*}
\|u_{1,\theta_1}(x,t) - u_{1,\theta_1}(y,t)\|_2 &\leq L \|x-y\|_2 , \\
\|u_{2,\theta_2}(v,x,t) - u_{2,\theta_2}(v,y,t)\|_2 &\leq L \|x-y\|_2, \\
\|u_{3,\theta_3}(a,x,t) - u_{3,\theta_3}(a,y,t)\|_2 &\leq L \|x-y\|_2.
\end{align*}
This is the natural extension of the second-order scheme in Assumption~\ref{ass:discrete_time}.
\end{assumption}

This assumption is necessary to control the propagation of errors through the system.

\begin{definition}[Third-order rectified flow]\label{def:3rd_order_flow}
A \emph{third-order rectified flow} is determined by three learned fields:
\begin{align*}
u_{1,\theta_1}(x,t) &~\\
u_{2,\theta_2}(v,x,t) &~ \mathrm{where} \quad v = u_{1,\theta_1}(x,t),\\
u_{3,\theta_3}(a,x,t) &~ \mathrm{where} \quad a = u_{2,\theta_2}(v,x,t).
\end{align*}
These fields aim to approximate $\dot{x}_t^\True$, $\ddot{x}_t^\True$, and $\dddot{x}_t^\True$, respectively.
\end{definition}


We now introduce the third-order analog of the velocity and acceleration fields. In addition to the velocity $u_{1,\theta_1}$ and acceleration $u_{2,\theta_2}$ fields, we define a field 
\begin{align*}
    u_{3,\theta_3}(a,x,t),
\end{align*}
where $ a = u_{2,\theta_2}(v,x,t)$ and $v = u_{1,\theta_1}(x,t)$. This field aims to approximate the third derivative $\dddot{x}_t^\True$.


Here, we introduce the definition of the field of third-order flow.
\begin{definition}[Third-order flow field]\label{def:third_order_flow}
A third-order rectified flow is characterized by a velocity field $u_{1,\theta_1}(x,t)$, an acceleration field $u_{2,\theta_2}(v,x,t)$, and a field
\begin{align*}
u_{3,\theta_3}(a, x, t),
\end{align*}
where 
\begin{align*}
v = u_{1,\theta_1}(x,t),  \quad a = u_{2,\theta_2}(u_{1,\theta_1}(x,t), x, t).
\end{align*}
The function $u_{3,\theta_3}$ aims to approximate the third derivative $\dddot{x}_t^\True$.
\end{definition}

And we present the loss function of third-order flow as follows.
\begin{definition}[Third-order loss function]\label{def:3rd_loss}
Let $\dot{x}_t^\True$, $\ddot{x}_t^\True$, and $\dddot{x}_t^\True$ be the true velocity, acceleration, and jerk of the trajectory $x_t^\True$. 
We define the third-order loss as
\begin{align*}
L_{\mathrm{3rd}}(\theta_1,\theta_2,\theta_3) =
  \underbrace{\mathbb{E}  [\|\dot{x}_t^\True - u_{1,\theta_1}(x_t, t)\|^2_2 ]}_{L_{3,1,\theta_1}}
  +
  \underbrace{\mathbb{E}  [\|\ddot{x}_t^\True - u_{2,\theta_2} (u_{1,\theta_1}(x_t,t),  x_t,  t  )\|^2_2 ]}_{L_{3,2,\theta_2,\theta_1}}
  +
  \underbrace{\mathbb{E}  [\|\dddot{x}_t^\True - u_{3,\theta_3} (u_{2,\theta_2}(\cdot),  x_t,  t )\|^2_2 ]}_{L_{3,3,\theta_3,\theta_2,\theta_1}},
\end{align*}
where each expectation is taken over the possibly noisy samples of the continuous trajectory $x_t^\True$. 
\end{definition}

Here's the empirical third-order loss.
\begin{definition}[Empirical third-order loss]\label{def:3rd_emp_loss}
Given a training dataset  $\{(x_0^i, x_1^i)\}_{i=1}^N $ and time samples  $\{t_i\} $, we define the empirical third-order loss:
\begin{align*}
\wt{L}_{\mathrm{3rd}} =
\frac{1}{N} \sum_{i=1}^N
[ \|\dot{x}_t^{\True,i} - u_{1,\theta_1}(x_t^i, t_i)\|^2 + \|\ddot{x}_t^{\True,i} - u_{2,\theta_2}(u_{1,\theta_1}(x_t^i,t_i),  x_t^i,  t_i )\|^2 + \|\dddot{x}_t^{\True,i} - u_{3,\theta_3}(u_{2,\theta_2}(\cdot), x_t^i, t_i )\|^2 ].   
\end{align*}


\end{definition}

\subsection{Proposed Third-Order Algorithms}\label{sec:app:3rd_algorithm}
We present the natural extension of the second-order methods in Section~\ref{sec:proposed_method} to incorporate the jerk term. Here are our third-order training algorithm and inference algorithm.

\begin{algorithm}[!ht]\caption{Our third-order training algorithm }
\begin{algorithmic}[1]
\Procedure{3rdOrderForward}{$ $}
    \For{each iteration} 
    \State Random sample $x_0$ and time $t$, with target $x_1$
    \State $x_t \gets \alpha_t \cdot x_0 + \sqrt{1-\alpha_t^2} \cdot x_1$
    \State Compute gradient with respect to $L_{\mathrm{3rd}}$ \Comment{See Definition~\ref{def:3rd_loss}}
    \EndFor 
    \State \Return $u_1, u_2, u_3$ \Comment{Three network functions}
\EndProcedure
\end{algorithmic}
\end{algorithm}

\begin{algorithm}[!ht]\caption{Our third-order inference algorithm} \label{alg:3rd_inference}
\begin{algorithmic}[1]
\Procedure{3rdOrderInference}{$u_1,u_2,u_3$}
    \State $x_0 \sim {\cal N}(0,1)$
    \State Initialize $x \gets x_0$
   
     \For{$t$ from $0$ to $1$ with step $\Delta t = 0.01$} 
     \State $x \gets x + \Delta t \cdot u_1(x,t) + \frac{(\Delta t)^2}{2} \cdot u_2(u_1(x,t), x, t) + \frac{(\Delta t)^3}{6}\cdot u_3 (u_2(u_1(x,t), x, t), x, t )$
    \EndFor 
    \State \Return $x$
\EndProcedure
\end{algorithmic}
\end{algorithm}


\subsection{Elliptic Regularity}\label{sec:app:3rd_elliptic_regularity}
We now provide key lemmas and the main theorem establishing noise-robustness for third-order flow matching. In this section, we first introduce the elliptic regularity.

\begin{lemma}[Elliptic regularity in \cite{e10_pde}]\label{lem:3rd_pde_bound}
Let $\Omega \subset \mathbb{R}^d$ be a bounded domain with a smooth boundary. Suppose a function $h:\Omega \to \mathbb{R}$ has weak derivatives up to order 3 in $L^2(\Omega)$ and satisfies relevant boundary conditions. Then there exists a constant $C_{\mathrm{reg,3}} > 0$ (depending on $\Omega$) such that
\begin{align*}
\| h \|_{H^3(\Omega)}  \leq C_{\mathrm{reg,3}}  (\|\nabla^2 h\|_{L^2(\Omega)} + \|\nabla h\|_{L^2(\Omega)} + \|h\|_{L^2(\Omega)}).   
\end{align*}
\end{lemma}


\subsection{Regularization Effect}\label{sec:app:3rd_regularization_effect}
In this section, we show the regularization effect for the third-order loss.
\begin{lemma}[Regularization effect for third-order loss]\label{lem:3rd_reg_effect}
Following Definition~\ref{def:3rd_emp_loss}, we define
\begin{align*}
L_{\mathrm{3rd}} := \mathbb{E} [\|\dddot{x}_t^\True - u_{3,\theta_3} (u_{2,\theta_2}(\cdot), x_t^\True, t )\|^2  ].    
\end{align*}
If  $\dddot{x}_t^\True \in H^3(\Omega) $ and  $L_{\mathrm{3rd}} $ is sufficiently small, then there exists a constant  $C_{\mathrm{reg,3}}$ such that
\begin{align*}
\|\ddot{x}_t^\Est - \ddot{x}_t^\True\|_{H^3(\Omega)}
\leq C_{\mathrm{reg,3}} (L_{\mathrm{3rd}}^{1/2} +  \|\ddot{x}_t^\Est - \ddot{x}_t^\True\|_{L^2(\Omega)}).    
\end{align*}
\end{lemma}
\begin{proof}
    Applying Lemma~\ref{lem:3rd_pde_bound} to $h(\cdot) = \ddot{x}_t^\Est - \ddot{x}_t^\True$, we have 
    \begin{align}\label{eq:lesssim_1_3rd}
       & ~ \| \ddot{x}^\Est_t - \ddot{x}_t^\True \|_{H^3(\Omega)} \notag\\
      \leq & ~ C_{\mathrm{reg,3}} ( \| \nabla^2 h\|_{L^2(\Omega)}  +\| \nabla h\|_{L^2(\Omega)} + \| h\|_{L^2(\Omega)}).
    \end{align}
    By the definition of the loss function, a small $L_{\mathrm{3rd}}$ implies
    \begin{align}\label{eq:lesssim_2_3rd}
        \| \ddot{x}^\Est_t - \ddot{x}_t^\True \|_{L^2(\Omega)}\lesssim L^{1/2}_{\mathrm{3rd}}.
    \end{align}
    Combining Eq.~\eqref{eq:lesssim_1_3rd} and Eq.~\eqref{eq:lesssim_2_3rd}, we have
    \begin{align*}
    & ~ \| \ddot{x}^\Est_t - \ddot{x}_t^\True \|_{H^3(\Omega)} \\ 
    \leq & ~ C_{\mathrm{reg,3}} (L_{\mathrm{3rd}}^{1/2} + \| \ddot{x}_t^{\Est} - \ddot{x}_t^{\True} \|_{L^2(\Omega)}).
\end{align*}
Thus, we complete the proof.
\end{proof}

\subsection{Excess Risk}\label{sec:app:3rd_excess_risk}
In this section, we first introduce some necessary tools that are used in the proof of Lemma~\ref{lem:excess_risk_3rd}. Then, we show our result of excess risk for third-order flow. First, we restate the symmetrization bound.

\begin{lemma}[Symmetrization bound, formal version of Lemma~\ref{lem:first_half_excess_risk}]\label{lem:first_half_excess_risk_formal}
Let $\{x_i\}_{i=1}^N$ and $\{x_i'\}_{i=1}^N$ be i.i.d. samples. For $\mathcal{G} = \{\ell_{\theta} : \theta \in \Theta \}$, we have:
\begin{align*}
    \sup_{g \in \mathcal{G}}  \| \frac{1}{N} \sum_{i=1}^N  (g(x_i) - g(x_i') )  \| \leq \frac{2}{N} \E_\sigma  [ \sup_{g \in \mathcal{G}} \sum_{i=1}^N \sigma_i g(x_i) ],
\end{align*}
where $\{\sigma_i\}_{i=1}^N$ are Rademacher random variables, $\sigma_i \in \{+1, -1\}$ with equal probability.
\end{lemma}
\begin{proof}
Since each $\sigma_i$ has a symmetric distribution, we have:
\begin{align*}
 \|\sum_{i=1}^N  (g(x_i) - g(x_i') ) \|
    \leq 
    \E_\sigma  [\|\sum_{i=1}^N \sigma_i  (g(x_i) - g(x_i') ) \|].
\end{align*}
Taking the supremum over $g \in \mathcal{G}$ and noting that $\{x_i\}$ and $\{x_i'\}$ have the same distribution, we can split the expression inside the absolute value:
\begin{align*}
    &~ \sup_{g \in \mathcal{G}}  \|\sum_{i=1}^N  (g(x_i) - g(x_i') ) \| \\
    \leq  &~
    \mathbb{E}_\sigma  [\sup_{g \in \mathcal{G}} \|\sum_{i=1}^N 
    \sigma_i (g(x_i) - g(x_i') ) \|].
\end{align*}
By the triangle inequality, we get:
\begin{align*}
     \|\sum_{i=1}^N \sigma_i  (g(x_i) - g(x_i') ) \|
      \leq 
     \|\sum_{i=1}^N \sigma_i g(x_i) \|
     + 
     \|\sum_{i=1}^N \sigma_i g(x_i') \|.
\end{align*}
Hence,
\begin{align*}
    &~ \sup_{g \in \mathcal{G}}  \|\sum_{i=1}^N (g(x_i) - g(x_i')) \|\\
      \leq  &~
    \mathbb{E}_\sigma  [\sup_{g \in \mathcal{G}} \|\sum_{i=1}^N \sigma_i\,g(x_i) \|
     + 
    \sup_{g \in \mathcal{G}} \|\sum_{i=1}^N \sigma_i g(x_i') \|].
\end{align*}
Because $\{x_i'\}$ is drawn from the same distribution as $\{x_i\}$, the two supremum terms have the same expected value. Therefore, we can combine them as follows:
\begin{align*}
    \sup_{g \in \mathcal{G}}  \| \frac{1}{N} \sum_{i=1}^N  (g(x_i) - g(x_i') )  \| \leq \frac{2}{N} \E_\sigma  [ \sup_{g \in \mathcal{G}} \sum_{i=1}^N \sigma_i g(x_i) ].
\end{align*}
Thus, we complete the proof.
\end{proof}

Here we restate Lemma~\ref{lem:part2_excess_risk_bound}.

\begin{lemma}[Formal version of Lemma~\ref{lem:part2_excess_risk_bound}]\label{lem:part2_excess_risk_bound_informal}
As we defined in Definition~\ref{def:sobolev_space},~\ref{def:3rd_loss} and \ref{def:3rd_emp_loss}, if Assumption~\ref{ass:rademacher} holds, for $g \in \G$ where $\G = \{\ell_\theta : \theta \in \Theta \}$, we have
\begin{align*}
 \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N g(x_i') - \E [g(x)]\|
\leq O(\sqrt{\ln (1/ \beta)/N}).
\end{align*}
\end{lemma}

We next present the result of excess risk for third-order flow.

\begin{lemma}[Excess risk]\label{lem:excess_risk_3rd} As we defined in Definition~\ref{def:3rd_loss} and \ref{def:3rd_emp_loss}, we have 
\begin{align*}
\wt{L}_{\mathrm{3rd}} =
\frac{1}{N} \sum_{i=1}^N
[ \|\dot{x}_t^{\True,i} - u_{1,\theta_1}(x_t^i, t_i)\|^2 + \|\ddot{x}_t^{\True,i} - u_{2,\theta_2}(u_{1,\theta_1}(x_t^i,t_i),  x_t^i,  t_i )\|^2 + \|\dddot{x}_t^{\True,i} - u_{3,\theta_3}(u_{2,\theta_2}(\cdot), x_t^i, t_i )\|^2 ]   
\end{align*}
and
\begin{align*}
L_{\mathrm{3rd}}(\theta_1,\theta_2,\theta_3) =
  \underbrace{\mathbb{E}  [\|\dot{x}_t^\True - u_{1,\theta_1}(x_t, t)\|^2 ]}_{L_{3,1,\theta_1}}
  +
  \underbrace{\mathbb{E}  [\|\ddot{x}_t^\True - u_{2,\theta_2} (u_{1,\theta_1}(x_t,t),  x_t,  t  )\|^2 ]}_{L_{3,2,\theta_2,\theta_1}}
  +
  \underbrace{\mathbb{E}  [\|\dddot{x}_t^\True - u_{3,\theta_3} (u_{2,\theta_2}(\cdot),  x_t,  t )\|^2 ]}_{L_{3,3,\theta_3,\theta_2,\theta_1}},
\end{align*}

Suppose $\F_1$ and $\F_2$ have finite or at most polynomially growing complexities $\mathcal{C}(\F_1), \mathcal{C}(\F_2)$. Then for $\beta \in (0,1)$, with probability at least $1-\beta$, we have
\begin{align*}
    | \wt{L}_{\mathrm{3rd}} - L_{\mathrm{3rd}} | \leq O((\mathcal{C}(\F_1) + \mathcal{C}(\F_2) + \ln(1/\beta))/N)^{1/2}.
\end{align*}
\end{lemma}
\begin{proof}
    Let $\G = \{\ell_\theta : \theta \in \Theta \}$ denote the loss class, whose complexity is measured by its Rademacher complexity or VC dimension. Following Definitions~\ref{def:3rd_order_flow},~\ref{def:3rd_loss}, and~\ref{def:3rd_emp_loss}, we write the empirical loss and the population loss as
    \begin{align*}
        \wt{L}_{\mathrm{3rd}} =
        \frac{1}{N} \sum_{i=1}^N
        [ \|\dot{x}_t^{\True,i} - u_{1,\theta_1}(x_t^i, t_i)\|^2 + \|\ddot{x}_t^{\True,i} - u_{2,\theta_2}(u_{1,\theta_1}(x_t^i,t_i),  x_t^i,  t_i )\|^2 + \|\dddot{x}_t^{\True,i} - u_{3,\theta_3}(u_{2,\theta_2}(\cdot), x_t^i, t_i )\|^2 ]
    \end{align*}
    and
    \begin{align*}
        L_{\mathrm{3rd}}(\theta_1,\theta_2,\theta_3) =
        \underbrace{\mathbb{E}  [\|\dot{x}_t^\True - u_{1,\theta_1}(x_t, t)\|^2 ]}_{L_{3,1,\theta_1}}
        +
        \underbrace{\mathbb{E}  [\|\ddot{x}_t^\True - u_{2,\theta_2} (u_{1,\theta_1}(x_t,t),  x_t,  t  )\|^2 ]}_{L_{3,2,\theta_2,\theta_1}}
        +
        \underbrace{\mathbb{E}  [\|\dddot{x}_t^\True - u_{3,\theta_3} (u_{2,\theta_2}(\cdot),  x_t,  t )\|^2 ]}_{L_{3,3,\theta_3,\theta_2,\theta_1}},
\end{align*}
    let $\{x_i'\}_{i=1}^N$ be an i.i.d.\ sample from the same distribution as $\{x_i\}_{i=1}^N$, 
    and let $\{\sigma_i\}_{i=1}^N$ be i.i.d.\ Rademacher random variables ($\sigma_i \in\{+1,-1\}$ with probability $1/2$ each).  Then, for any $g\in \G$, we have
    \begin{align}\label{eq:bound_sum_E_3rd}
    \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^{N} g(x_i) - \E [g(x)] \| 
   \leq   \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N (g(x_i) - g(x_i')) \| 
    + \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N g(x_i') - \E [g(x)]\|.
\end{align}


We can upper bound the first term in Eq.~\eqref{eq:bound_sum_E_3rd},
\begin{align}\label{eq:bound_sum_E_part1_3rd}
 & ~ \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N (g(x_i) - g(x_i')) \| \notag \\
 \leq & ~ \frac{2}{N} \E_{\sigma} [\sup_{g \in \G} \sum_{i=1}^N \sigma_i g(x_i)] \notag \\
 = & ~ 2 \wt{ {\cal R} }_N( {\cal G} ) \notag \\
 \leq & ~2 \cdot  O ( \sqrt{ C( {\cal G}  ) / N  } ) \notag \\
 \leq  & ~ O(\sqrt{(C(\F_1) + C(\F_2) )/N}) 
\end{align}
where the first step follows from Lemma~\ref{lem:first_half_excess_risk_formal}, the second step follows from the definition $\wt{\mathcal{R}}_N(\G) := \E_\sigma [\sup_{g \in \G} \frac{1}{N} \sum_{i=1}^N \sigma_i g(x_i)]$, the third step follows from Assumption~\ref{ass:rademacher}, and the fourth step follows from the definition of $\G$.

We can upper bound the second term in Eq.~\eqref{eq:bound_sum_E_3rd} by using Lemma~\ref{lem:part2_excess_risk_bound_informal},
\begin{align}\label{eq:bound_sum_E_part2_3rd}
    \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N g(x_i') - \E [g(x)]\| \leq  O(\sqrt{\ln (1/ \beta)/N})
\end{align}


Combining Eq.~\eqref{eq:bound_sum_E_part1_3rd} and Eq.~\eqref{eq:bound_sum_E_part2_3rd}, we obtain
\begin{align*}
    \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^{N} g(x_i) - \E [g(x)] \|  
    \leq  O(\sqrt{(C(\F_1) + C(\F_2) +\ln(1 / \beta))/N})
\end{align*}
\end{proof}

\subsection{Discrete Propagation}\label{sec:app:3rd_discrete_propagation}

In this section, we show the lemma on discrete propagation under noise, which quantifies how noise in the trajectory affects the error propagation in a discrete setting for third-order flow.

\begin{lemma}[Discrete propagation with jerk]\label{lem:prog_noise_3rd}
Under Assumptions~\ref{ass:smooth_3rd},~\ref{ass:3rd_discrete_time}, and~\ref{ass:3rd_lipschitz}, let
\begin{align*}
e_l = \| x_l^\Est - x_l^\True \|_{H^3(\Omega)}.
\end{align*}
Then there is a constant $C_{\mathrm{prop,3}} > 0$ such that
\begin{align*}
e_{l+1} \leq (1 + \Delta t  C_{\mathrm{prop,3}} ) e_l +  C_{\mathrm{prop,3}}  \delta  \epsilon  \Delta t.
\end{align*}
Unrolling from $l=0$ to $l=L-1$ with $\Delta t=1/L$ yields
\begin{align*}
e_L  \leq  e_0 \exp (C_{\mathrm{prop,3}} ) +  \frac{\delta  \epsilon}{C_{\mathrm{prop,3}}}  (\exp (C_{\mathrm{prop,3}} ) - 1 ).
\end{align*}
\end{lemma}
\begin{proof}
    By Assumptions~\ref{ass:3rd_discrete_time} and \ref{ass:3rd_lipschitz}, we have
    \begin{align*}
    x_{l+1}^\Est &= x_l^\Est 
    + \Delta t    u_{1,\theta_1}( x_l^\Est, t_l )
    + \frac{(\Delta t)^2}{2}  u_{2,\theta_2}  (u_{1,\theta_1}(\cdot), x_l^\Est, t_l )
    + \frac{(\Delta t)^3}{6}  u_{3,\theta_3}  (u_{2,\theta_2}(\cdot), x_l^\Est, t_l ),
    \\
    x_{l+1}^\True &= x_l^\True 
    + \Delta t    \dot{x}_l^\True 
    + \frac{(\Delta t)^2}{2}  \ddot{x}_l^\True
    + \frac{(\Delta t)^3}{6}  \dddot{x}_l^\True.
    \end{align*}
    Subtracting the true update from the estimated one and taking the $H^3$-norm, the difference involves
    Lipschitz constants, the prior error $e_l$, and a noise term bounded by $\delta  \epsilon$. One obtains 
    \begin{align*}
    \|x_{l+1}^\Est - x_{l+1}^\True\|_{H^3(\Omega)} \leq
    (1 + \Delta t \cdot C_{\mathrm{prop,3}}) \|x_l^\Est -x_l^\True\|_{H^3(\Omega)} + C_{\mathrm{prop,3}} \delta \epsilon \Delta t,
    \end{align*}
    where $C_{\mathrm{prop,3}}$ depends on Lipschitz constants of $u_{1,\theta_1}$, $u_{2,\theta_2}$, $u_{3,\theta_3}$, and the boundedness of $\ddot{x}_t^\True, \dddot{x}_t^\True$. 

    Repeating this inequality from $l=0$ to $l=L-1$ and noting $\Delta t = 1/L$, by a discrete Gronwall argument we have
    \begin{align*}
    e_L = & ~ \|x_L^\Est - x_L^\True\|_{H^3(\Omega)} \\
    \leq & ~ e_0  \exp (C_{\mathrm{prop,3}} ) + \sum_{l=0}^{L-1}  (C_{\mathrm{prop,3}}  \delta \epsilon \Delta t  \prod_{j=l+1}^{L-1} (1 + \Delta t C_{\mathrm{prop,3}})),
    \\
    \leq & ~
    e_0  \exp (C_{\mathrm{prop,3}} ) + \frac{\delta \epsilon}{C_{\mathrm{prop,3}}}  (\exp (C_{\mathrm{prop,3}} ) - 1).
    \end{align*}
    This completes the proof.
\end{proof}

\subsection{Main Result: Third-Order Noise Robustness}\label{sec:app:3rd_main_result}
Combining the above, we obtain the final noise-robustness result for third-order flow matching in this section.

\begin{theorem}[Third-order noise robustness]\label{thm:3rd_main}
Suppose Assumptions~\ref{ass:smooth_3rd},~\ref{ass:3rd_discrete_time},~\ref{ass:3rd_lipschitz}, and~\ref{ass:noise_bound} hold. 
Let $\wt{\theta} = (\wt{\theta}_1,\wt{\theta}_2,\wt{\theta}_3)$ be an approximately optimal solution minimizing the empirical loss $\wt{L}_{\mathrm{3rd},\theta_1,\theta_2,\theta_3}$. 
Then, with probability at least $1 - \beta$, for uniform time steps $t_l = l \Delta t$ with $\Delta t = 1/L$, we have
\begin{align*}
\| x_L^\Est - x_L^\True \|_{H^3(\Omega)} \leq C_1' \exp(C_2') ( e_0 + \delta  \epsilon ) + C_3' 
 (
(\mathcal{C}(\mathcal{F}_1) + \mathcal{C}(\mathcal{F}_2) + \mathcal{C}(\mathcal{F}_3) + \ln(1/ \beta))/N
 )^{1/2},
\end{align*}
where $e_0 = \|x_0^\Est - x_0^\True\|_{H^3(\Omega)}$ denotes the initial error, and $C_1', C_2', C_3'$ depend on Lipschitz constants, the dimension $d$, and Sobolev embedding constants in $H^3(\Omega)$.
\end{theorem}
\begin{proof}
    Let $\wt{\theta} = (\wt{\theta}_1, \wt{\theta}_2, \wt{\theta}_3)$ denote the approximate optimal solution for the empirical loss. By Lemma~\ref{lem:excess_risk_3rd}, we have 
    \begin{align*}
    | \wt{L}_{\mathrm{3rd}} - L_{\mathrm{3rd}} | \leq  O((\mathcal{C}(\F_1) + \mathcal{C}(\F_2) + \mathcal{C}(\F_3) + \ln(1/\beta))/N)^{1/2}.
\end{align*}
Therefore, under the true distribution, $\dot{x}_t^\Est$, $\ddot{x}_t^\Est$, and $\dddot{x}_t^\Est$ approximate $\dot{x}_t^\True$, $\ddot{x}_t^\True$, and $\dddot{x}_t^\True$ well in an $L^2$ sense.

Since $\dddot{x}_t^\True \in H^3(\Omega)$, we then apply Lemma~\ref{lem:3rd_reg_effect}, which leverages Assumption~\ref{ass:smooth_3rd}. If $L_{\mathrm{3rd}}$ is small, there exists a constant $C_{\mathrm{reg,3}}$ such that 
\begin{align*}
\|\ddot{x}_t^\Est - \ddot{x}_t^\True\|_{H^3(\Omega)}
\leq C_{\mathrm{reg,3}} (L_{\mathrm{3rd}}^{1/2} +  \|\ddot{x}_t^\Est - \ddot{x}_t^\True\|_{L^2(\Omega)}).    
\end{align*}
 We conclude that the learned fields are also close in the stronger $H^3(\Omega)$ norm. Under Assumptions~\ref{ass:noise_bound} and~\ref{ass:3rd_discrete_time}, for uniform time steps $\Delta t = 1/L$, the updates for the estimated and true trajectories are 
\begin{align*}
    x_{l+1}^\Est = & ~ x_{l}^\Est + \Delta t u_{1,\wt{\theta}_1}(x_l^\Est, t_l) + \frac{(\Delta t)^2}{2} u_{2,\wt{\theta}_2} (u_{1,\wt{\theta}_1}(x_l^\Est, t_l), x_l^\Est, t_l) + \frac{(\Delta t)^3}{6}  u_{3,\wt{\theta}_3}  (u_{2,\wt{\theta}_2}(\cdot), x_l^\Est, t_l ),
    \\
    x_{l+1}^\True &= x_l^\True 
    + \Delta t    \dot{x}_l^\True 
    + \frac{(\Delta t)^2}{2}  \ddot{x}_l^\True
    + \frac{(\Delta t)^3}{6}  \dddot{x}_l^\True.
\end{align*}
Subtracting the two updates, taking the $H^3$-norm, and invoking the Lipschitz condition in Assumption~\ref{ass:3rd_lipschitz}, we obtain from Lemma~\ref{lem:prog_noise_3rd} that
\begin{align*}
         e_{l+1} = & ~ \| x_{l+1}^\Est - x_{l+1}^\True \|_{H^3(\Omega)} \leq (1+ \Delta t \cdot C_{\mathrm{prop,3}}) e_l + C_{\mathrm{prop,3}} \Delta t \cdot \delta \cdot \epsilon.
     \end{align*}
Here $C_{\mathrm{prop,3}}$ depends on Lipschitz constants and bounds on $\dot{x}$, $\ddot{x}$, and $\dddot{x}$. Iterating the above and applying a discrete Gronwall argument shows
\begin{align*}
    e_L = & ~ \| x_L^\Est - x_L^\True \|_{H^3(\Omega)} \leq e_0 \exp (C_{\mathrm{prop,3}}) + \frac{\delta \cdot \epsilon}{C_{\mathrm{prop,3}}} (\exp (C_{\mathrm{prop,3}}) - 1).
\end{align*}
Since $\exp(C_{\mathrm{prop,3}})$ is just a constant factor, we denote it by $e^{C_2'}$. Combining all of these terms yields the exact form of Theorem~\ref{thm:3rd_main}:
\begin{align*}
    \| x_L^\Est - x_L^\True \|_{H^3(\Omega)} \leq C_1' \exp(C_2') ( e_0 + \delta  \epsilon ) + C_3' 
    ((\mathcal{C}(\mathcal{F}_1) + \mathcal{C}(\mathcal{F}_2) + \mathcal{C}(\mathcal{F}_3) + \ln(1/ \beta))/N)^{1/2}.
\end{align*}
Thus, we complete the proof.
\end{proof}
\section{Extension on \texorpdfstring{$k$}{k}-th order Flow Matching}\label{sec:app:k_order}
In this section, we extend the second-order flow-matching framework in Section~\ref{sec:pre_flow_matching} to incorporate $k$-th order information. We first introduce additional assumptions in Section~\ref{sec:app:k_pre} to ensure that the derivatives of the true trajectory up to the $k$-th order are sufficiently smooth and bounded. In Section~\ref{sec:app:k_algorithm}, we introduce our $k$-th order training algorithm and inference algorithm. In Section~\ref{sec:app:k_elliptic_regularity}, we introduce the elliptic regularity for the $k$-th order case. In Section~\ref{sec:app:k_regularization_effect}, we present the regularization effect result for the $k$-th order loss function. In Section~\ref{sec:app:k_excess_risk}, we show the excess risk of the $k$-th order case. In Section~\ref{sec:app:k_discrete_propagation}, we show the lemma about discrete propagation under noise, which quantifies how noise in the trajectory affects the error propagation in a $k$-th order discrete setting. In Section~\ref{sec:app:k_main}, we prove our $k$-th order main result.



\subsection{Preliminary}\label{sec:app:k_pre}
In this section, we introduce some additional definitions and assumptions specific to the $k$-th order extension.


\begin{assumption}[Smoothness in higher Sobolev spaces]
\label{ass:smooth_k}
We assume the true trajectory $x_t^\True \in H^k(\Omega)$ and that its derivatives up to the $k$-th order are sufficiently smooth and bounded. Formally, there exist constants $\{M_j\}_{j=1}^k > 0$ such that
\begin{align*}
 \| \dot{x}_t^{(j), \True}  \|_{H^k(\Omega)}  \leq  M_j,\quad \text{for } j=1,\dots, k,
\end{align*}
where $\dot{x}_t^{(j), \True}$ denotes the $j$-th order time derivative of $x_t^\True$. We also require these derivatives to be continuous on $[0,1]$ in the time variable.
\end{assumption}

Then, we introduce our assumption for time discretization under $k$-th order update.

\begin{assumption}[Time discretization for \texorpdfstring{$k$}{k}-th order update]
\label{ass:k_discrete_time}
Let $\Delta t = 1/L$ be the uniform step size, and define discrete times $t_l = l \Delta t$ for $l = 0,1,\ldots, L$. 
We consider the following \emph{$k$-th order discrete update} for the estimated system:
\begin{align}
\label{eq:k_th_order_update}
x_{l+1}^\Est = x_l^\Est + \sum_{j=1}^k \frac{(\Delta t)^j}{j!} u_{j,\theta_j}  (u_{j-1,\theta_{j-1}} (\dots u_{1,\theta_1}(x_l^\Est, t_l)\dots ), x_l^\Est, t_l),
\end{align}
where each $u_{j,\theta_j}$ is a learned field approximating the $j$-th order derivative $\dot{x}_t^{(j),\True}$.
\end{assumption}

The $k$-th order Lipschitz continuity is also necessary, and we present it here.

\begin{assumption}[$k$-th order Lipschitz continuity]
\label{ass:k_lipschitz}
We assume the learned fields $(u_{j,\theta_j})_{j=1}^k$ are each $L$-Lipschitz continuous in their spatial and temporal arguments. Formally, there exists $L>0$ such that for any $x,y \in \R^d$ and $t,s\in[0,1]$
\begin{align*}
\| u_{j,\theta_j} (\ldots, x,t ) - u_{j,\theta_j} (\ldots, y,t ) \|_2 \leq & ~ L \|x-y\|_2, \\
\| u_{j,\theta_j} (\ldots, x,t ) - u_{j,\theta_j} (\ldots, x,s ) \|_2 \leq & ~ L |t-s|.
\end{align*}
\end{assumption}

Next, we introduce the definition of the $k$-th order flow.

\begin{definition}[$k$-th order flow]
A $k$-th order flow involves a sequence of learned fields $u_{j,\theta_j}$ for $j = 1, \dots, k$, each targeting the approximation of $(\dot{x}^{(j)}_t)^{\mathrm{True}}$.
\end{definition}

Here is the $k$-th order loss function.

\begin{definition}[$k$-th order loss function]\label{def:k_order_loss}
The $k$-order loss function evaluates the accuracy of approximations for each derivative:
\begin{align*}
L_{\mathrm{k-order}}(\theta_1, \dots, \theta_k) = \sum_{j=1}^k \mathbb{E} [\|(\dot{x}^{(j)}_t)^\mathrm{True} - u_{j,\theta_j} (u_{j-1,\theta_{j-1}}(\dots), x_t, t)\|^2 ].
\end{align*}
\end{definition}

And we introduce empirical $k$-th order loss here.

\begin{definition}[Empirical $k$-th order loss]
\label{def:k_emp_loss}
Given a training dataset $\{ (x_0^i, x_1^i) \}_{i=1}^N$ with times $\{t_i\}$ and (approximate) ground-truth derivatives up to the $k$-th order, the empirical $k$-th order loss is
\begin{align*}
\wt{L}_{\mathrm{k-order}}(\theta_1,\dots,\theta_k) = \frac{1}{N}\sum_{i=1}^N \sum_{j=1}^k
\| (\dot{x}^{(j)}_{t_i})^{\True,i} - u_{j,\theta_j}(u_{j-1,\theta_{j-1}}(\dots), x_{t_i}^i, t_i) \|^2.
\end{align*}
\end{definition}

\subsection{Proposed \texorpdfstring{$k$}{k}-th Order Algorithms}\label{sec:app:k_algorithm}

In this section, we show our $k$-th order training algorithm and inference algorithm. First, we show the $k$-th order training algorithm.

\begin{algorithm}[!ht]\caption{Our $k$-th order training algorithm}
\begin{algorithmic}[1]
\Procedure{k-thOrderForward}{$ $}
    \For{each iteration} 
    \State Random sample $x_0$ and time $t$, with target $x_1$
    \State $x_t \gets \alpha_t \cdot x_0 + \sqrt{1-\alpha_t^2} \cdot x_1$
    \State Compute gradient with respect to $L_{\mathrm{k-order}}$ \Comment{See Definition~\ref{def:k_order_loss}} 
    \EndFor 
    \State \Return $u_1, u_2, \cdots, u_k$ \Comment{$k$ network functions}
\EndProcedure
\end{algorithmic}
\end{algorithm}


\begin{algorithm}
\caption{Our $k$-th order inference algorithm}
\label{alg:k_order_inference}
\begin{algorithmic}[1]
\Procedure{k-thOrderInference}{$u_1, \dots, u_k$}
    \State $x_0 \sim \mathcal{N}(0,1)$
    \State Initialize $x \gets x_0$
    \For{$t$ from $0$ to $1$ with step $\Delta t = 0.01$}
        \State $x \gets x + \sum_{j=1}^k \frac{(\Delta t)^j}{j!} \cdot u_j(u_{j-1}(\dots u_1(x,t) \dots), x, t)$
    \EndFor
    \State \Return $x$
\EndProcedure
\end{algorithmic}
\end{algorithm}



\subsection{Elliptic Regularity}\label{sec:app:k_elliptic_regularity}
In this section, we introduce the first result, which is a classical result that characterizes the relationship between different Sobolev norms for sufficiently smooth functions for $k$-th order flow. 
\begin{lemma}[Elliptic regularity in \cite{e10_pde}]
\label{lem:pde_bound_k}
Let $\Omega \subset \R^d$ be a bounded domain with a sufficiently smooth boundary. Suppose $h:\Omega \to \R$ has weak derivatives up to order $k$ in $L^2(\Omega)$. Then there is a constant $C_{\mathrm{reg},k}>0$ depending on $\Omega$ such that
\begin{align*}
\|h\|_{H^k(\Omega)} \leq C_{\mathrm{reg},k} (\sum_{m=0}^{k-1} \|\nabla^m h\|_{L^2(\Omega)}).
\end{align*}
\end{lemma}



\subsection{Regularization Effect}\label{sec:app:k_regularization_effect}

In this section, we connect the $k$-th order loss function with the Sobolev norm of the estimation error.

\begin{lemma}[Regularization effect for $k$-th order loss]\label{lem:kth_reg_effect}
Recall the $k$-th order loss function from Definition~\ref{def:k_order_loss}:
\begin{align*}
L_{\mathrm{k-order}}(\theta_1, \dots, \theta_k) = \sum_{j=1}^k \mathbb{E} [\|(\dot{x}^{(j)}_t)^\mathrm{True} - u_{j,\theta_j} (u_{j-1,\theta_{j-1}}(\dots), x_t, t)\|^2 ].
\end{align*}

If $(\dot{x}_t^\True)^{(k)} \in H^k(\Omega)$ and $L_{\mathrm{k-order}}$ is sufficiently small, then there exists a constant $C_{\mathrm{regk}}$ such that
\begin{align*}
\|(\dot{x}_t^\Est)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)} \|_{H^k(\Omega)}
\leq C_{\mathrm{regk}} (L_{\mathrm{k-order}}^{1/2} +  \| (\dot{x}_t^\Est)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)}  \|_{L^2(\Omega)}).    
\end{align*}
\end{lemma}
\begin{proof}
    Applying Lemma~\ref{lem:pde_bound_k} to $h(\cdot) = (\dot{x}_t^\Est)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)}$, we have 
    \begin{align}\label{eq:lesssim_1_kth}
       & ~ \| (\dot{x}^\Est_t)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)} \|_{H^k(\Omega)} \notag\\
      \leq & ~ C_{\mathrm{regk}} (\sum_{m=0}^{k-1} \|\nabla^m h\|_{L^2(\Omega)}).
    \end{align}
    By the definition of the loss function (Definition~\ref{def:k_order_loss}), a small $L_{\mathrm{k-order}}$ implies
    \begin{align}\label{eq:lesssim_2_kth}
        \| (\dot{x}^\Est_t)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)} \|_{L^2(\Omega)}\lesssim L^{1/2}_{\mathrm{k-order}}.
    \end{align}
    Combining Eq.\eqref{eq:lesssim_1_kth} and \eqref{eq:lesssim_2_kth}, we have
    \begin{align*}
    & ~ \|(\dot{x}^\Est_t)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)}\|_{H^k(\Omega)} \\ 
    \leq & ~ C_{\mathrm{regk}} (L_{\mathrm{k-order}}^{1/2} + \| (\dot{x}^\Est_t)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)} \|_{L^2(\Omega)}).
\end{align*}
Thus, we complete the proof.
\end{proof}



\subsection{Excess Risk}\label{sec:app:k_excess_risk}
In this section, we present a result that quantifies the gap between the empirical and population loss, highlighting the strong generalization capabilities of our method even with a finite sample size.
\begin{lemma}[Excess risk for $k$-th order]\label{lem:excess_risk_kth}
As in Definition~\ref{def:k_order_loss} and \ref{def:k_emp_loss}, let
\begin{align*}
\wt{L}_{\mathrm{k-order}}  =  \frac{1}{N}\sum_{i=1}^N \sum_{j=1}^k
\| (\dot{x}^{(j)}_{t_i})^{\True,i} - u_{j,\theta_j}(u_{j-1,\theta_{j-1}}(\dots), x^i_{t_i}, t_i ) \|^2
\end{align*}
and 
\begin{align*}
L_{\mathrm{k-order}}(\theta_1,\dots,\theta_k)
 =  \sum_{j=1}^k \E  [\|\dot{x}_t^{(j),\True} - u_{j,\theta_j}(u_{j-1,\theta_{j-1}}(\dots), x_t, t)\|^2  ].
\end{align*}
Suppose each function class $\mathcal{F}_j$ has finite or at most polynomially growing complexity $\mathcal{C}(\mathcal{F}_j)$. Then for $\beta \in (0,1)$, with probability at least $1-\beta$, we have
\begin{align*}
    | \wt{L}_{\mathrm{k-order}} - L_{\mathrm{k-order}} | \leq O( ( (\sum_{i=1}^k \mathcal{C}(\mathcal{F}_i) + \ln(1/\beta))/N )^{1/2} ).
\end{align*}
\end{lemma}
\begin{proof}
    Let $\G = \{\ell_\theta : \theta \in \Theta \}$ denote the class of loss functions, and let $C(\G)$ denote its complexity (e.g., the Rademacher complexity or VC dimension). As defined in Definitions~\ref{def:k_order_loss} and~\ref{def:k_emp_loss}, the empirical loss and population loss are
    \begin{align*}
    \wt{L}_{\mathrm{k-order}}  =  \frac{1}{N}\sum_{i=1}^N \sum_{j=1}^k
    \| (\dot{x}^{(j)}_{t_i})^{\True,i} - u_{j,\theta_j}(u_{j-1,\theta_{j-1}}(\dots), x^i_{t_i}, t_i ) \|^2
    \end{align*}
    and
    \begin{align*}
        L_{\mathrm{k-order}}(\theta_1,\dots,\theta_k)
        =  \sum_{j=1}^k \E  [\|\dot{x}_t^{(j),\True} - u_{j,\theta_j}(u_{j-1,\theta_{j-1}}(\dots), x_t, t)\|^2  ].
    \end{align*}
    Let $\{x_i'\}_{i=1}^N$ be an i.i.d.\ sample from the same distribution as $\{x_i\}_{i=1}^N$, 
    and let $\{\sigma_i\}_{i=1}^N$ be i.i.d.\ Rademacher random variables ($\sigma_i \in\{+1,-1\}$ with probability $1/2$ each).  Then, for any $g\in \G$, we have
    \begin{align}\label{eq:bound_sum_E_kth}
    \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^{N} g(x_i) - \E [g(x)] \| 
   \leq   \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N (g(x_i) - g(x_i')) \| 
    + \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N g(x_i') - \E [g(x)]\|.
\end{align}


We can upper bound the first term in Eq.~\eqref{eq:bound_sum_E_kth},
\begin{align}\label{eq:bound_sum_E_part1_kth}
 & ~ \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N (g(x_i) - g(x_i')) \| \notag \\
 \leq & ~ \frac{2}{N} \E_{\sigma} [\sup_{g \in \G} \sum_{i=1}^N \sigma_i g(x_i)] \notag \\
 = & ~ 2 \wt{ {\cal R} }_N( {\cal G} ) \notag \\
 \leq & ~2 \cdot  O ( \sqrt{ C( {\cal G}  ) / N  } ) \notag \\
 \leq  & ~ O( ( \sum_{i=1}^k C({\cal F}_i) /N )^{1/2} ) 
\end{align}
where the first step follows from Lemma~\ref{lem:first_half_excess_risk_formal}, the second step follows from the definition $\wt{\mathcal{R}}_N(\G) := \E_\sigma [\sup_{g \in \G} \frac{1}{N} \sum_{i=1}^N \sigma_i g(x_i)]$, the third step follows from Assumption~\ref{ass:rademacher}, and the fourth step follows from the definition of $\G$ in the $k$-th order case.

We can upper bound the second term in Eq.~\eqref{eq:bound_sum_E_kth} by using Lemma~\ref{lem:part2_excess_risk_bound_informal},
\begin{align}\label{eq:bound_sum_E_part2_kth}
    \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^N g(x_i') - \E [g(x)]\| \leq  O(\sqrt{\ln (1/ \beta)/N})
\end{align}


Combining Eq.~\eqref{eq:bound_sum_E_part1_kth} and Eq.~\eqref{eq:bound_sum_E_part2_kth}, we obtain
\begin{align*}
    \sup_{g \in \G}\| \frac{1}{N} \sum_{i=1}^{N} g(x_i) - \E [g(x)] \|  
    \leq  O( ( (\sum_{i=1}^k C({\cal F}_i) +\ln(1 / \beta) ) /N )^{1/2} )
\end{align*}
\end{proof}


\subsection{Discrete Propagation}\label{sec:app:k_discrete_propagation}

In this section, we show the lemma about discrete propagation under noise quantifies how noise in the trajectory affects the error propagation in a discrete setting for $k$-th order flow.

\begin{lemma}[Discrete propagation under noise for \texorpdfstring{$k$}{k}-th order]
\label{lem:prog_noise_k}
Under Assumptions~\ref{ass:k_discrete_time} and~\ref{ass:k_lipschitz}, let
\begin{align*}
e_l  =  \| x_l^\Est - x_l^\True\|_{H^k(\Omega)}.
\end{align*}
Then there is a constant $C_{\mathrm{prop},k}>0$ such that
\begin{align*}
e_{l+1} \leq (1 + \Delta t  C_{\mathrm{prop},k} )  e_l + C_{\mathrm{prop},k} \delta \epsilon \Delta t.
\end{align*}
Iterating from $l=0$ to $l=L-1$ (where $\Delta t = 1/L$) gives
\begin{align*}
e_L \leq e_0  \exp (C_{\mathrm{prop},k} ) + \frac{\delta \epsilon}{C_{\mathrm{prop},k}} (\exp (C_{\mathrm{prop},k} ) - 1 ).
\end{align*}
\end{lemma}
\begin{proof}
    By Assumptions~\ref{ass:k_discrete_time} and \ref{ass:k_lipschitz}, we have
    \begin{align*}
    x_{l+1}^\Est = & ~ x_l^\Est 
    + \Delta t    u_{1,\theta_1}( x_l^\Est, t_l )
    + \frac{(\Delta t)^2}{2}  u_{2,\theta_2}  (u_{1,\theta_1}(\cdot), x_l^\Est, t_l )
    + \frac{(\Delta t)^3}{6}  u_{3,\theta_3}  (u_{2,\theta_2}(\cdot), x_l^\Est, t_l ) \\
    &~ + \sum_{j=4}^k \frac{(\Delta t)^j}{j!} u_{j,\theta_j}  (u_{j-1,\theta_{j-1}} (\dots u_{1,\theta_1}(x_l^\Est, t_l)\dots ), x_l^\Est, t_l),
    \\
    x_{l+1}^\True &= x_l^\True 
    + \Delta t    \dot{x}_l^\True 
    + \frac{(\Delta t)^2}{2}  \ddot{x}_l^\True
    + \frac{(\Delta t)^3}{6}  \dddot{x}_l^\True
    + \sum_{j=4}^k \frac{(\Delta t)^j}{j!}(\dot{x}_l^\True)^{(j)}.
    \end{align*}
    Subtracting the true update from the estimated one and taking the $H^k$-norm, the difference involves
    Lipschitz constants, the prior error $e_l$, and a noise term bounded by $\delta  \epsilon$. One obtains 
    \begin{align*}
    \|x_{l+1}^\Est - x_{l+1}^\True\|_{H^k(\Omega)} \leq
    (1 + \Delta t \cdot C_{\mathrm{prop,k}}) \|x_l^\Est -x_l^\True\|_{H^k(\Omega)} + C_{\mathrm{prop,k}} \delta \epsilon \Delta t.
    \end{align*}
    where $C_{\mathrm{prop,k}}$ depends on the Lipschitz constants of $u_{1,\theta_1}$, $u_{2,\theta_2}$, \dots, $u_{k,\theta_k}$, and the boundedness of $(\dot{x}_t^\True)^{(k-1)}$ and $(\dot{x}_t^\True)^{(k)}$. 

    Repeating this inequality from $l=0$ to $l=L-1$ and noting $\Delta t = 1/L$, by a discrete Gronwall argument we have
    \begin{align*}
    e_L = & ~ \|x_L^\Est - x_L^\True\|_{H^k(\Omega)} \\
    \leq & ~ e_0  \exp (C_{\mathrm{prop,k}} ) + \sum_{l=0}^{L-1}  (C_{\mathrm{prop,k}}  \delta \epsilon \Delta t  \prod_{j=l+1}^{L-1} (1 + \Delta t C_{\mathrm{prop,k}}))
    \\
    \leq & ~ e_0  \exp (C_{\mathrm{prop,k}} ) + \frac{\delta \epsilon}{C_{\mathrm{prop,k}}}  (\exp (C_{\mathrm{prop,k}} ) - 1).
    \end{align*}
    This completes the proof.
\end{proof}

\subsection{Main result for \texorpdfstring{$k$}{k}-th Order Noise Robustness}\label{sec:app:k_main}

In this section, we formally state and prove our main result, leveraging the auxiliary lemmas to demonstrate the robustness of the learned trajectory against noise and the effects of finite sample sizes for $k$-th order flow.

\begin{theorem}[Noise robustness for \texorpdfstring{$k$}{k}-th order flow matching]
\label{thm:main_k}
Suppose Assumptions~\ref{ass:smooth_k}, \ref{ass:k_discrete_time}, \ref{ass:k_lipschitz}, and \ref{ass:noise_bound} hold. Let
\begin{align*}
\wt{\theta} =(\wt{\theta}_1,\dots,\wt{\theta}_k)
\end{align*}
be an approximately optimal solution minimizing the empirical $k$-th order loss in Definition~\ref{def:k_emp_loss}. Then, with probability at least $1-\beta$, the final-time estimate $x_L^\Est$ satisfies:
\begin{align*}
\| x_L^\Est - x_L^\True \|_{H^k(\Omega)} \leq C_1''  \exp(C_2'')  (e_0 + \delta \epsilon ) + C_3'' ( ( \sum_{i=1}^{k}\mathcal{C}(\F_i) +\ln(1/\beta))/N)^{1/2},
\end{align*}
where $e_0=\| x_0^\Est - x_0^\True \|_{H^k(\Omega)}$ and $C_1'', C_2'', C_3''$ depend on the Lipschitz constants, Sobolev embedding constants, and dimension $d$. The term $e^{C_2''}$ arises from the discrete Gronwall factor over $[0,1]$.
\end{theorem}
\begin{proof}
    Let $\wt{\theta} = (\wt{\theta}_1, \dots, \wt{\theta}_k)$ denote the approximately optimal solution for the empirical loss. Using Lemma~\ref{lem:excess_risk_kth}, we have 
    \begin{align*}
    | \wt{L}_{\mathrm{k-order}} - L_{\mathrm{k-order}} | \leq O( ( (\sum_{i=1}^k \mathcal{C}(\mathcal{F}_i) + \ln(1/\beta))/N )^{1/2} ).
    \end{align*}
Therefore, under the true distribution, $\dot{x}_t^\Est$ and $\ddot{x}_t^\Est$ approximate $\dot{x}_t^\True$ and $\ddot{x}_t^\True$ well in an $L^2$ sense, and the same holds for the higher-order fields.

Since $(\dot{x}_t^\True)^{(k)} \in H^k(\Omega)$, we then apply Lemma~\ref{lem:kth_reg_effect}, which leverages Assumption~\ref{ass:smooth_k}. If $L_{\mathrm{k-order}}$ is small, there exists a constant $C_{\mathrm{regk}}$ such that 
\begin{align*}
\|(\dot{x}_t^\Est)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)} \|_{H^k(\Omega)}
\leq C_{\mathrm{regk}} (L_{\mathrm{k-order}}^{1/2} +  \| (\dot{x}_t^\Est)^{(k-1)} - (\dot{x}_t^\True)^{(k-1)}  \|_{L^2(\Omega)}).    
\end{align*}
 We conclude that the learned fields are also close in the stronger $H^k(\Omega)$ norm. Under Assumptions~\ref{ass:noise_bound} and~\ref{ass:k_discrete_time}, for uniform time steps $\Delta t = 1/L$, the updates for the estimated and true trajectories are 
\begin{align*}
    x_{l+1}^\Est = & ~ x_l^\Est 
    + \Delta t    u_{1,\theta_1}( x_l^\Est, t_l )
    + \frac{(\Delta t)^2}{2}  u_{2,\theta_2}  (u_{1,\theta_1}(\cdot), x_l^\Est, t_l )
    + \frac{(\Delta t)^3}{6}  u_{3,\theta_3}  (u_{2,\theta_2}(\cdot), x_l^\Est, t_l ) \\
    &~ + \sum_{j=4}^k \frac{(\Delta t)^j}{j!} u_{j,\theta_j}  (u_{j-1,\theta_{j-1}} (\dots u_{1,\theta_1}(x_l^\Est, t_l)\dots ), x_l^\Est, t_l),
    \\
    x_{l+1}^\True &= x_l^\True 
    + \Delta t    \dot{x}_l^\True 
    + \frac{(\Delta t)^2}{2}  \ddot{x}_l^\True
    + \frac{(\Delta t)^3}{6}  \dddot{x}_l^\True
    + \sum_{j=4}^k \frac{(\Delta t)^j}{j!}(\dot{x}_l^\True)^{(j)}.
\end{align*}
Subtracting the two updates, taking the $H^k$-norm, and invoking the Lipschitz condition in Assumption~\ref{ass:k_lipschitz}, we obtain from Lemma~\ref{lem:prog_noise_k} that
\begin{align*}
         e_{l+1} = & ~ \| x_{l+1}^\Est - x_{l+1}^\True \|_{H^k(\Omega)} \leq (1+ \Delta t \cdot C_{\mathrm{prop,k}}) e_l + C_{\mathrm{prop,k}} \Delta t \cdot \delta \cdot \epsilon.
     \end{align*}
Here $C_{\mathrm{prop,k}}$ depends on Lipschitz constants and bounds on $\dot{x}$, $\ddot{x}, \dots, \dot{x}_t^{(k)}$. Iterating the above and applying a discrete Gronwall argument shows
\begin{align*}
    e_L = & ~ \| x_L^\Est - x_L^\True \|_{H^k(\Omega)} \leq e_0 \exp (C_{\mathrm{prop,k}}) + \frac{\delta \cdot \epsilon}{C_{\mathrm{prop,k}}} (\exp (C_{\mathrm{prop,k}}) - 1).
\end{align*}
Since $\exp(C_{\mathrm{prop,k}})$ is just a constant factor, we denote it by $e^{C_2''}$. Combining all of these terms yields the exact form of Theorem~\ref{thm:main_k}:
\begin{align*}
\| x_L^\Est - x_L^\True \|_{H^k(\Omega)} \leq C_1''  \exp(C_2'')  (e_0 + \delta \epsilon ) + C_3'' ( ( \sum_{i=1}^{k}\mathcal{C}(\F_i) +\ln(1/\beta))/N)^{1/2}.
\end{align*}
Thus, we complete the proof.
\end{proof}