\section{Preliminary}\label{sec:pre}
In this section, we provide the foundational concepts, notations, and assumptions required for the subsequent theoretical developments. In Section~\ref{sec:notation}, we begin by listing the key notations employed throughout this work. In Section~\ref{sec:assumption}, we state the principal assumptions under which our analysis is conducted. Next, we introduce the flow-matching framework, along with its second-order extension, and highlight several important definitions in Section~\ref{sec:pre_flow_matching}. Finally, in Section~\ref{sec:proposed_method}, we provide our second-order algorithms.

\subsection{Notations}\label{sec:notation}

We use $\Pr[\cdot]$ to denote the probability. We use $\E[\cdot]$ to denote the expectation. We use $\var[\cdot]$ to denote the variance.
We use $\|x\|_p$ to denote the $\ell_p$ norm of a vector $x \in \R^n$, i.e. $\|x\|_1 := \sum_{i=1}^n |x_i|$, $\|x\|_2 := (\sum_{i=1}^n x_i^2)^{1/2}$, and $\|x\|_{\infty} := \max_{i \in [n]} |x_i|$. 
For variables $a,b$, we write $a \lesssim b$ to indicate that $a$ is bounded above by $b$ up to a multiplicative constant independent of the main parameters. We write $a \gtrsim b$ to indicate that $a$ is bounded below by $b$ up to a multiplicative constant independent of the main parameters. We denote $\dot{x}^{(k)}$ as the $k$-th order derivative field of $x$. We use $\|\cdot \|_{H^2(\Omega)}$ to denote the norm of the Sobolev space $H^2(\Omega) = W^{2,2}(\Omega)$, i.e., the space $W^{k,p}(\Omega)$ with $k=2$ and $p=2$.
\subsection{Assumptions}\label{sec:assumption}

We now outline the principal assumptions that underlie our theoretical analysis. These assumptions concern smoothness, Lipschitz continuity, bounded noise, function-class complexity, and time discretization. First, we state the assumption of smoothness and boundedness.

\begin{assumption}[Smoothness and boundedness]
\label{ass:smooth}
We assume the true trajectory $x_t^{\True}$ and its first and second derivatives are sufficiently smooth and bounded.
Specifically, $x_t^{\True} \in H^2(\Omega)$, and there exist constants $M_1, M_2 > 0$ such that
\begin{align*}
\|\dot{x}_t^{\True}\|_{H^2(\Omega)} \leq M_1,
\quad
\|\ddot{x}_t^{\True}\|_{H^2(\Omega)} \leq M_2.
\end{align*}
\end{assumption}

\begin{assumption}[Lipschitz continuity]
\label{ass:lipschitz}
The learned fields $u_{1,\theta_1}(x,t)$ and $u_{2,\theta_2}(v,x,t)$ are $L$-Lipschitz continuous in spatial and temporal arguments.
Formally, there exists $L > 0$ such that for all $x,y \in \R^d$ and $t,s \in [0,1]$:
\begin{align*}
\|u_{1,\theta_1}(x,t) - u_{1,\theta_1}(y,t)\|_2 &\leq L \|x-y\|_2 , \\
\|u_{2,\theta_2}(v,x,t) - u_{2,\theta_2}(v,y,t)\|_2 &\leq L \|x-y\|_2,
\end{align*}
and
\begin{align*}
\|u_{2,\theta_2}(v,x,t) - u_{2,\theta_2}(v,x,s)\|_2 &\leq L |t-s|.
\end{align*}
A similar condition holds for time differences of $u_{1,\theta_1}$.
\end{assumption}


\begin{assumption}[Bounded noise magnitude]
\label{ass:noise_bound}
There exists $\delta > 0$ such that the noise $\eta_i$ (see Definition~\ref{def:noise}) satisfies $\|\eta_i\|_2 \le \delta$ or $\mathbb{E}[\|\eta_i\|_2^2] \le \delta^2$.
This ensures that the noise does not grow without bound.
\end{assumption}

\begin{assumption}[Rademacher complexity or VC dimension]\label{ass:rademacher}
There exist function classes $\mathcal{F}_1, \mathcal{F}_2$ such that
\begin{align*}
u_{1,\theta_1}(\cdot , \cdot) \in \mathcal{F}_1, \quad  u_{2,\theta_2}(\cdot , \cdot, \cdot) \in \mathcal{F}_2.
\end{align*}
The complexity of each class is measured by $\mathcal{C}(\mathcal{F}_1)$ and $\mathcal{C}(\mathcal{F}_2)$.
\end{assumption}

\begin{assumption}[Bounded loss]\label{ass:subG_loss}
There exists some constant $Q>0$ such that for all $\theta \in \Theta$ and all $x \in {\cal X}$, the per-sample loss $l_{\theta} (x)$ satisfies $|l_\theta (x) | \leq Q$.
\end{assumption}

\begin{assumption}[Time discretization]\label{ass:discrete_time}
For the inference (deployment) stage, let $\Delta t = 1/L$ be the uniform step size, and define discrete times $t_l = l \Delta t$ for $l =0,1,\cdots, L$. The numerical scheme for forward integration is 
\begin{align*}
x_{l+1} = & ~ x_l + \Delta t \cdot u_{1,\theta_1}(x_l,t_l) \\
& ~ + \frac{(\Delta t)^2}{2} u_{2,\theta_2} (u_{1,\theta_1}(x_l,t_l), x_l, t_l).
\end{align*}
\end{assumption}

\begin{remark}
    In this paper, our derivation accounts for higher-order residual terms through discrete Gronwall-type analyses (see Lemma~\ref{lem:prog_noise}). Despite the discretization assumption potentially omitting explicit higher-order terms, we bound such cumulative effects over time by considering Lipschitz continuity and noise assumptions. This ensures that any leftover remainder does not cause unbounded error growth. 
\end{remark}


\subsection{Flow Matching and Rectified Flow}\label{sec:pre_flow_matching}

Next, we describe the general framework of flow matching and its second-order rectification. These concepts form the basis for our proposed method, as they integrate first and second-order information for trajectory estimation.


\begin{definition}[Easy error]\label{def:easy_error}
We define the first-order and second-order derivative errors at time $t$ as
\begin{align*}
    c_1(t) := \dot{x}^\Est_t - \dot{x}^\True_t, \quad c_2(t) := \ddot{x}^\Est_t - \ddot{x}^\True_t.
\end{align*}
\end{definition}


\begin{fact}\label{fac:one_second_order}
Let a field $x_t$ be defined as 
\begin{align*}
x_t = \alpha_t x_0 + \beta_t x_1,
\end{align*}
where $\alpha_t$ and $\beta_t$ are functions of $t$, and $x_0, x_1$ are constants. Then, the first-order time derivative $\dot{x}_t$ and the second-order time derivative $\ddot{x}_t$ can be calculated directly as
\begin{align*}
\dot{x}_t =  \dot{\alpha}_t x_0 + \dot{\beta}_t x_1~~\mathrm{and}~~\ddot{x}_t =&~ \ddot{\alpha}_t x_0 + \ddot{\beta}_t x_1.
%\dot{x}_t =&~ \dot{\alpha_t} x_0 + \dot{\beta_t} x_1, \\
%\ddot{x}_t =&~ \ddot{\alpha_t} x_0 + \ddot{\beta_t} x_1.
\end{align*}
\end{fact}


\begin{definition}[A variant of flow matching in \cite{flow_matching}]
\label{def:flow_matching}
Given two distributions $\mu_0$ and $\pi_0$ on $\mathbb{R}^d$, flow matching aims to learn a time-dependent velocity field
\begin{align*}
   v_\theta : \mathbb{R}^d \times [0,1] \;\to\; \mathbb{R}^d
\end{align*}
such that for any trajectory $x_t$ transporting $x_0 \sim \mu_0$ to $x_1 \sim \pi_0$, we have 
\begin{align*}
   \dot{x}_t \approx v_\theta(x_t,\, t).
\end{align*}
\end{definition}

\begin{remark}
    In practice, one often samples $(x_0, x_1)$ from $(\mu_0, \pi_0)$ and parameterizes $x_t$ (e.g. via interpolation) at intermediate times to build a training objective that matches the velocity field to the true time derivative $\dot{x}_t$.
\end{remark}

\begin{definition}[Second-order flow matching and loss] 
\label{def:second_order_flow_matching_loss_new}
We additionally learn an acceleration field
\begin{align*}
u_{2,\theta_2}(v, x, t), \quad \mathrm{where} \quad v = u_{1,\theta_1}(x,t),
\end{align*}
to approximate $\ddot{x}_t$.
Hence, the two-part (second-order) loss is:
\begin{align*}
   L_{\mathrm{2nd}}(\theta_1,\theta_2) 
   =&~  \underbrace{\mathbb{E}  [\|\dot{x}_t^\True - u_{1,\theta_1}(x_t,t)\|^2_2 ]}_{L_{2,1,\theta_1}} \\
   & ~ + \underbrace{\mathbb{E}  [\|\ddot{x}_t^\True - u_{2,\theta_2}  (u_{1,\theta_1}(x_t,t), x_t, t )\|^2_2 ]}_{L_{2,2,\theta_2,\theta_1}}.
\end{align*}

Here, $\dot{x}_t^\True$ and $\ddot{x}_t^\True$ are observed (or numerically approximated) true velocity and acceleration, while $u_{1,\theta_1}$ and $u_{2,\theta_2}$ are the networks to be trained.
\end{definition}

\begin{definition}[Trajectory and time parameterization]\label{def:trajectory_time}
Consider a continuous trajectory $\{ x_t \}_{t \in [0,1]} \subset \R^d$ connecting an initial distribution $\mu_0$ to a target data distribution $\pi_0$.
We assume $x_0 \sim \mu_0$ and $x_1 \sim \pi_0$.
\end{definition}


\begin{definition}[First and second order flow]
A first-order rectified flow is characterized by a velocity field 
$u_{1,\theta_1}(x,t)$ 
approximating 
$\dot{x}_t$.
A second-order rectified flow further involves an acceleration field 
$u_{2,\theta_2}(v,x,t)$, 
where 
$v = u_{1,\theta_1}(x,t)$ 
approximates 
$\dot{x}_t$, 
and 
$u_{2,\theta_2}$ 
approximates 
$\ddot{x}_t$.
\end{definition}

\begin{definition}[Sobolev space]\label{def:sobolev_space}
For a domain $\Omega \subset \R^d$, the Sobolev space $H^2(\Omega)$ is defined as
\begin{align*}
H^2(\Omega) = \{ f \in L^2(\Omega) : D^\alpha f \in L^2(\Omega), \forall |\alpha| \leq 2\}.
\end{align*}
We assume the trajectory $x_t(\omega)$ or its corresponding fields lie in such spaces, ensuring sufficient smoothness.
\end{definition}


\begin{definition}[Noisy data and noise proportion]\label{def:noise}
Let the training dataset $X = \{ x_i \}_{i=1}^N$ be drawn from $\pi_0$ but corrupted by noise.
We denote the noise proportion as $\epsilon = N_{\Noisy} / N$, where $N_{\Noisy}$ is the number of noisy samples among the $N$ training samples.
A noisy sample can be modeled as
\begin{align*}
x_i^{\Noisy} = x_i^{\Clean} + \eta_i,
\end{align*}
where $\eta_i$ satisfies the boundedness condition in Assumption~\ref{ass:noise_bound}.
\end{definition}

\begin{definition}[Error]\label{def:error}
Under the time discretization in Assumption~\ref{ass:discrete_time}, we define the error at step $k$ in the $H^2$-norm as
\begin{align*}
    e_k := \| x_k^\Est - x_k^\True \|_{H^2(\Omega)}.
\end{align*}
\end{definition}


\begin{definition}[Second-order loss function]\label{def:2nd}
The loss function for the second-order method contains two parts. The first part uses $\dot{x}_t$ from Fact~\ref{fac:one_second_order}, together with $x_t$ and $t$, to learn the function $u_{1,\theta_1}$; thus the loss is
\begin{align*}
    L_{2,1,\theta_1} := \| \dot{x}_t - u_{1,\theta_1}(x_t, t) \|_2^2.
\end{align*}
The second part uses $\ddot{x}_t$, $u_{1,\theta_1}(x_t,t)$, $x_t$, and $t$ to learn the function $u_{2,\theta_2}$; thus the loss is
\begin{align*}
    L_{2,2,\theta_2,\theta_1} := \| \ddot{x}_t - u_{2,\theta_2}( u_{1,\theta_1}(x_t, t), x_t, t) \|_2^2.
\end{align*}
Overall, the total loss is
\begin{align*} 
    L_{2,\theta} := L_{2,1,\theta_1} + L_{2,2,\theta_2,\theta_1}.
\end{align*}
\end{definition}

\begin{definition}[Empirical loss]\label{def:emp_loss}
    We define the empirical second-order loss as 
    %\begin{align*}
    $
        \wt{L}_{2,\theta} = \frac{1}{N} \sum_{i=1}^N l_{\theta}(x_i).
    $
    %\end{align*}
\end{definition}


\begin{definition}[Population loss]\label{def:pp_loss}
    We define the population second-order loss as 
    %\begin{align*}
    $
        L_{2,\theta} = \E [l_{\theta}(X)].
    $
    %\end{align*}
\end{definition}


\subsection{Proposed Method}\label{sec:proposed_method}

In this section, we summarize the second-order algorithms that arise from the definitions above. Due to space limitations, we defer the original first-order algorithm and our new third-order algorithms to the appendix.


\begin{algorithm}[!ht]\caption{Our new second-order training process}
\begin{algorithmic}[1]
\Procedure{2ndOrderForward}{$ $}
    \For{each iteration} 
    \State Randomly sample $x_0$ and time $t$, with target $x_1$
    \State $x_t \gets \alpha_t \cdot x_0 + \sqrt{1-\alpha_t^2} \cdot x_1$
    \State Compute gradient with respect to $L_{2,\theta}$ \Comment{see Def.~\ref{def:2nd}}
    \EndFor 
    \State \Return $u_1, u_2$ \Comment{Two network functions}
\EndProcedure
\end{algorithmic}
\end{algorithm}




\begin{algorithm}[!ht]\caption{Our new second-order inference algorithm} \label{alg:second_order_alg_4}
\begin{algorithmic}[1]
\Procedure{2ndOrderInference}{$u_1,u_2$}
    \State $x_0 \sim \mathcal{N}(0, I_d)$
    \State Initialize $x \gets x_0$
   
     \For{$t = 0, \Delta t, 2\Delta t, \dots, 1 - \Delta t$ with step $\Delta t = 0.01$} 
     \State $x \gets x + \Delta t \cdot u_1(x,t) + \frac{(\Delta t)^2}{2} \cdot u_2(u_1(x,t), x, t)$
    \EndFor 
    \State \Return $x$
\EndProcedure
\end{algorithmic}
\end{algorithm}

\begin{remark}
    Since our approach could use a separate neural network for each higher-order term, the overall complexity scales by a constant factor corresponding to the number of higher-order terms. Thus, the time complexity of our method matches that of previous first-order flow-matching methods up to this constant factor, which does not substantially increase the computational overhead beyond the original first-order algorithm. 
\end{remark}

\begin{remark}
    Our model parameterizes all orders of time derivatives, rather than only parameterizing higher-order derivatives (e.g., acceleration) and using numerical integration to compute lower-order derivatives (e.g., velocity). This technical choice ensures the numerical stability of our model and avoids introducing extra numerical errors during integration. In computationally resource-scarce settings where fewer model parameters are needed, our model can also flexibly consider parameterizing only higher-order time derivatives while deriving lower-order derivatives through numerical integration.
\end{remark}