\section{Results}\label{sec:results}


\subsection{Assumptions}


We make the following mild assumptions on the data distribution $\qdata$ and on the score estimate $s$.

\begin{asm}[second moment bound]\label{ass:second_moment}
    We assume that $\mf m_2^2 \triangleq \E_{\qdata}[\norm \cdot^2] < \infty$.
\end{asm}

\begin{asm}[Lipschitz score]\label{ass:lip_score}
    For all $t\in [0,T]$, the score $\nabla \ln q_t$ is $L$-Lipschitz, for some $L\ge 1$.
\end{asm}

\begin{asm}[Lipschitz score estimate]\label{ass:lip_estimate}
    For all $t$ for which we need to estimate the score function in our algorithms, the score estimate $s_t$ is $L$-Lipschitz.
\end{asm}

\begin{asm}[score estimation error]\label{ass:score_error}
    For all $t$ for which we need to estimate the score function in our algorithms,
    \begin{align*}
        \E_{q_t}[\norm{s_t - \nabla \ln q_t}^2] \le \esc^2\,.
    \end{align*}
\end{asm}

\noindent Assumptions~\ref{ass:second_moment}, \ref{ass:lip_score}, and \ref{ass:score_error} are standard and were shown in \cite{Chenetal23diffmodels, CheLeeLiu23ImprovedSGM, leelutan23sgmgeneral} to suffice for obtaining polynomial-time convergence guarantees for DDPMs. The new condition that we require in our analysis is Assumption~\ref{ass:lip_estimate}, which was used in \cite{leelutan22sgmpoly} but ultimately shown to be unnecessary for DDPMs.
We leave it as an open question whether this additional assumption can be lifted in the ODE setting.

\begin{rmk}\label{rmk:wo_lip_score}
    As observed in \cite{Chenetal23diffmodels, CheLeeLiu23ImprovedSGM, leelutan23sgmgeneral}, Assumption~\ref{ass:lip_score} can be removed via early stopping, at the cost of polynomially larger iteration complexity. The idea is that if $\qdata$ has compact support but does not necessarily satisfy
    Assumption~\ref{ass:lip_score} (\emph{e.g.}, if $\qdata$ is supported on a compact and lower-dimensional manifold), then $\quo_\delta$ will satisfy Assumption~\ref{ass:lip_score} if $\delta > 0$.
    By applying our analysis up to time $T-\delta$ instead of time $T$, one can show that a suitable projection of the output distribution is close in Wasserstein distance to $\qdata$ (see~\cite[Corollary 2.4]{CheLeeLiu23ImprovedSGM} or~\cite[Corollary 5]{Chenetal23diffmodels}).
    For brevity, we do not consider this extension of our results here.
\end{rmk}

\newcommand{\nrounds}{N}
\newcommand{\tottime}{T}
\newcommand{\predtime}{W_{\sf pred}}
\newcommand{\predsteps}{M_{\sf pred}}
\newcommand{\predh}{h_{\sf pred}}
\newcommand{\corrtime}{W_{\sf corr}}
\newcommand{\corrsteps}{M_{\sf corr}}
\newcommand{\corrh}{h_{\sf corr}}


\subsection{Algorithms}\label{sec:algs}


We provide the pseudocode for the two algorithms we consider, \emph{Diffusion Predictor + Overdamped Modeling} (\DPOM{}) and \emph{Diffusion Predictor + Underdamped Modeling} (\DPUM{}), in Algorithms~\ref{alg:over} and~\ref{alg:under}, respectively. The only difference between the two algorithms is in the corrector step, which we \textcolor{orange}{highlight} in Algorithm~\ref{alg:under}.
For simplicity, we take the total amount of time $T$ to be equal to $N_0/L + \hpred$ for an integer $N_0 \ge 1$, and we assume that $1/L$ is a multiple of $\hpred$ and that $\hpred/\delta$ is a power of $2$ (so that the geometrically decreasing step sizes used in the second stage end exactly at $\delta$), where $\delta = \Theta\bigl(\frac{\varepsilon^2}{L^2 \,(d\vee \mf m_2^2)}\bigr)$.

We consider two stages: in the first stage, which lasts until time $N_0/L = T-\hpred$, we intersperse predictor epochs (run for time $1/L$,
discretized with step size $\hpred$) and corrector epochs (run for time $\Theta(1/L)$ for the overdamped corrector or for time $\Theta(1/\sqrt L)$ for the underdamped corrector, and discretized with step size $\hcorr$). The second stage lasts from time $T-\hpred$ to time $T-\delta$, and we incorporate geometrically decreasing step sizes for the predictor.
Note that this implies that our algorithm uses \emph{early stopping}.

\begin{algorithm}[ht]
\DontPrintSemicolon
\caption{\DPOM($\tottime, \predh, \corrh, s$)}\label{alg:over}
	\KwIn{Total time $\tottime$, predictor step size $\predh$, corrector step size $\corrh$, score estimates $s$}
	\KwOut{Approximate sample from the data distribution $\qdata$}
        Draw $\wh{x}_0 \sim \gamma^d$.\;
	\For{$n = 0,1,\dotsc, N_0 -1$}{
        \textbf{Predictor:} Starting from $\wh{x}_{n/L}$, run the discretized probability flow ODE~\eqref{eq:prob_ode_discrete} from time $\frac{n}{L}$ to $\frac{n+1}{L}$ with step size $\predh$ and estimated scores to obtain $\wh x_{(n+1)/L}'$.\;
        \textbf{Corrector:} Starting from $\wh{x}'_{(n+1)/L}$, run overdamped Langevin Monte Carlo for total time $\Theta(1/L)$ with step size $\corrh$ and score estimate $s_{(n+1)/L}$ to obtain $\wh{x}_{(n+1)/L}$. \;
    }
        \textbf{Predictor:} Starting from $\wh x_{T-\predh}$, run the discretized probability flow ODE~\eqref{eq:prob_ode_discrete} with step sizes $\predh/2, \predh/4, \predh/8,\dotsc, \delta$ and estimated scores to obtain $\wh x_{T-\delta}'$.\;
        \textbf{Corrector:} Starting from $\wh x_{T-\delta}'$, run overdamped Langevin Monte Carlo for total time $\Theta(1/L)$ with step size $\corrh$ and score estimate $s_{T-\delta}$ to obtain $\wh x_{T-\delta}$.\;
    \Return $\wh{x}_{T-\delta}$\;
\end{algorithm}

\begin{algorithm}[ht]
\DontPrintSemicolon
\caption{\DPUM($\tottime, \predh, \corrh, s$)}\label{alg:under}
	\KwIn{Total time $\tottime$, predictor step size $\predh$, corrector step size $\corrh$, score estimates $s$}
	\KwOut{Approximate sample from the data distribution $\qdata$}
        Draw $\wh{x}_0 \sim \gamma^d$.\;
	\For{$n = 0,1,\dotsc, N_0 -1$}{
        \textbf{Predictor:} Starting from $\wh{x}_{n/L}$, run the discretized probability flow ODE~\eqref{eq:prob_ode_discrete} from time $\frac{n}{L}$ to $\frac{n+1}{L}$ with step size $\predh$ and estimated scores to obtain $\wh x_{(n+1)/L}'$.\;
        \textbf{Corrector:} Starting from $\wh{x}'_{(n+1)/L}$, run \textcolor{orange}{underdamped Langevin Monte Carlo} for \textcolor{orange}{total time $\Theta(1/\sqrt L)$} with step size $\corrh$ and score estimate $s_{(n+1)/L}$ to obtain $\wh{x}_{(n+1)/L}$. \;
    }
        \textbf{Predictor:} Starting from $\wh x_{T-\predh}$, run the discretized probability flow ODE~\eqref{eq:prob_ode_discrete} with step sizes $\predh/2, \predh/4, \predh/8,\dotsc, \delta$ and estimated scores to obtain $\wh x_{T-\delta}'$.\;
        \textbf{Corrector:} Starting from $\wh x_{T-\delta}'$, run \textcolor{orange}{underdamped Langevin Monte Carlo} for \textcolor{orange}{total time $\Theta(1/\sqrt L)$} with step size $\corrh$ and score estimate $s_{T-\delta}$ to obtain $\wh x_{T-\delta}$.\;
    \Return $\wh{x}_{T-\delta}$\;
\end{algorithm}


\subsection{Convergence guarantees}


\noindent Our main results are the following convergence guarantees for the two predictor-corrector schemes described in \S\ref{sec:algs}:

\begin{thm}[\DPOM{}]\label{thm:pc_over}
    Suppose that Assumptions~\ref{ass:second_moment}--\ref{ass:score_error} hold. If $\widehat{q}$ denotes the output of \DPOM{} (Algorithm~\ref{alg:over}) with $\delta \asymp \frac{\varepsilon^2}{L^2 \, (d \vee \mf m_2^2)}$, then 
    \begin{equation}\label{eq:dpom_bd}
        \TV(\widehat{q},\qdata) \lesssim (\sqrt d \vee \mf m_2) \exp(-T) + L^2 T d^{1/2} \hpred + L^{3/2} T d^{1/2} \hcorr^{1/2} + L^{1/2} T\esc + \varepsilon\,.
    \end{equation}
    In particular, if we set $T=\Theta\bigl(\log(\frac{d \vee \mf m_2^2}{\varepsilon^2})\bigr)$, $\hpred = \widetilde \Theta(\frac{\varepsilon}{L^2 d^{1/2}})$, $\hcorr = \widetilde \Theta(\frac{\varepsilon^2}{L^3 d})$, and if the score estimation error satisfies $\esc \le \widetilde O(\frac{\varepsilon}{\sqrt L})$, then 
    we can obtain TV error $\varepsilon$ with a total iteration complexity of $\widetilde \Theta(\frac{L^3 d}{\varepsilon^2})$ steps.
\end{thm}

The five terms in the bound~\eqref{eq:dpom_bd} correspond, respectively, to: the convergence of the forward (OU) process; the discretization error from the predictor steps; the discretization error from the corrector steps; the score estimation error; and the early stopping error.

Theorem~\ref{thm:pc_over} recovers nearly the same guarantees as the ones in~\cite{Chenetal23diffmodels, CheLeeLiu23ImprovedSGM, leelutan23sgmgeneral}, but for the probability flow ODE with an overdamped Langevin corrector instead of the reverse SDE without a corrector.
Recall also from Remark~\ref{rmk:wo_lip_score} that our results can easily be extended to compactly supported data distributions without smooth score functions. This covers essentially all distributions encountered in practice.
Therefore, our result provides a compelling theoretical justification, hitherto absent from the literature, that complements the empirical efficacy of the probability flow ODE.

However, in Theorem~\ref{thm:pc_over}, the iteration complexity is dominated by the corrector steps.
Next, we show that by replacing the overdamped LMC with underdamped LMC, we can achieve a quadratic improvement in the dependence of the number of steps on the dimension $d$. As discussed in the Introduction, this highlights the potential benefits of the ODE framework over the SDE.

\begin{thm}[\DPUM{}]\label{thm:pc_under}
    Suppose that Assumptions~\ref{ass:second_moment}--\ref{ass:score_error} hold. If $\widehat{q}$ denotes the output of \DPUM{} (Algorithm~\ref{alg:under}) with $\delta \asymp \frac{\varepsilon^2}{L^2 \, (d\vee \mf m_2^2)}$, then 
    \begin{equation}\label{eq:dpum_bd}
        \TV(\widehat{q},\qdata) \lesssim (\sqrt d \vee \mf m_2) \exp(-T) + L^2 T d^{1/2} \hpred + L^{3/2} T d^{1/2} \hcorr + L^{1/2} T\esc + \varepsilon\,.
    \end{equation}
    In particular, if we set $T=\Theta\bigl(\log(\frac{d \vee \mf m_2^2}{\varepsilon^2})\bigr)$, $\hpred = \widetilde \Theta(\frac{\varepsilon}{L^2 d^{1/2}})$, $\hcorr = \widetilde \Theta(\frac{\varepsilon}{L^{3/2} d^{1/2}})$, and if the score estimation error satisfies $\esc \le \widetilde O(\frac{\varepsilon}{\sqrt L})$, then 
    we can obtain TV error $\varepsilon$ with a total iteration complexity of $\widetilde \Theta(\frac{L^2 d^{1/2}}{\varepsilon})$ steps.
\end{thm}