\newcommand{\window}{W}


\section{Corrector step}\label{sec:corrector}


In \S\ref{sec:over} (resp.\ \S\ref{sec:under}), we will show that if $p$, $q$ are close in Wasserstein distance, then running the corrector step based on the overdamped (resp.\ underdamped) Langevin diffusion starting from $p$ and from $q$ for some amount of time results in distributions which are close in \emph{total variation} distance.
In the end-to-end analysis in \S\ref{sec:end_to_end}, we combine this ``Wasserstein to total variation'' regularization with the Wasserstein discretization analysis of the predictor step in \S\ref{sec:predictor} in order to establish our final results.


\subsection{Corrector via overdamped Langevin}\label{sec:over}


We will take the potential and score estimate defining the Markov kernels $\Plan$ and $\Plans$ from \S\ref{sec:diffusion} to be $U$ and $s$ respectively. Recall that these correspond respectively to running the overdamped Langevin diffusion with stationary distribution $q \propto \exp(-U)$ and running the discretized diffusion with score estimate $s$, both for time $h$.

The main result of this section is to show that $p\Plans^N$ and $q$ are close in total variation if $p$ and $q$ are close in Wasserstein.

\begin{thm}[Overdamped corrector]\label{thm:main_overdamped}
For any $\Tcorr\deq Nh \lesssim 1/L$,
\begin{equation}
    \TV(p\Plans^N,q) \lesssim W_2(p,q)/\sqrt{\Tcorr} + \esc\sqrt{\Tcorr} + L\sqrt{dh\Tcorr} \,.
\end{equation}
In particular, for $\Tcorr \asymp 1/L$,
\begin{equation}
    \TV(p\Plans^N,q) \lesssim \sqrt L \,W_2(p,q) + \esc/\sqrt{L} + \sqrt{Ldh} \,.
\end{equation}
\end{thm}

We will bound $\TV(p\Plan^N, q)$ and $\TV(p\Plans^N, p\Plan^N)$ separately. For the former, we use the following short-time regularization result:

\begin{lem}[{\cite[Lemma 4.2]{bobkov2001hypercontractivity}}]\label{lem:overdamp_reg}
    If $\Tcorr \lesssim 1/L$, then
    \begin{equation}
        \TV(p\Plan^N, q) \lesssim \sqrt{\KL(p\Plan^N \mmid q)} \lesssim W_2(p,q)/\sqrt{\Tcorr}\,.
    \end{equation}
\end{lem}
\begin{proof}
    The first inequality is Pinsker's inequality.
    The second inequality is a consequence of \cite[Lemma 4.2]{bobkov2001hypercontractivity}, which gives a bound of $\KL(p\Plan^{N}\mmid q) \lesssim L\,(1+1/(e^{2L\Tcorr}-1))\,W_2^2(p,q)$. The claim then follows from simplifying by using $\Tcorr \lesssim 1/L$.
\end{proof}

For the latter term, we introduce notation for two stochastic processes.
\begin{align*}
    \D \statx_t
    &= -\nabla U(\statx_t) \, \D t + \sqrt 2 \, \D B_t\,, & \statx_0 \sim q\,, \\
    \D x_t
    &= -\nabla U(x_t) \, \D t + \sqrt 2 \,\D B_t\,, & x_0 \sim p\,.
\end{align*}
Note that for any integer $k \ge 0$,
\begin{align*}
    \statx_{kh} \sim q\Plan^k\,, \qquad x_{kh} \sim p\Plan^k\,.
\end{align*}
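Observe that marginally, $q\Plan^k = q$ for any $k\ge 0$ because $q$ is the stationary distribution of the Langevin diffusion. We also write $\wh x$ for the discretized process driven by the score estimate $s$ and started at $\wh x_0 \sim p$, so that $\wh x_{kh} \sim p\Plans^k$. The three processes are coupled by using the same Brownian motion and by coupling $x_0 = \wh x_0 \sim p$ and $\statx_0 \sim q$ optimally.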

Before we proceed to bound $\TV(p\Plan^N, p\Plans^N)$, we need the following simple lemma.

\begin{lem}\label{lem:x_vs_stat}
    If $\Tcorr \lesssim 1/L$, then 
    \begin{equation}
        \E[\norm{x_t - \statx_t}^2] \lesssim W_2^2(p,q)
    \end{equation}
    for all $0 \le t \le \Tcorr$.
\end{lem}
\begin{proof}
    By It\^o's formula,
    \begin{equation}
        \D (\norm{x_t - \statx_t}^2) 
        = - 2\,\langle x_t - \statx_t, \nabla U(x_t) - \nabla U(\statx_t)\rangle \, \D t \le 2L\,\norm{x_t - \statx_t}^2 \, \D t\,. \label{eq:timederiv_0}
    \end{equation}
    By Gr\"{o}nwall's inequality, 
    \begin{equation}
        \norm{x_t - \statx_t}^2 \le e^{2Lt}\,\norm{x_0 - \statx_0}^2\,,
    \end{equation}
    so that if we couple the two processes by coupling $x_0$ and $\statx_0$ optimally, we conclude that
    \begin{equation}
        \E[\norm{x_t - \statx_t}^2] \le e^{2Lt} \E[\norm{x_0 - \statx_0}^2] = e^{2Lt}\,W_2^2(p,q) \lesssim W_2^2(p,q)\,,
    \end{equation}
    recalling that $t\le \Tcorr \lesssim 1/L$ by hypothesis.
\end{proof}

\noindent It remains to bound $\TV(p\Plans^N, p\Plan^N)$.

\begin{lem}\label{lem:mut_nut_over}
    If $\Tcorr \lesssim 1/L$, then
    \begin{align*}
        \TV(p\Plans^N,p\Plan^N)
        \lesssim  \sqrt{\KL(p\Plan^N \mmid p\Plans^N)}
        \lesssim L\sqrt{\Tcorr}\,W_2(p,q) + \esc\sqrt{\Tcorr} + L\sqrt{dh\Tcorr}\,.
    \end{align*}
\end{lem}
\begin{proof}
    As $x$ and $\wh x$ are driven by the same Brownian motion, by Girsanov's theorem\footnote{Although the validity of Girsanov's theorem typically requires Novikov's condition to be satisfied, this can be avoided via an approximation argument as in~\cite{Chenetal23diffmodels}.} and the data processing inequality we have
    \begin{equation}
        \KL(p\Plan^N\mmid p\Plans^N) \lesssim \sum^{N-1}_{k=0} \int^{(k+1)h}_{kh} \E[\norm{s(x_{kh}) - \nabla U(x_u)}^2] \, \D u\,.
    \end{equation}
    We can decompose the integrand as follows:
    \begin{align}
        \E[\norm{s(x_{kh}) - \nabla U(x_u)}^2]
        &\lesssim \E\bigl[\norm{s(x_{kh}) - s(\statx_{kh})}^2 + \norm{s(\statx_{kh}) - \nabla U(\statx_{kh})}^2 \\
        &\qquad\qquad{} + \norm{\nabla U(\statx_{kh}) - \nabla U(\statx_u)}^2 + \norm{\nabla U(\statx_u) - \nabla U(x_u)}^2\bigr] \\
        &\le L^2\E[\norm{x_{kh} - \statx_{kh}}^2] + \esc^2 + L^2\E[\norm{\statx_{kh} - \statx_u}^2] + L^2\E[\norm{\statx_u - x_u}^2] \\
        &\lesssim L^2 \,W_2^2(p,q) + \esc^2 + L^2 \E[\norm{\statx_{kh} - \statx_u}^2]\label{eq:girsanov_over}
    \end{align}
    where we used Lemma~\ref{lem:x_vs_stat} to bound $\E[\norm{\statx_u - x_u}^2]$.

    It remains to bound $\E[\norm{\statx_{kh} - \statx_u}^2]$. Note that
    \begin{align*}
        \E[\norm{\statx_{kh} - \statx_u}^2]
        &= \E\Bigl[\Bigl\|\int^u_{kh} -\nabla U(\statx_s)\, \D s + \sqrt 2\,(B_u - B_{kh})\Bigr\|^2\Bigr]
        \lesssim h\int^u_{kh} \E[\norm{\nabla U(\statx_s)}^2]\, \D s + dh\\
        &\le Ldh^2 + dh \lesssim dh\,,
    \end{align*}
    where in the second-to-last step we used that $\E[\norm{\nabla U(\statx_s)}^2] \le Ld$ for every $s$, and in the last step that $h \le \Tcorr \lesssim 1/L$. Substituting this into \eqref{eq:girsanov_over}, we obtain
    \begin{align}
        \KL(p\Plan^N\mmid p\Plans^N)
        &\lesssim L^2 \Tcorr\,W_2^2(p,q) + \esc^2 \Tcorr + L^2dh\Tcorr\,.
    \end{align}
    The claimed bound on $\TV(p\Plan^N,p\Plans^N)$ follows by Pinsker's inequality.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{thm:main_overdamped}]
    This is immediate from Lemma~\ref{lem:overdamp_reg} and Lemma~\ref{lem:mut_nut_over}, recalling that $\Tcorr \lesssim 1/L$ so that the bound in Lemma~\ref{lem:overdamp_reg} dominates the $W_2(p,q)$ term in Lemma~\ref{lem:mut_nut_over}.
\end{proof}


\subsection{Corrector via underdamped Langevin}\label{sec:under}


Throughout, we set the friction parameter to
\begin{equation}
    \fric \asymp \sqrt{L} \,.
\end{equation}
We will take the potential and score estimate defining the Markov kernels $\Puld$ and $\Pulds$ from \S\ref{sec:diffusion} to be $U$ and $s$ respectively. Recall that these correspond respectively to running the underdamped Langevin diffusion with stationary distribution $q$ and running the discretized diffusion with score estimate $s$, both for time $h$. 

Given probability measures $p$ and $q$, we write $\bs p \deq p \otimes \gamma_d$ and $\bs q \deq q \otimes \gamma_d$, where $\gamma_d$ is the standard Gaussian measure in $\R^d$.

The main result of this section is to show that $\bs p\Pulds^N$ and $\bs q$ are close in total variation if $p$ and $q$ are close in Wasserstein.
Compared to \S\ref{sec:over}, the discretization error for the underdamped Langevin diffusion is smaller.

\begin{thm}[Underdamped corrector]\label{thm:main_underdamped}
    For $\Tcorr \lesssim 1/\sqrt{L}$,
    \begin{align*}
        \TV(\bs p \Pulds^N, \bs q)
        &\lesssim \frac{W_2(p,q)}{L^{1/4} \Tcorr^{3/2}} + \frac{\esc \Tcorr^{1/2}}{L^{1/4}} + L^{3/4} \Tcorr^{1/2} d^{1/2} h\,.
    \end{align*}
    In particular, if we take $\Tcorr\asymp 1/\sqrt L$, then
    \begin{align*}
        \TV(\bs p \Pulds^N, \bs q)
        &\lesssim \sqrt L \,W_2(p,q) + \esc/\sqrt L + \sqrt{Ld}\, h\,.
    \end{align*}
\end{thm}

\noindent We will bound $\TV(\bs p\Puld^N, \bs q)$ and $\TV(\bs p\Pulds^N,\bs p\Puld^N)$ separately. For the former, we use the short-time regularization result of~\cite{guillin2012degenerate}:

\begin{lem}\label{lem:underdamp_reg}
    If $\Tcorr\lesssim 1/\sqrt{L}$, then
    \begin{equation}
        \TV(\bs p\Puld^N, \bs q) \lesssim \sqrt{\KL(\bs p \Puld^N \mmid \bs q)} \lesssim \frac{W_2(p, q)}{L^{1/4} \Tcorr^{3/2}}\,.
    \end{equation}
\end{lem}
\begin{proof}
    This is a consequence of \cite[Corollary 4.7 (1)]{guillin2012degenerate}. The condition to check therein is their Eq.\ (3.6), which in our setting is satisfied by the constants $K_1 = L$ and $K_2 = \fric$. The Corollary then states that for the cost function 
    \begin{equation}
        c_{\Tcorr}((z,v),(z',v')) \triangleq \inf_{t\in(0,\Tcorr]} \frac{t}{2\fric}\,\Bigl\{ \Bigl(\frac{6}{t^2} + L + \frac{3\fric}{2t}\Bigr)\,\norm{z-z'} + \Bigl(\frac{4}{t} + \frac{4Lt}{27} + \fric\Bigr)\,\norm{v-v'}\Bigr\}^2\,,
    \end{equation}
    we have $\KL(\bs p\Puld^N\mmid \bs q) \le W_{c_{\Tcorr}}(p\otimes \gamma_d,q\otimes\gamma_d)$. For $v = v'$ and $\Tcorr\lesssim 1/\sqrt{L}$, note that
    \begin{equation}
        c_{\Tcorr}((z,v),(z',v)) \lesssim \frac{1}{L^{1/2} \Tcorr^3}\,\norm{z-z'}^2\,,
    \end{equation}
    so the claim follows by Pinsker's inequality.
\end{proof}

Next, we define the following processes: $\D \statz_t = \statv_t \, \D t$, $\D z_t = v_t \, \D t$,
\begin{align*}
    \D \statv_t
    &= -\fric \statv_t \, \D t -\nabla U(\statz_t) \, \D t + \sqrt{2\fric}\,\D B_t\,, & (\statz_0, \statv_0) \sim \bs q\,, \\
    \D v_t
    &= -\fric v_t \, \D t - \nabla U(z_t) \, \D t + \sqrt{2\fric}\,\D B_t\,, & (z_0, v_0) \sim \bs p\,.
\end{align*}
It follows that for any integer $k\ge 0$,
\begin{align*}
    (\statz_{kh}, \statv_{kh}) \sim \bs q \Puld^k = \bs q\,, \qquad (z_{kh}, v_{kh}) \sim \bs p \Puld^k\,.
\end{align*}
We couple these processes by using the same Brownian motion and coupling $q\otimes\gamma_d$ and $p\otimes\gamma_d$ optimally (in particular, $v_0 = \statv_0$).

\noindent Before we proceed to bound $\TV(\bs p\Puld^N,\bs p\Pulds^N)$, we start with the following lemma.

\begin{lem}\label{lem:nu_vs_stat}
    If $\Tcorr \lesssim 1/\sqrt{L}$, then for all $0 \le t \le \Tcorr$,
    \begin{equation}
        \E[\norm{z_t - \statz_t}^2] \lesssim W_2^2(p,q)\,.
    \end{equation}
\end{lem}
\begin{proof}
    We have
    \begin{equation}
        \nabla U(z_t) - \nabla U(\statz_t) = \Bigl(\int^1_0 \nabla^2 U(z_t + u\,(\statz_t - z_t))\, \D u\Bigr)\, (z_t - \statz_t) \triangleq \calH_t (z_t - \statz_t)\,,
    \end{equation}
    and the operator $\calH_t$ satisfies
    \begin{equation}
        \norm{\calH_t}_{\op} \le L\,. \label{eq:Hbound}
    \end{equation}
    Let $\alpha \triangleq 2/\fric$. For the vectors $\delta_t \triangleq (z_t + \alpha v_t) - (\statz_t + \alpha \statv_t)$ and $\eta_t \triangleq z_t - \statz_t$, we have
    \begin{align}
        \frac{1}{2}\,\D(\norm{\delta_t}^2 + \norm{\eta_t}^2) &= - (\delta_t, \eta_t)^\top \begin{bmatrix}
            (\fric - \frac{1}{\alpha})\,I_d & \frac{1}{2}\,(\alpha\calH_t - \fric\,I_d) \\
            \frac{1}{2}\,(\alpha\calH_t - \fric\,I_d) & \frac{1}{\alpha}\,I_d
        \end{bmatrix}(\delta_t,\eta_t) \, \D t \\
        &\lesssim \sqrt{L}\,(\norm{\delta_t}^2 + \norm{\eta_t}^2) \, \D t \,. \label{eq:timederiv}
    \end{align}
    By Gr\"onwall's inequality, 
    \begin{equation}
        \norm{\delta_t}^2 + \norm{\eta_t}^2 \le e^{O(\sqrt{L} t)}\,(\norm{\delta_0}^2 + \norm{\eta_0}^2)\,,    
    \end{equation}
    so if we couple the two processes by coupling $z_0$ and $\statz_0$ optimally and taking $v_0 = \statv_0$, we obtain
    \begin{equation}
        \E[\norm{z_t - \statz_t}^2]
        \lesssim \E[\norm{\delta_t}^2 + \norm{\eta_t}^2] \le e^{O(\sqrt{L} t)} \E[\norm{\delta_0}^2 + \norm{\eta_0}^2] \lesssim e^{O(\sqrt{L} t)}\, W_2^2(p,q) \lesssim W_2^2(p,q)\,, \label{eq:nu_vs_stat_W2}
    \end{equation}
    recalling that $t\le \Tcorr \lesssim 1/\sqrt{L}$ by hypothesis.
\end{proof}

\noindent It remains to bound $\TV(\bs p\Pulds^N,\bs p\Puld^N)$. 
    
\begin{lem}\label{lem:mut_nut}
If $\Tcorr\lesssim 1/\sqrt{L}$, then
\begin{align*}
    \TV(\bs p\Pulds^N, \bs p\Puld^N)
    &\lesssim \sqrt{\KL(\bs p\Puld^N \mmid\bs p\Pulds^N)} \\
    &\lesssim L^{3/4} \Tcorr^{1/2} \, W_2(p,q) + L^{-1/4} \Tcorr^{1/2} \esc + L^{3/4} \Tcorr^{1/2} d^{1/2} h\,.
\end{align*}
\end{lem}
\begin{proof}
    As the process $(z, v)$ and the discretized process with score estimate $s$ (whose law at time $kh$ is $\bs p\Pulds^k$) are driven by the same Brownian motion, 
    by Girsanov's theorem\footnote{Again, we can avoid checking Novikov's condition using the approximation argument of~\cite{Chenetal23diffmodels}.} and the data processing inequality we have
    \begin{equation}
        \KL(\bs p\Puld^N\mmid\bs p\Pulds^N) \lesssim \frac{1}{\fric} \sum^{N-1}_{k=0} \int^{(k+1)h}_{kh} \E[\norm{s(z_{kh}) - \nabla U(z_u)}^2] \, \D u \,.
    \end{equation}
    We can decompose the integrand as follows:
    \begin{align}
        \E[\norm{s(z_{kh}) - \nabla  U(z_u)}^2] &\lesssim \E\bigl[\norm{s(z_{kh}) - s(\statz_{kh})}^2 + \norm{s(\statz_{kh}) - \nabla U(\statz_{kh})}^2 \\
        &\qquad\qquad{} + \norm{\nabla U(\statz_{kh}) - \nabla  U(\statz_u)}^2  + \norm{\nabla  U(\statz_u) - \nabla  U(z_u)}^2\bigr] \\
        &\le L^2\E[\norm{z_{kh} - \statz_{kh}}^2] + \esc^2 + L^2\E[\norm{\statz_{kh} - \statz_u}^2] + L^2\E[\norm{\statz_u - z_u}^2] \\
        &\lesssim L^2 \,W_2^2(p, q) + \esc^2 + L^2 \E[\norm{\statz_{kh} - \statz_u}^2]\,,\label{eq:girsanov_integrand}
    \end{align}
    where we applied Lemma~\ref{lem:nu_vs_stat}.
    
    It remains to bound $\E[\norm{\statz_{kh} - \statz_u}^2]$. Note that
    \begin{equation}
        \E[\norm{\statz_{kh} - \statz_u}^2] = \E\Bigl[\Bigl\|\int^u_{kh} \statv_s \, \D s\Bigr\|^2\Bigr] \le h\int^u_{kh} \E[\norm{\statv_s}^2] \, \D s \le dh^2\,,
    \end{equation}
    where in the last step we used the fact that $\statv_s \sim \gamma_d$. Substituting this into \eqref{eq:girsanov_integrand}, we conclude that 
    \begin{align}
        \KL(\bs p\Puld^N\mmid \bs p\Pulds^N)
        &\lesssim L^{3/2} \Tcorr\, W_2^2(p,q) + L^{-1/2} \Tcorr\esc^2 + L^{3/2} dh^2 \Tcorr\,.
    \end{align}
    The claimed bound on $\TV(\bs p\Puld^N,\bs p\Pulds^N)$ follows by Pinsker's inequality.
\end{proof}

\begin{proof}[Proof of Theorem~\ref{thm:main_underdamped}]
    This is immediate from Lemma~\ref{lem:underdamp_reg} and Lemma~\ref{lem:mut_nut}, recalling that $\Tcorr \lesssim 1/\sqrt{L}$ so that the bound in Lemma~\ref{lem:underdamp_reg} dominates the $W_2(p,q)$ term in Lemma~\ref{lem:mut_nut}.
\end{proof}

\begin{rmk}
    In all other sections of this paper, we abuse notation as follows.
    Given a distribution $p$ on $\R^d$, we write $p\Pulds$ to denote the projection onto the $z$-coordinate of $\bs p \Pulds$, i.e., we view $\Pulds$ as a Markov kernel on $\R^d$ rather than on $\R^d\times \R^d$ (and similarly for $\Puld$).
\end{rmk}