 % use the "wcp" class option for workshop and conference
 % proceedings
 % Overleaf project: https://www.overleaf.com/project/6405cdbb5094beba9ad76408
 %\documentclass[gray]{jmlr} % test grayscale version
 %\documentclass[tablecaption=bottom]{jmlr}% journal article
 \documentclass[tablecaption=bottom,wcp]{jmlr} % W&CP article

 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e

 %\usepackage{rotating}% for sideways figures and tables
 %\usepackage{longtable}% for long tables

 % The booktabs package is used by this sample document
 % (it provides \toprule, \midrule and \bottomrule).
 % Remove the next line if you don't require it.
\usepackage{booktabs}
\usepackage{scalerel}
\usepackage{mathtools}

% \usepackage{todonotes}
 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
\usepackage[load-configurations=version-1]{siunitx} % newer version
 %\usepackage{siunitx}

 % The following command is just for this sample document:
 % \cs{<name>} typesets a control-sequence name in typewriter font:
 % a literal backslash character (\char`\\) immediately followed by <name>.
\newcommand{\cs}[1]{\texttt{\char`\\#1}}% remove this in your real article

\input{math_commands.tex}


% Support for restating a previously labelled result.
% \newreptheorem{<env>}{<Title>} defines an environment rep<env> taking one
% mandatory argument, a label. The environment stores "<Title> \ref{<label>}"
% in \rep@title and then opens the starred theorem rep@theorem, whose header
% text is \rep@title.
% Usage sketch: \begin{reptheorem}{thm:main} ... \end{reptheorem}
% \makeatletter / \makeatother bracket the definitions because the helper
% names contain the character @.
\makeatletter
\newtheorem*{rep@theorem}{\rep@title}
\newcommand{\newreptheorem}[2]{%
\newenvironment{rep#1}[1]{%
 \def\rep@title{#2 \ref{##1}}%
 \begin{rep@theorem}}%
 {\end{rep@theorem}}}
\makeatother

% Counter backing the problem environment; explicitly set to zero.
\newcounter{set}
\setcounter{set}{0}

% problem environment: steps the counter (so that \label works inside it)
% and typesets the body as a trivlist item headed "Problem <n>." in bold.
\newenvironment{problem}{%
  \refstepcounter{set}%
  \begin{trivlist}%
  \item[\hskip\labelsep{\bfseries Problem}\hskip\labelsep{\bfseries \arabic{set}.}]%
}{%
  \end{trivlist}%
}


% Reviewer-note infrastructure. Swap the two \usepackage lines below to load
% todonotes with its [disable] option instead.
\usepackage{todonotes}
%\usepackage[disable]{todonotes}
\makeatletter

% \iftodonotes{<true>}{<false>} selects <false> when \if@todonotes@disabled
% is set and <true> otherwise. The \expandafter's remove the pending
% \else/\fi before the selector grabs its two arguments.
\newcommand*\iftodonotes{\if@todonotes@disabled\expandafter\@secondoftwo\else\expandafter\@firstoftwo\fi}   % defines \iftodonotes{<true>}{<false>}, thanks to https://tex.stackexchange.com/questions/126559/conditional-based-on-packageoption
\makeatother
% \noindentaftertodo issues \noindent only when todonotes is not disabled;
% it is appended after the inline (capitalized) note macros.
\newcommand{\noindentaftertodo}{\iftodonotes{\noindent}{}}
% Note that these macros accept optional arguments such as size=\small, bordercolor=red, and so on.  Capitalized versions are inline paragraphs instead of margin notes.
% \fixme[<todo options>]{<text>}: yellow \todo note with an empty caption.
\newcommand{\fixme}[2][]{\todo[color=yellow,size=\scriptsize,fancyline,caption={},#1]{#2}} % to mark stuff that you know is missing or wrong when you write the text
% \note[<todo options>]{<author>}{<color>}{<text>}: generic authored note;
% the optional argument is appended last to the \todo option list.
\newcommand{\note}[4][]{\todo[author=#2,color=#3,size=\scriptsize,fancyline,caption={},#1]{#4}}

% Per-author wrappers around \note: \<name>[<options>]{<text>}.
% The capitalized variant adds the "inline" option and then calls
% \noindentaftertodo.
\newcommand{\anna}[2][]{\note[#1]{Anna}{violet!40}{#2}}
\newcommand{\Anna}[2][]{\anna[inline,#1]{#2}\noindentaftertodo}
%
\newcommand{\fran}[2][]{\note[#1]{Francisco}{orange!40}{#2}}
\newcommand{\Fran}[2][]{\fran[inline,#1]{#2}\noindentaftertodo}

\newcommand{\teo}[2][]{\note[#1]{Teo}{green!40}{#2}}
\newcommand{\Teo}[2][]{\teo[inline,#1]{#2}\noindentaftertodo}

% Theorem-like environments.
% The commented-out declarations below are kept for reference only
% (presumably the document class already provides these environments --
% TODO confirm).
% \newtheorem{theorem}{Theorem}
% \newtheorem{lemma}{Lemma}
% \newtheorem{proposition}{Proposition}
\newtheorem{assumption}{Assumption}
% \newtheorem{corollary}{Corollary}

% Restatement environments: each \newreptheorem{<env>}{<Title>} call defines
% rep<env> (e.g. repproposition, reptheorem), which takes the label of the
% result being restated as its argument.
\newreptheorem{proposition}{Proposition}


\newreptheorem{corollary}{Corollary}
\newreptheorem{theorem}{Theorem}
\newreptheorem{lemma}{Lemma}
\newreptheorem{observation}{Observation}
\newreptheorem{remark}{Remark}
% Additional theorem-like environments declared by this document.
\newtheorem{solution}{Solution}
\newtheorem{observation}{Observation}


 % Define an unnumbered theorem just for this sample document for
 % illustrative purposes:
 % The four settings below select an upright body font, a small-caps header
 % font, a colon after the header and a line break after it.
 % NOTE(review): they are issued after the declarations above, so presumably
 % they affect only nnote -- confirm against the class documentation.
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{nnote}{Note}

% Build switch: set \arxiv to 1 for the preprint footer, any other single
% character (here 0) for the proceedings footer. \if compares the character
% that \arxiv expands to with the character 1.
\def\arxiv{0}
\if\arxiv1
    \jmlrproceedings{AABI 2023}{Preprint, under review.}
\else
    \jmlrproceedings{AABI 2023}{5th Symposium on Advances in Approximate Bayesian Inference, 2023}
\fi

 % The optional argument of \title is used in the header
\title[Expressiveness Remarks for DDS]{Expressiveness Remarks for Denoising Diffusion Models and Samplers
}

 % Anything in the title that should appear in the main title but 
 % not in the article's header or the volume's table of
 % contents should be placed inside \titletag{}

 %\title{Title of the Article\titletag{\thanks{Some footnote}}}


 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % \thanks must come after \Name{...} not inside the argument for
 % example \Name{John Smith}\nametag{\thanks{A note}} NOT \Name{John
 % Smith\thanks{A note}}

 % Anything in the name that should appear in the title but not in the 
 % article's header or footer or in the volume's
 % table of contents should be placed inside \nametag{}

% Anonymous authors (leave as is; do not reveal author names for your submission)



% \author{\Name{Anonymous Authors}\\
%   \addr Anonymous Institution}
% THE SUBMISSION MUST REMAIN ANONYMOUS

% \if\arxiv1
 \author{\Name{Francisco Vargas}\Email{fav25@cam.ac.uk}\\
  \Name{Teodora Reu} \Email{tr500@cam.ac.uk}\\
  \Name{Anna Kerekes} \Email{ak2229@cam.ac.uk}\\
  \addr Department of Computer Science, Cambridge University, Cambridge, CB3 0FD, UK.}
% \else
% \author{\Name{Anonymous Authors}\\
%   \addr Anonymous Institution}
% \fi

% Two authors with the same address
% \author{\Name{Author Name1\nametag{\thanks{A note}}} \Email{abc@sample.com}\and
%  \Name{Author Name2} \Email{xyz@sample.com}\\
%  \addr Address}

 % Three or more authors with the same address:
 % \author{\Name{Author Name1} \Email{an1@sample.com}\\
 %  \Name{Author Name2} \Email{an2@sample.com}\\
 %  \Name{Author Name3} \Email{an3@sample.com}\\
 %  \Name{Author Name4} \Email{an4@sample.com}\\
 %  \Name{Author Name5} \Email{an5@sample.com}\\
 %  \Name{Author Name6} \Email{an6@sample.com}\\
 %  \Name{Author Name7} \Email{an7@sample.com}\\
 %  \Name{Author Name8} \Email{an8@sample.com}\\
 %  \Name{Author Name9} \Email{an9@sample.com}\\
 %  \Name{Author Name10} \Email{an10@sample.com}\\
 %  \Name{Author Name11} \Email{an11@sample.com}\\
 %  \Name{Author Name12} \Email{an12@sample.com}\\
 %  \Name{Author Name13} \Email{an13@sample.com}\\
 %  \Name{Author Name14} \Email{an14@sample.com}\\
 %  \addr Address}


 % Authors with different addresses:
 % \author{\Name{Author Name1} \Email{abc@sample.com}\\
 % \addr Address 1
 % \AND
 % \Name{Author Name2} \Email{xyz@sample.com}\\
 % \addr Address 2
 %}



\begin{document}

\maketitle

\begin{abstract}
Denoising diffusion models are a class of generative models which have recently achieved state-of-the-art results across many domains. Gradual noise is added to the data using a diffusion process, which transforms the data distribution into a Gaussian. Samples from the generative model are then obtained by simulating an approximation of the time reversal of this diffusion initialized by Gaussian samples. Recent research has explored adapting diffusion models for sampling and inference tasks. In this paper, we leverage known connections to stochastic control akin to the F\"ollmer drift to extend established neural network approximation results for the F\"ollmer drift to denoising diffusion models and samplers. 
% \Fran{
% Whats wrong with akin ?

% could use similar instead.
%  No problem !! agh we have a bunch of broken references
% Cant figure ut how to fix them fixed
% }
% \Anna{Havent heard it before, so maybe just me problem :D}
\end{abstract}

% Keywords may be removed
%\begin{keywords}
%List of keywords
%\end{keywords}

\section{Introduction}
\label{sec:intro}

% \Fran{We should trim down the intro to not talk about MCMC as much and talk a bit more about DDPM/Gen modelling also. Bit more of a strong sell.}
Let $\pi$ be a probability density on $\mathbb{R}^d$ of the form
\begin{equation}
    \pi(x)=\frac{\gamma(x)}{Z},\qquad Z=\int_{\mathbb{R}^d} \gamma(x) \mathrm{d}x,
\end{equation}
where $\gamma:\mathbb{R}^d \rightarrow \mathbb{R}^{+}$ can be evaluated pointwise but the normalizing constant $Z$ is intractable. We are interested in obtaining approximate samples from $\pi$. A variety of Monte Carlo techniques has been developed to tackle this problem. Variational techniques are a popular alternative to Markov Chain Monte Carlo (MCMC) \citep{neal2011mcmc} where one considers a flexible family of easy-to-sample distributions $q^{\theta}$ whose parameters are optimized by minimizing a suitable cost, such as reverse Kullback--Leibler discrepancy $\KL(q^{\theta}||\pi)$.

% Recently, Monte Carlo techniques have been fruitfully combined with variational techniques. For example, AIS \citep{neal2001annealed} can be thought of as a procedure where $q^{\theta}(x,u)$ is the joint distribution of a Markov chain defined by a sequence of MCMC kernels whose final state is $x$, while $p^{\theta}(x,u)$ is the corresponding AIS extended target \citep{neal2001annealed}. Then $\theta$ can be learned by minimizing $\KL(q^{\theta}||p^{\theta})$ \citep{wunoe2020stochastic,Geffner:2021,Thin:2021,ZhangAIS2021,doucet2022annealed,geffner2022langevin}.

Instead, recent work \citep{vargas2023denoising} tackles the sampling problem with a new class of samplers coined denoising diffusion samplers (DDS) that leverage Denoising Diffusion Probabilistic Models (DDPM), a powerful class of generative models \citep{sohl2015deep,ho2020denoising,song2020score}, to sample from unnormalised densities. In this context, one adds noise progressively to data using a diffusion to transform the complex target distribution into a Gaussian distribution. The time-reversal of this diffusion can then be used to transform a Gaussian sample into a sample from the target. 

In this work, we explore in more detail the connection between denoising diffusion models \citep{ho2020denoising,song2020score} and stochastic control remarked in \cite{vargas2023denoising}, and leverage it to show how the score of VP-SDEs \citep{song2020score} can be approximated with neural networks up to an arbitrarily small error; we also quantify the induced sampling error. We do this by extending the theoretical results derived in \cite{tzen2019theoretical}. 

Our contributions in this paper can be summarized as follows: (1) establishing a connection between the VP-SDE score and OU-semigroup (Section \ref{ousec}), (2) exploring novel regularity properties for OU-semigroup (Section \ref{sec:regularity}), and (3) demonstrating neural network and sampling approximation results for a simplified VP-SDE (Proposition \ref{col:est}, Remark \ref{rem:approx}).




\section{Background - Denoising Diffusion Models and Samplers}\label{sec:DDSCT}
For the purpose of this work, we will introduce Denoising Diffusions and DDS in continuous time. Let $\mathcal{C}=C([0,T],\mathbb{R}^d)$ be the space of continuous functions from $[0,T]$ to $\mathbb{R}^d$ and $\mathcal{B}(\mathcal{C})$ the Borel sets on $\mathcal{C}$. We consider path measures, which are probability measures on $(\mathcal{C},\mathcal{B}(\mathcal{C}))$ \citep{leonard2013survey}. For synergy with the results in \cite{tzen2019theoretical} we will introduce DDS \citep{vargas2023denoising} with the time reversals flipped, meaning we interchange the backward and the forward processes compared to \citep{vargas2023denoising,ho2020denoising,song2020score,de2021diffusion}.


% \begin{figure}[t]
%   \centering
% \includegraphics[width=0.45\textwidth]{images/noising.png}
% \includegraphics[width=0.45\textwidth]{images/denoising.png}
%   \caption{a) Noise-adding process for exact reversal. The distribution  $\gN(0,I)$ is drawn for comparison to $p_T$. b) Exact and approximate time reversal starting from $\gN(0,I)$ the former exhibits only the mixing error whilst the latter incorporates the network's approximation error.}
% \end{figure}

\subsection{Backwards diffusion and its time-reversal}
Consider the backward noising diffusion given by a time-reversed Ornstein--Uhlenbeck (OU) process (\cite{song2020score} refer to this SDE as the VP-SDE):
\begin{equation}\label{eq:forwarddiffusionP}
    \mathrm{d}x_t=-\beta_t x_t \mathrm{d}t+\sigma \sqrt{2\beta_t}\mathrm{d}B_t,\qquad x_0 \sim \pi, 
\end{equation}
where $(B_t)_{t\in[0,T]}$ is a $d$-dimensional Brownian motion and $t \mapsto \beta_t$ is a non-decreasing positive function. This diffusion induces the path-measure $\mathcal{P}$ on the time interval $[0,T]$ and the marginal density of $x_t$ is denoted $p_t$. The transition density of this diffusion is given by $p_{t|0}(x_t|x_0)=\mathcal{N}(x_t;\sqrt{1-\lambda_t}x_0,\sigma^2 \lambda_t I)$, where $\lambda_t=1-\exp(-2\int^t_0\beta_s \mathrm{d}s)$. We will always consider a scenario where  $\int_0^T \beta_s \mathrm{d}s \gg 1$ so that $p_T(x_T)\approx \mathcal{N}(x_T;0,\sigma^2 I)$. 

From \citep{haussmann1986time},  its time-reversal $(y_t)_{t\in[0,T]}=(x_{T-t})_{t\in[0,T]}$, where equality is here in distribution, yields the forward time diffusion:
\begin{equation}\label{eq:exacttimereversalCT}
    \mathrm{d}y_t=\beta_{T-t}\{y_t+2\sigma^2 \nabla \log p_{T-t}(y_t)\} \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}}\mathrm{d}W_t,\qquad y_0 \sim p_{T},
\end{equation}
where $(W_t)_{t\in[0,T]}$ is another $d$-dimensional Brownian motion. By definition this time-reversal starts from $y_0 \sim p_T(y_0)\approx \mathcal{N}(y_0;0,\sigma^2 I)$ and is such that $y_T \sim \pi$. This suggests that approximate simulation of diffusion (\ref{eq:exacttimereversalCT}) would result in approximate samples from $\pi$.  However, putting this idea into practice requires being able to approximate the intractable scores $(\nabla \log p_t(x))_{t \in [0,T]}$. Unlike DDPM, score matching techniques are not feasible, as sampling from (\ref{eq:forwarddiffusionP}) requires sampling $x_0 \sim \pi$, which is impossible by assumption.


\subsection{Reference diffusion and value function}\label{sec:refdiffusionvaluefunction}
In our context, it is useful to introduce a \emph{reference} process defined by the diffusion following (\ref{eq:forwarddiffusionP}), but initialized at $p^{\textup{ref}}_0(x_0)=\mathcal{N}(x_0;0,\sigma^2 I)$ rather than $\pi(x_0)$ thus ensuring that the marginals of the resulting path measure $\mathcal{P}^{\textup{ref}}$ all satisfy $p^{\textup{ref}}_t(x_t)=\mathcal{N}(x_t;0,\sigma^2 I)$. Following \cite{vargas2023denoising} we can identify $\mathcal{P}$  as the path measure minimizing the half bridge $\mathcal{P}= \argmin_\mathcal{Q} \{\KL(\mathcal{Q}||\mathcal{P}^{\textup{ref}}): q_T=\pi\}$ \citep{bernton2019SBsamplers,vargasshro2021, debortoli2021diffusion}.
% \begin{align}
%  \mathcal{P}= \argmin_\mathcal{Q} \{\KL(\mathcal{Q}||\mathcal{P}^{\textup{ref}}): q_T=\pi\}.
% \end{align}
Here, a representation of $\mathcal{P}^{\textup{ref}}$ is given by 
\begin{equation}\label{eq:timereversalrefprocessCT}
    \mathrm{d}y_t =-\beta_{T-t} y_t \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}} \mathrm{d}W_t,\qquad y_0 \sim p^{\textup{ref}}_0.
\end{equation}
As $\beta_{T-t}\{ y_t +2\sigma^2 \nabla \log p^{\textup{ref}}_{T-t}(y_t)\}=-\beta_{T-t} y_t$, we can rewrite the time-reversal (\ref{eq:exacttimereversalCT}) of $\mathcal{P}$ as 
\begin{equation}\label{eq:diffusionvaluefunction}
    \mathrm{d}y_t= -\beta_{T-t} \{y_t -2\sigma^2 \nabla \log \phi_{T-t}(y_t)\} \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}} \mathrm{d}W_t,\qquad y_0 \sim p_T,
\end{equation}
where $\phi_t(x) = p_t(x)/p^{\textup{ref}}_t(x)$ and $v_t(x) = -\ln \phi_t(x)$ is known as the value function \citep{fleming2012deterministic,Pham:2009,nusken2021solving,tzen2019theoretical}.

\subsection{Learning the Forward Diffusion - Reverse KL Formulation}
To approximate the time-reversal (\ref{eq:exacttimereversalCT}) of $\mathcal{P}$, consider a path measure $\mathcal{Q}^\theta$ which is induced by
\begin{equation}\label{eq:Qthetascore}
\mathrm{d}y_t=\beta_{T-t}\{y_t+2\sigma^2 s_{\theta}(T-t,y_t) \} \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}}\mathrm{d}W_t,\qquad y_0  \sim \mathcal{N}(0,\sigma^2 I),
\end{equation}
so that $y_t \sim q^{\theta}_{t}$. To obtain $s_{\theta}(t,x) \approx \nabla \log p_t(x)$, we parameterize $s_{\theta}(t,x)$ by a neural network whose parameters are obtained by minimizing
\begin{align}\label{eq:scorematchinglike}
 \KL(\mathcal{Q}^\theta||\mathcal{P})
    &=\KL(\mathcal{N}(0,\sigma^2 I)||p_T)+\sigma^2 \mathbb{E}_{\mathcal{Q}^\theta}\Bigl[\scaleobj{.8}{\int_0^T} \beta_{T-t}||s_\theta(T-t,y_t)-\nabla \log p_{T-t}(y_t)||^2 \mathrm{d}t \Bigr].\nonumber
\end{align}
This expression closely resembles the expression obtained in \cite[Theorem 1]{song2021maximum} in the context of DDPM. However, unlike DDPM \citep{ho2020denoising}, one cannot get rid of the intractable scores $(\nabla \log p_{t}(x))_{t \in [0,T]}$ using score matching ideas. Instead, exploiting (\ref{eq:diffusionvaluefunction}), \cite{vargas2023denoising} reparameterize $\mathcal{Q}^\theta$ using
\begin{equation}\label{eq:approximatetimereversalCT}
    \mathrm{d}y_t=-\beta_{T-t}\{y_t -2\sigma^2 f_{\theta}(T-t,y_t)\} \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}} \mathrm{d}W_t,\qquad y_0 \sim \mathcal{N}(0,\sigma^2 I),
\end{equation}
where, unlike in (\ref{eq:Qthetascore}), $f_\theta$ approximates $\nabla \ln \phi_t$ rather than the score $\nabla \ln p_t$. Then, under this reparameterization, \cite{vargas2023denoising} use standard results on half bridges \citep{bernton2019SBsamplers} to express  $\KL(\mathcal{Q}^\theta||\mathcal{P})$ in compact form:
\begin{align}
\KL(\mathcal{Q}^\theta||\mathcal{P})&
=\mathbb{E}_{\mathcal{Q}^\theta} \Bigl[ \sigma^2 \scaleobj{.8}{\int_0^T} \beta_{T-t} ||f_\theta(T-t,y_t)||^2 \mathrm{d}t
     +\scaleobj{.8}{\ln \left(\frac{\gN(y_T; 0, \sigma^2 I)}{\pi(y_T)}\right)} \Bigr]\label{eq:KLpathintegral}.
\end{align}
Here $q^{\theta^*}_0 = p_T \approx \gN(0,\sigma^2I)$. Given $\theta$ minimizing (\ref{eq:KLpathintegral}), approximate samples from $\pi$ can be obtained by simulating (\ref{eq:approximatetimereversalCT}) and returning $y_T\! \sim \!q^{\theta}_T$.  Note that concurrent work \citep{berner2022optimal} also optimises a reverse KL equivalent to (\ref{eq:KLpathintegral}).


\section{Expressiveness and Regularity Results}

In this section, we present our main result. We demonstrate that $\nabla \ln \phi_t$, and thus the score of the OU-SDE, can be efficiently approximated by a multi-layer neural network.

Theorem 3.1 in \cite{tzen2019theoretical} provides neural network approximation and sampling guarantees for a different class of SDEs than DDPM (i.e.  (\ref{eq:exacttimereversalCT}) or (\ref{eq:diffusionvaluefunction})). Thus in this section, we will adapt such results to denoising diffusion samplers \citep{vargas2023denoising} and via directly relating the approximations to the score of the VP-SDE (\ref{eq:forwarddiffusionP}) we motivate how these results extend to DDPM based methods \citep{song2020score, ho2020denoising,huang2021variational}.
% The  main result consists of a direct adaptation of the Theorem 3.1 in \cite{tzen2019theoretical} to the denoising diffusion samplers \citep{vargas2023denoising} and via directly relating the estimators we consider to the score of the VP-SDE (Equation \ref{eq:forwarddiffusionP}) we motivate how these estimation results extend to DDPM based methods \citep{song2020score, ho2020denoising,huang2021variational}. 

\cite{tzen2019theoretical} guarantee approximate sampling from a target distribution using a multilayer feedforward neural net drift, assuming the smoothness, Lipschitzness, and boundedness of $f(x)=\frac{\mathrm{d}\pi}{\mathrm{d}\gN(0, \sigma^2 I)}(x)$ (Assumption \ref{assump:a1}), as well as the smoothness of the activations (Assumption \ref{assump:a2}) and uniform approximability of $f$ and its gradient by a neural network (Assumption \ref{assump:a3}). In the following proposition and remark we present our adaptation of their results to DDS.

\begin{proposition} \label{col:est}
Suppose the Assumptions in Appendix \ref{assump} are in force. Let $L$ denote the maximum of the Lipschitz constants of $f$ and $\nabla f$. Then for all $0< \epsilon < 16L^2/c^2$, there exists a neural net $\hat{v} : \mathbb{R}^d \times [0,1] \to \mathbb{R}^d$ with size polynomial in $1/\epsilon, d, L, c, 1/c$ such that the activation function of each neuron is an element of the set $\{\sigma, \sigma', \mathrm{ReLU}\}$, and the following holds: if $\{\hat{x}_t\}_{t\in[0,1]}$ is the diffusion process governed by the It\^o SDE
\begin{align}\label{SDE}
\dd \hat{x}_t = \hat{b}(\hat{x}_{t}, t)\dd t + \sqrt{2 } \dd W_t,
\end{align}
with $\hat{x}_0 \sim p_1 \approx \gN(0, I)$ and drift $\hat{b}(x,t) = - (x - 2 \hat{v}(x, 1-t))$, then $\hat{\mu} := \mathrm{Law}(\hat{x}_1)$ satisfies $D(\mu||\hat{\mu}) \leq \epsilon$.
\end{proposition}
\begin{remark}\label{rem:approx}
    Assuming $\pi$ satisfies a logarithmic Sobolev inequality, extending the time domain to $t\in [0,T]$ and sampling $\hat{x}_0 \sim \gN(0,I)$ approximately, it follows that $D(\mu||\hat{\mu}) \leq  e^{-T}\KL(\pi || \gN(0,I)) + T\epsilon$.
\end{remark}
The proof closely follows \cite{tzen2019theoretical}; however, key steps must be slightly modified to show that the value function satisfies the required regularity properties to exploit the core results in \cite{tzen2019neural}.

\subsection{OU Semigroup and Time Reversal}\label{ousec}

This section introduces the OU semigroup \citep{metafune2002spectrum} whose logarithmic gradient can be directly connected to the score \citep{song2020score} in (\ref{eq:forwarddiffusionP}). Based on this reformulation of the score we are able to extend the results from \cite{tzen2019theoretical} to denoising diffusion via VP-SDEs. In the remainder of this section, we will introduce new results pertaining to the regularity properties of this operator that will enable us to prove Proposition \ref{col:est}.


\begin{definition}
We define the VP-SDE semigroup as
\begin{align}
U^{\beta_t}_{t}f(y)= \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[f\left(e^{-\int_0^{t} \beta_s \mathrm{d} s }y+\sigma(1-e^{-2\int_0^{t} \beta_s \mathrm{d}s})^{1/2} Z\right)\right].
\end{align}
Then the OU-semigroup \citep{metafune2002spectrum} (typically defined with $\beta_t=\beta=1$) is a simpler instance of the above:
\begin{align}
  U^{\beta}_{t}f(y)  = \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[f\left(e^{- \beta t }y+\sigma(1-e^{-2 \beta t})^{1/2} Z\right)\right].
\end{align}
\end{definition}

For simplicity, we will work with the OU semigroup with $\beta=1$ (denoted $U_{t}$); however, these results can be extended to the more general case. In the following remark, we highlight the connection between the OU semigroup, the value function and the score in DDPM.

 \begin{remark}\label{rem:ou}
The time reversal of the VP-SDE (i.e. $b^{*}(y,t)= -\beta_{T-t} (y -2\sigma^2 \nabla \log \phi_{T-t}(y))$) can be expressed in terms of the OU semigroup via:
\begin{align}
    \nabla \log \phi_{T-t}(y) = \nabla_y \ln U_{T-t}^{\beta_{t}}f(y),
\end{align}
 when $f(x) = \frac{\mathrm{d}\pi}{\mathrm{d}\gN(0, \sigma^2 I)}(x)$. This in turn can be related to the score:
 \begin{align}
     \nabla \log p_{T-t}(y) &= -\left(\frac{y}{\sigma^2}-\nabla \log \phi_{T-t}(y)\right) = -\left(\frac{y}{\sigma^2}-\nabla_y \ln U_{T-t}^{\beta_{t}}f(y)\right). 
 \end{align}
\end{remark}
 
From this stage on we consider the case where $\sigma = \beta = 1$. Notice how the formulation in Remark \ref{rem:ou} is reminiscent of the F\"ollmer drift \citep{follmer1984entropy,dai1991stochastic,tzen2019theoretical,huang2021schrodinger}. Finally, we highlight that it is this very simple remark which facilitates porting over the theoretical results and insights from \cite{tzen2019theoretical} to diffusion-based models. Furthermore, we remind the reader that the results in \cite{tzen2019theoretical} require adapting as they apply to the F\"ollmer drift and the heat semigroup (i.e., $\nabla_y \ln \phi_t(y)=  \nabla_y \ln Q_{t}f(y)$ with $Q_{t}f(y)  = \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[f\left(y+\sqrt{t} Z\right)\right] $).

\subsection{Regularity Properties }\label{sec:regularity}
In this section, we will prove regularity properties pertaining to the OU semigroup which will allow us to extend the theoretical guarantees in \cite{tzen2019theoretical} to denoising diffusion models and samplers \citep{song2020score, ho2020denoising,vargas2023denoising}. Moving forward we prove a basic auxiliary result regarding the commutativity of the OU-semigroup with partial derivatives. From this result, by using Corollary \ref{reg:corr}, we were able to bound the OU-semigroup norm when differentiated. 
%\subsubsection{OU Semigroup}
%In this section, we prove two basic auxiliary results regarding the commutativity of the OU semigroup with partial derivatives as well as a bound on its norm when logarithmically differentiated. These results are useful for controlling the error between a neural network approximation and $\nabla \log U_t f(x)$.
\begin{lemma}\label{lem:ou_commute}
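% NOTE(review): applying the chain rule to the definition of $U_t$ gives a factor $e^{-t}$ on the right-hand side; please check the statement against the proof.
The OU semigroup commutes with the gradient operator; that is, for $f:\sR^d \to \sR$ we have $\partial_{y_i } U_t f(y) = U_t \partial_{y_i} f(y)$.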
\end{lemma}

\subsubsection{Terminal Cost}

This section derives the regularity properties of $ g_{x,t}(z) = g(e^{-t}x + (1-e^{-2t})^{1/2}z)$, which we will refer to as the terminal cost. We want to underline to the reader that the optimal drift can be expressed in terms of the OU semigroup applied to the terminal cost ($\nabla \ln \phi_t(x) = \nabla \ln U_t g_{x,t}(z)$) when $g=f$.
\begin{itemize}
    \item First we prove that a centred version of the terminal cost is $\mathscr{L}^2(Q)$ Lipschitz with respect to a newly defined metric. This will allow us to obtain a bound for the covering number of a function class induced by the terminal cost.
    \item We then derive an envelope for the terminal cost. This in conjunction with further results on covering numbers allows us to control Dudley's entropy integral \citep{dudley1967sizes}. This in turn provides us with results from empirical process theory \citep{gine2021mathematical} that quantify the error for an empirical estimate of the OU semigroup.
\end{itemize}


%This section derives the regularity properties of the terminal cost $ g_{x,t}(z) = g(e^{-t}x + (1-e^{-2t})^{1/2}z)$. First, we prove that a centred version of the terminal cost is $\mathscr{L}^2(Q)$ Lipchitz with respect to a newly defined metric. This will allow us to obtain a bound for the covering number of a function class induced by the terminal cost. Then, derive an envelope for the terminal cost. This in conjunction with further results on covering numbers allows us to control Dudley's entropy integral \citep{dudley1967sizes}. This in turn provides us with results from empirical process theory \citep{gine2021mathematical} that quantify the error for an empirical estimate of the OU semigroup. We would like to highlight that unlike in \cite{tzen2019theoretical} the Lipchitz property here is not respect to standard p-norm-based metrics, but a rather unique metric induced by the OU semigroup, as a result, we had to re-derive the bounds for the covering numbers.

\begin{lemma}\label{lem:expected_lip} ($\mathscr{L}^2$ Lipschitz condition)
    Let $\bar{g}_{t,x}(z) = g(e^{-t}x + (1-e^{-2t})^{1/2}z) - g(0)$ then it follows that:
    \begin{align*}
        || \bar{g}_{t,x}(z) - \bar{g}_{t',x'}(z)||_{\mathscr{L}^2(Q)} \leq L\left(1 + \sqrt{2}||z||_{\mathscr{L}^2(Q)} \right) \rho_{OU}((t,x), (t',x')) 
    \end{align*}
where $\rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |t - t'|^{1/2}$.
\end{lemma}
\begin{lemma}\label{lem:envelope} 
    Let $g : \mathbb{R}^d \to \mathbb{R}$ be $L$-Lipschitz with respect to the Euclidean norm. Then, for $F(z) := L((R \vee 1) + \sqrt{2}||z||)$,
\begin{align}
    \Big|g\left(e^{- t}x+(1-e^{-2 t})^{1/2} z\right) - g(0)\Big| \leq F(z).
\end{align}
\end{lemma}


% \Fran{I think we can derive a better nicer metric for the Lipchitz condition}

% We need to establish a result of the form:

% \begin{conjecture}
% Given the metric space $\big( [0,T] \times B^d(R) , \rho_{OU}\big)$ where:
% \begin{align}
%     \rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |e^{-2t} - e^{-2t'}|^{1/2}
% \end{align}
% It follows that:
% \begin{align}
%      N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq  \left(\frac{8}{\epsilon}\right)^{N([0,T] \times B^d(R),  \rho_{OU}, \epsilon/8) }
% \end{align}
% where $\mathrm{ddim}(\gX) = \ln \lambda_{\gX}$ such that $\lambda_{\gX}$ is the smallest value such that every ball in $\gX$ can be covered by $\lambda_{\gX}$.
% \end{conjecture}
% \begin{proof} (Strategy)

% We believe this should roughly follow from Lemma 6 in \cite{gottlieb2017efficient}. It might require some small modification as they assume $0 \leq f \leq 1$ whereas we assume $f$ is bounded from bellow by $c \in (0,1]$. However looking at the proof its looking ok as in the first half of the lemma this is not needed.
    
% \end{proof}

% \Fran{Actually quite sure we can do better than the above corollary, quite sure this is true:}
\subsubsection{Covering Number}
The $\mathscr{L}^2(Q)$ covering number of the function space $\gG$ is defined by
\begin{align*}
    N\left(\mathcal{G}, \mathscr{L}^2(Q), \varepsilon\right):=\min \left\{K: \exists f_1, \ldots, f_K \in \mathscr{L}^2(Q)\text { s.t. } \sup _{g \in \mathcal{G}} \min _{k \leq K}\left\|g-f_k\right\|_{\mathscr{L}^2(Q)} \leq \varepsilon\right\}.
\end{align*}
In general, the covering number $N\left(\gA, \rho, \varepsilon\right)$ is the smallest number of balls of radius $\varepsilon$ with respect to the metric $\rho$ that cover the set $\gA$. Once we obtain the appropriate bound on $N\left(\mathcal{G}, \mathscr{L}^2(Q), \varepsilon\right)$, the results from \cite{tzen2019theoretical} follow with minor modifications and thus Proposition \ref{col:est} will follow. In this section we will be bounding the $\mathscr{L}^2(Q)$ covering number of the function space $\mathcal{G}:=\left\{\bar{g}_{x, t}: x \in \mathrm{B}^d(R), t \in[0,1]\right\}$.
\begin{lemma}\label{lem:metlip}
Given the metric space $\big( [0,T] \times B^d(R) , \rho_{OU}\big)$ where:
\begin{align}
    \rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |t - t'|^{1/2}
\end{align}
and $||(t,x)||_{OU} =  \rho_{OU}((t,x), (0, 0))= || e^{-t}x ||  + |t|^{1/2}$.
It follows that:
\begin{align}
     N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq  N([0,T] \times B^d(R),  \rho_{OU}, \epsilon) 
\end{align}

\end{lemma}

% \Fran{Metric space check:

% \textbf{Basic Properties}:

% Clearly $ \rho_{OU}((t,x), (t,x))  = || e^{-t}x -e^{-t}x|| =  0$ and also the metric is positive (when we take into acount the term $|t-t'|^{1/2}$) and symmetric.

% \textbf{Triangle}:

% Using the norms property we have 
% \begin{align}
%     || e^{-t}x + x'e^{-t'}|| \leq  || e^{-t}x || + ||x'e^{-t'}||  
% \end{align}

% thus it follows

% \begin{align}
%  \rho_{OU}((t,x), (t',x'))  &=|| e^{-t}x -e^{-l}y  - x'e^{-t'} + e^{-l}y  || \\
%  &\leq  || e^{-t}x -e^{-l}y || + || e^{-l}y -x'e^{-t'}   || \\
%  &=  \rho_{OU}((t,x), (l,y))  +  \rho_{OU}((l,y), (t',x')) 
% \end{align}
% }




% \Fran{

% Actually !

% Ok new idea (for simplicty let $T=1$:
% \begin{align}
%    4N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) &\leq 4 N(B^d(T), |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) \\
%    &\leq  N(B^{2d}(T), |\cdot|, \epsilon/2) N(B^{d}(R), ||\cdot ||, \epsilon/2) \\
%    &\leq \left(\frac{2 \cdot 3 }{\epsilon} \right)^{2d}\left(\frac{ 6 \cdot R}{\epsilon} \right)^{d}   \\
%    &\leq \left(\frac{2 \cdot 3 }{\epsilon} \right)^{3d}  R^d
% \end{align}


% }





% \Fran{

% Lets have a quick stab at getting the volumetric bound in \citep{tzen2019theoretical} that we are a bit uncertain about.

% Its not hard to see that if we have a $d$-dim ball of radius $R$ $B^d(R)$ we can cover it with
% \begin{align}
%   N(B^d(R), ||\cdot ||, \epsilon/2) \leq  \left( \frac{ 2 \cdot 3 \cdot R}{\epsilon} \right)^d 
% \end{align}
% see \href{https://mathoverflow.net/questions/371363/trade-off-between-covering-number-ball-radius-and-diameter-of-d-dimensional-s}{this example} for a reference. However they are interested in bounding this times the cover number of $N([0,T], |\cdot|, \epsilon^2/4)$ which can be bounded by $ 2 T \epsilon^{-2}$  so (in their case $T=1$):

% Ok new idea (for simplicty let $T=1$:
% \begin{align}
%    N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) &\leq  N(B^d(T), |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) \\
%    &\leq \left(\frac{3 \cdot 4 }{\epsilon^2} \right)^d\left(\frac{ 6 \cdot R}{\epsilon} \right)^d  \\
%    &\leq \left(\frac{2}{\epsilon} \right)^{2d}\left(\frac{ 2\cdot 3^2 \cdot R}{\epsilon} \right)^d  \\ 
%    &\leq \left(\frac{2}{\epsilon} \right)^{3d } (3\sqrt{R})^{2d} \\
%     &\leq \left(\frac{2}{\epsilon} \right)^{3d} (3\sqrt{R})^{3d} \\
%      &\leq \left(\frac{2 \cdot 3 \sqrt{R}}{\epsilon} \right)^{3d} 
% \end{align}
% note we consider the case  $N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2)\geq1$ as in \cite{tzen2019theoretical} otherwise its log would be negative and $\sqrt{\cdot}$ not real. So for the log we get as desired:
% \begin{align}
%   \log   2 N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) &\leq 3d \log  \left(\frac{2 \sqrt{3R}}{\epsilon} \right)_{+} + \ln 2
% \end{align}
% For the square root we can use something like this $\sqrt{\log x} \leq \log 2 x$.  So:
% \begin{align}
%   \sqrt{\log   2 N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2)} &\leq 3d \log  \left(\frac{2 \sqrt{3R}}{\epsilon} \right)_{+} + \ln 4\\
%   &\leq 3d \log  \left(\frac{2^2 \sqrt{3R}}{\epsilon} \right)_{+} 
% \end{align}

% }

\begin{lemma}\label{lem:covprod}
Given the metric space $\big( [0,T] \times B^d(R) , \rho_{OU}\big)$ it follows that
\begin{align}
     N([0,T] \times B^d(R),  \rho_{OU}, \epsilon)  \leq  N([0,T], |\cdot|,  \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2).
\end{align}
\end{lemma}

From Lemmas \ref{lem:metlip}, \ref{lem:covprod} it follows that :
\begin{align}
      N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq N([0,T], |\cdot|,  \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2)
\end{align}
and thus it follows that Lemma C.4 and thus Theorem C.1 in \cite{tzen2019theoretical} hold true in our setting. This provides us with the tools required to show the neural network approximation results (see Appendix \ref{apdx:approx}) which in turn enable our main result Proposition \ref{col:est}.

\section{Conclusion}

We establish a connection between the VP-SDE score and the OU-semigroup, revealing similarities between F\"{o}llmer drift-based and DDPM-based sampling approaches. Using this connection, we demonstrate how the VP-SDE score can be approximated efficiently by multilayer neural networks, under fairly general assumptions on the target distribution. In order to exploit previous results on the F\"{o}llmer drift \citep{tzen2019theoretical} we establish novel regularity properties for the OU-semigroup that allow us to adapt the results in \cite{tzen2019theoretical} to our setting. Although our results are derived for target distributions represented as densities, rather than empirical distributions as in DDPM, they may still apply in the large sample limit, given suitable assumptions.

% This result is wrong as $xe^{-t}$ is not Lipchitz. We might still be able to adapt the covering number result.



% \Teo{
% \paragraph{First bound:}
% For $2^{-1}T\epsilon^{-2}$ we pick the cover $U_T$ such that its elements $u_n$ are centered at $(2n+1)\epsilon^2$ if $T \in [2k\epsilon-1, 2k\epsilon)$, and $2n\epsilon^2$ if $T \in [2k\epsilon, 2k\epsilon+1)$ for $n, k \in \mathbb{N}$. Then: 

% \begin{align}
%    A = \begin{cases}
%         \bigcup_{u_n \in U_T} A_{(2n + 1) \epsilon^2}  & \text{if } T \in [2n\epsilon-1, 2n\epsilon)\\
%         \bigcup_{u_n \in U_T} A_{(2n) \epsilon^2}  & \text{if } T \in [2n\epsilon, 2n\epsilon+1)
%     \end{cases}
% \end{align}

% is a cover of is an $\epsilon$ cover of $[0, T] \times B^d(R)$ (with respect to the metric $\rho_{OU}$), notice this follows as $\forall x \in B^d(R)$ there exists an $x_0$ such that:
% \begin{align}
% \begin{cases}
% [(2n + 1) \epsilon^2 -\epsilon^2, (2n + 1) \epsilon^2 + \epsilon^2] \times \{x\} \subseteq B^{d+1}_{(2n + 1) \epsilon^2 \oplus x_0, \rho}(\epsilon) \in A_{(2n + 1) \epsilon^2} & \text{if } T \in [2n\epsilon-1, 2n\epsilon)\\ 
% [(2n) \epsilon^2 -\epsilon^2, (2n) \epsilon^2 + \epsilon^2] \times \{x\} \subseteq B^{d+1}_{(2n) \epsilon^2 \oplus x_0, \rho}(\epsilon) \in A_{(2n) \epsilon^2} & \text{if } T \in [2n\epsilon, 2n\epsilon+1)
% \end{cases}
% \end{align}

% Now we can see that $|A| \leq  |U_T| |A_0|$, where:
% \begin{align}
%     |A_0 | = \begin{cases}
%         \max_{n}|A_{(2n + 1) \epsilon^2} | & \text{if }
%         T \in [2n\epsilon-1, 2n\epsilon)\\   
%         \max_{n}|A_{(2n) \epsilon^2} | & \text{if } T \in [2n\epsilon, 2n\epsilon+1)\\ 
%     \end{cases}
% \end{align}

% \paragraph{Second bound}
% For $T\epsilon^{-2}$ we pick the cover $U_T$ such that its elements $u_n$ are centered at $(n+1)\epsilon^2$. Then: 

% \begin{align}
%    A = \bigcup_{u_n \in U_T} A_{n \epsilon^2} 
% \end{align}

% is a cover of is an $\epsilon$ cover of $[0, T] \times B^d(R)$ (with respect to the metric $\rho_{OU}$), notice this follows as $\forall x \in B^d(R)$ there exists an $x_0$ such that:
% \begin{align}
% [n \epsilon^2 -\epsilon^2, n \epsilon^2 + \epsilon^2] \times \{x\} \subseteq B^{d+1}_{n \epsilon^2 \oplus x_0, \rho}(\epsilon) \in A_{n \epsilon^2} 
% \end{align}

% Now we can see that $|A| \leq  |U_T| |A_0|$, where $|A_0 | =  \max_{n}|A_{n \epsilon^2}|. $

% }



\bibliography{jmlr-sample}
\newpage
\appendix


\section{List of Detailed Contributions}

Our contributions are as follows.
\begin{itemize}
    \item Our overall contribution is porting over the expressiveness results from \cite{tzen2019theoretical} to denoising diffusion-based models in the setting where the target distribution admits a density.
    \item To facilitate this connection we provide Remark \ref{rem:ou} which expresses the score in DDPM in terms of the well-known OU semigroup. This expression is more akin to the F\"ollmer drift and thus motivates the connection to the results in \citep{tzen2019theoretical}.
    \item In order to do this we prove 3 novel results, specifically Lemmas \ref{lem:ou_commute}-\ref{lem:covprod} and Remark \ref{rem:metricspace} and highlight how they allow us to use Theorem C.1 of \cite{tzen2019theoretical}.
    \item For completeness we provide the adapted sketches for Theorem 3.2 of \cite{tzen2019theoretical} and our Proposition \ref{col:est} where we highlight the differences to \citep{tzen2019theoretical} in {\color{magenta}magenta}.
    \item Finally we provide Remark \ref{rem:approx} which quantifies the error from initialising $\hat{x}_0$ at $\gN(0,I)$ rather than $p_T$. The result combines the derived expressiveness/score approximation error with the mixing error of the OU-process.
\end{itemize}

\subsection{Limitations}
We would like to highlight that all our results are in continuous time, and additional work would be required to analyse them under a given discretisation (e.g.\ Euler--Maruyama). With additional assumptions, it is possible to directly apply results such as Theorem 2 in \cite{chen2022sampling}, or it may be possible to adapt Corollary 2 in \cite{vargas2021bayesian}.
\section{Assumptions}\label{assump}

\begin{assumption}\label{assump:a0}
Throughout this work we assume that the target distribution $\pi$ has a density, that is, it is absolutely continuous with respect to the Lebesgue measure on $\sR^d$.
\end{assumption}

\begin{assumption}\label{assump:a1}
The function $f$ is differentiable, both $f$ and $\nabla f$ are L-Lipschitz, and there exists a constant $c \in (0,1]$ such that $f \geq c$ everywhere. 
\end{assumption}

\begin{assumption}\label{assump:a2}
The activation function $\sigma : \sR \to \sR$ is differentiable. Moreover,  
 there exists $c_\sigma > 0$ depending only on $\sigma$, such that the following holds: For any L-Lipschitz function $h : \sR \to \R$ which is constant outside the interval $[-R, R]$ and for any $\delta > 0$, there exist real numbers $a$, $\{\alpha_i, \beta_i, \gamma_i\}^m_{i=1}$ where $m \leq c_{\sigma}\frac{RL}{\delta}$, such that the function  $\tilde{h}(x) = a + \sum \alpha_i \sigma(\beta_i x +\gamma_i)$ satisfies $\sup_{x\in \sR}|\tilde{h}(x)-h(x)|\leq \delta$.
\end{assumption}

Finally as per \cite{tzen2019theoretical} we introduce the assumption pertaining to the approximability of $f$ by neural nets. Let $\sigma: \mathbb{R} \rightarrow \mathbb{R}$ be a fixed nonlinearity. Given a vector $w \in \mathbb{R}^n$ and scalars $\alpha, \beta$, define the function
$$
N_{w, \alpha, \beta}^\sigma: \mathbb{R}^n \rightarrow \mathbb{R}, \quad N_{w, \alpha, \beta}^\sigma(x):=\alpha \cdot \sigma\left(w^T x+\beta\right) .
$$
For $\ell \geq 2$, we define the class $\mathcal{N}_{\ell}^\sigma$ of $\ell$-layer feedforward neural nets with activation function $\sigma$ recursively as follows: $\mathcal{N}_2^\sigma$ consists of all functions of the form $x \mapsto \sum_{i=1}^m N_{w_i, \alpha_i, \beta_i}^\sigma(x)$ for all $m \in \mathbb{N}, w_1, \ldots, w_m \in \mathbb{R}^d$, $\alpha_1, \ldots, \alpha_m, \beta_1, \ldots, \beta_m \in \mathbb{R}$, and, for each $\ell \geq 2$,
$$
\begin{aligned}
\mathcal{N}_{\ell+1}^\sigma:= & \bigcup_{k \geq 1} \bigcup_{m \geq 1}\left\{x \mapsto \sum_{i=1}^m N_{w_i, \alpha_i, \beta_i}^\sigma\left(h_1(x), \ldots, h_k(x)\right):\right. \\
& \left.\alpha_1, \ldots, \alpha_m, \beta_1, \ldots, \beta_m \in \mathbb{R}, w_1, \ldots, w_m \in \mathbb{R}^k, h_1, \ldots, h_k \in \mathcal{N}_{\ell}^\sigma\right\} .
\end{aligned}
$$

\begin{assumption}\label{assump:a3}
 %uniform approximability to both $h$ and $\nabla h$. TODO: Write full thing.
 For any $R>0$ and $\epsilon >0$, there exists a neural net $\hat{f}\in\mathcal{N}^\sigma _{l,s}$ with $l,s<\text{poly}(1/\epsilon,d,L,R)$, such that
 \begin{align}
     \sup_{x\in B^d(R)}|f(x)-\hat{f}(x)|\leq \epsilon \quad \text{and} \quad \sup_{x\in B^d(R)}||\nabla f(x)-\nabla \hat{f}(x)||\leq \epsilon.
 \end{align}
\end{assumption}


\section{Regularity Results}\label{reg}


\begin{repremark}{rem:ou}
The time reversal of the VP-SDE (i.e. $b^{*}(y,t)= -\beta_{T-t} (y -2\sigma^2 \nabla \log \phi_{T-t}(y))$) can be expressed in terms of the OU semigroup via:
\begin{align}
    \nabla \log \phi_{T-t}(y) = \nabla_y \ln U_{T-t}^{\beta_{t}}f(y),
\end{align}
 when $f(x) = \frac{\pi}{\gN(0, \sigma^2 I)}(x)$. This in turn can be related to the score 
 \begin{align}
     \nabla \log p_{T-t}(y) &= -\left(\frac{y}{2\sigma^2}-\nabla \log \phi_{T-t}(y)\right) = -\left(\frac{y}{2\sigma^2}-\nabla_y \ln U_{T-t}^{\beta_{t}}f(y)\right). 
 \end{align}
\end{repremark}
\begin{proof}
Consider the OU semigroup evaluated on the appropriate RND:
\begin{align*}
     U_{t}^{\beta_{t}}f(y) &= \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[\frac{\pi}{\gN(0, \sigma^2 I)}\left(e^{- \beta t }y+\sigma(1-e^{-2 \beta t})^{1/2} Z\right)\right] \\
     &= \mathbb{E}_{x_T \sim p^{\mathrm{ref}}_{T|t}(\cdot|x)}\left[\frac{\pi}{\gN(0, \sigma^2 I)}\left(x_T\right)\right] \\ 
     &= \int p^{\mathrm{ref}}_{T|t}(x_T |x) \frac{\pi}{\gN(0, \sigma^2 I)}\left(x_T\right) \mathrm{d}x_T\\
     &= \int \frac{p^{\mathrm{ref}}_{t|T}(x |x_T)  p^{\mathrm{ref}}_T(x_T)}{p^{\mathrm{ref}}_t(x)}\frac{\pi}{\gN(0, \sigma^2 I)}\left(x_T\right) \mathrm{d}x_T \\
     &= \int \frac{p^{\mathrm{ref}}_{t|T}(x |x_T) }{p^{\mathrm{ref}}_t(x)}{\pi}\left(x_T\right) \mathrm{d}x_T = \frac{p_t(x) }{p^{\mathrm{ref}}_t(x)}
\end{align*}
and thus it follows that
\begin{align}
     \nabla \log p_{T-t}(y) = -\left(\frac{y}{2\sigma^2}-\nabla_y \ln U_{T-t}^{\beta_{t}}f(y)\right). 
\end{align}
relating the score and the OU semi-group as required.
\end{proof}



\begin{replemma}{lem:ou_commute}
The OU semigroup commutes with the gradient operator, that is, for $f: \sR^d \to \sR$ we have $\partial_{y_i } U_t f(y) = U_t \partial_{y_i} f(y)$.
\end{replemma}

\begin{proof}
    It suffices to show that 
\begin{align}
    d(x,z) = \delta^{-1}(f(e^{- t}x+(1-e^{-2 t})^{1/2} z) - f(e^{- t}(x+\delta \ve_i)+(1-e^{-2 t})^{1/2} z)),
\end{align}
is dominated, where $[\ve_i]_j =\delta_{ij}$. As $f$ is Lipschitz by assumption it follows that
\begin{align}
        |d(x,z)| \leq L|\delta^{-1} e^{- t} \delta | =L e^{- t} \leq L
\end{align}
As $L$ is integrable under $\gN(0, I)$ we have shown $d(x,z)$ is dominated for all $\delta$ and thus the partial derivative operator and the OU semigroup commute.
\end{proof}

The choice of $F(z) := L((R \vee 1) + \sqrt{2}||z||)$ with these specific constants arises from the following result.

\begin{replemma}{lem:expected_lip}($\mathscr{L}^2$ Lipschitz condition)
    Let $\bar{g}_{t,x}(z) = g(e^{-t}x + (1-e^{-2t})^{1/2}z) - g(0)$ then it follows that:
    \begin{align*}
        || \bar{g}_{t,x}(z) - \bar{g}_{t',x'}(z)||_{\mathscr{L}^2(Q)} \leq L\left(1 + \sqrt{2}||z||_{\mathscr{L}^2(Q)} \right) \rho_{OU}((t,x), (t',x')) 
    \end{align*}
such that:
\begin{align}
    \rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |t - t'|^{1/2}
\end{align}

\end{replemma}

\begin{proof}
\begin{align*}
 || \bar{g}_{t,x}(z) - \bar{g}_{t',x'}(z)||_{\mathscr{L}^2(Q)} &\leq L || || e^{-t}x + (1-e^{-2t})^{1/2}z - e^{-t'}x' - (1-e^{-2t'})^{1/2}z || ||_{\mathscr{L}^2(Q)}  \\ 
 &\leq L \Big|\Big| || e^{-t}x -e^{-t'}x'||  + |(1-e^{-2t})^{1/2} - (1-e^{-2t'})^{1/2}|\cdot ||z || \Big|\Big|_{\mathscr{L}^2(Q)} \\ 
  &\leq L \Bigg(|| e^{-t}x -e^{-t'}x'||  + |(1-e^{-2t})^{1/2} - (1-e^{-2t'})^{1/2}|\cdot ||z ||_{\mathscr{L}^2(Q)} \Bigg)\\
  &\leq L \Bigg(|| e^{-t}x -e^{-t'}x'||  + |e^{-2t} - e^{-2t'}|^{1/2}\cdot ||z ||_{\mathscr{L}^2(Q)} \Bigg) \\ 
&\leq L \Bigg(|| e^{-t}x -e^{-t'}x'||   + \sqrt{2}|t - t'|^{1/2}\cdot ||z ||_{\mathscr{L}^2(Q)} \Bigg)
 \end{align*}

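 In the last line we use that $\sup_{t\in [0,T]}|(e^{-2t})'| = 2$ and thus $e^{-2t}$ is 2-Lipschitz. The claim then follows since $a + c\,b \leq (1+c)(a+b)$ for all $a, b, c \geq 0$, applied with $a = || e^{-t}x -e^{-t'}x'||$, $b = |t - t'|^{1/2}$ and $c = \sqrt{2}||z ||_{\mathscr{L}^2(Q)}$.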
\end{proof}

\begin{replemma}{lem:envelope}
    Let $g : \sR^d \to \sR$ be $L$-Lipschitz with respect to the Euclidean norm. Then, for $F(z) := L((R \vee 1) + \sqrt{2}||z||)$ and all $x \in B^d(R)$, $t \in [0,T]$, we have
\begin{align}
    \Big|g\left(e^{- t}x+(1-e^{-2 t})^{1/2} z\right) - g(0)\Big| \leq F(z)
\end{align}
\end{replemma}
% Since $||\cdot|| \leq ||\cdot||_{\psi_2}$, $F \in L^2(P)$. 
\begin{proof}
By Lipschitz continuity for all $z \in \sR^d, x\in B^d(R), t\in [0,T]$ we have:

\begin{align}
    |g\left(e^{- t}x+(1-e^{-2 t})^{1/2} z\right) - g(0)| &\leq L || e^{- t}x+(1-e^{-2 t})^{1/2} z|| \\
    &\leq L ( e^{- t}||x||+(1-e^{-2 t})^{1/2} ||z||)
\end{align}  

Since both $e^{- t}$ and $(1-e^{-2 t})^{1/2}$ are at most $1$, and $||x|| \leq R$, we have:
\begin{align}
L ( e^{- t}||x||+(1-e^{-2 t})^{1/2} ||z||) &\leq L(R + ||z||) \\
&\leq L((R\vee 1) + ||z||) \leq F(z)
\end{align}  
\end{proof}


\section{Covering Number Results}


\begin{remark}\label{rem:metricspace}
    The space $([0,T]\times B^d(R),\rho_{OU})$ is a metric space, where 
\begin{align}
    \rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |t - t'|^{1/2}.
\end{align}
\end{remark}

\begin{proof}

\begin{itemize}
    \item \textbf{Positive definiteness}:
\begin{align}
    \rho_{OU}((t,x), (t',x')) &= 0 \Longleftrightarrow \\
    \label{eq::base}
    ||e^{-t}x-x'e^{-t'}|| + |t-t'|^{1/2} &= 0 \Longleftrightarrow \\
    \label{eq::conc}
    x=x' \text{ and } t&=t'.
\end{align}
Since in (\ref{eq::base}) both terms on the LHS are non-negative, each has to be $0$ to get the RHS; the second term gives $t=t'$ and then the first term gives $x=x'$, thus we get (\ref{eq::conc}). 

\item \textbf{Symmetry}:
\begin{align}
    \rho_{OU}((t,x),(t',x'))=\rho_{OU}((t',x'),(t,x)).
\end{align}

\item \textbf{Triangle inequality}:
we show triangle inequality on $(t,x),(t',x')$ and $(t'',x'')$. First let us note, that $||e^{-t}x-x'e^{-t'}||+||e^{-t'}x'-x''e^{-t''}||\geq ||e^{-t}x-x''e^{-t''}||$, since $||\cdot||$ has the triangle inequality. Now:
\begin{align}
    |t-t'|^{1/2}+|t'-t''|^{1/2} &\geq |t-t''|^{1/2} \Longleftrightarrow \\
    \label{eq::end}
    |t-t'|+2|t-t'|^{1/2}|t'-t''|^{1/2}+|t'-t''|&\geq |t-t''|.
\end{align}
(\ref{eq::end}) is true, since $|\cdot|$ has the triangle inequality and $2|t-t'|^{1/2}|t'-t''|^{1/2}\geq 0$.
\end{itemize}
\end{proof}


\begin{replemma}{lem:metlip}
Given the metric space $\big( [0,T] \times B^d(R) , \rho_{OU}\big)$ where:

\begin{align}
    \rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |t - t'|^{1/2}
\end{align}
and
\begin{align}
    ||(t,x)||_{OU} =  \rho_{OU}((t,x), (0, 0))= || e^{-t}x ||  + |t|^{1/2}
\end{align}
It follows that:
\begin{align}
     N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq  N([0,T] \times B^d(R),  \rho_{OU}, \epsilon) 
\end{align}

\end{replemma}
\begin{proof}

Consider the $\epsilon$-cover $A_{\rho_{OU}}$ with respect to $\rho_{OU}$ of $[0,T] \times B^d(R)$ it follows that for any $(t,x) \in [0,T] \times B^d(R)$ we have that there exists $(t',x') \in A_{\rho_{OU}}$  such that $\rho_{OU}((t,x), (t',x'))  \leq \epsilon$ then by Lemma \ref{lem:expected_lip} it follows that 
\begin{align}
 || \bar{g}_{t,x}(z) - \bar{g}_{t',x'}(z)||_{\mathscr{L}^2(Q)} &\leq L\left(1 + \sqrt{2}||z||_{\mathscr{L}^2(Q)} \right) \rho_{OU}((t,x), (t',x'))  \\ 
& \leq ||F ||_{\mathscr{L}^2(Q)} \rho_{OU}((t,x), (t',x'))  \\
& \leq ||F ||_{\mathscr{L}^2(Q)} \epsilon 
\end{align}
Hence the set:
\begin{align}
    \gG_{\rho_{OU}} = \{ \bar{g}_{t,x}:(t,x) \in A_{\rho_{OU}}\}
\end{align}
is an $\epsilon ||F ||_{\mathscr{L}^2(Q)}$ cover of $\gG$ with respect to the $\mathscr{L}^2(Q)$ norm, and its cardinality is at most that of $A_{\rho_{OU}}$.
\end{proof}

\begin{replemma}{lem:covprod}
We have that
\begin{align}
     N([0,T] \times B^d(R),  \rho_{OU}, \epsilon)  \leq  N([0,T], |\cdot|,  \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2)
\end{align}
\end{replemma}
\begin{proof}

Let $B^d_{r_0}(R)$ denote a Euclidean $d$-dimensional ball of radius $R$ centered at $r_0$ and let $B^{d+1}_{t_0 \oplus x_0, \rho}(R')$\footnote{$a\oplus b$ denotes the concatenation of $a$ and $b$.} denote its counterpart with respect to the metric $\rho$.  Now notice that if $|| e^{-t} x- e^{-t_0}x_0||  + | t-t_0|^{1/2} \leq \epsilon$ then $|| e^{-t_0}  (x-x_0)||  \leq \epsilon$ and  $|| x-x_0||  \leq  e^{t_0} \epsilon,$ thus,
\begin{align}
   \{t_0\}  \times B^d_{x_0}(\epsilon) \subseteq \{t_0\}  \times  B^d_{x_0}(e^{t_0} \epsilon) \subseteq  B^{d+1}_{t_0 \oplus x_0, \rho}(\epsilon),
\end{align}
then since $\{t_0\}  \times B^d_{x_0}(e^{t_0} \epsilon) \subseteq  B^{d+1}_{t_0 \oplus x_0, \rho}(\epsilon)$ we can construct an $\epsilon$ cover namely $A_{t_0}$ of $\{t_0\} \times B^d(R)$ with $N(B^d(R), ||\cdot ||, \epsilon e^{t_0})$ balls. Finally notice that if $|| e^{-t}  x- e^{-t_0}x_0||  + | t-t_0|^{1/2} \leq \epsilon$ it follows that $|t-t_0|^{1/2} \leq \epsilon$ thus $[0,T]$ can be covered in  $N([0,T], |\cdot|,  \epsilon^2) \leq T\epsilon^{-2}$ sub intervals.


Let $U_T$ be the smallest cover containing $N([0,T], |\cdot|,  \epsilon^2)$ intervals $u_n$ each centered at $t_n$ , then: 
\begin{align}
   A =  \bigcup_{u_n \in U_T} A_{t_n} 
\end{align} 

is an $\epsilon$ cover of $[0, T] \times B^d(R)$ (with respect to the metric $\rho_{OU}$), notice this follows as $\forall x \in B^d(R)$ there exists an $x_0$ such that 
\begin{align}
[t_n -\epsilon^2, t_n+ \epsilon^2] \times \{x\} \subseteq B^{d+1}_{t_n\oplus x_0, \rho}(\epsilon) \in A_{t_n}
\end{align}

Now we can see that 
\begin{align}
|A| \leq  |U_T| |A_0| &= N([0,T], |\cdot|,  \epsilon^2) N(B^d(R), ||\cdot ||, \epsilon e^{t_0}), \\
&\leq  N([0,T], |\cdot|,  \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) ,
\end{align}
where $|A_0 |=\max_{n}|A_{t_n} |$, completing our proof. \\


\Fran{
Note we could obtain a tighter bound, but this is not necessary for the proof (something like this, need to be more careful with the last step)
\begin{align}
    |A| \leq \sum_n  | A_{(n + 1) \epsilon^2/2}| &\leq C(d, \epsilon)\sum_n {e^{-4d(n + 1) \epsilon^2/2} } \\
    &=C(d, \epsilon)\frac{1-e^{-4dT}}{1-e^{-4d \epsilon^2/2}}
\end{align}
}
\end{proof}

From Lemmas \ref{lem:metlip}, \ref{lem:covprod} it follows that :
\begin{align}
      N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq N([0,T], |\cdot|,  \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2)
\end{align}
and thus it follows (see the start of Page 18 in \cite{tzen2019neural}) that Lemma C.4 and thus Theorem C.1 in \cite{tzen2019theoretical} hold true in our setting, with the modified choice of 
\begin{align}
 N=\left\lceil\left(\frac{C \sqrt{d}}{\varepsilon} \cdot L((R \vee 1)+ { \color{magenta}\sqrt{2d}}+\sqrt{6}) \cdot(16 \sqrt{6 \pi R d}+5 \sqrt{\log 4(d+1)})\right)^2\right\rceil,
\end{align}
for Theorem C.1., which we will restate now for completeness.

\begin{corollary}(Theorem C.1. from \cite{tzen2019theoretical})
    
 For any $\varepsilon>0$ and any $R>0$, there exist ${\color{magenta}N}=\operatorname{poly}(1 / \varepsilon, d, L, R)$ points $z_1, \ldots, z_{{\color{magenta}N}} \in \mathbb{R}^d$, for which the following holds:
\begin{align*}
\max _{n \leq N}\left\|z_n\right\| \leq 8 \sqrt{(d+6) \log {\color{magenta}N}} \\
\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left|\frac{1}{{\color{magenta}N}} \sum_{n=1}^{{\color{magenta}N}} \textcolor{magenta}{f\left(e^{- t}x+(1-e^{-2 t})^{1/2} z_n\right)}-U_t f(x)\right| \leq \varepsilon \\
\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left\|\frac{1}{{\color{magenta}N}} \sum_{n=1}^{{\color{magenta}N}} \nabla \textcolor{magenta}{f\left(e^{- t}x+(1-e^{-2 t})^{1/2} z_n\right)}-\nabla U_t f(x)\right\| \leq \varepsilon
\end{align*}
\end{corollary}

We now have everything that is required to show the neural network approximation results. 

\Fran{ OLD STUFF: This is wrong.  The bound on $|| e^{-t}x -e^{-t'}x'|| \leq || x -x'|| $  is incorrect. In fact you can actually show its not Lipchitz taking a first derivative.

We could instead define the metric:
\begin{align}
    d_{OU}((t,x), (t',x')) = || e^{t}x - x'e^{t'}||  + |e^{-2t} - e^{-2t'}|^{1/2}
\end{align}
We can then see that in this metric we have Lipchitz continuity with the proposed constant.

Then for the covering numbers we could potentially make the bound   :
\begin{align} \label{eq:ballcover}
    N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq N(B^d(R) \times [0,T], || \cdot||_{OU}, k(\epsilon))
\end{align}

Im not sure what the function $k(\epsilon)$ might be it could be something like $k(\epsilon) = \epsilon \ln \epsilon/L_1 + \epsilon^2/L_2$. Even if this was the case I wouldnt know how to factor $N(B^d(R) \times [0,T], || \cdot||_{OU}, k(\epsilon))$ or use results to bind its volume, but doesn't seem too hard as we are covering setshere rather than functions, with a slight wonky norm, this could all work out assuming Eq \ref{eq:ballcover} is right and we can find some reference for it.

}

\section{Neural Network Approximation}\label{apdx:approx}

\begin{corollary}\label{reg:corr}
Under Assumption \ref{assump:a1}, the vector field $\nabla \log U_t f(x)$ is bounded in norm by $\frac{L}{c}$ and is Lipschitz with constant $\frac{L}{c}+ \frac{L^2}{c^2}$, where $L$ is the maximum of the Lipschitz constants of $f$ and $\nabla f$.
\end{corollary}
\begin{proof}
    By direct application of Lemma B.1. (\cite{tzen2019theoretical}) and our Lemma \ref{lem:ou_commute}, which assures that OU semi-group commutes with the gradient operator, we have that the results of this Corollary hold. 
\end{proof}

We now proceed to adapt one of the main theorems in \cite{tzen2019theoretical}. Whilst the changes are minor to the sketch in \cite{tzen2019neural} some are subtle thus we have incorporated this proof for completeness. We highlight in {\color{magenta} magenta} the subtle changes required to adapt the result.

\begin{corollary}(Tzen and Raginsky)
\label{cor:th3.2}
 Let  $0<\varepsilon<4 L / c$   and  $R>0$ be given. Then there exists a neural net $\widehat{v}: \mathbb{R}^d \times[0,1] \rightarrow \mathbb{R}^d$ of size polynomial in $1 / \varepsilon, d, L, R, c, 1 / c$, such that the activation function of each neuron is an element of the set $\left\{\sigma, \sigma^{\prime}, \operatorname{ReLU}\right\}$, and the following holds:
$$
\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left\|\widehat{v}(x, {t})-\nabla \log U_t f(x)\right\| \leq \varepsilon
$$
and
$$
\max _{i \in[d]} \sup _{x \in \mathbb{R}^d} \sup _{t \in[0,1]}\left|\widehat{v}_i(x, {t})\right| \leq \frac{2 L}{c} .
$$
\end{corollary} 
\begin{proof}
Let $\delta=\frac{c^2 \varepsilon}{16 L}$. By Theorem C.1 (which has been proved to hold true in our settings in Appendix C), there exist points $z_1, \ldots, z_N \in \mathbb{R}^d$ with $N=\operatorname{poly}(1 / \delta, d, L, R)$, such that $R_{N, d}:=\max _{n \leq N}\left\|z_n\right\| \leq 8 \sqrt{(d+6) \log N}$, and the function $\varphi: \mathbb{R}^d \times[0,1] \rightarrow \mathbb{R}$ defined by
\begin{align}
{\color{magenta}\varphi(x, t)\coloneqq \frac{1}{N} \sum_{n=1}^N f\left(e^{-t}x+ (1- e^{-2t})^{1/2} z_n\right)}
\end{align}
satisfies
$$
\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left|\varphi(x, t)-U_t f(x)\right| \leq \delta \quad \text { and } \quad \sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left\|\nabla \varphi(x, t)-\nabla U_t f(x)\right\| \leq \delta
$$

By Assumption \ref{assump:a3}, there exists a neural net $\widehat{f}: \mathbb{R}^d \rightarrow \mathbb{R}$ that approximates $f$ and the gradient of $f$ to accuracy $\delta$ on the blown-up ball $\mathrm{B}^d\left(R+R_{N, d}\right)$. Then the function
$$
\widehat{\varphi}: \mathbb{R}^d \times[0,1] \rightarrow \mathbb{R}, \quad  {\color{magenta}\widehat{\varphi}(x, t):=\frac{1}{N} \sum_{n=1}^N \widehat{f}\left(e^{-t}x+ (1- e^{-2t})^{1/2} z_n\right)}
$$
can be computed by a neural net of $\operatorname{size} N \cdot \operatorname{poly}(1 / \delta, d, L, R)$, such that

\begin{align}
\begin{aligned}
& \sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left|\widehat{\varphi}(x, t)-U_t f(x)\right| \\
& \leq \sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}|\widehat{\varphi}(x, t)-\varphi(x, t)|+\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left|\varphi(x, t)-U_t f(x)\right| \\
&\leq {\color{magenta}\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]} } {\color{magenta}\left|\frac{1}{N} \sum_{n=1}^N \widehat{f}\left(x+ (1- e^{-2t})^{1/2} z_n\right)-\frac{1}{N} \sum_{n=1}^N {f}\left(x+ (1- e^{-2t})^{1/2} z_n\right)\right|}\nonumber\\
 &\quad \quad\quad +\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left|\varphi(x, t)-U_t f(x)\right| \\
& \quad \leq \sup _{x \in \mathrm{B}^d\left(R+R_{N, d}\right)}|\widehat{f}(x)-f(x)|+\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left|\varphi(x, t)-U_t f(x)\right| \leq 2 \delta
\end{aligned}
\end{align}

where the third inequality follows since $ {\color{magenta}e^{-t} \in [0,1]}$ and the final inequality follows since
\begin{align*}
 {\color{magenta}\max_n\sup_{t \in [0,1]} (1- e^{-2t})^{1/2} ||z_n|| =\max_n  ||z_n|| = R_{N,d}}
\end{align*}
Similarly
\begin{align*}
\begin{aligned}
& \sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left\|\nabla \widehat{\varphi}(x, t)-\nabla U_t f(x)\right\| \\
& \leq \sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\|\nabla \widehat{\varphi}(x, t)-\nabla \varphi(x, t)\|+\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left\|\nabla \varphi(x, t)-\nabla U_t f(x)\right\| \\
& \quad \leq \sup _{x \in \mathrm{B}^d\left(R+R_{N, d}\right)}\|\nabla \widehat{f}(x)-\nabla f(x)\|+\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left\|\nabla \varphi(x, t)-\nabla U_t f(x)\right\| \leq 2 \delta .
\end{aligned}
\end{align*}

Since $f$ is $L$-Lipschitz and bounded below by $c$, we have $U_t f(x) \geq \E_{Z \sim \mathcal{N}(0, I)}[c] = c$, and
\begin{align*}
    {\color{magenta}U_t f(x) =  \E_{Z \sim \mathcal{N}(0, I)} \left[f( e^{-t}x +(1-e^{-2t} )^{1/2} Z )\right] }&{\color{magenta}\leq \E_{Z \sim \mathcal{N}(0, I)} \left[ L(||x|| + \sqrt{2} ||z||) + f(0)\right]} \\
     &{\color{magenta}= L||x|| + f(0) + L \sqrt{2}  \E[||z||]} \\
    &{\color{magenta}\leq L(||x|| +  \sqrt{2d}) + f(0)}
\end{align*}
Thus it follows that
${\color{magenta}c \leq U_t f(x) \leq L(\|x\|+\sqrt{2d})+f(0)}$ for any $x \in \mathbb{R}^d$ and $t \in[0,1]$. Therefore, on $\mathrm{B}^d(R) \times[0,1]$,
$$
{\color{magenta}\frac{c}{2} \leq \widehat{\varphi}(x, t) \leq L(R+\sqrt{2d})+f(0)+\frac{c}{2}}
$$

where we use $\delta \leq c/4$. Without loss of generality, we may assume that $L \geq 1$. Then, for any $x \in \mathrm{B}^d(R)$ and $t \in[0,1]$
$$
\begin{aligned}
& \left\|\nabla \log \widehat{\varphi}(x, t)-\nabla \log U_t f(x)\right\| \\
& =\left\|\frac{\nabla \widehat{\varphi}(x, t)}{\widehat{\varphi}(x, t)}-\frac{\nabla U_t f(x)}{U_t f(x)}\right\| \\
& \leq \frac{1}{\widehat{\varphi}(x, t)}\left\|\nabla \widehat{\varphi}(x, t)-\nabla U_t f(x)\right\|+\left\|\frac{\nabla U_t f(x)}{U_t f(x)}\right\| \frac{\left|\widehat{\varphi}(x, t)-U_t f(x)\right|}{\widehat{\varphi}(x, t)} \\
& \leq \frac{2 L}{c} \cdot 2 \delta+\frac{L}{c} \cdot \frac{2}{c} \cdot 2 \delta \\
& \leq \frac{\varepsilon}{2},
\end{aligned}
$$
where we have used Corollary \ref{reg:corr} to bound $\left\|\frac{\nabla U_t f}{U_t f}\right\| \leq L / c$. In other words, $\nabla \log \widehat{\varphi}(x, t)$ approximates $\nabla \log U_t f(x)$ to accuracy $\varepsilon / 2$ uniformly on $\mathrm{B}^d(R) \times[0,1]$. It remains to approximate $\nabla \log \widehat{\varphi}(x, t)$ by a neural net to accuracy $\varepsilon / 2$.

To that end, we first represent $\nabla \log \widehat{\varphi}(x, t)$ as a composition of several elementary operations and then approximate each step by a neural net. Specifically, the computation of $v_i=\partial_i \log \widehat{\varphi}(x, t)$ can be represented as a computation graph with the following structure:
\begin{enumerate}
    \item  Compute $a=\widehat{\varphi}(x, t)$.
    \item Compute $b_i=\partial_i \widehat{\varphi}(x, t)$.
    \item Compute $r=1 / a$.
    \item  Compute $v_i=r b_i$.
\end{enumerate}
Given $x$ and $t, a$ is computed by a neural net with activation function $\sigma$, of size $\operatorname{poly}(1 / \delta, d, L, R)$ and depth poly $(1 / \delta, d, L, R)$. Therefore, by the cheap gradient principle (Lemma D.1 from \cite{tzen2019theoretical}), $b_i$ can be computed by a neural net of size poly $(1 / \delta, d, L, R)$, where the activation function of each neuron is an element of the set $\left\{\sigma, \sigma^{\prime}\right\}$. Next, since $a$ takes values in $[c / 2, L(R+\sqrt{2d})+f(0)+c / 2]$, by Lemma D.2 from \cite{tzen2019theoretical} the reciprocal $r=1 / a$ can be computed to accuracy $\varepsilon /(4 L \sqrt{d})$ by a 2 -layer neural net with activation function $\sigma$ and of size
$$
\mathcal{O}\left(\frac{4}{c^2} \cdot\textcolor{magenta}{(L(R+\sqrt{2d})+f(0)+c / 2) }\cdot \frac{4 L \sqrt{d}}{\varepsilon}\right) \leq \operatorname{poly}(1 / \varepsilon, d, L, R, c, 1 / c)
$$
Let $\widehat{r}$ denote the resulting approximation. Then, since $\left|b_i\right| \leq 2 L$ and $|\widehat{r}| \leq 2 / c+\varepsilon /(4 L \sqrt{d}) \leq 4 / c$, by Lemma D.2 the product $\widehat{r} b_i$ can be approximated to accuracy $\varepsilon / 4 \sqrt{d}$ by a 2-layer neural net with activation function $\sigma$ and with at most
$$
\mathcal{O}\left((4 / c \vee 2 L)^2 \cdot \frac{4 \sqrt{d}}{\varepsilon}\right) \leq \operatorname{poly}(1 / \varepsilon, d, L, 1 / c)
$$
neurons. The overall accuracy of the approximation is
$$
\left|\widehat{v}_i-v_i\right| \leq\left|\widehat{v}_i-\widehat{r} b_i\right|+\left|\widehat{r} b_i-r b_i\right| \leq \frac{\varepsilon}{2 \sqrt{d}}
$$
Thus, the vector $v=\left(v_1, \ldots, v_d\right)$ can be $\varepsilon / 2$-approximated by $\tilde{v}(x, t)$, where $\tilde{v}: \mathbb{R}^d \times[0,1] \rightarrow \mathbb{R}^d$ is a neural net with vector-valued output that has the $\operatorname{size} \operatorname{poly}(1 / \varepsilon, d, L, R, c, 1 / c)$. Finally, since $\sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left|\tilde{v}_i(x, t)\right| \leq 2 L / c$, the function
$$
\widehat{v}_i(x, t):=\min \left\{\max \left\{\tilde{v}_i(x, t),-2 L / c\right\}, 2 L / c\right\}
$$
is continuous, takes values in $[-2 L / c, 2 L / c]$ and coincides with $\tilde{v}_i$ on $\mathrm{B}^d(R) \times[0,1]$. Moreover, the min and max operations can each be implemented exactly using $\mathcal{O}(1)$ ReLU neurons.
\end{proof}

\begin{repproposition}{col:est}
Suppose Assumptions 1--3 are in force. Let $L$ denote the maximum of the Lipschitz constants of $f$ and $\nabla f$. Then for all $0< \epsilon < 16L^2/c^2$, there exists a neural net $\hat{v} : \mathbb{R}^d \times [0,1] \to \mathbb{R}^d$ with size polynomial in $1/\epsilon, d, L, c, 1/c$ such that the activation function of each neuron is an element of the set $\{\sigma, \sigma', \mathrm{ReLU}\}$, and the following holds: if $\{\hat{x}_t\}_{t\in[0,1]}$ is the diffusion process governed by the It\^o SDE:
\begin{align} %\label{SDE}
d\hat{x}_t = \hat{b}(\hat{x}_{t}, t)\dd t + \sqrt{2 \beta} \dd W_t
\end{align}
with $\hat{x}_0 \sim p_1 \approx \gN(0, I)$ and drift $\hat{b}(x,t) = - (x - 2 \hat{v}(x, 1-t))$, then $\hat{\mu} := \mathrm{Law}(\hat{x}_1)$ satisfies $D(\mu \| \hat{\mu}) \leq \epsilon$.
\end{repproposition}
\begin{proof}

 For any $R>0$, Corollary \ref{cor:th3.2} guarantees the existence of a neural net $\widehat{v}: \R^d \times[0,1] \rightarrow \R^d$ that satisfies
\begin{equation}\label{eq:error_bound}
    \sup _{x \in \mathrm{B}^d(R)} \sup _{t \in[0,1]}\left\|\widehat{v}(x, {t})-\nabla \log U_t f(x)\right\| \leq \sqrt{\varepsilon}
\end{equation}
 and
\begin{align}\label{eq:network_bound}
     \max _{i \in[d]} \sup _{x \in \mathbb{R}^d} \sup _{t \in[0,1]}\left|\widehat{v}_i(x, {t})\right| \leq \frac{2 L}{c} .
\end{align}

 Let $\boldsymbol{\mu}:=\operatorname{Law}\left(x_{[0,1]}\right)$ and $\widehat{\boldsymbol{\mu}}:=\operatorname{Law}\left(\widehat{x}_{[0,1]}\right)$. The Girsanov formula gives
$$
\KL(\boldsymbol{\mu} \| \widehat{\boldsymbol{\mu}})=\frac{1}{2} \int_0^1 \mathbf{E}\left\|b\left(x_t, t\right)-\widehat{b}\left(x_t, t\right)\right\|^2 \mathrm{~d} t
$$

where the interchange of the integral and the expectation follows from Fubini's theorem because both $b$ and $\widehat{b}$ are bounded by Corollary \ref{reg:corr} and (\ref{eq:network_bound}). We now proceed to estimate the integrand. For each $t \in[0,1]$
$$
\begin{aligned}
& \mathbf{E}\left\|b\left(x_t, t\right)-\widehat{b}\left(x_t, t\right)\right\|^2 \\
& =\mathbf{E}\left[\left\|b\left(x_t, t\right)-\widehat{b}\left(x_t, t\right)\right\|^2 \cdot \mathbf{1}\left\{x_t \in \mathrm{B}^d(R)\right\}\right]+\mathbf{E}\left[\left\|b\left(x_t, t\right)-\widehat{b}\left(x_t, t\right)\right\|^2 \cdot \mathbf{1}\left\{x_t \notin \mathrm{B}^d(R)\right\}\right] \\
& =: T_1+T_2,
\end{aligned}
$$
where $T_1 \leq \varepsilon$ by (\ref{eq:error_bound}). To estimate $T_2$, we first observe that, since the OU drift is bounded in norm by $L / c$ by Corollary~\ref{reg:corr}, we have
$$
\mathbf{P}\left\{\sup _{t \in[0,1]}\left\|x_t\right\| \geq R\right\} \leq \frac{\sqrt{d}+L / c}{R}
$$
(\cite{bubeck2018sampling}, Lemma 3.8). Therefore,
$$
T_2 \leq \frac{9 d L^2}{c^2} \cdot \frac{\sqrt{d}+L / c}{R}
$$
Since some of the bounds differ from the original \cite{tzen2019theoretical} we verify that the bound still holds for our drift. We used that $d \geq 2$.
\begin{align*}
    \begin{aligned}
        T_2 = \mathbf{E}\left[\left\|b\left(x_t, t\right)-\widehat{b}\left(x_t, t\right)\right\|^2 \cdot \mathbf{1}\left\{x_t \notin \mathrm{B}^d(R)\right\}\right] = \int_{x_t \notin \mathrm{B}^d(R)} \|b\left(x_t, t\right)-\widehat{b}\left(x_t, t\right)\|^2 dP_{x_t} \leq
        \\ \leq \int_{x_t \notin \mathrm{B}^d(R)}
        2\|b\left(x_t, t\right)\|^2+2\|\widehat{b}\left(x_t, t\right)\|^2 dP_{x_t} 
        \leq 
        \int_{x_t \notin \mathrm{B}^d(R)} 
        2\|b\left(x_t, t\right)\|^2 +
        2 d \left(\frac{2L}{c}\right)^2  dP_{x_t} \leq
        \\ \leq  
        \int_{x_t \notin \mathrm{B}^d(R)} 2\|  \nabla \ln  U_t f(x_t)\|^2 +
        8 d \left(\frac{L}{c}\right)^2  dP_{x_t} = \int_{x_t \notin \mathrm{B}^d(R)} 2\left\|  \frac{\nabla U_t f(x_t)}{ U_t f(x_t)}\right\|^2  +
        8 d \left(\frac{L}{c}\right)^2  dP_{x_t} \leq
        \\ \leq \int_{x_t \notin \mathrm{B}^d(R)} 2 \left(\frac{L}{c}\right)^2  +
        8 d \left(\frac{L}{c}\right)^2  dP_{x_t} \leq 9d\frac{L^2}{c^2}    P \left\{ \sup_{t\in [0,1]}\|x_t\| \geq R\right\} \leq \frac{9 d L^2}{c^2} \cdot \frac{\sqrt{d}+L / c}{R}
    \end{aligned}
\end{align*}

Choosing $R$ large enough to guarantee $T_2 \leq \varepsilon$ and putting everything together, we obtain $D(\boldsymbol{\mu} \| \widehat{\boldsymbol{\mu}}) \leq \varepsilon$. Therefore, $D(\mu \| \widehat{\mu}) \leq D(\boldsymbol{\mu} \| \widehat{\boldsymbol{\mu}}) \leq \varepsilon$ by the data processing inequality.
\end{proof}

Finally, we would like to highlight what happens when we sample $\hat{x}_0 \sim \gN(0,I)$ rather than $p_T$. Whilst our results are stated for $t \in [0, 1]$, one can see that the overall approximation results also hold for $t \in [0, T]$.

\begin{repremark}{rem:approx}
    Assume $\pi$ satisfies a logarithmic Sobolev inequality. If we extend the time domain to $t\in [0,T]$ and approximate the initial law by sampling $\hat{x}_0 \sim \gN(0,I)$, it follows that $D(\mu \| \hat{\mu}) \leq  e^{-T}\KL(\pi \| \gN(0,I)) + T\epsilon$.
\end{repremark}

\begin{proof}

First, we remark that the estimation results and the results in Proposition \ref{col:est} apply to the $t \in [0,T]$ setting, however, they will introduce a polynomial dependency in $T$ for the size of the network.
    
As in the above proof, we apply the Girsanov theorem to control the path KL; however, here the starting distributions of the two It\^o processes are no longer the same, and thus we get an extra term from the chain rule:
\begin{align}
    \KL(\boldsymbol{\mu} \| \widehat{\boldsymbol{\mu}})&=\KL(p_T \| \gN(0,I)) +\frac{1}{2} \int_0^T \mathbf{E}\left\|b\left(x_t, t\right)-\widehat{b}\left(x_t, t\right)\right\|^2 \mathrm{~d} t  \\
    &\leq \KL(p_T \| \gN(0,I)) +T \epsilon \\
    &\leq e^{-T}\KL(\pi \| \gN(0,I)) +T\epsilon ,
\end{align}
where the final inequality follows from Theorem 5.2.1 in \cite{bakry2014analysis} under the assumption that $\pi$ satisfies a log-Sobolev inequality. This completes the circle and fully extends Theorem 3.1 from \cite{tzen2019theoretical} to our denoising diffusion setting. 

\end{proof}

Finally, note that if we assume that $\operatorname{supp} \pi \subseteq \mathrm{B}^d(R)$, then from Theorem 2 of \cite{chen2022sampling} it follows that:
\begin{align}
    \mathrm{TV}\left(\law \hat{x}_T, \pi\right) \leq \gO \left( {\sqrt{\mathrm{KL}\left(\pi \| \gN(0, I)\right)} \exp (-T)}+ {\epsilon \sqrt{T}} \right) .
\end{align}
This result complements Proposition~\ref{col:est}: unlike \cite{chen2022sampling}, we no longer need to assume an $\epsilon$ error on the score, but instead prove that such an error can be attained.





% Instead let $\hat{x}_0 \sim \gN(0,I)$, ${x}_0 \sim p_T$ and $\tilde{x}_0 \overset{a.s}{=} \hat{x}_0 $ where $\tilde{x}$ has the same drift as ${x}_t$ (i.e. $b$), then:
% \begin{align}
%   \gW_2(\law \hat{x}_T , \law {x}_T)   \leq  \gW_2(\law \hat{x}_T , \law 
%   \tilde{x}_T)  +  \gW_2(\law {x}_T , \law \tilde{x}_T) 
% \end{align}

% Treating terms separately
% \begin{align}
%     || \hat{x}_T - \tilde{x}_T ||^2_2 &\leq || \hat{x}_0 - \tilde{x}_0 ||^2_2 + \int_0^T ||\hat{b}(\hat{x}_t, t) - \hat{b}(\tilde{x}_t, t)||^2_2\mathrm{d}t + 2 || \hat{x}_0 - x_0 ||\int_0^T ||\hat{b}(\hat{x}_t, t) - {b}(\tilde{x}_t, t)||_2\mathrm{d}t  \nonumber \\
%     &\leq 2|| \hat{x}_0 - \tilde{x}_0 ||^2_2 + 2\int_0^T ||\hat{b}(\hat{x}_t, t) - {b}(\tilde{x}_t, t)||^2_2\mathrm{d}t  \nonumber \\ &\leq 2|| \hat{x}_0 - x_0 ||^2_2 + 4\int_0^T ||\hat{b}(\hat{x}_t, t) - \hat{b}(\tilde{x}_t, t)||^2_2\mathrm{d}t + 4\int_0^T ||\hat{b}(\tilde{x}_t, t) - {b}(\tilde{x}_t, t)||^2_2\mathrm{d}t  \nonumber \\
%     &\leq 2|| \hat{x}_0 - \tilde{x}_0 ||^2_2 + 4\int_0^T C_1||\hat{x}_t - \tilde{x}_t||^2_2\mathrm{d}t + 4\int_0^T ||\hat{b}(\tilde{x}_t, t) - {b}(\tilde{x}_t, t)||^2_2\mathrm{d}t  \nonumber \\
% \end{align}
% taking expectations on both sides yields:
% \begin{align}
%    \E || \hat{x}_T - \tilde{x}_T ||^2_2 &\leq  4C_1\int_0^T \E ||\hat{x}_t - \tilde{x}_t||^2_2\mathrm{d}t + 4\epsilon\nonumber 
% \end{align}
% % \cite{sarantsev2020convergence} and C
% The term $\E|| \hat{x}_0 - \tilde{x}_0 ||^2_2=0$ by construction and  Corollary \ref{reg:corr} on the term involving the drifts. Now we can apply Gronwalls inequality \citep{oksendal2003stochastic} :
% \begin{align}
%   \E || \hat{x}_T - \tilde{x}_T ||^2_2 &\leq    4 \epsilon   e^{4C_1T} \implies  \gW_2(\law \hat{x}_T , \law \tilde{x}_T) \leq    2\epsilon^{1/2}   e^{2C_1T} 
% \end{align}
% Finally the term $\gW_2(\law \hat{x}_T , \law {x}_T) \leq C_2 e^{-T/2}$ via can use standard results on the convergence of the OU process \citep{sarantsev2020convergence, debortoli2022convergence} thus:
% \begin{align}
%     \gW_2(\law \hat{x}_T , \law {x}_T)   \leq  C_2 e^{-T/2} +  2\epsilon^{1/2} e^{2C_1T} 
% \end{align}
\end{document}
