\documentclass{article}
\usepackage{graphicx} % Required for inserting images
\usepackage{arxiv}

\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc}    % use 8-bit T1 fonts
\usepackage{hyperref}       % hyperlinks
\usepackage{url}            % simple URL typesetting
\usepackage{booktabs}       % professional-quality tables
\usepackage{amsfonts}       % blackboard math symbols
\usepackage{nicefrac}       % compact symbols for 1/2, etc.
\usepackage{microtype}      % microtypography
\usepackage{lipsum}		% Can be removed after putting your text content
\usepackage{graphicx}
\usepackage{natbib}
\usepackage{xcolor}
\usepackage{mathtools}


\usepackage{algorithm}
\usepackage{algpseudocode}

\usepackage{doi}

% \usepackage{algorithmic}
% \usepackage[linesnumbered, ruled]{algorithm2e} 

\usepackage{adjustbox}
\usepackage{amsmath}

\usepackage{listings}
\usepackage{xcolor}
\usepackage{amsthm}

% Labelled equals signs: each macro typesets `=` with a short upright text
% label stacked above it. The label is wrapped in \mathclap (mathtools) so the
% relation keeps the width of a plain `=`.
\newcommand{\myeqrt}{%
  \stackrel{\mathclap{\normalfont\mbox{RT}}}{=}%
}
\newcommand{\myeqhint}{%
  \stackrel{\mathclap{\normalfont\mbox{Hint}}}{=}%
}
\newcommand{\myeqSL}{%
  \stackrel{\mathclap{\normalfont\mbox{Steins}}}{=}%
}

% Restatement support: lets a theorem-like statement be repeated later in the
% document under the number of the original statement.
%
% rep@theorem is an unnumbered (starred) theorem whose printed title is whatever
% \rep@title expands to at the time the environment is opened.
%
% \newreptheorem{<env>}{<Title>} defines the environment rep<env>. That
% environment takes one mandatory argument, the \label key of the statement
% being repeated; it sets \rep@title to "<Title> \ref{<key>}" and then opens
% rep@theorem. Example: \begin{replemma}{lem:foo} ... \end{replemma}.
% The trailing % signs keep line ends from adding spaces to the definition.
\makeatletter
\newtheorem*{rep@theorem}{\rep@title}
\newcommand{\newreptheorem}[2]{%
\newenvironment{rep#1}[1]{%
 \def\rep@title{#2 \ref{##1}}%
 \begin{rep@theorem}}%
 {\end{rep@theorem}}}
\makeatother

% Counter behind the numbered `problem` environment, explicitly reset to zero.
\newcounter{set}
\setcounter{set}{0}

% problem: steps the `set` counter (via \refstepcounter, so a \label placed
% inside refers to the problem number) and opens a trivlist whose single item
% is headed by a bold "Problem <n>.".
\newenvironment{problem}%
  {\refstepcounter{set} \begin{trivlist}
   \item[\hskip \labelsep {\bfseries Problem}\hskip \labelsep {\bfseries \arabic{set}.}]}%
  {\end{trivlist}}

% Theorem-like environments. No \theoremstyle is selected in this preamble, and
% no declaration shares a counter, so each environment is numbered on its own.

% Numbered statements.
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{proposition}{Proposition}
\newtheorem{corollary}{Corollary}
\newtheorem{observation}{Observation}
\newtheorem{remark}{Remark}
\newtheorem{assumption}{Assumption}
\newtheorem{definition}{Definition}
\newtheorem{solution}{Solution}

% Restatement counterparts (reptheorem, replemma, ...), built with
% \newreptheorem; each takes the label of the statement it repeats.
\newreptheorem{theorem}{Theorem}
\newreptheorem{lemma}{Lemma}
\newreptheorem{proposition}{Proposition}
\newreptheorem{corollary}{Corollary}
\newreptheorem{observation}{Observation}
\newreptheorem{remark}{Remark}
\input{math_commands.tex}

\title{Creative Component}
\author{Teodora Reu}
\date{March 2023}

\begin{document}

\maketitle


\section{Creative Component}
\begin{problem}
    As we mentioned before, the objective used in \cite{anonymous2023denoising} has some bias due to sampling $\mX_0$ from $\gN(0, \sigma^2\I)$ instead of $p_T$. The goal of this question is to formally derive and understand how this bias affects the minima.

We would like you to find a bound of the form:
\begin{align}
    \left| \inf_{f}\gL^{\mathrm{true}}(f)  -  \inf_{f} \gL^{\mathrm{practice}}(f) \right| \leq E(T)
\end{align}
using  the last equation of section 4.2 in \cite{de2022convergence} which implies:
\begin{align}
    \gW_1\left(\gN(0 , \sigma^2 \I), p_T\right) \leq C e^{-T / \bar{\beta}}
\end{align}
for some constants $C$ and $\bar{\beta}$ defined in \cite{de2022convergence}, and $\gW_1$ denotes the Wasserstein-1 metric. Why is this bound useful to the practitioner? Explain. 

\paragraph{Assumption 1 (Lipschitz Value Function)}{ We define the value function as}
\begin{align}
v(\vx,s) =-\ln\phi(\vx, s) =  \inf_{f} \E_{\mX_t}\left[ \sigma^2\int_s^T \beta_t|| f(\mX_t,t)||^2 \dd t + \ln \frac{\gN(\mX_T; 0, \sigma^2I)}{\pi(\mX_T)} \Big | \mX_s = \vx\right],
\end{align}

Assume it is $L$-Lipschitz uniformly in $s \in [0, T]$, that is:
\begin{align}
    |v(\vx,s) - v(\vy,s)| \leq L || \vx - \vy ||
\end{align}



\paragraph{Hint 1  (Value function):} {How does $v(\vx, 0)$ relate to our objectives?
}

\paragraph{Hint 2 (Couplings):}{If we have:}
\begin{align}
    \E_{Y \sim p } [f(Y)] + \E_{X \sim q} [g(X)]
\end{align}
we can put both terms under a single expectation over any ``coupling'' (a joint distribution with the desired marginals):
\begin{align}
    \E_{Y \sim p } [f(Y)] + \E_{X \sim q} [g(X)] = \E_{X,Y \sim \gamma}[f(Y) + g(X) ]
\end{align}
such that:
\begin{align}
    \int \gamma(x,y) dy = q(x), \quad \int \gamma(x,y) dx = p(y)
\end{align}
Finally, remember that the $\argmin$ in $\gW_1$ is itself a coupling; thus we can pick said coupling to ``merge'' the expectations.

\end{problem}

\begin{solution}
    This bound is useful for practitioners because it allows them to control the magnitude of the error that arises from the fact that we sample from $\gN(x_T; 0, \sigma^2 I)$ instead of sampling from $p_T = \gN(x_T; \sqrt{1-\lambda_T}\pi, \sigma^2 \lambda_T I)$, where $\lambda_T = 1 - \exp(-2\int_0^T\beta_s \dd s)$ and the schedule $\beta_s$ is such that $\int_0^T \beta_s \dd s$ increases strictly with $T$ and satisfies $\int_0^T \beta_s \dd s \gg 1$. Indeed, as Figure~1 in the problem set shows, $p_T$ approaches $\gN(x_T; 0, \sigma^2 I)$ exponentially fast, but its mean attains $0$ only in the limit $T \to \infty$.

Note that $v(\vx, 0) =-\ln\phi(\vx, 0) =  \inf_{f} \E_{\mX_t}\left[ \sigma^2\int_0^T \beta_t|| f(\mX_t,t)||^2\dd t + \ln \frac{\gN(\mX_T; 0, \sigma^2I)}{\pi(\mX_T)} \Big | \mX_0 = \vx\right] $. Now I will make the following  claim and prove it. 

\begin{align}
    \E_{x \sim p_T}[v(x,0)] = \inf_{f} \gL^{\mathrm{true}}(f)
\end{align}

\begin{align}
    \E_{x \sim \gN(0,\sigma^2 I)}[v(x,0)] = \inf_{f} \gL^{\mathrm{practice}}(f)
\end{align}

I will prove only the first of these identities; the proof of the second is analogous.

\begin{align}
\inf_{f} \gL^{\mathrm{true}}(f) &= \E_{x \sim p_T}[v(x,0)] = \int v(x,0) p_T(x) \dd x \nonumber \\
&= \E_{\mX_0 \sim p_T}\left[\inf_{f} \E_{\mX_t}\left[ \sigma^2\int_0^T \beta_t|| f(\mX_t,t)||^2\dd t + \ln \frac{\gN(\mX_T; 0, \sigma^2I)}{\pi(\mX_T)} \Big | \mX_0 = \vx\right]\right] \nonumber \\
&= \inf_{f}\E_{\mX_0 \sim p_T}\left[ \E_{\mX_t}\left[ \sigma^2\int_0^T \beta_t|| f(\mX_t,t)||^2\dd t + \ln \frac{\gN(\mX_T; 0, \sigma^2I)}{\pi(\mX_T)} \Big | \mX_0 = \vx\right]\right] \label{inf}
\end{align}

Now notice how $\E_{\mX_0 \sim p_T}\left[ \E_{\mX_t}\left[ \ln \frac{\gN(\mX_T; 0, \sigma^2I)}{\pi(\mX_T)}| \mX_0 = \vx \right]\right] = \E_{\mX_0 \sim p_T}\left[\ln \frac{\gN(\mX_T; 0, \sigma^2I)}{\pi(\mX_T)} \right]$, 

and for $\E_{\mX_0 \sim p_T}\left[ \E_{\mX_t}\left[ \sigma^2\int_0^T \beta_t|| f(\mX_t,t)||^2\dd t | \mX_0 = \vx \right]\right] 
= \E_{\mX_0 \sim p_T}\left[ \E_{\mX_t}\left[ \sigma^2\int_0^T \beta_t|| f(\mX_t,t)||^2\dd t  \right]\right] 
= \E_{\mX_0 \sim p_T}\left[ \sigma^2 \E_{\mX_t}\left[ \int_0^T \beta_t|| f(\mX_t,t)||^2\dd t  \right]\right]
= \E_{\mX_0 \sim p_T}\left[ \sigma^2 \int_x p_{X_t}(x) \int_0^T \beta_t|| f(x,t)||^2\dd t \dd x \right] 
= \E_{\mX_0 \sim p_T}\left[ \sigma^2 \int_0^T \int_x  p_{X_t}(x) \beta_t|| f(x,t)||^2\dd x \dd t \right] 
= \E_{\mX_0 \sim p_T}\left[ \sigma^2 \int_0^T \E_{X_t}[ \beta_t|| f(X_t,t)||^2 ] \dd t \right]
$.

Now we can simply arrive at either equation (40), or (41). Continuing with the end of the proof. 

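\begin{align}
    \left| \inf_{f}\gL^{\mathrm{true}}(f)  -  \inf_{f} \gL^{\mathrm{practice}}(f) \right|
    &= \left| \E_{x \sim p_T}[v(x,0)] - \E_{y \sim \gN(0, \sigma^2 \I)}[v(y,0)]\right| \\
    &= \left| \E_{x,y \sim \gamma}[v(x,0) - v(y,0)]\right| ,
\end{align}
where $\gamma$ is a coupling of $p_T$ and $\gN(0, \sigma^2 \I)$; we choose the optimal coupling, i.e.\ the one attaining $\gW_1$. We will now apply Jensen's inequality followed by the $L$-Lipschitz property of $v(\cdot, 0)$:
\begin{align}
    \left| \E_{x,y \sim \gamma}[v(x,0) - v(y,0)]\right|
    &\leq \E_{x,y \sim \gamma}\left[\left| v(x,0) - v(y,0) \right|\right]
    \leq L \, \E_{x,y \sim \gamma}[||x-y||] \nonumber \\
    &= L \gW_1\left(\gN(0 , \sigma^2 \I), p_T\right) \leq LCe^{-T/\bar{\beta}} .
\end{align}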

\paragraph{Note:} At Equation~\eqref{inf}, moving the $\inf_f$ out of the expected value is not a straightforward step. What is certain is that for an arbitrary random variable $X$ and any family of functions $\gF$, $\E[\inf_{f \in \gF} h(X,f)]  \leq \inf_{f \in \gF}\E[h(X,f)]$. This is because for any $X$ and any $f \in \gF$ we have $\inf_{f' \in \gF} h(X,f') \leq h(X,f)$. Therefore $\E[\inf_{f' \in \gF} h(X,f')] \leq \E [h(X,f)]$. So $\E[\inf_{f' \in \gF} h(X,f')]$ is a lower bound for $\E[h(X,f)]$ for any $f \in \gF$; in conclusion it must be that $\E[\inf_{f \in \gF} h(X,f)] \leq \inf_{f \in \gF} \E[ h(X,f)]$.

In \cite{article}, the authors provide theoretical conditions under which that equality holds. In order to be able to write the equality we need the family $\gF$ to be ``almost downward filtering'' and the infimum of the expected value to be bounded below. In other words, $\inf_{f \in \gF}\E[h(X,f)] > -\infty$ (in our case this holds because $f$ is a neural network and $X$ is distributed according to $p_{t}$). For the other assumption, we need the following condition: for any $\epsilon > 0$ and $f_1, f_2 \in \gF$ there exists $f \in \gF$ such that $\E[h(X, f) - \min[h(X, f_1),  h(X, f_2)]] < \epsilon$.
\end{solution}

\section{Parallel between F\"{o}llmer Drift technical guarantees and DDS drift}

In \cite{tzen2019theoretical}, the authors offer a set of theoretical quantitative guarantees in the form of an error bound for uniformly approximating the F\"{o}llmer drift with a multilayer feedforward neural network. With these technical assurances in mind, our intention is to explore whether they can be extended to the DDS drift. This will be achieved by closely examining all the assumptions, lemmas, and theorems presented in \cite{tzen2019theoretical} to determine their applicability to our scenario. The fundamental difference between \cite{vargas2023denoising} and \cite{tzen2019theoretical} pertains to the choice of drift, which leads to different semigroups: pinned Brownian motion results in the F\"{o}llmer drift and the heat semigroup, whereas the Ornstein--Uhlenbeck (OU) process results in the DDS drift and the OU semigroup. Notably, the main theorems and lemmas presented in \cite{tzen2019theoretical} are grounded on heat semigroup definitions, and our aim is to transfer these to the OU semigroup definition. Additionally, the approach taken by the authors in \cite{tzen2019theoretical} involves controlled diffusion, whereas we work with reverse diffusion and score matching; these two are equivalent, as outlined in the lecture notes and as will be demonstrated in the subsequent section.

\subsection{Introduction to the problem}
Remember that the DDS drift can be written in the following way:
\begin{align}
b_{\mathrm{DDS}}(x, t)=-\beta_{T-t}\left(x-2 \sigma^2 \nabla_x \ln \phi_{T-t}(x)\right)
\end{align}
To simplify the problem we will consider for all $t$, that $\beta_t = \beta$.
\begin{align}
b_{\mathrm{DDS}}(x, t)=-\beta\left(x-2 \sigma^2 \nabla_x \ln \phi_{T-t}(x)\right)
\end{align}
with:
\begin{align}
f_{\mathrm{DDS}}(x,t) &=\nabla_x \ln \phi_{T-t}^{\mathrm{DDS}}(x)\\
&= \nabla_x\ln \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[\frac{d \pi}{d\mathcal{N}\left(0, \sigma^2 I\right)}\left(e^{- \beta(T-t)} x+\sigma\left(1-e^{-2 \beta (T-t)}\right)^{1 / 2} Z\right)\right]
\end{align}
Now we will introduce the F\"{o}llmer drift
\begin{align}
b_{\mathrm{Foll}}(x, t)=\sigma^2 \nabla_x \ln \phi_{T-t}(x)
\end{align}
with:
\begin{align}
f_{\mathrm{Foll}}(x,t)=  \nabla_x\ln \phi_{T-t}^{\mathrm{Foll}}(x)= \nabla_x\ln \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[\frac{d \pi}{d\mathcal{N}\left(0, \sigma^2 T I\right)}(x+\sigma\sqrt{T-t} Z)\right]
\end{align}
In \cite{tzen2019theoretical} the authors prove that $f_{\mathrm{Foll}}(x,t)$ can be approximated by a neural network well. We want to transfer these results to the DDS drift and ideally find an even tighter bound for the error.


\subsection{Semigroup Formulation}
As the authors of \cite{tzen2019theoretical} formalize their proofs by using semigroups we are going to introduce both of these and their formulas. 

\paragraph{Heat Semigroup} Consider
\begin{align}
\nabla_x\ln \phi_{T-t}^{\mathrm{Foll}}(x)= \nabla_x\ln \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[\frac{d \pi}{d\mathcal{N}\left(0, \sigma^2 T I\right)}(x+\sigma\sqrt{T-t} Z)\right]
\end{align}

We define the heat semigroup operator acting on a function $h$ as follows:

\begin{align}
Q_{t}[h] (x)= \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[h(x+\sigma\sqrt{t} Z)\right]
\end{align}

Taking $g(x) = \frac{d\pi}{d\gN(0, \sigma^2TI)}(x)$, we can write

\begin{align}
f_{\mathrm{Foll}}(x,t) = \nabla_x \ln Q_{T-t}[g](x)
\end{align}

\paragraph{OU Semigroup}
The OU semigroup is defined by
\begin{align}
U_{t}[h] (x)= \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[h\left(e^{-\beta t}x+\sigma(1-e^{-2\beta t})^{1/2} Z\right)\right]
\end{align}
For simplicity we can work with the case where $\sigma = \beta = 1$. Then, as before, we can rewrite the optimal drift via:
\begin{align}
f_{\mathrm{DDS}}(x,t) = \nabla_x \ln U_{T-t}[g_2](x)
\end{align}

with $g_2(x) = \frac{d\pi}{d\gN(0, \sigma^2 I)}(x)$.

\subsection{Building up the theory}
We are going to touch upon the main result of \cite{tzen2019theoretical}, namely Theorem~3.1, and adapt it to the DDS drift \cite{vargas2023denoising} and its respective semigroup. In order to do so, we have to assume that all the assumptions made by the authors hold.

\begin{assumption}\label{a1}
The function $h$ is differentiable, both $h$ and $\nabla h$ are $L$-Lipschitz, and there exists a constant $c \in (0,1]$ such that $h \geq c$ everywhere.
\end{assumption}

For the rest, we are going to assume that \textbf{Assumption 2} (bounding $h^*(x) = a + \sum \alpha_i \sigma(\beta_i x +\gamma_i)$, where $\sigma$ is the activation function) and \textbf{Assumption 3} (requires uniform approximability of both $h$ and $\nabla h$) from \cite{tzen2019theoretical} hold.
The authors show that this assumption holds for Gibbs measures; however, we need to prove that the regularity also holds for the DDS drift, in parallel to Lemma~B.1 of \cite{tzen2019theoretical}. This is done in the corollary below, after the following lemma.
\begin{lemma}
The OU semigroup commutes with the gradient operator, that is, for $h\colon \mathbb{R}^d \to \mathbb{R}$ we have $\nabla U_t h = U_t \nabla h$.
\end{lemma}

\begin{proof}
Firstly we need the OU semigroup $U_t h(x) = \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[h\left(e^{-\beta t}x+\sigma(1-e^{-2\beta t})^{1/2} Z\right)\right]$ to commute with the gradient operator for any differentiable and Lipschitz $h\colon \mathbb{R}^d \to \mathbb{R}$, that is, $\nabla U_t h = U_t \nabla h$. Let $j(x, z) = e^{-\beta t}x+\sigma(1-e^{-2\beta t})^{1/2} z$. We can prove this by using dominated convergence, with:
\[
\nabla U_t h = \lim_{n \to \infty} \frac{ U_t h(x+1/n) - U_t h(x)}{1/n} = \lim_{n \to \infty} \frac{\E_Z[h(j(x+1/n, Z))] - \E_Z[h(j(x, Z))]}{1/n}
\]
\[
=  \lim_{n \to \infty} \frac{\E_Z[h(j(x+1/n, Z)) - h(j(x, Z))]}{1/n} = \lim_{n \to \infty} \int \gN(z; 0, I) \frac{h(j(x+1/n, z)) - h(j(x, z))}{1/n} \dd z .
\]
The integrand $\gN(z; 0, I) \frac{h(j(x+1/n, z)) - h(j(x, z))}{1/n}$ is dominated by an integrable function: since $h$ is $L$-Lipschitz and $j(\cdot, z)$ is a contraction in $x$ (its slope is $e^{-\beta t} \leq 1$), the difference quotient is bounded in absolute value by a constant multiple of $L$ that depends neither on $n$ nor on $z$, and a constant times the Gaussian density is integrable. Moving forward by applying the dominated convergence theorem we get:
\[
=  \int \gN(z; 0, I) \lim_{n \to \infty}
 \frac{h(j(x+1/n, z)) - h(j(x, z))}{1/n} \dd z = \int \gN(z; 0, I) \nabla h \, \dd z = U_t \nabla h . \qedhere
\]
\end{proof}

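\paragraph{Note:} There might be a mistake in taking the derivative of the composition. By the chain rule, $\nabla_x \left[h(j(x, z))\right] = e^{-\beta t} (\nabla h)(j(x, z))$, so my intuition is that a factor $e^{-\beta t}$ should come out of the derivative, giving $\nabla U_t h = e^{-\beta t} U_t \nabla h$ rather than exact commutation. Since $e^{-\beta t} \leq 1$, the bound $||\nabla U_t h(x)|| \leq L$ would still hold.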

\begin{corollary}[Regularity of the DDS drift]
Under Assumption~\ref{a1}, the DDS drift $b(x,t) = \nabla \ln U_t h(x)$ is bounded in norm by $L/c$ and is Lipschitz with Lipschitz constant $L/c + L^2/c^2$, where $L$ is the maximum of the Lipschitz constants of $h$ and $\nabla h$.
\end{corollary}


Now that we have them commuting, with $h(x) \geq c$ and $||\nabla h(x)|| \leq L$ for all $x$, we have $U_t h(x) = \E[h(j(x,Z))] \geq \E[c] = c$ and $||\nabla U_t h(x)|| = || U_t\nabla h(x)|| \leq \E[L] = L$. From here on, the argument is identical to the proof of Lemma~B.1 in \cite{tzen2019theoretical}.

 Moving forward we are going to state the main parallel result, corresponding with Theorem 3.1 from \cite{tzen2019theoretical}, in the shape of a Lemma.

\begin{lemma}
Suppose Assumptions 1--3 are in force. Let $L$ denote the maximum of the Lipschitz constants of $h$ and $\nabla h$. Then for all $0< \epsilon < 16L^2/c^2$, there exists a neural net $\hat{v}\colon \mathbb{R}^d \times [0,1] \to \mathbb{R}^d$ with size polynomial in $1/\epsilon, d, L, c, 1/c$ such that the activation function of each neuron is in the set $\{\sigma, \sigma', \mathrm{ReLU}\}$, and the following holds: if $\{\hat{X}_t\}_{t\in[0,1]}$ is the diffusion process governed by the It\^{o} SDE
\begin{align}
d\hat{X}_t = \hat{b}(\hat{X}_{t}, t)dt + \sqrt{2 \beta} dW_t
\label{SDE}
\end{align}
with $X_T \sim \gN(0, \sigma^2)$ and with the drift $\hat{b}(x,t) = -\beta (x - 2 \sigma\hat{v}(x, \left(1-e^{-2 \beta (T-t)}\right)^{1 / 2}))$, then $\hat{\mu} := \mathrm{Law}(\hat{X}_0)$ satisfies $D(\mu||\hat{\mu}) \leq \epsilon$.
\end{lemma}


The proof of this Lemma will rely on three steps: (1) prove that the OU semigroup can be approximated by a finite sum of the form $\frac{1}{N}\sum_{n < N} h\left(e^{-\beta t}x+\sigma(1-e^{-2\beta t})^{1/2} z_n\right)$ uniformly for all $x \in B^d(R)$ and all $t \in [0, T]$, where $z_1, z_2, \dots, z_N \in \mathbb{R}^d$ lie in a ball of radius
$\gO(\sqrt{d\ln N})$; (2) replacing $h$ with a suitable neural net approximation, we build on this result to show that the DDS drift $\nabla \ln U_t h(x)$ can be approximated by a neural net using the activation functions above; (3) the final step is to use Girsanov's theorem to prove the bounds.

\begin{lemma}
Let $g\colon \mathbb{R}^d \to \mathbb{R}$ be $L$-Lipschitz with respect to the Euclidean norm. Let $Z_1, Z_2, \dots, Z_N$ be i.i.d.\ copies of a $d$-dimensional random vector $Z$, such that $U := ||Z||$ has finite $\psi_2$ norm. Then, for $F(z) := L((R \vee 1) + ||z||)$, we have
\begin{align}
    |g\left(e^{-\beta t}x+(1-e^{-2\beta t})^{1/2} z\right) - g(0)| \leq F(z)
\end{align}
for all $z \in \mathbb{R}^d$, $x \in B^d(R)$ and $t \in [0,T]$.
\end{lemma}

\paragraph{Note:} We removed $\sigma$ for simplicity. (The bound is different otherwise, namely $F(z) := L((R \vee 1) + \sigma||z||)$.)

\begin{proof}
Since $||\cdot|| \leq ||\cdot||_{\psi_2}$, $F \in L^2(P)$. By Lipschitz continuity, for all $z \in \mathbb{R}^d$, $x\in B^d(R)$, $t\in [0,T]$ we have:
\[
|g\left(e^{-\beta t}x+(1-e^{-2\beta t})^{1/2} z\right) - g(0)| \leq L || e^{-\beta t}x+(1-e^{-2\beta t})^{1/2} z|| \leq L ( e^{-\beta t}||x||+(1-e^{-2\beta t})^{1/2} ||z||) .
\]
Since both $e^{-\beta t}$ and $(1-e^{-2\beta t})^{1/2}$ are at most $1$, we have:
\[
L ( e^{-\beta t}||x||+(1-e^{-2\beta t})^{1/2} ||z||) \leq L(R + ||z||) \leq L((R\vee 1) + ||z||)= F(z) .
\]
So the bound is preserved.
\end{proof}

\begin{remark}
Referring to Lemma 3, maybe a tighter bound could be found by using the Cauchy--Schwarz inequality or something similar: $e^{-\beta t}||x||+(1-e^{-2\beta t})^{1/2} ||z|| \leq \sqrt{(e^{-2\beta t} + 1-e^{-2\beta t})(||x||^2 + ||z||^2)} = \sqrt{||x||^2 + ||z||^2} \leq \sqrt{R^2 + ||z||^2} \leq \sqrt{(R \vee 1)^2 + ||z||^2}$.

The bound is tighter since $\sqrt{R^2 + ||z||^2} \leq R + ||z||$ iff $R^2 + ||z||^2 \leq R^2 + ||z||^2 + 2R||z||$, which holds since $R > 0$ and $||z|| \geq 0$.
\end{remark}

\begin{corollary}
From Lemma 3, Theorem~C.1 from \cite{tzen2019theoretical} now applies to the OU semigroup, so the OU semigroup can be approximated by a finite sum of the form $\frac{1}{N}\sum_{n < N} h\left(e^{-\beta t}x+\sigma(1-e^{-2\beta t})^{1/2} z_n\right)$ uniformly for all $x \in B^d(R)$ and all $t \in [0, T]$, where $z_1, z_2, \dots, z_N \in \mathbb{R}^d$ lie in a ball of radius
$\gO(\sqrt{d\ln N})$.
\end{corollary}

\begin{corollary}
From Corollary 1, we can conclude that Theorem~3.2 from \cite{tzen2019theoretical} also applies to the OU drift, and guarantees the existence of a neural net $\hat{v}\colon \mathbb{R}^d \times [0,1] \to \mathbb{R}^d$ that satisfies
\[
\sup_{x \in B^d(R)} \sup_{t \in [0,1]} ||\hat{v}(x, t) -\nabla \ln U_t h(x)|| \leq \epsilon
\]
and
\[
\max_{i \in [d]}\sup_{x \in B^d(R)} \sup_{t \in [0,1]} |\hat{v}_i(x, t)| \leq \frac{2L}{c} .
\]
\end{corollary}

\begin{corollary}
Given Corollaries 2 and 3, and by applying Girsanov's theorem to $\mu := \mathrm{Law}(X_{[0,1]})$ and $\hat{\mu} := \mathrm{Law}(\hat{X}_{[0,1]})$, we can now conclude that Lemma 2 is true.
\end{corollary}

\section{Conclusion}

In this paper, we investigated whether the theoretical guarantees offered by \cite{tzen2019theoretical} for approximating the F\"{o}llmer drift with a neural network can be extended to the DDS drift. While the two drifts differ, we showed that the approach in \cite{tzen2019theoretical} can be adapted to the DDS drift with some modifications; specifically, we introduced the OU semigroup in place of the heat semigroup. In this way, we quantified the expressiveness of the DDS model \cite{vargas2023denoising} through the lens of \cite{tzen2019theoretical}: we can efficiently sample from a wide class of terminal target distributions by choosing the drift to be approximated by a multilayer feedforward neural network.
\bibliographystyle{unsrtnat}
\bibliography{old.bib}  %%% Uncomment this line and comment out the ``thebibliography'' section below to use the external .bib file (using bibtex) .

\end{document}
