 % use the "wcp" class option for workshop and conference
 % proceedings
 %\documentclass[gray]{jmlr} % test grayscale version
 %\documentclass[tablecaption=bottom]{jmlr}% journal article
 \documentclass[tablecaption=bottom,wcp]{jmlr} % W&CP article

 % The following packages will be automatically loaded:
 % amsmath, amssymb, natbib, graphicx, url, algorithm2e

 %\usepackage{rotating}% for sideways figures and tables
 %\usepackage{longtable}% for long tables

 % The booktabs package is used by this sample document
 % (it provides \toprule, \midrule and \bottomrule).
 % Remove the next line if you don't require it.
\usepackage{booktabs}
\usepackage{scalerel}
% \usepackage{todonotes}
 % The siunitx package is used by this sample document
 % to align numbers in a column by their decimal point.
 % Remove the next line if you don't require it.
\usepackage[load-configurations=version-1]{siunitx} % newer version
 %\usepackage{siunitx}

 % The following command is just for this sample document:
\newcommand{\cs}[1]{\texttt{\char`\\#1}}% remove this in your real article

\input{math_commands.tex}


% \newreptheorem{<env>}{<Title>} defines an environment rep<env> that takes
% one mandatory argument (a label key) and typesets an unnumbered theorem
% whose heading is "<Title> \ref{<key>}". It is used further down to restate
% results that were already numbered where they were first stated.
\makeatletter
\newtheorem*{rep@theorem}{\rep@title}
\newcommand{\newreptheorem}[2]{%
\newenvironment{rep#1}[1]{%
 \def\rep@title{#2 \ref{##1}}%
 \begin{rep@theorem}}%
 {\end{rep@theorem}}}
\makeatother

% Counter that numbers the problem environments; explicitly set to zero.
\newcounter{set}
\setcounter{set}{0}

% problem environment: steps the counter with \refstepcounter (so a \label
% placed inside refers to the problem number) and typesets a bold
% "Problem <n>." label as the single item of a trivlist.
\newenvironment{problem}{\refstepcounter{set} \begin{trivlist}
\item[\hskip \labelsep {\bfseries Problem}\hskip \labelsep {\bfseries \arabic{set}.}]}{\end{trivlist}}


\usepackage{todonotes}
%\usepackage[disable]{todonotes}
% \iftodonotes{<enabled>}{<disabled>}: expands to its second argument when the
% conditional \if@todonotes@disabled is true and to its first argument otherwise.
\makeatletter

\newcommand*\iftodonotes{\if@todonotes@disabled\expandafter\@secondoftwo\else\expandafter\@firstoftwo\fi}   % defines \iftodonotes{<true>}{<false>}, thanks to https://tex.stackexchange.com/questions/126559/conditional-based-on-packageoption
\makeatother
% \noindentaftertodo: issues \noindent only when todonotes is not disabled;
% it is appended to the inline (capitalized) note macros below.
\newcommand{\noindentaftertodo}{\iftodonotes{\noindent}{}}
% Note that these macros accept optional arguments such as size=\small, bordercolor=red, and so on.  Capitalized versions are inline paragraphs instead of margin notes.
% \fixme[<todo options>]{<text>}: yellow \todo note with an empty caption.
\newcommand{\fixme}[2][]{\todo[color=yellow,size=\scriptsize,fancyline,caption={},#1]{#2}} % to mark stuff that you know is missing or wrong when you write the text
% \note[<todo options>]{<author>}{<color>}{<text>}: generic authored note on
% which the per-author macros below are built.
\newcommand{\note}[4][]{\todo[author=#2,color=#3,size=\scriptsize,fancyline,caption={},#1]{#4}}

% Per-author wrappers: the lowercase macro passes its options straight to \note;
% the capitalized macro adds the `inline' option and then calls \noindentaftertodo.
\newcommand{\anna}[2][]{\note[#1]{Anna}{violet!40}{#2}}
\newcommand{\Anna}[2][]{\anna[inline,#1]{#2}\noindentaftertodo}
% 
\newcommand{\fran}[2][]{\note[#1]{Francisco}{orange!40}{#2}}
\newcommand{\Fran}[2][]{\fran[inline,#1]{#2}\noindentaftertodo}

\newcommand{\teo}[2][]{\note[#1]{Teo}{green!40}{#2}}
\newcommand{\Teo}[2][]{\teo[inline,#1]{#2}\noindentaftertodo}

% Theorem-like environments. The commented-out declarations are left disabled;
% presumably the jmlr class already provides these environments -- TODO confirm.
% \newtheorem{theorem}{Theorem}
% \newtheorem{lemma}{Lemma}
% \newtheorem{proposition}{Proposition}
\newtheorem{assumption}{Assumption}
% \newtheorem{corollary}{Corollary}

% Restatement environments (repproposition, repcorollary, ...), each taking the
% label of the result being restated as its argument; see \newreptheorem.
\newreptheorem{proposition}{Proposition}


\newreptheorem{corollary}{Corollary}
\newreptheorem{theorem}{Theorem}
\newreptheorem{lemma}{Lemma}
\newreptheorem{observation}{Observation}
\newreptheorem{remark}{Remark}
% Additional numbered environments, each with its own counter.
\newtheorem{solution}{Solution}
\newtheorem{observation}{Observation}


 % Define an unnumbered theorem just for this sample document for
 % illustrative purposes. The four style settings below (upright body,
 % small-caps header, a colon after the header, a line break after it) are
 % issued immediately before the unnumbered nnote ("Note") environment is declared.
\theorembodyfont{\upshape}
\theoremheaderfont{\scshape}
\theorempostheader{:}
\theoremsep{\newline}
\newtheorem*{nnote}{Note}

\jmlrproceedings{AABI 2023}{5th Symposium on Advances in Approximate Bayesian Inference, 2023}

 % The optional argument of \title is used in the header
\title[Approximation Remarks for DDS]{Approximation Remarks for Denoising Diffusion Based Samplers
}

 % Anything in the title that should appear in the main title but 
 % not in the article's header or the volume's table of
 % contents should be placed inside \titletag{}

 %\title{Title of the Article\titletag{\thanks{Some footnote}}}


 % Use \Name{Author Name} to specify the name.
 % If the surname contains spaces, enclose the surname
 % in braces, e.g. \Name{John {Smith Jones}} similarly
 % if the name has a "von" part, e.g \Name{Jane {de Winter}}.
 % If the first letter in the forenames is a diacritic
 % enclose the diacritic in braces, e.g. \Name{{\'E}louise Smith}

 % \thanks must come after \Name{...} not inside the argument for
 % example \Name{John Smith}\nametag{\thanks{A note}} NOT \Name{John
 % Smith\thanks{A note}}

 % Anything in the name that should appear in the title but not in the 
 % article's header or footer or in the volume's
 % table of contents should be placed inside \nametag{}

% Anonymous authors (leave as is; do not reveal author names for your submission)
\author{\Name{Anonymous Authors}\\
  \addr Anonymous Institution}
% THE SUBMISSION MUST REMAIN ANONYMOUS

% Two authors with the same address
% \author{\Name{Author Name1\nametag{\thanks{A note}}} \Email{abc@sample.com}\and
%  \Name{Author Name2} \Email{xyz@sample.com}\\
%  \addr Address}

 % Three or more authors with the same address:
 % \author{\Name{Author Name1} \Email{an1@sample.com}\\
 %  \Name{Author Name2} \Email{an2@sample.com}\\
 %  \Name{Author Name3} \Email{an3@sample.com}\\
 %  \Name{Author Name4} \Email{an4@sample.com}\\
 %  \Name{Author Name5} \Email{an5@sample.com}\\
 %  \Name{Author Name6} \Email{an6@sample.com}\\
 %  \Name{Author Name7} \Email{an7@sample.com}\\
 %  \Name{Author Name8} \Email{an8@sample.com}\\
 %  \Name{Author Name9} \Email{an9@sample.com}\\
 %  \Name{Author Name10} \Email{an10@sample.com}\\
 %  \Name{Author Name11} \Email{an11@sample.com}\\
 %  \Name{Author Name12} \Email{an12@sample.com}\\
 %  \Name{Author Name13} \Email{an13@sample.com}\\
 %  \Name{Author Name14} \Email{an14@sample.com}\\
 %  \addr Address}


 % Authors with different addresses:
 % \author{\Name{Author Name1} \Email{abc@sample.com}\\
 % \addr Address 1
 % \AND
 % \Name{Author Name2} \Email{xyz@sample.com}\\
 % \addr Address 2
 %}



\begin{document}

\maketitle

\begin{abstract}
Denoising diffusion models are a class of generative models which have recently achieved state-of-the-art results across many domains. One gradually adds noise to data using a diffusion to transform the data distribution into a Gaussian. Samples from the generative model are then obtained by simulating an approximation of the time-reversal of this diffusion initialized by Gaussian samples. Recent work has seen the adaptation of diffusion models to sampling and inference tasks. In this paper we leverage known connections to stochastic control akin to the F\"ollmer drift to extend established universal approximation results in stochastic control to denoising diffusion models and samplers. 
\end{abstract}

% Keywords may be removed
%\begin{keywords}
%List of keywords
%\end{keywords}

\section{Introduction}
\label{sec:intro}

\Fran{We should trim down the intro to not talk about MCMC as much and talk a bit more about DDPM/Gen modelling also. Bit more of a strong sell.}
Let $\pi$ be a probability density on $\mathbb{R}^d$ of the form
\begin{equation}
    \pi(x)=\frac{\gamma(x)}{Z},\qquad Z=\int_{\mathbb{R}^d} \gamma(x) \mathrm{d}x,
\end{equation}
where $\gamma:\mathbb{R}^d \rightarrow \mathbb{R}^{+}$ can be evaluated pointwise but the normalizing constant $Z$ is intractable. We are here interested in both estimating $Z$ and obtaining approximate samples from $\pi$.

A large variety of Monte Carlo techniques has been developed to tackle this problem. In particular, Annealed Importance Sampling (AIS) \citep{neal2001annealed} and its Sequential Monte Carlo (SMC) extensions \citep{del2006sequential} are often regarded as the gold standard to compute normalizing constants. Variational techniques are a popular alternative to Markov Chain Monte Carlo (MCMC) and SMC where one considers a flexible family of easy-to-sample distributions $q^{\theta}$ whose parameters are optimized by minimizing a suitable metric, such as the reverse Kullback--Leibler discrepancy $\KL(q^{\theta}||\pi)$.

Over recent years, Monte Carlo techniques have also been fruitfully combined with variational techniques. For example, AIS can be thought of as a procedure where $q^{\theta}(x,u)$ is the joint distribution of a Markov chain defined by a sequence of MCMC kernels whose final state is $x$ while $p^{\theta}(x,u)$ is the corresponding AIS extended target \citep{neal2001annealed}. The parameters $\theta$ can then be learned by minimizing $\KL(q^{\theta}||p^{\theta})$ using gradient-based methods \citep{wunoe2020stochastic,Geffner:2021,Thin:2021,ZhangAIS2021,doucet2022annealed,geffner2022langevin}.

Recent work \citep{vargas2023denoising} proposes a new class of samplers coined denoising diffusion samplers (DDS). This method leverages Denoising Diffusion Probabilistic Models (DDPM), a powerful class of generative models \citep{sohl2015deep,ho2020denoising,song2020score}, to sample from unnormalised densities. In this context, one adds noise progressively to data using a diffusion to transform the complex target distribution into a Gaussian distribution. The time-reversal of this diffusion can then be used to transform a Gaussian sample into a sample from the target. In this work, we will explore in more detail the connection between denoising diffusion models and stochastic control established in \cite{vargas2023denoising} and leverage it to extend theoretical properties such as the ones derived in \cite{tzen2019theoretical} to denoising diffusion samplers (DDS).

\section{Denoising Diffusion Models and Samplers}\label{sec:DDSCT}
For the purpose of this work we will introduce Denoising Diffusions and DDS in continuous time. Let $\mathcal{C}=C([0,T],\mathbb{R}^d)$ be the space of continuous functions from $[0,T]$ to $\mathbb{R}^d$ and $\mathcal{B}(\mathcal{C})$ the Borel sets on $\mathcal{C}$. We consider path measures which are probability measures on $(\mathcal{C},\mathcal{B}(\mathcal{C}))$ \citep{leonard2013survey}. For synergy with the results in \cite{tzen2019theoretical} we will introduce DDS \citep{vargas2023denoising} with the time reversals flipped, meaning we interchange the backwards and the forwards processes compared to \citep{vargas2023denoising,ho2020denoising,song2020denoising,de2021diffusion}.

\subsection{Backwards diffusion and its time-reversal}
Consider the forward noising diffusion given by a time-reversed Ornstein--Uhlenbeck (OU) process.
\begin{equation}\label{eq:forwarddiffusionP}
    \mathrm{d}x_t=-\beta_t x_t \mathrm{d}t+\sigma \sqrt{2\beta_t}\mathrm{d}B_t,\footnote{\cite{song2020denoising} refer to this SDE as the VP-SDE.}\qquad x_0 \sim \pi, 
\end{equation}
where $(B_t)_{t\in[0,T]}$ is a $d$-dimensional Brownian motion and $t \rightarrow \beta_t$ is a non-decreasing positive function. This diffusion induces the path measure $\mathcal{P}$ on the time interval $[0,T]$ and the marginal density of $x_t$ is denoted $p_t$. The transition density of this diffusion is given by $p_{t|0}(x_t|x_0)=\mathcal{N}(x_t;\sqrt{1-\lambda_t}x_0,\sigma^2 \lambda_t I)$ where $\lambda_t=1-\exp(-2\int^t_0\beta_s \mathrm{d}s)$. We will always consider a scenario where  $\int_0^T \beta_s \mathrm{d}s \gg 1$ so that $p_T(x_T)\approx \mathcal{N}(x_T;0,\sigma^2 I)$. 

From \citep{haussmann1986time},  its time-reversal $(y_t)_{t\in[0,T]}=(x_{T-t})_{t\in[0,T]}$, where equality is here in distribution, yields the forward time diffusion:
\begin{equation}\label{eq:exacttimereversalCT}
    \mathrm{d}y_t=\beta_{T-t}\{y_t+2\sigma^2 \nabla \log p_{T-t}(y_t)\} \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}}\mathrm{d}W_t,\qquad y_0 \sim p_{T},
\end{equation}
where $(W_t)_{t\in[0,T]}$ is another $d$-dimensional Brownian motion. By definition this time-reversal starts from $y_0 \sim p_T(y_0)\approx \mathcal{N}(y_0;0,\sigma^2 I)$ and is such that $y_T \sim \pi$. This suggests that if we could approximately simulate the diffusion (\ref{eq:exacttimereversalCT}), then we would obtain approximate samples from $\pi$.  However, putting this idea into practice requires being able to approximate the intractable scores $(\nabla \log p_t(x))_{t \in [0,T]}$. Unlike in DDPM, score matching techniques are not feasible here, as sampling from (\ref{eq:forwarddiffusionP}) requires sampling $x_0 \sim \pi$, which is impossible by assumption.


\subsection{Reference diffusion and value function}\label{sec:refdiffusionvaluefunction}
In our context, it is useful to introduce a \emph{reference} process defined by the diffusion following (\ref{eq:forwarddiffusionP}), but initialized at $p^{\textup{ref}}_0(x_0)=\mathcal{N}(x_0;0,\sigma^2 I)$ rather than $\pi(x_0)$ thus ensuring that the marginals of the resulting path measure $\mathcal{P}^{\textup{ref}}$ all satisfy $p^{\textup{ref}}_t(x_t)=\mathcal{N}(x_t;0,\sigma^2 I)$. Following \cite{vargas2023denoising} we can identify $\mathcal{P}$  as the path measure minimizing the following half bridge \citep{bernton2019SBsamplers,vargasshro2021, debortoli2021diffusion}:
\begin{align}
 \mathcal{P}= \argmin_\mathcal{Q} \{\KL(\mathcal{Q}||\mathcal{P}^{\textup{ref}}): q_T=\pi\}.
\end{align}
A representation of $\mathcal{P}^{\textup{ref}}$ is given by 
\begin{equation}\label{eq:timereversalrefprocessCT}
    \mathrm{d}y_t =-\beta_{T-t} y_t \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}} \mathrm{d}W_t,\qquad y_0 \sim p^{\textup{ref}}_0.
\end{equation}
As $\beta_{T-t}\{ y_t +2\sigma^2 \nabla \log p^{\textup{ref}}_{T-t}(y_t)\}=-\beta_{T-t} y_t$, we can rewrite the time-reversal (\ref{eq:exacttimereversalCT}) of $\mathcal{P}$ as 
\begin{equation}\label{eq:diffusionvaluefunction}
    \mathrm{d}y_t= -\beta_{T-t} \{y_t -2\sigma^2 \nabla \log \phi_{T-t}(y_t)\} \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}} \mathrm{d}W_t,\qquad y_0 \sim p_T,
\end{equation}
where $v_t(x) = -\ln \phi_t(x)=-\ln \bigl(p_t(x)/p^{\textup{ref}}_t(x)\bigr)$ is known as the value function \citep{fleming2012deterministic,Pham:2009,nusken2021solving,tzen2019theoretical}.

\subsection{Learning the Forward Diffusion}
To approximate the time-reversal (\ref{eq:exacttimereversalCT}) of $\mathcal{P}$, consider a path measure $\mathcal{Q}^\theta$ which is induced by
\begin{equation}\label{eq:Qthetascore}
\mathrm{d}y_t=\beta_{T-t}\{y_t+2\sigma^2 s_{\theta}(T-t,y_t) \} \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}}\mathrm{d}W_t,\qquad y_0  \sim \mathcal{N}(0,\sigma^2 I),
\end{equation}
so that $y_t \sim q^{\theta}_{t}$. To obtain $s_{\theta}(t,x) \approx \nabla \log p_t(x)$, we parameterize $s_{\theta}(t,x)$ by a neural network whose parameters are obtained by minimizing
\begin{align}\label{eq:scorematchinglike}
 \KL(\mathcal{Q}^\theta||\mathcal{P})
    &=\KL(\mathcal{N}(0,\sigma^2 I)||p_T)+\sigma^2 \mathbb{E}_{\mathcal{Q}^\theta}\Bigr[\scaleobj{.8}{\int_0^T} \beta_{T-t}||s_\theta(T-t,y_t)-\nabla \log p_{T-t}(y_t)||^2 \mathrm{d}t \Bigr],\nonumber
\end{align}
This expression of the KL is reminiscent of the expression obtained in \cite[Theorem 1]{song2021maximum} in the context of DDPM. However, unlike in DDPM \citep{ho2020denoising}, one cannot get rid of the intractable scores $(\nabla \log p_{t}(x))_{t \in [0,T]}$ using score matching ideas.

Instead, using (\ref{eq:diffusionvaluefunction}), \cite{vargas2023denoising} reparameterize $\mathcal{Q}^\theta$ using
\begin{equation}\label{eq:approximatetimereversalCT}
    \mathrm{d}y_t=-\beta_{T-t}\{y_t -2\sigma^2 f_{\theta}(T-t,y_t)\} \mathrm{d}t+\sigma \sqrt{2\beta_{T-t}} \mathrm{d}W_t,\qquad y_0 \sim \mathcal{N}(0,\sigma^2 I),
\end{equation}
This reparameterization allows us to express  $\KL(\mathcal{Q}^\theta||\mathcal{P})$ and an equivalent formulation of the half-bridge problem in compact form:
\begin{align}
\KL(\mathcal{Q}^\theta||\mathcal{P})&
=\mathbb{E}_{\mathcal{Q}^\theta} \Bigr[ \sigma^2 \scaleobj{.8}{\int_0^T} \beta_{T-t} ||f_\theta(T-t,y_t)||^2 \mathrm{d}t
     +\scaleobj{.8}{\ln \left(\frac{\gN(y_T; 0, \sigma^2 I)}{\pi(y_T)}\right)} \Bigr]\label{eq:KLpathintegral}.
\end{align}
Here $q^\theta_0 = p_T \approx \gN(0,\sigma^2I)$. Given $\theta$ minimizing (\ref{eq:KLpathintegral}), approximate samples from $\pi$ can be obtained by simulating (\ref{eq:approximatetimereversalCT}) and returning $y_T\! \sim \!q^{\theta}_T$. Additionally, \cite{vargas2023denoising} propose a variety of unbiased estimators for the normalising constant $Z$. 

\section{Main Results}

In this section, we present our main results. The first main result consists of a direct adaptation of the main theorem in \cite{tzen2019theoretical} to the denoising diffusion sampler methodology from \cite{vargas2023denoising}. By directly relating the estimators we consider to the score of the VP-SDE (Equation~\ref{eq:forwarddiffusionP}), we motivate how these estimation results apply to DDPM-based methods \citep{song2020denoising, ho2020denoising,huang2021variational}.

\begin{corollary} \label{col:est}
Suppose Assumptions \ref{a1}--\ref{a3} are in force. Let $L$ denote the maximum of the Lipschitz constants of $f$ and $\nabla f$. Then for all $0< \epsilon < 16L^2/c^2$, there exists a neural net $\hat{v} : \mathbb{R}^d \times [0,1] \to \mathbb{R}^d$ with size polynomial in $1/\epsilon, d, L, c, 1/c$ such that the activation function of each neuron is in the set $\{\sigma, \sigma', \mathrm{ReLU}\}$, and the following holds: if $\{\hat{X}_t\}_{t\in[0,1]}$ is the diffusion process governed by the It\^o SDE:

\begin{align}
d\hat{X}_t = \hat{b}(\hat{X}_{t}, t)\dd t + \sqrt{2 \beta} \dd W_t
\label{SDE}
\end{align}
with $\hat{X}_0 \sim p_T \approx \gN(0, I)$ and drift $\hat{b}(x,t) = - (x - 2 \hat{v}(x, \left(1-e^{-2  (T-t)}\right)^{1 / 2}))$, then $\hat{\mu} := \mathrm{Law}(\hat{X}_T)$ satisfies $D(\mu||\hat{\mu}) \leq \epsilon$.
\end{corollary}
The proof will strongly follow the same structure as in \cite{tzen2019theoretical} however key steps must be carried out again to show that the DDS drift and value function satisfy the required regularity properties to exploit the core results in \cite{tzen2019neural}.

\subsection{OU Semigroup}

In this section we introduce the OU semigroup whose logarithmic gradient can be directly connected to the score \citep{song2020denoising} in Equation~\ref{eq:forwarddiffusionP}. Based on this reformulation of the score we are able to extend the results from \cite{tzen2019theoretical} to denoising diffusions via VP-SDEs. In the remainder of this section we will introduce new results pertaining to regularity properties of this operator that will enable us to prove Corollary~\ref{col:est}.


\begin{definition}
We define the VP-SDE semigroup as,
\begin{align}
U^{\beta_t}_{t}f(y)= \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[f\left(e^{-\int_0^{t} \beta_s \mathrm{d} s }y+\sigma(1-e^{-2\int_0^{t} \beta_s \mathrm{d}s})^{1/2} Z\right)\right]
\end{align}
Then the OU-semigroup (typically defined with $\beta=1$) is a simpler instance of the above.
\begin{align}
  U^{\beta}_{t}f(y)  = \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[f\left(e^{- \beta t }y+\sigma(1-e^{-2 \beta t})^{1/2} Z\right)\right] 
\end{align}
\end{definition}

For the purpose of simplicity, as in \cite{tzen2019theoretical}, we will be working with the OU semigroup when $\beta=1$ (denoted $U_{t}$); however, these results can be extended to the more general case.

In the following remark we highlight the connection between the OU semi-group, the value function and the score in DDPM.

 \begin{remark}\label{rem:ou}
The time reversal of the VP-SDE (i.e. $b^{*}(y,t)= -\beta_{T-t} (y -2\sigma^2 \nabla \log \phi_{T-t}(y))$) can be expressed in terms of the OU semigroup via:
\begin{align}
    \nabla \log \phi_{T-t}(y) = \nabla_y \ln U_{T-t}^{\beta_{t}}f(y),
\end{align}
 when $f(x) = \frac{\pi}{\gN(0, \sigma^2 I)}(x)$. This in turn can be related to the score 
 \begin{align}
     \nabla \log p_{T-t}(y) &= -(\sigma^{-2}y-\nabla \log \phi_{T-t}(y)) \\
     &= -(\sigma^{-2}y-\nabla_y \ln U_{T-t}^{\beta_{t}}f(y)). 
 \end{align}
\end{remark}
 
From this stage on we consider the case where $\sigma = \beta = 1$. Notice how this formulation is reminiscent of the F\"ollmer drift \citep{follmer1984entropy,dai1991stochastic,tzen2019theoretical}.


\subsubsection{Regularity Properties }
In what remains of this section we will prove regularity regarding the OU semigroup which will allow us to extend the theoretical guarantees in \cite{tzen2019theoretical} to denoising diffusion models and samplers \citep{song2020denoising, ho2020denoising,vargas2023denoising}.

\begin{lemma}\label{lem:ou_commute}
The OU semigroup commutes with the gradient operator, that is, for $f:\sR^d \to \sR$ we have $\partial_{y_i } U_t f(y) = U_t \partial_{y_i} f(y)$.
\end{lemma}

\begin{corollary}
Under Assumption~\ref{a1}, the vector field $\nabla \log U_t f(x)$ is bounded in norm by $L/c$ and is Lipschitz with Lipschitz constant $L/c + L^2/c^2$, where $L$ is the maximum of the Lipschitz constants of $f$ and $\nabla f$.
\end{corollary}


\begin{lemma}\label{lem:expected_lip} ($\mathscr{L}^2$ Lipschitz condition)
    Let $\bar{g}_{t,x}(z) = g(e^{-t}x + (1-e^{-2t})^{1/2}z) - g(0)$ then it follows that:
    \begin{align*}
        || \bar{g}_{t,x}(z) - \bar{g}_{t',x'}(z)||_{\mathscr{L}^2(Q)} \leq L\left(1 + \sqrt{2}||z||_{\mathscr{L}^2(Q)} \right) \rho_{OU}((t,x), (t',x')) 
    \end{align*}
such that:
\begin{align}
    \rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |t - t'|^{1/2}
\end{align}


\end{lemma}

% \Fran{I think we can derive a better nicer metric for the Lipchitz condition}

% We need to establish a result of the form:

% \begin{conjecture}
% Given the metric space $\big( [0,T] \times B^d(R) , \rho_{OU}\big)$ where:
% \begin{align}
%     \rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |e^{-2t} - e^{-2t'}|^{1/2}
% \end{align}
% It follows that:
% \begin{align}
%      N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq  \left(\frac{8}{\epsilon}\right)^{N([0,T] \times B^d(R),  \rho_{OU}, \epsilon/8) }
% \end{align}
% where $\mathrm{ddim}(\gX) = \ln \lambda_{\gX}$ such that $\lambda_{\gX}$ is the smallest value such that every ball in $\gX$ can be covered by $\lambda_{\gX}$.
% \end{conjecture}
% \begin{proof} (Strategy)

% We believe this should roughly follow from Lemma 6 in \cite{gottlieb2017efficient}. It might require some small modification as they assume $0 \leq f \leq 1$ whereas we assume $f$ is bounded from bellow by $c \in (0,1]$. However looking at the proof its looking ok as in the first half of the lemma this is not needed.
    
% \end{proof}

% \Fran{Actually quite sure we can do better than the above corollary, quite sure this is true:}
\begin{conjecture}
Given the metric space $\big( [0,T] \times B^d(R) , \rho_{OU}\big)$ where:
\begin{align}
    \rho_{OU}((t,x), (t',x')) = || e^{-t}x - x'e^{-t'}||  + |t - t'|^{1/2}
\end{align}
and
\begin{align}
    ||(t,x)||_{OU} =  \rho_{OU}((t,x), (0, 0))= || e^{-t}x ||  + |t|^{1/2}
\end{align}
It follows that:
\begin{align}
     N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq  N([0,T] \times B^d(R),  ||\cdot||_{OU}, \epsilon) 
\end{align}

\end{conjecture}
\begin{proof}

Consider the $\epsilon$-cover $A_{\rho_{OU}}$ of $[0,T] \times B^d(R)$ with respect to $\rho_{OU}$. For any $(t,x) \in [0,T] \times B^d(R)$ there exists $(t',x') \in A_{\rho_{OU}}$  such that $\rho_{OU}((t,x), (t',x'))  \leq \epsilon$. Then by Lemma~\ref{lem:expected_lip} it follows that 
\begin{align}
 || \bar{g}_{t,x}(z) - \bar{g}_{t',x'}(z)||_{\mathscr{L}^2(Q)} &\leq L\left(1 + \sqrt{2}||z||_{\mathscr{L}^2(Q)} \right) \rho_{OU}((t,x), (t',x'))  \\ 
& \leq ||F ||_{\mathscr{L}^2(Q)} \rho_{OU}((t,x), (t',x'))  \\
& \leq ||F ||_{\mathscr{L}^2(Q)} \epsilon 
\end{align}
Hence the set:
\begin{align}
    \gG_{\rho_{OU}} = \{ \bar{g}_{t,x}:(t,x) \in A_{\rho_{OU}}\}
\end{align}
is an $||F ||_{\mathscr{L}^2(Q)} \epsilon$-cover of $\gG$ with respect to the $\mathscr{L}^2(Q)$ norm.
\end{proof}

\Fran{Metric space check:

\textbf{Basic Properties}:

Clearly $ \rho_{OU}((t,x), (t,x))  = || e^{-t}x -e^{-t}x|| =  0$ and also the metric is positive (when we take into account the term $|t-t'|^{1/2}$) and symmetric.

\textbf{Triangle}:

Using the norms property we have 
\begin{align}
    || e^{-t}x + x'e^{-t'}|| \leq  || e^{-t}x || + ||x'e^{-t'}||  
\end{align}

thus it follows

\begin{align}
 \rho_{OU}((t,x), (t',x'))  &=|| e^{-t}x -e^{-l}y  - x'e^{-t'} + e^{-l}y  || \\
 &\leq  || e^{-t}x -e^{-l}y || + || e^{-l}y -x'e^{-t'}   || \\
 &=  \rho_{OU}((t,x), (l,y))  +  \rho_{OU}((l,y), (t',x')) 
\end{align}
}




\Fran{

Actually !

Ok new idea (for simplicity let $T=1$):
\begin{align}
   4N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) &\leq 4 N(B^d(T), |\cdot|, \epsilon^2/12) N(B^d(R), ||\cdot ||, \epsilon/2) \\
   &\leq  N(B^{2d}(T), |\cdot|, \epsilon/12) N(B^{2d}(R), ||\cdot ||, \epsilon/2) \\
   &\leq \left(\frac{2 }{\epsilon} \right)^{2d}\left(\frac{ 6 \cdot R}{\epsilon} \right)^{2d}   =\left(\frac{ 2 \sqrt{3 R} }{\epsilon} \right)^{4d}
\end{align}


}


\Fran{

Let's have a quick stab at getting the volumetric bound in \citep{tzen2019theoretical} that we are a bit uncertain about.

It's not hard to see that if we have a $d$-dimensional ball of radius $R$, $B^d(R)$, we can cover it with
\begin{align}
  N(B^d(R), ||\cdot ||, \epsilon/2) \leq  \left( \frac{ 2 \cdot 3 \cdot R}{\epsilon} \right)^d 
\end{align}
see \href{https://mathoverflow.net/questions/371363/trade-off-between-covering-number-ball-radius-and-diameter-of-d-dimensional-s}{this example} for a reference. However they are interested in bounding this times the cover number of $N([0,T], |\cdot|, \epsilon^2/4)$ which can be bounded by $ 2 T \epsilon^{-2}$  so (in their case $T=1$):

Ok new idea (for simplicity let $T=1$):
\begin{align}
   N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) &\leq  N(B^d(T), |\cdot|, \epsilon^2/12) N(B^d(R), ||\cdot ||, \epsilon/2) \\
   &\leq \left(\frac{4 }{\epsilon^2} \right)^d\left(\frac{ 6 \cdot R}{\epsilon} \right)^d  \\
   &\leq \left(\frac{2}{\epsilon} \right)^{2d}\left(\frac{ 2\cdot 3 \cdot R}{\epsilon} \right)^d  \\ 
   &\leq \left(\frac{2}{\epsilon} \right)^{2d}\left(\frac{ 2\cdot 3 \cdot R}{\epsilon} \right)^d \\ 
   &\leq \left(\frac{2}{\epsilon} \right)^{3d } (\sqrt{3R})^{2d} \\
    &\leq \left(\frac{2}{\epsilon} \right)^{3d} (\sqrt{3R})^{3d} \\
     &\leq \left(\frac{2 \sqrt{3R}}{\epsilon} \right)^{3d} 
\end{align}
note we consider the case  $N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2)\geq1$ as in \cite{tzen2019theoretical} otherwise its log would be negative and $\sqrt{\cdot}$ not real. So for the log we get as desired:
\begin{align}
  \log   2 N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2) &\leq 3d \log  \left(\frac{2 \sqrt{3R}}{\epsilon} \right)_{+} + \ln 2
\end{align}
For the square root we can use something like this $\sqrt{\log x} \leq \log 2 x$.  So:
\begin{align}
  \sqrt{\log   2 N([0,T], |\cdot|, \epsilon^2/4) N(B^d(R), ||\cdot ||, \epsilon/2)} &\leq 3d \log  \left(\frac{2 \sqrt{3R}}{\epsilon} \right)_{+} + \ln 4\\
  &\leq 3d \log  \left(\frac{2^2 \sqrt{3R}}{\epsilon} \right)_{+} 
\end{align}

}

\begin{conjecture}
We have that
\begin{align}
     N([0,T] \times B^d(R),  ||\cdot||_{OU}, \epsilon)  \leq \left( \frac{e^{-\epsilon/2} 2 \sqrt{3R}}{\epsilon}\right)^{4d}
\end{align}
\end{conjecture}
\begin{proof}

Let $B^d_{r_0}(R)$ denote a Euclidean $d$-dimensional ball of radius $R$ centered at $r_0$ and let $B^{d+1}_{t_0 \oplus x_0, \rho}(R')$ denote its counterpart with respect to the metric $\rho$.  Now notice that if $|| e^{-t} x- e^{-t_0}x_0||  + | t-t_0|^{1/2} \leq \epsilon$ then $|| e^{-t_0}  (x-x_0)||  \leq \epsilon$ and  $|| x-x_0||  \leq  e^{t_0} \epsilon,$ thus,
\begin{align}
   \{t_0\}  \times B^d_{x_0}(\epsilon) \subseteq \{t_0\}  \times  B^d_{x_0}(e^{t_0} \epsilon) \subseteq  B^{d+1}_{t_0 \oplus x_0, \rho}(\epsilon),
\end{align}
then since $\{t_0\}  \times B^d_{x_0}(e^t \epsilon) \subseteq  B^{d+1}_{t \oplus x_0, \rho}(\epsilon)$ we can construct an $\epsilon$ cover namely $A_{t_0}$ of $\{t_0\} \times B^d(R)$ with $\left(  2 \sqrt{d}R{\epsilon }^{-1} e^{-t_0} \right)^{d}$ balls. Finally notice that if $|| e^{-t}  x- e^{-t_0}x_0||  + | t-t_0|^{1/2} \leq \epsilon$ it follows that $|t-t_0|^{1/2} \leq \epsilon$ thus $[0,T]$ can be covered in $2^{-1}T \epsilon^{-2}$\Anna{do we need 2 here}  \Fran{so 2  is for the diameter whilst $\epsilon^2$ is the radius, the grid itself is made up $T$ diameters of size $2\epsilon^2$}
\Fran{Feel free to add any refined covers, I typed up this sketch in a haste to show we can cover with a set $A$ such that $|A| \leq |U_T| |A_0|$} picking the cover $U_T$ such that its elements $u_n$ are centered at $(n + 1) \epsilon^2/2$ , then: 
\begin{align}
   A =  \bigcup_{u_n \in U_T} A_{(n + 1) \epsilon^2/2} 
\end{align} \Teo{We can choose a better cover}
is an $\epsilon$ cover of $[0, T] \times B^d(R)$ (with respect to the metric $\rho_{OU}$), notice this follows as $\forall x \in B^d(R)$ there exists an $x_0$ such that 
\begin{align}
[(n + 1) \epsilon^2/2 -\epsilon^2, (n + 1) \epsilon^2/2 + \epsilon^2] \times \{x\} \subseteq B^{d+1}_{(n + 1) \epsilon^2/2 \oplus x_0, \rho}(\epsilon) \in A_{(n + 1) \epsilon^2/2}
\end{align}

Now we can see that $|A| \leq  |U_T| |A_0|$ ( $|A_0 |=\max_{n}|A_{(n + 1) \epsilon^2/2} |$) completing our proof. \\


\Fran{
Note we could obtain a tighter bound, but this is not necessary for the proof (something like this; we need to be more careful with the last step)
\begin{align}
    |A| \leq \sum_n  | A_{(n + 1) \epsilon^2/2}| &\leq C(d, \epsilon)\sum_n {e^{-4d(n + 1) \epsilon^2/2} } \\
    &=C(d, \epsilon)\frac{1-e^{-4dT}}{1-e^{-4d \epsilon^2/2}}
\end{align}
}
\end{proof}

% \Fran{We will then Need a bound on the Pollard entropy and its integral wrt to $\epsilon$}


% This result is wrong as $xe^{-t}$ is not Lipchitz. We might still be able to adapt the covering number result.


\begin{lemma} \label{lem:envelope} 
    Let $g :  \sR^d \to \sR$ be $L$-Lipschitz with respect to the Euclidean norm. Then, with $F(z) := L((R \vee 1) + \sqrt{2}||z||)$, the following bound holds for all $x \in B^d(R)$, $t \in [0,T]$ and $z \in \sR^d$:
\begin{align}
    \Big|g\left(e^{- t}x+(1-e^{-2 t})^{1/2} z\right) - g(0)\Big| \leq F(z)
\end{align}
\end{lemma}


\bibliography{jmlr-sample}

\appendix

\section{Assumptions}\label{apd:assump}

\begin{assumption}\label{a1}
The function $f$ is differentiable, both $f$ and $\nabla f$ are $L$-Lipschitz, and there exists a constant $c \in (0,1]$ such that $f \geq c$ everywhere. 
\end{assumption}

\begin{assumption}\label{a2}
The activation function $\sigma : \sR \to \sR$ is differentiable and there exists $c_\sigma > 0$, depending only on $\sigma$, such that the following holds: for any $L$-Lipschitz function $h : \sR \to \sR$ which
is constant outside the interval $[-R, R]$ and for any $\delta > 0$, there exist real numbers $a$, $\{\alpha_i, \beta_i, \gamma_i\}^m_{i=1}$ where $m \leq c_{\sigma}\frac{RL}{\delta}$, such that the function  $h^*(x) = a + \sum_{i=1}^m \alpha_i \sigma(\beta_i x +\gamma_i)$ satisfies $\sup_{x\in \sR}|h^*(x)-h(x)|\leq \delta$.
\end{assumption}

\begin{assumption}\label{a3}
 uniform approximability to both $h$ and $\nabla h$. TODO: Write full thing.
 %For any $R>0$ and $\epsilon >0$, there exist a neural net $\hat{f}\in\mathcal{N}^\sigma _{l,s}$ with $l,s<\text{poly}(1/\epsilon,d,L,R)$, such that
 %\begin{align}
     %\sup_{x\in B^d(R)}|f(x)-\hat{f}(x)|\leq \epsilon \quad and \quad \sup_{x\in B^d(R)}||\nabla f(x)-\nabla \hat{f}(x)||\leq \epsilon.
 %\end{align}
\end{assumption}


\section{OU Semigroup Results}


\begin{repremark}{rem:ou}
The time reversal of the VP-SDE (i.e. $b^{*}(y,t)= -\beta_{T-t} (y -2\sigma^2 \nabla \log \phi_{T-t}(y))$) can be expressed in terms of the OU semigroup via:
\begin{align}
    \nabla \log \phi_{T-t}(y) = \nabla_y \ln U_{T-t}^{\beta_{t}}f(y),
\end{align}
 when $f(x) = \frac{\pi}{\gN(0, \sigma^2 I)}(x)$. This in turn can be related to the score 
 \begin{align}
     \nabla \log p_{T-t}(y) &= -(\sigma^{-2}y-\nabla \log \phi_{T-t}(y)) \\
     &= -(\sigma^{-2}y-\nabla_y \ln U_{T-t}^{\beta_{t}}f(y)). 
 \end{align}
\end{repremark}
\begin{proof}
Consider the OU semigroup evaluated on the appropriate Radon--Nikodym derivative (RND):
\begin{align*}
     U_{t}^{\beta_{t}}f(y) &= \mathbb{E}_{Z \sim \mathcal{N}(0, I)}\left[\frac{\pi}{\gN(0, \sigma^2 I)}\left(e^{- \beta t }y+\sigma(1-e^{-2 \beta t})^{1/2} Z\right)\right] \\
     &= \mathbb{E}_{X_T \sim p^{\mathrm{ref}}_{T|t}(\cdot|x)}\left[\frac{\pi}{\gN(0, \sigma^2 I)}\left(X_T\right)\right] \\ 
     &= \int p^{\mathrm{ref}}_{T|t}(x_T |x) \frac{\pi}{\gN(0, \sigma^2 I)}\left(x_T\right) \mathrm{d}x_T\\
     &= \int \frac{p^{\mathrm{ref}}_{t|T}(x |x_T)  p^{\mathrm{ref}}_T(x_T)}{p^{\mathrm{ref}}_t(x)}\frac{\pi}{\gN(0, \sigma^2 I)}\left(x_T\right) \mathrm{d}x_T \\
     &= \int \frac{p^{\mathrm{ref}}_{t|T}(x |x_T) }{p^{\mathrm{ref}}_t(x)}{\pi}\left(x_T\right) \mathrm{d}x_T = \frac{p_t(x) }{p^{\mathrm{ref}}_t(x)}
\end{align*}
and thus it follows that
\begin{align}
     \nabla \log p_{T-t}(y) = -(\sigma^{-2}y-\nabla_y \ln U_{T-t}^{\beta_{t}}f(y)),
\end{align}
relating the score and the OU semi-group as required.
\end{proof}

\begin{replemma}{lem:ou_commute}
The OU semigroup commutes with the gradient operator; that is, for $f: \sR^d \to \sR$ we have $\partial_{y_i } U_t f(y) = U_t \partial_{y_i} f(y)$.
\end{replemma}

\begin{proof}
    It suffices to show that 
\begin{align}
    d(x,z) = \delta^{-1}(f(e^{- t}x+(1-e^{-2 t})^{1/2} z) - f(e^{- t}(x+\delta \ve_i)+(1-e^{-2 t})^{1/2} z)),
\end{align}
is dominated by an integrable function, where $[\ve_i]_j =\delta_{ij}$. As $f$ is Lipschitz by assumption it follows that
\begin{align}
        |d(x,z)| \leq L|\delta^{-1} e^{- t} \delta | =L e^{- t} \leq L
\end{align}
As $L$ is integrable under $\gN(0, I)$ we have shown $d(x,z)$ is dominated for all $\delta$ and thus the partial derivative operator and the OU semigroup commute.
\end{proof}


\begin{replemma}{lem:envelope}
    Let $g :  \sR^d \to \sR$ be $L$-Lipschitz with respect to the Euclidean norm. Let $Z_1, Z_2, \ldots, Z_N$ be i.i.d.\ copies of a $d$-dimensional random vector $Z$, such that $U := ||Z||$ has finite $\psi_2$ norm. Then for $F(z) := L((R \vee 1) + \sqrt{2}||z||)$ and all $x \in B^d(R)$, $t \in [0,T]$, $z \in \sR^d$ we have

\begin{align}
    \Big|g\left(e^{- t}x+(1-e^{-2 t})^{1/2} z\right) - g(0)\Big| \leq F(z)
\end{align}
\end{replemma}

\begin{proof}
    Since $||\cdot|| \leq ||\cdot||_{\psi_2}$, $F \in L^2(P)$. By Lipschitz continuity, for all $z \in \sR^d$, $x\in B^d(R)$, $t\in [0,T]$ we have:

\begin{align}
    |g\left(e^{- t}x+(1-e^{-2 t})^{1/2} z\right) - g(0)| &\leq L || e^{- t}x+(1-e^{-2 t})^{1/2} z|| \\
    &\leq L ( e^{- t}||x||+(1-e^{-2 t})^{1/2} ||z||)
\end{align}  

Since both $e^{- t}$ and $(1-e^{-2 t})^{1/2}$ are at most $1$ for $t \geq 0$, and $||x|| \leq R$, we have:
\begin{align}
L ( e^{- t}||x||+(1-e^{-2 t})^{1/2} ||z||) &\leq L(R + ||z||) \\
&\leq L((R\vee 1) + ||z||) \leq F(z)
\end{align}  
\end{proof}

The choice of $F(z) := L((R \vee 1) + \sqrt{2}||z||)$ with these specific constants arises from the following result.

\begin{replemma}{lem:expected_lip}($\mathscr{L}^2$ Lipschitz condition)
    Let $\bar{g}_{t,x}(z) = g(e^{-t}x + (1-e^{-2t})^{1/2}z) - g(0)$ then it follows that:
    \begin{align*}
        || \bar{g}_{t,x}(z) - \bar{g}_{t',x'}(z)||_{\mathscr{L}^2(Q)} \leq L\left(1 + \sqrt{2}||z||_{\mathscr{L}^2(Q)} \right) \rho_{OU}((t,x), (t',x')) 
    \end{align*}
such that:
\begin{align}
    \rho_{OU}((t,x), (t',x')) = || e^{-t}x - e^{-t'}x'||  + |t - t'|^{1/2}
\end{align}

\end{replemma}

\begin{proof}
\begin{align*}
 || \bar{g}_{t,x}(z) - \bar{g}_{t',x'}(z)||_{\mathscr{L}^2(Q)} &\leq L || || e^{-t}x + (1-e^{-2t})^{1/2}z - e^{-t'}x' - (1-e^{-2t'})^{1/2}z || ||_{\mathscr{L}^2(Q)}  \\ 
 &\leq L \Big|\Big| || e^{-t}x -e^{-t'}x'||  + |(1-e^{-2t})^{1/2} - (1-e^{-2t'})^{1/2}|\cdot ||z || \Big|\Big|_{\mathscr{L}^2(Q)} \\ 
  &\leq L \Bigg(|| e^{-t}x -e^{-t'}x'||  + |(1-e^{-2t})^{1/2} - (1-e^{-2t'})^{1/2}|\cdot ||z ||_{\mathscr{L}^2(Q)} \Bigg)\\
  &\leq L \Bigg(|| e^{-t}x -e^{-t'}x'||  + |e^{-2t} - e^{-2t'}|^{1/2}\cdot ||z ||_{\mathscr{L}^2(Q)} \Bigg) \\ 
&\leq L \Bigg(|| e^{-t}x -e^{-t'}x'||   + \sqrt{2}|t - t'|^{1/2}\cdot ||z ||_{\mathscr{L}^2(Q)} \Bigg)
 \end{align*}

 where in the penultimate line we use $|\sqrt{a} - \sqrt{b}| \leq \sqrt{|a-b|}$ for $a, b \geq 0$, and in the last line we use that $\sup_{t\in [0,T]}|(e^{-2t})'| = 2$ and thus $e^{-2t}$ is 2-Lipschitz.
\end{proof}


% This is wrong.  The bound on $|| e^{-t}x -e^{-t'}x'|| \leq || x -x'|| $  is incorrect. In fact you can actually show its not Lipchitz taking a first derivative.

% We could instead define the metric:
% \begin{align}
%     d_{OU}((t,x), (t',x')) = || e^{t}x - x'e^{t'}||  + |e^{-2t} - e^{-2t'}|^{1/2}
% \end{align}
% We can then see that in this metric we have Lipchitz continuity with the proposed constant.

% Then for the covering numbers we could potentially make the bound   :
% \begin{align} \label{eq:ballcover}
%     N(\gG,  \mathscr{L}^2(Q), \epsilon ||F ||_{\mathscr{L}^2(Q)}) \leq N(B^d(R) \times [0,T], || \cdot||_{OU}, k(\epsilon))
% \end{align}

% Im not sure what the function $k(\epsilon)$ might be it could be something like $k(\epsilon) = \epsilon \ln \epsilon/L_1 + \epsilon^2/L_2$. Even if this was the case I wouldnt know how to factor $N(B^d(R) \times [0,T], || \cdot||_{OU}, k(\epsilon))$ or use results to bind its volume, but doesn't seem too hard as we are covering setshere rather than functions, with a slight wonky norm, this could all work out assuming Eq \ref{eq:ballcover} is right and we can find some reference for it.



\end{document}
