\documentclass[accepted]{uai2022}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage[tbtags]{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{xr}

\usepackage[utf8]{inputenc}   % lets LaTeX read accented (UTF-8) input directly
\usepackage[T1]{fontenc}      % font encoding that contains the accented French characters
\usepackage{comment}

\usepackage{natbib}

\usepackage[tbtags]{amsmath}
\usepackage{amsthm}
\allowdisplaybreaks
\usepackage{amssymb,mathrsfs}
\usepackage{amsfonts}
\usepackage{upgreek}
\usepackage{xspace}

\usepackage{graphicx}
\usepackage{subfig}
\usepackage{color}
\usepackage{algorithm, algorithmic}
\begin{comment}

\algnewcommand{\Inputs}[1]{%
  \State \textbf{Inputs:}
  \Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
\algnewcommand{\Initialize}[1]{%
  \State \textbf{Initialize:}
  \Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
\algnewcommand{\Outputs}[1]{%
  \State \textbf{Outputs:}
  \Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
\end{comment}

\usepackage{stmaryrd}
\usepackage[inline]{enumitem}
\usepackage{url}
\def\UrlBreaks{\do\/\do-}
\usepackage{tikz}
\usetikzlibrary{calc,arrows.meta}
% Numeric layout constants (presumably TikZ coordinates -- confirm at use sites).
% Vertical offsets.
\newcommand{\yBlock}{1}
\newcommand{\yNode}{0.75}

% Negative horizontal offsets, by increasing magnitude.
\newcommand{\xNodemoinstiny}{-1}
\newcommand{\xNodemoins}{-1.5}
\newcommand{\xNodemoinsint}{-2.}
\newcommand{\xNodeMoins}{-3}
\newcommand{\xNodeMOINS}{-4.5}

% Positive horizontal offsets, mirroring the negative ones.
\newcommand{\xNodeplustiny}{1}
\newcommand{\xNodeplus}{1.5}
\newcommand{\xNodeplusint}{2}
\newcommand{\xNodePlus}{3}
\newcommand{\xNodePLUS}{4.5}

\usepackage{pgfplots}
\usepackage{xcolor}
\usepackage{bbm}
\usepackage{ifthen}
\usepackage{xargs}

\usepackage{aliascnt}
\usepackage{cleveref}
\usepackage{autonum}
\makeatletter
% Theorem-like environments and the names cleveref prints for them.
% cleveref looks names up by reference type, i.e. the lower-case
% environment/counter name, so \crefname and \Crefname are both keyed on that
% name; \crefname gives the mid-sentence form, \Crefname the capitalised one.
\newtheorem{theorem}{Theorem}
\crefname{theorem}{theorem}{theorems}
\Crefname{theorem}{Theorem}{Theorems}


% Unnumbered lemma.
\newtheorem*{lemma_nonumber*}{Lemma}


% The environments below are numbered on the theorem counter through an alias
% counter (aliascnt), which keeps a distinct counter name per environment.
\newaliascnt{lemma}{theorem}
\newtheorem{lemma}[lemma]{Lemma}
\aliascntresetthe{lemma}
\crefname{lemma}{lemma}{lemmas}
\Crefname{lemma}{Lemma}{Lemmas}



\newaliascnt{corollary}{theorem}
\newtheorem{corollary}[corollary]{Corollary}
\aliascntresetthe{corollary}
\crefname{corollary}{corollary}{corollaries}
\Crefname{corollary}{Corollary}{Corollaries}

\newaliascnt{proposition}{theorem}
\newtheorem{proposition}[proposition]{Proposition}
\aliascntresetthe{proposition}
\crefname{proposition}{proposition}{propositions}
\Crefname{proposition}{Proposition}{Propositions}

\newaliascnt{definition}{theorem}
\newtheorem{definition}[definition]{Definition}
\aliascntresetthe{definition}
\crefname{definition}{definition}{definitions}
\Crefname{definition}{Definition}{Definitions}

\newaliascnt{remark}{theorem}
\newtheorem{remark}[remark]{Remark}
\aliascntresetthe{remark}
\crefname{remark}{remark}{remarks}
\Crefname{remark}{Remark}{Remarks}


% Examples share the theorem counter directly (no alias counter).
\newtheorem{example}[theorem]{Example}
\crefname{example}{example}{examples}
\Crefname{example}{Example}{Examples}

% Techniques have their own independent counter.
\newtheorem{technique}{Technique}
\crefname{technique}{technique}{techniques}
\Crefname{technique}{Technique}{Techniques}


\crefname{figure}{figure}{figures}
\Crefname{figure}{Figure}{Figures}


% Assumption environments whose heading is a bold letter followed by the
% number; the \hspace{-3pt} pulls the number towards the letter.
% \crefformat prints references as the bold letter directly followed by the
% label (#1), with #2/#3 being cleveref's hyperlink start/end markers.
\newtheorem{assumption}{\textbf{A}\hspace{-3pt}}
\crefformat{assumption}{{\textbf{A}}#2#1#3}

\newtheorem{assumptionF}{\textbf{F}\hspace{-3pt}}
\crefformat{assumptionF}{{\textbf{F}}#2#1#3}

% assumptionbis{<label>}: an assumptionF whose printed number is the number of
% <label> followed by a bold "b". The assumptionF counter is decremented first
% so that the increment done by the inner environment leaves it unchanged.
% NOTE(review): \ref* is the starred form provided by hyperref, which is not
% loaded in the visible part of this preamble -- presumably the class loads it.
\newenvironment{assumptionbis}[1]
  {\renewcommand{\theassumptionF}{\ref*{#1}$\mathbf{b}$}%
   \addtocounter{assumptionF}{-1}%
   \begin{assumptionF}}
  {\end{assumptionF}}



% Further assumption families, one independent counter per letter.
% The heading is the bold letter with the number pulled in by \hspace{-3pt};
% \crefname / \Crefname give the name cleveref prints before the number.
\newtheorem{assumptionB}{\textbf{B}\hspace{-3pt}}
\Crefname{assumptionB}{\textbf{B}\hspace{-3pt}}{\textbf{B}\hspace{-3pt}}
\crefname{assumptionB}{\textbf{B}}{\textbf{B}}

\newtheorem{assumptionC}{\textbf{C}\hspace{-3pt}}
\Crefname{assumptionC}{\textbf{C}\hspace{-3pt}}{\textbf{C}\hspace{-3pt}}
\crefname{assumptionC}{\textbf{C}}{\textbf{C}}


\newtheorem{assumptionH}{\textbf{H}\hspace{-3pt}}
\Crefname{assumptionH}{\textbf{H}\hspace{-3pt}}{\textbf{H}\hspace{-3pt}}
\crefname{assumptionH}{\textbf{H}}{\textbf{H}}

\newtheorem{assumptionT}{\textbf{T}\hspace{-3pt}}
\Crefname{assumptionT}{\textbf{T}\hspace{-3pt}}{\textbf{T}\hspace{-3pt}}
\crefname{assumptionT}{\textbf{T}}{\textbf{T}}

% The reference names must be keyed on assumptionD itself; keying them on
% assumptionT leaves \cref to a D-assumption without a name.
\newtheorem{assumptionD}{\textbf{D}\hspace{-3pt}}
\Crefname{assumptionD}{\textbf{D}\hspace{-3pt}}{\textbf{D}\hspace{-3pt}}
\crefname{assumptionD}{\textbf{D}}{\textbf{D}}


\newtheorem{assumptionL}{\textbf{L}\hspace{-3pt}}
\Crefname{assumptionL}{\textbf{L}\hspace{-3pt}}{\textbf{L}\hspace{-3pt}}
\crefname{assumptionL}{\textbf{L}}{\textbf{L}}

\newtheorem{assumptionQ}{\textbf{Q}\hspace{-3pt}}
\Crefname{assumptionQ}{\textbf{Q}\hspace{-3pt}}{\textbf{Q}\hspace{-3pt}}
\crefname{assumptionQ}{\textbf{Q}}{\textbf{Q}}


\newtheorem{assumptionAR}{\textbf{AR}\hspace{-3pt}}
\Crefname{assumptionAR}{\textbf{AR}\hspace{-3pt}}{\textbf{AR}\hspace{-3pt}}
\crefname{assumptionAR}{\textbf{AR}}{\textbf{AR}}



% Bare numeric constants (presumably sizes and offsets for diagrams -- confirm
% at the use sites; nothing in this preamble consumes them).
\newcommand{\diaW}{11}
\newcommand{\diaH}{5}
\newcommand{\diaJump}{2.75}
\newcommand{\nextRow}{1.25}
\newcommand{\imW}{0.08}
\newcommand{\imWB}{0.1}
\newcommand{\imOp}{0.6}
\newcommand{\bend}{5}

% Offsets, heights (\h, \hsmall) and a family of widths (\ww ... \wwwww).
\newcommand{\offset}{2}
\newcommand{\offsety}{2.3}
\newcommand{\h}{2.25}
\newcommand{\hsmall}{1.75}
\newcommand{\ww}{3.25}
\newcommand{\www}{1.8}
\newcommand{\wwww}{3.5}
\newcommand{\wwwww}{4.8}
\newcommand{\offsetsmall}{1.5}

\usepackage{bm}
\usepackage{wrapfig}


% In-line determinant: \det followed by #1 in fixed-size parentheses.
\newcommand{\detLigne}[1]{\det(#1)}
% Hatted \ell with superscript f / b.
\def\hlf{\hat{\ell}^f}
\def\hlb{\hat{\ell}^b}
% Upright H (the macro name suggests an entropy symbol -- confirm at use sites).
\def\Ent{\mathrm{H}}
% V with subscripts (p,t,x_t) and (p).
\def\lyap{V_{p,t,x_t}}
\def\lyapp{V_{p}}

\def\contspace{\mathcal{C}}


% p / q with upright text subscripts (obs, join, jref, pos, data, ref).
\def\bpobs{\bar{p}_{\textup{obs}}}
\def\pobs{p_{\textup{obs}}}
\def\pjoin{p_{\textup{join}}}
\def\pjref{p_{\textup{jref}}}
\def\ppos{p_{\textup{pos}}}
\def\pdata{p_{\textup{data}}}
\def\qdata{q_{\textup{data}}}
\def\pref{p_{\textup{ref}}}

\def\yobs{y^{\textup{obs}}}


% Upright f / b markers and the symbols \ell, s carrying them as superscripts.
% NOTE(review): \sf (old font switch in the standard classes) and \sb (kernel
% subscript command) are names LaTeX commonly already defines; \def overwrites
% them silently -- confirm that no class or package code relies on them.
\def\for{\mathrm{f}}
\def\back{\mathrm{b}}
\def\lf{\ell^{\mathrm{f}}}
\def\lb{\ell^{\mathrm{b}}}
\def\sf{s^{\mathrm{f}}}
\def\sb{s^{\mathrm{b}}}

\def\Tcal{\mathcal{T}}
\def\bfpi{\bm{\pi}}
\def\bfnu{\bm{\nu}}

\def\Pens{\mathscr{P}}
\def\Mens{\mathscr{M}}
% Symbols decorated with an over-arrow: right arrow for the "f" variants, left
% arrow for the "b" variants.
\def\pif{\overrightarrow{\pi}}
\def\lambdabff{\overrightarrow{\bm{\lambda}}}
\def\lambdabfb{\overleftarrow{\bm{\lambda}}}
% Footnote containing a black mailto: hyperlink to the address #1.
% NOTE(review): \href comes from hyperref, which is not loaded in the visible
% part of this preamble -- presumably the class provides it.
\newcommand{\mail}[1]{\footnote{Email: \href{mailto:#1}{\textcolor{black}{#1}}}}
\def\Phif{\overrightarrow{\Phi}}
\def\Phib{\overleftarrow{\Phi}}
\def\scoref{\overrightarrow{\mathrm{S}}}
\def\scoreb{\overleftarrow{\mathrm{S}}}
\def\netf{\overrightarrow{\mathrm{NN}}}
\def\netb{\overleftarrow{\mathrm{NN}}}
% Named constants. Most are thin aliases built on the typewriter letters
% \tta..\tte; macros are expanded at use time, so bodies may refer to names
% defined further down (\ttd, \gua, \vareps, \Lip, \Phibf, \Psibf, ...).
\newcommand{\schro}{Schr\"{o}dinger\xspace}
\newcommand{\Cweakapp}{\ttd}
\def\ttfp{\Cweakapp_{p}}
\def\ttfpun{\Cweakapp_{p,1}}
\def\ttfpdeux{\Cweakapp_{p,2}}
\def\ttfptrois{\Cweakapp_{p,3}}
\def\ttfpquatre{\Cweakapp_{p,4}}
\def\ttamin{\mathtt{a}}
\def\ttfun{\Cweakapp_4}
\def\ttfdeux{\Cweakapp_5}
\def\btta{\bar{\mathtt{A}}}
\def\bfb{\mathbf{b}}
\def\bfsigma{\pmb{\sigma}}
\def\KuLo{Kurdyka-\L ojasiewicz}
% Base typewriter letters.
\newcommand{\tta}{\mathtt{A}}
\newcommand{\ttb}{\mathtt{B}}
\newcommand{\ttc}{\mathtt{C}}
\newcommand{\ttd}{\mathtt{D}}
\def\tte{\mathtt{E}}
\newcommand{\ttM}{\mathtt{M}}
\def\boundLSig{\Lip\eta}

% Semantic names for the constants; note that \Ctech and \Cconv both expand
% to \ttc, and \Cweak is an italic C.
\newcommand{\Capprox}{\tta}
\newcommand{\Ctech}{\ttc}
\newcommand{\Cstrong}{\ttb}
\newcommand{\Cconv}{\ttc}
\newcommand{\Cweak}{C}

\def\conj{\varkappa}
\def\mtta{\mathtt{a}}
\def\explog{\vareps}
% Red text for notes to self.
\newcommand{\note}[1]{\textcolor{red}{#1}}
\def\Cbeta{\Cweak_{\beta, \explog}}
\def\Aar{\Capprox_{\alpha, r}}
\def\xo{x_0}
\def\Db{\Ctech}
% Integral sign with bounds k\gua and (k+1)\gua.
\def\intk{\int_{k\gua}^{(k+1)\gua}}
% \ctun[X]: \Capprox with subscript "X,1"; X defaults to T.
\newcommandx\ctun[1][1=T]{\Capprox_{#1,1}}
\def\btun{\mathtt{B}_1}
\def\btdeux{\mathtt{B}_2}
\def\dtun{\mathtt{D}_1}
\def\cttun{\tilde{\Capprox}_{T,1}}
\def\dtdeux{\mathtt{D}_2}
\def\ctdeux{\Capprox_{T,2}}
\def\cttrois{\Capprox_{T,3}}
\def\ctquatre{\Capprox_{T,4}}
\def\ctcinq{\Capprox_{T,5}}
\def\ctsix{\Capprox_{T,6}}
\def\ctsept{\Capprox_{T,7}}
\def\cthuit{\Capprox_{T,8}}
\def\ctneuf{\Capprox_{T,9}}
\def\gfun{\mathbb{G}}
\def\hash{\sharp}
% \Cconv with subscript "i,\alpha" and superscript (c) or (d).
\def\Cconvcontun{\Cconv_{1,\alpha}^{(c)}}
\def\Cconvcontdeux{\Cconv_{2,\alpha}^{(c)}}
\def\Cconvconttrois{\Cconv_{3,\alpha}^{(c)}}
\def\Cconvdiscun{\Cconv_{1,\alpha}^{(d)}}
\def\Cconvdiscdeux{\Cconv_{2,\alpha}^{(d)}}
\def\Cconvdisctrois{\Cconv_{3,\alpha}^{(d)}}
\def\Cconvcont{\Phibf_{\alpha}^{(c)}}
\def\Cconvdisc{\Phibf_{\alpha}^{(d)}}
\def\Csham{\Cconv_1}
\def\Cshamd{\Cconv_2}
\def\Cshama{\Cconv_{\alpha}}
\def\Cshamamoins{\Cshama^-}
\def\Cshamaplus{\Cshama^+}
\def\Ccont{\Cconv^{(c)}}
\def\Cdisc{\Cconv^{(d)}}
\def\Cconvk{{\Cconv^{(a)}_k}}
%\def\Cconvdun{\Cconv^{(b)}_1}
%\def\Cconvddeux{\Cconv^{(b)}_2}
\def\Cconvdtrois{\Cconv^{(b)}}
% These two expand to explicit expressions (the symbolic versions are the
% commented-out lines above).
\def\Cconvdun{(\gamma\eta/2)}
\def\Cconvddeux{(\gamma/2)}
\def\Cshamdisc{\Cconv_{0}}
\def\Cshamt{\tilde{\Cconv}_{\alpha}}
\def\Psial{\Psibf_{\alpha}}
% \Cstrong with numeric subscripts 1..6 and a tilde variant.
\def\Cstrongcont{\Cstrong_1}
\def\Cstrongcontf{\Cstrong_2}
\def\Cstrongdisc{\Cstrong_3}
\def\Cstrongdiscf{\Cstrong_4}
\def\Cstrongloj{\Cstrong_5}
\def\Cstronglojdisc{\Cstrong_6}
\def\Cstrongtilde{\tilde{\Cstrong}}
\def\maxnorm{C}
% Postfix shorthands: each expands to a bare superscript, so it must follow
% the symbol it decorates (e.g. A\pinv gives A^{-1}).
\newcommand{\pinv}{^{-1}}
\newcommand{\st}{^{\star}}
\newcommand{\gb}{\gamma^{\beta}}
\newcommand{\tr}{^{\top}}
% Script letters.
\def\scrE{\mathscr{E}}
\def\scrV{\mathscr{V}}
\def\scrF{\mathscr{F}}
% \Cref wrapped in \tup.
% NOTE(review): \tup is not defined in the visible part of this file --
% confirm that it exists elsewhere (or that \textup was meant).
\newcommand{\rref}[1]{\tup{\Cref{#1}}}
% Angle brackets.
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}
% The name "Lojasiewicz" (with the Polish L-stroke) followed by a tie.
% Spelling matches \KuLo; there is no "c" after the "s".
\newcommand{\LL}{\L ojasiewicz~}
% Symbols built on \gamma_\alpha (\bgamma is defined further down in this file).
\newcommand{\gua}{\gamma_{\alpha}}
\newcommand{\bgua}{\bgamma_{\alpha}}
\newcommand{\gda}{\gua^{1/2}}
\newcommand{\tgua}{(t+\gua)^{\alpha}}
\newcommand{\guac}{c}
% "and" set between two \quad spaces, for use inside displays.
\newcommand{\et}{\quad\mbox{and}\quad}
%\newcommand{\sigb}{\ttM_{\Sigma}}
\newcommand{\sigb}{\eta}
\newcommand{\phe}{\varphi_{\varepsilon}}
\newcommand{\feps}{f_{\varepsilon}}
\newcommand{\nfeps}{\nabla f_{\varepsilon}}
% Integral sign with lower bound \bR^{\dim}.
\newcommand{\intd}{\int_{\bR^{\dim}}}
% \expec{X}{G}: E[X | G] with auto-sized brackets and bar.
\newcommandx{\expec}[2]{{\mathbb E}\left[#1 \middle \vert #2  \right]} %%%% conditional expectation
% Conditional expectation given \cF_k and \cF_n respectively.
\newcommand{\expek}[1]{\expec{#1}{\cF_k}}
\newcommand{\expen}[1]{\expec{#1}{\cF_n}}
% Postfix subscripts / superscripts.
\newcommand{\nn}{_{n+1}}
\newcommand{\kk}{_{k+1}}
\newcommand{\pal}{^{\alpha}}
\newcommand{\pmal}{^{-\alpha}}
\newcommand{\cH}{\mathcal{H}}

% Miscellaneous single symbols.
\def\En{\tilde{E}_n}
\def\varepsn{\tilde{\vareps}_n}
\def\pow{p}
\def\ntt{\mathtt{n}_0}
\def\tlambda{\tilde{\lambda}}
% NOTE: this replaces LaTeX's \dim operator by the letter d; \intd and \bRd in
% this file expand \dim.
\def\dim{d}
\newcommand{\tb}{\tilde{b}}
\newcommand{\Time}{T}
\newcommand{\mttun}{\mathtt{k}_1}
\newcommand{\mttdeux}{\mathtt{k}_2}
% NOTE(review): \mtt is not defined in the visible part of this file.
\newcommand{\mtttrois}{\mtt_3^+}
% \bvareps is given again with \def near \vareps further down (same rendering).
\newcommand{\bvareps}{\bar{\vareps}}
\newcommand{\transference}{\mathbf{T}}
% Essential supremum. Spaces are ignored in math mode, so \mathrm{ess sup}
% prints "esssup" with atom spacing; \operatorname* (amsmath) gives operator
% spacing, a thin space between the two words, and limits below in displays.
\newcommand{\esssup}{\operatorname*{ess\,sup}}
% Miscellaneous symbols (continued).
\newcommand{\ring}{\mathcal{C}_{\varrho}}
\newcommand{\measx}{\mathcal{X}}
\newcommand{\bkappa}{\bar{\kappa}}
% P( . ) with auto-sized parentheses.
\newcommand{\probaspace}[1]{\mathbb{P}\left( #1 \right)}
% d with subscript "TV, 2": as a bare symbol and applied to an argument.
\newcommand{\dTVdeux}{d_{\mathrm{TV}, 2}}
\newcommand{\dTVDeux}[1]{d_{\mathrm{TV}, 2}\left( #1 \right)}
% NOTE(review): \bgM carries subscript n while \bbgM carries M -- confirm
% that the mismatch is intended.
\newcommand{\bgM}{b_{\gamma, n}}
\newcommand{\bbgM}{\bar{b}_{\gamma, M}}
% Upright letters (\rme is given again with \def in the mathrm section).
\newcommand{\rme}{\mathrm{e}}
\newcommand{\rmF}{\mathrm{F}}
\newcommand{\rmE}{\mathrm{E}}
\newcommand{\Fdr}{\mathrm{f}}
\newcommand{\Gdr}{\mathrm{g}}
\newcommand{\alphastar}{\alpha_{\star}}
\newcommand{\LipVset}{\mathrm{Lip}_{V, \alpha}}
% Typewriter constants L, M, K and variants.
\newcommand{\Lip}{\mathtt{L}}
\newcommand{\Mtt}{\mathtt{M}}
\newcommand{\Ktt}{\mathtt{K}}
\newcommand{\tLip}{\tilde{\mathtt{L}}}
\newcommand{\tell}{\tilde{\ell}}
% NOTE(review): \mtt is not defined in the visible part of this file.
\newcommand{\Lipb}{\mtt_b}
% Ceiling of 1/\gamma and of 1/\bgamma (\ceil is defined further down).
\newcommand{\step}{\ceil{1/\gamma}}
\newcommand{\bstep}{\ceil{1/\bgamma}}
\def\bdisc{b}
\def\bfDd{\mathbf{D}_{\mathrm{d}}}
\def\bfDc{\mathbf{D}_{\mathrm{c}}}
\newcommand{\SDE}{SDE}

\newcommand{\bbeta}{\bar{\beta}}
\newcommand{\measfun}{\mathbb{F}}
% Bold Greek letters.
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\bdeta}{\boldsymbol{\eta}}
\newcommand{\bvarphi}{\boldsymbol{\varphi}}

%\newcommand{\tau}{\boldsymbol{\tau}}
%\newcommand{\x}{\boldsymbol{x}}
%\newcommand{\X}{\boldsymbol{X}}
%\newcommand{\y}{\boldsymbol{y}}
%%\newcommand{\u}{\boldsymbol{u}}
%\newcommand{\w}{\boldsymbol{w}}
%\newcommand{\z}{\boldsymbol{z}}
%\newcommand{\p}{\boldsymbol{p}}
%\newcommand{\s}{\mathcal{S}}
%\newcommand{\ind}{\boldsymbol{1}}
%\newcommand{\dx}{\boldsymbol{\delta}\boldsymbol{x}}
%\newcommand{\argmax}{\operatornamewithlimits{argmax}}
%\newcommand{\argmin}{\operatornamewithlimits{argmin}}
%\newcommand{\prox}{\operatorname{prox}}
% Bold single-letter vectors.
% NOTE: \y, \z and \w are redefined further down in this file as the plain
% letters y, z, w, and \xt as \tx; since \def overwrites silently, those later
% definitions are the ones in force in the document body.
% NOTE(review): \u is LaTeX's breve text accent; this \def replaces it.
\def\x{{ \boldsymbol x}}
\def\u{{ \boldsymbol u}}
\def\y{{\boldsymbol y}}
\def\z{{\boldsymbol z}}
\def\w{{\boldsymbol w}}

\def\xt{ \boldsymbol x^t}
% \norm[p]{x}: auto-sized double bars around x, raised to the power p when the
% optional argument is non-empty. \normLigne is the fixed-size variant.
\newcommandx{\norm}[2][1=]{\ifthenelse{\equal{#1}{}}{\left\Vert #2 \right\Vert}{\left\Vert #2 \right\Vert^{#1}}}
\newcommandx{\normLigne}[2][1=]{\ifthenelse{\equal{#1}{}}{\Vert #2 \Vert}{\Vert #2\Vert^{#1}}}


% Red text for in-line comments.
\newcommand\mycomment[1]{\textcolor{red}{#1}}

%\theoremstyle{definition}
%\newtheorem{defn}{Definition}[section]
%\newtheorem{assump}{A}[paragraph]
%\newtheorem{prop}{Proposition}[section]
%\newtheorem{theo}{Theorem}[section]
%\newtheorem{coro}{Corollary}[section]
%\newtheorem{lemma}{Lemma}[section]
%\newtheorem{exmp}{Example}[section]

% x with superscript star and subscript \theta.
\def\xstart{x^{\star}_{\theta}}

%%%%%%%%%%%%%%%
%% mathbf
% Bold upright letters, plus bar/tilde/hat/overline variants of X and Y.
% \bfZ appears twice with the same body; \tbfX, \bfXt and \bbfX all expand to
% \tilde{\mathbf{X}}.

\def\bfn{\mathbf{n}}
\def\bfw{\mathbf{w}}
\def\bfc{\mathbf{c}}
\def\bfY{\mathbf{Y}}
\def\bbfY{\bar{\mathbf{Y}}}
\def\bfX{\mathbf{X}}
\def\tbfX{\tilde{\mathbf{X}}}
\def\hbfX{\hat{\mathbf{X}}}
\def\tbfY{\tilde{\mathbf{Y}}}
\def\hbfY{\hat{\mathbf{Y}}}
\def\bfs{\mathbf{s}}
\def\bfZ{\mathbf{Z}}
\def\bfXt{\tilde{\mathbf{X}}}
\def\bfXd{\overline{\mathbf{X}}}
\def\bfYd{\overline{\mathbf{Y}}}
\def\bfZ{\mathbf{Z}}
\def\bbfX{\tilde{\mathbf{X}}}
\def\bfM{\mathbf{M}}
\def\bfB{\mathbf{B}}
\def\bfP{\mathbf{P}}
%%% mathsf
% Sans-serif letters: \ms<letter> gives the upper-case sans-serif letter,
% except \mso which is a lower-case o.
\def\msi{\mathsf{I}}
\def\msa{\mathsf{A}}
\def\msd{\mathsf{D}}
\def\msk{\mathsf{K}}
\def\mss{\mathsf{S}}
\def\msn{\mathsf{N}}
\def\msat{\tilde{\mathsf{A}}}
\def\msb{\mathsf{B}}
\def\msc{\mathsf{C}}
\def\mse{\mathsf{E}}
\def\msf{\mathsf{F}}
\def\mso{\mathsf{o}}
\def\msg{\mathsf{G}}
\def\msh{\mathsf{H}}
\def\msm{\mathsf{M}}
\def\msu{\mathsf{U}}
\def\msv{\mathsf{V}}
\def\msr{\mathsf{R}}
% Sans-serif F with subscript #1 and superscript #2.
\newcommand{\msff}[2]{\mathsf{F}_{#1}^{#2}}
\def\msp{\mathsf{P}}
\def\msq{\mathsf{Q}}
\def\msx{\mathsf{X}}
\def\msz{\mathsf{Z}}
\def\msy{\mathsf{Y}}



%% mathcal
% Calligraphic letters: \mc<letter>, with tilde (t) and bar (b) variants.
\def\mca{\mathcal{A}}
\def\mct{\mathcal{T}}
\def\mcat{\tilde{\mathcal{A}}}
\def\mcab{\bar{\mathcal{A}}}
\def\mcbb{\mathcal{B}}  %%% \mcb is already taken
% \mcb{X}: calligraphic B applied to X in fixed-size parentheses.
\newcommand{\mcb}[1]{\mathcal{B}(#1)}
\def\mcc{\mathcal{C}}
\def\mcz{\mathcal{Z}}
\def\mcy{\mathcal{Y}}
\def\mcx{\mathcal{X}}
\def\mce{\mathcal{E}}
\def\mcs{\mathcal{S}}
\def\mcf{\mathcal{F}}
\def\mcg{\mathcal{G}}
\def\mch{\mathcal{H}}
\def\mcm{\mathcal{M}}
\def\mcu{\mathcal{U}}
\def\mcv{\mathcal{V}}
\def\mcr{\mathcal{R}}
% Calligraphic F with subscript #1 and superscript #2.
\newcommand{\mcff}[2]{\mathcal{F}_{#1}^{#2}}
% \mcfb/\bmcf and \mcft/\tmcf are pairs of synonyms.
\def\mcfb{\bar{\mathcal{F}}}
\def\bmcf{\bar{\mathcal{F}}}
\def\mcft{\tilde{\mathcal{F}}}
\def\tmcf{\tilde{\mathcal{F}}}
\def\mcp{\mathcal{P}}
\def\mcq{\mathcal{Q}}

%% mathbb

\def\Qbb{\mathbb{Q}}
\def\Rbb{\mathbb{R}}
\def\Mbb{\mathbb{M}}
\def\Pbb{\mathbb{P}}
\def\Hbb{\mathbb{H}}
% Blackboard Q / P with a parenthesised superscript #1.
\newcommand{\Qit}[1]{\Qbb^{(#1)}}
\newcommand{\Pit}[1]{\Pbb^{(#1)}}

% Number sets, in lower-case and capitalised spellings; the trailing "s"
% variants add a star superscript.
\def\rset{\mathbb{R}}
\def\rsets{\mathbb{R}^*}
\def\cset{\mathbb{C}}
\def\zset{\mathbb{Z}}
\def\nset{\mathbb{N}}
\def\nsets{\mathbb{N}^{\star}}
\def\qset{\mathbb{Q}}
\def\Rset{\mathbb{R}}
\def\Cset{\mathbb{C}}
\def\Zset{\mathbb{Z}}
\def\Nset{\mathbb{N}}
\def\Tset{\mathbb{T}}

\def\bN{\mathbb{N}}
\def\bR{\mathbb{R}}
% R to the power \dim (\dim is defined in this file as the letter d).
\def\bRd{\mathbb{R}^{\dim}}
\def\cF{\mathcal{F}}


%%%% mathrm

% Upright (roman) letters. Several names are synonyms (\rmb/\mrb, \wrm/\rmw,
% \rmd/\mrd, \mre/\rme, \rmn/\mrn, \mrc/\rmc/\rmC, \mrcc/\rmcc, \rma/\mra).
% NOTE: \mrd and \mrc are defined again further down in this file (there \mrd
% becomes an upright capital D); the later \def is the one in force.
\def\rmP{\mathrm{P}}
\def\rmQ{\mathrm{Q}}
\def\rmR{\mathrm{R}}
\def\rmb{\mathrm{b}}
\def\mrb{\mathrm{b}}
\def\wrm{\mathrm{w}}
\def\rmw{\mathrm{w}}
\def\rmd{\mathrm{d}}
\def\rmm{\mathrm{m}}
\def\rms{\mathrm{s}}
\def\rmZ{\mathrm{Z}}
\def\rmS{\mathrm{S}}
\def\mrd{\mathrm{d}}
\def\mre{\mathrm{e}}
\def\rme{\mathrm{e}}
\def\rmn{\mathrm{n}}
\def\mrn{\mathrm{n}}
\def\mrc{\mathrm{C}}
\def\mrcc{\mathrm{c}}
\def\rmc{\mathrm{C}}
\def\rmC{\mathrm{C}}
\def\GaStep{\Gamma}
\def\rmcc{\mathrm{c}}
\def\rma{\mathrm{a}}
\def\rmf{\mathrm{f}}
\def\rmg{\mathrm{g}}
\def\rmh{\mathrm{h}}
\def\rmv{\mathrm{v}}
\def\mra{\mathrm{a}}

\def\cov{\mathrm{Cov}}

% Opening / closing delimiters: double square brackets (stmaryrd), then
% auto-sized parentheses and square brackets. \po/\pf and \co/\cf expand to
% \left / \right and must therefore be used in matching pairs.
\newcommand{\cco}{\llbracket}
\newcommand{\ccf}{\rrbracket}
\newcommand{\po}{\left(}
\newcommand{\pf}{\right)}
\newcommand{\co}{\left[}
\newcommand{\cf}{\right]}
% One-letter set / operator shorthands.
\newcommand{\R}{\mathbb R}
\newcommand{\Z}{\mathbb Z}
\newcommand{\D}{\mathcal D}
\newcommand{\dd}{\mathrm{d}}
\newcommand{\A}{\mathcal A}
\newcommand{\M}{\mathcal M}
\newcommand{\na}{\nabla}
\newcommand{\loiy}{\mu_{\mathrm{v}}}


\def\MeasFspace{\mathbb{M}}
\def\xstar{x^\star}
% \Tr prints an operator "T", whereas \trace prints "Tr".
\def\Tr{\operatorname{T}}
\def\trace{\operatorname{Tr}}
% \functionspace[s]{X}: blackboard F with subscript s (default +) applied to X.
\newcommandx{\functionspace}[2][1=+]{\mathbb{F}_{#1}(#2)}
%% argmin, argmax
% Starred \operatorname: subscripts go underneath in display style.
\newcommand{\argmax}{\operatorname*{arg\,max}}
\newcommand{\argmin}{\operatorname*{arg\,min}}
\newcommand{\estimateur}[1]{\hat{\pi}_n^N(#1)}
\def\RichR{\operatorname{R}}
\def\piR{\hat{\pi}^{\RichR}}
\def\estimatorRR{\piR}
% \VarDeux{sub}{X}[sup]: Var with subscript #1 and optional superscript #3,
% applied to X in auto-sized braces. \VarDeuxLigne is the fixed-size variant
% without superscript.
\newcommandx{\VarDeux}[3][3=]{\operatorname{Var}^{#3}_{#1}\left\{#2 \right\}}
\newcommand{\VarDeuxLigne}[2]{\operatorname{Var}_{#1}\{#2 \}}
\newcommand{\gramm}{\operatorname{Gramm}}
% Indicator symbol (bbm's double-struck 1), bare and with a braced subscript.
\newcommand{\1}{\mathbbm{1}}
\newcommand{\2}[1]{\mathbbm{1}_{\{#1\}}}



% Redirects \veqno to \@@leqno. The internal name contains @, so this line
% relies on the \makeatletter issued earlier in the preamble.
\newcommand{\LeftEqNo}{\let\veqno\@@leqno}

% Symbols \lambda, \eta and m with an "origin -> destination" superscript:
% generic s -> t, then the fixed pairs 1 -> 2 and 2 -> 1.
% NOTE(review): the "un" names of \lambda and \eta carry 2 -> 1 while \mun
% carries 1 -> 2 (and conversely for the "deux" names) -- confirm intended.
\newcommand{\lambdast}{\lambda^{s \rightarrow t}}
\newcommand{\etast}{\eta^{s \rightarrow t}}
\newcommand{\mst}{m^{s \rightarrow t}}
\newcommand{\mun}{m^{1 \rightarrow 2}}
\newcommand{\mdeux}{m^{2 \rightarrow 1}}
\newcommand{\lambdaun}{\lambda^{2 \rightarrow 1}}
\newcommand{\etaun}{\eta^{2 \rightarrow 1}}
\newcommand{\lambdadeux}{\lambda^{1 \rightarrow 2}}
\newcommand{\etadeux}{\eta^{1 \rightarrow 2}}
% Same symbols between n+1 and \pi(n+1), in both directions, and from i.
\newcommand{\mnun}{m^{n+1 \rightarrow \pi(n+1)}}
\newcommand{\etanun}{\eta^{n+1 \rightarrow \pi(n+1)}}
\newcommand{\lambdanun}{\lambda^{n+1 \rightarrow \pi(n+1)}}
\newcommand{\xpinun}{x_{\pi(n+1)}}
\newcommand{\xnun}{x_{n+1}}
\newcommand{\mpinun}{m^{\pi(n+1) \rightarrow n+1}}
\newcommand{\etapinun}{\eta^{\pi(n+1) \rightarrow n+1}}
\newcommand{\lambdapinun}{\lambda^{\pi(n+1) \rightarrow n+1}}
\newcommand{\pinun}{\pi(n+1)}
\newcommand{\vois}{\mathcal{N}}
\newcommand{\mpii}{m^{i \rightarrow \pi(n+1)}}
\newcommand{\etapii}{\eta^{i \rightarrow \pi(n+1)}}
\newcommand{\lambdapii}{\lambda^{i \rightarrow \pi(n+1)}}
% Wide-hat / wide-tilde symbols.
\newcommand{\alphahat}{\widehat{\alpha}}
\newcommand{\betahat}{\widehat{\beta}}
\newcommand{\tildegamma}{\widetilde{\gamma}}
\newcommand{\tildeP}{\widetilde{P}}

% "Eq." followed by a tie and the parenthesised equation number.
\newcommand{\myeqref}[1]{Eq.~\eqref{#1}}



%%%% Floating Points Notation

% Rounding brackets: \fpround pairs a left floor with a right ceiling bracket
% (fixed size); \floor and \ceil are auto-sized.
\newcommand{\fpround}[1]{\lfloor #1 \rceil}
\newcommand{\floor}[1]{\left\lfloor #1 \right\rfloor}
\newcommand{\ceil}[1]{\left\lceil #1 \right\rceil}



%voc
% Ordinals p-th, q-th, n-th; \ensuremath makes them usable in text and math.
\newcommand{\pth}{\ensuremath{p^{\text{th}}}}
\newcommand{\qth}{\ensuremath{q^{\text{th}}}}
\newcommand{\nth}{\ensuremath{n^{\text{th}}}}

%order
\newcommand{\ord}{\ensuremath{\operatorname{ord}}}
\newcommand{\rad}{\ensuremath{\operatorname{rad}}}



% Sets
\newcommand{\N}{\ensuremath{\mathbb{N}}}
\newcommand{\Q}{\ensuremath{\mathbb{Q}}}
\newcommand{\C}{\ensuremath{\mathbb{C}}}

%\newcommand{\F}{\ensuremath{\mathbb{F}}}
\newcommand{\primes}{\ensuremath{\mathcal P}}

% Calligraphic "SF" with the letters pulled together by \!.
\newcommand{\sfi}{\ensuremath{\mathcal{S}\!\mathcal{F}}}
\newcommand{\sfibt}{\ensuremath{\mathcal{S}\!\mathcal{F}'}}

\newcommand{\reghat}{\widehat{R}}

\newcommand{\reghatn}{\widehat{R}_n}

\newcommand{\arm}{\mathcal{A}}

\newcommand{\mX}{\widehat{X}}
% Blackboard E and P; the expectation and probability macros of this file are
% built on \PE and \PP.
\newcommand{\PE}{\mathbb{E}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\Ft}{\mathcal{F}}

\newcommand{\Sy}{\mathbf{S}}

\newcommand{\Kfrac}{\mathscr{K}}

% Operands
% Delimiter wrappers. Naming convention visible below: the plain name is
% auto-sized (\left...\right), the "Ligne" suffix is the fixed-size version.
% NOTE: \tvnorm and \tvnormLigne are both fixed-size; \tvnormEq is the
% auto-sized one.
\newcommand{\absolute}[1]{\left\vert #1 \right\vert}
\newcommand{\abs}[1]{\left\vert #1 \right\vert}
\newcommand{\absLigne}[1]{\vert #1 \vert}
\newcommand{\tvnorm}[1]{\| #1 \|_{\mathrm{TV}}}
\newcommand{\tvnormLigne}[1]{\| #1 \|_{\mathrm{TV}}}
\newcommand{\tvnormEq}[1]{\left \| #1 \right \|_{\mathrm{TV}}}
% \Vnorm[W]{f}: double bars around f with subscript W (default V).
\newcommandx{\Vnorm}[2][1=V]{\| #2 \|_{#1}}
\newcommandx{\VnormEq}[2][1=V]{\left\| #2 \right\|_{#1}}
% \newcommandx{\norm}[2][1=]{\ifthenelse{\equal{#1}{}}{\left\Vert #2 \right\Vert}{\left\Vert #2 \right\Vert^{#1}}}
% \newcommandx{\normLigne}[2][1=]{\ifthenelse{\equal{#1}{}}{\Vert #2 \Vert}{\Vert #2\Vert^{#1}}}
% Angle brackets, parentheses ("parenthese"), square brackets
% ("parentheseDeux") and braces ("defEns").
\newcommand{\crochet}[1]{\left\langle#1 \right\rangle}
\newcommand{\parenthese}[1]{\left(#1 \right)}
\newcommand{\parentheseLigne}[1]{(#1 )}
\newcommand{\parentheseDeux}[1]{\left[ #1 \right]}
\newcommand{\parentheseDeuxLigne}[1]{[ #1 ]}
\newcommand{\defEns}[1]{\left\lbrace #1 \right\rbrace }
\newcommand{\defEnsLigne}[1]{\lbrace #1 \rbrace }
% One-sided braces: only the left (Point, L) or only the right (PointDeux, R)
% brace is printed; the other side is the invisible "." delimiter.
\newcommand{\defEnsPoint}[1]{\left\lbrace #1 \right. }
\newcommand{\defEnsPointDeux}[1]{\left. #1 \right  \rbrace }
\newcommand{\defEnsL}[1]{\left\lbrace #1 \right. }
\newcommand{\defEnsR}[1]{\left. #1 \right  \rbrace }

%\newcommand{\defSystem}[1]{\left\lbrace #1 \right. }

% \ps{x}{y}: auto-sized angle brackets around "x,y".
\newcommand{\ps}[2]{\left\langle#1,#2 \right\rangle}
% Both "defined as" macros currently print a plain equals sign.
\newcommand{\eqdef}{=}
\newcommand{\defeq}{=}

% Relations
% Divisibility relations.
\newcommand{\divid}{\mid}
\newcommand{\ndivide}{\nmid}

% Proba
% Probability macros: P(A), P(A | B), with "Ligne" fixed-size variants.
\newcommand{\proba}[1]{\mathbb{P}\left( #1 \right)}
\newcommand{\probaCond}[2]{\mathbb{P}\left( \left. #1  \middle\vert #2 \right.\right)}
\newcommand{\probaCondLigne}[2]{\mathbb{P}(#1  \vert #2 )}
\newcommand{\probaCondLignePi}[2]{\Pi(#1  \vert #2 )}
\newcommand{\probaLigne}[1]{\mathbb{P}( #1 )}
% \probaMarkovTilde{x}[A]: wide-tilde P with subscript x, followed by [A] only
% when the optional second argument is non-empty.
\newcommandx\probaMarkovTilde[2][2=]
{\ifthenelse{\equal{#2}{}}{{\widetilde{\mathbb{P}}_{#1}}}{\widetilde{\mathbb{P}}_{#1}\left[ #2\right]}}
\newcommand{\probaMarkov}[2]{\mathbb{P}_{#1}\left[ #2\right]}
\newcommand{\probaMarkovDD}[1]{\mathbb{P}_{#1}}
% Expectation macros built on \PE: plain, with exponent, with subscript
% ("Markov") and with both.
% NOTE: \expeMarkovD takes (subscript, argument, superscript) whereas
% \expeMarkovExpo takes (subscript, superscript, argument).
\newcommand{\expe}[1]{\PE \left[ #1 \right]}
\newcommand{\expesq}[1]{\PE^{1/2} \left[ #1 \right]}
\newcommand{\expeExpo}[2]{\PE^{#1} \left[ #2 \right]}
\newcommand{\expeLigne}[1]{\PE [ #1 ]}
\newcommand{\expeLine}[1]{\PE [ #1 ]}
\newcommand{\expeMarkov}[2]{\PE_{#1} \left[ #2 \right]}
\newcommand{\expeMarkovD}[3]{\PE_{#1}^{#3} \left[ #2 \right]}
\newcommand{\expeMarkovDD}[1]{\PE_{#1}}
\newcommand{\expeMarkovLigne}[2]{\PE_{#1} [ #2 ]}
\newcommand{\expeMarkovExpo}[3]{\PE_{#1}^{#2} \left[ #3 \right]}
\newcommand{\probaMarkovTildeDeux}[2]{\widetilde{\mathbb{P}}_{#1} \left[ #2 \right]}
\newcommand{\expeMarkovTilde}[2]{\widetilde{\PE}_{#1} \left[ #2 \right]}

% Landau notation (big O)
\newcommand{\bigO}{\ensuremath{\mathcal O}}
\newcommand{\softO}{\Tilde{\ensuremath{\mathcal O}}}

% Environments

%\renewenvironment{proof}[1][{\textit{Proof:}}]{\begin{trivlist} \item[\em{\hskip \labelsep #1}]}{\ensuremath{\qed} \end{trivlist}}

%\renewenvironment{proof}[1][{\textit{Proof:}}]{\begin{trivlist} \item[\em{\hskip \labelsep #1}]}{\ensuremath{\qed} \end{trivlist}}



% limit arrows
% Long right arrow with "n -> +infinity" underneath, a generic version with
% "#1 -> #2" underneath, and a version with the label on top.
\newcommand{\flecheLimite}{\underset{n\to+\infty}{\longrightarrow}}
\newcommand{\flecheLimiteOption}[2]{\underset{#1\to#2}{\longrightarrow}}
\newcommand{\flecheLimiteHaut}{\overset{n\to+\infty}{\longrightarrow}}


% infinity notation
\newcommand{\plusinfty}{+\infty}

% equality notation
% Equals sign with #1 set underneath.
\newcommand{\egale}[1]{\ensuremath{\underset{#1}{=}}}

% multi-line subscripts
%\sum\limits_{\substack{i=0 \\ i \neq i_0}}^{n}{A_



% Steps the equation counter and tags the current line with it; for numbering
% a single line inside a starred display environment.
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}


\newcommand{\hilbert}{\mathcal{H}}


% Text abbreviations.
% NOTE: \as is defined again further down in this file ("a.s." via \text);
% that later \def is the one in force.
\def\ie{\textit{i.e.}}
\def\as{\textit{a.s}}
\def\cadlag{càdlàg}
% Thick space used as a separator inside set and sequence macros.
\def\eqsp{\;}
% Intervals: c = closed end, o = open end, read left to right
% (\coint = [a,b), \ocint = (a,b], ...). "Ligne" variants are fixed-size.
\newcommand{\coint}[1]{\left[#1\right)}
\newcommand{\ocint}[1]{\left(#1\right]}
\newcommand{\ooint}[1]{\left(#1\right)}
\newcommand{\ccint}[1]{\left[#1\right]}
\newcommand{\cointLigne}[1]{[#1)}
\newcommand{\ocintLigne}[1]{(#1]}
\newcommand{\oointLigne}[1]{(#1)}
\newcommand{\ccintLigne}[1]{[#1]}

\def\primr{f_r}
\def\primrO{f_{r_0}}




% Indicator symbol with subscript #1.
\newcommand{\indi}[1]{\1_{#1}}
% \weight{a}[b]: \omega with subscript "a,b" (b defaults to n) and superscript N.
\newcommandx{\weight}[2][2=n]{\omega_{#1,#2}^N}
\newcommand{\loi}{\mathcal{L}}
% Balls B(#1,#2): \boule and \ball are synonyms; \boulefermee and \cball put a
% bar over the B (italic B in the former, operator B in the latter).
\newcommand{\boule}[2]{\operatorname{B}(#1,#2)}
\newcommand{\ball}[2]{\operatorname{B}(#1,#2)}
\newcommand{\boulefermee}[2]{\bar{B}(#1,#2)}
\newcommand{\cball}[2]{\bar{\operatorname{B}}(#1,#2)}
\newcommand{\diameter}{\operatorname{diam}}
\newcommand{\deta}{d_{\eta}}

\def\TV{\mathrm{TV}}

% Per-author review notes.
% NOTE(review): \todo comes from the todonotes package, which is not loaded in
% the visible part of this preamble -- these macros fail when used unless it
% is loaded elsewhere.
% NOTE(review): \arnaudi prints the initials "AL" while \arnaud prints "AD".
\newcommand{\yuyang}[1]{\todo[color=blue!20]{{\bf YS:} #1}}
\newcommand{\james}[1]{\todo[color=blue!20]{{\bf JT:} #1}}
\newcommand{\arnaud}[1]{\todo[color=blue!20]{{\bf AD:} #1}}
\newcommand{\arnaudi}[1]{\todo[color=blue!20,inline]{{\bf AL:} #1}}
\newcommand{\valentin}[1]{\todo[color=blue!20]{{\bf VDB:} #1}}
\newcommand{\valentintxt}[1]{\textcolor{red}{\textbf{VDB}: #1}}
 \newcommand{\valentini}[1]{\todo[color=blue!20,inline]{{\bf VDB:} #1}}
 
% \newcommand{\aymeric}[1]{\todo[color=blue!20]{{\bf AD:} #1}}
% \newcommand{\francis}[1]{\todo[color=black!20]{{\bf FB:} #1}}
 \newcommand{\tcr}[1]{\textcolor{red}{#1}}
% \newcommand{\tcb}[1]{\textcolor{blue}{#1}}


% "a.s." usable in text and math; overrides the italic \as defined earlier in
% this file.
\def\as{\ensuremath{\text{a.s.}}}
\def\dist{\operatorname{dist}}

% \sequence{X}[n][S]: {X_n} when the index set S is empty, otherwise
% {X_n, n \in S}.
\newcommandx\sequence[3][2=,3=]
{\ifthenelse{\equal{#3}{}}{\ensuremath{\{ #1_{#2}\}}}{\ensuremath{\{ #1_{#2}, \eqsp #2 \in #3 \}}}}

% \sequenceD{X}[n][S]: {X_n} when S is empty, otherwise (X)_{n \in S}.
\newcommandx\sequenceD[3][2=,3=]
{\ifthenelse{\equal{#3}{}}{\ensuremath{\{ #1_{#2}\}}}{\ensuremath{( #1)_{ #2 \in #3} }}}

% \sequencen{X}[cond]: {X_n, cond}, with cond defaulting to "n \in \N".
\newcommandx{\sequencen}[2][2=n\in\N]{\ensuremath{\{ #1_n, \eqsp #2 \}}}
% \sequenceDouble{X}{Y}[n][S]: the sequence of pairs {(X_n, Y_n)}, followed by
% ", n \in S" only when the index set S (4th argument) is given.
% The emptiness test is on the index set, as in \sequence; testing the index
% (#3) instead would print a dangling "\in" when an index is given without a set.
\newcommandx\sequenceDouble[4][3=,4=]
{\ifthenelse{\equal{#4}{}}{\ensuremath{\{ (#1_{#3},#2_{#3}) \}}}{\ensuremath{\{  (#1_{#3},#2_{#3}), \eqsp #3 \in #4 \}}}}
% \sequencenDouble{X}{Y}[cond]: {(X_n, Y_n), cond}, cond defaulting to "n \in \N".
\newcommandx{\sequencenDouble}[3][3=n\in\N]{\ensuremath{\{ (#1_{n},#2_{n}), \eqsp #3 \}}}


% Text abbreviations. They end with a period and add no trailing space, so
% write "\wrt\ " (or "\wrt{} ") when more text follows in the sentence.
\newcommand{\wrt}{w.r.t.}
\newcommand{\Withoutlog}{w.l.o.g.}
\def\iid{i.i.d.}
\def\ifof{if and only if}
\def\eg{\textit{e.g.}}


% Bold red text.
\newcommand{\notered}[1]{{\textbf{\color{red}#1}}}


% Triple-bar norm: three nested auto-sized bars, pulled together by negative
% kerns.
\newcommand{\opnorm}[1]{{\left\vert\kern-0.25ex\left\vert\kern-0.25ex\left\vert #1
    \right\vert\kern-0.25ex\right\vert\kern-0.25ex\right\vert}}


%\def\Lip{\operatorname{Lip}}
% Calligraphic A and variants; the superscripted forms use \sphere and \rset,
% which are defined elsewhere in this file.
\def\generator{\mathcal{A}}
\def\generatort{\tilde{\mathcal{A}}}
\def\generatorsp{\generator^{\sphere^d}}
\def\generatorr{\generator^{\rset^d}}

\def\momentNoise{\mathrm{m}}
% Bold e and v, each under two synonymous names.
\def\bfe{\mathbf{e}}

\def\bfv{\mathbf{v}}
\def\ebf{\mathbf{e}}
\def\vbf{\mathbf{v}}


\def\Id{\operatorname{Id}}
\def\Idbf{\mathbf{I}}

\def\tildetheta{\tilde{\theta}}

\def\calC{\mathcal{C}}


% Conditional expectation / variance / probability. The optional first
% argument is a subscript on the operator.
% \CPE[sub]{X}{G}: E_sub[X | G], auto-sized; \CPELigne is fixed-size and
% \CPEsq uses E^{1/2}.
\newcommandx{\CPE}[3][1=]{{\mathbb E}_{#1}\left[#2 \middle \vert #3  \right]} %%%% conditional expectation
\newcommandx{\CPELigne}[3][1=]{{\mathbb E}_{#1}[#2  \vert #3  ]} %%%% conditional expectation
\newcommandx{\CPEsq}[3][1=]{{\mathbb{E}^{1/2}}_{#1}\left[#2 \middle \vert #3  \right]} %%%% conditional expectation
% \CPVar[sub]{X}{sup}: Var with subscript #1 and superscript #3, applied to X
% in braces (here #3 is a superscript, not a conditioning argument).
\newcommandx{\CPVar}[3][1=]{\mathrm{Var}^{#3}_{#1}\left\{ #2 \right\}}
% \CPP[sub]{A}{B}: P(A | B), with subscript on P when the optional argument is
% non-empty.
\newcommand{\CPP}[3][]
{\ifthenelse{\equal{#1}{}}{{\mathbb P}\left(\left. #2 \, \right| #3 \right)}{{\mathbb P}_{#1}\left(\left. #2 \, \right | #3 \right)}}

% Script letters (\Ascr and \scrA are synonyms).
\def\Ascr{\mathscr{A}}
\def\scrA{\mathscr{A}}
\def\scrB{\mathscr{B}}
\def\scrC{\mathscr{C}}

\def\barL{\bar{L}}

\def\YL{\mathbf{Y}}
\def\XEM{X}
\def\steps{\gamma}
\def\measSet{\mathbb{M}}

%\newcommand\Ent[2]{\mathrm{Ent}_{#1}\left(#2\right)}
% \osc[sub]{f}: "osc" with optional subscript, applied to f.
\newcommandx{\osc}[2][1=]{\mathrm{osc}_{#1}(#2)}

\def\Ybar{\bar{Y}}
% \Id repeats the definition given a few lines above (same body).
\def\Id{\operatorname{Id}}
\def\IdM{\operatorname{I}_d}
% \EntDeux{sub}{f}: \Ent with subscript #1 applied to f in square brackets.
\newcommand\EntDeux[2]{\Ent_{#1}\left[#2 \right]}
\def\Ltwo{\mathrm{L}^2}
\def\Lone{\mathrm{L}^1}
% Density-style ratios of differentials: d#1/d\pi and d#1/d#2, as a fraction
% or ("Ligne") with an in-line slash.
\newcommand\densityPi[1]{\frac{\rmd #1}{\rmd \pi}}
\newcommand\densityPiLigne[1]{\rmd #1 /\rmd \pi}
\newcommand\density[2]{\frac{\rmd #1}{\rmd #2}}
\newcommand\densityLigne[2]{\rmd #1/\rmd #2}

% Aliases for V; \Vsp and \Vr add a superscript space and subscripts built
% from \b, \c (defined below as the plain letters b, c) and \beta.
\def\V{V}
\def\VD{V}
\def\Vsp{V^{\sphere^d}_{\b,\beta}}
\def\Vr{V^{\rset^d}_{\b,\c,\beta}}

\def\Prset{P^{\rset^d}}
\def\Psphere{P^{\sphere^d}}

\def\n{\mathrm{n}}
% Indirection layer: \V<greek> names the Greek letter, and <greek>V is an
% alias of it.
\def\Vpsi{\psi}
\def\Vkappa{\kappa}
\def\Vkappat{\tilde{\kappa}}
\def\Vchi{\chi}
\def\Vchit{\tilde{\chi}}
\def\Vphi{\phi}
\def\Vrho{\rho}
\def\psiV{\Vpsi}
\def\rhoV{\Vrho}
\def\phiV{\Vphi}
\def\fV{f}
\def\Vf{\fV}
\def\kappaVt{\tilde{\Vkappa}}
\def\kappaV{\Vkappa}
\def\chiV{\Vchi}
\def\chiVt{\Vchit}


% Single-letter macros expanding to the plain letter.
% NOTE(review): \a, \b and \c are LaTeX text-accent commands (\c is the
% cedilla); these \defs replace them silently, so accented input that relies
% on them -- e.g. author names in the bibliography -- will no longer typeset
% correctly. Confirm that this is acceptable.
\def\a{a}
\def\b{b}
\def\c{c}
\def\e{e}
\def\rU{\mathrm{r}}

\def\domain{\mathrm{D}}

\def\martfg{M^{f,g}}
% D with subscript #1.
\newcommand\Ddir[1]{D_{#1}}
% Auto-sized parentheses around #1 with a "+" subscript.
\newcommand\maxplus[1]{\parenthese{#1}_+}
\def\Refl{\mathrm{R}}
\def\phibf{\pmb{\phi}}
\def\Gammabf{\mathbf{\Gamma}}


\def\transpose{\top}
% NOTE: \w, \y and \z were defined earlier in this file as bold letters;
% these later \defs win, so they print the plain letters.
% NOTE(review): \v is LaTeX's caron text accent; this \def replaces it.
\def\v{v}
\def\w{w}
\def\y{y}
\def\z{z}
%%%% bar
% Barred symbols. Both spellings are provided for many of them (prefix "b" and
% suffix "b": \bX and \Xb, \bv and \vb, ...).
\def\bD{\bar{D}}
\def\bC{\bar{C}}
\def\brho{\bar{\rho}}
\def\bt{\bar{t}}
\def\bA{\bar{A}}
\def\bb{\overline{b}}
\def\bc{\bar{c}}
\def\bgamma{\bar{\gamma}}
\def\bU{\bar{U}}
\def\Ub{\bU}
\def\lambdab{\bar{\lambda}}
\def\blambda{\bar{\lambda}}
\def\blambdab{\bar{\lambda}}
\def\bv{\bar{v}}
\def\vb{\bv}
\def\yb{\bar{y}}
\def\by{\yb}
\def\Xb{\bar{X}}
\def\Yb{\bar{Y}}
\def\Gb{\bar{G}}
\def\Eb{\bar{E}}
\def\Tb{\bar{T}}
\def\taub{\bar{\tau}}

\def\bX{\bar{X}}
\def\bY{\bar{Y}}
\def\bG{\bar{G}}
\def\bE{\bar{E}}
\def\bT{\bar{T}}
\def\btau{\bar{\tau}}

\def\pib{\bar{\pi}}
\def\bpi{\pib}

% NOTE(review): \S is LaTeX's section-sign command; this \def replaces it by
% the letter S.
\def\S{S}

%%%% tilde
% Tilded symbols, again under prefix ("t") and suffix ("t") spellings.
\def\tgamma{\tilde{\gamma}}
\def\tC{\tilde{C}}
\def\tB{\tilde{B}}
\def\tc{\tilde{c}}
\def\tvareps{\tilde{\vareps}}
\def\trho{\tilde{\rho}}
\def\tmsk{\tilde{\msk}}
\def\tW{\tilde{W}}
\def\tvarsigma{\tilde{\varsigma}}
\def\tv{\tilde{v}}
\def\vt{\tv}
\def\yt{\tilde{y}}
\def\ty{\yt}
\def\Mt{\tilde{M}}
\def\tM{\Mt}

% NOTE: \xt was defined earlier in this file as a bold x with superscript t;
% this later \def wins, so \xt prints \tilde{x}.
\def\tx{\tilde{x}}
\def\xt{\tx}
\def\Xt{\tilde{X}}
\def\Yt{\tilde{Y}}
\def\Gt{\tilde{G}}
\def\Et{\tilde{E}}
\def\Tt{\tilde{T}}
\def\St{\tilde{S}}
\def\taut{\tilde{\tau}}

\def\tX{\tilde{X}}
\def\tY{\tilde{Y}}
\def\tG{\tilde{G}}
\def\tE{\tilde{E}}
\def\tT{\tilde{T}}
\def\tS{\tilde{S}}
\def\ttau{\tilde{\tau}}


% The next two groups repeat the barred definitions above with identical
% bodies and add \Sb, \Hb, \Nb, \bS and \bH.
\def\Xb{\bar{X}}
\def\Yb{\bar{Y}}
\def\Gb{\bar{G}}
\def\Eb{\bar{E}}
\def\Tb{\bar{T}}
\def\Sb{\bar{S}}
\def\taub{\bar{\tau}}
\def\Hb{\bar{H}}
\def\Nb{\bar{N}}


\def\bX{\bar{X}}
\def\bY{\bar{Y}}
\def\bG{\bar{G}}
\def\bE{\bar{E}}
\def\bT{\bar{T}}
\def\btau{\bar{\tau}}
\def\bS{\bar{S}}
\def\bH{\bar{H}}
%\def\bN{\bar{N}}

%%%%%%%%

% Named constants and radii.
\def\mgU{\mathrm{m}_{\nabla U}}
\def\MintDrift{I}
\def\CU{C_U}
\def\RU{R_1}
\def\RV{R}
\def\Reps{R_{\epsilon}}
\def\Resp{\Reps}
\def\veps{\varepsilon}

% \sphere is the sans-serif S (\mss).
\def\sphere{\mss}

\def\nablaUt{\overline{\nabla U}}
\def\measureSphere{\nu^d}

\def\etaU{\eta}
\def\epsilonU{\epsilon}

% Operator names (\Jac and \jac are synonyms).
\def\Jac{\operatorname{Jac}}
\def\jac{\operatorname{Jac}}
\def\sign{\operatorname{sign}}
\def\rate{\lambda_{\mathrm{r}}}







\def\sigmaS{\sigma^2}

% \ensemble{x}{cond}: set-builder {x : cond} with auto-sized braces;
% \ensembleLigne is fixed-size and \set is a synonym of \ensemble.
\newcommand{\ensemble}[2]{\left\{#1\,:\eqsp #2\right\}}
\newcommand{\ensembleLigne}[2]{\{#1\,:\eqsp #2\}}
\newcommand{\set}[2]{\ensemble{#1}{#2}}

\def\rmD{\mathrm{D}}%%rmd is already taken
% NOTE(review): \mrd was defined earlier in this file as an upright lower-case
% d; this \def changes it to an upright capital D for everything that follows.
% Confirm that the change is intended. \mrc repeats its earlier definition.
\def\mrd{\mathrm{D}}
\def\mrc{\mathrm{C}}

\def\diag{\Delta_{\rset^d}}

%\def\lyap{W}
% \coupling{mu}{nu}: \Gamma applied to the two arguments.
% The body uses #1 and #2 so that the printed pair follows the arguments
% instead of always being (\mu,\nu).
\newcommand\coupling[2]{\Gamma(#1,#2)}
\def\supp{\mathrm{supp}}
\def\tpi{\tilde{\pi}}
% Overline over #1.
\newcommand\adh[1]{\overline{#1}}

\def\ACb{\mathrm{AC}_{\mathrm{b}}}

\def\opK{\mathrm{K}}

% In-line fractions: \fracm uses an auto-sized slash (\middle /), \fraca a
% plain slash, \fracaa a plain slash with the denominator in parentheses.
\newcommand{\fracm}[2]{\left. #1 \middle / #2 \right.}
\newcommand{\fraca}[2]{ #1  / #2 }
\newcommand{\fracaa}[2]{ #1  / (#2) }

\newcommand{\complementary}{\mathrm{c}}

% \renewcommand{\geq}{\geqslant}
% \renewcommand{\leq}{\leqslant}
\def\poty{H}
\def\diam{\mathrm{diam}}
\def\talpha{\tilde{\alpha}}
% \def\Leb{\mathrm{Leb}}
\def\Leb{\lambda}
% \iintD{a}{b}: the set {a,...,b}.
\newcommand{\iintD}[2]{\{#1,\ldots,#2\}}
\def\interior{\mathrm{int}}
% NOTE(review): \iff is a standard math relation; this \def replaces it by
% the words " if and only if " (which in math mode print as italic letters
% without spaces). Confirm that no formula relies on the symbol.
\def\iff{ if and only if }

% Spelling variants that all print \varepsilon.
\def\vareps{\varepsilon}
\def\bvareps{\bar{\varepsilon}}
\def\varespilon{\varepsilon}
% The text " if " for use inside math.
\def\si{\text{ if } }
\def\proj{\operatorname{proj}}
\def\projd{\operatorname{proj}^{\msd}}
\def\Phibf{\mathbf{\Phi}}
\def\Psibf{\mathbf{\Psi}}

\def\rker{\mathrm{R}}
\def\kker{\mathrm{K}}
\def\VEa{V}
\def\KUa{K}
% Divergence-style operators with two arguments: \KL prints KL(#1 | #2),
% \KLsqrt uses KL^{1/2}, \Jef prints J(#1 , #2). Plain names are auto-sized,
% "Ligne" names fixed-size. (Declared with \newcommandx although none takes an
% optional argument.)
\newcommandx{\KL}[2]{\operatorname{KL}\left( #1 | #2 \right)}
\newcommandx{\KLsqrt}[2]{\operatorname{KL}^{1/2}\left( #1 | #2 \right)}
\newcommandx{\Jef}[2]{\operatorname{J}\left( #1 , #2 \right)}
\newcommandx{\JefLigne}[2]{\operatorname{J}( #1 , #2 )}
\newcommandx{\KLLigne}[2]{\operatorname{KL}( #1 | #2 )}

% \gaStep needs its own braced replacement text: without one, TeX reads the
% following "\def\QKer" as the parameter text of \gaStep and uses {Q} as its
% body, so \QKer is never defined.
% NOTE(review): \gamma is assumed by analogy with \GaStep (which is \Gamma) --
% confirm the intended symbol.
\def\gaStep{\gamma}
\def\QKer{Q}
\def\Tg{\mathcal{T}_{\gamma}}
\def\Tk{\mathcal{T}_{k}}
\def\Tn{\mathcal{T}_{k}}
\def\Tnplusun{\mathcal{T}_{k+1}}
\def\mcurb{m}
%\newcommand{\coupling}[1]{\Gamma\left( #1 \right)}
\newcommand{\couplingLine}[1]{\Gamma( #1 )}
\def\distance{\mathbf{d}}
\newcommandx{\wasserstein}[3][1=\distance,3=]{\mathbf{W}_{#1}^{#3}\left(#2\right)}
\newcommandx{\wassersteinLigne}[3][1=\distance,3=]{\mathbf{W}_{#1}^{#3}(#2)}
\newcommandx{\wassersteinD}[1][1=\distance]{\mathbf{W}_{#1}}
\newcommandx{\wassersteinDLigne}[1][1=\distance]{\mathbf{W}_{#1}}


% Upright letters used as kernel/coupling symbols.
\def\Rcoupling{\mathrm{R}}
\def\Qcoupling{\mathrm{Q}}
\def\Sker{\mathrm{S}}
\def\Kcoupling{\mathrm{K}}
\def\tKcoupling{\tilde{\mathrm{K}}}
\def\Lcoupling{\mathrm{L}}
\def\Kcouplingproj{\mathrm{K}^P}
\def\vepsilon{\varepsilon}


% Alias forwarding both arguments to \ensemble.
\newcommand{\defEnsE}[2]{\ensemble{#1}{#2}}
% Wide-tilde \PE / \PP with subscript #1, superscript #3 and bracketed body #2.
\newcommand{\expeMarkovTildeD}[3]{\widetilde{\PE}_{#1}^{#3} \left[ #2 \right]}
\newcommand{\probaMarkovTildeD}[3]{\widetilde{\PP}_{#1}^{#3} \left[ #2 \right]}
\def\coordtildex{\mathrm{w}}
\def\PPtilde{\widetilde{\PP}}
\def\PEtilde{\widetilde{\PE}}
\def\transfrr{\mathrm{F}}
\def\diagSet{\Delta_{\msx}}
% \Deltar is an alias of \diagSet.
\def\Deltar{\diagSet}
\def\complem{\operatorname{c}}
\def\alphar{\alpha}
\def\tildex{\tilde{x}}
\def\tildez{\tilde{z}}
\def\tildey{\tilde{y}}
\def\ar{\mathrm{a}}
\def\Kr{\mathsf{K}}
\def\Kar{K^{(\mathrm{a})}}
\def\Xr{\mathrm{X}}
\def\Yr{\mathrm{Y}}
\def\Xrd{\mathit{X}}
\def\Yrd{\mathit{Y}}
\def\Zr{\mathrm{Z}}
\def\Ur{\mathrm{U}}
\def\sigmaD{\sigma^2}
\def\sigmakD{\sigma^2_k}
% Bold \varphi subscripted by \sigma^2_{#1}; the optional argument defaults to
% empty, which leaves an empty subscript on \sigma^2.
\newcommandx{\phibfs}[1][1=]{\pmb{\varphi}_{\sigmaD_{#1}}}
% \vphibf and \varphibf are identical.
\def\vphibf{\pmb{\varphi}}
\def\varphibf{\pmb{\varphi}}
\def\phibfvs{\pmb{\varphi}_{\varsigma^2}}
\def\funreg{\mct}
\def\kappar{\varpi}
% NOTE: \def silently replaces LaTeX's built-in \Pr operator with sans-serif P.
\def\Pr{\mathsf{P}}
\def\Par{P^{(\mathrm{a})}}
\def\Qr{\mathsf{Q}}
\def\Qar{Q^{(\mathrm{a})}}
\def\eventA{\msa}

\def\borelSet{\B}
\def\Er{\mathrm{E}}
\def\er{\mathrm{e}}
% Upright T set as an operator (presumably the transpose mark -- TODO confirm).
\def\transp{\operatorname{T}}

% \sequenceg{a}[i][m] typesets an indexed family (xargs syntax):
%   - third argument empty  ->  ( a_{i} )
%   - third argument given  ->  ( a_{i} )_{ i \geq m }
% Arguments 2 and 3 are optional and default to empty. \ensuremath lets the
% macro be used in both text and math mode.
\newcommandx\sequenceg[3][2=,3=]
{\ifthenelse{\equal{#3}{}}{\ensuremath{( #1_{#2})}}{\ensuremath{( #1_{#2})_{ #2 \geq #3}}}}


% Single-symbol aliases.
\def\indiar{\iota}
\def\rated{\chi}
\def\transar{\tau}
\def\filtrationTilde{\tilde{\mcf}}

% Upright "d" and "c" labels.
\def\discrete{\mathrm{d}}
\def\continuous{\mathrm{c}}


% Symbols carrying an upright "(a)" superscript.
\def\Xar{X^{(\mathrm{a})}}
\def\Yar{Y^{(\mathrm{a})}}
\def\War{W^{(\mathrm{a})}}
\def\Xiar{\Xi^{(\mathrm{a})}}
\def\mcfar{\mcf^{(\mathrm{a})}}

\def\Xart{\tilde{X}^{(\mathrm{a})}}
\def\Yart{\tilde{Y}^{(\mathrm{a})}}


% Kernel symbols. \Kker and \Rker are aliases of \Kcoupling and \Rcoupling;
% the *f / *b variants add a right / left over-arrow.
\def\Kker{\Kcoupling}
\def\KkerD{\tilde{\Kcoupling}}
\def\Rker{\Rcoupling}
\def\tRker{\tilde{\Rker}}
\def\Pker{\mathrm{P}}
\def\Pkerf{\overrightarrow{\mathrm{P}}}
\def\Pkerfou{\overrightarrow{\mathrm{P}}_{\mathrm{OU}}}
\def\Pkerb{\overleftarrow{\mathrm{P}}}
\def\Rkerb{\overleftarrow{\mathrm{R}}}
\def\Skerb{\overleftarrow{\mathrm{S}}}
\def\Qker{\mathrm{Q}}
\def\Lker{\mathrm{L}}
\def\rmL{\mathrm{L}}
\def\rmG{\mathrm{G}}
\def\bfmu{\bm{\mu}}

% Plain W with numeric subscripts.
\def\VlyapD{W}
\def\VlyapDun{W_1}
\def\VlyapDdeux{W_2}
\def\VlyapDtrois{W_3}
% \newcommandx{\distV}[1][1=W]{\mathbf{d}_{#1}}
% \distV[sub] -> bold W with subscript (default \bfc);
% \distVdeux[sub] -> bold d with subscript (default W_2).
\newcommandx{\distV}[1][1=\bfc]{\mathbf{W}_{#1}}
\newcommandx{\distVdeux}[1][1=W_2]{\mathbf{d}_{#1}}

% \inv expands to a left arrow.
\def\inv{\leftarrow}
% \couplage{#1}{#2} typesets \Pi(#1,#2).
\newcommand{\couplage}[2]{\Pi(#1,#2)}
% Typewriter "m" constants; \mtt and \ttm are identical, as are \mttplus and
% \ttmplus.
\def\mtt{\mathtt{m}}
\def\mttzero{\mathtt{m}_0}
\def\tmtt{\tilde{\mathtt{m}}}
\def\ttm{\mathtt{m}}
\def\mttplus{\mathtt{m}^{+}}
\def\mttplusun{\mathtt{m}_1^{+}}
\def\mttplusdeux{\mathtt{m}_2^{+}}
\def\ttmplus{\mathtt{m}^{+}}
\def\cconst{\mathtt{a}}
\def\Run{R_1}
\def\Rdeux{R_2}
\def\Rtrois{R_3}
\def\Rquatre{R_4}
\def\tR{\tilde{R}}
\def\tmttplus{\tilde{\mtt}^+}
% Shorthand for \textup.
\newcommand{\tup}[1]{\textup{#1}}
\def\Fix{\operatorname{Fix}}
% \T with subscript "\msc,\mathtt{n}_0" and parenthesised superscript #1.
\newcommand{\stopping}[1]{\T_{\msc,\mathtt{n}_0}^{(#1)}}
\def\wass{\mathcal{W}}
\def\distY{\mathbf{d}}
\def\Xibf{\boldsymbol{\Xi}}
\def\rhomax{\rho_{\rm{max}}}
\def\rhof{\overrightarrow{\rho}}
\def\familydrift{\mathscr{B}}

% Bold W subscripted by \bfc_1, \bfc_2, \bfc_3.
\def\wasscun{\mathbf{W}_{\bfc_1}}
\def\wasscdeux{\mathbf{W}_{\bfc_2}}
\def\wassctrois{\mathbf{W}_{\bfc_3}}

\def\loiz{\mu_{\msz}}
% \muz is an alias of \loiz.
\def\muz{\loiz}
\def\funH{H}

% NOTE: \doteq is redefined to a plain equals sign for the whole document.
\renewcommand{\doteq}{=}
\newcommand{\Idd}{\operatorname{I}_d}


 
% Helpers for cross-referencing labels of another document via the xr package.
\makeatletter
% \addFileDependency{<file>}: prints "(<file>)" to the terminal/log, appends
% <file> to LaTeX's internal file list, and prints "No file <file>." when the
% file does not exist. (Presumably this lets build tools that scan the log
% detect the dependency -- TODO confirm.)
\newcommand*{\addFileDependency}[1]{% argument=file name and extension
  \typeout{(#1)}
  \@addtofilelist{#1}
  \IfFileExists{#1}{}{\typeout{No file #1.}}
}
\makeatother

% \myexternaldocument{<base>}: imports the labels of <base> with
% \externaldocument and registers both <base>.tex and <base>.aux as
% dependencies.
\newcommand*{\myexternaldocument}[1]{%
    \externaldocument{#1}%
    \addFileDependency{#1.tex}%
    \addFileDependency{#1.aux}%
}

% Make the labels of shi_557 available to \ref / \eqref / \Cref in this file.
\myexternaldocument{shi_557}

% None of the commands in this group contains "@"; the \makeatletter /
% \makeatother pair is kept as in the original.
\makeatletter

% Upright "erf" math operator.
\DeclareMathOperator\erf{erf}

% Display names for theorem-like environments. \providecommand only defines a
% name when it is not already defined, so earlier definitions win.
\providecommand{\assumptionname}{Assumption}
\providecommand{\lemmaname}{Lemma}
\providecommand{\propositionname}{Proposition}
\providecommand{\remarkname}{Remark}
\providecommand{\corollaryname}{Corollary}
\providecommand{\theoremname}{Theorem}


\makeatother

\begin{document}
\title{Conditional Simulation Using Diffusion Schr\"{o}dinger Bridges \\ (Supplementary Material)}
\author[1]{Yuyang~Shi}
\author[2]{Valentin~De~Bortoli}
\author[1]{George~Deligiannidis}
\author[1]{Arnaud~Doucet}

\affil[1]{%
    Department of Statistics\\
    University of Oxford, UK
}

\affil[2]{%
   ENS, PSL University, Paris, France 
}

\onecolumn

\maketitle

\appendix
% Number figures and tables as <section>.<counter>.
\renewcommand{\thefigure}{\thesection.\arabic{figure}}    
\renewcommand{\thetable}{\thesection.\arabic{table}}  
% Start the equation, proposition and algorithm counters at non-zero values
% (presumably so that numbering continues from the main paper -- TODO confirm).
\setcounter{equation}{7}
\setcounter{proposition}{3}
\setcounter{algorithm}{1}

\section{Organization of the supplementary}

The supplementary is organized as follows. We recall the DSB algorithm for unconditional simulation from
\cite{debortoli2021neurips} in \Cref{sec:DSBalgorithm}. The proofs of our
propositions are given in \Cref{sec:proof-propositions}. In
\Cref{sec:lossfunctions}, we give details on the loss functions we use to train
CDSB. A continuous-time version of the conditional time-reversal and conditional
DSB is presented in \Cref{sec:cont-time-vers}. The forward-backward technique
used in our experiments is detailed in \Cref{sec:forw-backw-sampl}. Finally, we
provide experimental details and guidelines in \Cref{sec:experimental-details}.

\section{Diffusion \schro bridge}\label{sec:DSBalgorithm}
We recall here the DSB algorithm introduced by \cite{debortoli2021neurips} which is a numerical approximation of IPF\footnote{For discrete measures, IPF is also known as the Sinkhorn algorithm and can be implemented exactly \citep{peyre2019computational}.}. 
\begin{algorithm}[H]
    \caption{Diffusion \schro Bridge \citep{debortoli2021neurips}}
    \label{algo:ipf_score}
    \begin{algorithmic}[1] 
      \FOR{$n \in \{0, \dots,L\}$} \WHILE{not converged}
      \STATE Sample $\{X^j_{k}\}_{k,j=0}^{N,M}$, where  $X^j_0 \sim \pdata$, and \\
      $X^{j}_{k+1} = \mathbf{F}_{\phi^n}(k, X^{j}_{k})+\sqrt{2
        \gamma_{k+1}} Z^{j}_{k+1}$ 
        \STATE Compute $\hlb_n(\theta^n)$ approximating \eqref{eq:regressionbuncond}
        \STATE $\theta^{n} \leftarrow \textrm{Gradient Step}(\hlb_n(\theta^n))$ 
      \ENDWHILE \WHILE{not
        converged}
      \STATE Sample $\{X^j_{k}\}_{k,j=0}^{N,M}$, where $X^j_N \sim \pref$, and \\
      $X^j_{k-1}=\mathbf{B}_{\theta^n}(k, X^{j}_k)+\sqrt{2 \gamma_{k}}
      \tilde{Z}^{j}_{k}$ 
      \STATE Compute $\hlf_{n+1}(\phi^{n+1})$ approximating \eqref{eq:regressionfuncond}
      \STATE
      $\phi^{n+1} \leftarrow \textrm{Gradient Step}(\hlf_{n+1}(\phi^{n+1}))$
      \ENDWHILE \ENDFOR \STATE \textbf{Output: } $( \theta^{L},\phi^{L+1})$
    \end{algorithmic}
\end{algorithm}

In this (unconditional) SB scenario, the transition kernels satisfy
$q^n_{k|k+1}(x|x')=
\mathcal{N}(x;\mathbf{B}_{\theta^n}(k+1,x'),2\gamma_{k+1} \Id)$ and
$p_{k+1|k}^n(x'|x) = \mathcal{N}(x';\mathbf{F}_{\phi^n}(k,x),
2\gamma_{k+1} \Id)$ where $\theta^n$ is obtained by minimizing
\begin{equation} \textstyle{\ell^{b}_n(\theta)=\mathbb{E}_{p^{n}}[\sum_{k}\normLigne{\mathbf{B}_\theta(k+1,X_{k+1})-G_{n,k}(X_{k},X_{k+1})}^2]}\label{eq:regressionbuncond}
\end{equation}
for $G_{n,k}(x,x')=x'+\mathbf{F}_{\phi^n}(k,x)-\mathbf{F}_{\phi^n}(k,x')$ and $\phi^{n+1}$ by minimizing 
\begin{equation} \textstyle{\ell^{f}_{n+1}(\phi)=\mathbb{E}_{q^{n}}[\sum_{k}\normLigne{\mathbf{F}_\phi(k,X_k)-H_{n,k}(X_k,X_{k+1})}^2]}\label{eq:regressionfuncond}
\end{equation}
for $H_{n,k}(x,x')=x + \mathbf{B}_{\theta^n}(k+1,x')-\mathbf{B}_{\theta^n}(k+1,x)$. See \cite{debortoli2021neurips} for a derivation of these loss functions.


\section{Proofs of Propositions}
\label{sec:proof-propositions}
\subsection{Proof of Proposition \ref{prop:SBreformulation}}


Let $\bar{\pi}$ be such that $\textup{KL}(\bar{\pi}|\bar{p}) < +\infty$,
$\bar{\pi}_0= \pjoin$ and $\bar{\pi}_N =\pjref$ (such a $\bar{\pi}$ exists since
$\textup{KL}(\bar{\pi}^\star|\bar{p}) < +\infty$), where we define the joint forward process
$\bar{p}(x_{0:N},y_{0:N}):=p_{y_0}(x_{0:N})\bpobs(y_{0:N})$. Recall that 
$p_{y_0}(x_{0:N}):=p(x_0|y_0) \prod_{k=0}^{N-1}p_{k+1|k}(x_{k+1}|x_k)$ is 
the forward process starting from the posterior $p(x_0|y_0)$, and
$\bpobs(y_{0:N}):=\pobs(y_0) \prod_{k=0}^{N-1}\delta_{y_k}(y_{k+1})$ is the extended $y$-process. 
Since $\KLLigne{\bar{\pi}}{\bar{p}}<+\infty$ we have using the transfer theorem
\cite[Theorem 2.4.1]{kullback1997information} that
$\KLLigne{\bar{\pi}_{\textup{obs}}}{\bpobs}<+\infty$, where
$\bar{\pi}_{\textup{obs}}(y_{0:N}) := \int_{(\rset^d)^{N+1}} \bar{\pi}(x_{0:N}, y_{0:N}) \rmd
x_{0:N}$. In addition, using the chain rule for the Kullback--Leibler
divergence, see \cite[Theorem 2.4]{leonard2014some}, we get that
\begin{equation}
  \textstyle{
  \KLLigne{\bar{\pi}_{\textup{obs}}}{\bpobs} =  \KLLigne{\bar{\pi}_{\textup{obs},0}}{\pobs} + \int_{\mcy} \KLLigne{\bar{\pi}_{\textup{obs}|0}}{\bar{p}_{\textup{obs}|0}} \pobs(y) \rmd y < +\infty},
\end{equation}
where $\bar{p}_{\textup{obs}|0} = \prod_{k=0}^{N-1}\delta_{y_k}(y_{k+1})$ and
therefore
$\bar{\pi}_{\textup{obs}|0} =\bar{p}_{\textup{obs}|0}$. Since we also have that
$\bar{\pi}_{\textup{obs},0} = \pobs$ we get that
$\bar{\pi}_{\textup{obs}} = \bpobs$. Hence, letting $\pi^{c}$ be the
kernel such that $\bar{\pi} = \pi^c \otimes \bpobs$ we have using
\cite[Theorem 2.4]{leonard2014some} that
\begin{equation}
  \label{eq:KL_eq}
  \textstyle{\KLLigne{\bar{\pi}}{\bar{p}} = \int_{\mcy} \KL{\pi^c_y}{p_y} \pobs(y) \rmd y  . }
\end{equation}
In addition, we have $\bar{\pi}_0 = \pi_0^c \otimes \pobs =
\pjoin$. Similarly, we have
$\bar{\pi}_N = \pi_N^c \otimes \pobs = \pjref$. Hence,
$\pi_{y,0}^c = p(\cdot|y)$ and $\pi_{y,N}^c = \pref$, $\pobs$-almost surely.  Let
$\bar{\pi}^\star = \pi^{\star,c} \otimes \bpobs$ be the minimizer of
\eqref{eq:conditionalSBextended} and $\hat{\pi}^{c}$ be the minimizer of
\eqref{eq:SBuncondreformulated}. Then, we have that
$\bar{\pi} = \hat{\pi}^{c} \otimes \bpobs$ satisfies
$\KLLigne{\bar{\pi}^\star}{\bar{p}} \leq \KLLigne{\bar{\pi}}{\bar{p}}$. Using
\eqref{eq:KL_eq}, we have that
$\expeLigne{\KL{\pi^{\star,c}_Y}{p_Y}} \leq
\expeLigne{\KL{\hat{\pi}^c_Y}{p_Y}}$. But we have that
$\expeLigne{\KL{\hat{\pi}^{c}_Y}{p_Y}} \leq
\expeLigne{\KL{\pi^{\star,c}_Y}{p_Y}}$ since $\hat{\pi}^{c}$ is the minimizer of
\eqref{eq:SBuncondreformulated}. Using the uniqueness of the minimizer of
\eqref{eq:SBuncondreformulated} we have that $\pi^{\star,c} = \hat{\pi}^{c}$,
which concludes the proof.


\subsection{Proof of Proposition \ref{prop:IPFrecursion}}

Let $n \in \nset$ and $\bar{q}$ be such that
$\KLLigne{\bar{q}}{\bar{p}^n}<+\infty$ and $\bar{q}_N = \pjref$ (note that the
existence of such a distribution is ensured since
$\KLLigne{\pjoin \otimes \pjref}{\bar{p}^n_{0,N}} < +\infty$).  Using the chain
rule for the Kullback--Leibler divergence, see \cite[Theorem
2.4]{leonard2014some}, we have
\begin{equation}
  \label{eq:KL_eq_y_traj}
  \textstyle{\KLLigne{\bar{q}}{\bar{p}^n} = \KL{\bar{q}_{\textup{obs}}}{\bpobs} + \int_{\mcy^{N+1}} \KLLigne{\bar{q}_{|\textup{obs}}}{\bar{p}_{|\textup{obs}}^n} \rmd \bar{q}_{\textup{obs}}(y_{0:N})  , }
\end{equation}
where
$\bar{q}_{\textup{obs}} = \int_{\mcx^{N+1}} \bar{q}(x_{0:N}, y_{0:N}) \rmd
x_{0:N} $, and $\bar{q}_{|\textup{obs}}$ and $\bar{p}_{|\textup{obs}}^n$ are the
conditional distributions of $\bar{q}$ and $\bar{p}^n$, respectively, given
$y_{0:N}$. Since
$ \KL{\bar{q}_{\textup{obs}}}{\bar{p}_{\textup{obs}}}< +\infty$, we can use
\cite[Theorem 2.4]{leonard2014some} and we have
\begin{equation}
  \textstyle{\KL{\bar{q}_{\textup{obs}}}{\bar{p}_{\textup{obs}}} = \KL{\bar{q}_{\textup{obs},N}}{\bar{p}_{\textup{obs},N}} + \int_{\mcy} \KL{\bar{q}_{\textup{obs}|N}}{\bar{p}_{\textup{obs}|N}} \rmd \bar{q}_{\textup{obs},N}(y_N),}
\end{equation}
with
$\bar{p}_{\textup{obs}|N}(y_{0:N-1}|y_N)
=\prod_{k=0}^{N-1}\delta_{y_{k+1}}(y_{k})$. Therefore, since
$\KL{\bar{q}_{\textup{obs}}}{\bar{p}_{\textup{obs}}}<+\infty$, we get that
$\bar{q}_{\textup{obs}|N}(y_{0:N-1}|y_N) =
\prod_{k=0}^{N-1}\delta_{y_{k+1}}(y_{k})$. Since
$\bar{q}_{\textup{obs}, N} = \pobs$, we get that
$\bar{q}(x_{0:N}, y_{0:N}) = \bpobs(y_{0:N})\bar{q}(x_{0:N}|y_{0:N}) =
\bpobs(y_{0:N})\bar{q}(x_{0:N}|y_{N})$, where we have used that $y_N = y_k$
for $k \in \{0, \dots, N\}$, $\bpobs(y_{0:N})$ almost surely. Combining this
result and \eqref{eq:KL_eq_y_traj} we get that
\begin{align}
  \KLLigne{\bar{q}}{\bar{p}^n} &=  \textstyle{\int_{\mcy^{N+1}} \KLLigne{\bar{q}_{|\textup{obs}}}{\bar{p}_{|\textup{obs}}^n} \rmd \bpobs(y_{0:N})  }
  =  \textstyle{\int_{\mcy} \KLLigne{\bar{q}(\cdot|y_N)}{\bar{p}^n(\cdot|y_N)} \rmd \pobs(y_{N})  . }
  \end{align}
Using \cite[Theorem 2.4]{leonard2014some}, we have that for any $y_N \in \mcy$
\begin{equation}
  \textstyle{
  \KLLigne{\bar{q}(\cdot|y_N)}{\bar{p}^n(\cdot|y_N)} = \KLLigne{\pref}{\bar{p}_N^n(\cdot|y_N)} + \int_{\mcx} \KLLigne{\bar{q}(\cdot|y_N, x_N)}{\bar{p}^n(\cdot|y_N, x_N)} \pref(x_N) \rmd x_N .}
\end{equation}
For the IPF solution $\bar{q}^n$, we get that $\bar{q}^n(\cdot|y_N, x_N) = \bar{p}^n(\cdot|y_N, x_N)$. Therefore for any $x_{0:N} \in \mcx^{N+1}$ and $y_N \in \mcy$,
\begin{equation}
 \textstyle{\bar{q}^n(x_{0:N}|y_{N})= \pref(x_N)\prod_{k=0}^{N-1}
  \bar{p}^n_{k|k+1}(x_k|x_{k+1},y_{N})} .
\end{equation}
The proof is similar for $\bar{p}^{n+1}$: for any $x_{0:N} \in \mcx^{N+1}$ and $y_0 \in \mcy$, we have
\begin{equation}
  \textstyle{\bar{p}^{n+1}(x_{0:N}|y_0)= p(x_0|y_0) \prod_{k=0}^{N-1} \bar{q}^{n}_{k+1|k}(x_{k+1}|x_{k},y_{0}).}
\end{equation}

\subsection{Proof of Proposition \ref{prop:fasterconverence}}

Using \cite[Corollary 1]{leger2020gradient}, we get that for any $n \in \nset$
with $n \geq 1$
\begin{equation}
  \label{eq:leger_res}
  \KLLigne{\bar{\pi}^n_0}{\pjoin} + \KLLigne{\bar{\pi}^n_N}{\pjref} \leq \frac{2}{n}\KLLigne{\bar{\pi}^\star}{\bar{p}}.
\end{equation}
Similarly to \Cref{prop:IPFrecursion}, we have that for any $n \in \nset$, there
exists a Markov kernel $\pi^{c,n}$ such that
$\bar{\pi}^n = \bpobs \otimes \pi^{c,n}$. Recall that there exists a Markov
kernel $\pi^{c,\star}$ such that $\bar{\pi}^\star = \bpobs \otimes \pi^{c,\star}$
and that $\bar{p} = \bpobs \otimes p_y$. Hence, using \cite[Theorem
2.4]{leonard2014some}, we get that for any $n \in \nset$,
\begin{equation}
  \label{eq:leonard_1}
  \KLLigne{\bar{\pi}^n_0}{\pjoin}  = \expeLigne{\KLLigne{\pi^{c,n}_{Y,0}}{p(\cdot|Y)}}, \qquad \KLLigne{\bar{\pi}^n_N}{\pjref}  = \expeLigne{\KLLigne{\pi^{c,n}_{Y,N}}{\pref}}.
\end{equation}
Similarly, we have that
\begin{equation}
  \label{eq:leonard_2}
  \KLLigne{\bar{\pi}^\star}{\bar{p}} = \expeLigne{\KLLigne{\pi^{c,\star}_Y}{p_Y}} .
\end{equation}
We conclude the proof upon combining \eqref{eq:leger_res}, \eqref{eq:leonard_1}
and \eqref{eq:leonard_2}.


\section{Details on the loss functions}\label{sec:lossfunctions}

In this section, we simplify notation and write $Y$ for all the random variables
$Y_0,Y_1,\dots,Y_N$ as they are all equal almost surely under $\bar{p}^{n}$ and
$\bar{q}^{n}$, similarly to \Cref{sec:cond-simul-sb}. 
In \Cref{sec:cond-simul-sb}, the transitions satisfy
$\bar{q}^n_{k|k+1}(x|x',y)=
\mathcal{N}(x;\mathbf{B}^{y}_{\theta^n}(k+1,x'),2\gamma_{k+1} \Id)$ and
$\bar{p}_{k+1|k}^n(x'|x,y) = \mathcal{N}(x';\mathbf{F}^{y}_{\phi^n}(k,x),
2\gamma_{k+1} \Id)$ where $\theta^n$ is obtained by minimizing
\begin{equation} \textstyle{\ell^{b}_n(\theta)=\mathbb{E}_{\bar{p}^{n}}[\sum_{k}\normLigne{\mathbf{B}_\theta^Y(k+1,X_{k+1})-G_{n,k}^Y(X_{k},X_{k+1})}^2]}\label{eq:regressionbcond_app}
\end{equation}
for $G_{n,k}^y(x,x')=x'+\mathbf{F}^{y}_{\phi^n}(k,x)-\mathbf{F}^{y}_{\phi^n}(k,x')$ 
and $\phi^{n+1}$ by minimizing 
\begin{equation} \textstyle{\ell^{f}_{n+1}(\phi)=\mathbb{E}_{\bar{q}^{n}}[\sum_{k}\normLigne{\mathbf{F}_\phi^Y(k,X_k)-H_{n,k}^Y(X_k,X_{k+1})}^2]}\label{eq:regressionfcond_app}
\end{equation}
for $H_{n,k}^y(x,x')=x +
\mathbf{B}^{y}_{\theta^n}(k+1,x')-\mathbf{B}^{y}_{\theta^n}(k+1,x)$. We
justify these formulas by proving the following result which is a
straightforward extension of \cite{debortoli2021neurips}. We recall that for any
$n \in \nset$, $k \in \{0, \dots, N-1\}$, $x_k,x_{k+1} \in \rset^d$ and
$y \in \mcy$,
$b^{n,y}_{k+1}(x_{k+1}) = -f^{n,y}_{k}(x_{k+1})+2 \nabla \log
\bar{p}^{n}_{k+1}(x_{k+1}|y)$ and
$f^{n+1,y}_{k}(x_{k}) = -b^{n,y}_{k+1}(x_{k})+2 \nabla \log
\bar{q}^{n}_{k}(x_k|y)$.\footnote{We should have conditioned w.r.t. $y_N$ and
  $y_0$ but since $y_0 = y_1 = \dots = y_N$ under $\pobs$ we simply conditioned
  by $y$ which can be any of these values.}


\begin{proposition}\label{prop:generalizedscorematching} 
  Assume that for any $n \in \nset$ and $k \in \{0, \dots, N-1\}$,
  $\bar{q}^n_k(\cdot|y)$ and $\bar{p}^n_k(\cdot|y)$ are bounded and
  \begin{equation}
    \bar{q}_{k|k+1}^n(x_k|x_{k+1},y) = \mathcal{N}(x_k;B_{k+1}^{n,y}(x_{k+1}), 2\gamma_{k+1}
  \Id)  ,\ \bar{p}_{k+1|k}^n(x_{k+1}|x_{k},y) = \mathcal{N}(x_{k+1};F_{k}^{n,y}(x_{k}), 2\gamma_{k+1}
  \Id) ,
  \end{equation}
 with $B^{n,y}_{k+1}(x) = x +\gamma_{k+1}b^{n,y}_{k+1}(x)$,
  $F^{n,y}_{k}(x)= x +\gamma_{k+1}f_k^{n,y}(x)$ for any $x \in \rset^d$. Then we
  have for any $n \in \nset$ and $k\in \{0, \dots, N-1\}$
\begin{align}
&\textstyle{B^{n}_{k+1}=\argmin_{\mathrm{B}\in \rmL^2(\rset^d \times \mcy, \rset^d)} \expeMarkovLigne{ \bar{p}^{n}}{\normLigne{\mathrm{B}(X_{k+1},Y)-G_{n,k}^Y(X_k,X_{k+1})}^2}},\label{eq:regressionb}\\
  &\textstyle{F^{n+1}_{k}=\argmin_{\mathrm{F}\in \rmL^2(\rset^d \times \mcy, \rset^d)} \expeMarkovLigne{\bar{q}^{n}}{\normLigne{\mathrm{F}(X_k,Y)-H_{n,k}^Y(X_k,X_{k+1})}^2 }},\label{eq:regressionf}\\
  &G_{n,k}^y(x,x') = x' + F^{n,y}_k(x)-F^{n,y}_{k}(x') , \qquad H_{n,k}^y(x,x') = x + B^{n,y}_{k+1}(x')-B^{n,y}_{k+1}(x) .
 \end{align} 
\end{proposition}

\begin{proof}
  We only prove \eqref{eq:regressionb} since the proof of \eqref{eq:regressionf} is
similar. Let $n \in \nset$ and $k \in \{0, \dots, N-1\}$. For any
$x_{k+1} \in \rset^d$ we have
\begin{equation}
  \textstyle{
    \bar{p}^n_{k+1}(x_{k+1}|y) = (4 \uppi \gamma_{k+1})^{-d/2} \int_{\rset^d} \bar{p}^n_k(x_k|y) \exp[-\normLigne{F_k^{n,y}(x_k) - x_{k+1}}^2/(4\gamma_{k+1})] \rmd x_k  ,
    }
\end{equation}
with $F_k^{n,y}(x_k) = x_k + \gamma_{k+1} f_k^{n,y}(x_k)$. Since $\bar{p}^n_k>0$ is bounded,
using the dominated convergence theorem, we have for any $x_{k+1} \in \rset^d$
\begin{equation}
  \textstyle{\nabla_{x_{k+1}} \log \bar{p}^n_{k+1} (x_{k+1}|y) = \int_{\rset^d} (F_k^{n,y}(x_k) - x_{k+1})/(2 \gamma_{k+1})~\bar{p}^n_{k|k+1}(x_k | x_{k+1},y) \rmd x_{k}  . }
\end{equation}
Therefore we get that for any $x_{k+1} \in \rset^d$
\begin{equation}
  \textstyle{b_{k+1}^{n,y}(x_{k+1}) = \int_{\rset^d} (F_k^{n,y}(x_k) - F_{k}^{n,y}(x_{k+1}))/\gamma_{k+1}~  \bar{p}^n_{k|k+1}(x_k | x_{k+1},y) \rmd x_{k}  . }
\end{equation}
This is equivalent to
\begin{equation}
  \textstyle{B_{k+1}^{n,y}(x_{k+1})  = \CPELigne{X_{k+1} + F_k^{n,Y}(X_k) - F_{k}^{n,Y}(X_{k+1})}{X_{k+1} = x_{k+1}, Y=y}}  .
\end{equation}
Hence, we get that
\begin{equation}
\textstyle{B^n_{k+1}=\argmin_{\mathrm{B}\in \rmL^2(\rset^d \times \mcy, \rset^d)} \expeMarkovLigne{ \bar{p}^{n}}{\normLigne{\mathrm{B}(X_{k+1},Y)-(X_{k+1} + F^{n,Y}_k(X_{k})-F^{n,Y}_{k}(X_{k+1}))}^2}}  ,
\end{equation}
which concludes the proof.
\end{proof}


\section{Continuous-time versions of CSGM and CDSB}
\label{sec:cont-time-vers}

In the following section, we consider the continuous-time version of CSGM and CDSB. The continuous-time dynamics we recover can be seen as the
extensions of the continuous-time dynamics obtained in the unconditional
setting, see \cite{song2020score,debortoli2021neurips}.

\subsection{Notation}

We start by introducing a few notations.  The space of continuous functions from
$\ccint{0,T}$ to $\rset^d \times \mcy$ is denoted
$\contspace = \rmc(\ccint{0,T}, \rset^d \times \mcy)$ and we denote
$\Pens(\contspace)$ the set of probability measures defined on $\contspace$.  A
probability measure $\Pbb \in \Pens(\contspace)$ is \emph{associated with a
  diffusion} if it is a solution to a martingale problem, i.e.
$\Pbb \in \Pens(\contspace)$ is associated with
$\rmd \bfX_t = b(t, \bfX_t) \rmd t + \sqrt{2} \rmd \bfB_t$  if for any
$\varphi \in \rmc_c^2(\rset^d, \rset)$, $(\bfZ_t^\varphi)_{t \in \ccint{0,T}}$ is a
$\Pbb$-local martingale, where for any $t \in \ccint{0,T}$
\begin{equation}
\label{eq:martingale_pbm}
\textstyle{\bfZ_t^\varphi = \varphi(\bfX_t) - \int_0^t \generator_s(\varphi)(\bfX_s) \rmd s }, \qquad  \generator_t(\varphi)(x) = \langle b(t, x) , \nabla \varphi(x) \rangle +  \Delta \varphi(x).
\end{equation}
Here $\rmc_c^2(\rset^d, \rset)$ denotes the space of twice differentiable functions from $\rset^d$ to $\rset$ with compact support.
Doing so, $\Pbb$ is uniquely defined up to the initial distribution $\Pbb_0$. Finally, for any
$\Pbb \in \Pens(\contspace)$, we introduce $\Pbb^R$ the time reversal of $\Pbb$,
\ie \ for any $\msa \in \mcb{\contspace}$ we have $\Pbb^R(\msa) = \Pbb(\msa^R)$
where $\msa^R = \ensembleLigne{t \mapsto\omega(T-t)}{\omega \in \msa}$.

\subsection{Continuous-time CSGM}

Recall that in the unconditional setting, we consider a forward noising dynamics
$(\bfX_t)_{t \in \ccint{0,T}}$ initialized with $\bfX_0 \sim \pdata$ and
satisfying the following Stochastic Differential Equation (SDE)
$\rmd \bfX_t = -\bfX_t \rmd t + \sqrt{2} \rmd \bfB_t$, i.e. an
Ornstein--Uhlenbeck process. In this case, under an entropy condition on
$(\bfX_t)_{t \in \ccint{0,T}}$ (see \cite{cattiaux2021time} for instance), we
have that the time-reversal process
$(\tbfX_t)_{t \in \ccint{0,T}} = (\bfX_{T-t})_{t \in \ccint{0,T}}$ also satisfies
an SDE given by
$\rmd \tbfX_t = \{\tbfX_t + 2 \nabla \log p_{T-t}(\tbfX_t)\} \rmd t + \sqrt{2} \rmd
\bfB_t$, where $p_t$ is the density of $\bfX_t$ w.r.t. the Lebesgue
measure, and $(\tbfX_t)_{t \in \ccint{0,T}}$ is initialized with
$\tbfX_0 \sim \mathcal{L}(\bfX_T)$, the law of $\bfX_T$ with density $p_T$. Using the geometric ergodicity of the
Ornstein--Uhlenbeck process, $\mathcal{L}(\bfX_T)$ is close (w.r.t.\
the Kullback--Leibler divergence for instance) to $\pref= \mathcal{N}(0,\Id)$. Hence,
we obtain that considering $(\bfZ_t)_{t \in \ccint{0,T}}$ such that
$\bfZ_0 \sim \mathcal{N}(0,\Id)$ and
$\rmd \bfZ_t = \{\bfZ_t + 2 \nabla \log p_{T-t}(\bfZ_t)\} \rmd t + \sqrt{2} \rmd
\bfB_t$, $\bfZ_T$ is approximately distributed according to $\pdata$. The
Euler--Maruyama discretization of $(\bfZ_t)_{t \in \ccint{0,T}}$ is the SGM used
in existing work.

In the conditional setting, we consider the following dynamics
$\rmd \bfX_t = -\bfX_t \rmd t + \sqrt{2} \rmd \bfB_t$ and $\rmd \bfY_t = 0$,
where $(\bfX_0, \bfY_0) \sim \pjoin$. Note that we have $\bfY_t = \bfY_0$ for
all $t \in \ccint{0,T}$. Using the ergodicity of the Ornstein--Uhlenbeck process,
we get that $\mathcal{L}(\bfX_T, \bfY_T)$ is close (w.r.t.\ the
Kullback--Leibler divergence for instance) to $\pjref$. Let
$(\tbfX_t, \tbfY_t)_{t \in \ccint{0,T}} = (\bfX_{T-t}, \bfY_{T-t})_{t \in
  \ccint{0,T}}$. We have that
$\rmd \tbfX_t = \{\tbfX_t + 2 \nabla \log p_{T-t}(\tbfX_t|\tbfY_t)\} \rmd t +
\sqrt{2} \rmd \bfB_t$ and $\rmd \tbfY_t = 0$ with $(\tbfX_0,\tbfY_0) \sim \mathcal{L}(\bfX_T,\bfY_T)$. Hence, we obtain that considering
$(\bfZ_t)_{t \in \ccint{0,T}}$ such that $(\bfZ_0, \bfY_0) \sim \pjref$ and
$\rmd \bfZ_t = \{\bfZ_t + 2 \nabla \log p_{T-t}(\bfZ_t|\bfY_0)\} \rmd t + \sqrt{2}
\rmd \bfB_t$, $\bfZ_T$ is approximately distributed according to the posterior $p(\cdot|\bfY_0)$. The
Euler--Maruyama discretization of $(\bfZ_t,\bfY_t)_{t \in \ccint{0,T}}$ is the
conditional SGM.

\subsection{Connection with normalizing flows and estimation of the evidence}\label{sec:NFevidence}

It has been shown that SGMs can be used for log-likelihood computation. Here, we
further show that they can be used to estimate the evidence
$\log p(\yobs)$ when $g(\yobs|x)$ can be computed pointwise. This is the case for many models considered in the diffusion literature, see for
instance \cite{kadkhodaie2021stochastic,kawar2021snips,kawar2022denoising}. Indeed, we have that for any $x \in \rset^d$,
$\log p(\yobs) = \log g(\yobs|x) + \log p(x) - \log p(x|\yobs)$. The term $\log p(x)$ can be estimated using an unconditional SGM whereas the term
$\log p(x|\yobs)$ can be estimated using a CSGM. Note that both conditional and unconditional SGM can be trained simultaneously adding a
``sink'' state to $\mcy$, i.e. considering $\mcy \cup \{\emptyset\}$, see
\cite{ho2021classifier} for instance.

We briefly explain how one can compute $\log p(x|\yobs)$ and refer to
\cite{song2020score} for a similar discussion in the unconditional
setting. Recall that the forward noising process is given by
$\rmd \bfX_t = -\bfX_t \rmd t + \sqrt{2} \rmd \bfB_t$ and $\rmd \bfY_t = 0$,
where $(\bfX_0, \bfY_0) \sim \pjoin$. We introduce another process
$(\hbfX_t, \hbfY_t)_{t \in \ccint{0,T}}$ with deterministic dynamics which has the same marginal distributions, i.e. $\mathcal{L}(\bfX_t,\bfY_t)=\mathcal{L}(\hbfX_t,\hbfY_t)$ for any $t \in \ccint{0,T}$. This process is defined by $\rmd \hbfX_t = \{-\hbfX_t - \nabla \log p_t(\hbfX_t|\hbfY_t)\} \rmd t$ and $\rmd \hbfY_t = 0$ with
$(\hbfX_0, \hbfY_0) \sim \pjoin$. As one has
$\rmd \log p_t(\hbfX_t|\hbfY_t) = -\mathrm{div}(-\hbfX_t - \nabla \log
p_t(\hbfX_t|\hbfY_t)) \rmd t$, we can approximately compute
$\log p(\hbfX_0|\hbfY_0)$ by integrating numerically this Ordinary Differential Equation (ODE). There are practically three sources of error: the score approximation, the numerical integration error, and the fact that $\mathcal{L}(\hbfX_T)$ is unknown, so that we use the approximation $\mathcal{L}(\hbfX_T) \approx \pref$.

\subsection{Continuous-time CDSB}

In this section, we introduce an IPF algorithm for solving CSB
problems in continuous-time. The following results are a generalization to the conditional framework of the continuous-time results of
\cite{debortoli2021neurips}. The CDSB algorithm described in \Cref{algo:ipf_score_cond} can be seen as an Euler--Maruyama discretization of
this IPF scheme combined with neural network approximations of the drifts. Let $\Pbb \in \Pens(\contspace)$ be a given reference measure
(thought of as the continuous-time analog of $\bar{p}$).  The dynamical continuous
formulation of the SB problem can be written as follows
\begin{equation}
  \label{eq:dynamic_schro}
  \textstyle{
    \Pi^\star = \argmin \ensemble{\KLLigne{\Pi}{\Pbb}}{\Pi \in \Pens(\mathcal{C}), \ \Pi_0 = \pjoin, \ \Pi_T = \pjref}.
    }
\end{equation}
We define the IPF sequence $(\Pi^n)_{n \in \nset}$ such that $\Pi^0 = \Pbb$ is
associated with $\rmd \bfX_t = - \bfX_t \rmd t + \sqrt{2} \rmd \bfB_t$ and
$\rmd \bfY_t = 0$, with $(\bfX_0, \bfY_0) \sim \pjoin$. Next for any
$n \in \nset$ we define
\begin{align}
  \textstyle{\Pi^{2n+1}} &= \textstyle{\argmin \ensemble{\KLLigne{\Pi}{\Pi^{2n}}}{\Pi \in \Pens(\mathcal{C}), \ \Pi_T = \pjref}, } \\
  \textstyle{\Pi^{2n+2}} &= \textstyle{\argmin \ensemble{\KLLigne{\Pi}{\Pi^{2n+1}}}{\Pi \in \Pens(\mathcal{C}), \ \Pi_0 = \pjoin}.}
\end{align}

The following result is the continuous
counterpart of \Cref{prop:IPFrecursion}.
\begin{proposition}
  \label{prop:continuous_schro}
  Assume that $p_N, \pref >0$, $\mathrm{H}(\pref)<+\infty$ and
  $\int_{\rset^d \times \rset^d} \absLigne{\log p_{N|0}(x_N|x_0)} \pdata(x_0) \pref(x_N) \rmd x_0 \rmd x_N <
  +\infty$. In addition, assume that there exist $\Mbb \in \Pens(\contspace)$,
  $U \in \rmc^1(\rset^d, \rset)$, $C \geq 0$ such that for any $n \in \nset$,
  $x \in \rset^d$, $\KLLigne{\Pi^n}{\Mbb} < +\infty$,
  $\langle x, \nabla U(x) \rangle \geq - C(1+\normLigne{x}^2)$ and $\Mbb$ is
  associated with $(\bfX_t, \bfY_t)_{t \in \ccint{0,T}}$ such that 
  \begin{equation}
    \label{eq:diff_q}
    \textstyle{
      \rmd \bfX_t = -\nabla U(\bfX_t) \rmd t + \sqrt{2} \rmd \bfB_t, \qquad \rmd \bfY_t = 0
      }
    \end{equation}
    with $\bfX_0$ distributed according to the invariant distribution of \eqref{eq:diff_q}.  Then,
    for any $n \in \nset$ we have:
  \begin{enumerate}[wide, labelwidth=!, itemindent=!, labelindent=0pt, label=(\alph*)]
  \item $(\Pi^{2n+1})^R$ is associated with
    $(\bfX_t^{2n+1},\bfY_t^{2n+1})_{t \in \ccint{0,T}}$ such that
    $\rmd \bfX_t^{2n+1} = b^n_{T-t}(\bfX_t^{2n+1},\bfY_t^{2n+1}) \rmd t + \sqrt{2} \rmd
    \bfB_t$ and $\rmd \bfY_t^{2n+1} = 0$ with
    $(\bfX_0^{2n+1},\bfY_0^{2n+1}) \sim \pjref$;
  \item $\Pi^{2n+2}$ is associated with
    $\rmd \bfX_t^{2n+2} = f^{n+1}_t( \bfX_t^{2n+2},\bfY_t^{2n+2}) \rmd t + \sqrt{2} \rmd
    \bfB_t$ with $(\bfX_0^{2n+2},\bfY_0^{2n+2}) \sim \pjoin$;
  \end{enumerate}
  \vspace{-.3cm} where for any $n \in \nset$, $t \in \ccint{0,T}$,
  $x \in \rset^d$ and $y \in \mcy$,
  $b^{n}_t(x,y) = -f^{n}_t(x,y) +2 \nabla \log p^{n}_t(x|y)$,
  $f^{n+1}_t(x,y) = -b^n_t(x,y) +2 \nabla \log q^n_t(x|y)$, with
  $f^0_t(x,y) = -x$, and $p^n_t(\cdot|y)$, $q_t^n(\cdot|y)$ the densities of
  $\Pi^{2n}_{t|y}$ and $\Pi_{t|y}^{2n+1}$.
\end{proposition}

\begin{proof}
  The proof of this proposition is a straightforward extension of \cite[Proposition 6]{debortoli2021neurips}.
\end{proof}

We have seen in \Cref{sec:NFevidence} that it is possible to use CSGM to evaluate numerically the evidence when $g(\yobs|x)$ can be computed pointwise. The same strategy can be applied to both DSB and CDSB; see \cite[Section H.3]{debortoli2021neurips} for details for DSB. In both cases, there exists an ordinary differential equation admitting the same marginals as the diffusion solving the SB, resp. the CSB, problem. By integrating these ODEs, we can obtain $\log p(x)$ and $\log p(x|\yobs)$ for any $x$ and thus can compute the evidence. Contrary to SGM and CSGM, the distribution of the terminal state of the diffusion is exactly equal to the reference measure by design. So practically, we only have two sources of error, instead of three for SGM/CSGM: the drift approximation and the numerical integration error.

\section{Forward-Backward Sampling}
\label{sec:forw-backw-sampl}

We detail in this section the forward-backward sampling approach and its connection with \cite{spantini2019coupling} when using an unconditional $\pref$. In \cite{spantini2019coupling}, it is proposed to first learn a
deterministic transport map
$\mathcal{U}(x,y):\mathcal{X}\times\mathcal{Y}\to\mathcal{X}\times\mathcal{Y}$
from $(X,Y)\sim\pjoin$ to $\pjref$,
then transport back the $X$-component through
$\mathcal{S}(\cdot,\yobs)^{-1}$ where $\mathcal{S}:\mathcal{X}\times\mathcal{Y}\to\mathcal{X}$ is the $X$-component of $\mathcal{U}$.  In other words, sampling
$\hat{X}^{\textup{pos}} \sim p(x|\yobs)$ corresponds to the two-step transformation
\begin{equation}\label{eq:composedmap-supp}
\hat{X}^{\textup{ref}},\hat{Y}^{\textup{ref}}=\mathcal{U}(X,Y),~~ \hat{X}^{\textup{pos}}=\mathcal{S}(\cdot,\yobs)^{-1}(\hat{X}^{\textup{ref}}).
\end{equation}

The proposed CSB \eqref{eq:conditionalSBextended} can be thought of as the SB version of this idea. We learn a stochastic transport map from
$\pjoin(x,y)$ to $\pjref(x,y)$. The CSB $\pi^{\star}$ defines, when conditioned on
$x_{0}$ and $\yobs$, a (stochastic) transport map $\pi^{c,\star}_{\yobs}(x_{N}|x_{0})$
from $p(x_{0}|\yobs)$ to $\pref(x_{N})$; and, when
conditioned on $x_{N}$ and $\yobs$, a (stochastic) transport map
$\pi^{c,\star}_{\yobs}(x_{0}|x_{N})$ from $\pref(x_{N})$ to $p(x_{0}|\yobs)$. In
practice, we learn using CDSB separate half-bridges
$\bar{p}^{L}(x_{1:N}|x_{0},\yobs)$ and $\bar{q}^{L}(x_{0:N-1}|x_{N},\yobs)$.

\cite{spantini2019coupling} remarked that, since the estimator ${\mathcal{S}}$
may be imperfect, $\hat{X}^{\textup{ref}}$ may not have distribution $\pref$
exactly. In this case, \eqref{eq:composedmap-supp} allows for the cancellation of errors between $\mathcal{S}$ and
$\mathcal{S}(\cdot,\yobs)^{-1}$. 

We can exploit a similar idea in the CSB framework by defining an analogous forward-backward sampling procedure 
\begin{equation}
\hat{X}_{N}\sim \bar{p}^{L}_{N|0}(x_{N}|X,Y),~~\hat{X}_{0}\sim \bar{q}^{L}_{0|N}(x_{0}|\hat{X}_{N},\yobs).\label{eq:fwdbwdsampling-supp}
\end{equation}
As $\bar{q}^{L}$ is the approximate time reversal of $\bar{p}^{L}$,
\eqref{eq:fwdbwdsampling-supp} exhibits similar advantages as \eqref{eq:composedmap-supp}
when the half-bridge $\bar{p}^{L}(x_{0:N}|\yobs)$ is only an approximation to the CSB solution. Since the forward and backward processes are stochastic
and are not exact inverses of each other, using this forward-backward sampling
may lead to increased variance. However, we found in practice that
this forward-backward sampling procedure can still improve sampling quality (see \eg ~ Figures \ref{fig:2dconditional}, \ref{fig:mnistinpainting}).

\section{Experimental Details}
\label{sec:experimental-details}

\subsection{Experimental Setup}

\textbf{Network parameterization}. Two parameterizations are possible for learning $\mathbf{F}$ and $\mathbf{B}$. In the main text, we described one parameterization in which we parameterize $\mathbf{F},\mathbf{B}$ directly as $\mathbf{F}_\phi^y(k,x),\mathbf{B}_\theta^y(k,x)$ and learn the network parameters $\phi,\theta$. 
\begin{wrapfigure}{r}{5cm}
    \centering
    \vspace{-0.3cm}
    \includegraphics[width=.98\linewidth]{Plots/MNIST_superres_supp/N=5 PSNR.png} \\
    \includegraphics[width=.98\linewidth]{Plots/MNIST_superres_supp/N=5 SSIM.png}
    \vspace{-0.2cm}
    \caption{Test set PSNR and SSIM against the number of training steps for MNIST 4x SR.}
    \label{fig:psnrimprove}
    \vspace{-1.2cm}
\end{wrapfigure}
Alternatively, we can parameterize $\mathbf{F}^y(k,x)=x+\gamma_{k+1}\mathbf{f}_\phi^y(k,x),\mathbf{B}^y(k+1,x)=x+\gamma_{k+1}\mathbf{b}_\theta^y(k+1,x)$ and learn the network parameters $\phi,\theta$ for $\mathbf{f}_\phi^y,\mathbf{b}_\theta^y$ instead. 
For the 2D and BOD examples, we use a fully connected network with positional encodings as in \cite{debortoli2021neurips} to learn $\mathbf{f}_\phi^y,\mathbf{b}_\theta^y$, with $y$ as an additional input by concatenation with $x$. 
For the MNIST and CelebA examples, we follow earlier work and utilize the conditional U-Net architecture in \cite{nichol2021beatgans}. Since residual connections are already present in the U-Net architecture, we can adopt the $\mathbf{F}_\phi^y,\mathbf{B}_\theta^y$ parameterization. In our experiments, we tried both parameterizations and found that the $\mathbf{f}_\phi^y,\mathbf{b}_\theta^y$ parameterization is more suitable for neural network architectures without residual connections. On the other hand, both parameterizations obtained good results when using the U-Net architecture. For consistency, all reported image experiment results use the $\mathbf{F}_\phi^y,\mathbf{B}_\theta^y$ parameterization, and we leave the choice of optimal parameterization as future research. 

\textbf{Network warm-starting}. As observed by \cite{debortoli2021neurips}, since the networks at IPF iteration $n$ are close to the networks at iteration $n-1$, it is possible to warm-start $\phi^n,\theta^n$ at $\phi^{n-1},\theta^{n-1}$ respectively. Empirically, we observe that this approach can significantly reduce training time at each CDSB iteration. Compared to CSGM, we usually observe immediate improvement in $
\mathbf{B}_{\theta^2}$ during CDSB iteration 2 when the network is warm-started at $\theta^1$ after CDSB iteration 1 (see \eg ~ \Cref{fig:psnrimprove}). As CSGM corresponds to the training objective of $\theta^1$ at CDSB iteration 1, this shows that the CDSB framework is a generalization of CSGM with observable benefits starting from CDSB iteration 2. 

\textbf{Conditional initialization}. 
In the main text, we considered joint reference measures of the form
$\pjref(x,y)=\pref(x|y)\pobs(y)$ 
and simple choices for $\pref(x|y)$ such as 
$\vois(x;y,\sigma_{\textup{ref}}^{2} \Id)$ for image super-resolution. We also explore two more choices for $\pref(x|y)$ in our experiments. The first choice simply replaces the initialization mean $y$ with a neural network function $\mu_\textup{ref}(y)$. 
This neural network can be pre-trained directly to estimate the conditional mean of $p(x|y)$ using standard regression with MSE loss. In the case of multi-modal $p(x|y)$ such as in the case of image inpainting, we can also train $\mu_\textup{ref}(y)$ to estimate the conditional mean of $p_N(x_N|y)$, where $x_N$ follows a standard diffusion process. In essence, we can train $\mu_\textup{ref}(y)$ to facilitate $p_N(x_N|y)\approx\pref(x_N|y)$ and shorten the noising process. Note that the CDSB framework is still useful in this context since $p_N(x_N|y)$ may not be well-approximated by a Gaussian distribution, which is precisely the issue CDSB is designed to tackle. 
Another class of conditional initialization we consider is the Ensemble Kalman Filter (EnKF), which is an ensemble-based method approximating linear Gaussian posterior updates. In this case, $\pref(x|y)$ is taken to be $\vois(x;\mu_{\textup{ref}}(y),\textup{diag}(\sigma_{\textup{ref}}^{2}(y)))$ where $\mu_{\textup{ref}}(y),\sigma_{\textup{ref}}^{2}(y)$ are the sample mean and variance of the EnKF posterior ensemble. Intuitively, $\pref(x|y)$ is now an approximation of the true posterior $p(x|y)$ using linear prior-to-posterior mappings, which is further corrected for non-linearity and non-Gaussianity by the CDSB. 


\textbf{Time step schedule}. For the selection of the time step sequence $\{\gamma_k\}_{k=1}^{N}$, we follow \cite{ho2020denoising,nichol2021beatgans} and consider a linear schedule where $\gamma_1=\gamma_\textup{min}$, $\gamma_N=\gamma_\textup{max}$, and $\gamma_k=\gamma_\textup{min}+\frac{k-1}{N-1}(\gamma_\textup{max}-\gamma_\textup{min})$. In this way, the diffusion step size gets finer as the reverse process approaches $\pi_0=\pdata$, so as to increase the accuracy of the generated samples.



\subsection{2D Synthetic Examples}
For the 2D examples, we use $N=50$ diffusion steps and choose the time step schedule such that $\gamma_\textup{min}={10}^{-4},\gamma_\textup{max}=0.005$. At each IPF iteration, we train the network for 30,000 iterations using the Adam optimizer with learning rate ${10}^{-4}$ and a batch size of 100. 


\subsection{Biochemical Oxygen Demand Model}
For the BOD example, we again use $N=50$ diffusion steps with time schedule $\gamma_\textup{min}=\gamma_\textup{max}=0.01$. For CDSB-C, we use the shortened time schedule $\gamma_\textup{min}=\gamma_\textup{max}=0.005$ and a neural network regressor of the same architecture (with $x$ and $k$ components removed) as the conditional initialization. The batch size and optimizer settings are the same as above. 

We report the estimated posterior moments as well as their standard deviation in \Cref{tab:bodresult-supp}. We further plot the convergence of RMSE for each of the statistics in \Cref{fig:bodconvergence}.
As can be observed, IPF converges after about 20 iterations, and errors for all statistics are improved compared with CSGM (corresponding to IPF iteration 1).
Using conditional initialization also helps with localizing the problem and reduces estimation errors especially in early iterations.


\tabcolsep=0.2cm
\begin{table}[t]
\small
    \begin{centering}
    \begin{tabular}{|c|c|c|c|c|c|c|c|}
    \hline 
    \multirow{1}{*}{} &  & MCMC & CDSB & CDSB-FB & CDSB-C & MGAN & IT\tabularnewline
    \hline 
    \multirow{2}{*}{Mean} & $x_{1}$ & .075 & .066\textpm.010 & .068\textpm.010 & \textbf{.072\textpm.007} & .048 & .034\tabularnewline
    \cline{2-8} \cline{3-8} \cline{4-8} \cline{5-8} \cline{6-8} \cline{7-8} \cline{8-8} 
     & $x_{2}$ & .875 & .897\textpm.019 & .897\textpm.017 & \textbf{.891\textpm.013} & .918 & .902\tabularnewline
    \hline 
    \multirow{2}{*}{Var} & $x_{1}$ & .190 & .184\textpm.007 & \textbf{.190\textpm.007} & .188\textpm.005 & .177 & .206\tabularnewline
    \cline{2-8} \cline{3-8} \cline{4-8} \cline{5-8} \cline{6-8} \cline{7-8} \cline{8-8} 
     & $x_{2}$ & .397 & .387\textpm.006 & .391\textpm.006 & \textbf{.393\textpm.005} & .419 & .457\tabularnewline
    \hline 
    \multirow{2}{*}{Skew} & $x_{1}$ & 1.94 & \textbf{1.90\textpm.038} & 2.01\textpm.041 & \textbf{1.90\textpm.028} & 1.83 & 1.63\tabularnewline
    \cline{2-8} \cline{3-8} \cline{4-8} \cline{5-8} \cline{6-8} \cline{7-8} \cline{8-8} 
     & $x_{2}$ & .681 & .591\textpm.018 & .628\textpm.018 & .596\textpm.014 & \textbf{.630} & .872\tabularnewline
    \hline 
    \multirow{2}{*}{Kurt} & $x_{1}$ & 8.54 & 7.85\textpm.210 & \textbf{8.54\textpm.239} & 8.00\textpm.147 & 7.64 & 7.57\tabularnewline
    \cline{2-8} \cline{3-8} \cline{4-8} \cline{5-8} \cline{6-8} \cline{7-8} \cline{8-8} 
     & $x_{2}$ & 3.44 & 3.33\textpm.035 & \textbf{3.51\textpm.041} & 3.27\textpm.035 & 3.19 & 3.88\tabularnewline
    \hline 
    \end{tabular}
    \par\end{centering}
\caption{\label{tab:bodresult-supp} Estimated posterior moments and their standard deviations for the BOD example. The closest estimates to MCMC are highlighted in bold. }
\end{table}


\begin{figure}[t]
\begin{centering}
\includegraphics[width=17cm]{Plots/BOD/biochemical_result.png}
\par\end{centering}
\caption{\label{fig:bodconvergence} Convergence of estimated posterior moments with increasing number of CDSB iterations. }
\end{figure}

\subsection{Image Experiments}
\label{subsec:imageexperimentdetail}
For all image experiments, we use the Adam optimizer with learning rate ${10}^{-4}$ and train for 500k iterations in total. Since both $\mathbf{F}$ and $\mathbf{B}$ need to be trained, the training time is approximately doubled for CDSB. Following \cite{song2020improved}, we make use of the exponential moving average (EMA) of the network parameters with EMA rate 0.999 at test time. We use $\gamma_\textup{min}=5\times{10}^{-5}$ for all experiments unless indicated otherwise and perform a parameter sweep for $\gamma_\textup{max}$ in $\{0.005, 0.01, 0.05, 0.1\}$. The optimal $\gamma_\textup{max}$ depends on the number of timesteps $N$ and the discrepancy between $p(x|y)$ and $\pref$. When using large $N$ or conditional $\pref(x|y)$, we find $\gamma_\textup{max}$ can be taken smaller. 

\subsubsection{MNIST}
For the MNIST dataset, we use a U-Net architecture with 3 resolution levels each with 2 residual blocks. The numbers of filters at each resolution level are 64, 128, 128 respectively. The total number of parameters is 6.6m, and we use batch size 128 for training. Since we observe overfitting on the MNIST training set for all methods, we also apply dropout with $p=0.1$ for the MNIST experiments. For each CDSB iteration, 100k or 250k training steps are used, corresponding to $L=5$ or $L=2$ CDSB iterations in total, which we find to be sufficient on this simpler dataset. 

For $N=10$, CDSB generates a minibatch of 100 images in approximately 0.8 seconds when run on a GTX 1080Ti. 
As a baseline comparison, we experimented with the methodology in \cite{kadkhodaie2021stochastic}  on the same MNIST test set and found that it gives PSNR/SSIM values of 15.78/0.72 and 12.49/0.47 for super-resolution and inpainting respectively (\textit{cf.} \Cref{tab:imagemetrics}). Around 250 iterations are required for generating each image, or approximately 1 second generation time for 1 image on a GTX 1080Ti. In comparison, the CDSB methodology is much more efficient and achieves better image quality on both tasks. 

\begin{figure}[h]
\begin{centering}
\subfloat[$\yobs$]{\includegraphics[width=5cm]{Plots/MNIST_superres_supp/Cond.png}} ~~
\subfloat[Ground truth]{\includegraphics[width=5cm]{\string"Plots/MNIST_superres_supp/True data\string".png}} \\
\subfloat[CSGM $N=5$]{\includegraphics[width=5cm]{\string"Plots/MNIST_superres_supp/N=5 CDiff\string".png}} ~~
\subfloat[CDSB $N=5$]{\includegraphics[width=5cm]{\string"Plots/MNIST_superres_supp/N=5 CDSB\string".png}} ~~
\subfloat[CDSB-C $N=5$]{\includegraphics[width=5cm]{\string"Plots/MNIST_superres_supp/N=5 CDSB-Cond\string".png}} \\
\subfloat[CSGM $N=10$]{\includegraphics[width=5cm]{\string"Plots/MNIST_superres_supp/N=10 CDiff\string".png}} ~~
\subfloat[CDSB $N=10$]{\includegraphics[width=5cm]{\string"Plots/MNIST_superres_supp/N=10 CDSB\string".png}} ~~
\subfloat[CDSB-C $N=10$]{\includegraphics[width=5cm]{\string"Plots/MNIST_superres_supp/N=10 CDSB-Cond\string".png}}
\par\end{centering}
\caption{Additional samples for the MNIST 4x SR task.}
\end{figure}  

\begin{figure}[h]
\begin{centering}
\subfloat[CSGM $N=10$]{\includegraphics[width=8cm]{\string"Plots/MNIST_inpaint/N=10 CDiff\string".png}} ~~
\subfloat[CDSB $N=10$]{\includegraphics[width=8cm]{\string"Plots/MNIST_inpaint/N=10 CDSB\string".png}}

\subfloat[CDSB-FB $N=10$]{\includegraphics[width=8cm]{\string"Plots/MNIST_inpaint/N=10 CDSB-FB\string".png}}~~
\subfloat[CDSB-C $N=10$]{\includegraphics[width=8cm]{\string"Plots/MNIST_inpaint/N=10 CDSB-Cond\string".png}}

\par\end{centering}
\caption{\label{fig:mnistinpainting}Uncurated conditional samples for the MNIST 14x14 inpainting task. The first two columns correspond to ground truth, $\yobs$, and the last two columns correspond to the mean and standard deviation of 100 samples. }

\end{figure}

\subsubsection{CelebA 64x64}
For the CelebA dataset, we use a U-Net architecture with 4 resolution levels each with 2 residual blocks and self-attention blocks at $16\times16$ and $8\times8$ resolutions. The numbers of filters at each resolution level are 128, 256, 256, 256 respectively. The total number of parameters is 39.6m, and we use batch size 128 for training. For each CDSB iteration, 10k or 25k training steps are used, corresponding to $L=50$ or $L=20$  CDSB iterations in total. For smaller $\gamma_\textup{max}$, we find that a higher number of CDSB iterations is beneficial. 

For $N=20$ and $N=50$, CDSB generates a minibatch of 100 images in approximately 12 and 30 seconds, respectively, when run on a Titan RTX.
As a baseline comparison, we find that CDSB-C with $N=20$ even outperforms a standard CSGM with $N=200$, which achieves PSNR/SSIM values of approximately 20.98/0.62. To ensure that conditional initialization is not the sole contributor to the gain in sample quality, we further compare CDSB-C ($N=50$) to a CSGM ($N=50$) with conditional initialization. The forward noising process is also modified to the discretized Ornstein--Uhlenbeck process targeting $\pref(x|y)$ as described in \Cref{subsec:targetawareforward}. This modification  achieved PSNR/SSIM values of 20.84/0.59 (\textit{cf.} \Cref{tab:imagemetrics}), which indicates that the CDSB framework presents larger benefits in addition to conditional initialization. 

As another baseline comparison, the SNIPS algorithm \citep{kawar2021snips} reports PSNR of 21.90 for 8 CelebA test images and, when averaging across 8 predicted samples for each of the images, a PSNR of 24.31. The algorithm requires 2500 iterations for image generation, or approximately 2 minutes for producing 8 samples when run on an RTX 3080 as reported by \cite{kawar2021snips}. On the same test benchmark, CDSB with $N=50$ achieved PSNR  values of 21.87 and 24.20 respectively in 3.1 seconds, thus achieving similar levels of sample quality using far fewer iterations. Furthermore, the SNIPS algorithm is applicable specifically for tractable linear Gaussian inverse problems, whereas CDSB is more general and does not rely on tractable likelihoods. 

\subsubsection{CelebA 160x160}
We adopt the official implementation and pre-trained checkpoints of SRFlow\footnote{\href{https://github.com/andreas128/SRFlow}{\texttt {https://github.com/andreas128/SRFlow}}} and make use of a higher resolution version of CelebA (160x160) following \cite{lugmayr2020srflow} only in Section \ref{subsubsec:nongaussianref}. For CSGM and CDSB, we use a U-Net architecture with 4 resolution levels each with 2 residual blocks. The numbers of filters at each resolution level are 128, 256, 256, 512 respectively. The total number of parameters is 71.0m, while SRFlow has 40.0m parameters in total. We use a batch size of 32 for training the CSGM and CDSB models. 

When $\pref(x|y)$ is defined by SRFlow, it is infeasible to use a discretized Ornstein--Uhlenbeck process targeting $\pref(x|y)$ as in Section \ref{subsec:targetawareforward}. We instead use a discretized Brownian motion for $p_{k+1|k}$, or equivalently the Variance Exploding (VE) SDE \citep{song2019generative,song2020score}. This can be interpreted as an entropy
regularized Wasserstein-2 optimal transport problem as discussed in \Cref{subsec:linkwithot}, i.e. CDSB-C seeks to minimize the total squared transport distance between SRFlow $\pref(x|y)$ and the true posterior $p(x|y)$. We use the time schedule $\gamma_\textup{min}=\gamma_\textup{max}=0.005$ with comparatively higher $\gamma_\textup{min}$ in order to accelerate convergence under $N=10$ timesteps. We provide additional samples from SRFlow, CDSB-C as well as CSGM-C in Figures \ref{fig:imagecomparison-celeba160-supp1}, \ref{fig:imagecomparison-celeba160-supp2}, \ref{fig:imagecomparison-celeba160-supp3}. 

\begin{figure}[h]
\begin{centering}
\subfloat[$\yobs$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDiff/im_grid_data_y_repeat0.png}} ~~
\subfloat[Ground truth]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDiff/im_grid_data_x_repeat0.png}} \\
\subfloat[CSGM $N=20$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDiff/im_grid_last_repeat0.png}} ~~
\subfloat[CDSB $N=20$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDSB/im_grid_last_repeat0.png}} ~~
\subfloat[CDSB-C $N=20$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDSB-Cond/im_grid_last_repeat0.png}} \\
\subfloat[CSGM $N=50$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=50 CDiff/im_grid_last_repeat0.png}} ~~
\subfloat[CDSB $N=50$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=50 CDSB/im_grid_last_repeat0.png}} ~~
\subfloat[CDSB-C $N=50$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_grid_last_repeat0.png}}
\par\end{centering}
\caption{Uncurated samples for the CelebA 4x SR with Gaussian noise task.}
\end{figure}  

\begin{figure}[h]
\begin{centering}
\subfloat[$\yobs$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDiff/im_grid_data_y_repeat1.png}} ~~
\subfloat[Ground truth]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDiff/im_grid_data_x_repeat1.png}} \\
\subfloat[CSGM $N=20$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDiff/im_grid_last_repeat1.png}} ~~
\subfloat[CDSB $N=20$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDSB/im_grid_last_repeat1.png}} ~~
\subfloat[CDSB-C $N=20$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=20 CDSB-Cond/im_grid_last_repeat1.png}} \\
\subfloat[CSGM $N=50$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=50 CDiff/im_grid_last_repeat1.png}} ~~
\subfloat[CDSB $N=50$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=50 CDSB/im_grid_last_repeat1.png}} ~~
\subfloat[CDSB-C $N=50$]{\includegraphics[height=5.5cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_grid_last_repeat1.png}}
\par\end{centering}
\caption{Uncurated samples for the CelebA 4x SR with Gaussian noise task.}
\end{figure}  



\begin{figure}[h]
\begin{centering}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_0.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_0.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_0.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_0.png}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_1.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_1.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_1.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_1.png}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_2.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_2.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_2.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_2.png}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_3.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_3.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_3.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_3.png}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_4.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_4.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_4.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_4.png}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_5.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_5.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_5.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_5.png}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_6.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_6.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_6.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_6.png}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_7.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_7.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_7.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_7.png}


\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_8.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_8.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_8.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_8.png}


\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_9.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_9.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_9.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_9.png}


\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_10.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_10.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_10.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_10.png}


\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_11.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_11.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_11.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_11.png}

\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_x_12.png}
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/data_y_12.png} ~~
\includegraphics[trim={0 2px 0 2px}, clip, height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_grid_12.png} ~~
\includegraphics[height=1.4cm]{Plots/CelebA_superres_supp/N=50 CDSB-Cond/im_/im_mean_12.png}

\par\end{centering}
\caption{Uncurated conditional samples using CDSB-C with $N=50$ for the CelebA 4x SR with Gaussian noise task. The first two columns correspond to ground truth, $\yobs$, and the last column corresponds to the mean of the middle 8 samples. }
\end{figure}  


\begin{figure}
\begin{centering}
\subfloat[$\yobs$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_data_y_repeat1}}~~
\subfloat[Ground truth]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_data_x_repeat1}}\\
\subfloat[SRFlow]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_srflow_repeat1}}~~
\subfloat[CSGM-C $N=10$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_csgm_repeat1}}~~
\subfloat[CDSB-C $N=10$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_cdsb_repeat1}}
\par\end{centering}
\caption{\label{fig:imagecomparison-celeba160-supp1}Additional uncurated samples for the CelebA 8x SR task.}

\end{figure}

\begin{figure}
\begin{centering}
\subfloat[$\yobs$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_data_y_repeat2}}~~
\subfloat[Ground truth]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_data_x_repeat2}}\\
\subfloat[SRFlow]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_srflow_repeat2}}~~
\subfloat[CSGM-C $N=10$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_csgm_repeat2}}~~
\subfloat[CDSB-C $N=10$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_cdsb_repeat2}}
\par\end{centering}
\caption{\label{fig:imagecomparison-celeba160-supp2}Additional uncurated samples for the CelebA 8x SR task.}
\end{figure}

\begin{figure}
\begin{centering}
\subfloat[$\yobs$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_data_y_repeat3}}~~
\subfloat[Ground truth]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_data_x_repeat3}}\\
\subfloat[SRFlow]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_srflow_repeat3}}~~
\subfloat[CSGM-C $N=10$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_csgm_repeat3}}~~
\subfloat[CDSB-C $N=10$]{\includegraphics[height=5.5cm]{Plots/CelebA160_supp/im_grid_cdsb_repeat3}}
\par\end{centering}
\caption{\label{fig:imagecomparison-celeba160-supp3}Additional uncurated samples for the CelebA 8x SR task.}
\end{figure}



\subsection{Optimal Filtering in State-Space Models}
For the sake of completeness, we first give details of the Lorenz-63
model here. It is defined for $x\in\mathbb{R}^3$ under the following ODE
system
\[
\frac{\rmd x[1]}{\rmd \tau}=\sigma(x[2]-x[1]),\quad\frac{\rmd x[2]}{\rmd \tau}=x[1](\rho-x[3])-x[2],\quad\frac{\rmd x[3]}{\rmd \tau}=x[1]x[2]-\theta x[3].
\]
We consider the values $\sigma=10$, $\rho=28$ and $\theta=8/3$,
which results in chaotic dynamics famously known as the Lorenz
attractor. We integrate this system using the 4th order Runge--Kutta
method with step size 0.05. For the state-space model, we define $(X_{t})_{t\geq1}$
as the states $(x[1],x[2],x[3])$ of the system at regular intervals of $\delta \tau=0.1$
with small Gaussian perturbations of mean 0 and variance ${10}^{-4}$,
and $(Y_{t})_{t\geq1}$ as noisy observations of $(X_{t})_{t\geq1}$
with Gaussian noise of mean 0 and variance 4. 
More explicitly,
the transition density is thus defined for $x_t=(x_t[1],x_t[2],x_t[3])\in\mathbb{R}^3$ as 
\[
f(x_{t}|x_{t-1})=\mathcal{N}(x_{t};\textup{RK4}(x_{t-1},0.1),{10}^{-4}\Id),\quad g(y_{t}|x_{t})=\mathcal{N}(y_{t};x_{t},4\Id),
\]
 where $\textup{RK4}(x_{t},0.1)$ is the 4th order Runge--Kutta operator
(with step size 0.05) for the Lorenz-63 dynamics with initial condition
$x_{t}$ and termination time 0.1. 

We run the model for 4,000 time steps and perform Bayesian filtering
for the last 2,000 time steps. To accelerate the sequential inference
process, we use linear regression in this example to fit $\mathbf{F},\mathbf{B}$
with nonlinear feature expansion using radial basis functions. Similar
to \citet{spantini2019coupling}, we experiment with the number of
nonlinear features from 1 to 3 RBFs, in addition to the linear feature.
We find that as the ensemble size $M$ increases, increasing the number 
of features is helpful for lowering filtering errors, suggesting that
a bias--variance tradeoff is at play. 

Since the system's dynamics are chaotic and can move far from the
origin and display different scaling for each dimension, it is not
suitable to choose $\pref(x)=\mathcal{N}(x;0,\Id)$.
Therefore, for CSGM and CDSB, we let $\pref(x)=\mathcal{N}(x;\mu_{\textup{ref}},\textup{diag}(\sigma_{\textup{ref}}^{2}))$
where $\mu_{\textup{ref}},\sigma_{\textup{ref}}^{2}$ are the estimated mean and variance
of the prior predictive distribution $p(x_{t}|\yobs_{1:t-1})$ at time $t$.
For CSGM-C and CDSB-C, we let $\pref(x|y)=\mathcal{N}(x;\mu_{\textup{ref}},\textup{diag}(\sigma_{\textup{ref}}^{2}))$
where the estimated posterior mean and variance are returned by EnKF.
Furthermore, we scale the diffusion process's time step dimensionwise
by the variance of the reference measure $\sigma_{\textup{ref}}^{2}$. We consider
a short diffusion process with $N=20$, and a long diffusion process
with $N=100$. We let $\gamma_{\textup{min}}=0.0005\cdot\sigma_{\textup{ref}}^{2}$
and $\gamma_{\textup{max}}=0.05\cdot\sigma_{\textup{ref}}^{2}$ for the short
diffusion process, and reduce $\gamma_{\textup{max}}$ by a half for
the long diffusion process.  

We report the RMSEs between each algorithm's filtering means and the ground truth filtering means in Table \ref{tab:filtering}. We compute the ground truth filtering means using a particle filter with $M={10}^6$ particles. In addition, we report the RMSEs between each algorithm's filtering means and the true states $x_{1:T}$ in Table \ref{tab:filteringa}, and between each algorithm's filtering standard deviations and the ground truth standard deviations in Table \ref{tab:filteringb}. Similarly, we observe that CDSB and CDSB-C achieve lower errors than CSGM and EnKF. Interestingly, CSGM-C performs similarly to CDSB-C for state estimation when $N=100$, but performs worse for standard deviation estimation. In the case where the ensemble size is $M=200$, however, we observe occasional large errors for CDSB and CDSB-C when using the long diffusion process. We conjecture that, since CDSB is an iterative algorithm, small errors in regression inevitably accumulate. For small ensemble sizes and large numbers of diffusion steps, the model may thus be more prone to overfitting. However, for larger ensemble sizes $M\geq500$ we do not observe this issue.

\tabcolsep=0.25cm
\begin{table}
\begin{centering}
\subfloat[\label{tab:filteringa}]{

    \begin{tabular}{|c|c|c|c|c|}
    \hline 
    $M$ & 200 & 500 & 1000 & 2000\tabularnewline
    \hline 
    \hline 
    EnKF & .476\textpm .010 & .474\textpm .005 & .475\textpm .005 & .475\textpm .003\tabularnewline
    \hline 
    CSGM (short) & \multicolumn{4}{c|}{Diverges}\tabularnewline
    \hline 
    CDSB (short) & .464\textpm .013 & .391\textpm .010 & .369\textpm .007 & .352\textpm .008\tabularnewline
    \hline 
    CSGM-C (short)& \multicolumn{4}{c|}{Diverges}\tabularnewline
    \hline 
    CDSB-C (short) & \textbf{.428\textpm .016} & \textbf{.378\textpm .012} & \textbf{.359\textpm .015} & \textbf{.340\textpm .007}\tabularnewline
    \hline 
    \hline 
    CSGM (long) & \textbf{.431\textpm.010} & .376\textpm .008 & .360\textpm .012 & .343\textpm .006\tabularnewline
    \hline 
    CDSB (long) & .582\textpm.328 & .370\textpm .012 & .348\textpm .006 & .333\textpm .006\tabularnewline
    \hline 
    CSGM-C (long) & .434\textpm.057 & \textbf{.367\textpm.011}  & .346\textpm.008 & .336\textpm.004\tabularnewline
    \hline 
    CDSB-C (long) & .660\textpm.310 & .368\textpm .016 & \textbf{.344\textpm .010} & \textbf{.331\textpm .006}\tabularnewline
    \hline 
    \end{tabular}

} \qquad
\subfloat[\label{tab:filteringb}]{

    \begin{tabular}{|c|c|c|c|c|}
    \hline 
    $M$ & 200 & 500 & 1000 & 2000\tabularnewline
    \hline 
    \hline 
    EnKF & .255\textpm.003 &  .286\textpm .002 & .296\textpm .001 & .300\textpm .003\tabularnewline
    \hline 
    CSGM (short)& \multicolumn{4}{c|}{Diverges}\tabularnewline
    \hline 
    CDSB (short)& .203\textpm.005 &  .167\textpm .003 & .150\textpm .002 & .137\textpm .002\tabularnewline
    \hline 
    CSGM-C (short)& \multicolumn{4}{c|}{Diverges}\tabularnewline
    \hline 
    CDSB-C (short)& \textbf{.148\textpm.004} &  \textbf{.124\textpm .002} & \textbf{.108\textpm .002} & \textbf{.099\textpm .001}\tabularnewline
    \hline 
    \hline 
    CSGM (long) & .204\textpm.005 &  .163\textpm .008 & .140\textpm .002 & .129\textpm .001\tabularnewline
    \hline 
    CDSB (long) & \textbf{.140\textpm.008} &  .129\textpm .003 & .123\textpm .003 & .120\textpm .002\tabularnewline
    \hline 
    CSGM-C (long) & .186\textpm.005 & .142\textpm.003 & .120\textpm.001 & .109\textpm.002\tabularnewline
    \hline 
    CDSB-C (long) & .176\textpm.006 &  \textbf{.120\textpm .002} & \textbf{.110\textpm .003} & \textbf{.106\textpm .002}\tabularnewline
    \hline 
    \end{tabular}

}
\par\end{centering}
\caption{\label{tab:filtering-supp}RMSEs over 10 runs, for $N=20$ (short) and $N=100$ (long), between (a) each algorithm's filtering means
and the true states $x_{1:T}$; (b) each algorithm's filtering standard deviations and the ground truth filtering standard deviations.
The lowest errors within each of the short and long groups are highlighted in bold.}
\end{table}


\bibliography{refs}

\end{document}