\documentclass[accepted]{uai2022}

%% Choose your variant of English; be consistent
\usepackage[american]{babel}
% \usepackage[british]{babel}

%% Some suggested packages, as needed:
\usepackage{natbib} % has a nice set of citation styles and commands
    \bibliographystyle{plainnat}
    \renewcommand{\bibsection}{\subsubsection*{References}}
\usepackage[tbtags]{mathtools} % amsmath with fixes and additions
% \usepackage{siunitx} % for proper typesetting of numbers and units
\usepackage{booktabs} % commands to create good-looking tables
\usepackage{tikz}
\usepackage{amsthm}
\usepackage{amssymb}
\usepackage{multirow}


\usepackage[utf8]{inputenc}   % read the source as UTF-8 so accented characters can be typed directly
\usepackage[T1]{fontenc}      % T1 font encoding, which contains the accented (e.g. French) characters
\usepackage{comment}

\usepackage{natbib}

\usepackage[tbtags]{amsmath}
\usepackage{amsthm}
\allowdisplaybreaks
\usepackage{amssymb,mathrsfs}
\usepackage{amsfonts}
\usepackage{upgreek}
\usepackage{xspace}

\usepackage{graphicx}
\usepackage{subfig}
\usepackage{color}
\usepackage{algorithm, algorithmic}
\begin{comment}

\algnewcommand{\Inputs}[1]{%
  \State \textbf{Inputs:}
  \Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
\algnewcommand{\Initialize}[1]{%
  \State \textbf{Initialize:}
  \Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
\algnewcommand{\Outputs}[1]{%
  \State \textbf{Outputs:}
  \Statex \hspace*{\algorithmicindent}\parbox[t]{.8\linewidth}{\raggedright #1}
}
\end{comment}

\usepackage{stmaryrd}
\usepackage[inline]{enumitem}
\usepackage{url}
\def\UrlBreaks{\do\/\do-}
\usepackage{tikz}
\usetikzlibrary{calc,arrows.meta}
% Numeric layout constants (presumably TikZ coordinates; their use sites are
% not in this part of the file).
% Vertical values.
\newcommand{\yBlock}{1}
\newcommand{\yNode}{0.75}

% Negative horizontal values, in increasing magnitude.
\newcommand{\xNodemoinstiny}{-1}
\newcommand{\xNodemoins}{-1.5}
\newcommand{\xNodemoinsint}{-2.}
\newcommand{\xNodeMoins}{-3}
\newcommand{\xNodeMOINS}{-4.5}

% Positive horizontal values, mirroring the negative ones above.
\newcommand{\xNodeplustiny}{1}
\newcommand{\xNodeplus}{1.5}
\newcommand{\xNodeplusint}{2}
\newcommand{\xNodePlus}{3}
\newcommand{\xNodePLUS}{4.5}

\usepackage{pgfplots}
\usepackage{xcolor}
\usepackage{bbm}
\usepackage{ifthen}
\usepackage{xargs}

\usepackage{aliascnt}
\usepackage{cleveref}
\usepackage{autonum}
\makeatletter
% Theorem-like environments and their cleveref names.
% cleveref reference types are case-sensitive and must match the counter name,
% so each \Crefname below uses the same lowercase type as its \crefname.
\newtheorem{theorem}{Theorem}
\crefname{theorem}{theorem}{theorems}
\Crefname{theorem}{Theorem}{Theorems}


% Unnumbered lemma.
\newtheorem*{lemma_nonumber*}{Lemma}


% Each environment below is numbered through an alias of the theorem counter:
% numbering is shared with theorems while the alias keeps its own cleveref type.
\newaliascnt{lemma}{theorem}
\newtheorem{lemma}[lemma]{Lemma}
\aliascntresetthe{lemma}
\crefname{lemma}{lemma}{lemmas}
\Crefname{lemma}{Lemma}{Lemmas}



\newaliascnt{corollary}{theorem}
\newtheorem{corollary}[corollary]{Corollary}
\aliascntresetthe{corollary}
\crefname{corollary}{corollary}{corollaries}
\Crefname{corollary}{Corollary}{Corollaries}

\newaliascnt{proposition}{theorem}
\newtheorem{proposition}[proposition]{Proposition}
\aliascntresetthe{proposition}
\crefname{proposition}{proposition}{propositions}
\Crefname{proposition}{Proposition}{Propositions}

\newaliascnt{definition}{theorem}
\newtheorem{definition}[definition]{Definition}
\aliascntresetthe{definition}
\crefname{definition}{definition}{definitions}
\Crefname{definition}{Definition}{Definitions}

\newaliascnt{remark}{theorem}
\newtheorem{remark}[remark]{Remark}
\aliascntresetthe{remark}
\crefname{remark}{remark}{remarks}
\Crefname{remark}{Remark}{Remarks}


% Examples share the theorem counter directly (no alias counter).
\newtheorem{example}[theorem]{Example}
\crefname{example}{example}{examples}
\Crefname{example}{Example}{Examples}

% Techniques are numbered independently of theorems.
\newtheorem{technique}{Technique}
\crefname{technique}{technique}{techniques}
\Crefname{technique}{Technique}{Techniques}


\crefname{figure}{figure}{figures}
\Crefname{figure}{Figure}{Figures}


% Assumption environments whose printed head is a bold letter followed by the
% number; the negative \hspace pulls the number closer to the letter.
\newtheorem{assumption}{\textbf{A}\hspace{-3pt}}
% \cref to an assumption prints the bold letter, then the number (#1) wrapped in
% the hyperlink start/end markers (#2, #3).
\crefformat{assumption}{{\textbf{A}}#2#1#3}

\newtheorem{assumptionF}{\textbf{F}\hspace{-3pt}}
\crefformat{assumptionF}{{\textbf{F}}#2#1#3}

% assumptionbis{<label>}: an assumptionF whose number is printed as the number
% of <label> followed by a bold "b". The counter is decremented first so that
% the step performed by assumptionF leaves it unchanged overall.
% NOTE(review): \ref* is the starred form provided by hyperref, which is not
% loaded in the visible part of the preamble -- confirm the class loads it.
\newenvironment{assumptionbis}[1]
  {\renewcommand{\theassumptionF}{\ref*{#1}$\mathbf{b}$}%
   \addtocounter{assumptionF}{-1}%
   \begin{assumptionF}}
  {\end{assumptionF}}



% Lettered assumption families (B, C, H, T, D, L, Q, AR). Each one is an
% independently numbered theorem-like environment whose head is the bold letter,
% with matching cleveref names so that \cref/\Cref print the same letter.
\newtheorem{assumptionB}{\textbf{B}\hspace{-3pt}}
\Crefname{assumptionB}{\textbf{B}\hspace{-3pt}}{\textbf{B}\hspace{-3pt}}
\crefname{assumptionB}{\textbf{B}}{\textbf{B}}

\newtheorem{assumptionC}{\textbf{C}\hspace{-3pt}}
\Crefname{assumptionC}{\textbf{C}\hspace{-3pt}}{\textbf{C}\hspace{-3pt}}
\crefname{assumptionC}{\textbf{C}}{\textbf{C}}


\newtheorem{assumptionH}{\textbf{H}\hspace{-3pt}}
\Crefname{assumptionH}{\textbf{H}\hspace{-3pt}}{\textbf{H}\hspace{-3pt}}
\crefname{assumptionH}{\textbf{H}}{\textbf{H}}

\newtheorem{assumptionT}{\textbf{T}\hspace{-3pt}}
\Crefname{assumptionT}{\textbf{T}\hspace{-3pt}}{\textbf{T}\hspace{-3pt}}
\crefname{assumptionT}{\textbf{T}}{\textbf{T}}

% The names below used to be registered for assumptionT a second time, which
% left assumptionD without any cleveref name.
\newtheorem{assumptionD}{\textbf{D}\hspace{-3pt}}
\Crefname{assumptionD}{\textbf{D}\hspace{-3pt}}{\textbf{D}\hspace{-3pt}}
\crefname{assumptionD}{\textbf{D}}{\textbf{D}}


\newtheorem{assumptionL}{\textbf{L}\hspace{-3pt}}
\Crefname{assumptionL}{\textbf{L}\hspace{-3pt}}{\textbf{L}\hspace{-3pt}}
\crefname{assumptionL}{\textbf{L}}{\textbf{L}}

\newtheorem{assumptionQ}{\textbf{Q}\hspace{-3pt}}
\Crefname{assumptionQ}{\textbf{Q}\hspace{-3pt}}{\textbf{Q}\hspace{-3pt}}
\crefname{assumptionQ}{\textbf{Q}}{\textbf{Q}}


\newtheorem{assumptionAR}{\textbf{AR}\hspace{-3pt}}
\Crefname{assumptionAR}{\textbf{AR}\hspace{-3pt}}{\textbf{AR}\hspace{-3pt}}
\crefname{assumptionAR}{\textbf{AR}}{\textbf{AR}}



% Numeric constants (presumably diagram dimensions; their use sites are not in
% this part of the file).
\newcommand{\diaW}{11}
\newcommand{\diaH}{5}
\newcommand{\diaJump}{2.75}
\newcommand{\nextRow}{1.25}
\newcommand{\imW}{0.08}
\newcommand{\imWB}{0.1}
\newcommand{\imOp}{0.6}
\newcommand{\bend}{5}

% Offsets, heights and widths.
\newcommand{\offset}{2}
\newcommand{\offsety}{2.3}
\newcommand{\h}{2.25}
\newcommand{\hsmall}{1.75}
\newcommand{\ww}{3.25}
\newcommand{\www}{1.8}
\newcommand{\wwww}{3.5}
\newcommand{\wwwww}{4.8}
\newcommand{\offsetsmall}{1.5}

\usepackage{bm}
\usepackage{wrapfig}


% Inline determinant with plain (non-stretching) parentheses.
\newcommand{\detLigne}[1]{\det(#1)}
\def\hlf{\hat{\ell}^f}
\def\hlb{\hat{\ell}^b}
\def\Ent{\mathrm{H}}
\def\lyap{V_{p,t,x_t}}
\def\lyapp{V_{p}}

\def\contspace{\mathcal{C}}


% Symbols p (or q) with an upright textual subscript.
\def\bpobs{\bar{p}_{\textup{obs}}}
\def\pobs{p_{\textup{obs}}}
\def\pjoin{p_{\textup{join}}}
\def\pjref{p_{\textup{jref}}}
\def\ppos{p_{\textup{pos}}}
\def\pdata{p_{\textup{data}}}
\def\qdata{q_{\textup{data}}}
\def\pref{p_{\textup{ref}}}

\def\yobs{y^{\textup{obs}}}


% Upright f / b markers and symbols carrying them as superscripts.
% NOTE: \def replaces any existing meaning without warning. \sb is already a
% LaTeX kernel command (subscript shorthand) and \sf is the legacy sans-serif
% switch in the standard classes -- presumably neither is needed elsewhere;
% TODO confirm.
\def\for{\mathrm{f}}
\def\back{\mathrm{b}}
\def\lf{\ell^{\mathrm{f}}}
\def\lb{\ell^{\mathrm{b}}}
\def\sf{s^{\mathrm{f}}}
\def\sb{s^{\mathrm{b}}}

\def\Tcal{\mathcal{T}}
\def\bfpi{\bm{\pi}}
\def\bfnu{\bm{\nu}}

% Script letters, and symbols decorated with a right or left over-arrow.
\def\Pens{\mathscr{P}}
\def\Mens{\mathscr{M}}
\def\pif{\overrightarrow{\pi}}
\def\lambdabff{\overrightarrow{\bm{\lambda}}}
\def\lambdabfb{\overleftarrow{\bm{\lambda}}}
% Footnote containing a mailto: link to #1, printed in black.
% NOTE(review): \href comes from hyperref, which is not loaded in the visible
% part of the preamble -- confirm the class provides it.
\newcommand{\mail}[1]{\footnote{Email: \href{mailto:#1}{\textcolor{black}{#1}}}}
\def\Phif{\overrightarrow{\Phi}}
\def\Phib{\overleftarrow{\Phi}}
\def\scoref{\overrightarrow{\mathrm{S}}}
\def\scoreb{\overleftarrow{\mathrm{S}}}
\def\netf{\overrightarrow{\mathrm{NN}}}
\def\netb{\overleftarrow{\mathrm{NN}}}
% "Schrödinger" with automatic trailing-space handling (\xspace).
\newcommand{\schro}{Schr\"{o}dinger\xspace}
% Named constants. Most are aliases that bottom out in the typewriter letters
% \tta, \ttb, \ttc, \ttd defined a few lines below; forward references are fine
% because macro bodies are only expanded when used.
\newcommand{\Cweakapp}{\ttd}
\def\ttfp{\Cweakapp_{p}}
\def\ttfpun{\Cweakapp_{p,1}}
\def\ttfpdeux{\Cweakapp_{p,2}}
\def\ttfptrois{\Cweakapp_{p,3}}
\def\ttfpquatre{\Cweakapp_{p,4}}
\def\ttamin{\mathtt{a}}
\def\ttfun{\Cweakapp_4}
\def\ttfdeux{\Cweakapp_5}
\def\btta{\bar{\mathtt{A}}}
\def\bfb{\mathbf{b}}
\def\bfsigma{\pmb{\sigma}}
\def\KuLo{Kurdyka-\L ojasiewicz}
\newcommand{\tta}{\mathtt{A}}
\newcommand{\ttb}{\mathtt{B}}
\newcommand{\ttc}{\mathtt{C}}
\newcommand{\ttd}{\mathtt{D}}
\def\tte{\mathtt{E}}
\newcommand{\ttM}{\mathtt{M}}
\def\boundLSig{\Lip\eta}

% Semantic names for the base letters. \Ctech and \Cconv both expand to \ttc,
% so they print the same symbol.
\newcommand{\Capprox}{\tta}
\newcommand{\Ctech}{\ttc}
\newcommand{\Cstrong}{\ttb}
\newcommand{\Cconv}{\ttc}
\newcommand{\Cweak}{C}

\def\conj{\varkappa}
\def\mtta{\mathtt{a}}
\def\explog{\vareps}
% Marks text in red.
\newcommand{\note}[1]{\textcolor{red}{#1}}
\def\Cbeta{\Cweak_{\beta, \explog}}
\def\Aar{\Capprox_{\alpha, r}}
\def\xo{x_0}
\def\Db{\Ctech}
% Integral sign with bounds k\gua and (k+1)\gua.
\def\intk{\int_{k\gua}^{(k+1)\gua}}
% \ctun[<index>]: \Capprox_{<index>,1}; the optional index defaults to T.
\newcommandx\ctun[1][1=T]{\Capprox_{#1,1}}
\def\btun{\mathtt{B}_1}
\def\btdeux{\mathtt{B}_2}
\def\dtun{\mathtt{D}_1}
\def\cttun{\tilde{\Capprox}_{T,1}}
\def\dtdeux{\mathtt{D}_2}
\def\ctdeux{\Capprox_{T,2}}
\def\cttrois{\Capprox_{T,3}}
\def\ctquatre{\Capprox_{T,4}}
\def\ctcinq{\Capprox_{T,5}}
\def\ctsix{\Capprox_{T,6}}
\def\ctsept{\Capprox_{T,7}}
\def\cthuit{\Capprox_{T,8}}
\def\ctneuf{\Capprox_{T,9}}
\def\gfun{\mathbb{G}}
\def\hash{\sharp}
% \Cconv variants; the superscripts (c) and (d) distinguish two families.
\def\Cconvcontun{\Cconv_{1,\alpha}^{(c)}}
\def\Cconvcontdeux{\Cconv_{2,\alpha}^{(c)}}
\def\Cconvconttrois{\Cconv_{3,\alpha}^{(c)}}
\def\Cconvdiscun{\Cconv_{1,\alpha}^{(d)}}
\def\Cconvdiscdeux{\Cconv_{2,\alpha}^{(d)}}
\def\Cconvdisctrois{\Cconv_{3,\alpha}^{(d)}}
\def\Cconvcont{\Phibf_{\alpha}^{(c)}}
\def\Cconvdisc{\Phibf_{\alpha}^{(d)}}
\def\Csham{\Cconv_1}
\def\Cshamd{\Cconv_2}
\def\Cshama{\Cconv_{\alpha}}
\def\Cshamamoins{\Cshama^-}
\def\Cshamaplus{\Cshama^+}
\def\Ccont{\Cconv^{(c)}}
\def\Cdisc{\Cconv^{(d)}}
\def\Cconvk{{\Cconv^{(a)}_k}}
%\def\Cconvdun{\Cconv^{(b)}_1}
%\def\Cconvddeux{\Cconv^{(b)}_2}
\def\Cconvdtrois{\Cconv^{(b)}}
% These two expand to explicit expressions instead of the named constants
% commented out above.
\def\Cconvdun{(\gamma\eta/2)}
\def\Cconvddeux{(\gamma/2)}
\def\Cshamdisc{\Cconv_{0}}
\def\Cshamt{\tilde{\Cconv}_{\alpha}}
\def\Psial{\Psibf_{\alpha}}
\def\Cstrongcont{\Cstrong_1}
\def\Cstrongcontf{\Cstrong_2}
\def\Cstrongdisc{\Cstrong_3}
\def\Cstrongdiscf{\Cstrong_4}
\def\Cstrongloj{\Cstrong_5}
\def\Cstronglojdisc{\Cstrong_6}
\def\Cstrongtilde{\tilde{\Cstrong}}
\def\maxnorm{C}
% Postfix shorthands: each expands to a bare superscript, so it must directly
% follow the symbol it decorates (e.g. A\pinv).
\newcommand{\pinv}{^{-1}}
\newcommand{\st}{^{\star}}
\newcommand{\gb}{\gamma^{\beta}}
\newcommand{\tr}{^{\top}}
\def\scrE{\mathscr{E}}
\def\scrV{\mathscr{V}}
\def\scrF{\mathscr{F}}
% Capitalised cleveref reference wrapped in \tup.
% NOTE(review): \tup is not defined in the visible part of the file -- confirm.
\newcommand{\rref}[1]{\tup{\Cref{#1}}}
% Angle brackets.
\newcommand{\la}{\langle}
\newcommand{\ra}{\rangle}
% The name "Łojasiewicz" followed by a non-breaking space (spelling aligned
% with \KuLo).
\newcommand{\LL}{\L ojasiewicz~}
% Symbols built on \gamma_{\alpha}.
\newcommand{\gua}{\gamma_{\alpha}}
\newcommand{\bgua}{\bgamma_{\alpha}}
\newcommand{\gda}{\gua^{1/2}}
\newcommand{\tgua}{(t+\gua)^{\alpha}}
\newcommand{\guac}{c}
% The word "and" set between two quads, for use in displayed math.
\newcommand{\et}{\quad\mbox{and}\quad}
%\newcommand{\sigb}{\ttM_{\Sigma}}
\newcommand{\sigb}{\eta}
\newcommand{\phe}{\varphi_{\varepsilon}}
\newcommand{\feps}{f_{\varepsilon}}
\newcommand{\nfeps}{\nabla f_{\varepsilon}}
\newcommand{\intd}{\int_{\bR^{\dim}}}
% \expec{X}{G}: conditional expectation E[X | G] with auto-sized brackets and bar.
\newcommandx{\expec}[2]{{\mathbb E}\left[#1 \middle \vert #2  \right]} %%%% conditional expectation
% Conditional expectation given \cF_k, respectively \cF_n.
\newcommand{\expek}[1]{\expec{#1}{\cF_k}}
\newcommand{\expen}[1]{\expec{#1}{\cF_n}}
% Postfix subscript / superscript shorthands.
\newcommand{\nn}{_{n+1}}
\newcommand{\kk}{_{k+1}}
\newcommand{\pal}{^{\alpha}}
\newcommand{\pmal}{^{-\alpha}}
\newcommand{\cH}{\mathcal{H}}

\def\En{\tilde{E}_n}
\def\varepsn{\tilde{\vareps}_n}
\def\pow{p}
\def\ntt{\mathtt{n}_0}
\def\tlambda{\tilde{\lambda}}
% NOTE: this replaces the standard \dim math operator with the letter d.
\def\dim{d}
\newcommand{\tb}{\tilde{b}}
\newcommand{\Time}{T}
\newcommand{\mttun}{\mathtt{k}_1}
\newcommand{\mttdeux}{\mathtt{k}_2}
% NOTE(review): \mtt is not defined in the visible part of the file -- confirm.
\newcommand{\mtttrois}{\mtt_3^+}
% Redefined with \def further down the file (there as \bar{\varepsilon}).
\newcommand{\bvareps}{\bar{\vareps}}
\newcommand{\transference}{\mathbf{T}}
% Essential supremum. Declared as a limits-taking operator with an explicit thin
% space: a literal space inside \mathrm is ignored in math mode and would print
% "esssup". Same construction as \argmax / \argmin in this file.
\newcommand{\esssup}{\operatorname*{ess\,sup}}
% Assorted symbol shorthands.
\newcommand{\ring}{\mathcal{C}_{\varrho}}
\newcommand{\measx}{\mathcal{X}}
\newcommand{\bkappa}{\bar{\kappa}}
% \mathbb{P} applied to #1 with auto-sized parentheses.
\newcommand{\probaspace}[1]{\mathbb{P}\left( #1 \right)}
% d_{TV,2} as a bare symbol, and applied to an argument.
\newcommand{\dTVdeux}{d_{\mathrm{TV}, 2}}
\newcommand{\dTVDeux}[1]{d_{\mathrm{TV}, 2}\left( #1 \right)}
% NOTE(review): \bgM carries subscript n while \bbgM carries M -- confirm intended.
\newcommand{\bgM}{b_{\gamma, n}}
\newcommand{\bbgM}{\bar{b}_{\gamma, M}}
% Upright letters. \rme is defined again with \def (same body) further down.
\newcommand{\rme}{\mathrm{e}}
\newcommand{\rmF}{\mathrm{F}}
\newcommand{\rmE}{\mathrm{E}}
\newcommand{\Fdr}{\mathrm{f}}
\newcommand{\Gdr}{\mathrm{g}}
\newcommand{\alphastar}{\alpha_{\star}}
\newcommand{\LipVset}{\mathrm{Lip}_{V, \alpha}}
% Typewriter-letter constants.
\newcommand{\Lip}{\mathtt{L}}
\newcommand{\Mtt}{\mathtt{M}}
\newcommand{\Ktt}{\mathtt{K}}
\newcommand{\tLip}{\tilde{\mathtt{L}}}
\newcommand{\tell}{\tilde{\ell}}
% NOTE(review): \mtt is not defined in the visible part of the file -- confirm.
\newcommand{\Lipb}{\mtt_b}
% Ceiling of 1/\gamma, respectively 1/\bgamma.
\newcommand{\step}{\ceil{1/\gamma}}
\newcommand{\bstep}{\ceil{1/\bgamma}}
\def\bdisc{b}
\def\bfDd{\mathbf{D}_{\mathrm{d}}}
\def\bfDc{\mathbf{D}_{\mathrm{c}}}
\newcommand{\SDE}{SDE}

% Barred and bold Greek letters.
\newcommand{\bbeta}{\bar{\beta}}
\newcommand{\measfun}{\mathbb{F}}
\newcommand{\btheta}{\boldsymbol{\theta}}
\newcommand{\bdeta}{\boldsymbol{\eta}}
\newcommand{\bvarphi}{\boldsymbol{\varphi}}

%\newcommand{\tau}{\boldsymbol{\tau}}
%\newcommand{\x}{\boldsymbol{x}}
%\newcommand{\X}{\boldsymbol{X}}
%\newcommand{\y}{\boldsymbol{y}}
%%\newcommand{\u}{\boldsymbol{u}}
%\newcommand{\w}{\boldsymbol{w}}
%\newcommand{\z}{\boldsymbol{z}}
%\newcommand{\p}{\boldsymbol{p}}
%\newcommand{\s}{\mathcal{S}}
%\newcommand{\ind}{\boldsymbol{1}}
%\newcommand{\dx}{\boldsymbol{\delta}\boldsymbol{x}}
%\newcommand{\argmax}{\operatornamewithlimits{argmax}}
%\newcommand{\argmin}{\operatornamewithlimits{argmin}}
%\newcommand{\prox}{\operatorname{prox}}
% Bold single-letter vectors.
% NOTE: \def\u replaces the kernel's breve text accent \u. \y, \z and \w are
% redefined further down the file as the plain letters y, z and w, and \xt as
% \tilde{x}; the last definition wins for everything after it.
\def\x{{ \boldsymbol x}}
\def\u{{ \boldsymbol u}}
\def\y{{\boldsymbol y}}
\def\z{{\boldsymbol z}}
\def\w{{\boldsymbol w}}

\def\xt{ \boldsymbol x^t}
% \norm[<p>]{x}: auto-sized norm bars; a non-empty optional argument is added as
% a superscript on the closing bar. \normLigne is the fixed-size variant.
\newcommandx{\norm}[2][1=]{\ifthenelse{\equal{#1}{}}{\left\Vert #2 \right\Vert}{\left\Vert #2 \right\Vert^{#1}}}
\newcommandx{\normLigne}[2][1=]{\ifthenelse{\equal{#1}{}}{\Vert #2 \Vert}{\Vert #2\Vert^{#1}}}


% Marks text in red.
\newcommand\mycomment[1]{\textcolor{red}{#1}}

%\theoremstyle{definition}
%\newtheorem{defn}{Definition}[section]
%\newtheorem{assump}{A}[paragraph]
%\newtheorem{prop}{Proposition}[section]
%\newtheorem{theo}{Theorem}[section]
%\newtheorem{coro}{Corollary}[section]
%\newtheorem{lemma}{Lemma}[section]
%\newtheorem{exmp}{Example}[section]

\def\xstart{x^{\star}_{\theta}}

%%%%%%%%%%%%%%%
%% mathbf
% Bold upright letters, optionally with a bar / tilde / hat accent.

\def\bfn{\mathbf{n}}
\def\bfw{\mathbf{w}}
\def\bfc{\mathbf{c}}
\def\bfY{\mathbf{Y}}
\def\bbfY{\bar{\mathbf{Y}}}
\def\bfX{\mathbf{X}}
\def\tbfX{\tilde{\mathbf{X}}}
\def\hbfX{\hat{\mathbf{X}}}
\def\tbfY{\tilde{\mathbf{Y}}}
\def\hbfY{\hat{\mathbf{Y}}}
\def\bfs{\mathbf{s}}
\def\bfZ{\mathbf{Z}}
\def\bfXt{\tilde{\mathbf{X}}}
\def\bfXd{\overline{\mathbf{X}}}
\def\bfYd{\overline{\mathbf{Y}}}
% Same definition as \bfZ above (harmless duplicate).
\def\bfZ{\mathbf{Z}}
% NOTE(review): \bbfX uses a tilde whereas \bbfY uses a bar -- confirm intended.
\def\bbfX{\tilde{\mathbf{X}}}
\def\bfM{\mathbf{M}}
\def\bfB{\mathbf{B}}
\def\bfP{\mathbf{P}}
%%% mathsf
% Sans-serif letters.
\def\msi{\mathsf{I}}
\def\msa{\mathsf{A}}
\def\msd{\mathsf{D}}
\def\msk{\mathsf{K}}
\def\mss{\mathsf{S}}
\def\msn{\mathsf{N}}
\def\msat{\tilde{\mathsf{A}}}
\def\msb{\mathsf{B}}
\def\msc{\mathsf{C}}
\def\mse{\mathsf{E}}
\def\msf{\mathsf{F}}
\def\mso{\mathsf{o}}
\def\msg{\mathsf{G}}
\def\msh{\mathsf{H}}
\def\msm{\mathsf{M}}
\def\msu{\mathsf{U}}
\def\msv{\mathsf{V}}
\def\msr{\mathsf{R}}
% \msff{<sub>}{<sup>}: sans-serif F with the given subscript and superscript.
\newcommand{\msff}[2]{\mathsf{F}_{#1}^{#2}}
\def\msp{\mathsf{P}}
\def\msq{\mathsf{Q}}
\def\msx{\mathsf{X}}
\def\msz{\mathsf{Z}}
\def\msy{\mathsf{Y}}



%% mathcal
% Calligraphic letters, optionally with a bar / tilde accent.
\def\mca{\mathcal{A}}
\def\mct{\mathcal{T}}
\def\mcat{\tilde{\mathcal{A}}}
\def\mcab{\bar{\mathcal{A}}}
\def\mcbb{\mathcal{B}}  %%% \mcb is already taken (see next line)
% \mcb{<arg>}: calligraphic B applied to an argument.
\newcommand{\mcb}[1]{\mathcal{B}(#1)}
\def\mcc{\mathcal{C}}
\def\mcz{\mathcal{Z}}
\def\mcy{\mathcal{Y}}
\def\mcx{\mathcal{X}}
\def\mce{\mathcal{E}}
\def\mcs{\mathcal{S}}
\def\mcf{\mathcal{F}}
\def\mcg{\mathcal{G}}
\def\mch{\mathcal{H}}
\def\mcm{\mathcal{M}}
\def\mcu{\mathcal{U}}
\def\mcv{\mathcal{V}}
\def\mcr{\mathcal{R}}
% \mcff{<sub>}{<sup>}: calligraphic F with the given subscript and superscript.
\newcommand{\mcff}[2]{\mathcal{F}_{#1}^{#2}}
% Two spellings each for barred and tilded calligraphic F.
\def\mcfb{\bar{\mathcal{F}}}
\def\bmcf{\bar{\mathcal{F}}}
\def\mcft{\tilde{\mathcal{F}}}
\def\tmcf{\tilde{\mathcal{F}}}
\def\mcp{\mathcal{P}}
\def\mcq{\mathcal{Q}}

%% mathbb

% Blackboard-bold letters.
\def\Qbb{\mathbb{Q}}
\def\Rbb{\mathbb{R}}
\def\Mbb{\mathbb{M}}
\def\Pbb{\mathbb{P}}
\def\Hbb{\mathbb{H}}
% \Qit{n}, \Pit{n}: blackboard Q / P with a parenthesised superscript.
\newcommand{\Qit}[1]{\Qbb^{(#1)}}
\newcommand{\Pit}[1]{\Pbb^{(#1)}}

% Lowercase- and uppercase-initial aliases that print the same blackboard
% letters; \rsets and \nsets add a star superscript.
\def\rset{\mathbb{R}}
\def\rsets{\mathbb{R}^*}
\def\cset{\mathbb{C}}
\def\zset{\mathbb{Z}}
\def\nset{\mathbb{N}}
\def\nsets{\mathbb{N}^{\star}}
\def\qset{\mathbb{Q}}
\def\Rset{\mathbb{R}}
\def\Cset{\mathbb{C}}
\def\Zset{\mathbb{Z}}
\def\Nset{\mathbb{N}}
\def\Tset{\mathbb{T}}

\def\bN{\mathbb{N}}
\def\bR{\mathbb{R}}
% \mathbb{R} raised to \dim (\dim is redefined in this file as the letter d).
\def\bRd{\mathbb{R}^{\dim}}
\def\cF{\mathcal{F}}


%%%% mathrm

% Upright (roman) letters. Several letters exist under two spellings
% (\rmX and \mrX). Note the case of the bodies: \mrc, \rmc and \rmC are all an
% uppercase C, while \mrcc and \rmcc are the lowercase c.
\def\rmP{\mathrm{P}}
\def\rmQ{\mathrm{Q}}
\def\rmR{\mathrm{R}}
\def\rmb{\mathrm{b}}
\def\mrb{\mathrm{b}}
\def\wrm{\mathrm{w}}
\def\rmw{\mathrm{w}}
\def\rmd{\mathrm{d}}
\def\rmm{\mathrm{m}}
\def\rms{\mathrm{s}}
\def\rmZ{\mathrm{Z}}
\def\rmS{\mathrm{S}}
% \mrd is redefined further down the file as \mathrm{D}; this lowercase
% definition only applies until that point.
\def\mrd{\mathrm{d}}
\def\mre{\mathrm{e}}
\def\rme{\mathrm{e}}
\def\rmn{\mathrm{n}}
\def\mrn{\mathrm{n}}
\def\mrc{\mathrm{C}}
\def\mrcc{\mathrm{c}}
\def\rmc{\mathrm{C}}
\def\rmC{\mathrm{C}}
\def\GaStep{\Gamma}
\def\rmcc{\mathrm{c}}
\def\rma{\mathrm{a}}
\def\rmf{\mathrm{f}}
\def\rmg{\mathrm{g}}
\def\rmh{\mathrm{h}}
\def\rmv{\mathrm{v}}
\def\mra{\mathrm{a}}

\def\cov{\mathrm{Cov}}

% Opening / closing delimiter shorthands. \po, \pf, \co, \cf expand to \left /
% \right delimiters and must therefore be used in matching pairs.
\newcommand{\cco}{\llbracket}
\newcommand{\ccf}{\rrbracket}
\newcommand{\po}{\left(}
\newcommand{\pf}{\right)}
\newcommand{\co}{\left[}
\newcommand{\cf}{\right]}
\newcommand{\R}{\mathbb R}
\newcommand{\Z}{\mathbb Z}
\newcommand{\D}{\mathcal D}
\newcommand{\dd}{\mathrm{d}}
\newcommand{\A}{\mathcal A}
\newcommand{\M}{\mathcal M}
\newcommand{\na}{\nabla}
\newcommand{\loiy}{\mu_{\mathrm{v}}}


\def\MeasFspace{\mathbb{M}}
\def\xstar{x^\star}
% \Tr prints an upright "T"; the trace operator "Tr" is \trace.
\def\Tr{\operatorname{T}}
\def\trace{\operatorname{Tr}}
% \functionspace[<sub>]{<arg>}: \mathbb{F}_<sub>(<arg>); the subscript defaults to +.
\newcommandx{\functionspace}[2][1=+]{\mathbb{F}_{#1}(#2)}
%% argmin, argmax
% Starred \operatorname: subscripts are set as limits in display style.
\newcommand{\argmax}{\operatorname*{arg\,max}}
\newcommand{\argmin}{\operatorname*{arg\,min}}
\newcommand{\estimateur}[1]{\hat{\pi}_n^N(#1)}
\def\RichR{\operatorname{R}}
\def\piR{\hat{\pi}^{\RichR}}
\def\estimatorRR{\piR}
% \VarDeux{<sub>}{<arg>}[<sup>]: Var with subscript, braces around the argument
% and an optional superscript given as the last (third) argument.
\newcommandx{\VarDeux}[3][3=]{\operatorname{Var}^{#3}_{#1}\left\{#2 \right\}}
\newcommand{\VarDeuxLigne}[2]{\operatorname{Var}_{#1}\{#2 \}}
\newcommand{\gramm}{\operatorname{Gramm}}
% \1 is the bare indicator symbol; \2{A} is the indicator with subscript {A}.
\newcommand{\1}{\mathbbm{1}}
\newcommand{\2}[1]{\mathbbm{1}_{\{#1\}}}



% Makes \veqno an alias of \@@leqno from this point on. The @ in the name needs
% the \makeatletter issued earlier in the preamble to still be in effect.
\newcommand{\LeftEqNo}{\let\veqno\@@leqno}

% Symbols m, \eta, \lambda with "source \rightarrow target" superscripts.
\newcommand{\lambdast}{\lambda^{s \rightarrow t}}
\newcommand{\etast}{\eta^{s \rightarrow t}}
\newcommand{\mst}{m^{s \rightarrow t}}
% NOTE(review): \mun is 1->2 and \mdeux is 2->1, whereas for \lambda and \eta
% the "un" variants are 2->1 and the "deux" variants 1->2 -- confirm intended.
\newcommand{\mun}{m^{1 \rightarrow 2}}
\newcommand{\mdeux}{m^{2 \rightarrow 1}}
\newcommand{\lambdaun}{\lambda^{2 \rightarrow 1}}
\newcommand{\etaun}{\eta^{2 \rightarrow 1}}
\newcommand{\lambdadeux}{\lambda^{1 \rightarrow 2}}
\newcommand{\etadeux}{\eta^{1 \rightarrow 2}}
% Variants between n+1 and \pi(n+1), in both directions.
\newcommand{\mnun}{m^{n+1 \rightarrow \pi(n+1)}}
\newcommand{\etanun}{\eta^{n+1 \rightarrow \pi(n+1)}}
\newcommand{\lambdanun}{\lambda^{n+1 \rightarrow \pi(n+1)}}
\newcommand{\xpinun}{x_{\pi(n+1)}}
\newcommand{\xnun}{x_{n+1}}
\newcommand{\mpinun}{m^{\pi(n+1) \rightarrow n+1}}
\newcommand{\etapinun}{\eta^{\pi(n+1) \rightarrow n+1}}
\newcommand{\lambdapinun}{\lambda^{\pi(n+1) \rightarrow n+1}}
\newcommand{\pinun}{\pi(n+1)}
\newcommand{\vois}{\mathcal{N}}
\newcommand{\mpii}{m^{i \rightarrow \pi(n+1)}}
\newcommand{\etapii}{\eta^{i \rightarrow \pi(n+1)}}
\newcommand{\lambdapii}{\lambda^{i \rightarrow \pi(n+1)}}
% Wide-hat and wide-tilde accented symbols.
\newcommand{\alphahat}{\widehat{\alpha}}
\newcommand{\betahat}{\widehat{\beta}}
\newcommand{\tildegamma}{\widetilde{\gamma}}
\newcommand{\tildeP}{\widetilde{P}}

% "Eq." followed by a non-breaking space and the parenthesised equation number.
\newcommand{\myeqref}[1]{Eq.~\eqref{#1}}



%%%% Floating Points Notation

% \fpround: left floor bracket with right ceiling bracket, fixed size.
% \floor / \ceil: auto-sized floor and ceiling brackets.
\newcommand{\fpround}[1]{\lfloor #1 \rceil}
\newcommand{\floor}[1]{\left\lfloor #1 \right\rfloor}
\newcommand{\ceil}[1]{\left\lceil #1 \right\rceil}



% Ordinals p-th, q-th, n-th; \ensuremath makes them usable in text and in math.
\newcommand{\pth}{\ensuremath{p^{\text{th}}}}
\newcommand{\qth}{\ensuremath{q^{\text{th}}}}
\newcommand{\nth}{\ensuremath{n^{\text{th}}}}

% Operators "ord" and "rad".
\newcommand{\ord}{\ensuremath{\operatorname{ord}}}
\newcommand{\rad}{\ensuremath{\operatorname{rad}}}



% Sets
\newcommand{\N}{\ensuremath{\mathbb{N}}}
\newcommand{\Q}{\ensuremath{\mathbb{Q}}}
\newcommand{\C}{\ensuremath{\mathbb{C}}}

%\newcommand{\F}{\ensuremath{\mathbb{F}}}
\newcommand{\primes}{\ensuremath{\mathcal P}}

% Calligraphic "SF" with the letters pulled together; \sfibt adds a prime.
\newcommand{\sfi}{\ensuremath{\mathcal{S}\!\mathcal{F}}}
\newcommand{\sfibt}{\ensuremath{\mathcal{S}\!\mathcal{F}'}}

\newcommand{\reghat}{\widehat{R}}

\newcommand{\reghatn}{\widehat{R}_n}

\newcommand{\arm}{\mathcal{A}}

\newcommand{\mX}{\widehat{X}}
% \PE and \PP are the base symbols used by the \expe... / \proba... macros.
\newcommand{\PE}{\mathbb{E}}
\newcommand{\PP}{\mathbb{P}}
\newcommand{\Ft}{\mathcal{F}}

\newcommand{\Sy}{\mathbf{S}}

\newcommand{\Kfrac}{\mathscr{K}}

% Operands
% Delimiter wrappers. Names ending in "Ligne" use fixed-size delimiters; the
% others use \left...\right.
\newcommand{\absolute}[1]{\left\vert #1 \right\vert}
\newcommand{\abs}[1]{\left\vert #1 \right\vert}
\newcommand{\absLigne}[1]{\vert #1 \vert}
% Total-variation norm. \tvnorm and \tvnormLigne are identical (fixed size);
% \tvnormEq is the auto-sized version.
\newcommand{\tvnorm}[1]{\| #1 \|_{\mathrm{TV}}}
\newcommand{\tvnormLigne}[1]{\| #1 \|_{\mathrm{TV}}}
\newcommand{\tvnormEq}[1]{\left \| #1 \right \|_{\mathrm{TV}}}
% \Vnorm[<sub>]{f}: norm with a subscript that defaults to V.
\newcommandx{\Vnorm}[2][1=V]{\| #2 \|_{#1}}
\newcommandx{\VnormEq}[2][1=V]{\left\| #2 \right\|_{#1}}
% \newcommandx{\norm}[2][1=]{\ifthenelse{\equal{#1}{}}{\left\Vert #2 \right\Vert}{\left\Vert #2 \right\Vert^{#1}}}
% \newcommandx{\normLigne}[2][1=]{\ifthenelse{\equal{#1}{}}{\Vert #2 \Vert}{\Vert #2\Vert^{#1}}}
\newcommand{\crochet}[1]{\left\langle#1 \right\rangle}
\newcommand{\parenthese}[1]{\left(#1 \right)}
\newcommand{\parentheseLigne}[1]{(#1 )}
\newcommand{\parentheseDeux}[1]{\left[ #1 \right]}
\newcommand{\parentheseDeuxLigne}[1]{[ #1 ]}
% Set braces; the Point/L/R variants draw only one visible brace and close the
% other side with an invisible delimiter.
\newcommand{\defEns}[1]{\left\lbrace #1 \right\rbrace }
\newcommand{\defEnsLigne}[1]{\lbrace #1 \rbrace }
\newcommand{\defEnsPoint}[1]{\left\lbrace #1 \right. }
\newcommand{\defEnsPointDeux}[1]{\left. #1 \right  \rbrace }
\newcommand{\defEnsL}[1]{\left\lbrace #1 \right. }
\newcommand{\defEnsR}[1]{\left. #1 \right  \rbrace }

%\newcommand{\defSystem}[1]{\left\lbrace #1 \right. }

% \ps{x}{y}: angle-bracket pairing <x,y>.
\newcommand{\ps}[2]{\left\langle#1,#2 \right\rangle}
% Both currently print a plain equals sign.
\newcommand{\eqdef}{=}
\newcommand{\defeq}{=}

% Relations
\newcommand{\divid}{\mid}
\newcommand{\ndivide}{\nmid}

% Proba
% Probability / expectation wrappers. "Ligne"/"Line" variants use fixed-size
% brackets; "Markov" variants take a subscript as first argument; "Tilde"
% variants put a wide tilde on the symbol.
\newcommand{\proba}[1]{\mathbb{P}\left( #1 \right)}
\newcommand{\probaCond}[2]{\mathbb{P}\left( \left. #1  \middle\vert #2 \right.\right)}
\newcommand{\probaCondLigne}[2]{\mathbb{P}(#1  \vert #2 )}
\newcommand{\probaCondLignePi}[2]{\Pi(#1  \vert #2 )}
\newcommand{\probaLigne}[1]{\mathbb{P}( #1 )}
% \probaMarkovTilde{<sub>}[<event>]: with an empty optional event only the
% subscripted symbol is printed, otherwise the event follows in brackets.
\newcommandx\probaMarkovTilde[2][2=]
{\ifthenelse{\equal{#2}{}}{{\widetilde{\mathbb{P}}_{#1}}}{\widetilde{\mathbb{P}}_{#1}\left[ #2\right]}}
\newcommand{\probaMarkov}[2]{\mathbb{P}_{#1}\left[ #2\right]}
\newcommand{\probaMarkovDD}[1]{\mathbb{P}_{#1}}
\newcommand{\expe}[1]{\PE \left[ #1 \right]}
\newcommand{\expesq}[1]{\PE^{1/2} \left[ #1 \right]}
\newcommand{\expeExpo}[2]{\PE^{#1} \left[ #2 \right]}
\newcommand{\expeLigne}[1]{\PE [ #1 ]}
\newcommand{\expeLine}[1]{\PE [ #1 ]}
\newcommand{\expeMarkov}[2]{\PE_{#1} \left[ #2 \right]}
% Argument order differs between the next and the last "Expo" variant:
% \expeMarkovD{sub}{arg}{sup} versus \expeMarkovExpo{sub}{sup}{arg}.
\newcommand{\expeMarkovD}[3]{\PE_{#1}^{#3} \left[ #2 \right]}
\newcommand{\expeMarkovDD}[1]{\PE_{#1}}
\newcommand{\expeMarkovLigne}[2]{\PE_{#1} [ #2 ]}
\newcommand{\expeMarkovExpo}[3]{\PE_{#1}^{#2} \left[ #3 \right]}
\newcommand{\probaMarkovTildeDeux}[2]{\widetilde{\mathbb{P}}_{#1} \left[ #2 \right]}
\newcommand{\expeMarkovTilde}[2]{\widetilde{\PE}_{#1} \left[ #2 \right]}

% Landau notation (big O)
\newcommand{\bigO}{\ensuremath{\mathcal O}}
\newcommand{\softO}{\Tilde{\ensuremath{\mathcal O}}}

% Environments

%\renewenvironment{proof}[1][{\textit{Proof:}}]{\begin{trivlist} \item[\em{\hskip \labelsep #1}]}{\ensuremath{\qed} \end{trivlist}}

%\renewenvironment{proof}[1][{\textit{Proof:}}]{\begin{trivlist} \item[\em{\hskip \labelsep #1}]}{\ensuremath{\qed} \end{trivlist}}



%fleche limite
% Long arrow with "n -> +infinity" underneath; \flecheLimiteOption takes the
% variable and its limit; \flecheLimiteHaut puts the label above the arrow.
\newcommand{\flecheLimite}{\underset{n\to+\infty}{\longrightarrow}}
\newcommand{\flecheLimiteOption}[2]{\underset{#1\to#2}{\longrightarrow}}
\newcommand{\flecheLimiteHaut}{\overset{n\to+\infty}{\longrightarrow}}


% infinity notation
\newcommand{\plusinfty}{+\infty}

% equals sign with a label underneath
\newcommand{\egale}[1]{\ensuremath{\underset{#1}{=}}}

% multi-line subscript (reminder snippet, left unfinished)
%\sum\limits_{\substack{i=0 \\ i \neq i_0}}^{n}{A_



% Steps the equation counter and tags the current line with the new number.
\newcommand\numberthis{\addtocounter{equation}{1}\tag{\theequation}}


\newcommand{\hilbert}{\mathcal{H}}


\def\ie{\textit{i.e.}}
% \as is redefined further down the file (there with a trailing period).
\def\as{\textit{a.s}}
\def\cadlag{càdlàg}
% \eqsp is a thick math space (\;).
\def\eqsp{\;}
% Intervals: c = closed end, o = open end (e.g. \coint is closed-open).
% "Ligne" variants use fixed-size brackets.
\newcommand{\coint}[1]{\left[#1\right)}
\newcommand{\ocint}[1]{\left(#1\right]}
\newcommand{\ooint}[1]{\left(#1\right)}
\newcommand{\ccint}[1]{\left[#1\right]}
\newcommand{\cointLigne}[1]{[#1)}
\newcommand{\ocintLigne}[1]{(#1]}
\newcommand{\oointLigne}[1]{(#1)}
\newcommand{\ccintLigne}[1]{[#1]}

\def\primr{f_r}
\def\primrO{f_{r_0}}




% Indicator symbol \1 with a subscript.
\newcommand{\indi}[1]{\1_{#1}}
% \weight{<a>}[<b>]: \omega_{a,b}^N; the second (optional, trailing) index defaults to n.
\newcommandx{\weight}[2][2=n]{\omega_{#1,#2}^N}
\newcommand{\loi}{\mathcal{L}}
% Balls B(#1,#2) and barred variants. \boulefermee uses an italic B, whereas
% \cball uses the upright operator form.
\newcommand{\boule}[2]{\operatorname{B}(#1,#2)}
\newcommand{\ball}[2]{\operatorname{B}(#1,#2)}
\newcommand{\boulefermee}[2]{\bar{B}(#1,#2)}
\newcommand{\cball}[2]{\bar{\operatorname{B}}(#1,#2)}
\newcommand{\diameter}{\operatorname{diam}}
\newcommand{\deta}{d_{\eta}}

\def\TV{\mathrm{TV}}

% Per-author review notes, each prefixed with bold initials. The names ending in
% "i" use the inline option of \todo; \valentintxt and \tcr only colour text red.
% NOTE(review): \todo is not defined in the visible part of the preamble
% (presumably the todonotes package) -- confirm it is loaded elsewhere.
% NOTE(review): \arnaudi prints the initials "AL" while \arnaud prints "AD".
\newcommand{\yuyang}[1]{\todo[color=blue!20]{{\bf YS:} #1}}
\newcommand{\james}[1]{\todo[color=blue!20]{{\bf JT:} #1}}
\newcommand{\arnaud}[1]{\todo[color=blue!20]{{\bf AD:} #1}}
\newcommand{\arnaudi}[1]{\todo[color=blue!20,inline]{{\bf AL:} #1}}
\newcommand{\valentin}[1]{\todo[color=blue!20]{{\bf VDB:} #1}}
\newcommand{\valentintxt}[1]{\textcolor{red}{\textbf{VDB}: #1}}
 \newcommand{\valentini}[1]{\todo[color=blue!20,inline]{{\bf VDB:} #1}}
 
% \newcommand{\aymeric}[1]{\todo[color=blue!20]{{\bf AD:} #1}}
% \newcommand{\francis}[1]{\todo[color=black!20]{{\bf FB:} #1}}
 \newcommand{\tcr}[1]{\textcolor{red}{#1}}
% \newcommand{\tcb}[1]{\textcolor{blue}{#1}}


% Replaces the earlier italic definition of \as; usable in text and in math.
\def\as{\ensuremath{\text{a.s.}}}
\def\dist{\operatorname{dist}}

% \sequence{X}[<index>][<set>]: with an empty <set> prints {X_index},
% otherwise {X_index, index \in set}.
\newcommandx\sequence[3][2=,3=]
{\ifthenelse{\equal{#3}{}}{\ensuremath{\{ #1_{#2}\}}}{\ensuremath{\{ #1_{#2}, \eqsp #2 \in #3 \}}}}

% \sequenceD: same arguments, but with a non-empty <set> prints (X)_{index \in set}.
\newcommandx\sequenceD[3][2=,3=]
{\ifthenelse{\equal{#3}{}}{\ensuremath{\{ #1_{#2}\}}}{\ensuremath{( #1)_{ #2 \in #3} }}}

% \sequencen{X}[<condition>]: {X_n, condition}; the condition defaults to n \in \N.
\newcommandx{\sequencen}[2][2=n\in\N]{\ensuremath{\{ #1_n, \eqsp #2 \}}}
% \sequenceDouble{X}{Y}[<index>][<set>]: a pair sequence.
% With an empty <set> prints {(X_index, Y_index)}, otherwise
% {(X_index, Y_index), index \in set}. The emptiness test is made on the set
% (#4), mirroring \sequence; testing the index instead produced a dangling
% "index \in" whenever an index was given without a set.
\newcommandx\sequenceDouble[4][3=,4=]
{\ifthenelse{\equal{#4}{}}{\ensuremath{\{ (#1_{#3},#2_{#3}) \}}}{\ensuremath{\{  (#1_{#3},#2_{#3}), \eqsp #3 \in #4 \}}}}
% \sequencenDouble{X}{Y}[<condition>]: {(X_n, Y_n), condition}; the condition
% defaults to n \in \N.
\newcommandx{\sequencenDouble}[3][3=n\in\N]{\ensuremath{\{ (#1_{n},#2_{n}), \eqsp #3 \}}}


% Text abbreviations. They end in a period and add no space of their own.
\newcommand{\wrt}{w.r.t.}
\newcommand{\Withoutlog}{w.l.o.g.}
\def\iid{i.i.d.}
\def\ifof{if and only if}
\def\eg{\textit{e.g.}}


% Bold red text.
\newcommand{\notered}[1]{{\textbf{\color{red}#1}}}


% Triple-bar norm: three auto-sized vertical bars on each side, tightened by
% small negative kerns.
\newcommand{\opnorm}[1]{{\left\vert\kern-0.25ex\left\vert\kern-0.25ex\left\vert #1
    \right\vert\kern-0.25ex\right\vert\kern-0.25ex\right\vert}}



%\def\Lip{\operatorname{Lip}}
% Calligraphic A and variants with a tilde or a space superscript.
% \sphere is defined further down the file; it is only needed at use time.
\def\generator{\mathcal{A}}
\def\generatort{\tilde{\mathcal{A}}}
\def\generatorsp{\generator^{\sphere^d}}
\def\generatorr{\generator^{\rset^d}}

\def\momentNoise{\mathrm{m}}
\def\bfe{\mathbf{e}}

\def\bfv{\mathbf{v}}
\def\ebf{\mathbf{e}}
\def\vbf{\mathbf{v}}


% \Id is defined a second time, identically, a few lines below.
\def\Id{\operatorname{Id}}
\def\Idbf{\mathbf{I}}

\def\tildetheta{\tilde{\theta}}

\def\calC{\mathcal{C}}


% Conditional expectation / variance / probability. The first argument is an
% optional subscript (empty by default).
\newcommandx{\CPE}[3][1=]{{\mathbb E}_{#1}\left[#2 \middle \vert #3  \right]} %%%% conditional expectation
\newcommandx{\CPELigne}[3][1=]{{\mathbb E}_{#1}[#2  \vert #3  ]} %%%% conditional expectation
\newcommandx{\CPEsq}[3][1=]{{\mathbb{E}^{1/2}}_{#1}\left[#2 \middle \vert #3  \right]} %%%% conditional expectation
% For \CPVar the third argument is typeset as a superscript on Var, not as a
% conditioning term.
\newcommandx{\CPVar}[3][1=]{\mathrm{Var}^{#3}_{#1}\left\{ #2 \right\}}
% \CPP[<sub>]{A}{B}: P(A | B); the subscript is only printed when non-empty.
\newcommand{\CPP}[3][]
{\ifthenelse{\equal{#1}{}}{{\mathbb P}\left(\left. #2 \, \right| #3 \right)}{{\mathbb P}_{#1}\left(\left. #2 \, \right | #3 \right)}}

% Script letters (\Ascr and \scrA are the same symbol).
\def\Ascr{\mathscr{A}}
\def\scrA{\mathscr{A}}
\def\scrB{\mathscr{B}}
\def\scrC{\mathscr{C}}

\def\barL{\bar{L}}

\def\YL{\mathbf{Y}}
\def\XEM{X}
\def\steps{\gamma}
\def\measSet{\mathbb{M}}

%\newcommand\Ent[2]{\mathrm{Ent}_{#1}\left(#2\right)}
% \osc[<sub>]{f}: osc with an optional subscript.
\newcommandx{\osc}[2][1=]{\mathrm{osc}_{#1}(#2)}

\def\Ybar{\bar{Y}}
\def\Id{\operatorname{Id}}
\def\IdM{\operatorname{I}_d}
% \EntDeux{<sub>}{<arg>}: \Ent with a subscript, applied to a bracketed argument.
\newcommand\EntDeux[2]{\Ent_{#1}\left[#2 \right]}
\def\Ltwo{\mathrm{L}^2}
\def\Lone{\mathrm{L}^1}
% Ratios of differentials d#1 / d#2 (with \pi fixed as the denominator in the
% "Pi" variants); "Ligne" variants use a slash instead of \frac.
\newcommand\densityPi[1]{\frac{\rmd #1}{\rmd \pi}}
\newcommand\densityPiLigne[1]{\rmd #1 /\rmd \pi}
\newcommand\density[2]{\frac{\rmd #1}{\rmd #2}}
\newcommand\densityLigne[2]{\rmd #1/\rmd #2}

% Symbols V with space superscripts and parameter subscripts. \Vsp and \Vr use
% \b and \c, which are defined below as the plain letters b and c.
\def\V{V}
\def\VD{V}
\def\Vsp{V^{\sphere^d}_{\b,\beta}}
\def\Vr{V^{\rset^d}_{\b,\c,\beta}}

\def\Prset{P^{\rset^d}}
\def\Psphere{P^{\sphere^d}}

\def\n{\mathrm{n}}
% Aliases for Greek letters under "V..." and "...V" names.
\def\Vpsi{\psi}
\def\Vkappa{\kappa}
\def\Vkappat{\tilde{\kappa}}
\def\Vchi{\chi}
\def\Vchit{\tilde{\chi}}
\def\Vphi{\phi}
\def\Vrho{\rho}
\def\psiV{\Vpsi}
\def\rhoV{\Vrho}
\def\phiV{\Vphi}
\def\fV{f}
\def\Vf{\fV}
\def\kappaVt{\tilde{\Vkappa}}
\def\kappaV{\Vkappa}
\def\chiV{\Vchi}
\def\chiVt{\Vchit}


% WARNING: \b (bar-under accent) and \c (cedilla) are LaTeX text-accent
% commands, and \a is used by the kernel inside tabbing. Redefining them as
% plain letters means accents written with \b or \c (including a UTF-8 "ç"
% in the body or bibliography) will no longer typeset correctly after this
% point.
\def\a{a}
\def\b{b}
\def\c{c}
\def\e{e}
\def\rU{\mathrm{r}}

\def\domain{\mathrm{D}}

\def\martfg{M^{f,g}}
\newcommand\Ddir[1]{D_{#1}}
% (#1)_+ with auto-sized parentheses.
\newcommand\maxplus[1]{\parenthese{#1}_+}
\def\Refl{\mathrm{R}}
\def\phibf{\pmb{\phi}}
\def\Gammabf{\mathbf{\Gamma}}


\def\transpose{\top}
% WARNING: \v is the caron text accent; the same caveat as for \b and \c
% applies. \w, \y and \z replace the bold-vector definitions made earlier in
% the file.
\def\v{v}
\def\w{w}
\def\y{y}
\def\z{z}
%%%% bar
% Barred symbols. Most exist under both a prefix spelling (\bX) and a suffix
% spelling (\Xb), one of which may simply forward to the other.
\def\bD{\bar{D}}
\def\bC{\bar{C}}
\def\brho{\bar{\rho}}
\def\bt{\bar{t}}
\def\bA{\bar{A}}
\def\bb{\overline{b}}
\def\bc{\bar{c}}
\def\bgamma{\bar{\gamma}}
\def\bU{\bar{U}}
\def\Ub{\bU}
\def\lambdab{\bar{\lambda}}
\def\blambda{\bar{\lambda}}
\def\blambdab{\bar{\lambda}}
\def\bv{\bar{v}}
\def\vb{\bv}
\def\yb{\bar{y}}
\def\by{\yb}
\def\Xb{\bar{X}}
\def\Yb{\bar{Y}}
\def\Gb{\bar{G}}
\def\Eb{\bar{E}}
\def\Tb{\bar{T}}
\def\taub{\bar{\tau}}

\def\bX{\bar{X}}
\def\bY{\bar{Y}}
\def\bG{\bar{G}}
\def\bE{\bar{E}}
\def\bT{\bar{T}}
\def\btau{\bar{\tau}}

\def\pib{\bar{\pi}}
\def\bpi{\pib}

% WARNING: this replaces the standard section-sign command \S with the letter S.
\def\S{S}

%%%% tilde
% Tilded symbols, again under prefix (\tX) and suffix (\Xt) spellings.
\def\tgamma{\tilde{\gamma}}
\def\tC{\tilde{C}}
\def\tB{\tilde{B}}
\def\tc{\tilde{c}}
\def\tvareps{\tilde{\vareps}}
\def\trho{\tilde{\rho}}
\def\tmsk{\tilde{\msk}}
\def\tW{\tilde{W}}
\def\tvarsigma{\tilde{\varsigma}}
\def\tv{\tilde{v}}
\def\vt{\tv}
\def\yt{\tilde{y}}
\def\ty{\yt}
\def\Mt{\tilde{M}}
\def\tM{\Mt}

\def\tx{\tilde{x}}
% Replaces the earlier definition of \xt (bold x with superscript t).
\def\xt{\tx}
\def\Xt{\tilde{X}}
\def\Yt{\tilde{Y}}
\def\Gt{\tilde{G}}
\def\Et{\tilde{E}}
\def\Tt{\tilde{T}}
\def\St{\tilde{S}}
\def\taut{\tilde{\tau}}

\def\tX{\tilde{X}}
\def\tY{\tilde{Y}}
\def\tG{\tilde{G}}
\def\tE{\tilde{E}}
\def\tT{\tilde{T}}
\def\tS{\tilde{S}}
\def\ttau{\tilde{\tau}}


% Barred symbols again: the first entries of each group repeat, with identical
% bodies, definitions made earlier in the file; \Sb, \Hb, \Nb, \bS and \bH are new.
\def\Xb{\bar{X}}
\def\Yb{\bar{Y}}
\def\Gb{\bar{G}}
\def\Eb{\bar{E}}
\def\Tb{\bar{T}}
\def\Sb{\bar{S}}
\def\taub{\bar{\tau}}
\def\Hb{\bar{H}}
\def\Nb{\bar{N}}


\def\bX{\bar{X}}
\def\bY{\bar{Y}}
\def\bG{\bar{G}}
\def\bE{\bar{E}}
\def\bT{\bar{T}}
\def\btau{\bar{\tau}}
\def\bS{\bar{S}}
\def\bH{\bar{H}}
%\def\bN{\bar{N}}

%%%%%%%%

% Miscellaneous symbol aliases.
\def\mgU{\mathrm{m}_{\nabla U}}
\def\MintDrift{I}
\def\CU{C_U}
\def\RU{R_1}
\def\RV{R}
\def\Reps{R_{\epsilon}}
\def\Resp{\Reps}
\def\veps{\varepsilon}

% \sphere is the sans-serif S (\mss); it is used by \generatorsp, \Vsp and
% \Psphere.
\def\sphere{\mss}

\def\nablaUt{\overline{\nabla U}}
\def\measureSphere{\nu^d}

\def\etaU{\eta}
\def\epsilonU{\epsilon}

% \Jac and \jac print the same operator.
\def\Jac{\operatorname{Jac}}
\def\jac{\operatorname{Jac}}
\def\sign{\operatorname{sign}}
\def\rate{\lambda_{\mathrm{r}}}







\def\sigmaS{\sigma^2}

% \ensemble{x}{condition}: {x : condition} with auto-sized braces;
% \ensembleLigne is the fixed-size version and \set is an alias of \ensemble.
\newcommand{\ensemble}[2]{\left\{#1\,:\eqsp #2\right\}}
\newcommand{\ensembleLigne}[2]{\{#1\,:\eqsp #2\}}
\newcommand{\set}[2]{\ensemble{#1}{#2}}

\def\rmD{\mathrm{D}}%%\rmd is already taken
% NOTE(review): \mrd was defined earlier as the lowercase \mathrm{d}; from here
% on it prints an uppercase D instead -- confirm this override is intended.
\def\mrd{\mathrm{D}}
\def\mrc{\mathrm{C}}

\def\diag{\Delta_{\rset^d}}

%\def\lyap{W}
% \coupling{<mu>}{<nu>}: \Gamma applied to the two given arguments. The
% arguments were previously ignored and \mu,\nu hard-coded, so calls with
% \mu and \nu print exactly as before.
\newcommand\coupling[2]{\Gamma(#1,#2)}
\def\supp{\mathrm{supp}}
\def\tpi{\tilde{\pi}}
% Overline over the argument.
\newcommand\adh[1]{\overline{#1}}

\def\ACb{\mathrm{AC}_{\mathrm{b}}}

\def\opK{\mathrm{K}}

% Slash fractions: \fracm uses an auto-sized slash, \fraca a plain slash, and
% \fracaa parenthesises the denominator.
\newcommand{\fracm}[2]{\left. #1 \middle / #2 \right.}
\newcommand{\fraca}[2]{ #1  / #2 }
\newcommand{\fracaa}[2]{ #1  / (#2) }

\newcommand{\complementary}{\mathrm{c}}

% \renewcommand{\geq}{\geqslant}
% \renewcommand{\leq}{\leqslant}
\def\poty{H}
\def\diam{\mathrm{diam}}
\def\talpha{\tilde{\alpha}}
% \def\Leb{\mathrm{Leb}}
\def\Leb{\lambda}
% \iintD{a}{b}: the set {a,...,b}.
\newcommand{\iintD}[2]{\{#1,\ldots,#2\}}
\def\interior{\mathrm{int}}
% WARNING: this replaces the standard math relation \iff with the plain words,
% padded by spaces.
\def\iff{ if and only if }

% Spellings of \varepsilon; \bvareps replaces the earlier \newcommand version.
\def\vareps{\varepsilon}
\def\bvareps{\bar{\varepsilon}}
\def\varespilon{\varepsilon}
\def\si{\text{ if } }
\def\proj{\operatorname{proj}}
\def\projd{\operatorname{proj}^{\msd}}
\def\Phibf{\mathbf{\Phi}}
\def\Psibf{\mathbf{\Psi}}

\def\rker{\mathrm{R}}
\def\kker{\mathrm{K}}

\def\VEa{V}
\def\KUa{K}
% Divergences: KL(#1 | #2) and J(#1, #2). "Ligne" variants use fixed-size
% parentheses. The bar in \KL is a plain "|" and does not stretch with the
% parentheses.
\newcommandx{\KL}[2]{\operatorname{KL}\left( #1 | #2 \right)}
\newcommandx{\KLsqrt}[2]{\operatorname{KL}^{1/2}\left( #1 | #2 \right)}
\newcommandx{\Jef}[2]{\operatorname{J}\left( #1 , #2 \right)}
\newcommandx{\JefLigne}[2]{\operatorname{J}( #1 , #2 )}
\newcommandx{\KLLigne}[2]{\operatorname{KL}( #1 | #2 )}

% \gaStep was written without a body, so TeX took the following "\def\QKer" as
% its parameter text and "{Q}" as its body, leaving \QKer undefined.
% NOTE(review): the intended body of \gaStep is not recoverable from this file;
% \gamma is assumed by analogy with \steps (\gamma) and \GaStep (\Gamma) --
% confirm.
\def\gaStep{\gamma}
\def\QKer{Q}
\def\Tg{\mathcal{T}_{\gamma}}
\def\Tk{\mathcal{T}_{k}}
\def\Tn{\mathcal{T}_{k}}
\def\Tnplusun{\mathcal{T}_{k+1}}
\def\mcurb{m}
%\newcommand{\coupling}[1]{\Gamma\left( #1 \right)}
\newcommand{\couplingLine}[1]{\Gamma( #1 )}
\def\distance{\mathbf{d}}
\newcommandx{\wasserstein}[3][1=\distance,3=]{\mathbf{W}_{#1}^{#3}\left(#2\right)}
\newcommandx{\wassersteinLigne}[3][1=\distance,3=]{\mathbf{W}_{#1}^{#3}(#2)}
\newcommandx{\wassersteinD}[1][1=\distance]{\mathbf{W}_{#1}}
\newcommandx{\wassersteinDLigne}[1][1=\distance]{\mathbf{W}_{#1}}


% Upright single-letter symbols (R, Q, S, K, L), a tilde-accented K and a
% K with superscript P.
\def\Rcoupling{\mathrm{R}}
\def\Qcoupling{\mathrm{Q}}
\def\Sker{\mathrm{S}}
\def\Kcoupling{\mathrm{K}}
\def\tKcoupling{\tilde{\mathrm{K}}}
\def\Lcoupling{\mathrm{L}}
\def\Kcouplingproj{\mathrm{K}^P}
% Another alias for \varepsilon.
\def\vepsilon{\varepsilon}


% \defEnsE{#1}{#2} forwards to \ensemble (defined outside this excerpt).
\newcommand{\defEnsE}[2]{\ensemble{#1}{#2}}
% Wide-tilde \PE / \PP (both defined outside this excerpt) with
% subscript #1, superscript #3 and bracketed argument #2.
\newcommand{\expeMarkovTildeD}[3]{\widetilde{\PE}_{#1}^{#3} \left[ #2 \right]}
\newcommand{\probaMarkovTildeD}[3]{\widetilde{\PP}_{#1}^{#3} \left[ #2 \right]}
\def\coordtildex{\mathrm{w}}
\def\PPtilde{\widetilde{\PP}}
\def\PEtilde{\widetilde{\PE}}
\def\transfrr{\mathrm{F}}
% \Delta indexed by \msx (\msx is defined outside this excerpt);
% \Deltar is an alias of \diagSet.
\def\diagSet{\Delta_{\msx}}
\def\Deltar{\diagSet}
\def\complem{\operatorname{c}}
\def\alphar{\alpha}
% Tilde-accented x, z, y.
\def\tildex{\tilde{x}}
\def\tildez{\tilde{z}}
\def\tildey{\tilde{y}}
% Upright "a", and symbols carrying the superscript (a).
\def\ar{\mathrm{a}}
\def\Kr{\mathsf{K}}
\def\Kar{K^{(\mathrm{a})}}
% Upright (\mathrm) and italic (\mathit) capital letters.
\def\Xr{\mathrm{X}}
\def\Yr{\mathrm{Y}}
\def\Xrd{\mathit{X}}
\def\Yrd{\mathit{Y}}
\def\Zr{\mathrm{Z}}
\def\Ur{\mathrm{U}}
% \sigma^2 and its k-indexed variant.
\def\sigmaD{\sigma^2}
\def\sigmakD{\sigma^2_k}
% Bold \varphi with subscript \sigma^2_{#1}; #1 is optional and defaults to
% empty (xargs syntax).
\newcommandx{\phibfs}[1][1=]{\pmb{\varphi}_{\sigmaD_{#1}}}
% \vphibf and \varphibf expand to the same bold \varphi.
\def\vphibf{\pmb{\varphi}}
\def\varphibf{\pmb{\varphi}}
\def\phibfvs{\pmb{\varphi}_{\varsigma^2}}
% \mct is defined outside this excerpt.
\def\funreg{\mct}
\def\kappar{\varpi}
% NOTE: this overrides LaTeX's built-in \Pr operator with a sans-serif P.
\def\Pr{\mathsf{P}}
\def\Par{P^{(\mathrm{a})}}
\def\Qr{\mathsf{Q}}
\def\Qar{Q^{(\mathrm{a})}}
% \msa is defined outside this excerpt.
\def\eventA{\msa}

% \B is defined outside this excerpt.
\def\borelSet{\B}
% Upright E and e; "T" set as an operator name (presumably the transpose
% superscript -- confirm).
\def\Er{\mathrm{E}}
\def\er{\mathrm{e}}
\def\transp{\operatorname{T}}

% \sequenceg{#1}[#2][#3] (xargs syntax; #2 and #3 are optional and default
% to empty):
%   - when #3 is empty, typesets ( #1_{#2} );
%   - otherwise typesets ( #1_{#2} )_{ #2 \geq #3 }.
% \ensuremath makes it usable in both text and math mode. \ifthenelse and
% \equal come from the ifthen package (loaded outside this excerpt).
\newcommandx\sequenceg[3][2=,3=]
{\ifthenelse{\equal{#3}{}}{\ensuremath{( #1_{#2})}}{\ensuremath{( #1_{#2})_{ #2 \geq #3}}}}


% Greek-letter shorthands.
\def\indiar{\iota}
\def\rated{\chi}
\def\transar{\tau}
% Tilde-accented \mcf (\mcf is defined outside this excerpt).
\def\filtrationTilde{\tilde{\mcf}}

% Upright "d" and "c" labels.
\def\discrete{\mathrm{d}}
\def\continuous{\mathrm{c}}


% Symbols carrying the superscript (a).
\def\Xar{X^{(\mathrm{a})}}
\def\Yar{Y^{(\mathrm{a})}}
\def\War{W^{(\mathrm{a})}}
\def\Xiar{\Xi^{(\mathrm{a})}}
\def\mcfar{\mcf^{(\mathrm{a})}}

% Tilde-accented versions of X^{(a)} and Y^{(a)}.
\def\Xart{\tilde{X}^{(\mathrm{a})}}
\def\Yart{\tilde{Y}^{(\mathrm{a})}}


% Aliases built on \Kcoupling and \Rcoupling (upright K and R), plus
% tilde-accented variants.
\def\Kker{\Kcoupling}
\def\KkerD{\tilde{\Kcoupling}}
\def\Rker{\Rcoupling}
\def\tRker{\tilde{\Rker}}
% Upright P and its arrow-decorated variants: right arrow (\Pkerf, and
% \Pkerfou with an "OU" subscript) and left arrow (\Pkerb).
\def\Pker{\mathrm{P}}
\def\Pkerf{\overrightarrow{\mathrm{P}}}
\def\Pkerfou{\overrightarrow{\mathrm{P}}_{\mathrm{OU}}}
\def\Pkerb{\overleftarrow{\mathrm{P}}}
% Left-arrow-decorated upright R and S.
\def\Rkerb{\overleftarrow{\mathrm{R}}}
\def\Skerb{\overleftarrow{\mathrm{S}}}
% Upright Q, L, G. \Lker and \rmL expand to the same symbol.
\def\Qker{\mathrm{Q}}
\def\Lker{\mathrm{L}}
\def\rmL{\mathrm{L}}
\def\rmG{\mathrm{G}}
% Bold \mu; \bm comes from the bm package (loaded outside this excerpt).
\def\bfmu{\bm{\mu}}

% Letter W and indexed variants W_1, W_2, W_3.
\def\VlyapD{W}
\def\VlyapDun{W_1}
\def\VlyapDdeux{W_2}
\def\VlyapDtrois{W_3}
% \newcommandx{\distV}[1][1=W]{\mathbf{d}_{#1}}
% \distV[#1]: bold W with subscript #1 (default \bfc, defined outside this
% excerpt). The earlier bold-d definition is kept above, commented out.
\newcommandx{\distV}[1][1=\bfc]{\mathbf{W}_{#1}}
% \distVdeux[#1]: bold d with subscript #1 (default W_2).
\newcommandx{\distVdeux}[1][1=W_2]{\mathbf{d}_{#1}}

% Left arrow symbol.
\def\inv{\leftarrow}
% \couplage{#1}{#2} typesets \Pi(#1,#2).
\newcommand{\couplage}[2]{\Pi(#1,#2)}
% Typewriter "m" and decorated variants. \mtt and \ttm expand to the same
% symbol, as do \mttplus and \ttmplus.
\def\mtt{\mathtt{m}}
\def\mttzero{\mathtt{m}_0}
\def\tmtt{\tilde{\mathtt{m}}}
\def\ttm{\mathtt{m}}
\def\mttplus{\mathtt{m}^{+}}
\def\mttplusun{\mathtt{m}_1^{+}}
\def\mttplusdeux{\mathtt{m}_2^{+}}
\def\ttmplus{\mathtt{m}^{+}}
% Typewriter "a".
\def\cconst{\mathtt{a}}
% Indexed and tilde-accented R.
\def\Run{R_1}
\def\Rdeux{R_2}
\def\Rtrois{R_3}
\def\Rquatre{R_4}
\def\tR{\tilde{R}}
\def\tmttplus{\tilde{\mtt}^+}
% \tup{#1}: upright text, a short alias for \textup.
\newcommand{\tup}[1]{\textup{#1}}
\def\Fix{\operatorname{Fix}}
% \stopping{#1}: \T with subscript (\msc, n_0) and superscript (#1);
% \T and \msc are defined outside this excerpt.
\newcommand{\stopping}[1]{\T_{\msc,\mathtt{n}_0}^{(#1)}}
\def\wass{\mathcal{W}}
% Bold d (same expansion as \distance).
\def\distY{\mathbf{d}}
\def\Xibf{\boldsymbol{\Xi}}
% \rho with an upright "max" subscript, and \rho under a right arrow.
\def\rhomax{\rho_{\rm{max}}}
\def\rhof{\overrightarrow{\rho}}
% Script B; \mathscr is provided by mathrsfs (loaded in the preamble).
\def\familydrift{\mathscr{B}}

% Bold W with subscripts \bfc_1, \bfc_2, \bfc_3 (\bfc is defined outside
% this excerpt).
\def\wasscun{\mathbf{W}_{\bfc_1}}
\def\wasscdeux{\mathbf{W}_{\bfc_2}}
\def\wassctrois{\mathbf{W}_{\bfc_3}}

% \mu with subscript \msz (defined outside this excerpt); \muz is an alias.
\def\loiz{\mu_{\msz}}
\def\muz{\loiz}
\def\funH{H}

% NOTE: from here on \doteq prints a plain "=" instead of the dotted
% equality sign.
\renewcommand{\doteq}{=}
% "I" set as an operator name, with subscript d.
\newcommand{\Idd}{\operatorname{I}_d}


 
% NOTE: nothing between \makeatletter and \makeatother below uses "@" in a
% command name, so the catcode switch has no effect on these definitions.
\makeatletter

% Upright "erf" operator (\DeclareMathOperator is provided by amsmath's
% amsopn).
\DeclareMathOperator\erf{erf}

% Names for theorem-like environments. \providecommand only defines a
% name that is not already defined, so existing definitions are kept.
\providecommand{\assumptionname}{Assumption}
\providecommand{\lemmaname}{Lemma}
\providecommand{\propositionname}{Proposition}
\providecommand{\remarkname}{Remark}
\providecommand{\corollaryname}{Corollary}
\providecommand{\theoremname}{Theorem}


\makeatother

\begin{document}
\title{Conditional Simulation Using Diffusion Schr\"{o}dinger Bridges}
\author[1]{Yuyang~Shi}
\author[2]{Valentin~De~Bortoli}
\author[1]{George~Deligiannidis}
\author[1]{Arnaud~Doucet}

\affil[1]{%
    Department of Statistics\\
    University of Oxford, UK
}

\affil[2]{%
   ENS, PSL University, Paris, France 
}

\maketitle

\begin{abstract}
  Denoising diffusion models have recently emerged as a powerful class of generative
  models. They provide state-of-the-art results, not
  only for unconditional simulation, but also when used to solve conditional
  simulation problems arising in a wide range of inverse problems. A limitation of these models is that they are
  computationally intensive at generation time as they require simulating a
  diffusion process over a long time horizon. When performing unconditional
  simulation, a Schr\"odinger bridge formulation of generative modeling leads to
  a theoretically grounded algorithm shortening generation time which is
  complementary to other proposed acceleration techniques.  We extend the
  Schr\"odinger bridge framework to conditional simulation. We demonstrate this
  novel methodology on various applications including image super-resolution, 
  optimal filtering for state-space models and the refinement of pre-trained networks. Our code can be found at \href{https://github.com/vdeborto/cdsb}{\texttt{https://github.com/vdeborto/cdsb}}. 
\end{abstract}


\section{Introduction}\label{sec:intro}
\emph{Score-Based Generative Models} (SGMs), also known as denoising diffusion models, are a class of generative models that have recently become very popular as they provide state-of-the-art performance; see \eg~%
\cite{chen2020wavegrad,ho2020denoising,song2020score,saharia2021image,nichol2021beatgans}.
Existing SGMs proceed as follows. First, noise is gradually added to the data using a time-discretized diffusion so
as to provide a sequence of perturbed data distributions eventually
approximating an easy-to-sample reference distribution, typically a multivariate Gaussian. Second, one approximates the corresponding time-reversed
denoising diffusion using neural network approximations of the logarithmic
derivatives of the perturbed data distributions known as scores; these
approximations are obtained using denoising score matching techniques
\citep{vincent2011connection, hyvarinen2005estimation}. Finally, the generative
model is obtained by initializing this reverse-time process using samples from
the reference distribution
\citep{ho2020denoising,song2020score}.

In many applications, one is not interested in unconditional simulation but the generative model is used as an implicit prior
$\pdata (x)$ on some parameter $X$ (\eg~an image) in a Bayesian
inference problem with a likelihood function $g(\yobs|x)$ for observation
$Y=\yobs$. SGMs have been extended to address such tasks, see
\eg~\cite{song2020score,saharia2021image,batzolis2021conditional,tashiro2021csdi}. In
this conditional simulation case, one only requires being able to simulate from
the joint distribution of data and synthetic observations
$(X,Y)\sim \pdata (x)g(y|x)$. As in the unconditional case, the
time-reversal of the noising diffusion is approximated using neural network
estimates of its scores, the key difference being that this network admits not
only $x$ but also $y$ as an input. Sampling from the posterior
$p(x|\yobs) \propto \pdata (x) g(\yobs|x)$ is achieved by
simulating the time-reversal using the scores evaluated
at $Y=\yobs$.

However, performing unconditional or conditional simulation using SGMs is
computationally expensive as, to obtain a good approximation of the
time-reversed diffusion, one needs to run the forward noising diffusion long enough
to converge to the reference distribution. Many techniques have been proposed to
accelerate simulation including \eg~knowledge distillation
\citep{luhman2021knowledge,salimans2022progressive}, non-Markovian forward process and subsampling
\citep{song2020denoising}, optimized noising diffusions and improved numerical solvers
\citep{jolicoeur2021gotta,dockhorn2021score,kingma2021variational,watson2022learning}. In
the unconditional scenario, reformulating generative modeling as a \schro bridge
(SB) problem provides a principled theoretical framework to accelerate
simulation time complementary to most other acceleration techniques
\citep{debortoli2021neurips}.
The SB solution is
the finite time process which is the closest in terms of Kullback--Leibler (KL)
discrepancy to the forward noising process used by SGMs but admits as marginals
the data distribution at time $t=0$ and the reference distribution at time
$t=T$. The time-reversal of the SB thus enables unconditional generation from
the data distribution.
However, the use of the SB formulation has not yet been developed in the context of conditional simulation. 

The contributions of this paper are as follows.
\begin{itemize}
\item We develop conditional SB (CSB), an original SB formulation for conditional simulation.    
\item By adapting the Diffusion SB algorithm of \cite{debortoli2021neurips} to
  our setting, we propose an iterative algorithm, Conditional Diffusion SB
  (CDSB), to approximate the solution to the CSB problem.

\item CDSB performance is demonstrated on various examples. In particular,
  we propose the first application of score-based techniques to optimal
  filtering in state-space models.
\end{itemize}
 

 
\section{Score-Based Generative Modeling}\label{sec:SGM}

    \subsection{Unconditional Simulation}
    \label{sec:discr-sett-mark}
    Assume we are given samples from some data distribution with positive
    density\footnote{We assume here that all
      distributions admit a positive density w.r.t. Lebesgue measure.} $\pdata$ on $\mathbb{R}^d$. Our aim is to
    provide a generative model to sample new data from $\pdata$.  SGMs 
    achieve this as follows. We gradually add noise to data samples, i.e. we
    consider a Markov chain $x_{0:N}=\{x_k\}_{k=0}^N \in \mcx = (\rset^d)^{N+1}$
    of joint density
\begin{equation}\label{eq:mu_forward}
           \textstyle{p(x_{0:N}) = p_0(x_0) \prod_{k=0}^{N-1}p_{k+1|k}(x_{k+1}|x_{k}),}
\end{equation}
where $p_0=\pdata$ and $p_{k+1|k}$ are Markov transition densities
inducing the following marginal densities
$p_{k+1}(x_{k+1})=\int
p_{k+1|k}(x_{k+1}|x_{k})p_{k}(x_{k})\textrm{d}x_{k}$. These transition densities
are selected such that $p_N(x_N) \approx \pref(x_N)$ for large $N$, where
$\pref$ is an easy-to-sample \emph{reference} density. In practice we set
$\pref(x_N)=\mathcal{N}(x_N;0,\Id)$, while 
$p_{k+1|k}(x_{k+1}|x_{k})=\mathcal{N}(x_{k+1};x_k-\gamma_{k+1}x_k,2 \gamma_{k+1} \Id)$
for $\gamma_k>0$, $\gamma_k \ll 1$ so $x_{0:N}$ is a time-discretized Ornstein--Uhlenbeck diffusion (see supplementary for details).

The main idea behind SGMs is to obtain samples from $p_0$ by exploiting the backward decomposition of \eqref{eq:mu_forward}  
\begin{equation}\label{eq:timereversal}
  \textstyle{
    p(x_{0:N}) = p_N(x_N)\prod_{k=0}^{N-1}p_{k|k+1}(x_{k}|x_{k+1}),
    }
\end{equation}
i.e.\ by sampling $X_N\sim p_N(x_N)$ then sampling $X_k\sim p_{k|k+1}(x_k|X_{k+1})$
for $k \in \{N-1, \dots, 0\}$, we obtain $X_0 \sim p_0(x_0)$.  In practice, we know neither $p_N$ nor the backward transition densities $p_{k|k+1}$ for
$k\in\{0,\dots,N-1\}$ and therefore this ancestral sampling procedure cannot be implemented
exactly. We thus approximate $p_N$ by $\pref$ and $p_{k|k+1}$ using a Taylor expansion approximation
\begin{equation}
\textstyle{p_{k|k+1}(x_k|x_{k+1})\approx  \mathcal{N}(x_k;B_{k+1}(x_{k+1}), 2\gamma_{k+1}  \Id),}
\end{equation}
where $B_{k+1}(x)=x+\gamma_{k+1} \{x + 2 \nabla \log p_{k+1}(x)\}$. Finally, we
approximate the score terms $\nabla\log p_{k}$ using denoising score matching
methods \citep{hyvarinen2005estimation,vincent2011connection,song2020score}. Since $p_{k}(x_{k})=\int p_{0}(x_{0})p_{k|0}(x_{k}|x_{0})\textrm{d}x_{0}$, it
follows that $\nabla \log p_{k}(x_{k})=\mathbb{E}[\nabla_{x_{k}} \log
        p_{k|0}(x_{k}|X_0)]$, 
where the expectation is w.r.t.\ the distribution of $X_0$ given $x_{k}$.  We learn a neural network approximation
$\mathbf{s}_{\theta^\star}(k,x_k) \approx \nabla \log p_{k}(x_k)$ by minimizing
w.r.t. $\theta$ the loss
\begin{equation}\label{eq:scorematching}
\textstyle{\mathbb{E}[\sum_{k=1}^{N} \lambda_k ||\mathbf{s}_\theta(k, X_{k})-\nabla_{x_k} \log p_{k|0}(X_{k}|X_{0})||^2] },
\end{equation}
where $\lambda_k>0$ is a weighting coefficient \citep{ho2020denoising,song2020score} and the expectation is w.r.t. $p(x_{0:N})$. Once we have estimated
$\theta^\star$ from noisy data, we start by first sampling
$X_N \sim \pref(x_N)$ and then sampling
$X_k \sim \hat{p}_{k|k+1}(x_k|X_{k+1})$ for $\hat{p}_{k|k+1}$ as in
$p_{k|k+1}$ but with $\nabla \log p_{k+1}(X_{k+1})$ replaced by
$\mathbf{s}_{\theta^\star}(k+1,X_{k+1})$.  Under regularity assumptions, the resulting $X_0$ can be
shown to be approximately distributed according to $p_0=\pdata$ if $p_N\approx \pref$ \citep[Theorem~1]{debortoli2021neurips}.
        
\subsection{Conditional Simulation}\label{sec:condSGMs}
We now consider the scenario where we have samples from $p_0=\pdata$ and are
interested in generating samples from the posterior
$p(x|\yobs) \propto p_0(x) g(\yobs|x)$ for some observation $Y=\yobs \in
\mcy$. Here it is assumed that it is possible to sample synthetic observations from
$Y|(X=x)\sim g(y|x)$ but the expression of $g(y|x)$ might not be available.

In this case, conditional SGMs (CSGMs) proceed as follows; see
\eg~\cite{saharia2021image,batzolis2021conditional,li2022srdiff,tashiro2021csdi}. For any
realization $Y=y$, we consider a Markov chain of the form \eqref{eq:mu_forward}
but initialized using $X_0 \sim p(x|y)$ instead of $p_0(x)$. Obviously it is not
possible to simulate this chain but this will not prove necessary. This chain
induces for $k\geq 0$ the marginals denoted $p_{k+1}(x_{{k+1}}|y)$ which satisfy
$p_{k+1}(x_{{k+1}}|y)=\int
p_{{k+1}|k}(x_{k+1}|x_{k})p_{k}(x_{k}|y)\textrm{d}x_{k}$ for $p_0(x_0|y)=p(x_0|y)$. Similarly to the
unconditional case, to perform approximate ancestral sampling from this Markov
chain, we need to sample from
$p_{k|k+1}(x_{k}|x_{k+1},y)\approx \mathcal{N}(x_{k};B_{k+1}(x_{k+1},y), 2\gamma_{k+1} \Id)$ where 
$B_{k+1}(x,y)=x + \gamma_{k+1} \{x+2 \nabla \log p_{k+1}(x|y)\}$. We can again
estimate these score terms using
\begin{equation}
        \nabla \log p_{k}(x_{k}|y)=\mathbb{E}[\nabla_{x_{k}} \log p_{k|0}(x_{k}|X_{0})],
\end{equation}
where the expectation is w.r.t.\ the distribution of $X_0$ given
$(X_{k},Y)=(x_{k},y)$.  In this case, we learn again a neural network approximation
$\mathbf{s}_{\theta^\star}(k,x_k,y) \approx \nabla \log p_{k}(x_k|y)$ by minimizing
w.r.t. $\theta$ the loss
\begin{equation}\label{eq:scorematchingcond}
        \textstyle{\mathbb{E}[\sum_{k=1}^{N} \lambda_k ||\mathbf{s}_\theta(k, X_{k},Y)-\nabla_{x_k} \log p_{k|0}(X_{k}|X_{0})||^2] },
\end{equation}
where the expectation is w.r.t. $p(x_{0:N})g(y|x_0)$ which we can sample
from.  Once the neural network is trained, we simulate from the
posterior $p(x|\yobs) \propto p_0(x) g(\yobs|x)$ for any observation $Y=\yobs$
 as follows: sample first $X_N \sim \pref(x_N)$ and then         $X_k \sim \hat{p}_{k|k+1}(x_k|X_{k+1},\yobs)$ where this density is
similar to $p_{k|k+1}(x_k|X_{k+1},\yobs)$ but with
$\nabla \log p_{k+1}(X_{k+1}|\yobs)$ replaced by
$\mathbf{s}_{\theta^\star}(k+1,X_{k+1},\yobs)$. The resulting sample $X_0$ will
be approximately distributed according to $p(x|\yobs)$. This scheme can
be seen as an amortized variational inference
procedure.



\begin{figure*}
\centering
\subfloat[\label{fig:sbdiagram}]{
\resizebox{.46\textwidth}{!}{
\begin{tikzpicture}[%
  % common options for blocks:
  block/.style = {rounded corners, fill=gray!10, align=center, text width=2cm, minimum height=3cm, inner sep=0}]


% Left column: density label (note1) and, below it, a "block" tray with an image.
\node[text width=2cm,align=center] (note1) {$\pdata(x)$}; 
\node[block,below of=note1,yshift=-0.8cm] (tray1) {\includegraphics[width=.75\textwidth]{Plots/Diagram/2u/x.png}};
% Right column: same layout, positioned to the right of note1.
\node[right of=note1,xshift=6cm,text width=2cm,align=center] (note2) {$\pref(x)$};
\node[block,below of=note2,yshift=-0.8cm] (tray2) {\includegraphics[width=.75\textwidth]{Plots/Diagram/2u/x_y.png}};

% Wide light-red band with triangle tips at both ends, joining the two trays.
\draw[draw=red!30,line width=2cm,{Triangle[width=3cm,length=10pt]}-{Triangle[width=3cm,length=10pt]}] (tray1)  -- (tray2); 
% Blue arrow; "latex-" puts the tip at the start of the path (the tray1 side),
% so it points from tray2 towards tray1. Three images sit at its midpoint.
\draw[draw=blue,latex-, thick] ([xshift=-0.2cm] tray1.east)  -- ([xshift=0.2cm] tray2.west) 
        node[midway,text=black]{\includegraphics[width=30pt]{Plots/Diagram/2u/x_5.png} \, \includegraphics[width=30pt]{Plots/Diagram/2u/x_10.png} \, \includegraphics[width=30pt]{Plots/Diagram/2u/x_15.png}};

% Empty, unnamed node placed 41pt below tray1.east.
% NOTE(review): presumably pads the bounding box so that panel (a) matches
% the height of panel (b) -- confirm.
\node[] () [below = 41pt] at (tray1.east) {};
\end{tikzpicture} 
}
} \quad
\subfloat[\label{fig:csbdiagram}]{
\resizebox{.46\textwidth}{!}{
\begin{tikzpicture}[
  % common options for blocks:
  block/.style = {rounded corners, fill=gray!10, align=center, text width=2cm, minimum height=3cm, inner sep=0}]


% Left column: joint-density label, a dark background tray (tray1), and two
% lighter sub-blocks drawn on top of it: the x part (tray1x) and the y part (tray1y).
\node[text width=2cm,align=center] (note1) {$\pjoin(x,y)$}; 
\node[block,below of=note1,yshift=-0.8cm, fill=gray!60] (tray1) {};
\node[block,below of=note1,yshift=-0.3cm,minimum height=1.9cm, text width=1.8cm] (tray1x) {\small{$p(x|y)$}\\\includegraphics[width=.75\textwidth]{Plots/Diagram/2c/x.png}};
\node[block,below of=note1,yshift=-1.8cm, fill=gray!20,minimum height=0.9cm, text width=1.8cm] (tray1y) {\small{$\pobs(y)$}\\\includegraphics[width=.25\textwidth]{Plots/Diagram/2c/y.png}};

% Right column: same three-node layout for the joint reference density.
\node[right of=note1,xshift=6cm,text width=2cm,align=center] (note2) {$\pjref(x,y)$};
\node[block,below of=note2,yshift=-0.8cm, fill=gray!60] (tray2) {};
\node[block,below of=note2,yshift=-0.3cm,minimum height=1.9cm, text width=1.8cm] (tray2x) {\small{$\pref(x|y)$}\\\includegraphics[width=.75\textwidth]{Plots/Diagram/2c/x_y.png}};
\node[block,below of=note2,yshift=-1.8cm, fill=gray!20,minimum height=0.9cm, text width=1.8cm] (tray2y) {\small{$\pobs(y)$}\\\includegraphics[width=.25\textwidth]{Plots/Diagram/2c/y.png}};

% Curved blue arrow on the right-hand side, from the y sub-block to the x
% sub-block ("-latex" puts the tip at the end of the path, i.e. at tray2x).
\draw[draw=blue,-latex, thick] ([xshift=-0.15cm] tray2y.east)  to[bend right]  ([shift=({-0.15cm,-0.1cm})] tray2x.east); 

% Wide light-red double-headed band between the two trays, with three grey
% boxes along it (at 22.5%, 50% and 77.5% of the path), each holding the y image.
\draw[draw=red!30,line width=2.5cm,{Triangle[width=3.2cm,length=10pt]}-{Triangle[width=3.2cm,length=10pt]}] ([yshift=-0.1cm] tray1.east)  -- ([yshift=-0.1cm] tray2.west) 
        node[pos=0.225,rounded corners, text=black,fill=black!10,minimum height=2.25cm,text width=1cm,align=center]{
        \vspace{1.45cm}
        \\
        \includegraphics[width=12pt]{Plots/Diagram/2c/y.png}}
        node[pos=0.5,rounded corners, text=black,fill=black!10,minimum height=2.25cm,text width=1cm,align=center]{
        \vspace{1.45cm}
        \\
        \includegraphics[width=12pt]{Plots/Diagram/2c/y.png}}
        node[pos=0.775,rounded corners, text=black,fill=black!10,minimum height=2.25cm,text width=1cm,align=center]{
        \vspace{1.45cm}
        \\
        \includegraphics[width=12pt]{Plots/Diagram/2c/y.png}};

% Blue arrow between the x sub-blocks; "latex-" puts the tip on the tray1x
% side. Three x images are placed along it.
\draw[draw=blue,latex-, thick] ([shift=({-0.2cm,-0.2cm})] tray1x.east)  -- ([shift=({0.2cm,-0.2cm})] tray2x.west) 
        node[pos=0.255,text=black]{\includegraphics[width=30pt]{Plots/Diagram/2c/x_5.png}}
        node[pos=0.5,text=black]{\includegraphics[width=30pt]{Plots/Diagram/2c/x_10.png}}
        node[pos=0.745,text=black]{\includegraphics[width=30pt]{Plots/Diagram/2c/x_15.png}};

% Thin double-headed arrow between the y sub-blocks, labelled "fixed y" below.
\draw[draw=black!80,latex-latex] ([shift=({0.1cm,-0.15cm})] tray1y.east)  -- ([shift=({-0.1cm,-0.15cm})] tray2y.west)
        node[midway, below, yshift=-0.125cm] {\footnotesize{fixed $y$}};

% Empty outlined rounded rectangle anchored on the bottom edge of the current
% bounding box.
% NOTE(review): presumably frames the y row across the panel -- confirm.
\node[above,rounded corners,draw=black!80,text width=9.25cm,minimum height=1.25cm,align=center] at (current bounding box.south) {};

\end{tikzpicture}
}
}

\caption{(a) An unconditional \schro bridge (SB) between $\pdata(x)$ and $\pref(x)$; (b) our proposed conditional \schro bridge (CSB) on the extended space between $\pjoin(x,y)$ and $\pjref(x,y)$. The blue arrows denote the direction of the generative procedure at simulation time. }

\end{figure*}


        
\section{\schro Bridges and Generative Modeling}
\label{sec:schro-bridges}

For SGMs to work well, we must diffuse the process long enough so that 
$p_N \approx \pref$. The SB methodology introduced by
\citet{debortoli2021neurips} allows us to mitigate this problem.  We refer to
\citet{chen2020optimal} for a recent review of
the SB problem. We first
recall how the SB problem can be applied to perform unconditional
simulation.  

Consider the \emph{forward} density $p(x_{0:N})$ given by \eqref{eq:mu_forward}, describing the process adding noise to the data.  We want to find the joint density $\pi^\star(x_{0:N})$ such
that
\begin{equation}
  \label{eq:discrete_schro}
 \textstyle{\pi^\star = \argmin_{\pi} \ensemble{\KLLigne{\pi}{p}}{ \pi_0=\pdata,~ \pi_N= \pref}},
\end{equation}
where $\pi_0$,
resp. $\pi_N$, is the marginal of $X_0$, resp. $X_N$, under $\pi$.  
A visualization of the SB problem \eqref{eq:discrete_schro} is provided in Figure \ref{fig:sbdiagram}. 
Were $\pi^\star$ available, we would obtain a generative model by ancestral sampling: sample $X_N \sim \pref(x_N)$, then $X_k \sim \pi^\star_{k|k+1}(x_k | X_{k+1})$ for
$k \in \{N-1, \dots, 0\}$. 

The SB problem does not admit a closed-form solution
but it can be solved numerically using Iterative Proportional Fitting (IPF)
\citep{kullback1968probability}. This algorithm defines the following recursion initialized at $\pi^0=p$ given in
\eqref{eq:mu_forward}:
\begin{align}\label{eq:IFPrecursionuncond}
  &\textstyle{\pi^{2n+1} = \argmin_{\pi} \ensemble{\KLLigne{\pi}{\pi^{2n}}}{\pi_N = \pref},} \\
  &\textstyle{\pi^{2n+2} =  \argmin_{\pi}  \ensemble{\KLLigne{\pi}{\pi^{2n+1}}}{\pi_0 = \pdata}. }
\end{align}
\cite{debortoli2021neurips,vargas2021solving} showed that the IPF iterates admit a representation suited to numerical approximation. Indeed, if we denote $p^n=\pi^{2n}$ and $q^n=\pi^{2n+1}$, then $p^0(x_{0:N})=p(x_{0:N})$ and
\begin{align}
     & \textstyle{q^n(x_{0:N}) = \pref(x_N) \prod_{k=0}^{N-1} q^n_{k|k+1}(x_k|x_{k+1}),}\\
     & \textstyle{p^{n+1}(x_{0:N}) = \pdata(x_0) \prod_{k=0}^{N-1} p^{n+1}_{k+1|k}(x_{k+1}|x_{k}),}
\end{align} 
where 
$q^n_{k|k+1} = p^n_{k|k+1}$  and
$p^{n+1}_{k+1|k} = q^n_{k+1|k}$. 
To summarize, at step $n=0$, 
$q^0$ is the backward
process obtained by reversing the dynamics of $p^0$ initialized at time $N$
from $\pref$. The forward process $p^1$ is then obtained from the reversed dynamics
of $q^0$ initialized at time $0$ from $\pdata$, and so on. Note that $q^0$ corresponds to the unconditional SGM described in \Cref{sec:discr-sett-mark}. 

\subsection{Diffusion \schro Bridge}

Similarly to SGMs, one can approximate the time-reversals appearing in the IPF iterates using score matching ideas. If
$p^{n}_{k+1|k}(x'|x)= \mathcal{N}(x';x+\gamma_{k+1} f^{n}_k(x),2 \gamma_{k+1}
\Id)$, with $f^0_k(x)=-x$, we approximate the reverse-time
transitions by
$q^n_{k|k+1}(x|x') \approx \mathcal{N}(x;x'+ \gamma_{k+1}
b^{n}_{k+1}(x'), 2\gamma_{k+1} \Id)$, where
$b^{n}_{k+1}(x') = -f^{n}_{k}(x')+2 \nabla \log p^{n}_{k+1}(x')$; and next
$p^{n+1}_{k+1|k}(x'|x) \approx \mathcal{N}(x';x+ \gamma_{k+1}
f^{n+1}_{k}(x), 2\gamma_{k+1} \Id)$, where
$f^{n+1}_{k}(x) = -b^{n}_{k+1}(x)+2 \nabla \log q^{n}_{k}(x)$. The drifts
$b^n_{k+1},f^{n+1}_k$ could be estimated by approximating
$\{\nabla \log p^{i}_{k+1}(x)\}_{i=0}^{n}$,
$\{\nabla \log q^{i}_{k}(x)\}_{i=0}^{n}$ using score matching. However this is too expensive both in terms of compute and memory. \cite{debortoli2021neurips} instead
directly approximate the mean of the Gaussians using neural networks, $\mathbf{B}_{\theta}$ and $\mathbf{F}_{\phi}$, by
generalizing the score matching approach, i.e.
$ q_{k|k+1}^n(x|x') = \mathcal{N}(x;\mathbf{B}_{\theta^n}(k+1,x'), 2 \gamma_{k+1}
\Id)$ and
$p_{k+1|k}^n(x'|x) = \mathcal{N}(x';\mathbf{F}_{\phi^n}(k,x), 2\gamma_{k+1} \Id)$,
where $\theta^n$ is obtained by minimizing
\begin{equation} \textstyle{\ell^{b}_n(\theta)=\mathbb{E}_{p^{n}}[\sum_{k}\normLigne{\mathbf{B}_\theta(k+1,X_{k+1})-G_{n,k}(X_k,X_{k+1})}^2]},\label{eq:regressionb}
\end{equation}
for $G_{n,k}(x,x')=x'+\mathbf{F}_{\phi^n}(k,x)-\mathbf{F}_{\phi^n}(k,x')$, 
and $\phi^{n+1}$ by minimizing 
\begin{equation} \textstyle{\ell^{f}_{n+1}(\phi)=\mathbb{E}_{q^{n}}[\sum_{k}\normLigne{\mathbf{F}_\phi(k,X_k)-H_{n,k}(X_k,X_{k+1})}^2]},\label{eq:regressionf}
\end{equation}
for $H_{n,k}(x,x')=x + \mathbf{B}_{\theta^n}(k+1,x')-\mathbf{B}_{\theta^n}(k+1,x)$.
This implementation of IPF, referred to as Diffusion SB (DSB), is presented in
the supplementary; see \cite{vargas2021solving,chen2021likelihood} for alternative
numerical schemes. After we have learned $\theta^L$ using $L$ DSB iterations, we
sample $X_N \sim \pref(x_N)$ and then set
$X_{k} = \mathbf{B}_{\theta^L}(k+1, X_{k+1})+ \sqrt{2 \gamma_{k+1}} Z_{k+1}$ with
$Z_k \overset{\textup{i.i.d.}}\sim \mathcal{N}(0,\Id)$ to obtain
$X_0$ approximately distributed from $\pdata$.

\subsection{Link With Optimal Transport}
\label{subsec:linkwithot}
It can be shown that the solution $\pi^\star$ of the SB problem \eqref{eq:discrete_schro} factorizes as $\pi^\star(x_{0:N})=\pi^{s,\star}(x_0,x_N)p_{|0,N}(x_{1:N-1}|x_0,x_N)$, where $\pi^{s,\star}(x_0,x_N)$ is the marginal of $\pi^\star(x_{0:N})$ at times $0$ and $N$ and $p_{|0,N}$ is the conditional density of $x_{1:N-1}$ given $(x_0,x_N)$ under $p$. In this case, \eqref{eq:discrete_schro} reduces to the static SB problem
\begin{equation}
 \label{eq:static_schro}
 \textstyle{\pi^{s,\star} = \argmin_{\pi^s} \ensemble{\KLLigne{\pi^s}{p_{0,N}}}{ \pi^s_0=\pdata,~ \pi^s_N= \pref}}.
\end{equation}
The static SB problem can be interpreted as an entropy-regularized optimal transport problem between $\pdata$ and $\pref$, with regularized transportation cost $\mathbb{E}_{\pi^s} [-\log p_{N|0}(X_N|X_0)] - H(\pi^s)$. 
When $p_{N|0}(x_N|x_0) = \mathcal{N} (x_N ; x_0, \sigma^2 \Id)$ as in \cite{song2019generative}, the transportation cost $-\log p_{N|0}(x_N|x_0)$ reduces to the quadratic cost $\frac{1}{2\sigma^2}\|x_0-x_N\|^2$ up to a constant. In other words, the static SB solution $\pi^{s,\star}$
not only transports samples $X_N\sim\pref$ into samples from the data distribution $\pdata$, but also seeks to minimize an entropy-regularized Wasserstein distance of order $2$.
The regularization strength is controlled by the variance $\sigma^2$. 
Similar properties hold for the time-discretized Ornstein--Uhlenbeck diffusion defined by \eqref{eq:mu_forward} in Section \ref{sec:discr-sett-mark}.



\section{Conditional Diffusion \schro Bridge}
\label{sec:cond-simul-sb}

We now want to use SBs for conditional simulation, i.e.\ to be able to sample from
a posterior distribution $p(x|\yobs) \propto \pdata(x) g(\yobs|x)$ assuming only
that it is possible to sample $(X,Y)\sim \pdata (x) g(y|x)$.  In
this case, an obvious approach would be to consider the SB problem where we
replace $\pdata(x)$ by the posterior $p(x|\yobs)$, i.e.
\begin{equation}\label{eq:discrete_schro-cond1}
 \textstyle{\pi^{\star}= \argmin_{\pi} \ensemble{\KLLigne{\pi}{p_{\yobs}}\hspace{-.15cm}}{ \hspace{-.15cm} \pi_0=p(\cdot|\yobs),~ \pi_N= \pref}, }
\end{equation}
where $p_{\yobs}(x_{0:N}):=p(x_0|\yobs) \prod_{k=0}^{N-1}p_{k+1|k}(x_{k+1}|x_k)$ is the forward noising process. However, DSB is not applicable here as it requires sampling from $p(x_{0}|\yobs)$ at step $0$.

We propose instead to solve an amortized problem. Let us introduce
$\pjoin(x,y)=\pdata(x)g(y|x)=p(x|y)\pobs(y)$ and $\pjref(x,y)=\pref(x)\pobs(y)$
where $\pobs(y)=\int \pdata(x)g(y|x) \rmd x$. We are interested in finding the
transition kernel $\pi^{c,\star} = (\pi^{c,\star}_y)_{y \in \mcy}$,
where
$\pi^{c,\star}_{y}$ defines a distribution on $\mcx = (\rset^d)^{N+1}$ for each
$y \in \mcy$, satisfying
\begin{align}
&\pi^{c,\star}= \text{argmin}_{\pi^c} \{\mathbb{E}_{Y \sim \pobs}[\textup{KL}(\pi_{Y}^c|p_{Y})]: \nonumber \\
&\qquad \qquad  \pi_{0}^c\otimes \pobs= \pjoin,~ \pi_{N}^c \otimes \pobs =\pjref\}.\label{eq:SBuncondreformulated}
\end{align}
This corresponds to an averaged version of
\eqref{eq:discrete_schro-cond1} over the distribution $\pobs(y)$ of
$Y$. The first constraint
$\pi^{c,\star}_{y,0}(x_0)\pobs(y)=\pjoin(x_0,y)=p(x_0|y)\pobs(y)$ ensures that
$\pi^{c,\star}_{y,0}(x_0)=p(x_0|y)$,  $\pobs$-almost surely. Similarly
$\pi^{c,\star}_{y,N}(x_N) = \pref(x_N)$, $\pobs$-almost surely.  Hence,
to obtain a sample from $p(x|\yobs)$ for a given $Y=\yobs$, we can sample
$X_N \sim \pref(x_N)$ then
$X_k|X_{k+1} \sim \pi^{c,\star}_{\yobs,k|k+1}(x_k|X_{k+1})$ for $k=N-1,...,0$
and $X_0$ is a sample from $p(x|\yobs)$.

We show here that \eqref{eq:SBuncondreformulated} can be reformulated as a SB on
an extended space, which we will refer to as Conditional SB (CSB), so the theoretical results for existence and uniqueness of the solution to the SB problem apply. 
\begin{proposition}\label{prop:SBreformulation} Consider the following SB problem
\begin{align}
 \mkern-18mu \bar{\pi}^\star = \textup{argmin}_{\bar{\pi}} \{&\textup{KL}(\bar{\pi}|\bar{p}):  \text{s.t.} \ \label{eq:conditionalSBextended}\bar{\pi}_0= \pjoin,~ \bar{\pi}_N =\pjref\},
\end{align}
where we define $\bar{p}(x_{0:N},y_{0:N}):=p_{y_0}(x_{0:N})\bpobs(y_{0:N})$ with
$\bpobs(y_{0:N}):=\pobs(y_0) \prod_{k=0}^{N-1}\delta_{y_k}(y_{k+1})$ and $p_{y_0}$ is the forward process defined below \eqref{eq:discrete_schro-cond1}. If $\KLLigne{\bar{\pi}^\star}{\bar{p}}<+\infty$ then $\bar{\pi}^\star=\pi^{c,\star} \otimes \bpobs$ where $\pi^{c,\star}$ solves
\eqref{eq:SBuncondreformulated}. 
\end{proposition}
 We provide an illustration of the CSB problem \eqref{eq:conditionalSBextended} in Figure \ref{fig:csbdiagram}. 
 Under $\bar{p}$, the $Y$-component is sampled at time $0$ according to $\pobs$ and then is kept constant until time $N$ while the $X$-component is initialized at $p(x|y_0)$ and then diffuses according to $p_{k+1|k}(x_{k+1}|x_k)$.
 
Contrary to \eqref{eq:discrete_schro-cond1}, we can adapt DSB to solve numerically the CSB problem  \eqref{eq:conditionalSBextended} as both the distributions $\pjoin$ and
 $\pjref$ can be sampled. The
resulting algorithm is called Conditional DSB (CDSB). It approximates the following IPF recursion 
\begin{align}\label{eq:IFPrecursion}
  &\textstyle{\bar{\pi}^{2n+1} = \argmin_{\bar{\pi}} \ensemble{\KLLigne{\bar{\pi}}{\bar{\pi}^{2n}}}{\bar{\pi}_N = \pjref},} \\
  &\textstyle{\bar{\pi}^{2n+2} =  \argmin_{\bar{\pi}}  \ensemble{\KLLigne{\bar{\pi}}{\bar{\pi}^{2n+1}}}{\bar{\pi}_0 = \pjoin}}
\end{align}
initialized at $\bar{\pi}^0=\bar{p}$. For $\bar{p}^n=\bar{\pi}^{2n}$ and $\bar{q}^n=\bar{\pi}^{2n+1}$, we have the following representation of the IPF iterates.
\begin{proposition}
  \label{prop:IPFrecursion}
  Assume that $\KLLigne{\pjoin \otimes \pjref}{\bar{p}_{0,N}} < +\infty$. Then
  we have $\bar{p}^0(x_{0:N},y_{0:N})=\bar{p}(x_{0:N},y_{0:N})$ and for any
  $n\geq 0$, $\bar{q}^n(x_{0:N},y_{0:N})=\bpobs(y_{0:N})\bar{q}^n(x_{0:N}|y_{N})$,
  $\bar{p}^{n+1}(x_{0:N},y_{0:N})= \bpobs(y_{0:N})\bar{p}^{n+1}(x_{0:N}|y_0)$
  with 
 \begin{align}
      &\textstyle{\bar{q}^n(x_{0:N}|y_{N})= \pref(x_N)\prod_{k=0}^{N-1} \bar{p}^n_{k|k+1}(x_k|x_{k+1},y_{N}),} \\
    &\textstyle{\bar{p}^{n+1}(x_{0:N}|y_0)= p(x_0|y_0) \prod_{k=0}^{N-1} \bar{q}^{n}_{k+1|k}(x_{k+1}|x_{k},y_{0}).}
  \end{align} 
\end{proposition}
Here we simplify notation and write $Y$ for all the random variables
$Y_0,Y_1,...,Y_N$ as they are all equal almost surely under $\bar{p}^{n}$ and
$\bar{q}^{n}$. We approximate the transition kernels as in DSB and refer to the
supplementary for more details. In particular, the transition kernels satisfy
$\bar{q}^n_{k|k+1}(x|x',y)=
\mathcal{N}(x;\mathbf{B}^{y}_{\theta^n}(k+1,x'),2\gamma_{k+1} \Id)$ and
$\bar{p}_{k+1|k}^n(x'|x,y) = \mathcal{N}(x';\mathbf{F}^{y}_{\phi^n}(k,x),
2\gamma_{k+1} \Id)$, where $\theta^n$ is obtained by minimizing
\begin{equation} \textstyle{\ell^{b}_n(\theta)=\mathbb{E}_{\bar{p}^{n}}[\sum_{k}\normLigne{\mathbf{B}_\theta^Y(k+1,X_{k+1})-G_{n,k}^Y(X_{k},X_{k+1})}^2]}\label{eq:regressionbcond}
\end{equation}
for $G_{n,k}^y(x,x')=x'+\mathbf{F}^{y}_{\phi^n}(k,x)-\mathbf{F}^{y}_{\phi^n}(k,x')$ and $\phi^{n+1}$ by minimizing 
\begin{equation} \textstyle{\ell^{f}_{n+1}(\phi)=\mathbb{E}_{\bar{q}^{n}}[\sum_{k}\normLigne{\mathbf{F}_\phi^Y(k,X_k)-H_{n,k}^Y(X_k,X_{k+1})}^2]}\label{eq:regressionfcond},
\end{equation}
for $H_{n,k}^y(x,x')=x + \mathbf{B}^{y}_{\theta^n}(k+1,x')-\mathbf{B}^{y}_{\theta^n}(k+1,x)$. 


The resulting CDSB scheme is summarized in \Cref{algo:ipf_score_cond} where $Z^j_k,\tilde{Z}^j_k \overset{\textup{i.i.d.}}\sim \mathcal{N}(0,\Id)$. After $L$ iterations of CDSB, we have learned $\theta^L$. For any observation
$Y=\yobs$, we can then sample $X_N \sim \pref(x_N)$ and then compute
$X_{k} = \mathbf{B}_{\theta^L}^{\yobs}(k+1, X_{k+1})+ \sqrt{2 \gamma_{k+1}} Z_{k+1}$ with
$Z_k \overset{\textup{i.i.d.}}\sim \mathcal{N}(0,\Id)$ for $k=N-1,...,0$.  The resulting sample
$X_0$ will be approximately distributed according to $p(x|\yobs)$.
\begin{algorithm}
    \caption{Conditional Diffusion \schro Bridge}
    \label{algo:ipf_score_cond}
    \begin{algorithmic}[1] 
      \FOR{$n \in \{0, \dots,L\}$} \WHILE{not converged}
      \STATE Sample $\{X^j_{k}\}_{k,j=0}^{N,M},\{Y^j\}_{j=0}^{M}$ where\\ $X^j_0 \sim \pdata, Y^j \sim g(\cdot|X^j_0)$, and \\
      $X^{j}_{k+1} = \mathbf{F}_{\phi^n}^{Y^j}(k, X^{j}_{k})+\sqrt{2
        \gamma_{k+1}} Z^{j}_{k+1}$ 
        \STATE Compute $\hlb_n(\theta^n)$ approximating \eqref{eq:regressionbcond}
        \STATE $\theta^{n} \leftarrow \textrm{Gradient Step}(\hlb_n(\theta^n))$ 
      \ENDWHILE \WHILE{not
        converged}
      \STATE Sample $\{X^j_{k}\}_{k,j=0}^{N,M}$,  $\{Y^j\}_{j=0}^{M}$ where\\ $X^j_N \sim \pref, Y^j\sim \pobs$, and \\
      $X^j_{k}=\mathbf{B}_{\theta^n}^{Y^j}(k+1, X^{j}_{k+1})+\sqrt{2 \gamma_{k+1}}
      \tilde{Z}^{j}_{k+1}$ 
      \STATE Compute $\hlf_{n+1}(\phi^{n+1})$ approximating \eqref{eq:regressionfcond}
      \STATE
      $\phi^{n+1} \leftarrow \textrm{Gradient Step}(\hlf_{n+1}(\phi^{n+1}))$
      \ENDWHILE \ENDFOR \STATE \textbf{Output: } $(\theta^{L},\phi^{L+1})$
    \end{algorithmic}
  \end{algorithm}
\section{CDSB Improvements}
\subsection{Conditional Reference Measure}
\label{subsec:targetawareinitial}
In standard SGMs and for the unconditional SB, we typically select
$\pref(x)=\mathcal{N}(x;0,\sigma_{\textup{ref}}^{2}\Id)$. However, initializing
ancestral sampling from random noise to eventually obtain samples from $p(x|y)$
can be inefficient as $y$ already contains useful information about $X$. Fortunately, it is easy to use a joint reference measure of the form
$\pjref(x,y)=\pref(x|y)\pobs(y)$ instead of $\pjref(x,y)=\pref(x)\pobs(y)$ in CSB and CDSB. The
only modification in \Cref{algo:ipf_score_cond} is that line 8 becomes
$Y^j \sim \pobs(y), X^j_N \sim \pref(x|Y^j)$.  In some interesting scenarios, we
can select $\pref(x|y)$ as an approximation to $p(x|y)$ in order
to accelerate the sampling process. This means we construct a CSB between $p(x|y)$ and its approximation $\pref(x|y)$, instead of between $p(x|y)$ and noise. We refer to this extension of CDSB as
CDSB-C.

As a simple example, consider obtaining super-resolution (SR) image samples from a low-resolution image $Y=y$. Assume that $y$
has been suitably upsampled to have the same dimensionality as $X$. In this
case, $y$ itself can serve as an approximate
initialization for sampling $X_N$. A simple model is to take
$\pref(x|y)=\vois(x;y,\sigma_{\textup{ref}}^{2} \Id)$ with $\sigma_{\textup{ref}}^{2}=\rho \sigma_{x|y}^{2}$, where $\rho$ is a variance
inflation parameter and $\sigma_{x|y}^{2}$ is an estimate of the conditional variance of $X$ given $Y$. 
See Figure \ref{fig:csbdiagram} for an illustration. 
In our experiments, we also explore other $\pref(x|y)$ obtained using the Ensemble Kalman Filter (EnKF) as well as neural network models.

\subsection{Conditional Forward Process}
\label{subsec:targetawareforward}
To accelerate the convergence of IPF, we also have the flexibility
to make the initial forward noising process dynamics dependent on $Y=y$, i.e. 
$p_{y}(x_{0:N})=p(x_{0}|y)\prod_{k=0}^{N-1}p_{k+1|k}(x_{k+1}|x_{k},y)$.
As shown below, it is  beneficial to initialize $p_{y}$ close to the CSB solution $\pi^{c,\star}_y$.
\begin{proposition}\label{prop:fasterconverence}
  For any $n \in \nset$ with $n \geq 1$, we have 
\begin{equation}
  \expeLigne{\KLLigne{\pi^{c,n}_{Y,0}}{p(\cdot|Y)}}
   \leq \tfrac{2}{n} \expeLigne{\KLLigne{\pi^{c,\star}_Y}{p_Y}} ,
\end{equation}
where for any $n \in \nset$, $\bar{\pi}^n = \bpobs \otimes \pi^{c,n}$ is the
$n^{\textup{th}}$ IPF iterate and the expectations are w.r.t. $Y \sim \pobs$.
\end{proposition}

As a result, we should choose the initial forward noising process $p_y$ such that its
terminal marginal $p_{y,N}$ targets $\pref(\cdot|y)$. However, contrary to
diffusion models, we recall that our framework does not strictly  require
$p_{y,N}\approx \pref(\cdot|y)$ to provide approximate samples from the
posterior of interest.

For tractable $\pref(x|y)$, we can define
$p_{y}(x_{0:N})$ using an unadjusted Langevin dynamics; i.e.
$p_{k+1|k}(x'|x,y)=\mathcal{N}(x';x+\gamma_{k+1}\nabla\log\pref(x|y),2\gamma_{k+1}\Id)$. In the case $\pref(x|y)=\mathcal{N}(x;\mu(y),\sigma^{2}(y)\Id)$, this reduces to a discretized Ornstein--Uhlenbeck process admitting $\pref(x|y)$ as limiting distribution as $\gamma\to0$ and $N\to\infty$ \citep{durmus2017nonasymp}. 


\subsection{Forward-Backward Sampling}

When we use an unconditional $\pref(x)$, our
proposed method also shares connections with the conditional transport methodology
developed by \cite{marzouk2016sampling,spantini2019coupling}. They propose
methods to learn a deterministic invertible transport map
$\mathcal{S}(x,y):\mathcal{X}\times\mathcal{Y}\to\mathcal{X}$ which maps samples
from $p(x|y)$ to $\pref(x)$. To sample from $p(x|\yobs)$, one samples
$X^{\textup{ref}}\sim\pref(x)$, then transports back the sample through the
inverse map
$X^{\textup{pos}}={\mathcal{S}}(\cdot,\yobs)^{-1}(X^{\textup{ref}})$. 

As noted by \cite{spantini2019coupling}, an alternative method to sample from
$p(x|\yobs)$ consists of first sampling
$(X,Y)\sim\pjoin$, then following the two-step
transformation
$\hat{X}^{\textup{ref}}=\mathcal{S}(X,Y),~~
\hat{X}^{\textup{pos}}=\mathcal{S}(\cdot,\yobs)^{-1}(\hat{X}^{\textup{ref}})$.  By
definition of $\mathcal{S}$, $\hat{X}^{\textup{ref}}$ is also distributed
according to $\pref$.  However, since the transport map ${\mathcal{S}}$ may be
imperfect in practice, this sampling strategy provides the advantage of
cancellation of errors between $\mathcal{S}$ and $\mathcal{S}(\cdot,\yobs)^{-1}$.

We also explore an analogous forward-backward sampling scheme in our
framework, which first samples $(X,Y)\sim\pjoin$,  
followed by sampling  $\hat{X}_{N}\sim \bar{p}^{L}_{N|0}(x_{N}|X,Y)$ through the forward half-bridge, then $\hat{X}_{0}\sim
\bar{q}^{L}_{0|N}(x_{0}|\hat{X}_{N},\yobs)$ through the backward half-bridge.
Since $\bar{q}^{L}$ is the approximate time-reversal of $\bar{p}^{L}$, this
strategy shares similar advantages with the method of \cite{spantini2019coupling}
when the half-bridge $\bar{q}^{L}(x_{0:N}|\yobs)$ does not solve the CSB problem
exactly. We call this extension CDSB-FB.
     
\section{Related work}

\textbf{Approximate Bayesian computation (ABC)}, also known as
likelihood-free inference, has been developed to approximate the posterior when
the likelihood is intractable but one can simulate synthetic data from it; see
\eg \citep{beaumont2019approximate}. However, these
methods typically require knowing the prior, while CDSB only needs to have access to
joint samples and learns about the posterior directly. For tasks such as image inpainting, the prior is indeed implicit. 

\textbf{\schro bridges} techniques to perform both static and sequential Bayesian inference for state-space models have been developed by \cite{bernton2019schr} and \cite{reich2018data}. However, these methods require being able to evaluate pointwise an unnormalized version of the target posterior distribution contrary to the CDSB-based methods developed here.

\textbf{Conditional transport}. Performing conditional simulation by
learning a transport map between joint distributions on $X,Y$ having the same
$Y$-marginals (as $\pjoin$ and $\pjref$) was first proposed by
\cite{marzouk2016sampling}. Various techniques have
been subsequently developed to approximate such maps such as polynomial or radial basis
representations \citep{marzouk2016sampling,baptista2020adaptive}, Generative
Adversarial Networks \citep{kovachki2021conditional,zhou2021deep} or normalizing
flows \citep{kruse2021hint}. 
CDSB also fits into this framework, but instead utilizes stochastic transport maps. 
Recently, \cite{taghvaei2022optimal} have also proposed independently using conditional transport ideas to perform optimal filtering for state-space models.

\textbf{Conditional SGMs}. SGMs have been applied to perform posterior
simulation, primarily for images, as described in Section
\ref{sec:condSGMs} and references therein.
An alternative line of work for image editing
\citep{song2019generative,choi2021ilvr,chung2021comecloserdiffusefaster,meng2022sdedit} 
utilizes the denoising property of SGMs to iteratively denoise noisy versions of
a reference image $y$ while restricted to retain particular features of $y$. 
However, $\pref(x)=\mathcal{N}(x;0,\sigma_{\textup{ref}}^{2}\Id)$ so image generation is started
from noise and typically hundreds or thousands of refinement steps are required. Our framework can incorporate in a principled way information given by $y$ in the reverse process's initialization (see Section \ref{subsec:targetawareinitial}). Recently \cite{zheng2022truncated,lu2022conditional} have also proposed suitable choices for $\pref(x)$ or $\pref(x|y)$ to shorten the diffusion process. In comparison, the CDSB framework is more flexible and allows for general $\pref(x|y)$ which can be non-Gaussian and different from the initial forward diffusion's terminal distribution $p_N(x_N|y)$. For instance, we explore 
using noiseless pre-trained super-resolution models as $\pref(x|y)$ in Section \ref{subsubsec:nongaussianref}, where CDSB further refines the SR samples, moving them closer to the data distribution. 
Finally, for linear Gaussian inverse
problems, \cite{kadkhodaie2021stochastic,kawar2021snips,kawar2022denoising} develop efficient methodologies using unconditional SGMs when the linear degradation model and the Gaussian noise level are known.

\textbf{SGM acceleration techniques}.  Many techniques have been proposed to
accelerate SGMs and CSGMs. For example, \cite{luhman2021knowledge,
  salimans2022progressive} propose to learn a distillation network on top of SGM
models, while \cite{song2020denoising} perform a subsampling of the timesteps in
a variational setting. \cite{watson2022learning} optimize the timesteps with a
fixed budget using dynamic programming.  
\cite{xiao2021tackling} perform multi-step denoising
using GANs while \cite{dockhorn2021score} consider underdamped Langevin dynamics
as forward process. We emphasize that many of these techniques are complementary
to and can be readily applied in the SB setting; \eg  one could distill the last CDSB network $\mathbf{B}^{y}_{\theta^L}$.  Additionally, SB
and CSB provide a framework to perform few-step sampling.

\section{Experiments}
\label{sec:experiments}
\begin{figure}[h]
\centering
\begin{minipage}{\linewidth}
\raisebox{0.8cm}{\rotatebox[origin=t]{90}{CSGM}}\enskip{}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 1/b_1_cond_histogram__50\string".png}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 2/b_1_cond_histogram__50\string".png}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 3/b_1_cond_histogram__50\string".png}


\raisebox{0.8cm}{\rotatebox[origin=t]{90}{CDSB}}\enskip{}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 1/b_5_cond_histogram__50\string".png}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 2/b_5_cond_histogram__50\string".png}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 3/b_5_cond_histogram__50\string".png}


\raisebox{0.8cm}{\rotatebox[origin=t]{90}{CDSB-FB}}\enskip{}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 1/b_5_cond_histogram_fwdbwd_50\string".png}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 2/b_5_cond_histogram_fwdbwd_50\string".png}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 3/b_5_cond_histogram_fwdbwd_50\string".png}


\raisebox{0.8cm}{\rotatebox[origin=t]{90}{MGAN}}\enskip{}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 1/MGAN_Example1\string".png}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 2/MGAN_Example2\string".png}\includegraphics[height=2.13cm]{\string"Plots/2D/Example 3/MGAN_Example3\string".png}

\captionof{figure}{\label{fig:2dconditional}True posterior $p(x|\yobs)$ for $\yobs\in\{-1.2,0,1.2\}$ (solid lines) and approximations for the 2D examples.}
\end{minipage}

\vspace{0.25cm}

\begin{minipage}{\linewidth}
\tabcolsep=0.06cm
\small
    \begin{centering}
    \begin{tabular}{|c|c|c|c|c|c|c|c|}
    \hline 
    \multirow{1}{*}{} &  & MCMC & CDSB & CDSB-FB & CDSB-C & MGAN & IT\tabularnewline
    \hline 
    \multirow{2}{*}{Mean} & $x_{1}$ & .075 & .066 & .068 & \textbf{.072} & .048 & .034\tabularnewline
    \cline{2-8} \cline{3-8} \cline{4-8} \cline{5-8} \cline{6-8} \cline{7-8} \cline{8-8} 
     & $x_{2}$ & .875 & .897 & .897 & \textbf{.891} & .918 & .902\tabularnewline
    \hline 
    \multirow{2}{*}{Var} & $x_{1}$ & .190 & .184 & \textbf{.190} & .188 & .177 & .206\tabularnewline
    \cline{2-8} \cline{3-8} \cline{4-8} \cline{5-8} \cline{6-8} \cline{7-8} \cline{8-8} 
     & $x_{2}$ & .397 & .387 & .391 & \textbf{.393} & .419 & .457\tabularnewline
    \hline 
    \multirow{2}{*}{Skew} & $x_{1}$ & 1.94 & \textbf{1.90} & 2.01 & \textbf{1.90} & 1.83 & 1.63\tabularnewline
    \cline{2-8} \cline{3-8} \cline{4-8} \cline{5-8} \cline{6-8} \cline{7-8} \cline{8-8} 
     & $x_{2}$ & .681 & .591 & .628 & .596 & \textbf{.630} & .872\tabularnewline
    \hline 
    \multirow{2}{*}{Kurt} & $x_{1}$ & 8.54 & 7.85 & \textbf{8.54} & 8.00 & 7.64 & 7.57\tabularnewline
    \cline{2-8} \cline{3-8} \cline{4-8} \cline{5-8} \cline{6-8} \cline{7-8} \cline{8-8} 
     & $x_{2}$ & 3.44 & 3.33 & \textbf{3.51} & 3.27 & 3.19 & 3.88\tabularnewline
    \hline 
    \end{tabular}
    \par\end{centering}
\captionof{table}{\label{tab:bodresult} Estimated posterior moments for the BOD example. The closest estimates to MCMC are highlighted in bold. }
\end{minipage}
\end{figure}

\subsection{2D Synthetic Examples}

\begin{figure*}[h!]
\centering
\small
\begin{minipage}{\textwidth}
\tabcolsep=0.04cm
    \subfloat[]{
    \begin{tabular}{ccc}
    \toprule 
     & $N=5$ & $N=10$\tabularnewline
    \midrule
    \midrule 
    CSGM & 17.22/0.672 & 20.03/0.795\tabularnewline
    \midrule 
    CDSB & 18.55/0.746 & 20.69/0.792\tabularnewline
    \midrule 
    CSGM-C & 18.61/0.749 & 20.83/0.838\tabularnewline
    \midrule 
    CDSB-C & \textbf{19.67}/\textbf{0.753} & \textbf{20.95}/\textbf{0.840}\tabularnewline
    \bottomrule
    \end{tabular}
    }\hspace{-3pt}\subfloat[]{
    \begin{tabular}{cc}
    \toprule 
    $N=10$ & $N=20$\tabularnewline
    \midrule
    \midrule 
    14.77/0.599 & 16.31/0.706\tabularnewline
    \midrule 
    16.24/0.618 & 16.61/0.657\tabularnewline
    \midrule 
    16.38/\textbf{0.701} & 16.53/0.730\tabularnewline
    \midrule 
    \textbf{16.60}/0.700 & \textbf{16.65}/\textbf{0.747}\tabularnewline
    \bottomrule
    \end{tabular}
    }\hspace{-3pt}\subfloat[]{
    \begin{tabular}{cc}
    \toprule 
    $N=20$ & $N=50$\tabularnewline
    \midrule
    \midrule 
    19.52/0.471/92.02 & 20.52/0.567/48.68\tabularnewline
    \midrule 
    19.72/0.504/57.22 & 20.70/0.590/40.08\tabularnewline
    \midrule 
    20.44/0.566/44.44 & 20.84/0.592/22.89\tabularnewline
    \midrule 
    \textbf{21.11}/\textbf{0.614}/\textbf{28.41} & \textbf{21.46}/\textbf{0.646}/\textbf{13.71}\tabularnewline
    \bottomrule
    \end{tabular}
    }\hspace{-3pt}\subfloat[]{
    \begin{tabular}{cc}
    \toprule 
    $N=20$ & $N=50$\tabularnewline
    \midrule
    \midrule 
    24.22/0.844/17.62 & 25.29/0.878/7.18\tabularnewline
    \midrule 
    24.88/0.850/19.85 & 26.61/0.894/3.87\tabularnewline
    \midrule 
    \textbf{28.26}/0.914/3.63 & \textbf{28.14}/0.913/1.31\tabularnewline
    \midrule 
    28.19/\textbf{0.915}/\textbf{2.28} & 28.06/\textbf{0.914}/\textbf{1.14}\tabularnewline
    \bottomrule
    \end{tabular}
    }
    \captionof{table}{\label{tab:imagemetrics}Results for (a) MNIST 4x SR;
    (b) MNIST 14x14 inpainting; (c) CelebA 4x SR with Gaussian
    noise; (d) CelebA 32x32 inpainting. Reported results are denoted in the format PSNR$\uparrow$/SSIM$\uparrow$(/FID$\downarrow$). }
\end{minipage}

\setcounter{subfigure}{0}

\begin{minipage}{0.46\textwidth}
\pdfpxdimen=\dimexpr 1in/300\relax
\begin{centering}
\subfloat[$\yobs$]{\includegraphics[trim={40px 40px 40px 40px}, clip, width=.5\linewidth]{Plots/MNIST_superres_main/Cond.png}} 
\subfloat[Ground truth]{\includegraphics[trim={40px 40px 40px 40px}, clip, width=.5\linewidth]{\string"Plots/MNIST_superres_main/True data\string".png}} \\
\vspace{-0.2cm}
\subfloat[CSGM]{\includegraphics[trim={40px 40px 40px 40px}, clip, width=.5\linewidth]{\string"Plots/MNIST_superres_main/N=5 CDiff\string".png}}
\subfloat[CDSB-C]{\includegraphics[trim={40px 40px 40px 40px}, clip, width=.5\linewidth]{\string"Plots/MNIST_superres_main/N=5 CDSB-Cond\string".png}}
\par\end{centering}
\caption{\label{fig:imagecomparison-mnist}Uncurated samples for the MNIST 4x SR task with $N=5$.}
\end{minipage}  \qquad\quad
\begin{minipage}{0.46\textwidth}
\begin{centering}
\subfloat[$\yobs$]{\includegraphics[height=.5\linewidth]{Plots/CelebA_superres_main/Cond.png}}~~
\subfloat[Ground truth]{\includegraphics[height=.5\linewidth]{\string"Plots/CelebA_superres_main/True data\string".png}} \\
\vspace{-0.2cm}
\subfloat[CSGM]{\includegraphics[height=.5\linewidth]{\string"Plots/CelebA_superres_main/N=20 CDiff\string".png}}~~
\subfloat[CDSB-C]{\includegraphics[height=.5\linewidth]{\string"Plots/CelebA_superres_main/N=20 CDSB-Cond\string".png}}
\par\end{centering}
\caption{\label{fig:imagecomparison-celeba}Uncurated samples for the CelebA 4x SR with Gaussian noise task with $N=20$. }
\pdfpxdimen=\dimexpr 1in/72\relax
\end{minipage} 
\end{figure*}


We first demonstrate the validity and accuracy of our method
using the two-dimensional examples of \cite{kovachki2021conditional}.
We consider three nonlinear, non-Gaussian examples for $\pjoin(x,y)$: 
define $\pobs(y)=\textup{Unif}(y;[-3,3])$ for all examples and $p(x|y)$ is defined through
\begin{align}
\text{Example 1: } & X=\tanh(Y)+Z, &  & Z\sim\Gamma(1,0.3),\\
\text{Example 2: } & X=\tanh(Y+Z), &  & Z\sim\mathcal{N}(0,0.05),\\
\text{Example 3: } & X=Z\tanh(Y), &  & Z\sim\Gamma(1,0.3).
\end{align}
We run CDSB on each of the examples with 50,000 training points and compare with
the Monotone GAN (MGAN) algorithm \citep{kovachki2021conditional}.  CDSB uses a
neural network model with 32k parameters (approximately 6x fewer parameters than
MGAN) with $N=50$ diffusion steps. Figure \ref{fig:2dconditional} shows the
resulting histogram of the learned $p(x|\yobs)$ and the true posterior for
$\yobs\in\{-1.2,0,1.2\}$.  As can be observed, the empirical density of CDSB
samples is sharper and aligns more closely with the ground truth density.  We
also observe that using more CDSB iterations corrects the sampling bias compared
to using only one CDSB iteration (which corresponds to CSGM). Using
forward-backward sampling (CDSB-FB) further improves the sample quality.


\subsection{Biochemical Oxygen Demand Model}
We now consider a Bayesian inference problem on biochemical oxygen demand
(BOD) from \citet{marzouk2016sampling}. Let
$X_{1},X_{2}\overset{\textup{i.i.d.}}\sim\mathcal{N}(0,1)$,
$A=0.8+0.4 \erf(X_{1}/\sqrt{2})$, $B=0.16+0.15 \erf(X_{2}/\sqrt{2})$ and $Y=\{Y(t)\}_{t=1}^5$ satisfy
$Y(t)=A(1-\exp(-Bt))+Z$ with
$Z\sim\mathcal{N}(0,{10}^{-3})$.  Table \ref{tab:bodresult} displays moment statistics of the estimated posterior $p(x|y)$ (standard deviations are reported in the supplementary), in comparison with
the ``ground truth'' statistics computed using $6\times{10}^{6}$ MCMC steps as
reported in \citet{marzouk2016sampling}. To match the evaluation in
\citet{kovachki2021conditional}, the reported statistics are computed using
30,000 samples and averaged across the last 10 CDSB iterations. The resulting
posterior displays high skewness and high kurtosis, but all CDSB-based methods achieve more accurate posterior estimation than MGAN
and the inverse transport (IT) method in \cite{marzouk2016sampling}.


\subsection{Image Experiments}
\subsubsection{Gaussian Reference Measure}
We now apply CDSB to a range of inverse problems on image
datasets. We consider the following tasks: (a) MNIST 4x SR (7x7 to
28x28), (b) MNIST center 14x14 inpainting, (c) CelebA 4x SR (16x16 to
64x64) with Gaussian noise of $\sigma_{y}=0.1$, (d) CelebA center 32x32 inpainting. 
For CSGM-C and CDSB-C, we consider the
following choices for conditional $\pref(x|y)$: for tasks (a) and (c), we use
the upsampled $y$ directly as described in \Cref{subsec:targetawareinitial}; for inpainting tasks
(b) and (d), we use a separate neural network with the same architecture as
$\mathbf{F},\mathbf{B}$ to output the initialization mean. In \Cref{tab:imagemetrics} we report PSNR and SSIM (the higher the better), as well as FID scores (the lower the better) for RGB images only. We display a
visual comparison between the methods in Figures \ref{fig:imagecomparison-mnist} and \ref{fig:imagecomparison-celeba}, and additional image samples in the supplementary. CDSB and CDSB-C
both provide significant improvement in terms of quantitative metrics as well as visual evaluations, and high-quality images can be generated quickly using only a small number of diffusion steps $N$. 

\subsubsection{Pre-trained SR Model for Reference Measure}
\label{subsubsec:nongaussianref}
We further explore here the possibility of using a non-Gaussian $\pref(x|y)$ to further bridge the gap towards the true posterior $p(x|y)$. We utilize the super-resolution model SRFlow \citep{lugmayr2020srflow}, which produces
a probability distribution over possible SR images using a conditional normalizing flow. We use their pre-trained model checkpoints for the 8x SR task for CelebA (160x160). We then train a short CDSB model 
with SRFlow as $\pref(x|y)$,
in order to take advantage of the high sampling quality of diffusion models.
As can be seen from Figure \ref{fig:imagecomparison-celeba160}, 
with only $N=10$ steps the CDSB model is able to make 
meaningful improvements to the SRFlow samples, 
especially in the finer details such as facial features and hair texture. 
Quantitatively, CDSB-C produces a significant improvement in the
FID score at the cost of a decrease in PSNR; see Table \ref{tab:imagemetrics-celeba160}. 
Note that this choice of non-Gaussian $\pref(x|y)$ is not compatible with CSGM. Interestingly CSGM-C still improves the PSNR compared to SRFlow, but produces worse FID scores than CDSB-C and blurry samples. 

\begin{figure}[t]
\centering
\small
\begin{minipage}{0.46\textwidth}
    \begin{tabular}{ccc}
    \toprule 
    $\pref(x|y)$ & CSGM-C & CDSB-C\tabularnewline
    \midrule
    \midrule 
    Gaussian & 22.21/0.521/87.02 & 23.86/0.628/31.65 \tabularnewline 
    \midrule 
    SRFlow $\tau=0.8$ & \textbf{24.97}/0.701/26.83 & 24.34/0.674/\textbf{15.00} \tabularnewline
    \midrule
    \midrule 
    SRFlow $\tau=0.8$ & \multicolumn{2}{c}{24.83/\textbf{0.702}/30.92}\tabularnewline
    \bottomrule
    \end{tabular}
\captionof{table}{\label{tab:imagemetrics-celeba160}Results for CelebA 8x SR. Reported results are denoted in the format PSNR$\uparrow$/SSIM$\uparrow$/FID$\downarrow$. The final row reports our evaluated results of the SRFlow model. }
\end{minipage}


\begin{minipage}{0.46\textwidth}
\subfloat[$\yobs$]{\includegraphics[width=.5\linewidth]{Plots/CelebA160/im_grid_data_y}}
\subfloat[Ground truth]{\includegraphics[width=.5\linewidth]{Plots/CelebA160/im_grid_data_x}}
\vspace{-0.2cm}
\\
\subfloat[SRFlow]{\includegraphics[width=.5\linewidth]{Plots/CelebA160/im_grid_srflow}}
\subfloat[CDSB-C]{\includegraphics[width=.5\linewidth]{Plots/CelebA160/im_grid_cdsb}}
\caption{\label{fig:imagecomparison-celeba160} Paired samples for CelebA 8x SR. The SRFlow samples (c) are used as the conditional initialization for CDSB-C (d), which produces fine modifications over $N=10$ steps (best viewed when zoomed in).}
\end{minipage}

\end{figure}

\subsection{Filtering in State-Space Models}\label{sec:filtering}
Consider a state-space model defined by a bivariate Markov chain
$(X_t,Y_t)_{t\geq 1}$ of initial density $\mu(x_1)g(y_1|x_1)$ and transition
density $f(x_{t+1}|x_{t})g(y_{t+1}|x_{t+1})$ where $X_t$ is latent while $Y_t$ is observed. We are interested in estimating
sequentially in time the filtering distribution $p(x_t|\yobs_{1:t})$,
that is the posterior of $X_t$ given the observations $Y_{1:t}=\yobs_{1:t}$. We
show here how CDSB can be used at each time $t$ to obtain a sample approximation
of these filtering distributions. This CDSB-based algorithm only requires us to be
able to sample from the transition density $f(x_{t+1}|x_{t})g(y_{t+1}|x_{t+1})$ and is
thus more generally applicable than standard techniques such as particle filters
\citep{doucet2009tutorial}.

Assume at time $t$, one has a collection of samples $\{X^i_{t}\}_{i=1}^M$
distributed (approximately) according to $p(x_{t}|\yobs_{1:t})$. We 
sample $X^i_{t+1} \sim f(x_{t+1}|X^i_{t})$ and $Y^i_{t+1} \sim g(y_{t+1}|X^i_{t+1})$. The
resulting samples $\{X^i_{t+1},Y^i_{t+1}\}_{i=1}^M$ are thus distributed according to
$\pjoin(x_{t+1},y_{t+1}):=p(x_{t+1},y_{t+1}|\yobs_{1:t})$. We can also easily obtain samples
from $\pjref(x_{t+1},y_{t+1}):=\pref(x_{t+1}|y_{t+1},\yobs_{1:t}) p(y_{t+1}|\yobs_{1:t})$ where
$\pref(x_{t+1}|y_{t+1},\yobs_{1:t})$ is an easy-to-sample distribution designed by the
user. Thus we can use CDSB to obtain a (stochastic) transport map between
$\pjoin(x_{t+1},y_{t+1})$ and $\pjref(x_{t+1},y_{t+1})$ and applying it to $Y_{t+1}=\yobs_{t+1}$, we can
obtain new samples from $p(x_{t+1}|\yobs_{1:t+1})$.  A similar strategy for filtering
based on deterministic transport maps was recently proposed by
\cite{spantini2019coupling}.

We apply CSGM and CDSB to the Lorenz-63 model \citep{law2015data}
following the procedure above for a time series of length 2000. 
We consider a short diffusion process with $N=20$
steps, as well as a long one with $N=100$. To accelerate the sequential
inference process, in this example we use analytic basis regression instead of
neural networks for all methods, and we only run 5 iterations of CDSB. 
As the EnKF is applicable
to this model, we can use the resulting approximate Gaussian filtering
distribution it outputs for $\pref(x_{t+1}|y_{t+1},\yobs_{1:t})$ in CSGM-C and CDSB-C.

Table \ref{tab:filtering} shows that for $N=20$ both CDSB and
CDSB-C successfully perform filtering and outperform the EnKF,
whereas both CSGM and CSGM-C fail to track the state accurately and diverge after a few
hundred time steps. CDSB-C achieves the lowest error consistently.
When using $N=100$, CSGM can achieve RMSE comparable with CDSB-C using $N=20$, but CDSB still provides advantages compared to CSGM. CSGM-C achieves RMSE comparable to that of CDSB-C with a suitably long diffusion process in this case. 
For smaller ensemble sizes, \eg $M=200$, occasional large errors occur for some of the runs; see supplementary for details. We conjecture that this is due to overfitting. 

\begin{table}[h]
\begin{centering}
\small
    \begin{tabular}{|c|c|c|c|}
    \hline 
    $M$ & 500 & 1000 & 2000\tabularnewline
    \hline 
    \hline 
    EnKF & .354\textpm .006 & .355\textpm .005 & .354\textpm .003\tabularnewline
    \hline 
    \hline 
    CSGM(-C) (short)& \multicolumn{3}{c|}{Diverges}\tabularnewline
    \hline 
    CDSB (short)& .251\textpm .011 & .218\textpm .008 & .196\textpm .005\tabularnewline
    \hline 
    CDSB-C (short)& \textbf{.236\textpm .012} & \textbf{.207\textpm .014} & \textbf{.178\textpm .007}\tabularnewline
    \hline 
    \hline 
    CSGM (long) & .232\textpm .008 & .203\textpm .009 & .182\textpm .009\tabularnewline
    \hline 
    CDSB (long) & .220\textpm .012 & .195\textpm .007 & .166\textpm .004\tabularnewline
    \hline 
    CSGM-C (long) & \textbf{.210\textpm .009} & \textbf{.185\textpm.005} & .162\textpm.004\tabularnewline
    \hline 
    CDSB-C (long) & .218\textpm .014 & \textbf{.185\textpm .008} & \textbf{.160\textpm .003}\tabularnewline
    \hline 
    \end{tabular}
\par\end{centering}
\caption{\label{tab:filtering}RMSEs over 10 runs between each algorithm's filtering means
and the ground truth filtering means for $N=20$ (short) and $N=100$ (long). }
\end{table}

\section{Discussion}
We have proposed an SB formulation of conditional simulation and an algorithm,
CDSB, to approximate its solution. The first iteration of CDSB coincides with
CSGM while subsequent ones can be thought of as refining it. 
This theoretically grounded approach is complementary to the many other techniques
that have been recently proposed to accelerate SGMs  and could
be used in conjunction with them.  However, it also suffers from limitations. As
CDSB approximates numerically the diffusion processes output by IPF, the minimum
$N$ one can pick to obtain reliable approximations is related to the steepness
of the drift of these iterates which is practically unknown. Additionally CSGM
and CDSB are only using $\yobs$ when we want to sample from $p(x|\yobs)$ but not
at the training stage. Hence if $\yobs$ is not an observation ``typical'' under
$\pobs(y)$, the approximation of the posterior can be unreliable. In the ABC
context, the best available methods rely on procedures which sample synthetic
observations in the neighbourhood of $\yobs$. It would be interesting but
challenging to extend such ideas to CSGM and CDSB. Other interesting potential
extensions include developing an amortized version of CDSB for filtering that
would avoid having to solve an SB problem at each time step, and a conditional
version of the multimarginal SB problem.
\label{sec:conclusion}

\begin{acknowledgements}
We thank James Thornton for his helpful comments. We are also grateful to the authors of \citet{kovachki2021conditional} for sharing their code with us.
\end{acknowledgements}

\bibliography{refs}

\end{document}
