\vspace{-0.1in}
\section{Convergence analysis via entropic optimal transport}


The dynamic IPF algorithm offers an efficient training scheme to fit marginals in high-dimensional problems. However, its convergence is not yet well understood in the machine learning community. To address this issue, we leverage the progress in static optimal transport on bounded domains and costs \citep{Carlier_multi, Chen16_interpolation2, Deligiannidis_21}. 

Our analysis proceeds as follows: we first draw connections between dynamic and static (primal) IPFs by projecting the path space $\mathcal{D}$ to the product space $\Pi$ and then show the equivalence between the dual and primal formulations. Next, we perturb the marginals (in terms of energy functions) and show the approximate linear convergence of the dual, the potentials, and then the static couplings. The convergence of the dynamic couplings can be expected given a reasonable estimate of the diffusion bridge.
\begin{align*}
    \text{Dynamic Primal IPF}~\eqref{dynamic_IPF_projection} \xleftrightarrow[\text{Projection}]{\text{Disintegration}} \text{Static Primal IPF \eqref{staic_IPF_projections_}}  \xleftrightarrow[\text{Lemma } \ref{duality}]{\text{Equivalence} \eqref{equiv_primal_dual}} \text{Static Dual IPF \eqref{SB_eqn_IPF}}
\end{align*}

% The success of diffusion models has inspired many promising theoretical interpretations \citep{lee2022convergence, Sitan_22_sampling_is_easy, DSB, stat_efficiency_SGM}. However, sub-optimal transport \citep{Lavenant_Santambrogio_22} has limited the efficiency of diffusion models. By contrast, Schr\"{o}dinger bridge leverages optimal transport \citep{DSB, SBP_max_llk, gefei_21, forward_backward_SDE}, which accelerates the inference and also facilitates score estimations. %The connections of Schr\"{o}dinger bridge and entropic optimal transport are detailed hereinafter.

\subsection{Equivalence between Dynamic SBP and Static SBP}




Assuming the solutions exist, the disintegration of measures implies the equivalence of solutions between the dynamic and static SBPs \citep{leonard_14}:
\begin{equation*}
\begin{split}\label{dynamic_static}
% \small
    \textbf{Dynamic SBP}\qquad \mathbb{P}_{\star}=\argmin_{\mathbb{P}\in \mathcal{D}(\mu_{\star}, \nu_{\star})} \text{KL}(\mathbb{P}\|\mathbb{Q}) \Longleftrightarrow \pi_{\star}=\argmin_{\pi\in \Pi(\mu_{\star}, \nu_{\star})} \text{KL}(\pi\|\mathcal{G}), \qquad \textbf{Static} 
\end{split}
\end{equation*}
% where $\pi\in \Pi(\mu_{\star}, \nu_{\star})$ and $\Pi$ is the space of couplings with marginals $\mu_{\star}$ and $\nu_{\star}$; $\mathcal{G}$ denotes a Gibbs measure $\dd\mathcal{G} \propto e^{-c_{\varepsilon}}\dd (\mu_{\star} \otimes \nu_{\star})$; $\otimes$ is the measure product and $c_{\varepsilon}(\bx, \by)$ is a loss function that models the transport cost between particles $\bx$ and $\by$; $\Omega$ are the parameter spaces; 
% the conditional probability of $\mathbb{P}$ (or $\mathbb{Q}$) given conditional information $\pi$ (or $\mathcal{G}$) is denoted by $\mathbb{P}_{\pi}$ (or $\mathbb{Q}_{\mathcal{G}}$) \citep{Valentin_slides}. Forcing $\mathbb{P}_{\pi}=\mathbb{Q}_{\mathcal{G}}$ yields the \emph{static} SBP with the optimal coupling $\pi_{\star}$: 
% \begin{equation}\label{static_SB}
%     \pi_{\star}=\argmin_{\pi\in \Pi(\mu_{\star}, \nu_{\star})} \text{KL}(\pi|\mathcal{G}),
% \end{equation}
where $\pi$ (respectively, $\mathcal{G}$) is the projection of the path measure $\mathbb{P}$ (respectively, $\mathbb{Q}$) on the product space at $t=0$ and $T$; $\mathrm{d}\mathcal{G} \propto e^{-c_{\varepsilon}}\mathrm{d} (\mu_{\star} \otimes \nu_{\star})$; $c_{\varepsilon}$ is a cost function. Both the dynamic and static SBP formulations yield structural properties (see Born's formula in \citet{leonard_14}) and enable us to represent the Schr\"{o}dinger bridges $\mathbb{P}_{\star}$ and $\pi_{\star}$ using the Schr\"{o}dinger potentials ${\varphi_{\star}}$ and $\psi_{\star}$:
\begin{align}
\label{main_solution_property}
    \textbf{Dynamic Structure} \quad \dd \mathbb{P}_{\star}=e^{{\varphi_{\star}}(\bx) +  \psi_{\star}(\by)} \dd \mathbb{Q} \Longleftrightarrow   \dd\pi_{\star}(\bx, \by)&=e^{{\varphi_{\star}}(\bx) +  \psi_{\star}(\by)}\dd \mathcal{G}.  \quad \textbf{Static}
\end{align} 
Moreover, the sum ${\varphi_{\star}}\oplus\psi_{\star}$ is unique, whereas the individual potentials are determined only up to an additive constant, i.e., $({\varphi_{\star}}+a)\oplus(\psi_{\star}-a)$ is also viable for any constant $a$.


This static structural representation establishes a connection between the static SBP and entropic optimal transport (EOT) with a unit entropy regularizer \citep{provably_schrodinger_bridge}, and the latter results in an efficient scheme to compute the optimal coupling: 
\begin{equation*}\label{EOT_problem_supp}
    \inf_{\pi\in \Pi(\mu_{\star}, \nu_{\star})} \iint_{\Omega^2}  c_{\varepsilon}(\bx, \by)\pi(\dd\bx, \dd\by) + \text{KL}(\pi\|\mu_{\star} \otimes \nu_{\star}).
\end{equation*}


\subsection{Duality for Schr\"{o}dinger bridges and Approximations}


The Schr\"{o}dinger bridge is a constrained optimization problem and possesses a computation-friendly dual formulation. Moreover, the duality gap is zero under probability measures \citep{leonard_01}.

\begin{lemma}[Duality \citep{Nutz22_note}] 
\label{duality} Given assumptions \ref{ass:regularity}-\ref{ass:smooth_measure}, 
% Assume the solutions exist. \nicole{what solution? Assume $\pi \in \Pi(..)$.. also in the entropic form if c is bounded, the solution exists, no need to assume then} \Wei{maybe we change it to given assumptions 1,2,3?} t
the dual via potentials $(\varphi, \psi)$ follows
\begin{equation}\label{convave_max}
    \min_{\pi\in \Pi(\mu_{\star}, \nu_{\star})} \text{\rm KL}(\pi\|\mathcal{G})=\max_{\varphi, \psi} G(\varphi, \psi), \quad G(\varphi, \psi):=\mu_{\star}(\varphi) + \nu_{\star}(\psi) - \iint_{\Omega^2} e^{\varphi\oplus \psi} \dd \mathcal{G} + 1,
\end{equation}
where $\mu_{\star}(\varphi)=\int_{\Omega} \varphi \dd\mu_{\star}$, $\nu_{\star}(\psi)=\int_{\Omega} \psi \dd\nu_{\star}$, $\varphi\in L^1(\mu_{\star})$, and $\psi\in L^1(\nu_{\star})$. 
\end{lemma}

An effective solver is to maximize the dual $G$ via $\varphi_{k+1}=\argmax_{\varphi\in L^1(\mu_{\star})} G(\varphi, \psi_k)$ and $\psi_{k+1}=\argmax_{\psi\in L^1(\nu_{\star})} G(\varphi_{k+1}, \psi)$ alternately. From a geometric perspective, alternating maximization corresponds to alternating projections (detailed in Appendix~\ref{dual_project_relation}):
\begin{subequations}\label{sinkhorn_vs_marginal}
\begin{align}
\varphi_{k+1}=\argmax_{\varphi\in L^1(\mu_{\star})} G(\varphi, \psi_k)&\Longrightarrow \text{the first marginal of }\pi(\varphi_{k+1}, \psi_k) \text{\ is\ } \mu_{\star}, \label{sinkhorn_vs_marginal1}\\
\psi_{k+1}=\argmax_{\psi\in L^1(\nu_{\star})} G(\varphi_{k+1}, \psi)&\Longrightarrow \text{the second marginal of }\pi(\varphi_{k+1}, \psi_{k+1}) \text{\ is\ } \nu_{\star}. \label{sinkhorn_vs_marginal2}
\end{align}
\end{subequations}
% \begin{align}
%     \varphi_{k+1}=\argmax_{\varphi} G(\cdot, \psi_k)&\Longrightarrow \text{the first marginal of }\dd\pi(\varphi_{k+1}, \psi_k) \text{\ is\ } \mu_{\star}\label{sinkhorn_vs_marginal1}\\
%     \psi_{k+1}=\argmax_{\psi} G(\varphi_{k+1}, \cdot)&\Longrightarrow \text{the second marginal of }\dd\pi(\varphi_{k+1}, \psi_{k+1}) \text{\ is\ } \nu_{\star},\label{sinkhorn_vs_marginal2}
% \end{align}
% where fitting the marginals of the coupling $\pi$ alternatingly is the well-known iterative proportional fitting (IPF) algorithm  (also known as Sinkhorn algorithm) \citep{IPF_95}.

The marginal properties of the coupling imply the Schr\"{o}dinger equations \citep{Nutz_22_a}:
\begin{equation*}\label{SB_eqn}
       \varphi_{\star}(\bx)= -\log \int_{\Omega} e^{ \psi_{\star}(\by)-c_{\varepsilon}(\bx,\by)} \nu_{\star}(\dd\by),\quad \psi_{\star}(\by)=-\log \int_{\Omega} e^{ \varphi_{\star}(\bx)-c_{\varepsilon}(\bx,\by)} \mu_{\star}(\dd\bx).
\end{equation*}


Since the Schr\"{o}dinger potential functions $(\psi_{\star}, \varphi_{\star})$ are not known \emph{a priori}, the dual formulation of the static IPF algorithm was proposed to solve the alternating projections as follows:
\begin{equation}\label{SB_eqn_IPF}
     \small{\textbf{Static Dual IPF}}:\ \psi_{k}(\by)=-\log \int_{\Omega} e^{ \varphi_{k}(\bx)-c_{\varepsilon}(\bx,\by)} \mu_{\star}(\dd\bx),\  \varphi_{k+1}(\bx)= -\log \int_{\Omega} e^{ \psi_{k}(\by)-c_{\varepsilon}(\bx,\by)} \nu_{\star}(\dd\by).
\end{equation}
The equivalence between the primal IPF and dual IPF is further illustrated in Appendix \ref{equiv_primal_dual}.


However, given a limited computational budget, projecting to the ideal measure $\mu_{\star}$ (or $\nu_{\star}$) in Eq.\eqref{sinkhorn_vs_marginal} at each iteration may not be practical.  Instead, some close approximation $\mu_{\star, k+1}$ (or $\nu_{\star, k}$) is used at iteration $2k+1$ (or $2k$) via Gaussian processes \citep{SBP_max_llk} or neural networks \citep{DSB, forward_backward_SDE}. Therefore, one may resort to an approximate marginal that still achieves reasonable accuracy:
\begin{equation}
\begin{split}
\label{marginal_eqn}
    \mu_{2k+1}&=\mu_{\star, k+1}\approx \mu_{\star}, \quad \nu_{2k}=\nu_{\star, k}\approx \nu_{\star}.
\end{split}
\end{equation}
We refer to the IPF algorithm with approximate marginals as approximate IPF (aIPF) and present the static dual formulation of aIPF in Algorithm \ref{sinkhorn}. \begin{wrapfigure}{r}{0.4\textwidth}
   \begin{center}
   \vskip -0.2in
     \includegraphics[width=0.4\textwidth]{figures/sinhorn_compact_v4.png}
   \end{center}
   \vskip -0.2in
   \caption{IPF vs.\ aIPF. The approximate (or exact) projections are highlighted through the dotted (or solid) lines.}%The dynamic IPF also models the trajectories of the projections.}
   \vskip -0.25in
   \label{fig:sinkhorn}
\end{wrapfigure} The difference between IPF and aIPF is detailed in Figure \ref{fig:sinkhorn}.  
The structural representation \eqref{main_solution_property} can be naturally extended based on approximate marginals and is also studied by \citet{Deligiannidis_21}:
\begin{equation}
\begin{split}
    \dd \pi_{2k}&=e^{ \varphi_k\oplus  \psi_k-c_{\varepsilon}}\dd(\mu_{\star, k}\otimes \nu_{\star, k}), \\
    \dd  \pi_{2k-1}&=e^{ \varphi_k\oplus  \psi_{k-1}-c_{\varepsilon}}\dd(\mu_{\star, k}\otimes \nu_{\star, k-1}),\label{approx_potential}
\end{split}
\end{equation}
% \begin{align}
% \end{align}
where $\pi_{k}$ is the approximate coupling at iteration $k$. By the structural properties in Eq.\eqref{main_solution_property}, the representation also applies to the dynamic settings, which involves the computation of the static IPF, followed by its integration with a diffusion bridge \citep{Nutz_quantization}.

$\newline$

\begin{algorithm}[ht]
\caption{One iteration of aIPF (static). The static coupling $\pi_k$ can be recovered by the structural representation in \eqref{approx_potential}; the dynamic coupling $\mathbb{P}_k=\iint_{\Omega^2} \mathbb{P}_{k}^{\textbf{x}_0,\textbf{x}_T}(\cdot)\pi_k(\mathrm{d}\textbf{x}_0, \mathrm{d}\textbf{x}_T)$ can be obtained by further learning a diffusion bridge $\mathbb{P}_{k}^{\textbf{x}_0,\textbf{x}_T}$.
}\label{sinkhorn}
\begin{equation}\label{FB-SDE_alg}
    \psi_k(\by)=-\log \int_{\Omega} e^{ \varphi_k(\bx)-c_{\varepsilon}(\bx,\by)} \mu_{\star, k}(\dd\bx),\quad \varphi_{k+1}(\bx)= -\log \int_{\Omega} e^{ \psi_k(\by)-c_{\varepsilon}(\bx,\by)} \nu_{\star, k}(\dd\by).
\end{equation}
\end{algorithm}


\subsection{Convergence of Couplings with bounded domain}

% Our analysis relies on 

% The Schr\"{o}dinger bridge problem (SBP) has made significant progress both theoretically and empirically \citep{Nutz_22_a, Nutz_sinkhorn_order2, Nutz_SIAM_Math, DSB, gefei_21, forward_backward_SDE, SBP_max_llk}. In particular, % . To the best of our knowledge, the stability analysis of IPF is less explored in the literature except \citep{Deligiannidis_21} and \citep{Nutz_22_a}. \citep{Nutz_quantization}.

Despite the rich literature on the analysis of SBP within bounded domains \citep{Chen16_interpolation2}, most existing results are not applicable to practical scenarios where exact marginals are not available. To fill this gap, we extend the linear convergence analysis to perturbed marginals. The key to our proof is the strong concavity of the dual \eqref{convave_max}. To quantify the convergence, similar to \citet{diffusion_manifold}, we introduce an additional assumption to control the perturbation of the marginals in the sense that:

\begin{assump}[Marginal perturbation]\label{ass:approx_score}
$U_k=\nabla \log \frac{\dd\mu_{\star,k}}{\dd \bx}$ and $V_k=\nabla \log \frac{\dd\nu_{\star, k}}{\dd \bx}$ are the approximate energy functions at the $k$-th iteration and are $\epsilon$-close to the energy functions $U_{\star}$ and $V_{\star}$:
\begin{equation*}
    \begin{split}
        \label{approximate_error_supp}
        \big\|U_k(\bx)-U_{\star}(\bx)\big\|_2 \leq \epsilon(1+\|\bx\|_2), \ \ \big\|V_k(\bx)-V_{\star}(\bx)\big\|_{2} &\leq \epsilon(1+\|\bx\|_2), \quad \forall \bx \in \Omega.
    \end{split}
    \end{equation*}
\end{assump}

Note that a Lipschitz cost function on $\Omega^2$ is also a standard assumption \citep{Deligiannidis_21}. It is not imposed separately here because Assumption~\ref{ass:regularity} already leads to a smooth transition kernel and cost function. 

Recalling the connections between the dynamic primal IPF and the static dual IPF, we know that $\epsilon$ mainly depends on the estimation of the score functions $(\overrightarrow\bz_t^{\theta}, \overleftarrow\bz_t^{\omega})$ \citep{song_likelihood_training} and on numerical discretizations. More concrete connections between them are left as future work. In addition, the errors in the two marginals do not have to be the same; we use a unified $\epsilon$ mainly for analytical convenience. 

Moreover, we use the same domain $\Omega$ for both marginals to be consistent with our algorithm in Section \ref{rSB_}. The proof can be easily extended to different domains $\bX$ and $\bY$ for $\mu_{\star}$ and $\nu_{\star}$.


\paragraph{Approximately linear convergence and proof sketches} We first follow \citet{Carlier_multi, Nutz22_note, Marino_2020} to build a \emph{centered} aIPF algorithm in Algorithm \ref{center_sinkhorn} with scaled potential functions $\bar\varphi_k$ and $\bar\psi_k$ such that $\mu_{\star}(\bar\varphi_k)=0$. Since the summations of the potentials ${\varphi_{\star}}$ and $\psi_{\star}$ are unique by \eqref{main_solution_property}, the \emph{centering} operation doesn't change the dual objective but ensures that the aIPF iterates are uniformly bounded in Lemma \ref{upper_bound_of_varphi_psi} with the help of the decomposition
\begin{equation*}
    {\|\bar\varphi \oplus \bar\psi \|^2_{L^2(\mu_{\star}\otimes \nu_{\star})}=\|\bar\varphi \|^2_{L^2(\mu_{\star})}+\| \bar\psi \|^2_{L^2(\nu_{\star})} \quad \text{ if }\ \ \mu_{\star}(\bar\varphi)=0.}
\end{equation*}
How to ensure centering with perturbed marginals in Algorithm \ref{center_sinkhorn} is crucial and one major novelty in our proof. We next exploit the \emph{strong convexity} of the exponential function $e^x$ associated with the concave dual. 
%%% recover it if image data cannot reproduced.
% in a supporting Lemma \ref{derive_property} to show a key result $G(\bar \varphi_{\star}, \bar \psi_{\star})-G(\bar \varphi_k, \bar \psi_k)\lesssim \|\bar \varphi_{k+1}-\bar \varphi_{k}\|_{L^2(\mu_{\star})}^2+O(\epsilon)$ in Lemma \ref{main_theorem}, where $\lesssim$ denotes less than similar to. Together with the upper bound $\|\bar \varphi_{k+1}-\bar \varphi_{k}\|_{L^2(\mu_{\star})}^2 \lesssim G(\bar \varphi_{k+1}, \bar \psi_{k+1})-G(\bar \varphi_{k}, \bar \psi_{k})+ O(\epsilon)$ in Lemma \ref{iterate_bound}, we can derive the desired contraction properties for the dual objective with a contraction factor $\beta_{\varepsilon}\in (0, 1)$: $$G(\bar \varphi_{\star}, \bar \psi_{\star})-G(\bar \varphi_{k+1}, \bar \psi_{k+1})\leq \beta_{\varepsilon}\bigg(G(\bar \varphi_{\star}, \bar \psi_{\star})-G(\bar \varphi_{k}, \bar \psi_{k})\bigg) + O(\epsilon).$$
We obtain an auxiliary result regarding the convergence of the dual and the potentials.

\begin{lemma}[Convergence of the Dual and Potentials]\label{main_theorem}
Let $(\bar\varphi_k, \bar\psi_k)_{k\geq 0}$ be the iterates of a variant of Algorithm \ref{sinkhorn}. Given assumptions \ref{ass:regularity}-\ref{ass:approx_score} with small enough marginal perturbations $\epsilon$, we have
\begin{align*}
    G(\bar\varphi_{\star}, \bar\psi_{\star})-G(\bar\varphi_k, \bar\psi_k)&\lesssim (1-e^{-24 \|c_{\varepsilon}\|_{\infty}})^k  + 
 e^{24\|c_{\varepsilon}\|_{\infty}}\epsilon,\\
    \|\bar \varphi_{\star}-\bar \varphi_k\|_{L^2(\mu_{\star})}+\|\bar \psi_{\star}-\bar \psi_k\|_{L^2(\nu_{\star})} &\lesssim 
e^{3\|c_{\varepsilon}\|_{\infty}}(1-e^{-24 \|c_{\varepsilon}\|_{\infty}})^{k/2}+ e^{15\|c_{\varepsilon}\|_{\infty}}\epsilon^{1/2}.
\end{align*}
\end{lemma}


Since the centering operation does not change the structural property \eqref{main_solution_property}, we are able to analyze the convergence of the static couplings. Motivated by Theorem 3 of \citet{Deligiannidis_21}, we exploit the structural property \eqref{main_solution_property} to estimate the $\mathbf{W}_1$ distance based on its dual formulation. 

\begin{theorem}[Convergence of Static Couplings]\label{coupling_convergence}
Given assumptions \ref{ass:regularity}-\ref{ass:approx_score} with small marginal perturbations $\epsilon$, the iterates of the couplings $(\pi_k)_{k\geq 0}$ in Algorithm \ref{sinkhorn} satisfy the following result
    \begin{align*}
        \mathbf{W}_1(\pi_k, \pi_{\star})\leq O(e^{9\|c_{\varepsilon}\|_{\infty}}(1-e^{-24 \|c_{\varepsilon}\|_{\infty}})^{k/2}+ e^{21\|c_{\varepsilon}\|_{\infty}}\epsilon^{1/2}).
    \end{align*}
\end{theorem}

Such a result provides the worst-case guarantee on the convergence of the static couplings $\pi_k$. For example, to obtain an $\epsilon_{\star}$-accurate coupling in the $\mathbf{W}_1$ distance, we can run $\Omega(e^{24\|c_{\varepsilon}\|_{\infty}}(\|c_{\varepsilon}\|_{\infty}-\log(\epsilon_{\star}\wedge 1)))$ 
% \nicole{derivation?} \Wei{clarified the contraction rate, may not need to explain in details}
iterations to achieve the goal. Since $c_{\varepsilon}=c/\varepsilon$ \citep{provably_schrodinger_bridge}, a large entropic regularizer $\varepsilon$ may be needed in practice to yield reasonable performance, which also provides specific tuning guidance on $\varepsilon$.

Our proof employs a non-geometric method to show the uniform-in-time stability with respect to the marginals. Unlike the elegant approach \citep{Deligiannidis_21} based on the Hilbert--Birkhoff projective metric \citep{Chen16_interpolation2}, ours does not require advanced tools and may be more accessible to readers.


Recalling the bridge representation in Eq.~\eqref{bridge_representation}, we have $\mathbf{W}_1(\pi_k\otimes \mathbb{P}_k^{\mu_{\star}, \nu_{\star}}, \pi_{\star}\otimes \mathbb{P}_{\star}^{\mu_{\star}, \nu_{\star}})\leq \mathbf{W}_1(\pi_k, \pi_{\star})+\mathbf{W}_1(\mathbb{P}_k^{\mu_{\star}, \nu_{\star}}, \mathbb{P}_{\star}^{\mu_{\star}, \nu_{\star}})$. Under the same assumptions as in Theorem~\ref{coupling_convergence}, we arrive at the final result:
\begin{proposition}[Convergence of Dynamic Couplings]\label{coupling_convergence_dynamic}
The iterates of the dynamic couplings $(\mathbb{P}_k)_{k\geq 0}$ in Algorithm \ref{primal_dyanmic_IPF} satisfy the following result
    \begin{align*}
        \mathbf{W}_1(\mathbb{P}_k, \mathbb{P}_{\star})\leq O(e^{9\|c_{\varepsilon}\|_{\infty}}(1-e^{-24 \|c_{\varepsilon}\|_{\infty}})^{k/2}+ e^{21\|c_{\varepsilon}\|_{\infty}}\epsilon^{1/2})+\mathbf{W}_1(\mathbb{P}_k^{\mu_{\star}, \nu_{\star}}, \mathbb{P}_{\star}^{\mu_{\star}, \nu_{\star}}).
    \end{align*}
\end{proposition}
% \nicole{do we need the 'bridge' superscript? seems the same with $\mathbb{P}$ defined before} \Wei{modified the notation to be consistent}
The result paves the way for understanding the general convergence of the dynamic IPF algorithm by incorporating a proper approximation of the diffusion bridge \citep{Diffusion_bridge}.





