


% \section{Other preliminaries}

% \subsection{The Structure Property for Static Schr\"{o}dinger Bridge Problem (SBP)}

% \Wei{include dynamic or delete?}
% % \Wei{re-phrase the lemma since it is the same as ICML draft.}
% \label{static_SB_property}
% \begin{lemma}[Structure Property of Static SBP \citep{Compute_OT, Nutz22_note}]\label{solution_property}
% Suppose $\mathcal{G}$ is cyclical invariant with respect to $ \mu_{\star}\otimes \nu_{\star}$ \citep{Nutz_22_func} and $\dd\mathcal{G} \propto e^{-c_{\varepsilon}}\dd (\mu_{\star} \otimes \nu_{\star})$. %defined in section 2.2 \citep{Nutz22_note}. 
% Consider the static SBP with a unique solution $\pi_{\star}$ $$\pi_{\star} = \argmin_{\pi\in \Pi(\mu_{\star}, \nu_{\star})} \text{KL}(\cdot| \mathcal{G}).$$

% \begin{itemize}
%     \item Schr\"{o}dinger potentials $ \varphi_{\star}$ and $ \psi_{\star}: \Omega\rightarrow [-\infty, \infty)$ exist and follow that
%     \begin{equation}\label{formulation}
%     \frac{\dd\pi_{\star}}{\dd \mathcal{G}}=e^{ \varphi_{\star}\oplus \psi_{\star}},
%     \end{equation}
%     where $( \varphi_{\star}\oplus  \psi_{\star})(\bx, \by)= \varphi_{\star}(\bx)+ \psi_{\star}(\by)$ with a unique summation. Namely, the scaled potential $(\bar\varphi_{\star}, \bar\psi_{\star})$ are also Schr\"{o}dinger potentials if there exists a scalar $\lambda$ such that
%     \begin{equation}\label{scaled_invariant}
%          (\bar\varphi_{\star}, \bar\psi_{\star})=(\varphi_{\star}-\lambda, \psi_{\star}+\lambda).
%     \end{equation}
    
%     \item If there exists a solution ${\pi}_0$ that satisfies
%     \begin{equation*}
%     \frac{\dd{\pi}_0}{\dd \mathcal{G}}=e^{ \varphi_{\star}\oplus \psi_{\star}},
%     \end{equation*}
%     for functions $ \varphi_{\star}: \Omega\rightarrow [-\infty, \infty)$ and $ \psi_{\star}: \Omega\rightarrow [-\infty, \infty)$, ${\pi}_0$ is the Schr\"{o}dinger bridge.
% \end{itemize}

% \end{lemma}


\section{Convergence of dual, potentials, and couplings}




Next, we modify Algorithm \ref{sinkhorn} following the centering method developed in \citep{Carlier_multi}.


\begin{algorithm}[h]\caption{Centered Sinkhorn. Set $\bar \varphi_0:=0$. For $k\geq 0$, the iterate follows}\label{center_sinkhorn}
\begin{align}
    \bar \psi_{k}(\by)&:=-\log \int_{\Omega} e^{\bar \varphi_k(\bx)-c_{\varepsilon}(\bx,\by)}\mu_{\star, k}(\dd\bx)\label{bar_psi}\\
    \bar \varphi_{k+1}(\bx)&:=-\log \int_{\Omega} e^{\bar \psi_k(\by)-c_{\varepsilon}(\bx,\by)}\nu_{\star, k}(\dd\by) + \lambda_k, \quad\text{where}\label{bar_varphi}\\
    \lambda_k&:=\int_{\Omega} \log \left(\int_{\Omega} e^{\bar \psi_k(\by)-c_{\varepsilon}(\bx, \by)}\nu_{\star, k}(\dd\by)\right)\mu_{\star}(\dd\bx).\notag
    % \nicole{\mu_{\star,k+1}}
\end{align}
\end{algorithm}
% \nicole{explain the choice of $\mu$ in $\lambda_k$, and maybe also the duality gap when approximated} % Wei: added explainations.
% \nicole{$\bar{\varphi}_0 = \varphi_0 = 0$, thus $\bar{\psi}_0 = \widebar{\psi}_0$, $\bar{\varphi}_1 = \varphi_1 + \lambda_0$. Also from $\mu_{\star,k}(\varphi_k - (\lambda_0 + \ldots + \lambda_{t-1})) =\mu_{\star,k}(\bar{\varphi}_k) := 0$, so
% \[
% \mu_{\star,k}(\varphi_t) = -\lambda_0 - \ldots - \lambda_{t-1}
% \]
% Thus by induction, it follows
% \[
% \Bar{\varphi}_t = \varphi_t - \mu_{\star,k}(\varphi_k), \quad \Bar{\psi}_t = \psi_t + \mu_{\star,k}(\varphi_k)
% \]
% we can also show 
% $G(\bar\varphi_t, \bar\psi_t)=G(\varphi_t, \psi_t)$. }

The algorithm differs from Algorithm \ref{sinkhorn} in that an additional centering operation is included in the updates of $\bar \varphi_{k+1}$ to ensure $\mu_{\star}(\bar \varphi_{k+1})=0$. Notably, $\mu_{\star}$ is required for the centering operation to upper bound the divergence, although it is not directly accessible and no implementation is needed. The main contribution of the centering operation is that the two coordinates $(\bar\varphi, \bar\psi )$ become separable
\begin{equation}\label{decompose}
    {\|\bar\varphi \oplus \bar\psi \|^2_{L^2(\mu_{\star}\otimes \nu_{\star})}=\|\bar\varphi \|^2_{L^2(\mu_{\star})}+\| \bar\psi \|^2_{L^2(\nu_{\star})} \quad \text{ if }\ \ \mu_{\star}(\bar\varphi)=0.}
\end{equation}




The coordinate ascent is equivalent to the following updates
\begin{equation*}
    \bar \psi_k=\argmax_{\bar \psi\in L^1(\nu_{\star})} G(\bar \varphi_k, \bar \psi),\quad \bar \varphi_{k+1}=\argmax_{\bar \varphi\in L^1(\mu_{\star}): \mu_{\star}(\bar \varphi)=0} G(\bar \varphi, \bar \psi_k).
\end{equation*}


The relation between the Schr\"{o}dinger potentials $( \varphi_k,  \psi_k)$ and centered Schr\"{o}dinger potentials $(\bar \varphi_k, \bar \psi_k)$ is characterized as follows
\begin{lemma}\label{simple_result}
Denote by $( \varphi_k,  \psi_k)$ the Sinkhorn iterates in Algorithm \ref{sinkhorn}. For all $k\geq 0$, $\mu_{\star}( \varphi_k)=-(\lambda_0+\cdots+\lambda_{k-1})$. Moreover, we have 
\begin{equation*}
    \bar \varphi_k= \varphi_k-\mu_{\star}( \varphi_k),\quad \bar \psi_k= \psi_k +\mu_{\star}( \varphi_k).
\end{equation*}
In particular, $\bar \varphi_k\oplus \bar \psi_k= \varphi_k\oplus  \psi_k$ and $G(\bar \varphi_k, \bar \psi_k)=G( \varphi_k,  \psi_k)$.
\end{lemma}

\begin{proof}
Applying the induction method completes the proof directly.
\qed
\end{proof}

Recall how $\bar \psi_k$ is defined through the Schr\"{o}dinger equation
\begin{equation*}\label{center_2nd_marginal}
    \text{The second marginal of\ } \pi_{2k}(\bar \varphi_k, \bar \psi_k)=e^{\bar \varphi_k\oplus\bar \psi_k-c_{\varepsilon}}\dd(\mu_{\star, k}\otimes \nu_{\star, k}) \text{ is }\nu_{\star, k},
\end{equation*}
as in Eq.\eqref{marginal_eqn}. However, $\dd\pi_{2k+1}(\bar \varphi_{k+1}, \bar \psi_k)=e^{\bar \varphi_{k+1}\oplus \bar \psi_k-c_{\varepsilon}}\dd(\mu_{\star, k}\otimes \nu_{\star, k})$ fails to yield the first marginal $\mu_{\star, k}$ due to the centering constraint.




Next, we show the modified iterates are bounded by the cost function $c$.

\begin{lemma}\label{upper_bound_of_varphi_psi}
For every $k\geq 0$, the potentials are bounded by
\begin{equation*}
\label{eq:ccombine}
    \|\bar \varphi_k\|_{\infty}\leq 2\|c_{\varepsilon}\|_{\infty},\quad  \|\bar \psi_k\|_{\infty}\leq 3\|c_{\varepsilon}\|_{\infty}.
\end{equation*}
\end{lemma}

\begin{proof} By Assumption \ref{ass:regularity}, the transition kernel $\mathcal{K}(\bx,\by)=e^{-c_{\varepsilon}(\bx,\by)}$ associated with $\dd\bx_t=\bbf(\bx_t, t)\dd t + \sqrt{2\varepsilon}g(t)\dd \mathbf{w}_t+\bn(\bx)\dd \mathbf{L}_t$ is smooth in $\Omega$; hence the cost function $c_{\varepsilon}$ is Lipschitz continuous on the bounded domain $\Omega$, which implies that it is bounded, i.e., $\|c_{\varepsilon}\|_{\infty}<\infty$.



Recall the definition of $\bar \varphi_{k+1}$ in  Algorithm \ref{center_sinkhorn}, we have $\forall \bx_1, \bx_2 \in \Omega$,
\begin{equation*}
\label{eq: cinf}
\begin{split}
    &\ \ \ \bar \varphi_{k+1}(\bx_1)- \bar \varphi_{k+1}(\bx_2)\\
    &=\log \int_{\Omega} e^{\bar \psi_k(\by)-c_{\varepsilon}(\bx_2, \by)}\nu_{\star, k}(\dd\by) - \log \int_{\Omega} e^{\bar \psi_k(\by)-c_{\varepsilon}(\bx_1, \by)}\nu_{\star, k}(\dd\by)\\
    &\leq \log \left[e^{\sup_{\by\in\Omega} |c_{\varepsilon}(\bx_1, \by)-c_{\varepsilon}(\bx_2, \by)|}\int_{\Omega} e^{\bar \psi_k(\by) -c_{\varepsilon}(\bx_1, \by)}\nu_{\star, k}(\dd\by)\right]-\log \int_{\Omega} e^{\bar \psi_k(\by)-c_{\varepsilon}(\bx_1, \by)}\nu_{\star, k}(\dd\by)\\
    &=\sup_{\by\in \Omega} |c_{\varepsilon}(\bx_1, \by)-c_{\varepsilon}(\bx_2, \by)|\leq 2\|c_{\varepsilon}\|_{\infty}.
\end{split}
\end{equation*}

As $\mu_{\star}(\bar \varphi_k)=0$, we have $\sup_{\bx} \bar \varphi_k(\bx)\geq 0$ and $\inf_{\bx} \bar \varphi_k(\bx)\leq 0$, hence the above implies $\|\bar \varphi_k\|_{\infty}\leq 2\|c_{\varepsilon}\|_{\infty}$. The definition of $\bar \psi_k$ in Eq.\eqref{bar_psi} yields $\|\bar \psi_k\|_{\infty}\leq \|\bar \varphi_k\|_{\infty}+\|c_{\varepsilon}\|_{\infty}\leq 3\|c_{\varepsilon}\|_{\infty}$.
\qed
\end{proof}



% \Wei{we don't need $\epsilon$ any more (used in nips submission, but not this version), just use $\epsilon$}

The key to the proof is to adopt the strong convexity of the function $e^x$ for $x\in[-\alpha, \infty)$ and some constant $\alpha\in\mathbb{R}$,
\begin{equation}\label{convex_exp_func}
    e^b-e^a\geq (b-a)e^a + \frac{e^{-\alpha}}{2}|b-a|^2 \quad \text{for } a, b\in [-\alpha, \infty).
\end{equation}

We also present two supporting lemmas in order to complete the proof
\begin{lemma}\label{derive_property}
Given $\varphi, \varphi'\in L^2(\mu_{\star})$ and $\psi, \psi'\in L^2(\nu_{\star})$, and define
\begin{equation}\label{def_G1_G2}
\begin{split}
    \partial_1 G(\varphi, \psi)(\bx) = 1- \int_{\Omega} e^{\varphi(\bx)+\psi(\by)-c_{\varepsilon}(\bx, \by)} \nu_{\star}(\dd\by)\\
    \partial_2 G(\varphi, \psi)(\by) = 1- \int_{\Omega} e^{\varphi(\bx)+\psi(\by)-c_{\varepsilon}(\bx, \by)} \mu_{\star}(\dd\bx).
\end{split}
\end{equation}
If both $\varphi\oplus \psi-c_{\varepsilon}\geq -\alpha$ and $\varphi'\oplus \psi'-c_{\varepsilon}\geq -\alpha$ for some $\alpha\in \mathbb{R}$, we have 
\begin{equation*}
\begin{split}
    G(\varphi', \psi')-G(\varphi, \psi)&\geq \ \int_{\Omega} \partial_1 G(\varphi', \psi')(\bx)[\varphi'(\bx)-\varphi(\bx)]\mu_{\star}(\dd\bx)\\
    &\ \ + \int_{\Omega} \partial_2 G(\varphi', \psi')(\by)[\psi'(\by)-\psi(\by)]\nu_{\star}(\dd\by)\\
    & \ \ + \frac{e^{-\alpha}}{2}\|(\varphi-\varphi')\oplus (\psi-\psi')\|^2_{L^2(\mu_{\star}\otimes \nu_{\star})}.
\end{split}
\end{equation*}
\end{lemma}


\begin{proof} By Eq.\eqref{convex_exp_func}, we have
\begin{align*}
    &\ \ \ G(\varphi', \psi')-G(\varphi, \psi)\\
    &=\mu_{\star}(\varphi'-\varphi)+\nu_{\star}(\psi'-\psi)+\iint_{\Omega^2} (e^{\varphi\oplus\psi-c_{\varepsilon}} - e^{\varphi'\oplus\psi'-c_{\varepsilon}})\dd(\mu_{\star}\otimes \nu_{\star})\\
    &\geq \mu_{\star}(\varphi'-\varphi)+\nu_{\star}(\psi'-\psi)+\iint_{\Omega^2} (\varphi\oplus\psi-\varphi'\oplus\psi')e^{\varphi'\oplus\psi'-c_{\varepsilon}}\dd(\mu_{\star}\otimes\nu_{\star})\\
    &\qquad\qquad+\frac{e^{-\alpha}}{2} \iint_{\Omega^2}|\varphi\oplus\psi-\varphi'\oplus\psi'|^2 \dd (\mu_{\star} \otimes \nu_{\star})  \\
    &=\int_{\Omega} \partial_1 G(\varphi', \psi')(\bx)[\varphi'(\bx)-\varphi(\bx)]\mu_{\star}(\dd\bx)+\int_{\Omega} \partial_2 G(\varphi', \psi')(\by)[\psi'(\by)-\psi(\by)]\nu_{\star}(\dd\by)\\
    &\quad +\frac{e^{-\alpha}}{2}\|(\varphi-\varphi')\oplus (\psi-\psi')\|^2_{L^2(\mu_{\star} \otimes \nu_{\star})}.
\end{align*} \qed
\end{proof}




\begin{lemma}\label{iterate_bound}
Given a small $\epsilon\leq \frac{1}{(D+1)^2}$, we have
\begin{align*}
    G(\bar \varphi_{k+1}, \bar \psi_{k+1})-G(\bar \varphi_{k}, \bar \psi_{k})\geq \frac{\sigma}{2}\left(\|\bar \varphi_{k+1}-\bar \varphi_{k}\|_{L^2(\mu_{\star})}^2+\|\bar \psi_{k+1}- \bar \psi_{k}\|_{L^2(\nu_{\star})}^2\right)-O(\epsilon),
\end{align*}
where $\sigma:=e^{-6\|c_{\varepsilon}\|_{\infty}}$; the constant hidden in the big-O notation mainly depends on the volume of the domain $\Omega$.
\end{lemma}

\begin{proof}
We first decompose the LHS as follows
\begin{equation*}
    G(\bar \varphi_{k+1}, \bar \psi_{k+1})-G(\bar \varphi_{k}, \bar \psi_{k})=\underbrace{G(\bar \varphi_{k+1}, \bar \psi_{k+1})-G(\bar \varphi_{k+1}, \bar \psi_{k})}_{\mathrm{I}}+\underbrace{G(\bar \varphi_{k+1}, \bar \psi_{k})-G(\bar \varphi_{k}, \bar \psi_{k})}_{\mathrm{II}}.
\end{equation*}

For the estimate of $\mathrm{I}$, by Lemma  \ref{derive_property} with $\sigma=e^{-6\|c_{\varepsilon}\|_{\infty}}$, we have
\begin{equation*}
    \mathrm{I}\geq \int_{\Omega} \partial_2 G(\bar \varphi_{k+1}, \bar \psi_{k+1})(\by)[\bar \psi_{k+1}(\by)-\bar \psi_k(\by)]\nu_{\star}(\dd\by)+ \frac{\sigma}{2}\|\bar \psi_k-\bar \psi_{k+1}\|^2_{L^2(\nu_{\star})}.
\end{equation*}

For the integral above, by the definition of $\partial_2 G$ in Eq.\eqref{def_G1_G2}, we have
\begin{equation}
\begin{split}\label{1st_marginal_0_before}
    &\quad \partial_2 G(\bar \varphi_{k+1}, \bar \psi_{k+1})(\by) \nu_{\star}(\dd\by)\\
    &=\nu_{\star}(\dd\by) - \int_{\Omega} e^{\bar \varphi_{k+1}(\bx)+\bar \psi_{k+1}(\by)-c_{\varepsilon}(\bx,\by)}\mu_{\star}(\dd\bx)\nu_{\star}(\dd\by)\\
    &=\nu_{\star}(\dd\by)- \int_{\Omega}\pi_{2k+2}(\dd\bx, \cdot)\frac{\dd \mu_{\star} \otimes\dd\nu_{\star}}{\dd\mu_{\star, k+1}\otimes \dd\nu_{\star, k+1}},
\end{split}
\end{equation}
where the last equality follows by the LHS of Eq.\eqref{approx_potential}, the last integral is with respect to $\bx$.

Apply Lemma \ref{control_diff_measure} with respect to $\frac{\dd \mu_{\star}}{\dd\mu_{\star, k+1}}(\bx)$ 
    \begin{equation}
    \begin{split}\label{1st_key_result}
        \quad \int_{\Omega}\pi_{2k+2}(\dd\bx, \cdot)\frac{\dd \mu_{\star} \otimes\dd\nu_{\star}}{\dd\mu_{\star, k+1}\otimes \dd\nu_{\star, k+1}} & \leq  \int_{\Omega} \big(1+O(\epsilon)\big)\pi_{2k+2}(\dd\bx, \cdot) \frac{\nu_{\star}(\dd\by)}{\nu_{\star, k+1}(\dd\by)} \\
        & \leq \big(1+O(\epsilon)\big) \nu_{\star, k+1}(\dd\by)\frac{ \nu_{\star}(\dd\by)}{\nu_{\star, k+1}(\dd\by)} \\
        &= \big(1+O(\epsilon)\big) \nu_{\star}(\dd\by),
    \end{split}
    \end{equation}
where the second inequality is derived by the fact that the second marginal of $\pi_{2k+2}$ is $\nu_{\star, k+1}$ in Eq.\eqref{marginal_eqn}. Similarly, we can show $\int_{\Omega}\pi_{2k+2}(\dd\bx, \cdot)\frac{\dd \mu_{\star} \otimes\dd\nu_{\star}}{\dd\mu_{\star, k+1}\otimes \dd\nu_{\star, k+1}}\gtrsim (1-O(\epsilon))\nu_{\star}(\dd\by)$.

Combining Eq.\eqref{1st_marginal_0_before} and \eqref{1st_key_result}, we have
\begin{equation}\label{partial2_G_upper}
\begin{split}
    & |\partial_2 G(\bar \varphi_{k+1}, \bar \psi_{k+1})(\by) \nu_{\star}(\dd\by)| \lesssim \epsilon \nu_{\star}(\dd\by).
\end{split}
\end{equation}

We now build the lower bound of the integral as follows
\begin{equation}
\begin{split}\label{1st_marginal_0}
    &\quad \int_{\Omega}\partial_2 G(\bar \varphi_{k+1}, \bar \psi_{k+1})(\by)[\bar \psi_{k+1}(\by)-\bar \psi_k(\by)]\nu_{\star}(\dd\by)\\
    &\gtrsim -\epsilon \int_{\Omega}  \big|\bar \psi_{k+1}(\by)-\bar \psi_k(\by)\big|\nu_{\star}(\dd \by)\\
    &\gtrsim -\epsilon,
\end{split}
\end{equation}
where the first inequality follows by Eq.\eqref{partial2_G_upper} and the second inequality follows by the boundedness of the potential function in Lemma \ref{upper_bound_of_varphi_psi}. The above means that $ \mathrm{I}\geq \frac{\sigma}{2}\|\bar \psi_k-\bar \psi_{k+1}\|^2_{L^2(\nu_{\star})} -O(\epsilon)$. For the estimate of $\mathrm{II}$, Lemma  \ref{derive_property} yields 
\begin{equation*}
    \mathrm{II}\geq \int_{\Omega} \partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})(\bx)[\bar \varphi_{k+1}(\bx)-\bar \varphi_k(\bx)]\mu_{\star}(\dd\bx)+ \frac{\sigma}{2}\|\bar \varphi_k-\bar \varphi_{k+1}\|^2_{L^2(\mu_{\star})}.
\end{equation*}

Recall the definition of $\bar \varphi_{k+1}$ in Eq.\eqref{bar_varphi} states that 
$\int_{\Omega} e^{\bar \psi_{k}(\by)-c_{\varepsilon}(\bx,\by)}\nu_{\star, k}(\dd\by)=e^{-\bar \varphi_{k+1}(\bx)+\lambda_k}$. Apply Lemma \ref{control_diff_measure} with respect to $\frac{\dd \nu_{\star}}{\dd\nu_{\star, k}}(\by)$ 
\begin{equation*}
\begin{split}
    \partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})(\bx)&=1-e^{\bar \varphi_{k+1}(\bx)}\int_{\Omega} e^{\bar \psi_k(\by)-c_{\varepsilon}(\bx, \by)}\nu_{\star, k}(\dd\by) \frac{\nu_{\star}(\dd\by)}{\nu_{\star, k}(\dd\by)}\\
    & \geq 1-e^{\bar \varphi_{k+1}(\bx)}\int_{\Omega} \big(1+O(\epsilon)\big) e^{\bar \psi_k(\by)-c_{\varepsilon}(\bx, \by)}\nu_{\star, k}(\dd\by)\\
    & \geq 1-(1+O(\epsilon))e^{\lambda_k} - O(\epsilon) \underbrace{\int_{\Omega} \|\by\|^2_2 e^{\bar \varphi_{k+1}(\bx)+\bar \psi_k(\by)-c_{\varepsilon}(\bx, \by)}\nu_{\star, k}(\dd\by)}_{\text{bounded and non-negative from Lemma~\ref{upper_bound_of_varphi_psi}}}\\
    & = 1-(1+O(\epsilon))e^{\lambda_k} - O(\epsilon)\\
    \partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})(\bx)&\leq 1-(1-O(\epsilon))e^{\lambda_k} + O(\epsilon),
\end{split}
\end{equation*}
which consists of a deterministic scalar (independent of $\bx$) and a small perturbation (dependent on $\bx$ and $\epsilon$). Denote $R(\bx) = \int_{\Omega}\|\by\|^2_2 e^{\bar \varphi_{k+1}(\bx)+\bar \psi_k(\by)-c_{\varepsilon}(\bx, \by)}\nu_{\star, k}(\dd\by)$.
% and the last two inequalities follow by Lemma \ref{interable_marginal_v2}. 
Combining the above with the centering operation $\mu_{\star}(\bar \varphi_{k+1})=\mu_{\star}(\bar \varphi_{k})=0$, we have
\begin{equation*}
\begin{split}\label{2nd_marginal_0}
    &\ \ \ \int_{\Omega} \partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})(\bx)[\bar \varphi_{k+1}(\bx)-\bar \varphi_k(\bx)]\mu_{\star}(\dd\bx)\\
    &=\text{deterministic scalar}\cdot \underbrace{\int_{\Omega}[\bar \varphi_{k+1}(\bx)-\bar \varphi_{k}(\bx)]\mu_{\star}(\dd\bx)}_{:=0 \text{ by the centering operation}} + \epsilon \underbrace{\int_{\Omega} R(\bx) [\bar \varphi_{k+1}(\bx)-\bar \varphi_k(\bx)]\mu_{\star}(\dd\bx)}_{\text{integrable by the boundedness of } R, \bar\varphi_{k+1}, \psi_k}\\
    &=O(\epsilon).
\end{split}
\end{equation*}
Combining the estimates of $\mathrm{I}$ and $\mathrm{II}$ completes the proof.
\qed
\end{proof}


\subsection{Convergence of Dual and Potentials}

\textbf{Proof of Lemma \ref{main_theorem}}

\textbf{Part I: Convergence of the Dual}

By Lemma \ref{derive_property} with $\alpha=6\|c_{\varepsilon}\|_{\infty}$ and the decomposition in Eq.\eqref{decompose}, we have
\begin{equation}\label{main_decompose}
\begin{split}
    &\ \ \ G(\bar \varphi_k, \bar \psi_k)-G(\bar \varphi_{\star}, \bar \psi_{\star})\\
    &\geq \int_{\Omega} \partial_1 G(\bar \varphi_k, \bar \psi_k)(\bx)[\bar \varphi_k(\bx)-\bar \varphi_{\star}(\bx)]\mu_{\star}(\dd\bx)\\
    &\ \ + \int_{\Omega} \partial_2 G(\bar \varphi_k, \bar \psi_k)(\by)[\bar \psi_k(\by)-\bar \psi_{\star}(\by)]\nu_{\star}(\dd\by)\\
    & \ \ + \frac{\sigma}{2}\left(\|\bar \varphi_{k}-\bar \varphi_{\star}\|_{L^2(\mu_{\star})}^2+\|\bar \psi_{k}- \bar \psi_{\star}\|_{L^2(\nu_{\star})}^2\right)\\
    &\geq \int_{\Omega} \partial_1 G(\bar \varphi_k, \bar \psi_k)(\bx)[\bar \varphi_k(\bx)-\bar \varphi_{\star}(\bx)]\mu_{\star}(\dd\bx)+ \frac{\sigma}{2} \|\bar \varphi_{k}-\bar \varphi_{\star}\|_{L^2(\mu_{\star})}^2-O(\epsilon),
\end{split}
\end{equation}
where $\sigma:=e^{-6\|c_{\varepsilon}\|_{\infty}}$, and the last inequality follows by Eq.\eqref{partial2_G_upper} and boundedness of $\bar\psi_k$ and $\bar\psi_{\star}$ in Lemma \ref{upper_bound_of_varphi_psi}. For the first integral,
$\int_{\Omega} \partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})(\bx)[\bar \varphi_{k}(\bx)-\bar \varphi_{\star}(\bx)]\mu_{\star}(\dd\bx)=O(\epsilon)$ because $\partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})(\bx)$ consists of a deterministic scalar and a small perturbation, and $\mu_{\star}(\bar \varphi_{k})=\mu_{\star}(\bar \varphi_{\star})=0$.

Hence
\begin{equation}
\begin{split}\label{2nd_integral}
    &\ \ \ \int_{\Omega} \partial_1 G(\bar \varphi_{k}, \bar \psi_{k})(\bx)[\bar \varphi_{k}(\bx)-\bar \varphi_{\star}(\bx)]\mu_{\star}(\dd\bx)\\
    &=\int_{\Omega} [\partial_1 G(\bar \varphi_{k}, \bar \psi_{k})(\bx)-\partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})(\bx)][\bar \varphi_{k}(\bx)-\bar \varphi_{\star}(\bx)]\mu_{\star}(\dd\bx)+O(\epsilon)\\
    &\geq -\frac{1}{2\sigma}\|\partial_1 G(\bar \varphi_{k}, \bar \psi_{k})-\partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})\|^2_{L^2(\mu_{\star})}-\frac{\sigma}{2}\|\bar \varphi_{k}(\bx)-\bar \varphi_{\star}(\bx)\|^2_{L^2(\mu_{\star})}+O(\epsilon),
\end{split}
\end{equation}
where the inequality follows from H\"{o}lder's inequality and Young's inequality.

Plugging Eq.\eqref{2nd_integral} into Eq.\eqref{main_decompose}, we have
\begin{equation}\label{iterate_up_bounded}
\begin{split}
    G(\bar \varphi_{\star}, \bar \psi_{\star})-G(\bar \varphi_k, \bar \psi_k)&\leq \frac{1}{2\sigma}\|\partial_1 G(\bar \varphi_{k}, \bar \psi_{k})-\partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})\|^2_{L^2(\mu_{\star})} + O(\epsilon).
\end{split}
\end{equation}

Note that 
\begin{equation}
\begin{split}
    \label{iterate_up_bounded_2}
    |\partial_1 G(\bar \varphi_{k}, \bar \psi_{k})(\bx)-\partial_1 G(\bar \varphi_{k+1}, \bar \psi_{k})(\bx)|&\leq \int_{\Omega} \left|e^{\bar \varphi_{k+1}\oplus\bar \psi_k-c_{\varepsilon}}-e^{\bar \varphi_{k}\oplus\bar \psi_k-c_{\varepsilon}}\right|\nu_{\star}(\dd\by)\\
    &\leq e^{6\|c_{\varepsilon}\|_{\infty}} \int_{\Omega}| \bar \varphi_{k+1}\oplus\bar \psi_k-\bar \varphi_{k}\oplus\bar \psi_k|\nu_{\star}(\dd\by)\\
    &=\frac{1}{\sigma}|\bar \varphi_{k+1}(\bx)-\bar \varphi_{k}(\bx)|,
\end{split}
\end{equation}
where the second inequality follows by Lemma \ref{upper_bound_of_varphi_psi} and the local Lipschitz continuity of the exponential function, namely $|e^a-e^b|\leq e^M |b-a|$ for $a, b\leq M$; recall that $\sigma:=e^{-6\|c_{\varepsilon}\|_{\infty}}$.


First combining Eq.\eqref{iterate_up_bounded} and \eqref{iterate_up_bounded_2} and then including Lemma \ref{iterate_bound}, we conclude that
\begin{equation*}
\begin{split}
    G(\bar \varphi_{\star}, \bar \psi_{\star})-G(\bar \varphi_k, \bar \psi_k)&\leq \frac{1}{2\sigma^3}\|\bar \varphi_{k+1}-\bar \varphi_{k}\|_{L^2(\mu_{\star})}^2 + O(\epsilon)\\
    &\leq \frac{1}{\sigma^4}\left(G(\bar \varphi_{k+1}, \bar \psi_{k+1})-G(\bar \varphi_k, \bar \psi_k)\right) +  \frac{O(\epsilon)}{\sigma^4},\\
\end{split}
\end{equation*}
where the last inequality follows by $\sigma\leq 1$. Further writing $\Delta_k=G(\bar \varphi_{\star}, \bar \psi_{\star})-G(\bar \varphi_k, \bar \psi_k)$, we have
\begin{equation*}
    \Delta_k\leq \frac{1}{\sigma^4}\left(\Delta_k-\Delta_{k+1}\right) + \frac{  {O(\epsilon)}}{\sigma^4}.
\end{equation*}
In other words, we can derive the contraction property as follows
\begin{equation*}
    \Delta_{k+1}\leq (1-\sigma^4)\Delta_k +  O(\epsilon) \leq \cdots \leq (1-\sigma^4)^{k+1}\Delta_0  + O(e^{24\|c_{\varepsilon}\|_{\infty}} \epsilon),
\end{equation*}
which hereby completes the claim of the theorem for any $k\geq 1$. \qed




% Since we are still interested in the convergence of the original (un-centered) Sinkhorn algorithm, now we extend the result to Theorem \ref{main_theorem} and provide the proof as follows




% \begin{proof}[Proof of Theorem \ref{main_theorem}]

% As $G(\bar \varphi_k, \bar \psi_k)=G(\varphi_k, \psi_k)$ by Lemma \ref{simple_result}, the convergence of \eqref{uncenter_G_converge} follows directly from \eqref{center_G_converge}.

% Let $\mathcal{A}_k=\mu_{\star}(\varphi_{\star}-\varphi_k)$ and $\mathcal{B}_k=\nu_{\star}(\psi_{\star}-\psi_k)$.  Hence $\vartheta_k:=\psi_{\star}-\psi_k-\mathcal{B}_k$ is centered, and we recall that $\bar \varphi_{\star}-\bar \varphi_k$ is centered as well. By Jensen's inequality, we have $\mathcal{A}_k\leq \|\varphi_{\star}- \varphi_k\|_{L^2(\mu_{\star})}$ and $\mathcal{B}_k\leq \|\psi_{\star}- \psi_k\|_{L^2(\nu_{\star})}$. Combining Lemma \ref{simple_result} and Eq.\eqref{decompose}, we have


% \begin{equation*}
% \begin{split}
%     &\quad (\mathcal{A}_k +\mathcal{B}_k)^2 \\
%     &\leq\|\varphi_{\star}-\varphi_k\|_{L^2(\mu_{\star})}^2 + \|\psi_{\star}-\psi_k\|_{L^2(\nu_{\star})}^2+ 2\mathcal{A}_k \mathcal{B}_k\\
%     &=\|\bar \varphi_{\star}-\bar \varphi_k+\mathcal{A}_k\|_{L^2(\mu_{\star})}^2 + \|\vartheta_k+\mathcal{B}_k\|_{L^2(\nu_{\star})}^2+ 2\mathcal{A}_k \mathcal{B}_k \\
%     &=\|\bar \varphi_{\star}-\bar \varphi_k\|_{L^2(\mu_{\star})}^2+\mathcal{A}_k^2 + \|\vartheta_k\|_{L^2(\nu_{\star})}^2+\mathcal{B}^2_k+ 2\mathcal{A}_k \mathcal{B}_k\\
%     &= \|\bar \varphi_{\star}-\bar \varphi_k\|_{L^2(\mu_{\star})}^2+ \|\vartheta_k\|_{L^2(\nu_{\star})}^2+(\mathcal{A}_k+\mathcal{B}_k)^2\\
%     &= \|\bar \varphi_{\star}-\bar \varphi_k\|_{L^2(\mu_{\star})}^2+ \|\vartheta_k+\mathcal{A}_k+\mathcal{B}_k\|_{L^2(\nu_{\star})}^2\\
%     &= \|\bar \varphi_{\star}-\bar \varphi_k\|_{L^2(\mu_{\star})}^2+ \|\bar \psi_{\star}-\bar \psi_k\|_{L^2(\nu_{\star})}^2,\\
% \end{split}
% \end{equation*}
% which proves Eq.\eqref{uncenter_varphi_psi_converge} via Eq.\eqref{center_varphi_psi_converge}.
% \qed
% \end{proof}



\textbf{Part II: Convergence of the Potentials}

For the convergence of the potential function, in spirit to Lemma \ref{iterate_bound}, we obtain
\begin{equation*}
    G(\bar \varphi_{\star}, \bar \psi_{\star})-G(\bar \varphi_{k}, \bar \psi_{k}):=\Delta_k\geq \frac{\sigma}{2}\left(\|\bar \varphi_{\star}-\bar \varphi_{k}\|_{L^2(\mu_{\star})}^2+\|\bar \psi_{\star}- \bar \psi_{k}\|_{L^2(\nu_{\star})}^2\right)-O(\epsilon).
\end{equation*}

We can upper bound the potential as follows 
\begin{equation*}
\begin{split}
    \|\bar \varphi_{\star}-\bar \varphi_{k}\|_{L^2(\mu_{\star})}^2+\|\bar \psi_{\star}- \bar \psi_{k}\|_{L^2(\nu_{\star})}^2 &\leq  \frac{2}{\sigma}\Delta_k + \frac{O(\epsilon)}{\sigma}\leq  \frac{2}{\sigma}(1-\sigma^4)^{k}\Delta_0  + O(e^{30\|c_{\varepsilon}\|_{\infty}} \epsilon).
\end{split}
\end{equation*}

Further applying $(|a|+|b|)^2\leq 2a^2+2b^2$ and $\sqrt{c^2+d^2}\leq |c|+|d|$, we have
\begin{equation}
\begin{split}\label{potential_convergence}
    \|\bar \varphi_{\star}-\bar \varphi_{k}\|_{L^2(\mu_{\star})}+\|\bar \psi_{\star}- \bar \psi_{k}\|_{L^2(\nu_{\star})} &\leq  \sqrt{\frac{4}{\sigma}(1-\sigma^4)^{k}\Delta_0}  + O(e^{15\|c_{\varepsilon}\|_{\infty}} \epsilon^{1/2})\\
    &\lesssim  e^{3\|c_{\varepsilon}\|_{\infty}}\beta_{\varepsilon}^{\frac{k}{2}}  + e^{15\|c_{\varepsilon}\|_{\infty}} \epsilon^{1/2},
\end{split}
\end{equation}
where $\beta_{\varepsilon}=1-\sigma^4=1-e^{-24 \|c_{\varepsilon}\|_{\infty}}$.\qed




\subsection{Convergence of the Static Couplings}
\textbf{Proof of Theorem \ref{coupling_convergence}}

Recall from the bounded potential in Lemma \ref{upper_bound_of_varphi_psi}, we have
\begin{align}\label{exp_minus}
    e^{\bar \varphi_{\star}\oplus\bar \psi_{\star}-c_{\varepsilon}} - e^{\bar \varphi_k\oplus\bar \psi_k-c_{\varepsilon}} &\leq e^{6\|c_{\varepsilon}\|_{\infty}}\big( |\bar\varphi_{\star}-\bar\varphi_k| + |\bar \psi_{\star}-\bar \psi_k|\big).
\end{align}
 
Following Theorem 3 of \citet{Deligiannidis_21}, we define a class of 1-Lipschitz functions $\text{Lip}_1=\big\{F\big||F(\bx_0,\by_0)-F(\bx_1,\by_1)|\leq \|\bx_1-\bx_0\|_2+\|\by_1-\by_0\|_2\big\}$. The structural property \eqref{main_solution_property} allows us to represent $\pi_{\star}$ using $(\varphi_{\star}+a) \oplus (\psi_{\star}-a)$ for any $a$, so we may work with the centered potentials. For any $F\in \text{Lip}_1$, we have
\begin{align*}
    &\ \ \ \iint_{\bX\times \bY} F e^{\bar \varphi_{\star}\oplus\bar \psi_{\star}-c_{\varepsilon}}\dd(\mu_{\star}\otimes \nu_{\star})-\iint_{\bX\times \bY} F e^{\bar \varphi_k\oplus\bar \psi_k-c_{\varepsilon}}\dd(\mu_{\star, k}\otimes \nu_{\star, k}) \\
    &\leq \iint_{\bX\times \bY} F e^{\bar \varphi_{\star}\oplus\bar \psi_{\star}-c_{\varepsilon}}\dd(\mu_{\star}\otimes \nu_{\star})-\iint_{\bX\times \bY} F e^{\bar \varphi_{k}\oplus\bar \psi_{k}-c_{\varepsilon}}\dd(\mu_{\star}\otimes \nu_{\star}) \\
    & \qquad\quad + \iint_{\bX\times \bY} F e^{\bar \varphi_{k}\oplus\bar \psi_{k}-c_{\varepsilon}}\dd(\mu_{\star}\otimes \nu_{\star})-\iint_{\bX\times \bY} F e^{\bar \varphi_k\oplus\bar \psi_k-c_{\varepsilon}}\dd(\mu_{\star, k}\otimes \nu_{\star, k}) \\
    &\leq \iint_{\bX\times \bY} F \underbrace{|e^{\bar \varphi_{\star}\oplus\bar \psi_{\star}-c_{\varepsilon}} - e^{\bar \varphi_k\oplus\bar \psi_k-c_{\varepsilon}}|}_{\text{by Eq.}\eqref{exp_minus}}\dd(\mu_{\star}\otimes \nu_{\star})   \\
    &\qquad\quad +  \iint_{\bX\times \bY} F e^{\bar \varphi_{k}\oplus\bar \psi_{k}-c_{\varepsilon}}\dd(\mu_{\star}\otimes |\nu_{\star}-\nu_{\star,k}|+|\mu_{\star}-\mu_{\star,k}|\otimes \nu_{\star,k})\\
    &\lesssim e^{9\|c_{\varepsilon}\|_{\infty}}\beta_{\varepsilon}^{k/2}+ e^{21\|c_{\varepsilon}\|_{\infty}}\epsilon^{1/2},
\end{align*}
where the last inequality is mainly derived from the first term in the second inequality by combining \eqref{potential_convergence} and \eqref{exp_minus}; the second term in the second inequality can be upper bounded by Lemma \ref{control_diff_measure}.

Recalling the dual formulation of the 1-Wasserstein distance, we have
\begin{align*}
    \mathbf{W}_1(\pi_k, \pi_{\star})&=\sup\bigg\{ \iint_{\bX\times \bY} F e^{\bar \varphi_{\star}\oplus\bar \psi_{\star}-c_{\varepsilon}}\dd(\mu_{\star}\otimes \nu_{\star})\\
    &\qquad\qquad\qquad\qquad -\iint_{\bX\times \bY} F e^{\bar \varphi_k\oplus\bar \psi_k-c_{\varepsilon}}\dd(\mu_{\star, k}\otimes \nu_{\star, k}): F\in \text{Lip}_1\bigg\} \\
    &\leq O(e^{9\|c_{\varepsilon}\|_{\infty}}\beta_{\varepsilon}^{k/2}+ e^{21\|c_{\varepsilon}\|_{\infty}}\epsilon^{1/2}).
\end{align*}


    \qed



\subsection{Auxiliary Results}


\begin{lemma}
\label{control_diff_measure}
Consider probability densities $\rho(\bx)= e^{-U(\bx)} / \mathbb{Z}$ and $\widetilde \rho(\bx)= e^{-\widetilde U(\bx)}/ \widetilde{\mathbb{Z}}$ defined on a bounded domain $\Omega$, where $\mathbb{Z}$ and $\widetilde{\mathbb{Z}}$ are the normalizing constants. For small enough $\epsilon\lesssim \frac{1}{(D+1)^2}$, where $D$ is the radius of a centered ball covering $\Omega$, we have
    % \begin{equation}
    %     \bigg|\log\frac{\rho(\bx)}{\widetilde \rho(\bx)}\bigg|\lesssim \epsilon\|\bx\|_2.
    % \end{equation}    
    \begin{equation}\label{desired_output}
        1- O(\epsilon) \leq \frac{\rho(\bx)}{\widetilde \rho(\bx)}\leq 1+O(\epsilon),\  1- O(\epsilon) \leq \frac{\widetilde\rho(\bx)}{ \rho(\bx)}\leq 1+O(\epsilon).
    \end{equation}
    % where $\delta$ mainly depends on the size of $\Omega$ \Wei{double check this part}.
\end{lemma}

\begin{proof} 
% Assumption \ref{ass:smooth_potential} implies $\|\nabla U(\bx)\|_2 \leq \|\nabla U(\bx) - \nabla U(\bm{r})\|_2 +\|\nabla U(\bm{r})\|_2 \leq L \|\bx - \bm{r}\|_2+\|\nabla U(\bm{r})\|_2$, where $L$ is the Lipschitz constant.

From the approximation assumption \ref{ass:approx_score}: $\|\nabla \widetilde U(\bx) - \nabla U(\bx)\|_{2} \leq \epsilon (1+\|\bx\|_2).$

% \begin{equation*}\label{grad_l_inf_norm}
%     \begin{split}
%         \|\nabla \widetilde U(\bx) - \nabla U(\bx)\|_{2} \leq \epsilon (1+\|\bx\|_2).
%     \end{split}
%     \end{equation*}.
    
Moreover, $U$ satisfies the smoothness assumption \ref{ass:smooth_measure}. Note that for any $\bx, \by\in \Omega$
\begin{equation*}\label{diff_line_integral}
    U(\bx) - U(\by)=\int_0^1 \frac{\dd}{\dd t} U(t\bx + (1-t)\by) \dd t = \int_0^1 \langle \bx -\by, \nabla U(t\bx +(1-t) \by) \rangle \dd t.
\end{equation*}
% \begin{equation}\label{diff_line_integral2}
%     \widetilde U(\bx) - \widetilde U(\by)=\int_0^1 \frac{\dd}{\dd t} \widetilde U(t\bx + (1-t)\by) = \int_0^1 \langle \bx -\by, \nabla \widetilde U(t\bx +(1-t) \by) \rangle \dd t.
% \end{equation}

Moreover, there exists a point $\bx_0\in\Omega$ such that $U(\bx_0)=\widetilde U(\bx_0)$ since $\rho$ and $\widetilde\rho$ are probability densities. It follows that
    \begin{equation}
    \begin{split}\label{diff_U}
    |\widetilde U(\bx)- U(\bx)|&=\bigg|\int_{0}^{1} \langle \bx-\bx_0, \nabla \widetilde U(\dot{\bx}_t) - \nabla U(\dot{\bx}_t)\rangle \dd t \bigg| \\
    &\leq \int_0^1 \|\bx-\bx_0\|_2\cdot \big \|\nabla \widetilde U(\dot{\bx}_t) - \nabla U(\dot{\bx}_t)\big \|_2 \dd t\\
    & \leq \epsilon (\|\bx\|_2 + \|\bx_0\|_2)(1+\|\bx\|_2) \lesssim \epsilon (D+1)^2,
    \end{split}
    \end{equation}
    where $\dot{\bx}_t=t\bx+(1-t)\bx_0$ is a line from $\bx_0$ to $\bx$.

For the normalizing constant, we have
    \begin{equation*}\label{diff_C}
    \begin{split}
        |\widetilde{\mathbb{Z}}- \mathbb{Z}| &\leq \int_{\Omega} e^{-U(\bx)}\big| e^{-\widetilde U(\bx)+U(\bx)} - 1\big| \dd \bx \lesssim \int_{\Omega} e^{-U(\bx)} \epsilon (D+1)^2 \dd \bx = \epsilon (D+1)^2\, \mathbb{Z},
    \end{split}
    \end{equation*}
where the last inequality follows by Eq.\eqref{diff_U} and $e^a\leq 1+2a$ for $a \in [0, 1]$. 
We deduce
\begin{align*}
\left| \log \frac{\rho(\bx)}{\widetilde \rho(\bx)} \right| &= \left| \widetilde U(\bx) - U(\bx) + \log \frac{\widetilde{\mathbb{Z}}}{\mathbb{Z}} \right| \leq O(\epsilon).
\end{align*} \qed

\end{proof}

Notably, the above lemma also implies that $\text{KL}(\rho\|\widetilde \rho)\leq O(\epsilon)$ and $\text{KL}(\widetilde\rho\|\rho)\leq O(\epsilon)$.




\subsection{Connections between dual optimization and projections}
\label{dual_project_relation}
To see why \eqref{sinkhorn_vs_marginal2} holds, we first denote the second marginal of $\dd\pi(\varphi_k, \psi_k):=e^{\varphi_k\oplus\psi_k}\dd \mathcal{G}$ by $\nu'$ and then proceed to show $\nu'=\nu_{\star}$ \citep{Nutz22_note}. Since $G$ is concave and $\psi_k=\argmax_{\psi\in L^1(\nu_{\star})} G(\varphi_k,\psi)$, it suffices to show that, given fixed $\varphi_k\in L^1(\mu_{\star})$, $\psi_k\in L^1(\nu_{\star})$, a constant $\eta$, and a bounded measurable function $\delta_{\psi}: \mathbb{R}^d\rightarrow \mathbb{R}$, the maximality of $G(\varphi_k, \psi_k)$ implies
% \begin{equation}
% \begin{split}
%     &G(\varphi_k, \psi_k)-  G(\varphi_k, \psi_k+\eta \delta_{\psi})\\
%     =&-\eta \nu_{\star}(\delta_{\psi})+\iint \left(e^{\varphi_k\oplus (\psi_k +\eta \delta_{\psi})}-e^{\varphi_k\oplus \psi_k}\right)\dd \mathcal{G}\\
%     % =&-\eta \nu_{\star}(\delta_{\psi})+\iint(e^{\eta \delta_{\psi} \otimes \psi}-1)\dd\pi(\varphi, \psi)\\
%     =&-\eta \nu_{\star}(\delta_{\psi})+\int (e^{\eta \delta_{\psi}}-1)\dd {\nu_k}\\
%     =& \eta (\nu_k(\delta_{\psi})-\nu_{\star}(\delta_{\psi}))+o(\eta)\geq 0,
% \end{split}
% \end{equation}
\begin{equation*}
\begin{split}
    0=&\frac{\dd}{\dd \eta}\bigg|_{\eta=0} G(\varphi_k, \psi_k+\eta \delta_{\psi})=\nu_{\star}(\delta_{\psi})-\iint_{\Omega^2} \delta_{\psi} e^{\varphi_k\oplus \psi_k} \dd\mathcal{G} %=\nu_{\star}(\delta_{\psi})-\int_{\Omega} \delta_{\psi} \dd \nu_{k}
    =\nu_{\star}(\delta_{\psi})-\nu'(\delta_{\psi}). 
\end{split}
\end{equation*}
Hence $\nu'=\nu_{\star}$. %the second equality follows by Taylor expansion. 
Similarly, we can show \eqref{sinkhorn_vs_marginal1}.


% \subsection{Connections between Static Primal IPF \eqref{staic_IPF_projections_} and Static Dual IPF \eqref{SB_eqn_IPF}}

\subsection{Connections between Static Primal IPF and Static Dual IPF}
\label{equiv_primal_dual}


It suffices to show the equivalence between $\pi_{2k}=\argmin_{\pi\in \Pi(\cdot, \nu_{\star})} \text{KL}(\pi\|\pi_{2k-1})$ and $\psi_{k}(\by)=-\log \int_{\Omega} e^{ \varphi_{k}(\bx)-c_{\varepsilon}(\bx,\by)} \mu_{\star}(\dd\bx)$.


For any $\pi_{2k}\in \Pi(\cdot, \nu_{\star})$, we invoke the disintegration of measures and obtain $\pi_{2k}=\mathrm{K}^{?}\otimes \nu_{\star}$. In addition, we have $\pi_{2k-1}=\mathrm{K}\otimes \nu'$. Now we can formulate 
\begin{equation*}
    \text{KL}(\pi_{2k}\|\pi_{2k-1})=\text{KL}(\nu_{\star}\|\nu')+\text{KL}(\mathrm{K}^{?}\|\mathrm{K}).
\end{equation*}



The conditional probability of $\pi_{2k-1}$ given $\by$ is a normalized probability such that
\begin{align*}
    \dfrac{\frac{\dd \pi_{2k-1}}{\dd\mu_{\star}\otimes \nu_{\star}}}{\int \frac{\dd \pi_{2k-1}}{\dd\mu_{\star}\otimes \nu_{\star}}\dd\mu_{\star}} =\frac{\dd \mathrm{K}}{\dd \mu_{\star}}(\bx, \by)=\frac{e^{ \varphi_k\oplus  \psi_{k-1}-c_{\varepsilon}}}{\int e^{ \varphi_k\oplus  \psi_{k-1}-c_{\varepsilon}}\dd \mu_{\star}}=\frac{e^{ \varphi_k -c_{\varepsilon}}}{\int e^{ \varphi_k -c_{\varepsilon}}\dd \mu_{\star}}.
\end{align*}

The minimizer is achieved by setting $\mathrm{K}^{?}=\mathrm{K}$, namely $\pi_{2k}=\mathrm{K}\otimes \nu_{\star}$. It follows that
\begin{align*}
    \dfrac{\dd\pi_{2k}}{\dd (\mu_{\star}\otimes \nu_{\star})}=\frac{\dd (\mathrm{K}\otimes \nu_{\star})}{\dd (\mu_{\star}\otimes \nu_{\star})}=\frac{e^{ \varphi_k -c_{\varepsilon}}}{\int e^{ \varphi_k -c_{\varepsilon}}\dd \mu_{\star}}:=e^{ \varphi_k\oplus  \psi_k-c_{\varepsilon}}.
\end{align*}

In other words, we have $\psi_{k}(\by)=-\log \int_{\Omega} e^{ \varphi_{k}(\bx)-c_{\varepsilon}(\bx,\by)} \mu_{\star}(\dd\bx)$, which verifies the connections.




\section{Experimental Details} \label{appendix:experiment}

\paragraph{Reflection Implementations} \citet{reflected_diffusion_model} already gave a nice tutorial on the implementation of hypercube-related domains with image applications. For extensions to general domains, we provide our solutions in Algorithm \ref{reflection_alg}. The \emph{crucial component} is a \emph{Domain Checker} to verify if a point is inside or outside of a domain, and it appears to be quite computationally expensive. To solve this problem, we propose two solutions:

1) If there exists a computationally efficient conformal map that transforms a manifold into simple shapes, such as a sphere or square, we can apply simple rules to conclude if a proposal is inside a domain.

% If we can efficiently transform Cartesian coordinates into polar coordinates with a radius $\mathrm{R}^{\theta}$ for a certain angle $\theta$, where the boundary radius is denoted as $\mathrm{R}_0^{\theta}$, we can easily verify whether $\mathrm{R}^{\theta} \leq \mathrm{R}_0^{\theta}$. A classical example is the $d$-dimensional ball in $\mathbb{R}^d$.

2) If the first solution is expensive, we can \textbf{cache} the domain through a fine-grid mesh $\{\bX_{i,j,\cdots}\}_{i,j,\cdots}$. Then, we approximate the condition $\tilde \bx_{k+1}\in\Omega$ via $$\min_{i,j,\cdots}\ \text{Distance}(\tilde \bx_{k+1}, \{\bX_{i,j,\cdots}\}_{i,j,\cdots})\leq \text{threshold}.$$ 
With the parallelism in Torch or JAX, the above calculation can be quite efficient. Nevertheless, a finer grid leads to higher accuracy but also induces more computation. We can also expect the curse of dimensionality in ultra-high-dimensional problems, and simpler domains are preferred in such cases. Moreover, if $\bx_{k+1}\notin \Omega$ in extreme cases, one may consider ad-hoc rules with an error that decreases as we anneal the learning rate. Other elegant solutions include slowing down the process near the boundary or %\citep{reflected_BM} or 
warping the geometry with a Riemannian metric \citep{Diffusion_constrained}.

\begin{algorithm}[ht]
\caption{Practical Reflection Operator}\label{reflection_alg}
\begin{algorithmic}
 
\STATE{Simulate a proposal $\tilde \bx_{k+1}$ via an SDE given $\bx_k\in\Omega$.}

\IF{\textbf{Domain Checker:} $\tilde \bx_{k+1}\in \Omega$}
\STATE{Set $\bx_{k+1}=\tilde \bx_{k+1}$}
\ELSE
\STATE{Search (binary) the boundary $\dot{\bx}_{k+1}\in\partial \Omega$, where $\dot{\bx}_{k+1}=\eta\bx_k +(1-\eta) \tilde \bx_{k+1}$ for $\eta\in(0,1)$.}
\STATE{Compute $\bm{\nu}=\tilde \bx_{k+1}-\dot{\bx}_{k+1}$ and the unit normal vector $\bn$ associated with $\dot{\bx}_{k+1}$.}
\STATE{Set $\bx_{k+1}=\dot{\bx}_{k+1}+\bm{\nu}-2\langle \bm{\nu}, \bn \rangle \bn$.}
\ENDIF
\end{algorithmic}
 
\end{algorithm}



\subsection{Domains of 2D Synthetic Data}

We consider input $t\in[0,1]$ and output $(x, y)\in\mathbb{R}^2$. The normal vector can be derived accordingly. 

\emph{Flower} (petals $p=5$ and move out length $m=3$)
\begin{align*}
   r &= \sin(2 \pi p t) + m, \quad x= r \cos(2\pi t), \quad y= r \sin(2\pi t).
\end{align*}

\emph{Heart}
\begin{align*}
    x = 16 \sin^3(2\pi t), \quad y = 13 \cos(2\pi t) - 5 \cos(4 \pi t) - 2 \cos(6 \pi t) - \cos(8 \pi t).
\end{align*}
\emph{Octagon} ($c+1$ vertices $(X_i, Y_i)_{i=0}^{c}$ with $(X_{c}, Y_{c})=(X_0, Y_0)$)
\begin{align*}
    r=ct -\floor{ct}; \quad x = (1 - r) \cdot X_{\floor{ct}} +  r \cdot X_{\floor{ct}+1}; \quad y = (1 - r) \cdot Y_{\floor{ct}} +  r \cdot Y_{\floor{ct}+1}.
\end{align*}




\subsection{How to initialize: Warm-up Study}
\label{how_to_init}

% Schr\"odinger bridge (SB) is commonly initialized from standard diffusion models \citep{DSB, forward_backward_SDE} in large-scale experiments, however,

% For the initialization for reflected SB (rSB) remains unclear. In this section, we show rSB with VP-SDE benefits from a warm-up model using standard Brownian motion, while rSB with VE-SDE may not.

Consider a Langevin diffusion (LD) and a reflected Langevin (RLD) with score functions $S_1$ and $S_2$:
\begin{align*}
    \text{LD}:\quad\dd\bx_t &= S_1(\bx_t)\dd t + \dd  \mathbf{w}_t,\quad\ \  \ \qquad \qquad\bx_t\in\mathbb{R}^d\\
    \text{RLD}:\quad\dd\bx_t &= S_2(\bx_t)\dd t + \dd  \mathbf{w}_t+ \bn(\bx_t)\dd \mathbf{L}_t,\quad \bx_t\in\Omega.
\end{align*}
LD converges to $\mu_1\propto e^{S_1(\bx)}$ and RLD converges to $\mu_2\propto e^{S_2(\bx)}\mathbf{1}_{\bx\in\Omega}$ \citep{sebastien_bubeck, Andrew_Lamperski_21_COLT} as $t\rightarrow \infty$. In other words, inheriting the score function $S_1$ from LD to RLD (by setting $S_1=S_2$) yields the desired invariant distribution $\mu_1\mathbf{1}_{\bx\in\Omega}$.

Denote by RVP (or RVE) the VP-SDE (or VE-SDE) in reflected SB. As summarized in Table \ref{sample-table}, VP (or RVP) converges approximately to a Gaussian (or truncated Gaussian) prior within a practical training time, which implies that an \emph{unconstrained VP-SDE diffusion model is a good warm-up candidate} for reflected SB. Empirically, we are able to verify this fact through 2D synthetic data.

However, this may not be the case for VE because it converges to a uniform measure in $\mathbb{R}^d$ but only obtains an approximate Gaussian in a short time. By contrast, RVE converges to the invariant uniform distribution much faster because it doesn't need to fully explore $\mathbb{R}^d$. This implies \emph{initializing the score function from VE-based diffusion models for reflected SB may not be a good choice}. Instead, we use RVE-based diffusion models  \citep{reflected_diffusion_model} as the warm-up. 

% the same given that the reflected process has converged approximately. %and the distribution can be viewed as simulating from an unreflected Langevin diffusion and then truncating it to $\Omega$.

% Empirically, VP-SDE is often chosen to converge approximately to the invariant Gaussian distribution (prior) within a given period. Invoking the convergence of reflected Langevin to the truncated Gibbs measure shows that inheriting the scores from unconstrained diffusion models provides the same score function within the domain.

% However, one can easily show that the invariant distribution of the unconstrained VE-SDE is a uniform distribution in $\mathbb{R}^d$ given $t\rightarrow\infty$ and 


% , while $\mu_{\star}$ (data distribution) always evolves approximately to a Gaussian distribution in a short time. 

% converges to a Gaussian invariant distribution with infinite variance given long enough time, implying a \emph{uniform distribution} in the bounded domain $\Omega$, as studied in \citep{reflected_diffusion_model}. 




% One can easily show that reflected Brownian motion converges close to a uniform distribution in $\Omega$ in a short time, while the unconstrained Brownian motion only approaches a small-variance Gaussian in a short time and is far away from the invariant measure (uniform in $\mathbb{R}^d$). This implies that the score function inherited from unconstrained diffusion models may not be a good enough warm-up for reflected SB. Similar findings are studied in \citet{reflected_diffusion_model}. 



\begin{table}[t]
\caption{Densities using (reflected) Langevin diffusion with a practical running time and infinite time.}
\footnotesize
\label{sample-table}
\begin{center}
\begin{tabular}{lll|lll}
\multicolumn{1}{c}{\bf SDE}  &\multicolumn{1}{c}{\bf Practical Time} &\multicolumn{1}{c}{\bf Infinite Time} & \multicolumn{1}{|c}{\bf SDE}  &\multicolumn{1}{c}{\bf Practical Time} &\multicolumn{1}{c}{\bf Infinite Time}
\\ \hline 

VP & Approx. \textcolor{teal}{Gaussian} & \textcolor{teal}{Gaussian} & RVP & Approx. \textcolor{teal}{Truncated Gaussian} & \textcolor{teal}{Truncated Gaussian} \\
VE & Approx. \textcolor{red}{Gaussian} & \textcolor{red}{Uniform} & RVE & Approx. 
 \textcolor{teal}{Truncated Uniform} & \textcolor{teal}{Truncated Uniform} \\
\end{tabular}
\end{center}
\end{table}







\subsection{Generation of Image Data} 
\textbf{Datasets}.
Both CIFAR-10 and ImageNet 64$\times$64 are obtained from public resources. %\footnote{
% CIFAR-10: \url{https://www.cs.toronto.edu/~kriz/cifar.html}. \\
% Resized ImageNet 64$\times$64: \url{https://image-net.org/download-images.php}.
% }
All RGB values lie in $[0, 1]$. The domain is $\Omega=[0,1]^d$, where $d=3\times 32 \times 32$ for the CIFAR-10 task, $d=3\times 64 \times 64$ for the ImageNet task, and $d=1\times 32 \times 32$ for the MNIST task.


\textbf{SDE}.
We use the reflected VE-SDE as the reference process due to its simplicity \citep{reflected_diffusion_model} and because it facilitates the warm-up training.
The SDE is discretized into 1000 steps.
The initial and the terminal scale of the diffusion are
 $\sigma_{\min}=0.01$ and $\sigma_{\max}=5$ respectively.
The prior reference is set as the uniform distribution on $\Omega$.


\textbf{Training}.
The alternate training Algorithm \ref{primal_dyanmic_IPF} can be accelerated with proper initialization, and the pre-training of the backward score model is critical for successfully training the model.
At the warmup phase, the forward score is set as zero and only the backward score model is trained by inheriting the setup in the reflected SGM \citep{reflected_diffusion_model}.
The learning rate is $10^{-5}$.
To improve the training efficiency and stabilize the full path-based training target in Algorithm \ref{primal_dyanmic_IPF}, 
we use Exponential Moving Average (EMA) in the training with the decay rate of 0.99 \citep{score_matching, vahdat2021score, reflected_diffusion_model}.




\textbf{Neural networks}.
As the high-accuracy inference task relies more on the backward score model than on the forward score model, the backward process is equipped with a larger and more advanced architecture. 
The backward score function uses NCSN++ \citep{score_sde} for the CIFAR-10 task and ADM \citep{SGMS_beat_GAN} for the ImageNet task.
The NCSN++ network has 107M parameters.
The ADM network has 295M parameters. For MNIST, a smaller U-Net structure with 1.3M parameters (2 attention heads per attention layer, 1 residual block per downsample, 32 base channels) is used for both forward and backward processes.
Many previous studies have verified the success of these neural networks in the diffusion based generative tasks \citep{score_sde, reflected_diffusion_model, forward_backward_SDE}.
The forward score function is modeled using a simpler U-Net %\citep{ronneberger2015u} % try less hyperparameters
with 62M parameters.


\textbf{Inference}.
In both CIFAR-10 and ImageNet 64$\times$64 tasks, images are generated unconditionally, and the quality of the samples is evaluated using Fr\'echet Inception Distance (FID) over 50,000 samples \citep{heusel2017gans,score_sde}. In MNIST, the quality of the samples is evaluated using Negative Log-Likelihood (NLL).
Predictor-Corrector using reflected Langevin dynamics is used to further improve the result which does not require any change of the model structure \citep{score_sde, forward_backward_SDE, reflected_diffusion_model, sebastien_bubeck}.
\begin{gather*}
\textbf{x}_t' 
= \mathrm{reflection} \Big( \textbf{x}_t 
+ \sigma_t \textbf{s}(t, \textbf{x}_t) 
+ \sqrt{2 \sigma_t} \varepsilon \Big), \quad \varepsilon \sim N(\mathbf{0}, I) \\
\textbf{s}(t, \textbf{x}_t) 
= \frac{1}{g} [ \overrightarrow\bz^{\theta}_t (t, \textbf{x}_t) 
+ \overleftarrow\bz^{\omega}_t (t, \textbf{x}_t)], \quad
\sigma_t = \frac{2 r_{\mathrm{SNR}}^2 g^2 \|\varepsilon\|^2 }
    {\| \textbf{s} (t, \textbf{x}_t) \|^2 }
\end{gather*}
where $\overrightarrow\bz^{\theta}_t$ and $\overleftarrow\bz^{\omega}_t$
are the forward and backward score functions, respectively, as in Algorithm \ref{primal_dyanmic_IPF}.

The likelihood of the diffusion model follows the probability flow ODE \citep{score_sde, forward_backward_SDE, reflected_diffusion_model}
\begin{gather*}
d \textbf{x}_t = \big[ f(\textbf{x}_t, t) - \frac{1}{2} g(t)^2 \textbf{s}(t, \textbf{x}_t) \big] dt
:= \tilde{f}(\textbf{x}_t, t)  dt \\
\log p_0(\textbf{x}_0)
= \log p_T(\textbf{x}_T) 
    + \int_0^T \nabla \cdot \tilde{f}(\textbf{x}_t, t) dt
\end{gather*}


\subsection{Optimal Transport May Help Reduce NFEs}


% Diffusion models suffer from slow generations and require a large number of function evaluations (NFEs) to generate high-fidelity images. In addition to the successful distillation techniques \citep{Progressive_distillation, Consistency_models} in one- or few-step generations, optimal transport also obtains wide attention in both theory and practice.

To demonstrate the effectiveness of OT in reducing the number of function evaluations (NFEs), we study generations based on different NFEs on a simulation example and the MNIST dataset and compare the reflected Schr\"odinger bridge with reflected diffusion (implicit) \citep{Diffusion_constrained}. We use the same setup (e.g. implicit training loss with the same training budget) to be consistent except that reflected SB uses a well-optimized forward network $\textcolor{red}{\overrightarrow\bz_t^{\theta}}$ to train the backward network $\textcolor{teal}{\overleftarrow \bz_t^{\omega}}$ while reflected diffusion can be viewed as the first stage of SB training by fixing $\textcolor{red}{\overrightarrow\bz_t^{\theta}}\equiv \textbf{0}$. We use the standard setup to train the score functions with NFE=100 and employ a uniform time grid to simulate the probability
flow with reduced NFEs (10, 12, 20).

We observe in Figure \ref{NFE_spiral_flower} that in the regime of NFE=20, both reflected diffusion (implicit) and reflected SB demonstrate remarkable performance in the simulation example. Despite the inherent compromise in sample quality with smaller NFEs, our investigation revealed that a well-optimized $\textcolor{red}{\overrightarrow\bz_t^{\theta}}$ significantly contributes to training $\textcolor{teal}{\overleftarrow \bz_t^{\omega}}$ compared to the baseline with $\textcolor{red}{\overrightarrow\bz_t^{\theta}}\equiv \textbf{0}$, leading to an improved sample quality even in cases where NFE is set to 10 and 12. Similar findings are observed in the MNIST dataset in Figure \ref{NFE_MNIST}. We use the same setup as before, finding NFE=100 delivers reasonable performance in both reflected diffusion (implicit) and reflected SB. Decreasing NFE to 50 shows slightly stronger performance retention in reflected SB.



\begin{figure}[!ht]
  \centering
  \vskip -0.15in
    \subfigure{\includegraphics[scale=0.4]{figures/NFEs_exps/NFE_comparison.png}} 
    \vskip -0.1in
  % \caption{Reflected Schr\"odinger bridge on three custom domains: flower, octagon, and heart.}\label{rsb_3_domains}
  \caption{Reflected Schr\"odinger bridge vs.\ reflected diffusion (implicit) based on different NFEs.}\label{NFE_spiral_flower}
  \vspace{-1em}
\end{figure}

\begin{figure}[!ht]
  \centering
  \vskip -0.1in
    \subfigure{\includegraphics[scale=0.35]{figures/NFEs_exps/NFE_MNIST_comparison.png}} 
    \vskip -0.05in
  % \caption{Reflected Schr\"odinger bridge on three custom domains: flower, octagon, and heart.}\label{rsb_3_domains}
  \caption{Reflected Schr\"odinger bridge vs.\ reflected diffusion (implicit) based on MNIST.}\label{NFE_MNIST}
  \vspace{-1em}
\end{figure}




$\newline$
$\newline$
$\newline$
$\newline$
$\newline$
\newpage

\begin{figure}[H]
\centering
\vskip -0.1in
\subfigure{\includegraphics[scale=0.38]{figures/mnist_32.png}} 
\vskip -0.1in
\caption{Generated samples via reflected SB on MNIST.
}
\end{figure}


\begin{figure}[H]
\centering
\subfigure{\includegraphics[width=\textwidth]{figures/cifar10_demo.png}} 
\caption{Generated samples via reflected SB on CIFAR-10.
}
\label{fig:cifar_demo}
\end{figure}



\begin{figure}[H]
\centering
\subfigure{\includegraphics[width=\textwidth]{figures/imagenet_demo.png}} 
\caption{Generated samples via reflected SB on ImageNet-64.
}
\label{fig:imagenet_demo}
\end{figure}



