
\documentclass[10pt]{article} % For LaTeX2e
%\usepackage{tmlr}
% If accepted, instead use the following line for the camera-ready submission:
\usepackage[accepted]{tmlr}
% To de-anonymize and remove mentions to TMLR (for example for posting to preprint servers), instead use the following:
%\usepackage[preprint]{tmlr}

% Optional math commands from https://github.com/goodfeli/dlbook_notation.
\input{math_commands.tex}

\usepackage{hyperref}
\usepackage{url}
\usepackage{booktabs}
\allowdisplaybreaks


\title{Understanding Guidance Scale in Diffusion Models \\ from a Geometric Perspective}

% Authors must not appear in the submitted version. They should be hidden
% as long as the tmlr package is used without the [accepted] or [preprint] options.
% Non-anonymous submissions will be rejected without review.

\author{\name Zhiyuan Zhan \email zhan@ms.k.u-tokyo.ac.jp \\
      \addr The University of Tokyo, Japan \\
      RIKEN Center for AIP, Japan
      \AND
      \name Liuzhuozheng Li \email liuzhuozheng-li@outlook.com \\
      \addr The University of Tokyo, Japan
      \AND
      \name Masashi Sugiyama \email sugi@k.u-tokyo.ac.jp\\
      \addr RIKEN Center for AIP, Japan \\
      The University of Tokyo, Japan}

% The \author macro works with any number of authors. Use \AND 
% to separate the names and addresses of multiple authors.

\newcommand{\fix}{\marginpar{FIX}}
\newcommand{\new}{\marginpar{NEW}}

\def\month{02}  % Insert correct month for camera-ready version
\def\year{2026} % Insert correct year for camera-ready version
\def\openreview{\url{https://openreview.net/forum?id=nfHimL6g8G}} % Insert correct link to OpenReview for camera-ready version


%Theorem Style in Main body
\theoremstyle{plain}
\newtheorem{thm}{Theorem}
\newtheorem{lem}[thm]{Lemma}
\newtheorem{prop}[thm]{Proposition}
\newtheorem{cor}[thm]{Corollary}
\newtheorem{assum}{Assumption}
\renewcommand{\theassum}{\Roman{assum}}

\theoremstyle{definition}
\newtheorem{defn}{Definition}

\theoremstyle{remark}
\newtheorem{rmk}{Remark}


\begin{document}


\maketitle

\begin{abstract}
      Conditional diffusion models have become a leading approach for generating condition-consistent samples, such as class-specific images. In practice, the guidance scale is a key hyperparameter in conditional diffusion models, used to adjust the strength of the guidance term. While empirical studies have demonstrated that appropriately choosing the scale can significantly enhance generation quality, the theoretical understanding of its role remains limited. In this work, we analyze the probabilistic guidance term from a geometric view under the linear manifold assumption and, based on this analysis, construct a geometric guidance model that enables tractable theoretical study. To address regularity issues arising from multi-modal data, we introduce a mollification technique that ensures well-posed dynamics. Our theoretical results show that increasing the guidance scale improves alignment with the target data manifold, thereby enhancing generation performance. We further extend our framework to nonlinear manifolds, and empirical results on real-world datasets validate the effectiveness of the proposed model and are consistent with our theories.
\end{abstract}

\section{Introduction}

Diffusion models \citep{ho2020denoising,song2021denoising} have achieved state-of-the-art performance on generative tasks across various domains, including images \citep{dhariwal2021diffusion,rombach2022high}, text-to-image synthesis \citep{saharia2022photorealistic}, videos \citep{ho2022video}, and audio \citep{kong2021diffwave}. This empirical success has led to increasing interest in understanding the theoretical foundations of diffusion models \citep{de2021diffusion,lee2022convergence,chen2023sampling,chen2023improved,gao2025wasserstein}. In particular, under the manifold hypothesis \citep{bengio2013representation}, the ability of diffusion models to output high-quality samples in high-dimensional spaces motivates researchers to investigate how these models can generate distributions supported on low-dimensional manifolds in high-dimensional ambient spaces \citep{debortoli2022convergence,oko2023diffusion,li2024adapting,wan2025elucidating}.

Controlling diffusion models to generate conditional distributions is another active area of research. Based on the theoretical framework proposed by \citet{song2021scorebased}, both classifier guidance and classifier-free guidance models \citep{dhariwal2021diffusion,ho2022classifier} apply a probabilistic guidance term---derived from Bayes' rule---to guide the sampling process toward the target conditional distribution. These methods also introduce a scale to adjust the strength of the guidance, and they showed that the performance depends strongly on the choice of the guidance scale and that an appropriate value can significantly improve generation quality. Recent empirical studies further demonstrated the importance of the guidance scale in conditional generation tasks \citep{dinh2023rethinking,sadat2024cads,sadat2025no}. However, the theoretical understanding of how the guidance scale affects the generation remains limited \citep{chidambaram2024what,wu2024theoretical}.

In this work, we propose a new geometric guidance model to enable the theoretical analysis of the role of the guidance scale in conditional generation. A key challenge in studying the guidance scale in classifier(-free) models is the analytical complexity of the probabilistic guidance term. To address this, we replace the probabilistic guidance with a new geometric guidance term. Specifically, under the linear manifold hypothesis \citep{chung2022improving}, we study the geometric property of the original probabilistic guidance term, building on an idea introduced by \citet{chen2023score}, and construct a linear geometric guidance term that plays the same role but is more tractable for theoretical analysis.

As a next step, the analysis of the geometric guidance model requires certain regularity conditions on the score function, such as the Lipschitz continuity. However, because of the multi-modality of data distributions, these conditions generally fail to hold \citep{lee2022convergence,gao2025wasserstein}. To overcome this issue, we introduce a mollification technique inspired by mollifiers in mathematical analysis \citep{evans2018measure} to construct a surrogate score function that satisfies the required properties for our analysis.

Building on this, we construct a well-posed geometric guidance model through which we address two questions: (\rnum{1}) whether the model can recover the target data manifold, and (\rnum{2}) what is the upper bound on the distance between the generated distribution and the target conditional distribution. Our results reveal the effects of the guidance scale: increasing the scale encourages the generated data to lie closer to the target manifold, and large guidance scales do not significantly increase an upper bound on the generation error. 

Finally, for the nonlinear case and real-world data distributions, we extend our framework by constructing a nonlinear geometric guidance model. This model builds on the same principles as the linear case, with the theoretical foundation obtained by extending the results of \citet{chung2022improving} to nonlinear data manifolds. Experimentally, we evaluate the nonlinear geometric guidance model on CIFAR-10 \citep{krizhevsky2009learning} and demonstrate its effectiveness for conditional generation. We also report how performance varies with the guidance scale, providing empirical evidence consistent with the behavior suggested by our linear analysis.

In summary, our contributions are:
\begin{enumerate}[label=\arabic*.]
      \item We construct a new linear geometric guidance term to replace the original probabilistic guidance term by studying its geometric property under the linear manifold hypothesis.

      \item To ensure the regularity of the unconditional score function, we apply a mollification technique to construct a surrogate score function, and build a well-posed geometric guidance model.

      \item By analyzing the geometric guidance model, we uncover the role of the guidance scale: a large guidance scale encourages the generated data to lie closer to the target data manifold and does not significantly affect the upper bound of the generation error.

      \item We propose a principled nonlinear geometric guidance model and evaluate it on CIFAR-10; the experiments demonstrate its effectiveness in conditional generation and illustrate guidance-scale effects beyond the linear setting.
\end{enumerate}

The remainder of this paper is organized as follows. Section \ref{sec:related_works} reviews related work, and Section \ref{sec:background} summarizes the technical background on diffusion models. Section \ref{sec:problem_setting_geometric_guidance} introduces the construction of the geometric guidance term, and Section \ref{sec:main_results_analysis_of_geometric_guidance} presents the theoretical analysis of the geometric guidance model. Section \ref{sec:experiments_nonlinear_case} extends the model to nonlinear settings and reports experimental results. Section \ref{sec:conclusion} concludes the paper and discusses limitations. Notation is summarized in Appendix \ref{appen:notations}.


\section{Related Works}\label{sec:related_works}

\paragraph{Convergence analysis:}

A number of recent works have analyzed the convergence properties of diffusion models under various assumptions \citep{de2021diffusion,lee2022convergence,lee2023convergence,chen2023sampling,chen2023improved,gao2025wasserstein}. \citet{de2021diffusion} established total variation bounds under $C^3$-regularity assumptions on the score for the target distribution. \citet{chen2023sampling} relaxed this requirement to Lipschitz continuity of the score function but for each intermediate density, which was further weakened in \citet{chen2023improved} to the Lipschitz continuity of the score only for the target density. Using functional inequalities, \citet{lee2022convergence,lee2023convergence} and \citet{gao2025wasserstein} have derived convergence guarantees under the assumption that the target density function is log-concave, with results in both total variation and Wasserstein distances. In contrast, our setting involves multi-modal target distributions for which log-concavity and smoothness assumptions do not hold \citep{lee2022convergence}. To address this, we introduce a technique that constructs a surrogate distribution satisfying the required regularity properties while closely approximating the original target.

\paragraph{Geometric structure:}

For real-world datasets, it is widely believed that high-dimensional data lie on a low-dimensional submanifold of the ambient space, a perspective known as the manifold hypothesis \citep{bengio2013representation}. When generating such data distributions, deep generative models often encounter challenges such as the curse of dimensionality \citep{bronstein2021geometric} and manifold overfitting \citep{loaiza-ganem2022diagnosing}. However, the strong empirical performance suggests that diffusion models can avoid these issues. As a result, understanding the theoretical behavior of diffusion models under the manifold hypothesis has attracted increasing attention. For example, \citet{debortoli2022convergence} established a Wasserstein convergence bound assuming that the target distribution is supported on a compact set. Under the additional assumption that the target data manifold is linear,  \citet{oko2023diffusion} showed that diffusion models can avoid the curse of dimensionality by providing a Wasserstein bound that depends only on the intrinsic dimension. \citet{chen2023score} further derived a total variation bound in terms of the intrinsic dimension based on a decomposition of the score function under the linear manifold assumption. Following this line of work, we further investigate the geometric structure of this decomposition to clarify the role of the score function in recovering the target data manifold, which in turn helps us construct the geometric guidance model.

\paragraph{Conditional generation:}

To control the generation \citep{song2021scorebased}, \citet{dhariwal2021diffusion} and \citet{ho2022classifier} applied the probabilistic guidance term to generate conditional distributions. Following their works and based on the geometric structure of noisy data manifolds under the linear assumption of the target data manifold \citep{chung2022improving}, \citet{chung2022improving,chung2024decomposed} and \citet{he2024manifold} proposed using a new time-dependent guidance in conditional generation to constrain the geometric structure of the generation process. From a different perspective, \citet{song2023loss} and \citet{bansal2023universal} proposed a time-independent guidance constructed from a loss function that is designed to enforce desired constraints on the generated data. Instead, our geometric guidance is constructed by studying the geometric property of the probabilistic guidance,  with the goal of replacing its role in conditional generation.

To adjust the strength of guidance, \citet{dhariwal2021diffusion} and \citet{ho2022classifier} also introduced a guidance scale, and their experiments showed that selecting an appropriate scale can significantly improve performance. However, there are limited works on theoretically analyzing the effects of the guidance scale in conditional generation. \citet{chidambaram2024what} studied the one-dimensional case and showed that increasing the scale not only reduces diversity of generated distributions but also leads generated data to drift to the extreme points in the support of the conditional distribution. \citet{wu2024theoretical} theoretically analyzed the influence of the guidance scale in the context of Gaussian mixture models, demonstrating that a large guidance scale diminishes distributional diversity while boosting classification confidence. Due to the analytical complexity of the probabilistic guidance term, previous works have focused on special cases. Therefore, we propose a geometric guidance term that plays the same role as the probabilistic guidance but is more amenable to theoretical analysis of the guidance scale.

\section{Background}\label{sec:background}

\subsection{Diffusion Model}

Let $\bm{X} \sim \Pb_X \in \mathcal{P}(\R^D)$ denote the target data distribution. The forward process in denoising diffusion probabilistic models (DDPMs) \citep{ho2020denoising} is governed by the stochastic differential equation (SDE)
\begin{equation}\label{eq:DDPM_SDE}
      \mathrm{d}\bm{X}_t = -\frac{1}{2}\beta(t)\bm{X}_t\mathrm{d}t + \sqrt{\beta(t)}\mathrm{d}\bm{W}_t,\quad \forall~ t \in [0,T],
\end{equation}
with the initial condition $\bm{X}_0 \sim \Pb_X$, where $(\bm{W}_t)_{t \geq 0}$ is a standard Brownian motion and $\beta \colon [0,T] \sto (0,\infty)$ is smooth; see \citet{song2021scorebased}. This SDE admits the following analytical solution:
\begin{equation}\label{eq:sol_DDPM}
      \bm{X}_t \stackrel{\mathrm{d}}{=} \sqrt{\alpha_t}\bm{X}_0 + \sqrt{1-\alpha_t}\bm{\xi},\quad \forall t \in [0,T],
\end{equation}
where $\bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_D)$ is a standard Gaussian, $\alpha_t \defeq \exp\bc{-\int_0^t\beta(s)\mathrm{d}s}$, and ``$\stackrel{\mathrm{d}}{=}$'' means equal in distribution. The derivation is provided in Appendix \ref{appen:analytic_solution_for_ddpm}.

The reverse process of DDPMs aims to generate $\Pb_X$, which corresponds to the time-reversal process of (\ref{eq:DDPM_SDE}). To this end, we need to consider the process
\begin{equation*}
      \bm{X}_t^{\leftarrow} \defeq \bm{X}_{T-t}
\end{equation*}
and study its stochastic dynamics. As shown in \citet{anderson1982reverse} and \citet{haussmann1986time}, the process $(\bm{X}_t^{\leftarrow})_{t \in [0,T]}$ satisfies the following SDE:
\begin{equation}\label{eq:reverse_SDE}
      \mathrm{d}\bm{X}_t^{\leftarrow} = \bc{\frac{1}{2}\beta(T-t)\bm{X}^{\leftarrow}_t+\beta(T-t)\nabla_{\bm{x}}\log p_{T-t}(\bm{X}^{\leftarrow}_t)}\mathrm{d}t + \sqrt{\beta(T-t)}\mathrm{d}\clo{\bm{W}}_t,
\end{equation}
where $p_t$ is the density function of $\bm{X}_t$, and $(\clo{\bm{W}}_t)_{t \in [0,T]}$ is the Brownian motion in reverse time.  A simplified proof can be found in \citet{tang2024score}. 

In practice, a neural network $\bm{s}_\theta(t,\cdot)$ with parameter $\theta$ is trained to estimate the score function $\nabla_{\bm{x}} \log p_t(\cdot)$ using the score matching method \citep{vincent2011connection}. By substituting $\nabla_{\bm{x}} \log p_t$ with the estimator $\bm{s}_\theta(t, \cdot)$ in (\ref{eq:reverse_SDE}), experiments \citep{song2019generative,song2021scorebased,dhariwal2021diffusion} showed that DDPMs achieve state-of-the-art performance in data generation tasks.  

\subsection{Probability Flow ODE}

Instead of simulating the stochastic process (\ref{eq:reverse_SDE}), denoising diffusion implicit models (DDIMs) \citep{song2021denoising} employ a deterministic approach for generation, which corresponds to the following ordinary differential equation (ODE):
\begin{equation}\label{eq:reverse_ODE}
      \frac{\mathrm{d}}{\mathrm{d}t}\bm{X}_t^\leftarrow = \frac{1}{2}\beta(T-t)\bc{\bm{X}_t^\leftarrow+\nabla_{\bm{x}}\log p_{T-t}(\bm{X}^{\leftarrow}_t)},\quad \forall~ t\in [0,T],
\end{equation}
with the initial condition $\bm{X}_0^\leftarrow \sim p_T$, which is called the probability flow ODE. The evolution of the density functions of $\bm{X}_t^\leftarrow$ under this deterministic process is equivalent to that of the stochastic reverse process (\ref{eq:reverse_SDE}), as the continuity equation associated with the ODE coincides with the Fokker--Planck equation corresponding to the SDE (\ref{eq:DDPM_SDE}); see \citet{song2021scorebased} for details.

In this paper, we focus on the deterministic dynamics, as the Wasserstein distance used as the main metric makes analyzing the ODE formulation more convenient than the SDE. The analysis naturally extends to the SDE via It\^o's formula \citep{gao2025wasserstein}. Following \citet{chen2023sampling} and \citet{chen2023improved}, we consider the Ornstein--Uhlenbeck process by setting $\beta(t) \equiv 2$ in Equation (\ref{eq:DDPM_SDE}) for simplicity; the particular choice of constant is unimportant, as varying it merely rescales time.

\subsection{Conditional Diffusion Model}

When working with paired data $(\bm{X}, Y) \sim \Pb_{XY}$, the goal of conditional generation is to generate the conditional distribution $\Pb_{X \mid Y}(\cdot \mid Y)$. In \citet{song2021scorebased}, diffusion models are directly applied to $\Pb_{X \mid Y}(\cdot \mid Y)$. Specifically, the forward process (\ref{eq:DDPM_SDE}) is first run with the initial condition $\bm{X}_0 \sim \Pb_{X \mid Y}(\cdot \mid Y)$ to obtain the density functions $p^y_t$ of $\bm{X}_t$. Then, the stochastic reverse process (\ref{eq:reverse_SDE}), or the deterministic process (\ref{eq:reverse_ODE}), is simulated to generate samples from $\Pb_{X \mid Y}(\cdot \mid Y)$. 

Moreover, these intermediate densities $p^y_t$ admit more explicit expressions. Suppose $(\bm{X}, Y) \sim \Pb_{XY}$ and we run the SDE (\ref{eq:DDPM_SDE}) with initial condition $\bm{X}_0 \sim \Pb_X = \int \Pb_{XY}(\cdot, \mathrm{d}y)$ to obtain $\bm{X}_t$. Let $p_t(\bm{x}_t, y)$ denote the joint density function of $(\bm{X}_t, Y)$. Then, it can be shown that
\begin{equation*}
      p_t(\bm{x}_t \mid y) = p^y_t(\bm{x}_t);
\end{equation*}
see Appendix \ref{appen:density_functions_in_conditional_ddpm} for details. 

Therefore, the score function for generating $\Pb_{X \mid Y}(\cdot \mid Y)$ can be decomposed as
\begin{equation}\label{eq:inteprete_cond_ddpm}
      \nabla_{\bm{x}} \log p^y_t(\bm{x}) = \nabla_{\bm{x}} \log p_t(\bm{x} \mid y) = \nabla_{\bm{x}} \log p_t(\bm{x}) + \nabla_{\bm{x}} \log p_t(y \mid \bm{x}),
\end{equation}
where $p_t(\bm{x})$ is the marginal density of $\bm{X}_t$ obtained by running (\ref{eq:DDPM_SDE}) with the initial condition $\bm{X}_0 \sim \Pb_X$. This term can be estimated using standard methods from unconditional DDPMs. The remaining term, $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$, is known as the guidance term, and there are two main approaches for approximating it: classifier guidance and classifier-free guidance \citep{dhariwal2021diffusion,ho2022classifier}. In classifier guidance, a time-dependent classifier is trained to approximate $p_t(y \mid \cdot)$ on all noisy data. In classifier-free guidance, a new neural network $\bm{s}_\theta(t, \bm{x}, y)$ is trained to estimate the conditional score $\nabla_{\bm{x}} \log p_t(\bm{x} \mid y)$, while $\bm{s}_\theta(t, \bm{x}, \emptyset)$ approximates the unconditional score $\nabla_{\bm{x}} \log p_t(\bm{x})$. The guidance term is then computed as
\begin{equation*}
      \nabla_{\bm{x}} \log p_t(y \mid \bm{x}) \approx \bm{s}_\theta(t,\bm{x},y) - \bm{s}_\theta(t,\bm{x},\emptyset).
\end{equation*}

In practice, a scaling parameter $\eta > 0$, known as the guidance scale, is typically introduced to control the strength of the guidance term \citep{dhariwal2021diffusion}. When using the deterministic dynamics (\ref{eq:reverse_ODE}), this modification is mathematically expressed as
\begin{equation}\label{eq:lambda_classifier_guidance}
      \frac{\mathrm{d}}{\mathrm{d}t}\bm{X}_t^\leftarrow = \bm{X}_t^\leftarrow+\nabla_{\bm{x}}\log p_{T-t}(\bm{X}^{\leftarrow}_t) + \eta \nabla_{\bm{x}}\log p_{T-t}( y \mid \bm{X}^{\leftarrow}_t),\quad \forall~ t\in [0,T],
\end{equation}
with the initial condition $\bm{X}_0^\leftarrow \sim p_T(\cdot \mid y)$.

As mentioned in Section \ref{sec:related_works}, although setting $\eta \neq 1$ may seem counterintuitive from a theoretical perspective, empirical studies \citep{dhariwal2021diffusion, ho2022classifier} have shown that selecting an appropriate value of $\eta$ can significantly improve performance.  In particular, increasing the guidance scale $\eta$ enhances the distinguishability of generated samples, but at the cost of reduced diversity \citep{ho2022classifier, chidambaram2024what, wu2024theoretical}.  However, theoretical understanding of how the guidance scale $\eta$ influences generation remains limited, due to the analytical complexity of the guidance term $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$ \citep{chidambaram2024what, wu2024theoretical}.

Therefore, the main objective of this work is to provide a theoretical analysis of the guidance scale $\eta$, under the assumption that the target data concentrate on a low-dimensional linear subspace $\mathcal{M}_y \subset \R^D$, called the target data manifold, i.e., $\supp \Pb_{X \mid Y}(\cdot \mid Y = y) \subset \mathcal{M}_y$. This analysis consists of two main steps: 
\begin{enumerate}[label=(\roman{*})]
      \item First, we replace the probabilistic guidance term with a geometric guidance term in order to avoid the difficulty of handling $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$ (see Section \ref{sec:problem_setting_geometric_guidance}). 
      \item Second, we analyze the modified dynamics under the geometric guidance from two perspectives: (a) how $\eta$ influences the recovery of the target data manifold, and (b) how it affects the distance between the generated distribution and the target distribution (see Section \ref{sec:main_results_analysis_of_geometric_guidance}).
\end{enumerate}

A central technical challenge in analyzing the geometric guidance dynamics is that $\nabla_{\bm{x}} \log p_t(\bm{x})$ may fail to satisfy desirable properties, such as the $L$-Lipschitz continuity (and the log-concavity of $p_t(\bm{x})$), due to the fact that $p_t(\bm{x})$ arises from a diffusion process initialized with a multi-modal distribution \citep{lee2022convergence,gao2025wasserstein}. To address this issue, we introduce a novel technique inspired by mollification in mathematical analysis \citep{evans2018measure}, which yields a surrogate distribution $p_t^\sigma(\bm{x})$ for which the geometric guidance dynamics is well-posed.

\section{Geometric Guidance Model}\label{sec:problem_setting_geometric_guidance}

In this section, our main objective is to construct a new guidance term to replace $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$ in Equation (\ref{eq:lambda_classifier_guidance}) from a geometric perspective. Specifically, the key idea is to understand the role that $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$ plays in recovering the target data manifold $\mathcal{M}_y$.

Note that $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$ appears as a component of $\nabla_{\bm{x}} \log p^y_t(\bm{x})$ by Equation (\ref{eq:inteprete_cond_ddpm}). This motivates us to investigate the geometric interpretation of the score function $\nabla_{\bm{x}} \log p_t(\bm{x})$ in the setting of unconditional DDPMs (see Section \ref{sub:geometric_interpretation_of_score_function}). Based on Equation (\ref{eq:inteprete_cond_ddpm}) and a basic property of $p_t(y \mid \bm{x})$, we then propose a replacement for $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$, which preserves its geometric role but is more tractable for theoretical analysis (see Section \ref{sub:geometric_guidance_for_conditional_generation}).

\subsection{Geometric Interpretation of Score Function}\label{sub:geometric_interpretation_of_score_function}

To study the geometric properties of the score function $\nabla_{\bm{x}} \log p_t(\bm{x})$, we first examine the geometric structure of the noisy data manifolds that arise during the DDPM process. \citet{chung2022improving} showed that, under the assumption that the target data lie on $\mathcal{M} \subset \R^D$, a linear subspace of the ambient space $\R^D$ with significantly lower dimension, the noisy data $\bm{X}_t$ concentrate on a hypersurface, i.e., a $(D - 1)$-dimensional manifold embedded in $\R^D$, for any $t > 0$. We generalize this result \citep[Proposition 1]{chung2022improving} in the following proposition; the proof is provided in Appendix \ref{sub:omit_poofs_in_section_ref_sec_problem_setting_geometric_guidance}.

\begin{prop}\label{prop:data_mfd_linear}
      Assume $\bm{Z} \sim \Pb^Z$ on $\R^d$, and $\bm{X} = A\bm{Z} \sim \Pb_X$ on $\R^D$ for an $A \in \mathcal{O}^{D \times d}$, i.e., $A \in \R^{D \times d}$ and $A^\top A = \bm{I}_d$. Define 
      \begin{equation*}
            \mathcal{M}^t \defeq \bb{\bm{x} \in \R^D \colon \norm*{(\bm{I}_D - AA^\top)\bm{x}} = r(t)},
      \end{equation*}
      where $r(t) \defeq \sqrt{(D-d)(1-\alpha_t)}$ and $\alpha_t = e^{-2t}$. Let $\bm{X}_t$ be generated by the DDPM forward process (\ref{eq:DDPM_SDE}) with the initial condition $\bm{X}_0 = \bm{X}$. If $d \ll D$, then $\bm{X}_t$ concentrates on $\mathcal{M}^t$ with high probability. 
\end{prop}

Based on this result, the next question is how the score function $\nabla_{\bm{x}} \log p_t(\bm{x})$ contributes to recovering these noisy data manifolds $\mathcal{M}^t$ during the reverse process (\ref{eq:reverse_ODE}). 

Under the same assumptions as those in Proposition \ref{prop:data_mfd_linear}, \citet{chen2023score} showed that
\begin{equation}\label{eq:decomoflogp}
      \nabla_{\bm{x}}\log p_t(\bm{x}) = A\lv{\nabla_{\bm{z}}\log p^Z_t(\bm{z})}_{\bm{z} = A^\top\bm{x}} - \frac{1}{1-\alpha_t}(\bm{I}_D-AA^\top)\bm{x},
\end{equation}
where $p^Z_t$ is the density associated with the forward process (\ref{eq:DDPM_SDE}) initialized from $p^Z$. An alternative derivation of this formula, along with an analysis of its geometric properties, is provided in Appendix \ref{appen:decomposition_of_score_function}.

Based on this orthogonal decomposition, we observe that the role of $\nabla_{\bm{x}} \log p_t(\bm{x})$ can be understood as two components: (\rnum{1}) the first term serves to generate the distribution $\Pb^Z$ in the latent space, and (\rnum{2}) the second term controls the reconstruction of the noisy data manifolds $\mathcal{M}^t$ in the ambient space. Informally, this decomposition can be summarized as
\begin{equation*}
      \nabla_{\bm{x}}\log p_t(\bm{x}) = \text{Generate Latent Distribution} ~+~ \text{Recover Data Manifolds } \mathcal{M}^t.
\end{equation*}
We formalize this intuition in the following theorem; see the proof in Appendix \ref{sub:omit_poofs_in_section_ref_sec_problem_setting_geometric_guidance}.

\begin{thm}\label{thm:role_of_score_linear}
    Under the same setting as that in Proposition \ref{prop:data_mfd_linear}, let $\bm{X}_{t,\parallel}^\leftarrow = AA^\top \bm{X}_t^\leftarrow$ and $\bm{X}_{t,\perp}^\leftarrow = \bm{X}_t^\leftarrow - \bm{X}_{t,\parallel}^\leftarrow$, where $\bm{X}_t^\leftarrow = \bm{X}_{T-t}$.
    \begin{enumerate}[label=(\alph{*})]
          \item Let $\bm{X}_{t,\parallel}^\leftarrow = A \bm{Z}_t^\leftarrow$ with $\bm{Z}_t^\leftarrow = A^\top \bm{X}_t^\leftarrow$. Then $\bm{Z}_t^\leftarrow$ satisfies
          \begin{equation*}
              \frac{\mathrm{d}}{\mathrm{d}t}\bm{Z}_t^\leftarrow = \bm{Z}_t^\leftarrow + \nabla_{\bm{z}}\log p_{T-t}^Z(\bm{Z}_t^\leftarrow),
          \end{equation*}
          which implies that $\bm{Z}_t = A^\top \bm{X}_t = \bm{Z}_{T-t}^\leftarrow$ follows the forward process (\ref{eq:DDPM_SDE}) initialized from $p^Z$.
          \item $\bm{X}_{t,\perp}^\leftarrow$ satisfies
          \begin{equation*}
              \frac{\mathrm{d}}{\mathrm{d}t}\bm{X}_{t,\perp}^\leftarrow = \bm{X}_{t,\perp}^\leftarrow - \frac{1}{1-\alpha_{T-t}}\bm{X}_{t,\perp}^\leftarrow.
          \end{equation*}
          Moreover, $\norm{\bm{X}_{t_0,\perp}^\leftarrow} = r(T-t_0)$ implies $\norm{\bm{X}_{t_0+\delta,\perp}^\leftarrow} = r(T-t_0-\delta)$, where $r(t) = \sqrt{(D-d)(1-\alpha_t)}$.
    \end{enumerate}
\end{thm}

In Theorem \ref{thm:role_of_score_linear}, statement (a) demonstrates that the parallel part $\nabla_{\bm{z}}\log p^Z_t(\bm{z})$ in the decomposition (\ref{eq:decomoflogp}) is responsible for generating the target latent distribution $p^Z$ via the reverse process of DDPMs, which has been thoroughly studied in \citet{chen2023score}. Meanwhile, statement (b) shows that, since
\begin{equation*}
    \norm*{(\bm{I}_D - AA^\top)\bm{X}_t^\leftarrow} = \norm*{\bm{X}_{t,\perp}^\leftarrow},
\end{equation*}
the orthogonal part $(\bm{I}_D - AA^\top)\bm{x}$ plays a key role in guiding the recovery of the noisy data manifolds $\mathcal{M}^t$, which provides an insight for designing geometric guidance in conditional generation.

\subsection{Geometric Guidance for Conditional Generation}\label{sub:geometric_guidance_for_conditional_generation}

Let us return to the conditional diffusion model. To apply the results from Section \ref{sub:geometric_interpretation_of_score_function} in studying the role of $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$ in guidance, we first impose the linear assumption for the target data manifold. 

We consider a two-class dataset $(\bm{X}, Y) \sim \Pb_{XY}$ on $\R^D \times \bb{1, 2}$ for simplicity; the following analysis readily extends to the multi-class case. Let $\Pb(Y=1) = w_1$ and $\Pb(Y=2) = w_2$ so that
\begin{equation*}
      \Pb_X = w_1\Pb_{X\mid Y}(\cdot \mid Y=1) + w_2\Pb_{X\mid Y}(\cdot \mid Y=2).
\end{equation*}
The linear assumption is stated as follows.

\begin{assum}\label{assum:condlineardata}
      For $i = 1,2$, there exists a $\bm{Z}_i \sim p^Z_i$ on $\R^{d_i}$ and an $A_i \in \mathcal{O}^{D \times d_i}$ such that
      \begin{equation*}
          \bm{X}_i \defeq A_i\bm{Z}_i \sim \Pb_{X\mid Y}(\cdot \mid Y=i),
      \end{equation*}
      and we further assume $A_1^\top A_2 = \bm{O}$.
\end{assum}
\begin{rmk}
    For this assumption, we provide two remarks.
    \begin{enumerate}[label=(\roman*)]
          \item It basically means that the support $\supp \Pb_{X\mid Y}(\cdot \mid Y=i) \subset \mathcal{M}_i \defeq \Img A_i$, the image of $\bm{z} \mapsto A_i\bm{z}$; in other words, $\Pb_{X\mid Y}(\cdot \mid Y=i)$ is supported on the linear space $\Img A_i$. The definition of the support of a probability measure is provided in Appendix \ref{appen:notations}. %Moreover, $(A_i)_{\#}\Pb_i^Z = \Pb_{X\mid Y}(\cdot \mid Y=i)$ (see the definition in }) implies that $\Pb_{X\mid Y}(\cdot \mid Y=i)$ admits a density function on $\mathcal{M}_i$ with respect to the canonical volume measure on $\mathcal{M}_i$, because $\Pb_i^Z$ has the density $p_i^Z$.

          \item $A_1^\top A_2 = \bm{O}$ indicates $\mathcal{M}_1 \perp \mathcal{M}_2$. This orthogonality assumption is introduced to simplify the subsequent analysis, but it does not significantly affect our conclusions regarding the guidance scale; see Appendix \ref{appen:more_details_for_orthogonality_assumption} for further discussion.
    \end{enumerate}
\end{rmk}

Next, we fix $Y=1$; our goal is to generate samples from the conditional distribution, which requires considering the geometric structure of the conditional score function $\nabla_{\bm{x}} \log p_t(\bm{x} \mid y = 1)$. By combining the results in Section \ref{sub:geometric_interpretation_of_score_function} with Equation (\ref{eq:inteprete_cond_ddpm}), the conditional score function has two different types of decomposition:
\begin{equation}\label{eq:two_decompo_condi_score}
      \begin{aligned}
            \nabla_{\bm{x}} \log p_t(\bm{x} \mid y = 1) & = \text{Generate Latent Distribution} ~+~ \text{Recover Data Manifolds } \mathcal{M}_1^t \\
            &= \nabla_{\bm{x}} \log p_t(\bm{x}) + \nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x}).
      \end{aligned}
\end{equation}
We will show that $\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x})$ plays the role of recovering the data manifolds $\mathcal{M}_1^t$ with respect to the first decomposition.

For the first decomposition in (\ref{eq:two_decompo_condi_score}), based on Assumption \ref{assum:condlineardata} and Proposition \ref{prop:data_mfd_linear}, because the noisy data manifolds generated by the forward process starting from $\mathcal{M}_1$ are given by
\begin{equation}\label{eq:def_of_m_1_t}
      \mathcal{M}_1^t = \bb{\bm{x} \in \R^D \colon \norm*{(\bm{I}_D - A_1A_1^\top)\bm{x}} = r(t)},
\end{equation}
the orthogonal part of $\nabla_{\bm{x}} \log p_t(\bm{x} \mid y = 1)$ in the first decomposition responsible for recovering $\mathcal{M}_1^t$ is parallel to $(\bm{I}_D - A_1A_1^\top)\bm{x}$ as shown in Section \ref{sub:geometric_interpretation_of_score_function}.

Intuitively, for the second decomposition in (\ref{eq:two_decompo_condi_score}), since $p_t(y = 1 \mid \bm{x})$ acts as a classifier for $\mathcal{M}_1^t$, we have $p_t(y = 1 \mid \bm{x}) \approx 1$ for any $\bm{x} \in \mathcal{M}_1^t$, i.e., $\log p_t(y = 1 \mid \bm{x})$ is approximately constant on $\mathcal{M}^t_1$. Therefore, by Lemma \ref{lem:normal_mfd_const}, $\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x})$ is almost normal to $\mathcal{M}_1^t$, 
\begin{equation*}
      \nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x}) \approx -\eta (\bm{I}_D - A_1A_1^\top)\bm{x},\quad \text{for some } \eta > 0,
\end{equation*}
because $(\bm{I}_D - A_1A_1^\top)\bm{x}$ is normal to $\mathcal{M}_1^t$ by Lemma \ref{lem:normal_mfd_const}. Rigorous details are provided in Appendix \ref{appen:construction_of_geometric_guidance}.

Therefore, the guidance term $\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x})$ partially contributes to the recovery of the data manifolds $\mathcal{M}_1^t$ during the reverse process. Consequently, it can be replaced by $-\eta(\bm{I}_D - A_1A_1^\top)\bm{x}$ for some $\eta > 0$. Based on this insight, we propose the following geometric guidance model for conditional generation:
\begin{equation}\label{eq:orginal_geom_guid}
      \frac{\mathrm{d}}{\mathrm{d}t}\bm{X}_t^\leftarrow = \bm{X}_t^\leftarrow+\nabla_{\bm{x}}\log p_{T-t}(\bm{X}^{\leftarrow}_t) - \eta P_1\bm{X}^{\leftarrow}_t,\quad P_1 \defeq \bm{I}_D - A_1A_1^\top.
\end{equation}

\section{Main Results: Analysis of Geometric Guidance Model}\label{sec:main_results_analysis_of_geometric_guidance}

In this section, we analyze the geometric guidance model (\ref{eq:orginal_geom_guid}) with the aim of uncovering the role of the guidance scale $\eta$. To understand its effects, we consider two related questions: whether the model can approximately estimate the target data manifold $\mathcal{M}_1$ (see Section \ref{sub:estimating_target_space}), and how to quantify the distance between the generated and target distributions (see Section \ref{sub:distance_to_target_distribution}). These two problems serve as a lens through which we investigate the influence of $\eta$ in conditional generation. 

Before addressing these two questions, it is necessary to ensure the well-posedness of the ODE (\ref{eq:orginal_geom_guid}); that is, we must establish regularities of $\nabla_{\bm{x}} \log p_t(\bm{x})$ such as its Lipschitz continuity and the log-concavity of $p_t(\bm{x})$, which requires careful analysis (see Section \ref{sub:smoothness_and_concavity}) because it is obtained from a multi-modal distribution $\Pb_X$.

\subsection{Well-posedness of Geometric Guidance Model}\label{sub:smoothness_and_concavity}

In general, the Lipschitz continuity of $\nabla_{\bm{x}} \log p_t(\bm{x})$ and the log-concavity of $p_t(\bm{x})$ induced by the DDPM forward process depend on properties of the initial distribution $\mu$. A basic requirement is that $\mu$ admit a density $p(\bm{x})$. Log-concavity of $p(\bm{x})$ then implies log-concavity of $p_t(\bm{x})$ \citep{gao2025wasserstein}, and Lipschitz continuity of $\nabla_{\bm{x}} \log p(\bm{x})$ implies the Lipschitz continuity of $\nabla_{\bm{x}} \log p_t(\bm{x})$ \citep{chen2023improved}.

However, in our setting, it is clear that $\Pb_X$ does not admit a density function. We therefore first deduce the necessary conditions on the latent distribution implied by $\Pb_X$; see Sections \ref{ssub:problems_in_latent_distribution} and \ref{ssub:mollification_technique}. Second, the multi-modality of $\Pb_X$ introduces irregularities in $p_t(\bm{x})$ \citep{lee2022convergence}, which we discuss in Section \ref{ssub:log_concavity_of_latent_density}. By solving these two problems, we construct a surrogate $p_t^\sigma(\bm{x})$ for use in the geometric guidance model (\ref{eq:orginal_geom_guid}), which is well-posed; see Section \ref{ssub:smoothness_and_concavity_of_score_function}.


\subsubsection{Problems in Latent Distribution}\label{ssub:problems_in_latent_distribution}

When $\mu$ does not admit a density function—for instance, when the support of $\mu$ lies on a lower-dimensional manifold in the ambient space—\citet{debortoli2022convergence} showed that the score function $\nabla_{\bm{x}} \log p_t$ is Lipschitz continuous under the assumption that $\supp \mu$ is compact, i.e., closed and bounded. This setting aligns with our problem but guarantees only Lipschitz continuity. In contrast, we establish a stronger result in the following Proposition \ref{prop:smoothoflatentdens}, which does not require the compactness, under the assumption that the target data manifold is linear. The proof is provided in Appendix \ref{appen:omitted_proofs_in_section_ref_sub_smoothness_and_concavity}. 

\begin{prop}\label{prop:smoothoflatentdens}
      Let $\bm{Z}$ be a random variable on $\R^k$ with the density function $p^Z$, and let $B \in \R^{n \times k}$. Assume there are $m_0,\Lambda > 0$ such that
      \begin{equation*}
            -\nabla_{\bm{z}}^2\log p^Z(\bm{z}) \succeq m_0\bm{I}_k,\quad \norm*{B}^2_{\op{op}} \leq \Lambda,
      \end{equation*}
      and $\lambda \defeq \lambda_{\op{min}}(B^\top B) \geq 0$, the minimum of all eigenvalues of $B^\top B$. For $\alpha \in \R$ and $\beta > 0$, let
      \begin{equation*}
            \bm{X} = \alpha  B\bm{Z}+\beta \bm{\xi},\quad \bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_n)
      \end{equation*}
      with the density function $p_X$ on $\R^n$. We have
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_X(\bm{x})}_{\op{op}} \leq L,\quad L\defeq \frac{1}{\beta^2} + \frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda+m_0\beta^2)}.
      \end{equation*}
\end{prop}
\begin{rmk}
      A direct application of this proposition is that it extends the result of \citet{debortoli2022convergence} to a non-compact setting, under the additional assumption that the latent distribution is strongly log-concave, i.e., $-\nabla_{\bm{z}}^2\log p^Z(\bm{z}) \succeq m_0\bm{I}_k$. If we are only concerned with the $L$-smoothness\footnote{$L$-smoothness of $f$ and $L$-Lipschitz continuity of $\nabla f$ are equivalent for $C^2$ functions; we use them interchangeably.} of $\log p_X$, the log-concavity of $p^Z$ can be relaxed to the $L$-smoothness; see Appendix \ref{appen:weaker_assumption_l_smoothness} for details.
\end{rmk}

\begin{cor}\label{cor:upboundoflog}
      Using the same notations as in Proposition \ref{prop:smoothoflatentdens}, we have
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) \preceq \bc{\frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda+m_0\beta^2)} - \frac{1}{\beta^2}}\bm{I}_n.
      \end{equation*}
\end{cor}
\begin{rmk}
      Note that, if $\alpha^2\Lambda < \alpha^2\lambda+m_0\beta^2$, such as $\Lambda = \lambda$, then
      \begin{equation*}
            -\nabla^2_{\bm{x}}\log p_X(\bm{x}) \succeq m_x\bm{I}_n,\quad m_x \defeq \frac{1}{\beta^2} \bc{1-\frac{\alpha^2\Lambda}{\alpha^2\lambda+m_0\beta^2}} > 0,
      \end{equation*}
      which implies that $p_X$ is $m_x$-strongly log-concave. Therefore, it shows that the strong log-concavity of $p^Z$ ensures not only the $L$-smoothness, but also the concavity of $\log p_X$.
\end{rmk}

Based on results in Proposition \ref{prop:smoothoflatentdens}, even if $\Pb_X$ does not admit a density function, the desired properties of the score function can still be guaranteed, provided that the latent distribution admits a density and satisfies strong log-concavity. However, in our setting, these two conditions are not satisfied:
\begin{enumerate}[label=(\roman*)]
      \item For the latent distribution of $\Pb_X$, because $\bm{Z}_i \sim \Pb_i^Z$ on $\R^{d_i}$, we first lift them to $\R^d$ with $d \defeq d_1 + d_2$ by defining
      \begin{equation*}
            \tilde{\bm{Z}}_1 = (\bm{I}_{d_1},\bm{O}_{d_1 \times d_2})^\top\bm{Z}_1 \sim \tilde{\Pb}_1^Z,\quad \tilde{\bm{Z}}_2 = (\bm{O}_{d_2 \times d_1},\bm{I}_{d_2})^\top\bm{Z}_2 \sim \tilde{\Pb}_2^Z.
      \end{equation*}
      Let $A = (A_1,A_2) \in \mathcal{O}^{D \times d}$. It follows that
      \begin{equation*}
            A\tilde{\bm{Z}}_i = A_i\bm{Z}_i \sim \Pb_{X\mid Y}(\cdot \mid Y=i),\text{ i.e., } A_{\#}\tilde{\Pb}_i^Z = \Pb_{X\mid Y}(\cdot \mid Y=i).
      \end{equation*} 
      Therefore, by Lemma \ref{lem:pushcondprob}, if $\bm{Z} \sim \Pb^Z \defeq w_1\tilde{\Pb}_1^Z+w_2\tilde{\Pb}_2^Z$, we have
      \begin{equation*}
            \bm{X} = A\bm{Z} \sim \Pb_X = w_1\Pb_{X\mid Y}(\cdot \mid Y=1) + w_2\Pb_{X\mid Y}(\cdot \mid Y=2).
      \end{equation*}
      But the problem is that the latent distribution $\Pb^Z$ does not admit a density function on $\R^d$.

      \item For log-concavity, even if the latent distribution admits a density function, it typically does not satisfy strong log-concavity due to its multi-modality \citep{lee2022convergence}.
\end{enumerate}

Therefore, in the following, we first introduce techniques to obtain a latent density and to address its log-concavity (Sections \ref{ssub:mollification_technique} and \ref{ssub:log_concavity_of_latent_density}), and then apply Proposition \ref{prop:smoothoflatentdens} to establish the desired properties of the score function (Section \ref{ssub:smoothness_and_concavity_of_score_function}).

\subsubsection{Mollification Technique}\label{ssub:mollification_technique}

Mollification \citep{evans2018measure} is a standard technique in mathematical analysis to address non-smoothness of functions. When dealing with a non-smooth function $f$, the idea is to find a smooth kernel function $k$ such that the convolution $g \defeq f * k$, which is clearly smooth, is close to $f$. 

Following this idea, we choose a Gaussian distribution $\mathcal{N}(\bm{0},\sigma^2 \bm{I}_d)$ with some $\sigma > 0$ as the kernel, and consider its convolution with $\Pb^Z$; see Remark \ref{rmk:conv_measure} for the definition of convolution between measures. Let
\begin{equation*}
      \Pb^Z_\sigma \defeq \Pb^Z * \mathcal{N}(\bm{0},\sigma^2 \bm{I}_d) = w_1\Pb_{1,\sigma}^Z+w_2\Pb_{2,\sigma}^Z,
\end{equation*}
where $\Pb_{i,\sigma}^Z \defeq \tilde{\Pb}_i^Z * \mathcal{N}(\bm{0},\sigma^2 \bm{I}_d)$. Note that both $\Pb^Z_\sigma$ and $\Pb_{i,\sigma}^Z$ admit density functions, denoted by $p^Z_\sigma$ and $p^Z_{i,\sigma}$ respectively, and
\begin{equation}\label{eq:multi_of_latent}
      p^Z_\sigma = w_1p^Z_{1,\sigma} + w_2p^Z_{2,\sigma}.
\end{equation}
Moreover, by the definition of convolution, if $\bm{Z}_i \sim p^Z_i$, then
\begin{equation}\label{eq:component_sigma_latent}
      \bm{Z}_{1,\sigma} \defeq (\bm{Z}_1,\bm{O})^\top + \sigma \bm{\zeta}_1 \sim p^Z_{1,\sigma},\quad\bm{Z}_{2,\sigma} \defeq (\bm{O},\bm{Z}_2)^\top + \sigma \bm{\zeta}_2 \sim p^Z_{2,\sigma},
\end{equation}
for $\bm{\zeta}_i \sim \mathcal{N}(\bm{0},\bm{I}_d)$ independent of $\bm{Z}_i$. Therefore, we obtain a smooth density $p^Z_\sigma$ on the latent space $\R^d$. 

Next, for $\Pb_X$, the following Proposition \ref{prop:linearcondsupp} addresses the question of whether sampling from $p^Z_\sigma$ yields a $\Pb_X^\sigma \defeq A_{\#} \Pb^Z_\sigma$ that is close to $\Pb_X$. The proof is provided in Appendix \ref{appen:omitted_proofs_in_section_ref_sub_smoothness_and_concavity}.

To measure the distance between probability measures, we use the $1$-Wasserstein distance in this work for analytical convenience. For $\mu,\nu \in \mathcal{P}(\R^D)$, it is defined by
\begin{equation*}
      \mathcal{W}_1(\mu,\nu) \defeq \inf \bb{\int \norm*{\bm{x}-\bm{y}} \mathrm{d}\gamma(\bm{x},\bm{y}) \colon \gamma \in \Gamma(\mu,\nu)} = \inf \bb{\E\bj{\norm*{\bm{X}-\bm{Y}}} \colon \bm{X} \sim \mu,~\bm{Y} \sim \nu},
\end{equation*}
where $\Gamma(\mu,\nu) \defeq \bb{\gamma \in \mathcal{P}(\R^D\times \R^D) \colon \gamma(A \times \R^D) = \mu(A),~ \gamma(\R^D\times B) = \nu(B)}$; see \citet{chewi2024statistical} for more details.

\begin{prop}\label{prop:linearcondsupp}
      Using the above notation, if $\bm{Z}_\sigma \sim \Pb^Z_{\sigma}$, then for $\bm{X}^\sigma \defeq A\bm{Z}_\sigma \sim \Pb_X^\sigma$, we have
      \begin{equation*}
            \Pb_X^\sigma = w_1\Pb^\sigma_{X\mid Y}(\cdot \mid Y=1) + w_2\Pb^\sigma_{X\mid Y}(\cdot \mid Y=2),
      \end{equation*}
      where $\Pb^\sigma_{X\mid Y}(\cdot \mid Y=i) \defeq A_{\#}\Pb^Z_{i,\sigma}$ for $i=1,2$, and it follows that
      \begin{equation*}
          \mathcal{W}_1(\Pb_X^\sigma,\Pb_X) \leq \sigma \sqrt{d}.
      \end{equation*}
\end{prop}

Therefore, the mollification technique provides a smooth latent density function $p^Z_\sigma$ that induces a distribution $\Pb^\sigma_X$ approximating $\Pb_X$.

\subsubsection{Log-Concavity of Latent Density}\label{ssub:log_concavity_of_latent_density}

In general, even if $p^Z_\sigma$ is smooth, we cannot directly assume that it is strongly log-concave, as it is multi-modal by Equation (\ref{eq:multi_of_latent}). However, we can still assume that each of its components $p^Z_{i,\sigma}$ is strongly log-concave, which, in fact, follows from the assumption of strong log-concavity of the original latent density $p_i^Z$.

\begin{assum}\label{assum:logconcave}
      Let $p^Z_{i}$ be the density function of $\Pb^Z_{i}$ defined on $\R^{d_i}$. There exists a large $m > 1$ such that
      \begin{equation*}
            -\nabla_{\bm{z}}^2\log p_{i}^Z(\bm{z}) \succeq m\bm{I}_{d_i},
      \end{equation*}
      i.e., $p_i^Z$ is $m$-strongly log-concave for $i=1,2$.
\end{assum}

Assumption \ref{assum:logconcave} ensures the strong log-concavity of each component $p^Z_{i,\sigma}$, but it does not guarantee that the overall mixture $p^Z_\sigma$ is strongly log-concave—this is a common difficulty in the case of multi-modal distributions. However, due to the mollification construction, the parameter $\sigma$ can be freely chosen, which enables us to establish the strong log-concavity of $p^Z_\sigma$ under the following assumption.

\begin{assum}\label{assum:bounddifflog}
    For a chosen $\sigma$, we assume that
      \begin{equation*}
            M \defeq \sup_{\bm{z}}\norm*{\nabla_{\bm{z}} \log p^Z_{1,\sigma}(\bm{z}) - \nabla_{\bm{z}} \log p^Z_{2,\sigma}(\bm{z})} < 2\sqrt{m-1}.
      \end{equation*}
\end{assum}
\begin{rmk}
      This assumption is novel but essential for addressing log-concavity in multi-modal settings. Characterizing the classes of $p_i^Z$ that satisfy it is nontrivial. As a concrete example, if each $p_i^Z(z)$ is a Gaussian truncated to a compact, convex set, then compactness implies that the difference of $\nabla \log p^Z_{i,\sigma}$ is uniformly bounded by a quantity depending on $\sigma$. Therefore, with an appropriate choice of $\sigma$, Assumption \ref{assum:bounddifflog} holds. The details, with a sufficient condition for Assumption \ref{assum:bounddifflog}, are discussed in Appendix \ref{appen:more_details_of_assumption_ref_assum_bounddifflog}.
\end{rmk}

Assumption \ref{assum:bounddifflog} is required to obtain an upper bound on $\nabla^2 \log p$, even when the density $p$ is multi-modal, as shown in the following lemma; see the proof in Appendix \ref{appen:omitted_proofs_in_section_ref_sub_smoothness_and_concavity}.
\begin{lem}\label{lem:logconcofmix}
      Let $p_1,p_2$ be two probability density functions on $\R^n$ such that $\nabla^2 \log p_i \preceq L_i\bm{I}_n$ for some constant $L_i \in \R$. Suppose that
      \begin{equation*}
            \sup_{\bm{x}}\norm*{\nabla \log p_1(\bm{x}) - \nabla \log p_2(\bm{x})} \leq M < \infty.
      \end{equation*}
      Then, for the mixture density $p = w p_1 + (1-w)p_2$ with $w \in (0,1)$, it holds that
      \begin{equation*}
            \nabla^2 \log p \preceq \bc{\max\bb{L_1,L_2}+\frac{1}{4}M^2}\bm{I}_n.
      \end{equation*}
\end{lem}

By Lemma \ref{lem:logconcofmix} and Proposition \ref{prop:smoothoflatentdens}, the strong log-concavity of the multi-modal latent density function $p^Z_\sigma$ can be guaranteed.

\begin{thm}\label{thm:log_concave_latent}
      Under Assumptions \ref{assum:logconcave} and \ref{assum:bounddifflog}, if $\sigma^2 < (4m-M^2)/(M^2m)$, then $p^Z_{\sigma}$ is strongly log-concave for $p^Z_\sigma = w_1p^Z_{1,\sigma} + w_2p^Z_{2,\sigma}$, i.e.,
      \begin{equation*}
            -\nabla^2_{\bm{z}}\log p^Z_{\sigma}(\bm{z}) \succeq m^z_0\bm{I}_d,\quad m^z_0 \defeq \frac{4m-M^2(1+m\sigma^2)}{4(1+m\sigma^2)}.
      \end{equation*}
\end{thm}
\begin{proof}
      Note that
      \begin{equation*}
            \bm{Z}_{1,\sigma} = (\bm{I}_{d_1},\bm{O})^\top\bm{Z}_1 + \sigma \bm{\zeta}_1 \sim p^Z_{1,\sigma}.
      \end{equation*}
      By Assumption \ref{assum:logconcave} and Corollary \ref{cor:upboundoflog}, with the choices $B=(\bm{I}_{d_1},\bm{O})^\top$, $m_0=m$, $\alpha = 1$, and $\beta = \sigma$, we obtain
      \begin{equation*}
            \nabla^2_{\bm{z}}\log p^Z_{1,\sigma}(\bm{z}) \preceq \bc{\frac{1}{\sigma^2(1+m\sigma^2)} - \frac{1}{\sigma^2}}\bm{I}_d.
      \end{equation*}
      For $p^Z_{2,\sigma}$, we similarly have
      \begin{equation*}
            \nabla^2_{\bm{z}}\log p^Z_{2,\sigma}(\bm{z}) \preceq \bc{\frac{1}{\sigma^2(1+m\sigma^2)} - \frac{1}{\sigma^2}}\bm{I}_d.
      \end{equation*}
      Then, because $p^Z_\sigma = w_1 p^Z_{1,\sigma} + w_2 p^Z_{2,\sigma}$, it follows from Assumption \ref{assum:bounddifflog} and Lemma \ref{lem:logconcofmix} that
      \begin{equation*}
            \nabla^2_{\bm{z}}\log p^Z_{\sigma}(\bm{z}) \preceq \bc{\frac{1}{\sigma^2(1+m\sigma^2)} - \frac{1}{\sigma^2} + \frac{1}{4}M^2}\bm{I}_d = -m_0^z\bm{I}_d. \qedhere
      \end{equation*}
\end{proof}

\subsubsection{Smoothness and Concavity}\label{ssub:smoothness_and_concavity_of_score_function}

Before proceeding, let us recall that the latent distribution $\Pb^Z$ of $\Pb_X$ is not ``good''. To address this, we construct a new distribution $\Pb_X^\sigma$ whose latent distribution $\Pb^Z_\sigma$ admits a ``good'' density function $p^Z_\sigma$, and which is close to $\Pb_X$. Consequently, instead of considering the score function associated with a DDPM initialized from $\Pb_X$, we consider a DDPM initialized from $\Pb_X^\sigma$, i.e.,
\begin{equation}\label{eq:def_p_sig_t}
    \bm{X}^\sigma_t = \sqrt{\alpha_t}A\bm{Z}_\sigma + \sqrt{1 - \alpha_t}\bm{\xi} \quad\sim\quad p_t^\sigma,
\end{equation}
where $\bm{Z}_\sigma \sim p^Z_\sigma$ and $\bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_D)$. We then modify the dynamics in (\ref{eq:orginal_geom_guid}) to define our final version of the geometric guidance model:

\begin{defn}[Geometric Guidance Model]\label{defn:geometric_guidance_model}
      For any $t \in [0,T-\delta]$,
      \begin{equation*}\label{eq:geometric_guidance}
            \frac{\mathrm{d}}{\mathrm{d}t}\tilde{\bm{X}}_t = \tilde{\bm{X}}_t+\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t) - \eta P_1\tilde{\bm{X}}_t,\quad \tilde{\bm{X}}_0 \sim \mathcal{N}(\bm{0},\bm{I}_D), \tag{$*$}
      \end{equation*}
      where $P_1 = \bm{I}_D - A_1A_1^\top$.
\end{defn}
\begin{rmk}
      (\rnum{1}) The initial condition is taken as $\mathcal{N}(\bm{0}, \bm{I}_D)$ instead of $p_T(\cdot \mid y)$ to reflect practical implementation settings. (\rnum{2}) The time interval is chosen as $[0, T - \delta]$ for some $\delta > 0$ to avoid the singularity at time $T$.
\end{rmk}

Therefore, our main objective now becomes establishing the Lipschitz continuity of $\nabla_{\bm{x}} \log p^\sigma_t(\bm{x})$ and the log-concavity of $p^\sigma_t(\bm{x})$, which follows from the strong log-concavity of the latent density $p^Z_\sigma(\bm{z})$. 

\begin{thm}\label{thm:smoothofscore}
      Under Assumption \ref{assum:condlineardata} and the same settings as in Theorem \ref{thm:log_concave_latent}, for the density function $p^\sigma_t$ defined in Equation (\ref{eq:def_p_sig_t}), we have
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_t^\sigma(\bm{x})}_{\op{op}} \leq L_t,\quad L_t \defeq \frac{2\alpha_t+(1-\alpha_t)m_0^z}{(1-\alpha_t)\bc{\alpha_t+(1-\alpha_t)m_0^z}},
      \end{equation*}
      and 
      \begin{equation*}
            -\nabla^2_{\bm{x}}\log p_t^\sigma(\bm{x}) \succeq m_t\bm{I}_D,\quad m_t \defeq \frac{m_0^z}{\alpha_t+(1-\alpha_t)m_0^z}.
      \end{equation*}
\end{thm}
\begin{proof}
      First, by Theorem \ref{thm:log_concave_latent}, the latent density $p^Z_{\sigma}$ is $m_0^z$-strongly log-concave. By the definition of $p^\sigma_t$, Proposition \ref{prop:smoothoflatentdens} implies that
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_t^\sigma(\bm{x})}_{\op{op}} \leq \frac{2\alpha_t+(1-\alpha_t)m_0^z}{(1-\alpha_t)\bc{\alpha_t+(1-\alpha_t)m_0^z}},
      \end{equation*}
      with the choices $B=A$, $m_0=m_0^z$, $\alpha = \sqrt{\alpha_t}$, and $\beta = \sqrt{1-\alpha_t}$. This follows from the fact that $A^\top A = \bm{I}_d$ (Assumption \ref{assum:condlineardata}), which indicates $\norm*{A}^2_{\op{op}} =1$ and $\lambda_{\op{min}}(A^\top A) = 1$. 

      For the log-concavity, Corollary \ref{cor:upboundoflog} directly yields
      \begin{equation*}
            -\nabla^2_{\bm{x}}\log p_t^\sigma(\bm{x}) \succeq \bc{\frac{1}{1-\alpha_t}\bc{1-\frac{\alpha_t}{\alpha_t+(1-\alpha_t)m_0^z}}}\bm{I}_D. \qedhere
      \end{equation*}
\end{proof}

Therefore, we have established the desired properties of $p^\sigma_t$, which ensure the well-posedness of the geometric guidance model (\ref{eq:geometric_guidance}). Moreover, from the definition of $m_t$ in Theorem \ref{thm:smoothofscore}, we can derive a lower bound that will be useful in the following analysis; see Appendix \ref{appen:omitted_proofs_in_section_ref_sub_smoothness_and_concavity} for the proof.
\begin{cor}\label{cor:infofm}
      There exists a small $\sigma > 0$ such that $m_0^z > 1$ and $m_I \defeq \inf_{t \in (0,T]} m_t > 1$.
\end{cor}

\subsection{Estimating Target Data Manifold}\label{sub:estimating_target_space}

For the geometric guidance model (\ref{eq:geometric_guidance}), the first problem is whether it can estimate the target data manifold $\mathcal{M}_1$. Specifically, we aim to show that the generated sample $\tilde{\bm{X}}_{T-\delta}$ approximately lies in $\mathcal{M}_1$. Since $\mathcal{M}_1 = \Img A_1$ is a linear subspace by Assumption \ref{assum:condlineardata}, it suffices to examine whether $\E\bj{\norm*{\tilde{\bm{Y}}_{T-\delta}}} \approx 0$, where
\begin{equation*}
      \tilde{\bm{Y}}_t = P_1\tilde{\bm{X}}_t,~P_1 = \bm{I}_D - A_1A_1^\top.
\end{equation*}
Multiplying both sides of Equation (\ref{eq:geometric_guidance}) by $P_1$, we obtain that $\tilde{\bm{Y}}_t$ satisfies the following dynamics:
\begin{equation}\label{eq:orthodynam}
      \frac{\mathrm{d}}{\mathrm{d}t}\tilde{\bm{Y}}_t = \tilde{\bm{Y}}_t + P_1\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t) - \eta \tilde{\bm{Y}}_t,\quad \tilde{\bm{Y}}_0 \sim \mathcal{N}(\bm{0},P_1),
\end{equation}
for $t \in [0,T-\delta]$. By analyzing the dynamics (\ref{eq:orthodynam}), the following theorem provides a convergence rate of $\E\bj{\norm*{\tilde{\bm{Y}}_{T-\delta}}} \sto 0$ with respect to the guidance scale $\eta$.

\begin{thm}\label{thm:estimmfd}
      Consider the dynamics (\ref{eq:orthodynam}) under Assumptions \ref{assum:logconcave} and \ref{assum:bounddifflog}. Then,
      \begin{equation*}
            \E\bj{\norm{\tilde{\bm{Y}}_{T-\delta}}} \leq \mathcal{O}\bc{e^{-\eta}+\frac{1}{\eta}}.
      \end{equation*}
      In particular, for any $\varepsilon > 0$, by choosing $\eta = \Theta(\max\{\log(1/\varepsilon),1/\varepsilon\})$, $\E\bj{\norm{\tilde{\bm{Y}}_{T-\delta}}} < \varepsilon$.
\end{thm}
\begin{proof}[Proof sketch]
      We provide a sketch of the proof here; the full proof is given in Appendix \ref{appen:proof_of_theorem_ref_thm_estimmfd}. 

      The key idea is to derive a differential inequality for $\E\bj{\norm*{\tilde{\bm{Y}}_t}}$. First, we have
      \begin{equation}\label{eq:mainbody_ineq_y_t}
            \frac{\mathrm{d}}{\mathrm{d}t}\E\bj{\norm*{\tilde{\bm{Y}}_{t}}} \leq (1-\eta) \E\bj{\norm*{\tilde{\bm{Y}}_{t}}} +\E\bj{\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}}.
      \end{equation}
      To bound $\E\bj{\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}}$, the $L_t$-smoothness of $\log p^\sigma_{t}$ is required, which follows from Theorem \ref{thm:smoothofscore}. The smoothness implies that
      \begin{equation*}
            \norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)} \leq L_S\norm*{\tilde{\bm{X}}_t}+ C
      \end{equation*}
      for some constants $L_S$ and $C$. Therefore, it suffices to bound $\E\bj{\norm*{\tilde{\bm{X}}_t}}$. By deriving a differential inequality from Equation (\ref{eq:geometric_guidance}) and applying Gr\"onwall's inequality (Lemma \ref{lem:gronwall}), we obtain $\E\bj{\norm*{\tilde{\bm{X}}_t}} \leq M_1$ for some constant $M_1$, and thus $\E\bj{\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}} \leq M_2$. Substituting this bound into (\ref{eq:mainbody_ineq_y_t}) and applying Gr\"onwall's inequality once more yields the desired result.
\end{proof}
\begin{rmk}
      For this theorem, we provide two remarks.
      \begin{enumerate}[label=(\roman*)]
            \item Note that this result depends only on the $L_t$-smoothness of $\log p^\sigma_t$, and not on strong log-concavity. Therefore, Assumptions \ref{assum:logconcave} and \ref{assum:bounddifflog} can be relaxed; see further discussion in Appendix \ref{appen:weaker_assumption_l_smoothness}.

            \item  The universal guidance model,
            \begin{equation*}
            \frac{\mathrm{d}\bm{X}_t^\leftarrow}{\mathrm{d}t} = \bm{X}_t^\leftarrow + \nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow) - \eta \nabla f(\bm{X}_t^\leftarrow),
            \end{equation*}
            was proposed by \citet{bansal2023universal} to control the generation process such that the generated images match the prompt, i.e., $g(\bm{X}_T^\leftarrow) \approx \bm{c}$. In their setting, $f(\bm{x}) = \ell(\bm{c},g(\bm{x}))$ for some loss function $\ell$. An idea similar to that used in the proof of Theorem \ref{thm:estimmfd} can be extended to theoretically analyze the universal guidance model. If the $L$-smoothness of $\log p_t$ holds (see Appendix \ref{appen:weaker_assumption_l_smoothness}) and $f$ is strongly convex, then
            \begin{equation*}
            \E\bj{f(\bm{X}_T^\leftarrow)} \rightarrow \min f,\text{ as } \eta \sto \infty;
            \end{equation*}
            see Appendix \ref{appen:theoretical_analysis_for_universal_guidance} for more details.
      \end{enumerate}
\end{rmk}

Theorem \ref{thm:estimmfd} shows that the geometric guidance model can approximate the target data manifold. Specifically, as the guidance scale $\eta$ increases, the generated data increasingly lie close to the target manifold. This result is consistent with empirical observations on both synthetic datasets \citep{wu2024theoretical,chidambaram2024what} and real-world datasets \citep{dhariwal2021diffusion,sadat2024cads,sadat2025no}, as well as with the theoretical results in the one-dimensional case studied by \citet{chidambaram2024what}, which demonstrate that increasing $\eta$ causes the generated data to move toward the extreme points in the support of the target conditional distribution.

\subsection{Distance to Target Distribution}\label{sub:distance_to_target_distribution}

Let $\tilde{p}_t$ be the density function of $\tilde{\bm{X}}_t$ in the geometric guidance model (\ref{eq:geometric_guidance}). The second question is how to measure the $1$-Wasserstein distance between the generated density $\tilde{p}_{T-\delta}$ and the target conditional distribution $\Pb_{X \mid Y}(\cdot \mid Y=1)$. Specifically, the goal is to provide an upper bound on $\mathcal{W}_1\bc{\tilde{p}_{T-\delta}, \Pb_{X \mid Y}(\cdot \mid Y=1)}$.

First, we require an additional assumption: the boundedness of the first moment of each conditional distribution $\Pb_{X \mid Y}(\cdot \mid Y = i)$, which can be reduced to the same condition on the latent distribution $p^Z_i$.

\begin{assum}\label{assum:bound_moment_latent_comp}
      For $i = 1,2$ and $\bm{Z}_i \sim p^Z_i$, $\mathfrak{m}^Z_i \defeq \E\bj{\norm*{\bm{Z}_i}} < \infty$.
\end{assum}

\begin{thm}\label{thm:wassbound}
      Under Assumptions \ref{assum:condlineardata}, \ref{assum:logconcave}, \ref{assum:bounddifflog}, and \ref{assum:bound_moment_latent_comp}, we obtain that
      \begin{equation*}
            \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} \leq \mathcal{O}(e^{-T}+\delta^{1/2} + \sigma + \eta^{-1}) + \tilde{C}
      \end{equation*}
      for some constant $\tilde{C}$.
\end{thm}
\begin{proof}[Proof sketch]
      The proof consists of two main steps:
      \begin{enumerate}[label=(\roman*)]
            \item Let $Q_1 = A_1A_1^\top$ be the orthogonal projection onto $\mathcal{M}_1 = \Img A_1$. By Theorem \ref{thm:estimmfd}, we have
            \begin{equation*}
                  \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} \leq \mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} + \mathcal{O}(e^{-\eta}+\eta^{-1}).
            \end{equation*}

            \item For $\mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)}$, we have
            \begin{equation*}
                  \mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} \leq \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_X} + \mathcal{W}_1\bc{(Q_1)_{\#}\Pb_X,\Pb_{X \mid Y}(\cdot \mid Y=1)},
            \end{equation*}
            where the first term $\mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_X}$ can be bounded by comparing the geometric guidance model (\ref{eq:geometric_guidance}) with the unconditional reverse dynamics, and the second term is directly bounded by Lemma \ref{lem:wassofpxtopxgiveny}. 
      \end{enumerate}
      The full proof is provided in Appendix \ref{appen:proof_of_theorem_ref_thm_wassbound}. \qedhere
\end{proof}
\begin{rmk}
      Since geometric guidance cannot carry as much information as probabilistic guidance due to its analytical simplicity, the error floor $\tilde{C}$ does not vanish when $\eta = 1$, unlike in probabilistic guidance models; see further discussion in Remark~\ref{rmk:discussion_of_tildeC}.
\end{rmk}

This result suggests that increasing the guidance scale does not harm the generating performance, which may appear counterintuitive and inconsistent with empirical observations. In practice, however, ODE dynamics are typically approximated using the Euler discretization (or the Euler–Maruyama scheme for SDEs), which introduces additional discretization error. In our setting, the Euler discretization error for the geometric guidance model (\ref{eq:geometric_guidance}) is bounded by $\mathcal{O}(h \eta^2)$, where $h$ denotes the step size; see Appendix~\ref{appen:discretization_error} for details. Therefore, the performance degradation observed at large guidance scales arises not from the model formulation itself, but from the discretization algorithm. For example, \citet[Figure 3]{wu2024theoretical} showed that the large guidance scale would harm the modality of the original data, but this problem can be mitigated by reducing the discretization step size.


\section{Nonlinear Extension}\label{sec:experiments_nonlinear_case}

In this section, our main objective is to construct a nonlinear geometric guidance model suitable for real-world image datasets, and to evaluate its generation performance under varying guidance scales $\eta$. 

The first challenge is to construct the geometric guidance term for image datasets, which may not lie in a linear subspace. To this end, we study the geometric structure of noisy data manifolds without assuming linearity of the target data manifold, by extending the result of Proposition \ref{prop:data_mfd_linear} to the nonlinear case (see Section \ref{sub:noisy_data_manifolds_for_nonlinear_case}). Then, following the idea of \cite{ross2024neural}, we train functions $F_{\theta}^t \colon \R^D \sto \R$ to model noisy data manifolds via $\mathcal{M}^t = (F_\theta^t)^{-1}(0)$ so that $\nabla_{\bm{x}}F^t_{\theta}$ can replace $(\bm{I}_D - AA^\top)\bm{x}$ as the nonlinear geometric guidance term (see Section \ref{sub:learning_geometric_guidance}). Finally, we examine this nonlinear geometric guidance model on CIFAR-10 \citep{krizhevsky2009learning}, and evaluate its performance under different guidance scales (see Section \ref{sub:experiments}).

\subsection{Noisy Data Manifolds for Nonlinear Case}\label{sub:noisy_data_manifolds_for_nonlinear_case}

The geometric guidance term $(\bm{I}_D - AA^\top)\bm{x}$ is constructed based on the result in Proposition \ref{prop:data_mfd_linear}, which assumes that the target data manifold $\mathcal{M} = \Img A$ is linear. However, for real-world image datasets, it may be unrealistic to assume that the data lie in a linear subspace. Instead, it is more reasonable to assume that the target image data lie on a nonlinear manifold $\mathcal{M} \subset \R^D$ with intrinsic dimension $d \ll D$; see Appendix \ref{appen:preliminaries_of_manifolds} for basic knowledge of manifolds. This assumption is known as the manifold hypothesis \citep{bengio2013representation}, and it has been supported by both theoretical analyses \citep{fefferman2016testing} and empirical studies \citep{brown2022verifying,loaiza-ganem2022diagnosing}. 

To construct a new geometric guidance term, because of the nonlinearity of $\mathcal{M}$, we must extend the result of Proposition \ref{prop:data_mfd_linear} to uncover the geometric structure of noisy data manifolds. Although the $d$-dimensional manifold $\mathcal{M} \subset \R^D$ is not assumed to be linear, we additionally require that it is locally isometric to $\R^d$. More precisely, we assume the existence of a $C^\infty$ function $\phi \colon \R^d \to \R^D$ such that $\Img \phi = \mathcal{M}$ and $\phi$ is an isometry; that is, $J\phi^\top J\phi \equiv \bm{I}_d$. Then, by Lemma \ref{lem:isoencod}, we obtain an analogue of Proposition \ref{prop:data_mfd_linear} in Theorem \ref{thm:hypersubmfd}, which shows that the noisy data manifolds $\mathcal{M}^t$ are hypersurfaces—i.e., $(D - 1)$-dimensional submanifolds of $\R^D$; see the proofs in Appendix \ref{appen:omitted_proofs_in_section_ref_sec_experiments_nonlinear_case}.

\begin{lem}\label{lem:isoencod}
      Let $\phi \colon \R^d \sto \R^D$ be a $C^\infty$ isometry such that $\mathcal{M} = \Img \phi \subset \R^D$ is a $d$-dimensional submanifold. Then, there exists a $C^\infty$ function $\phi^* \colon \R^D \sto \R^d$ such that $\phi^* \circ \phi = \op{id}_{\R^d}$ and
      \begin{equation*}
          J\phi^*(\phi(\bm{z})) = J\phi(\bm{z})^\top,\quad \forall~ \bm{z} \in \R^d.
      \end{equation*}
\end{lem}
\begin{rmk}
      In fact, the isometry of $\phi$ implies that $\Img \phi$ is a submanifold, because it is proper (i.e., the preimage of every compact set is compact) by the Hopf–Rinow theorem \citep{jost2008riemannian}.
\end{rmk}

\begin{thm}\label{thm:hypersubmfd}
      Let $\mathcal{M} \subset \R^D$ be a $d$-dimensional submanifold as defined in Lemma \ref{lem:isoencod}, and let $\Pb_X$ be a probability measure on $\R^D$ such that $\supp \Pb_X \subset \mathcal{M}$. Let $\bm{X}_t$ be generated by DDPM (\ref{eq:DDPM_SDE}) initialized from $\Pb_X$. If $d \ll D$, then $\bm{X}_t$ concentrates on a hypersurface $\mathcal{M}^t \subset \R^D$ with high probability, where
      \begin{equation*}
          \mathcal{M}^t \defeq \bb{\bm{x} \colon f^t(\bm{x}) = r(t)},\quad r(t) = \sqrt{(D-d)(1-\alpha_t)},
      \end{equation*}
      for some $C^\infty$ function $f^t \colon \R^D \sto \R$.
\end{thm}

\subsection{Learning Geometric Guidance}\label{sub:learning_geometric_guidance}


For an image dataset $(\bm{X},Y) \sim \Pb_{XY}$ with class label $Y \in \bb{1,2,\ldots, K}$, we adopt the union of manifolds hypothesis \citep{brown2022verifying}, that is, 
\begin{equation*}
      \supp \Pb_{X \mid Y} (\cdot \mid Y=y) \subset \mathcal{M}_y,
\end{equation*}
where $\mathcal{M}_y \subset \R^D$ is a $d_y$-dimensional submanifold. To apply Theorem \ref{thm:hypersubmfd}, we further assume that, for each $\mathcal{M}_y$, there exists an isometry $\phi_y \colon \R^{d_y} \sto \R^D$ such that $\Img \phi_y = \mathcal{M}_y$. Then, the noisy data manifolds generated by the forward process initialized from $\mathcal{M}_y$ are given by
\begin{equation*}
      \mathcal{M}^t_y \defeq \bb{\bm{x} \in \R^D \colon f^t_y(\bm{x}) = r(t)},\quad r(t) = \sqrt{(D-d_y)(1-\alpha_t)},
\end{equation*}
for some function $f^t_y \colon \R^D \sto \R$.

By adopting the same idea as in Section \ref{sub:geometric_guidance_for_conditional_generation}, for $\bm{x} \in \mathcal{M}^t_y$, the guidance term $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$ is approximately normal to $\mathcal{M}^t_y$ at $\bm{x}$—that is, it is approximately parallel to $\nabla_{\bm{x}} f^t_y(\bm{x})$. Therefore, we construct the nonlinear geometric guidance term as $\nabla_{\bm{x}} f^t_y(\bm{x})$ to replace the probabilistic guidance $\nabla_{\bm{x}} \log p_t(y \mid \bm{x})$ in the reverse process for conditional generation. The resulting nonlinear geometric guidance model (in deterministic form) is defined by
\begin{equation}\label{eq:non_linear_geo_guid}
      \frac{\mathrm{d}}{\mathrm{d}t}\bm{X}_t^\leftarrow = \bm{X}_t^\leftarrow+\nabla_{\bm{x}}\log p_{T-t}(\bm{X}^{\leftarrow}_t) - \eta \nabla_{\bm{x}} f^{T-t}_y(\bm{X}^{\leftarrow}_t),
\end{equation}
where $\nabla_{\bm{x}} \log p_t$ is the score function of the unconditional DDPM initialized from $\Pb_X$.

To implement the nonlinear geometric guidance model, one must estimate both the score function and the nonlinear geometric guidance term. The score function $\nabla_{\bm{x}} \log p_t(\bm{x})$ can be estimated using an unconditional diffusion model—specifically, by training a network $\bm{s}_\theta(t, \bm{x})$ via the score matching method \citep{vincent2011connection} on the unconditional data $\bm{X}$. The main task, then, is to estimate $\nabla_{\bm{x}} f^t_y(\bm{X}_t^\leftarrow)$. 

First, Theorem \ref{thm:hypersubmfd} shows that $\mathcal{M}^t_y = (f^t_y)^{-1}(r(t))$, so such a function $f^t_y$ is called a manifold-defining function in \citet{ross2024neural}. Following a similar idea, we train a network $F_{y,\theta}^t \colon \R^D \sto \R$ to estimate $f^t_y - r(t)$, so $F_{y,\theta}^t$ needs to satisfy
\begin{equation*}
      F_{y,\theta}^t(\bm{x})= 0,\text{ and } \nabla_{\bm{x}} F^t_{y,\theta} (\bm{x}) \neq \bm{0},\quad \forall~\bm{x} \in \mathcal{M}^t_y,
\end{equation*}
where the first condition follows directly from the definition of $\mathcal{M}^t_y$, and the second condition, called the rank condition, ensures that $F_{y,\theta}^t$ is a manifold-defining function, as guaranteed by the Constant Rank Theorem (Lemma \ref{lem:const_rank}). Therefore, the loss function for training $F_{y,\theta}^t$ is designed as
\begin{equation}\label{eq:loss_geoguid}
      \mathcal{L}^t_y(\theta) \defeq \E_{\bm{X} \sim p_t(\cdot \mid y)}\bj{\abs{F_{y,\theta}^t(\bm{X})}^2 - \kappa\norm*{\nabla_{\bm{x}} F^t_{y,\theta} (\bm{X})}^2},
\end{equation}
where $\kappa > 0$ is chosen for controlling the strength of the rank condition. We simply set $\kappa = 1$.

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.3\textwidth, height=0.3\textwidth]{images/generated_gegm.pdf}
    \caption{Images generated by GeGM on CIFAR-10}
    \label{fig:gene_img_cifar}
\end{figure*}

\subsection{Experiments}\label{sub:experiments}

\paragraph{Effectiveness of GeGM.} We use the Fr\'echet Inception Distance (FID) \citep{heusel2017gans} as the metric for evaluating generation performance, because it can be regarded as a practical surrogate for the Wasserstein distance. We compare the FID of samples generated by the nonlinear geometric guidance model (\ref{eq:non_linear_geo_guid}) (GeGM) with those generated by the classifier guidance model (CGM) (\ref{eq:lambda_classifier_guidance}). The results are reported in Table~\ref{tab:comp_fid}, where we present results for selected classes; the remaining classes are provided in Appendix \ref{appen:more_experiments}. Note that the guidance scales used for CGM and GeGM differ, since the norms of the probabilistic and geometric guidance terms are not comparable. For visualization, Figure \ref{fig:gene_img_cifar} displays images generated by the nonlinear GeGM. These results demonstrate the effectiveness of the nonlinear GeGM in generating real-world images.

\begin{table}[ht]
\caption{Comparison of FID on CIFAR-10}
\label{tab:comp_fid}
\begin{center}
\begin{tabular}{rccccc}
\toprule
~ & Automobile & Frog & Horse & Ship & Truck  \\
\midrule
    CGM ($\eta = 1$)   & 13.46 & 17.87 & 13.97 & 11.61 & 16.85 \\
    \midrule
    GeGM ($\eta = 50$)  & 9.70 & 16.28 & 12.65 & 13.84 & 11.02 \\
\bottomrule
\end{tabular}
\end{center}
\end{table}


\paragraph{Performance vs. guidance scale.} By applying the nonlinear GeGM (\ref{eq:non_linear_geo_guid}), we evaluate how generation performance varies with the guidance scale $\eta$ on selected classes from CIFAR-10; results for the remaining classes are provided in Appendix \ref{appen:more_experiments}. As shown in Figure \ref{fig:fidvsscale}, performance improves with increasing $\eta$ within a reasonable range. Since FID serves as a practical approximation of the Wasserstein distance, this trend is consistent with Theorem \ref{thm:wassbound}, even in the nonlinear setting.

\begin{rmk}
      We emphasize that the observed trends are consistent with the spirit of Theorem \ref{thm:wassbound} in nonlinear regimes, but they are not derived from it. Establishing nonlinear analogues of Theorems \ref{thm:estimmfd}--\ref{thm:wassbound} will require additional analysis and is left for future work.
\end{rmk}

\begin{figure*}[t]
    \centering
    \includegraphics[width=0.5\textwidth, height=0.33\textwidth]{images/fid_vs_eta_selected_classes.pdf}
    \caption{FID vs.\ guidance scale $\eta$ of GeGM on selected classes of CIFAR-10}
    \label{fig:fidvsscale}
\end{figure*}

\section{Conclusion}\label{sec:conclusion}

In this work, we studied the role of the guidance scale in conditional generation with diffusion models. To address the analytical intractability of the probabilistic guidance term, we introduced a geometric guidance model that enables theoretical analysis under the linear manifold hypothesis. To facilitate this analysis, we proposed a mollification technique to ensure the regularity of the score function in the presence of multi-modality. Our results showed that increasing the guidance scale within a reasonable range can enhance generation performance, in line with empirical observations reported in prior studies. We further extended the model to nonlinear settings, and experiments on real-world datasets demonstrated the effectiveness of the geometric guidance model and provided additional evidence consistent with our theoretical findings.

\paragraph{Limitations:}

While the geometric guidance offers a more tractable alternative to probabilistic guidance, it comes with certain limitations. Notably, our analysis showed that the upper bound on the Wasserstein distance between the generated and target conditional distributions contains a constant term that does not vanish, regardless of the choice of the guidance scale. This implies that, unlike probabilistic guidance, which can approximate the target conditional distribution by setting the scale to $1$, the geometric guidance does not guarantee convergence to the target distribution. This is a trade-off made for the sake of analytical tractability. 

Although our experiments on the nonlinear extension partially supported the theoretical results, our current theoretical analysis is restricted to the linear manifold setting. In the nonlinear case, the geometric structure of the score function remains unclear. Regarding regularity of the score function, while Lipschitz continuity can be ensured under compactness assumptions, extending this to the non-compact setting remains an open problem. Furthermore, the log-concavity of the score function cannot be guaranteed, even in compact nonlinear cases.


\subsubsection*{Acknowledgments}

We thank Ming Li and Luheng Wang for the helpful discussions. ZZ was supported by Institute for AI and Beyond at the University of Tokyo. MS was supported by JST ASPIRE Grant Number JPMJAP25B1. The authors also thank the anonymous reviewers for their careful reviews and insightful comments, which have been invaluable in improving both the clarity and rigor of this work.


% Save the main-body theorem environments before they are redefined for the appendix
\let\oldthm\thm \let\endoldthm\endthm
\let\oldlem\lem \let\endoldlem\endlem
\let\oldprop\prop \let\endoldprop\endprop
\let\oldcor\cor \let\endoldcor\endcor
\let\oldrmk\rmk \let\endoldrmk\endrmk
\let\olddefn\defn \let\endolddefn\enddefn

\bibliography{main}
\bibliographystyle{tmlr}

\appendix
% Redefine theorem styles for the appendix (numbered within sections)
\theoremstyle{plain}
\newtheorem{thmApp}{Theorem}[section]
\newtheorem{lemApp}[thmApp]{Lemma}
\newtheorem{propApp}[thmApp]{Proposition}
\newtheorem{corApp}[thmApp]{Corollary}

\theoremstyle{definition}
\newtheorem{defnApp}{Definition}[section]

\theoremstyle{remark}
\newtheorem{rmkApp}{Remark}[section]

\let\thm\thmApp \let\endthm\endthmApp
\let\lem\lemApp \let\endlem\endlemApp
\let\prop\propApp \let\endprop\endpropApp
\let\cor\corApp \let\endcor\endcorApp
\let\rmk\rmkApp \let\endrmk\endrmkApp
\let\defn\defnApp \let\enddefn\enddefnApp



\section{Notation}\label{appen:notations}

The symbols used throughout this paper are clarified below.
\begin{enumerate}[label=\arabic*.]
      \item \textbf{Letters:} Unless otherwise specified, lowercase letters such as $x$ and $\bm{x}$ denote deterministic variables, while uppercase letters such as $X$ and $\bm{X}$ denote random variables. Scalars are typically represented by non-bold symbols such as $x$ and $Y$, whereas vectors are denoted using bold symbols such as $\bm{x}$ and $\bm{X}$. In particular, we use $\bm{I}_n \in \R^{n \times n}$ to denote the identity matrix and $\bm{0} \in \R^n$ to denote the zero vector.

      \item  \textbf{Linear Algebra:} 
      \begin{enumerate}[label=(\roman*)]
            \item Let $\mathcal{O}^{m \times n} \subset \R^{m \times n}$ (with $m > n$) denote the set of matrices whose columns are orthonormal, i.e., those satisfying $A^\top A = \bm{I}_n$.

            \item For a vector $\bm{x} \in \R^n$, the notation $\norm*{\bm{x}}$ refers to the $\ell_2$-norm. For a matrix $A \in \R^{m \times n}$, the operator norm is defined as
            \begin{equation*}
                  \norm*{A}_{\op{op}} = \sup_{\norm*{\bm{x}} = 1} \norm*{A\bm{x}} = \sqrt{\lambda_{\op{max}}(A^\top A)},
            \end{equation*}
            where $\lambda_{\op{max}}(\cdot)$ denotes the maximum eigenvalue.

            \item Let $A,B \in \R^{n\times n}$ be symmetric matrices, i.e., $A = A^\top$ and $B = B^\top$. We write $A \preceq B$ (or equivalently, $B \succeq A$) if $B - A$ is positive semi-definite, i.e.,
            \begin{equation*}
                  \bm{x}^\top (B-A)\bm{x} \geq 0,\quad \forall~\bm{x} \in \R^n.
            \end{equation*}
      \end{enumerate}

      \item \textbf{Calculus:} 
      \begin{enumerate}[label=(\roman*)]
            \item For a scalar-valued function $f \colon \R^n \sto \R$, the gradient with respect to $\bm{x}$ is denoted by $\nabla_{\bm{x}} f(\bm{x})$, and the Hessian matrix by $\nabla^2_{\bm{x}}f(\bm{x})$.

            \item For a vector-valued function $F \colon \R^n \sto \R^m$, $JF$ denotes the Jacobian matrix of $F$, and the second-order derivative $D^2 F$ is a bilinear map $D^2F(\bm{x}) \colon \R^n \times \R^n \sto \R^m$ defined by
            \begin{equation*}
                  D^2F(\bm{x})[\bm{v},\bm{w}] = \frac{\partial^2}{\partial s\partial t}F(\bm{x}+s\bm{v}+t\bm{w})\bigg|_{s=t=0} = \bc{\bm{v}^\top\nabla^2_{\bm{x}}F^1(\bm{x})\bm{w},\cdots,\bm{v}^\top\nabla^2_{\bm{x}}F^m(\bm{x})\bm{w}}^\top,
            \end{equation*}
            where $F = (F^1,\ldots,F^m)$. If each $F^i$ has continuous derivatives up to order $k$, $F$ is called $C^k$.

            \item  For any set $U \subset \R^n$, the characteristic function $\chi_U \colon \R^n \sto \R$ is defined by $\chi_U(\bm{x}) = 1$ if $\bm{x} \in U$, and $\chi_U(\bm{x}) = 0$ otherwise.

            \item For integrable functions $f,g \colon \R^n \sto \R$, their convolution is denoted by
            \begin{equation*}
                  f * g (\bm{x}) = \int_{\R^n} f(\bm{y})g(\bm{x}-\bm{y})\mathrm{d}\bm{y}.
            \end{equation*}

            \item For a function $f \colon \R^n \sto \R^m$, let $\Img f = f(\R^n)$ denote the image of $f$. In particular, for a matrix $A \in \R^{m \times n}$, $\Img A$ refers to the image of the linear map $\bm{x} \mapsto A\bm{x}$.
      \end{enumerate}

      \item \textbf{Probability-related Symbols:} 
      \begin{enumerate}[label=(\roman*)]
            \item We fix the base probability space $(\Omega,\mathcal{F},\Pb)$, where $\Omega$ is the sample space, $\mathcal{F}$ is a $\sigma$-algebra, and $\Pb$ is a probability measure on $\mathcal{F}$.

            \item On $\R^n$, we typically work with the Borel $\sigma$-algebra $\mathcal{B}(\R^n)$, and let $\mathcal{P}(\R^n)$ denote the set of all probability measures defined on $\mathcal{B}(\R^n)$. Symbols such as $\mu$ and $\nu$ represent elements of $\mathcal{P}(\R^n)$. The integral with respect to a measure $\mu$ is denoted by $\int f(\bm{x})\mathrm{d}\mu(\bm{x})$ or equivalently by $\int f(\bm{x})\mu(\mathrm{d}\bm{x})$.

            \item For a measurable map $f \colon \Omega \sto \R^n$, the push-forward measure of $\Pb$ under $f$ is denoted by $f_{\#}\Pb$, and is defined as
            \begin{equation*}
                  f_{\#}\Pb(U) = \Pb(f^{-1}(U)),\quad \forall~U \in \mathcal{B}(\R^n).
            \end{equation*}
            
            \item A random variable (or vector) $\bm{X} \colon \Omega \sto \R^n$ is a measurable map. Its distribution, denoted by $\Pb_X$ (or $\Pb^X$), is a probability measure on $\R^n$ defined by $\Pb_X = \bm{X}_{\#}\Pb$. For some $\mu \in \mathcal{P}(\R^n)$, we say $\bm{X} \sim \mu$ if $\mu = \Pb_X$. Two random variables $\bm{X}$ and $\bm{Y}$ are said to be equal in distribution, denoted by $\bm{X} \stackrel{\mathrm{d}}{=} \bm{Y}$, if $\Pb_X = \Pb_Y$.

            \item For $\bm{X} \sim \Pb_X$, if $\Pb_X$ is absolutely continuous with respect to the Lebesgue measure $\mathrm{d}\bm{x}$, then by the Radon-Nikodym Theorem, there is a function $p_X$ (or denoted by $p^X$) such that
            \begin{equation*}
                  \Pb_X(U) = \int_U p_X(\bm{x})\mathrm{d}\bm{x},\quad \forall~U \in \mathcal{B}(\R^n),
            \end{equation*}
            and $p_X$ is called the density function\footnote{When unambiguous, $p_X$ is also occasionally referred to as the distribution.} of $\bm{X}$. For a measurable function $g \colon \R^n \sto \R^m$, if $\bm{X} \sim \Pb_X$, then $g(\bm{X}) \sim g_{\#}\Pb_X$. When $\Pb_X$ admits a density $p_X$, the density of $g(\bm{X})$ is denoted by $g_{\#} p_X$. In particular, if $g(\bm{x}) = A\bm{x}$ for a matrix $A \in \R^{m \times n}$, $g_{\#}\Pb_X$ is also denoted by $A_{\#}\Pb_X$ for simplicity.

            \item For random variables $\bm{X} \colon \Omega \sto \R^n$ and $Y \colon \Omega \sto \R$, the joint distribution of $(\bm{X},Y) \colon \Omega \sto \R^n \times \R$ is denoted by $\Pb_{XY} = (\bm{X},Y)_{\#}\Pb$, a probability measure on $\R^n \times \R$. The conditional distribution $\Pb_{X\mid Y}(\cdot \mid Y)$ is defined as
            \begin{equation*}
                  \Pb_{X\mid Y}(U \mid Y) \defeq \Pb(\bm{X} \in U \mid Y),\quad \forall~U \in \mathcal{B}(\R^n),
            \end{equation*}
            which is a probability measure on $\R^n$.

            \item For a probability measure $\mu \in \mathcal{P}(\R^n)$, the support of $\mu$ is denoted by
            \begin{equation*}
                  \supp \mu = \bb{\bm{x} \in \R^n \colon \mu(B_r(\bm{x})) > 0,~\forall~r>0}
            \end{equation*}
            where $B_r(\bm{x}) \subset \R^n$ denotes the open ball centered at $\bm{x}$ with radius $r$. When $\mu$ admits a density function $p$,
            \begin{equation*}
                  \supp \mu = \clo{\bb{\bm{x} \in \R^n \colon p(\bm{x}) > 0}}
            \end{equation*}
      \end{enumerate}
\end{enumerate}

\section{More Details in Background}

\subsection{Analytic Solution for DDPMs}\label{appen:analytic_solution_for_ddpm}
To solve the SDE
\begin{equation*}
      \mathrm{d}\bm{X}_t = -\frac{1}{2}\beta(t)\bm{X}_t\mathrm{d}t + \sqrt{\beta(t)}\mathrm{d}\bm{W}_t,\quad \forall~ t \in [0,T],
\end{equation*}
we multiply both sides by the integrating factor $e^{\frac{1}{2}\int_0^t \beta(s)\mathrm{d}s}$. This gives
\begin{equation*}
      e^{\frac{1}{2}\int_0^t\beta(s)\mathrm{d}s}\mathrm{d}\bm{X}_t+\frac{1}{2}\beta(t)e^{\frac{1}{2}\int_0^t\beta(s)\mathrm{d}s}\bm{X}_t\mathrm{d}t = \sqrt{\beta(t)}e^{\frac{1}{2}\int_0^t\beta(s)\mathrm{d}s} \mathrm{d}\bm{W}_t,
\end{equation*}
which leads to
\begin{equation*}
      \mathrm{d}\bc{e^{\frac{1}{2}\int_0^t\beta(s)\mathrm{d}s}\bm{X}_t} = \sqrt{\beta(t)}e^{\frac{1}{2}\int_0^t\beta(s)\mathrm{d}s} \mathrm{d}\bm{W}_t,
\end{equation*}
by applying It\^o's formula to $e^{\frac{1}{2}\int_0^t\beta(s)\mathrm{d}s}\bm{X}_t$. Therefore, we obtain the solution
\begin{equation*}
  \bm{X}_t= \sqrt{\alpha_t}\bm{X}_0 + \bm{\xi}_t,
\end{equation*}
where $\alpha_t \defeq \exp\bc{-\int_0^t\beta(s)\mathrm{d}s}$, and
\begin{equation*}
      \bm{\xi}_t \defeq \int_0^te^{-\frac{1}{2}\int_s^t\beta(r)\mathrm{d}r}\sqrt{\beta(s)}\mathrm{d}\bm{W}_s.
\end{equation*}
Since $(\bm{W}_t)_{t \geq 0}$ is a standard Brownian motion on $\R^D$, it follows that $\bm{\xi}_t \sim \mathcal{N}(\bm{0},\sigma^2_{\xi_t} \bm{I}_D)$. To compute $\sigma^2_{\xi_t}$, let $\bj{\cdot,\cdot}$ denote the quadratic variation. Then
\begin{align*}
      \sigma^2_{\xi_t} &= \E\bj{\bj{\bm{\xi},\bm{\xi}}_t} \\
      &= \E\bj{\int_0^te^{-\int_s^t\beta(r)\mathrm{d}r}\beta(s) \mathrm{d} \bj{\bm{W},\bm{W}}_s} \\
      &= \int_0^te^{-\int_s^t\beta(r)\mathrm{d}r}\beta(s) \mathrm{d}s = 1-\exp\bc{-\int_0^t\beta(s)\mathrm{d}s}.
\end{align*}
(see \citet{le2016brownian} for details). As a result,
\begin{equation*}
      \bm{\xi}_t \stackrel{\mathrm{d}}{=} \sqrt{1-\alpha_t}\bm{\xi},\quad \bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_D).
\end{equation*}

\subsection{Density Functions in Conditional DDPMs}\label{appen:density_functions_in_conditional_ddpm}

\begin{prop}\label{lem:conddpm}
      Consider a joint data density function $p(\bm{x}, y)$ and the process governed by the SDE:
      \begin{equation*}
            \mathrm{d}\bm{X}_t = -\frac{1}{2}\beta(t)\bm{X}_t\mathrm{d}t + \sqrt{\beta(t)}\mathrm{d}\bm{W}_t.
      \end{equation*}
      Consider the following two scenarios:
      \begin{enumerate}[label=(\alph*)]
            \item Let $\bm{X} \sim p(\bm{x} \mid Y=y)$, and run the SDE for $\bm{X}_0 =\bm{X}$. Let $p_t^y(\bm{x}_t)$ be the distribution of $\bm{X}_t$.
            \item Let $(\bm{X},Y) \sim p(\bm{x},y)$, and run the SDE for $\bm{X}_0 =\bm{X}$. Let $p_t(\bm{x}_t,y)$ be the distribution of $(\bm{X}_t,Y)$.
      \end{enumerate}
      Then, we have
      \begin{equation*}
            p_t^y(\bm{x}_t) = p_t(\bm{x}_t \mid y).
      \end{equation*}
\end{prop}
\begin{proof}
      As shown in Equation (\ref{eq:sol_DDPM}),
      \begin{equation*}
            \bm{X}_t\stackrel{\mathrm{d}}{=} \sqrt{\alpha_t}\bm{X}_0 + \sqrt{1-\alpha_t}\bm{\xi},\quad \bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_D),
      \end{equation*}
      where $\alpha_t = \exp\bc{-\int_0^t\beta(s)\mathrm{d}s}$. Therefore, in the first case, we have
      \begin{equation*}
            p_t^y(\bm{x}_t) = (\sqrt{\alpha_t})_\#p(\bm{x} \mid y) * \mathcal{N}(\bm{0},(1-\alpha_t)\bm{I}_D).
      \end{equation*}
      Moreover, by Lemma \ref{lem:convsum}, since $\bm{\xi}$ is independent of $(\bm{X}_0, Y)$, it follows that
      \begin{align*}
            p_t(\bm{x}_t \mid y) &= p\bc{\sqrt{\alpha_t}\bm{x}_0 + \sqrt{1-\alpha_t}\bm{\xi} \mid y} \\
            &=  p\bc{\sqrt{\alpha_t}\bm{x} \mid y} * \mathcal{N}(\bm{0},(1-\alpha_t)\bm{I}_D) \\
            &= (\sqrt{\alpha_t})_\#p(\bm{x} \mid y) * \mathcal{N}(\bm{0},(1-\alpha_t)\bm{I}_D).
      \end{align*}
      Consequently, we obtain:
      \begin{equation*}
            p_t^y(\bm{x}_t) = p_t(\bm{x}_t \mid y). \qedhere
      \end{equation*} 
\end{proof}

\begin{lem}\label{lem:convsum}
      Consider three random variables, $\bm{X},\bm{Y} \in \R^n$, and $Z \in \R$. Let $\bm{Y}$ be independent of the pair $(\bm{X},Z)$, and $\bm{W} = \bm{X}+\bm{Y}$. Then, we have
      \begin{equation*}
            p_{W\mid Z}(\bm{w} \mid z) = \bc{p_{X \mid Z}(\cdot \mid z) * p_Y(\cdot)}(\bm{w}).
      \end{equation*}
      Or informally,
      \begin{equation*}
            p_{XY \mid Z}(\bm{x}+\bm{y} \mid z) = p_{X \mid Z}(\bm{x} \mid z) * p_Y(\bm{y}).
      \end{equation*}
\end{lem}
\begin{proof}
      Because $\bm{Y}$ is independent of $(\bm{X},Z)$, 
      \begin{equation*}
            p_{XYZ}(\bm{x},\bm{y},z) = p_{XZ}(\bm{x},z)p_Y(\bm{y}).
      \end{equation*}
      Let $D_w =\bb{(\bm{x},\bm{y}) \colon \bm{x}+\bm{y} \leq \bm{w}}$. Then, we have
      \begin{align*}
            \Pb(\bm{W} \leq \bm{w},Z\leq z) &= \Pb(\bm{X}+\bm{Y} \leq \bm{w},Z\leq z) \\
            &= \int_{-\infty}^z\bc{\iint_{D_w} p_{XYZ}(\bm{x},\bm{y},u) \mathrm{d}\bm{x}\mathrm{d}\bm{y}}\mathrm{d}u \\
            &= \int_{-\infty}^z\bc{\iint_{D_w} p_{XZ}(\bm{x},u)p_Y(\bm{y}) \mathrm{d}\bm{x}\mathrm{d}\bm{y}}\mathrm{d}u \\
            &= \int_{-\infty}^z \int_{-\infty}^{\bm{w}} \bc{p_{XZ}(\cdot,u) * p_Y(\cdot)}(\bm{s})\mathrm{d}\bm{s}\mathrm{d}u,
      \end{align*}
      where $\bm{W}=(W_i)_i \leq \bm{w}=(w_i)_i$ means $W_i \leq w_i$ for all $i=1,\ldots,n$, and $\int_{-\infty}^{\bm{w}}\mathrm{d}\bm{s} = \int_{-\infty}^{w_n}\cdots \int_{-\infty}^{w_1}\mathrm{d}s_1\cdots \mathrm{d}s_n$. It follows that
      \begin{equation*}
            p_{WZ}(\bm{w},z) = \bc{p_{XZ}(\cdot,z) * p_Y(\cdot)}(\bm{w}).
      \end{equation*}
      Therefore,
      \begin{equation*}
            p_{W\mid Z}(\bm{w} \mid z) = \frac{p_{WZ}(\bm{w},z)}{p_Z(z)} =  \bc{ \frac{p_{XZ}(\cdot,z)}{p_Z(z)}* p_Y(\cdot)}(\bm{w}) =  \bc{p_{X \mid Z}(\cdot \mid z) * p_Y(\cdot)}(\bm{w}). \qedhere
      \end{equation*}
\end{proof}

\begin{rmk}\label{rmk:conv_measure}
      In Lemma \ref{lem:convsum}, the existence of density functions is assumed, which also makes it necessary to assume the existence of the density for $\bm{X}_0$ in the proof of Proposition \ref{lem:conddpm}. However, this condition is often not satisfied in practice. To address this limitation, consider the convolution of two probability measures $\mu, \nu \in \mathcal{P}(\R^n)$, defined by
      \begin{equation*}
            \mu * \nu (U) \defeq \int_{\R^n}\int_{\R^n} \chi_U(\bm{x}+\bm{y}) \mathrm{d}\mu(\bm{x})\mathrm{d}\nu(\bm{y}).
      \end{equation*}
      Note that $\mu * \nu$ is still a probability measure. Moreover, it follows that if $\bm{X} \sim \mu$ and $\bm{Y} \sim \nu$ with $\bm{X}$ independent of $\bm{Y}$, then $\bm{X} + \bm{Y} \sim \mu*\nu$. Under this formulation, the conclusion of Lemma \ref{lem:convsum} remains valid in the general case:
      \begin{equation*}
            \Pb_{W\mid Z}(\cdot \mid Z) = \Pb_{X \mid Z}(\cdot \mid Z) * \Pb_{Y \mid Z}(\cdot \mid Z) =  \Pb_{X \mid Z}(\cdot \mid Z)*\Pb_Y(\cdot),
      \end{equation*}
      where the first equality follows from the fact that independence of $\bm{Y}$ and $(\bm{X}, Z)$ implies that $\bm{Y}$ is independent of $\bm{X}$ conditional on $Z$, and the second equality holds because $\bm{Y}$ is independent of $Z$ due to its independence from the pair $(\bm{X}, Z)$. Therefore, by following a similar line of reasoning as in the proof of Proposition \ref{lem:conddpm}—replacing statements about densities with statements about distributions—we can obtain the same result even when $\bm{X}_0$ does not admit a density function.
\end{rmk}

\section{More Details of Geometric Guidance}

\subsection{Omitted Proofs in Section \ref{sec:problem_setting_geometric_guidance}}\label{sub:omit_poofs_in_section_ref_sec_problem_setting_geometric_guidance}

\begin{proof}[Proof of Proposition \ref{prop:data_mfd_linear}]
      Fix a time $t > 0$. By Equation (\ref{eq:sol_DDPM}),
      \begin{equation*}
            \bm{X}_t = \sqrt{\alpha_t}A\bm{Z} + \sqrt{1-\alpha_t}\bm{\xi},
      \end{equation*}
      for some $\bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_D)$. It follows that
      \begin{equation*}
            f(\bm{X}_t) \defeq \norm*{(\bm{I}_D - A A^\top)\bm{X}_t} = \sqrt{1-\alpha_t}\norm*{(\bm{I}_D - A A^\top)\bm{\xi}}.
      \end{equation*}
      Note that $A A^\top$ is the orthogonal projection to $\Img A$. Therefore, there exists a $U \in \mathcal{O}^{D \times D}$ such that
      \begin{equation*}
            \bm{I}_D - A A^\top = U^\top \diag(\underbrace{1,\ldots,1}_{D-d},0,\ldots,0)U.
      \end{equation*}
      Moreover, the orthogonality of $U$ implies that $\bm{\nu} = (\nu_1,\ldots,\nu_D)^\top = U\bm{\xi} \sim \mathcal{N}(0,\bm{I}_D)$. Hence,
      \begin{equation*}
            f(\bm{X}_t) =\sqrt{1-\alpha_t}\norm*{(\bm{I}_D - A A^\top)\bm{\xi}} = \sqrt{1-\alpha_t}\bc{\nu_1^2+\cdots+\nu_{D-d}^2}^{\frac{1}{2}}.
      \end{equation*}
      For any $\varepsilon > 0$, by setting $\alpha = (D-d)\varepsilon$ in the Laurent-Massart bound (Lemma \ref{lem:laurent_massart}), we obtain
      \begin{equation*}
            \Pb \bc{r(t)\sqrt{1-2\sqrt{\varepsilon}} \leq f(\bm{X}_t) \leq r(t)\sqrt{1+2\sqrt{\varepsilon}+2\varepsilon}} \geq 1-2e^{-2(D-d)\varepsilon}.
      \end{equation*}
      Since $d \ll D$, we can choose $\varepsilon$ sufficiently small such that $\delta = 2e^{-2(D-d)\varepsilon}$ is also sufficiently small. As a result, $\Pb(f(\bm{X}_t) \approx r(t)) \geq 1 - \delta$, i.e., $\bm{X}_t$ concentrates on $\mathcal{M}^t = f^{-1}(r(t))$ with high probability.
\end{proof}

\begin{proof}[Proof of Theorem \ref{thm:role_of_score_linear}]
      First, by applying the orthogonal decomposition of the score function in Equation (\ref{eq:decomoflogp}), the deterministic reverse process (\ref{eq:reverse_ODE}) can be rewritten as
      \begin{equation}\label{eq:reverse_ODE_decomp}
          \frac{\mathrm{d}}{\mathrm{d}t}\bm{X}_t^\leftarrow = \bm{X}_t^\leftarrow+A\nabla_{\bm{z}}\log p^Z_{T-t}(A^\top\bm{X}_t^\leftarrow) - \frac{1}{1-\alpha_{T-t}}(\bm{I}_D-AA^\top)\bm{X}_t^\leftarrow.
      \end{equation}
      \begin{enumerate}[label=(\alph{*})]
            \item Because $A \in \mathcal{O}^{D \times d}$, we have $A^\top A = \bm{I}_d$ and $A^\top(\bm{I}_D - AA^\top) = \bm{O}$. Therefore, by multiplying $A^\top$ on the both sides of (\ref{eq:reverse_ODE_decomp}), 
            \begin{equation*}
                  \frac{\mathrm{d}}{\mathrm{d}t}\bm{Z}_t^\leftarrow = \bm{Z}_t^\leftarrow + \nabla_{\bm{z}}\log p_{T-t}^Z(\bm{Z}_t^\leftarrow),
            \end{equation*}
            for $\bm{Z}_t^\leftarrow = A^\top \bm{X}_t^\leftarrow$. Moreover, by the equivalence of the continuity equation and the Fokker-Planck equation (or by the statements in Appendix \ref{appen:decomposition_of_score_function}), $\bm{Z}_t = \bm{Z}_{T-t}^\leftarrow$ satisfies the forward process of DDPMs starting from $p^Z$.

            \item Similarly, by multiplying $\bm{I}_D - AA^\top$ on the both sides of (\ref{eq:reverse_ODE_decomp}),
            \begin{equation*}
                \frac{\mathrm{d}}{\mathrm{d}t}\bm{X}_{t,\perp}^\leftarrow = \bm{X}_{t,\perp}^\leftarrow - \frac{1}{1-\alpha_{T-t}}\bm{X}_{t,\perp}^\leftarrow = -\frac{\alpha_{T-t}}{1-\alpha_{T-t}}\bm{X}_{t,\perp}^\leftarrow,
            \end{equation*}
            for $\bm{X}_{t,\perp}^\leftarrow = (\bm{I}_D - AA^\top)\bm{X}_t^\leftarrow$. Note that $\alpha_{T-t} = e^{-2(T-t)}$. Therefore, this equation has the analytical solution given by
            \begin{equation*}
                \bm{X}_{t_0 + \delta,\perp}^\leftarrow = \sqrt{\frac{1-e^{-2\bc{T-(t_0 + \delta)}}}{1-e^{-2(T-t_0)}}}\bm{X}_{t_0,\perp}^\leftarrow.
            \end{equation*}
            When $\norm{\bm{X}_{t_0,\perp}^\leftarrow} = \sqrt{(D-d)\bc{1-e^{-2(T-t_0)}}}$, it follows that
            \begin{equation*}
                  \norm*{\bm{X}_{t_0 + \delta,\perp}^\leftarrow} = \sqrt{\frac{1-e^{-2\bc{T-(t_0 + \delta)}}}{1-e^{-2(T-t_0)}}}\norm*{\bm{X}_{t_0,\perp}^\leftarrow} =  \sqrt{(D-d)\bc{1-e^{-2(T-(t_0 + \delta))}}}. \qedhere
            \end{equation*}
      \end{enumerate}
\end{proof}

\subsection{Decomposition of Score Function}\label{appen:decomposition_of_score_function}

By Equation (\ref{eq:sol_DDPM}) and the assumption $\bm{X}_0 = A\bm{Z}$, we have
\begin{align*}
      \bm{X}_t &= \sqrt{\alpha_t}\bm{X}_0 + \sqrt{1-\alpha_t}\bm{\xi}\\
      &= \underbrace{\sqrt{\alpha_t}\bm{X}_0+\sqrt{1-\alpha_t}Q\bm{\xi}}_{\eqdef\bm{X}_{t,\parallel}}+\underbrace{\sqrt{1-\alpha_t}(\bm{I}_D-Q)\bm{\xi}}_{\eqdef\bm{X}_{t,\perp}}
\end{align*}
for some $\bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_D)$, where $Q = AA^\top$ is the orthogonal projection onto $\Img A$. 

We compute the covariance:
\begin{align*}
      \op{Cov}(Q\bm{\xi},(\bm{I}_D-Q)\bm{\xi}) &= \E\bj{Q\bm{\xi}\cdot((\bm{I}_D-Q)\bm{\xi})^\top} - \E\bj{Q\bm{\xi}}\cdot\E\bj{(\bm{I}_D-Q)\bm{\xi}}^\top \\
      &=\E\bj{Q\bm{\xi}\cdot((\bm{I}_D-Q)\bm{\xi})^\top} = Q\E\bj{\bm{\xi}\bm{\xi}^\top}(\bm{I}_D-Q) \\
      &=Q(\bm{I}_D-Q) = 0,
\end{align*}
which shows that $Q\bm{\xi}$ and $(\bm{I}_D - Q)\bm{\xi}$ are uncorrelated. Since they are jointly Gaussian (both are linear images of the same Gaussian vector $\bm{\xi}$), they are independent. Hence, $\bm{X}_{t,\perp}$ is independent of $\sqrt{1 - \alpha_t} Q\bm{\xi}$. Combined with the fact that $\bm{\xi}$ is independent of $\bm{X}_0$, it follows that $\bm{X}_{t,\parallel}$ is independent of $\bm{X}_{t,\perp}$. By Lemma \ref{lem:orth_decomp_density}, the density of $\bm{X}_t$ admits the decomposition
\begin{equation}\label{eq:prod_comp_density}
      p_t(\bm{x}) = p_{t,\parallel}(\bm{x}_\parallel)p_{t,\perp}(\bm{x}_\perp),
\end{equation}
where $p_{t,\parallel}$ and $p_{t,\perp}$ are the densities of $\bm{X}_{t,\parallel}$ and $\bm{X}_{t,\perp}$ with respect to the canonical volume measures on $\Img A$ and $(\Img A)^\perp$, respectively. Here, $\bm{x}_{\parallel} = Q\bm{x}$ and $\bm{x}_{\perp} = \bm{x} - \bm{x}_\parallel$. 

Next, let us analyze $p_{t,\parallel}$ and $p_{t,\perp}$, respectively.
\begin{enumerate}[label=(\roman{*})]
      \item For the parallel part, first define $\bm{Z}_t \defeq A^\top \bm{X}_t$. Then, by multiplying $A^\top$ on the both sides of Equation (\ref{eq:DDPM_SDE}), we obtain
      \begin{equation*}
            \mathrm{d}\bm{Z}_t = -\bm{Z}_t\mathrm{d}t + \sqrt{2}\mathrm{d}\bm{B}_t,
      \end{equation*}
      where $(\bm{B}_t)_{t \geq 0} = (A^\top \bm{W}_t)_{t \geq 0}$ is a standard Brownian motion on $\R^d$ by Lemma \ref{lem:trans_browanian}. Therefore, the process $\bm{Z}_t \sim p^Z_t$ is governed by the DDPM dynamics initialized from $p^Z$. Since
      \begin{equation*}
            \bm{X}_{t,\parallel} = Q\bm{X}_t = A\bm{Z}_t,
      \end{equation*}
      this shows that $\bm{X}_{t,\parallel}$ evolves as a diffusion process on the target data manifold $\mathcal{M} = \Img A$.
      
      Moreover, applying Lemma \ref{lem:change_var_mfd} gives
      \begin{equation}\label{eq:para_part_score}
            p_{t,\parallel}(\bm{x}_\parallel) = A_{\#}p^Z_t(\bm{x}_\parallel) = p^Z_t(A^\top\bm{x}_\parallel) = p^Z_t(A^\top\bm{x}).
      \end{equation}

      \item For the orthogonal part, we have
      \begin{equation*}
            \bm{X}_{t,\perp} = \sqrt{1-\alpha_t}P\bm{\xi} \sim \mathcal{N}(\bm{0},(1-\alpha_t)P),
      \end{equation*}
      where $P = \bm{I}_D - Q$ is an orthogonal projection with rank $D-d$. So $P = BB^\top$ for some $B \in \mathcal{O}^{D \times (D-d)}$. It follows that $\bm{X}_{t,\perp}$ is a Gaussian on $\Img B$, i.e., $\bm{X}_{t,\perp} = B\bm{W}$ for some $\bm{W} \sim \mathcal{N}(\bm{0},(1-\alpha_t)\bm{I}_{D-d})$. Therefore, $\bm{X}_{t,\perp}$ is basically a $(D-d)$-dimensional Gaussian. When $d \ll D$, as shown in the proof of Proposition \ref{prop:data_mfd_linear},
      \begin{equation*}
            \norm*{(\bm{I}_D - AA^\top)\bm{X}_t} = \norm*{\bm{X}_{t,\perp}} \approx r(t),
      \end{equation*}
      which implies that the orthogonal part $\bm{X}_{t,\perp}$ is responsible for the concentration of $\bm{X}_t$ on $\mathcal{M}^t$ and endows $\bm{X}_t$ with its geometric structure. Furthermore, by Lemma \ref{lem:gaussunif}, $p_{t,\perp}$ is approximately uniform on the sphere $\mathbb{S}^{(D-d)-1}(r(t))$. In other words, the density $p_t$, which is concentrated on the cylindrical-like surface $\mathcal{M}^t$, remains constant along radial directions and varies only in the longitudinal direction governed by $p_{t,\parallel}$---a consequence of diffusion along the subspace $\Img A$.

      Moreover, applying Lemma \ref{lem:change_var_mfd} again, we obtain
      \begin{equation}\label{eq:orth_part_score}
            p_{t,\perp}(\bm{x}_\perp) = B_{\#}p^W(\bm{x}_\perp) = p^W(B^\top \bm{x}_\perp) = p^W(B^\top \bm{x}),
      \end{equation}
      where
      \begin{equation*}
            p^W(\bm{w}) = (2\pi(1-\alpha_t))^{-\frac{D-d}{2}}\exp\bc{-\frac{\norm*{\bm{w}}^2}{2(1-\alpha_t)}}.
      \end{equation*}
\end{enumerate}

Finally, for the decomposition, by combining (\ref{eq:para_part_score}) and (\ref{eq:orth_part_score}) with (\ref{eq:prod_comp_density}), we get
\begin{equation*}
      \log p_t(\bm{x}) = \log p^Z_t(A^\top\bm{x}) + \log p^W(B^\top\bm{x}),
\end{equation*}
from which the orthogonal decomposition formula immediately follows:
\begin{equation*}
    \nabla_{\bm{x}} \log p_t(\bm{x}) = A\lv{\nabla_{\bm{z}}\log p^Z_t(\bm{z})}_{\bm{z} = A^\top\bm{x}} - \frac{1}{1-\alpha_t}(\bm{I}_D-AA^\top)\bm{x},
\end{equation*}
as originally derived via direct computation by \citet{chen2023score}.

For the geometric property, the randomness of $\bm{X}_t$ arises from the diffusion process on the target data manifold $\mathcal{M} = \Img A$, while the geometric structure of $\bm{X}_t$ results from the concentration behavior of the orthogonal part.

\subsection{Construction of Geometric Guidance}\label{appen:construction_of_geometric_guidance}

To clarify our intuition that $\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x})$ is ``almost normal'' to $\mathcal{M}_1^t$, we will show that there exists a small $\beta_t > 0$ such that
\begin{equation*}
      \norm{\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x}) + \eta_tP_1\bm{x}} \leq \beta_t,\quad \forall~\bm{x} \in \mathcal{M}_1^t,
\end{equation*}
for some scalar $\eta_t > 0$. But first, we need the following lemma.

\begin{lem}\label{lem:normal_approx}
      Let $\mathcal{M} \subset \R^D$ be a smooth manifold with dimension $D-1$. Let $V \subset \R^D$ be a tubular neighborhood of $\mathcal{M}$ with the orthogonal projection $\pi \colon V \sto \mathcal{M}$. Let $f \colon V \sto \R$ be a $C^2$-function satisfying the following two conditions.
      \begin{enumerate}[label=(\alph{*})]
            \item $\norm{\nabla^2_{\bm{x}}f(\bm{x})}_{\op{op}} \leq L$ for all $\bm{x} \in V$.
            \item $f|_{\mathcal{M}}$ is $\beta$-Lipschitz with the induced distance of $\R^D$ on $\mathcal{M}$.
      \end{enumerate}
      Then for any $\bm{x} \in V$,
      \begin{equation*}
            \norm{\nabla_{\bm{x}}f(\bm{x}) - \partial_nf(\pi(\bm{x}))n(\pi(\bm{x}))} \leq \beta + L\op{dist}(\bm{x},\mathcal{M}),
      \end{equation*}
      where $n \colon \mathcal{M} \sto \R^D$ is a continuous unit normal vector field along $\mathcal{M}$, $\partial_n f = \inn{\nabla f, n}$ is the derivative along $n$, and $\op{dist}(\bm{x},\mathcal{M}) = \inf \bb{\norm{\bm{x} - \bm{y}} \colon \bm{y} \in \mathcal{M}}$ is the distance from $\bm{x}$ to $\mathcal{M}$.
\end{lem}
\begin{proof}
      Let $\mathcal{M}$ be equipped with the induced Riemannian structure of $\R^D$ and $\nabla^M$ be the corresponding Levi-Civita connection. Because $\mathcal{M} \subset \R^D$ is a hypersurface, i.e., a submanifold with dimension $D-1$, 
      \begin{equation}\label{eq:levi_civita_decomp}
          \nabla f = \nabla^M f + (\partial_n f)n,
      \end{equation}
      see the details in \citet[Chapter 8]{lee2019introduction}. Fix $\bm{x} \in V$ with $\bm{y} = \pi(\bm{x}) \in \mathcal{M}$. Note that
      \begin{equation}\label{eq:dist_proj}
          \op{dist}(\bm{x},\mathcal{M}) = \norm{\bm{x} - \bm{y}},
      \end{equation}
      by \citet[Proposition 5.26 (c)]{lee2019introduction}. By the triangle inequality,
      \begin{equation}\label{eq:decomp_grad_ineq}
          \norm{\nabla_{\bm{x}}f(\bm{x}) - \partial_nf(\bm{y})n(\bm{y})} \leq \norm{\nabla_{\bm{x}}f(\bm{x}) - \nabla_{\bm{x}}f(\bm{y})} + \norm{\nabla_{\bm{x}}f(\bm{y}) - \partial_nf(\bm{y})n(\bm{y})}.
      \end{equation}
      \begin{enumerate}[label=\Roman{*}.]
            \item For the first term, by
            \begin{equation*}
                \nabla_{\bm{x}}f(\bm{x}) - \nabla_{\bm{x}}f(\bm{y}) = \int_0^1 \nabla_{\bm{x}}^2f(\bm{y} + s(\bm{x}-\bm{y}))(\bm{x} - \bm{y})\mathrm{d}s,
            \end{equation*}
            the fact that $\norm{\nabla^2_{\bm{x}}f(\bm{x})}_{\op{op}} \leq L$, and Equation (\ref{eq:dist_proj}), we have
            \begin{equation}\label{eq:grad_ineq_first}
                \norm{\nabla_{\bm{x}}f(\bm{x}) - \nabla_{\bm{x}}f(\bm{y})} \leq L\norm{\bm{x} - \bm{y}} = L\op{dist}(\bm{x},\mathcal{M}).
            \end{equation}

            \item For the second term, first, by (\ref{eq:levi_civita_decomp}),
            \begin{equation*}
                \norm{\nabla_{\bm{x}}f(\bm{y}) - \partial_nf(\bm{y})n(\bm{y})} = \norm{\nabla^M f(\bm{y})}.
            \end{equation*}
            By assumption, $f|_{\mathcal{M}}$ is $\beta$-Lipschitz with the induced distance of $\R^D$ on $\mathcal{M}$, i.e.,
            \begin{equation*}
                \abs{f(\bm{y}_1) - f(\bm{y}_2)} \leq \beta d_{\mathcal{M}}(\bm{y}_1,\bm{y}_2),
            \end{equation*}
            where $d_{\mathcal{M}}$ denotes the distance on $\mathcal{M}$ induced from $\R^D$. It implies that
            \begin{equation}\label{eq:grad_ineq_second}
                \norm{\nabla^M f(\bm{z})} \leq \beta,\quad \forall~\bm{z} \in \mathcal{M},
            \end{equation}
            see the details in \citet[Proposition 10.43]{boumal2023introduction}.
      \end{enumerate}
      Then combining the inequalities (\ref{eq:grad_ineq_first}) and (\ref{eq:grad_ineq_second}) with (\ref{eq:decomp_grad_ineq}),
      \begin{equation*}
          \norm{\nabla_{\bm{x}}f(\bm{x}) - \partial_nf(\bm{y})n(\bm{y})} \leq \beta + L\op{dist}(\bm{x},\mathcal{M}). \qedhere
      \end{equation*}
\end{proof}

Let $f_t(\bm{x}) = \log p_t(y = 1 \mid \bm{x})$. It is natural to assume that $f_t$ is $C^2$ on a tubular neighborhood $V$ of $\mathcal{M}^t_1$, that $\norm{\nabla^2 f_t}_{\op{op}} \leq L_t$ on $V$, and that $f_t$ is $\beta_t$-Lipschitz continuous on $\mathcal{M}^t_1$. Then by Lemma \ref{lem:normal_approx},
\begin{equation*}
    \norm{\nabla_{\bm{x}}f_t(\bm{x}) - \partial_nf_t(\pi(\bm{x}))n_t(\pi(\bm{x}))} \leq \beta_t + L_t\op{dist}(\bm{x},\mathcal{M}^t_1).
\end{equation*}
In particular, for any $\bm{x} \in \mathcal{M}^t_1$, we have $\pi(\bm{x}) = \bm{x}$ and $\op{dist}(\bm{x},\mathcal{M}^t_1) = 0$, so
\begin{equation*}
    \norm{\nabla_{\bm{x}}f_t(\bm{x}) - \partial_nf_t(\bm{x})n_t(\bm{x})} \leq \beta_t.
\end{equation*}
Two questions remain: whether $\partial_nf_t(\bm{x})n_t(\bm{x}) = -\eta_tP_1\bm{x}$ for some scalar $\eta_t > 0$, and how to bound $\beta_t$.

For the first question, we can choose $n_t(\bm{x}) = P_1\bm{x} / \norm{P_1\bm{x}}$ by the definition (\ref{eq:def_of_m_1_t}) of $\mathcal{M}^t_1$ and Lemma \ref{lem:normal_mfd_const}. So
\begin{equation*}
      \partial_nf_t(\bm{x})n_t(\bm{x}) = -\eta_t P_1\bm{x},
\end{equation*}
for 
\begin{equation*}
      \eta_t = -\frac{\partial_n f_t(\bm{x})}{\norm{P_1 \bm{x}}}.
\end{equation*}
Moreover, because $p_t(y = 1 \mid \bm{x})$ is the classifier for $(\bm{X}_t,y=1)$ and such $\bm{X}_t$ concentrates on $\mathcal{M}^t_1$ by Proposition \ref{prop:data_mfd_linear}, $f_t(\bm{x}) = \log p_t(y = 1 \mid \bm{x})$ decreases when $\bm{x}$ moves away from $\mathcal{M}^t_1$. So
\begin{equation*}
      \partial_n f_t(\bm{x}) < 0\quad \Rightarrow \quad \eta_t > 0.
\end{equation*}

Next, to bound $\beta_t$, we introduce the following lemma.
\begin{lem}\label{lem:confi_grad}
      Let $p(y=k \mid \bm{x})$ be a softmax classifier with logits $g_k(\bm{x})$ for $k=1,2,\cdots,K$, that is,
      \begin{equation*}
            p(y=k \mid \bm{x}) = \frac{\exp(g_k(\bm{x}))}{\sum_{j=1}^K \exp(g_j(\bm{x}))}.
      \end{equation*} 
      Assume $\norm{\nabla_{\bm{x}}g_k(\bm{x})} \leq L$ for all $k,\bm{x}$. Let $\mathcal{M}_k$ be the region where points with label $y=k$ concentrate on. Assume classifier confidence
      \begin{equation*}
            p(y = k \mid \bm{x}) > 1 - \varepsilon,\quad \forall~\bm{x} \in \mathcal{M}_k.
      \end{equation*}
      Then
      \begin{equation*}
            \norm{\nabla_{\bm{x}} \log p(y = k \mid \bm{x})} \leq 2L\varepsilon,\quad\forall~\bm{x} \in \mathcal{M}_k.
      \end{equation*}
\end{lem}
\begin{proof}
      Fix $k$ and let $f(\bm{x}) = \log p(y = k \mid \bm{x})$. Since $\sum_{j=1}^K p(y = j \mid \bm{x}) = 1$, we have
      \begin{align*}
            \nabla_{\bm{x}} f(\bm{x}) &= \nabla_{\bm{x}} g_k(\bm{x}) - \sum_{j=1}^K p(y = j \mid \bm{x})\nabla_{\bm{x}} g_j(\bm{x}) \\
            &= \sum_{j=1}^K p(y = j \mid \bm{x})\bc{\nabla_{\bm{x}} g_k(\bm{x}) - \nabla_{\bm{x}} g_j(\bm{x})} \\
            &= \sum_{j\neq k} p(y = j \mid \bm{x})\bc{\nabla_{\bm{x}} g_k(\bm{x}) - \nabla_{\bm{x}} g_j(\bm{x})}.
      \end{align*}
      By assumption,
      \begin{equation*}
            \norm*{\nabla_{\bm{x}} g_j(\bm{x}) - \nabla_{\bm{x}} g_k(\bm{x})} \leq \norm*{\nabla_{\bm{x}} g_j(\bm{x})} +  \norm*{\nabla_{\bm{x}} g_k(\bm{x})} \leq 2L.
      \end{equation*}
      Therefore,
      \begin{align*}
          \norm*{\nabla_{\bm{x}} f(\bm{x})} &\leq \sum_{j\neq k} p(y = j \mid \bm{x})\norm*{\nabla_{\bm{x}} g_j(\bm{x}) - \nabla_{\bm{x}} g_k(\bm{x})} \\
          &\leq 2L\sum_{j\neq k} p(y = j \mid \bm{x}) = 2L(1-p(y = k \mid \bm{x})).
      \end{align*}
      It implies that
      \begin{equation*}
          \norm*{\nabla_{\bm{x}} f(\bm{x})} \leq 2L\varepsilon,\quad \forall~\bm{x} \in \mathcal{M}_k,
      \end{equation*}
      by the assumption that classifier confidence $> 1 - \varepsilon$ on $\mathcal{M}_k$. \qedhere
\end{proof}

Therefore, we assume that each $p_t(y = 1 \mid \bm{x})$ satisfies the conditions in Lemma \ref{lem:confi_grad}, with $C$ denoting the corresponding bound on the logit gradients. If
\begin{equation*}
      p_t(y = 1 \mid \bm{x}) > 1- \varepsilon_t,\quad \forall~\bm{x} \in \mathcal{M}^t_1,
\end{equation*}
for a small $\varepsilon_t$, then
\begin{equation*}
      \norm{\nabla^M f_t(\bm{x})} \leq \norm{\nabla_{\bm{x}} f_t(\bm{x})} = \sqrt{\norm{\nabla^M f_t(\bm{x})}^2 + \abs{\partial_n f_t(\bm{x})}^2} \leq 2C\varepsilon_t,\quad \forall~\bm{x} \in \mathcal{M}^t_1.
\end{equation*}
So $\beta_t \leq 2C\varepsilon_t$. 

Combining the above results, we have
\begin{equation*}
      \norm{\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x}) + \eta_tP_1\bm{x}} \leq \beta_t,\quad \forall~\bm{x} \in \mathcal{M}_1^t,
\end{equation*}
for some $\eta_t > 0$. Moreover, $\beta_t  =\mathcal{O}(\varepsilon_t)$ for $p_t(y = 1 \mid \bm{x}) > 1- \varepsilon_t$ on $\mathcal{M}_1^t$.

%\begin{rmk}
%      At early stages of the reverse process (small $t$), $p_t(y = 1 \mid \bm{x})$ may be insufficiently confident on $\mathcal{M}_1^t$ due to class mixing, so the direction of $\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x})$ may not be well approximated by $P_1 \bm{x}$. However, this does not affect our main linear-theory guarantee: the strong log-concavity of $p_t^\sigma$ (Theorem \ref{thm:smoothofscore}) also stabilizes the geometric guidance model (\ref{eq:geometric_guidance}) with respect to the initial condition. Accordingly, we may restrict~(\ref{eq:geometric_guidance}) to the interval $[t_0, T - \delta]$, where $T - t_0$ precedes class mixing, and set $\tilde{\bm{X}}_{t_0} \sim q = p_{T - t_0}$. This modification does not affect Theorem \ref{thm:estimmfd}. For Theorem \ref{thm:wassbound}, it only changes the initial error $\mathcal{W}_1(\hat{p}_\delta^\sigma, p_\delta^\sigma)$ in Proposition \ref{prop:w2ofdmfromdiffini}; taking $\hat{X}_{t_0} \sim q$ in (\ref{eq:origfromguass}), Equation (\ref{eq:wassboundofini}) still yields $\mathcal{W}_1(\hat{p}_\delta^\sigma, p_\delta^\sigma) \leq \mathcal{O}(e^{-T})$.
%\end{rmk}

\section{More Details related to Main Results}

\subsection{Omitted Proofs in Section \ref{sub:smoothness_and_concavity}}\label{appen:omitted_proofs_in_section_ref_sub_smoothness_and_concavity}

\begin{proof}[Proof of Proposition \ref{prop:smoothoflatentdens}]
      First, the Hessian is 
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) = \frac{\nabla^2_{\bm{x}}p_X(\bm{x})}{p_X(\bm{x})} - \frac{\nabla_{\bm{x}}p_X(\bm{x})\nabla_{\bm{x}}p_X(\bm{x})^\top}{p_X(\bm{x})^2}.
      \end{equation*}
      To express the above formula explicitly, by the definition, for any $\bm{x} \in \R^n$,
      \begin{equation*}
            p_X(\bm{x}) = \int_{\R^k}K_{z}(\bm{x})p^Z(\bm{z})\mathrm{d}\bm{z},\quad K_{z}(\bm{x}) \defeq (2\pi \beta^2)^{-\frac{n}{2}}\exp\bc{-\frac{\norm*{\bm{x} - \alpha  B\bm{z}}^2}{2\beta^2}},
      \end{equation*}
      and so
      \begin{align*}
            \nabla_{\bm{x}}K_{z}(\bm{x}) &= \frac{\alpha B\bm{z}-\bm{x}}{\beta^2}K_z(\bm{x}),\\
            \nabla^2_{\bm{x}}K_{z}(\bm{x}) &= -\frac{1}{\beta^2}K_z(\bm{x})\bm{I}_n + \frac{(\bm{x} - \alpha  B\bm{z})(\bm{x} - \alpha  B\bm{z})^\top}{\beta^4}K_z(\bm{x}).
      \end{align*}
      Let 
      \begin{equation*}
            \mathrm{d}\mu_x(\bm{z}) = \frac{K_z(\bm{x})p^Z(\bm{z})}{p_X(\bm{x})}\mathrm{d}\bm{z}
      \end{equation*}
      be the posterior probability measure on $\R^k$. Then, for the first term
      \begin{equation*}
            \frac{\nabla^2_{\bm{x}}p_X(\bm{x})}{p_X(\bm{x})} = \frac{\int_{\R^k}\nabla^2_{\bm{x}}K_{z}(\bm{x})p^Z(\bm{z})\mathrm{d}\bm{z}}{p_X(\bm{x})} = -\frac{1}{\beta^2}\bm{I}_n + \frac{1}{\beta^4}\E_{\bm{Z}\sim\mu_x}\bj{(\bm{x} - \alpha B\bm{Z})(\bm{x} - \alpha B\bm{Z})^\top},
      \end{equation*}
      and for the second term
      \begin{equation*}
            \frac{\nabla_{\bm{x}}p_X(\bm{x})\nabla_{\bm{x}}p_X(\bm{x})^\top}{p_X(\bm{x})^2} = \frac{1}{\beta^4}\E_{\bm{Z}\sim\mu_x}\bj{\bm{x} - \alpha B\bm{Z}}\E_{\bm{Z}\sim\mu_x}\bj{\bm{x} - \alpha B\bm{Z}}^\top.
      \end{equation*}
      Moreover, note that
      \begin{align*}
            &\quad \E_{\bm{Z}\sim\mu_x}\bj{(\bm{x} - \alpha B\bm{Z})(\bm{x} - \alpha B\bm{Z})^\top} - \E_{\bm{Z}\sim\mu_x}\bj{\bm{x} - \alpha B\bm{Z}}\E_{\bm{Z}\sim\mu_x}\bj{\bm{x} - \alpha B\bm{Z}}^\top \\
            &= \op{Cov}_{\bm{Z}\sim\mu_x}(\bm{x} - \alpha B\bm{Z}) = \alpha^2 \op{Cov}_{\mu_x}( B\bm{Z}) = \alpha^2B\op{Cov}_{\mu_x}(\bm{Z})B^\top.
      \end{align*}
      Therefore, we get
      \begin{equation}\label{eq:logp}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) = \frac{\alpha^2}{\beta^4}B\op{Cov}_{\mu_x}(\bm{Z})B^\top - \frac{1}{\beta^2}\bm{I}_n.
      \end{equation} 
      It follows that
      \begin{equation}\label{eq:logpop}
            \norm*{\nabla^2_{\bm{x}}\log p_X(\bm{x})}_{\op{op}} \leq \frac{1}{\beta^2} + \frac{\alpha^2\Lambda}{\beta^4}\norm*{\op{Cov}_{\mu_x}(\bm{Z})}_{\op{op}}.
      \end{equation}
      It is sufficient to bound $\norm*{\op{Cov}_{\mu_x}(\bm{Z})}_{\op{op}}$. To do that, $\mu_x$ is required to satisfy the Poincar\'e Inequality. Let $p^Z(\bm{z}) = \exp(-V(\bm{z}))$ for some $V \colon \R^k \sto \R$ and 
      \begin{equation*}
            U_x(\bm{z}) \defeq \frac{\norm*{\bm{x}-\alpha B\bm{z}}^2}{2\beta^2} + V(\bm{z}),
      \end{equation*}
      which indicates that $\mathrm{d}\mu_x(\bm{z}) = e^{-U_x(\bm{z})}\mathrm{d}\bm{z} / \int e^{-U_x}$. Because $\nabla^2_{\bm{z}} V(\bm{z}) = - \nabla^2_{\bm{z}} \log p^Z(\bm{z}) \succeq m_0 \bm{I}_k$,
      \begin{equation*}
            \nabla^2_{\bm{z}}U_x(\bm{z}) = \frac{\alpha^2}{\beta^2}B^\top B + \nabla^2_{\bm{z}}V(\bm{z}) \succeq \bc{\frac{\alpha^2 \lambda}{\beta^2} + m_0}\bm{I}_k.
      \end{equation*}
      Then, by Lemma \ref{lem:poincare}, $\mu_x$ satisfies the Poincar\'e Inequality with constant $m \defeq {\alpha^2 \lambda}/{\beta^2} + m_0$. Thus, for any $C^1$ function $f \colon \R^k \sto \R$,
      \begin{equation*}
            \op{Var}_{\mu_x}(f) \leq \frac{1}{m}\E_{\mu_x}\bj{\norm*{\nabla f}^2}.
      \end{equation*}
      For any $\bm{u} \in \R^k$, let $f_u(\bm{z}) = \inn{\bm{u},\bm{z}}$ with $\nabla_{\bm{z}} f_u(\bm{z}) = \bm{u}$. The above inequality implies that 
      \begin{equation*}
            \bm{u}^\top \op{Cov}_{\mu_x}(\bm{Z})\bm{u} = \op{Var}_{\mu_x}(f_u) \leq \frac{1}{m}\E_{\mu_x}\bj{\norm*{\nabla_{\bm{z}} f_u}^2} \leq \frac{1}{m}\norm*{\bm{u}}^2,
      \end{equation*}
      for any $\bm{u} \in \R^k$. Therefore,
      \begin{equation}\label{eq:covmuop}
            \norm*{\op{Cov}_{\mu_x}(\bm{Z})}_{\op{op}} \leq \frac{1}{m}.
      \end{equation}
      Finally, by plugging inequality (\ref{eq:covmuop}) into Equation (\ref{eq:logpop}), we get the result
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_X(\bm{x})}_{\op{op}} \leq \frac{1}{\beta^2} + \frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda+m_0\beta^2)}. \qedhere
      \end{equation*}
\end{proof}

\begin{proof}[Proof of Corollary \ref{cor:upboundoflog}]
      By Equation (\ref{eq:covmuop}),
      \begin{equation*}
            \norm*{B\op{Cov}_{\mu_x}(\bm{Z})B^\top}_{\op{op}} \leq \frac{\Lambda}{m}~\Rightarrow~B\op{Cov}_{\mu_x}(\bm{Z})B^\top \preceq \frac{\Lambda}{m}\bm{I}_n.
      \end{equation*}
      By combining this with Equation (\ref{eq:logp}), we have
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) \preceq \bc{\frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda+m_0\beta^2)} - \frac{1}{\beta^2}}\bm{I}_n. \qedhere
      \end{equation*}
\end{proof}

\begin{proof}[Proof of Proposition \ref{prop:linearcondsupp}]
      By Lemma \ref{lem:pushcondprob},
      \begin{equation*}
            \Pb^\sigma_X = A_{\#}\Pb^Z_\sigma = w_1 A_{\#}\Pb^Z_{1,\sigma} + w_2 A_{\#}\Pb^Z_{2,\sigma}.
      \end{equation*}
      Moreover, because $\bm{Z}_{1,\sigma} = (\bm{Z}_1,0)^\top + \sigma \bm{\zeta} \sim \Pb^Z_{1,\sigma}$ with $\bm{\zeta} \sim \mathcal{N}(\bm{0},\bm{I}_d)$, 
      \begin{equation*}
            A\bm{Z}_{1,\sigma} = A_1\bm{Z}_1 + \sigma A\bm{\zeta} \sim \Pb^\sigma_{X\mid Y}(\cdot \mid Y = 1).
      \end{equation*}
      Note that $A_1\bm{Z}_1 \sim \Pb_{X \mid Y}(\cdot \mid Y=1)$. Therefore,
      \begin{equation*}
            \mathcal{W}_1(\Pb^\sigma_{X\mid Y}(\cdot \mid Y=1),\Pb_{X\mid Y}(\cdot \mid Y=1)) \leq \E\bj{\norm*{A\bm{Z}_{1,\sigma} - A_1 \bm{Z}_1}} = \sigma\E\bj{\norm*{A\bm{\zeta}}} \leq \sigma \sqrt{d},
      \end{equation*}
      where the final inequality holds because $\norm*{A\bm{\zeta}} = \norm*{\bm{\zeta}}$ (as $A^\top A = \bm{I}_d$) with $\bm{\zeta} \sim \mathcal{N}(\bm{0},\bm{I}_d)$, together with Lemma \ref{lem:l2boundl1}. Similarly, we obtain
      \begin{equation*}
            \mathcal{W}_1(\Pb^\sigma_{X\mid Y}(\cdot \mid Y=2),\Pb_{X\mid Y}(\cdot \mid Y=2)) \leq \sigma \sqrt{d}.
      \end{equation*} 
      Combining these two inequalities with Lemma \ref{lem:convexofwasser}, we have
      \begin{align*}
            \mathcal{W}_1(\Pb_X^\sigma,\Pb_X) &\leq w_1\mathcal{W}_1(\Pb^\sigma_{X\mid Y}(\cdot \mid Y=1),\Pb_{X\mid Y}(\cdot \mid Y=1)) \\
            &\quad + w_2\mathcal{W}_1(\Pb^\sigma_{X\mid Y}(\cdot \mid Y=2),\Pb_{X\mid Y}(\cdot \mid Y=2))\\
            &\leq \sigma\sqrt{d}. \qedhere
      \end{align*}
\end{proof}

\begin{proof}[Proof of Lemma \ref{lem:logconcofmix}]
      Let
      \begin{equation*}
            r_1(\bm{x}) \defeq \frac{w p_1(\bm{x})}{p(\bm{x})},\quad r_2(\bm{x}) \defeq 1 - r_1(\bm{x}) = \frac{(1-w) p_2(\bm{x})}{p(\bm{x})}.
      \end{equation*}
      We have
      \begin{equation*}
            \nabla \log p = \frac{w \nabla p_1 + (1-w)\nabla p_2}{p} = r_1\nabla \log p_1 + r_2 \nabla \log p_2,
      \end{equation*}
      and
      \begin{equation*}
            \nabla^2 \log p = r_1 \nabla^2 \log p_1 + r_2 \nabla^2 \log p_2 + \nabla r_1 (\nabla \log p_1 - \nabla \log p_2)^\top.
      \end{equation*}
      For $r_1 = w p_1 / p$,
      \begin{align*}
            \nabla r_1 &= w \frac{p\nabla p_1 - p_1 \nabla p}{p^2} \\
            &= w\frac{(w p_1 +(1 - w)p_2)\nabla p_1 - p_1(w \nabla p_1 + (1-w)\nabla p_2)}{p^2} \\
            &= \frac{w(1-w)}{p^2}\bc{p_2\nabla p_1 - p_1 \nabla p_2}\\
            &= r_1r_2\bc{\nabla \log p_1 - \nabla \log p_2}.
      \end{align*}
      Therefore,
      \begin{equation*}
            \nabla^2 \log p = r_1 \nabla^2 \log p_1 + r_2 \nabla^2 \log p_2 + r_1r_2\bc{\nabla \log p_1 - \nabla \log p_2}\bc{\nabla \log p_1 - \nabla \log p_2}^\top.
      \end{equation*}
      For the first two terms, by the assumption,
      \begin{equation*}
            r_1 \nabla^2 \log p_1 + r_2 \nabla^2 \log p_2 \preceq r_1L_1\bm{I}_n + r_2L_2\bm{I}_n \preceq \max\bb{L_1,L_2}\bm{I}_n.
      \end{equation*}
      For the third term, because $\sup_{\bm{x}}\norm*{\nabla \log p_1(\bm{x}) - \nabla \log p_2(\bm{x})} \leq M$,
      \begin{equation*}
            \norm*{\bc{\nabla \log p_1 - \nabla \log p_2}\bc{\nabla \log p_1 - \nabla \log p_2}^\top}_{\op{op}} \leq M^2,
      \end{equation*}
      which implies that
      \begin{equation*}
            \bc{\nabla \log p_1 - \nabla \log p_2}\bc{\nabla \log p_1 - \nabla \log p_2}^\top \preceq M^2\bm{I}_n.
      \end{equation*}
      For the coefficient $r_1r_2$, because $r_1,r_2 \in (0,1)$ and $r_1 + r_2 = 1$, we have $r_1r_2 \leq 1/4$. Combining these results, we have
      \begin{equation*}
            \nabla^2 \log p \preceq \bc{\max\bb{L_1,L_2}+\frac{1}{4}M^2}\bm{I}_n. \qedhere
      \end{equation*}
\end{proof}

\begin{proof}[Proof of Corollary \ref{cor:infofm}]
      Because
      \begin{equation*}
            m_0^z = m_0^z(\sigma) = \frac{m}{1+m\sigma^2} - \frac{M^2}{4}
      \end{equation*}
      is decreasing in $\sigma$, 
      \begin{equation*}
            m_0^z \leq m_0^z(0) = m - \frac{M^2}{4}.
      \end{equation*}
      By Assumption \ref{assum:bounddifflog}, we have
      \begin{equation*}
            m - \frac{M^2}{4} > 1.
      \end{equation*}
      Therefore, by choosing a small $\sigma$, we can also have $m_0^z > 1$. It follows that
      \begin{equation*}
            m_t = \frac{m_0^z}{m_0^z + (1-m_0^z)e^{-2t}}
      \end{equation*}
      is decreasing in $t$. So
      \begin{equation*}
            m_I \defeq \inf_{t \in (0,T]} m_t = m_T = \frac{m_0^z}{m_0^z + (1-m_0^z)e^{-2T}} > 1. \qedhere
      \end{equation*}
\end{proof}

\subsection{Proof of Theorem \ref{thm:estimmfd}}\label{appen:proof_of_theorem_ref_thm_estimmfd}

\begin{proof}[Proof of Theorem \ref{thm:estimmfd}]
      By differentiating $\norm*{\tilde{\bm{Y}}_{t}}^2$ from (\ref{eq:orthodynam}),
      \begin{align*}
            \frac{1}{2}\frac{\mathrm{d}}{\mathrm{d}t}\norm*{\tilde{\bm{Y}}_t}^2 &= \inn{\tilde{\bm{Y}}_t, \frac{\mathrm{d}}{\mathrm{d}t}\tilde{\bm{Y}}_t} \\
            &= \inn{\tilde{\bm{Y}}_t, \tilde{\bm{Y}}_t + P_1\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t) - \eta \tilde{\bm{Y}}_t} \\
            &= (1-\eta) \norm*{\tilde{\bm{Y}}_t}^2 + \inn{\tilde{\bm{Y}}_t,P_1\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}\\
            &\leq (1-\eta) \norm*{\tilde{\bm{Y}}_t}^2 + \norm*{\tilde{\bm{Y}}_t}\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}.
      \end{align*}
      Therefore,
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}\norm*{\tilde{\bm{Y}}_t} \leq (1-\eta) \norm*{\tilde{\bm{Y}}_t} +\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}.
      \end{equation*}
      Taking the expectation on the both sides yields
      \begin{equation}\label{eq:ineqofmomen}
            \frac{\mathrm{d}}{\mathrm{d}t}\mathfrak{m}_t \leq (1-\eta)\mathfrak{m}_t+ \E\bj{\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}},
      \end{equation}
      where $\mathfrak{m}_t \defeq \E\bj{\norm*{\tilde{\bm{Y}}_t}}$. Therefore, the next step is to bound $\E\bj{\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}}$.

      Let 
      \begin{equation}\label{eq:bound_at_fix_pt}
            L_S \defeq \sup_{t \in [\delta,T]} L_t,\quad C \defeq \sup_{t \in [\delta,T]} \norm*{\nabla_{\bm{x}}\log p_t^\sigma(\bm{0})} < \infty,
      \end{equation}
      where $L_t$ is defined in Theorem \ref{thm:smoothofscore}. By the $L_S$-Lipschitz continuity of $\nabla_{\bm{x}}\log p^\sigma_t$ (Theorem \ref{thm:smoothofscore}),
      \begin{equation}\label{eq:ineqfornormlog}
            \begin{aligned}
                  \norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)} &\leq \norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)-\nabla_{\bm{x}}\log p^\sigma_{T-t}(\bm{0})} + \norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\bm{0})} \\
                  &\leq L_S\norm*{\tilde{\bm{X}}_t}+ C.
            \end{aligned}
      \end{equation}
      For $\tilde{\bm{X}}_t$ in Equation (\ref{eq:geometric_guidance}), we have
      \begin{align*}
            \frac{\mathrm{d}}{\mathrm{d}t}\norm*{\tilde{\bm{X}}_t}^2 &=2\norm*{\tilde{\bm{X}}_t}^2 + 2\inn{\tilde{\bm{X}}_t,\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}-2\eta \inn{\tilde{\bm{X}}_t,P_1\tilde{\bm{X}}_t} \\
            &\leq 2\norm*{\tilde{\bm{X}}_t}^2 + 2\inn{\tilde{\bm{X}}_t,\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}\\
            &\leq 2\norm*{\tilde{\bm{X}}_t}^2+ 2\norm*{\tilde{\bm{X}}_t}\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)},
      \end{align*}
      where the first inequality holds because $\eta \geq 0$ and $\inn{\tilde{\bm{X}}_t,P_1\tilde{\bm{X}}_t} \geq 0$, and the second follows from the Cauchy--Schwarz inequality. Combining this with (\ref{eq:ineqfornormlog}),
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}\norm*{\tilde{\bm{X}}_t} \leq (1+L_S)\norm*{\tilde{\bm{X}}_t} + C.
      \end{equation*}
      By taking the expectation on the both sides of above inequality, Gr\"onwall's Inequality (Lemma \ref{lem:gronwall}) implies
      \begin{equation}\label{eq:boud_solution_of_dynamic}
            \E\bj{\norm*{\tilde{\bm{X}}_t}} \leq \E\bj{\norm*{\tilde{\bm{X}}_0}}e^{(1+L_S)t}+\frac{C}{1+L_S}\bc{e^{(1+L_S)t} - 1}.
      \end{equation}
      Because $\tilde{\bm{X}}_0 \sim \mathcal{N}(\bm{0},\bm{I}_D)$, $\E\bj{\norm*{\tilde{\bm{X}}_0}} \leq \sqrt{D}$ (Lemma \ref{lem:l2boundl1}). It follows that
      \begin{equation*}
            \sup_{t \in [0,T-\delta]}\E\bj{\|\tilde{\bm{X}}_t\|} \leq \sqrt{D}e^{(1+L_S)T}+\frac{C}{1+L_S}\bc{e^{(1+L_S)T} - 1} \eqdef M_1,
      \end{equation*}
      and (\ref{eq:ineqfornormlog}) implies
      \begin{equation}\label{eq:def_M_2}
            \begin{aligned}
                  \sup_{t \in [0,T-\delta]}\E\bj{\norm*{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\tilde{\bm{X}}_t)}} &\leq \sup_{t \in [0,T-\delta]}L_S\E\bj{\norm*{\tilde{\bm{X}}_t}}+ C \\
                  &\leq L_SM_1+C \eqdef M_2.
            \end{aligned}
      \end{equation}
      Then by substituting this into (\ref{eq:ineqofmomen}),
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}\mathfrak{m}_t \leq -(\eta-1)\mathfrak{m}_t + M_2.
      \end{equation*}
      Because $\mathfrak{m}_0=\E\bj{\norm*{\tilde{\bm{Y}}_0}} \leq \sqrt{D-d_1}$ by Lemma \ref{lem:l2boundl1}, by applying Gr\"onwall's Inequality again, we obtain
      \begin{equation}\label{eq:momentboundofy}
            \E\bj{\norm*{\tilde{\bm{Y}}_t}} = \mathfrak{m}_t \leq \sqrt{D-d_1}e^{-(\eta-1)t}+\frac{M_2}{\eta-1}\bc{1-e^{-(\eta-1)t}} \eqdef M_\eta(t),
      \end{equation}
      which implies that
      \begin{equation*}
            \E\bj{\norm*{\tilde{\bm{Y}}_{T-\delta}}} \leq \sqrt{D-d_1}e^{-(\eta-1)(T-\delta)}+\frac{M_2}{\eta-1}.
      \end{equation*}
      For any $\varepsilon > 0$, 
      \begin{equation*}
            \frac{M_2}{\eta-1} \leq \frac{\varepsilon}{2} ~\Rightarrow~\eta \geq \frac{2M_2}{\varepsilon} +1,
      \end{equation*}
      and
      \begin{equation*}
            \sqrt{D-d_1}e^{-(\eta-1)(T-\delta)} \leq \frac{\varepsilon}{2} ~\Rightarrow~ \eta \geq \frac{1}{T-\delta}\log \frac{2\sqrt{D-d_1}}{\varepsilon} + 1.
      \end{equation*}
      Therefore, for any $\varepsilon > 0$, by choosing
      \begin{equation*}
            \eta \geq \max\bb{\frac{2M_2}{\varepsilon},\frac{1}{T-\delta}\log \frac{2\sqrt{D-d_1}}{\varepsilon} } + 1,
      \end{equation*}
      we have 
      \begin{equation*}
            \E\bj{\norm*{\tilde{\bm{Y}}_{T-\delta}}} \leq \varepsilon. \qedhere
      \end{equation*}
\end{proof}

\subsection{Theoretical Analysis for Universal Guidance}\label{appen:theoretical_analysis_for_universal_guidance}

Consider the universal guidance model
\begin{equation}\label{eq:univer_guidance}
      \frac{\mathrm{d}\bm{X}_t^\leftarrow}{\mathrm{d}t} = \bm{X}_t^\leftarrow + \nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow) - \eta \nabla_{\bm{x}} f(\bm{X}_t^\leftarrow),~\bm{X}_0^\leftarrow \sim \mathcal{N}(\bm{0},\bm{I}_D),
\end{equation}
for $t \in [0,T]$, where $p_t$ is the density function in DDPMs. 

\begin{thm}\label{thm:conv_univer_guidance}
      For the dynamics (\ref{eq:univer_guidance}), assume that $\log p_t$ is $L$-smooth, $f$ is $\rho$-strongly convex, and $\E\bj{f(\bm{X}_0^\leftarrow)} < \infty$. Then
      \begin{equation*}
            \E\bj{f(\bm{X}_T^\leftarrow)} - f(\bm{x}_*) = \mathcal{O}\bc{e^{-\eta} + \frac{1}{\eta}},
      \end{equation*}
      where $\bm{x}_*$ is the unique minimizer of $f$.
\end{thm}
\begin{proof}
      By differentiating $f(\bm{X}_t^\leftarrow)$,
      \begin{align*}
            \frac{\mathrm{d}}{\mathrm{d}t}f(\bm{X}_t^\leftarrow) &= \inn{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow),\frac{\mathrm{d}}{\mathrm{d}t}\bm{X}_t^\leftarrow} \\
            &= \inn{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow),\bm{X}_t^\leftarrow + \nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow) - \eta \nabla_{\bm{x}} f(\bm{X}_t^\leftarrow)} \\
            &\leq \norm*{\bm{X}_t^\leftarrow}\norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)} +\norm*{\nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow)}\norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)} - \eta \norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)}^2.
      \end{align*}
      Let $ C = \sup_{t \in [\delta,T]} \norm*{\nabla_{\bm{x}}\log p_t(\bm{0})} < \infty$. Then, the $L$-smoothness of $\log p_t$ implies that
      \begin{equation*}
            \norm*{\nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow)} \leq L\norm*{\bm{X}_t^\leftarrow} + C.
      \end{equation*}
      Therefore, by $ab \leq (a^2+b^2)/2$, we have
      \begin{align*}
            \frac{\mathrm{d}}{\mathrm{d}t}f(\bm{X}_t^\leftarrow) &\leq (1+L)\norm*{\bm{X}_t^\leftarrow}\norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)} + C\norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)} - \eta \norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)}^2 \\
            &\leq \frac{1+L}{2}\bc{\norm*{\bm{X}_t^\leftarrow}^2 + \norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)}^2} + \frac{1}{2}\bc{C^2 + \norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)}^2} - \eta \norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)}^2 \\
            &\leq -\frac{1}{2}(\eta-2-L)\norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)}^2 + \frac{1+L}{2}\norm*{\bm{X}_t^\leftarrow}^2 + \frac{C^2}{2}.
      \end{align*}
      Because $f$ is $\rho$-strongly convex, by Lemma \ref{lem:convex_pl}, it satisfies the $\rho$-PL inequality,
      \begin{equation*}
            \norm*{\nabla_{\bm{x}}f(\bm{X}_t^\leftarrow)}^2 \geq 2\rho\bc{f(\bm{X}_t^\leftarrow) - f(\bm{x}_*)}.
      \end{equation*}
      For $\eta > L+2$, we obtain
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}f(\bm{X}_t^\leftarrow) \leq -\rho(\eta-2-L)\bc{f(\bm{X}_t^\leftarrow) - f(\bm{x}_*)} + \frac{1+L}{2}\norm*{\bm{X}_t^\leftarrow}^2 + \frac{C^2}{2}.
      \end{equation*}
      Taking the expectation on both sides yields
      \begin{equation}\label{eq:ineq_expec_f}
            \frac{\mathrm{d}}{\mathrm{d}t}\E\bj{f(\bm{X}_t^\leftarrow)} \leq -\rho(\eta-2-L)\bc{\E\bj{f(\bm{X}_t^\leftarrow)} - f(\bm{x}_*)} + \frac{1+L}{2}\E\bj{\norm*{\bm{X}_t^\leftarrow}^2} + \frac{C^2}{2}.
      \end{equation}
      The next step is to bound $\E\bj{\norm*{\bm{X}_t^\leftarrow}^2}$. Let $\bm{R}_t \defeq \bm{X}_t^\leftarrow - \bm{x}_*$. Then
      \begin{equation}\label{eq:ineq_r_t}
            \begin{aligned}
                  \frac{1}{2}\frac{\mathrm{d}}{\mathrm{d}t}\norm*{\bm{R}_t}^2 &= \inn{\bm{R}_t,\bm{X}_t^\leftarrow + \nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow) - \eta \nabla_{\bm{x}} f(\bm{X}_t^\leftarrow)} \\
                  &= \inn{\bm{R}_t,\bm{X}_t^\leftarrow} + \inn{\bm{R}_t,\nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow)} - \eta \inn{\bm{R}_t,\nabla_{\bm{x}} f(\bm{X}_t^\leftarrow)}.
            \end{aligned}
      \end{equation}
      To obtain the desired inequality, we consider these three terms respectively. For the first term,
      \begin{equation}\label{eq:ineq_a_r_t}
            \inn{\bm{R}_t,\bm{X}_t^\leftarrow} = \norm*{\bm{R}_t}^2 + \inn{\bm{R}_t,\bm{x}_*} \leq  \norm*{\bm{R}_t}^2 + \norm*{\bm{x}_*}\norm*{\bm{R}_t}.
      \end{equation}
      Let $c = \norm*{\nabla_{\bm{x}} \log p_{T-t}(\bm{x}_*)}$. By the $L$-smoothness of $\log p_t$, we have
      \begin{align*}
            \norm*{\nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow)} &\leq \norm*{\nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow) - \nabla_{\bm{x}} \log p_{T-t}(\bm{x}_*)} + \norm*{\nabla_{\bm{x}} \log p_{T-t}(\bm{x}_*)} \\
            & \leq L\norm*{\bm{R}_t} + c.
      \end{align*}
      Therefore, for the second term,
      \begin{equation}\label{eq:ineq_b_r_t}
            \begin{aligned}
                  \inn{\bm{R}_t,\nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow)} &\leq \norm*{\bm{R}_t} \norm*{\nabla_{\bm{x}} \log p_{T-t}(\bm{X}_t^\leftarrow)} \\
                  &\leq L\norm*{\bm{R}_t}^2 + c\norm*{\bm{R}_t}.
            \end{aligned}
      \end{equation}
      For the third term, because $f$ is $\rho$-strongly convex, $\nabla_{\bm{x}} f(\bm{x}_*) = 0$ and
      \begin{equation}\label{eq:ineq_c_r_t}
            \inn{\bm{R}_t,\nabla_{\bm{x}} f(\bm{X}_t^\leftarrow)} = \inn{\bm{R}_t,\nabla_{\bm{x}} f(\bm{X}_t^\leftarrow)-\nabla_{\bm{x}} f(\bm{x}_*)} \geq \rho \norm*{\bm{R}_t}^2.
      \end{equation}
      Then, by combining (\ref{eq:ineq_r_t}) with (\ref{eq:ineq_a_r_t}) (\ref{eq:ineq_b_r_t}) (\ref{eq:ineq_c_r_t}), we have
      \begin{align*}
            \frac{\mathrm{d}}{\mathrm{d}t}\norm*{\bm{R}_t}^2 &\leq 2(L+1 - \eta\rho)\norm*{\bm{R}_t}^2 + 2\tilde{c}\norm*{\bm{R}_t} \\
            &\leq (2L+3 - 2\eta\rho)\norm*{\bm{R}_t}^2 + \tilde{c}^2,
      \end{align*}
      where $\tilde{c} = \norm*{\bm{x}_*} + c$. By taking the expectation on both sides, Gr\"onwall's Inequality (Lemma \ref{lem:gronwall}) implies that 
      \begin{equation*}
            \E\bj{\norm*{\bm{R}_t}^2} \leq \E\bj{\norm*{\bm{R}_0}^2}e^{-(2\eta\rho-2L-3)t}+\frac{\tilde{c}^2}{2\eta\rho-2L-3}\bc{1-e^{-(2\eta\rho-2L-3)t}}.
      \end{equation*}
      By taking a sufficiently large $\eta$ such that $2\eta\rho-2L-3 > \tilde{c}^2 > 0$, we have
      \begin{equation*}
            \E\bj{\norm*{\bm{R}_t}^2} \leq \E\bj{\norm*{\bm{R}_0}^2} + 1.
      \end{equation*}
      Note that $\bm{X}^\leftarrow_0 \sim \mathcal{N}(\bm{0},\bm{I}_D)$, which implies that $\E\bj{\norm*{\bm{X}^\leftarrow_0}^2} = D$. Therefore, 
      \begin{equation*}
            \E\bj{\norm*{\bm{R}_0}^2} \leq \E\bj{\norm*{\bm{X}^\leftarrow_0}^2} + \norm*{\bm{x}_*}^2 \leq D + \norm*{\bm{x}_*}^2,
      \end{equation*}
      and
      \begin{equation*}
            \E\bj{\norm*{\bm{X}_t^\leftarrow}^2} \leq 2\E\bj{\norm*{\bm{R}_t}^2} + 2\norm*{\bm{x}_*}^2 \leq 2D + 4\norm*{\bm{x}_*}^2 + 2 \eqdef M_3.
      \end{equation*}
      By substituting $M_3$ into (\ref{eq:ineq_expec_f}), we obtain
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}\E\bj{f(\bm{X}_t^\leftarrow)} \leq -\rho(\eta-2-L)\bc{\E\bj{f(\bm{X}_t^\leftarrow)} - f(\bm{x}_*)} + M_4
      \end{equation*}
      for $M_4 \defeq ((1+L)M_3+C^2)/2$. Then, by Gr\"onwall's Inequality,
      \begin{equation*}
            \E\bj{f(\bm{X}_T^\leftarrow)} - f(\bm{x}_*) \leq  \bc{\E\bj{f(\bm{X}_0^\leftarrow)} - f(\bm{x}_*)}e^{-\rho(\eta-2-L)T} + \frac{M_4}{\rho(\eta-2-L)},
      \end{equation*}
      which means that
      \begin{equation*}
            \E\bj{f(\bm{X}_T^\leftarrow)} - f(\bm{x}_*) = \mathcal{O}\bc{e^{-\eta} + \frac{1}{\eta}}. \qedhere
      \end{equation*}
\end{proof}

\subsection{Proof of Theorem \ref{thm:wassbound}}\label{appen:proof_of_theorem_ref_thm_wassbound}

\begin{proof}[Proof of Theorem \ref{thm:wassbound}]
      The proof consists of two main steps:
      \begin{enumerate}[label=(\roman*)]
            \item Let $Q_1 = A_1A_1^\top$. For any coupling $(\tilde{\bm{X}},\bm{X}) \sim \bc{\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)}$, we have
            \begin{align*}
                  \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} &\leq \E\bj{\norm{\tilde{\bm{X}}-\bm{X}}} \\
                  &\leq \E\bj{\norm{Q_1\tilde{\bm{X}}-Q_1\bm{X}}}+\E\bj{\norm{P_1\tilde{\bm{X}}-P_1\bm{X}}}\\
                  &= \E\bj{\norm{Q_1\tilde{\bm{X}}-\bm{X}}}+\E\bj{\norm{\tilde{\bm{Y}}_{T-\delta}}},
            \end{align*}
            where the final equality holds because $\bm{X} \sim \Pb_{X \mid Y}(\cdot \mid Y=1)$ implies that $Q_1\bm{X} = \bm{X}$, and $\tilde{\bm{X}} \sim \tilde{p}_{T-\delta}$ implies that $P_1\tilde{\bm{X}} = \tilde{\bm{Y}}_{T-\delta}$. And by (\ref{eq:momentboundofy}),
            \begin{equation*}
                  \E\bj{\norm*{\tilde{\bm{Y}}_{T-\delta}}} \leq M_{\eta}(T-\delta) = \mathcal{O}(e^{-T}+\eta^{-1}).
            \end{equation*}
            Let $(Q_1\tilde{\bm{X}},\bm{X})$ be chosen as the optimal coupling for $\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)}$, i.e.,
            \begin{equation*}
                  \mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} = \E\bj{\norm{Q_1\tilde{\bm{X}}-\bm{X}}}.
            \end{equation*}
            Therefore, we have
            \begin{equation}\label{eq:bound1}
                  \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} \leq \mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} + \mathcal{O}(e^{-T}+\eta^{-1}).
            \end{equation}

            \item For $\mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)}$, by the triangle inequality,
            \begin{equation}\label{eq:q_1_tilde_p_cond}
                  \begin{aligned}
                        \mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} &\leq \mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},(Q_1)_{\#}\Pb_X} \\
                        &\quad+ \mathcal{W}_1\bc{(Q_1)_{\#}\Pb_X,\Pb_{X \mid Y}(\cdot \mid Y=1)} \\
                        &\leq \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_X} + \mathcal{W}_1\bc{(Q_1)_{\#}\Pb_X,\Pb_{X \mid Y}(\cdot \mid Y=1)},
                  \end{aligned}
            \end{equation}
            where the final inequality is because $Q_1$ is an orthogonal projection (Lemma \ref{lem:pushwasserstein}). 

            By Lemma \ref{lem:wassofpxtopxgiveny}, the second term in the above inequality is bounded by
            \begin{equation}\label{eq:p_x_condition}
                  \mathcal{W}_1\bc{(Q_1)_{\#}\Pb_X,\Pb_{X \mid Y}(\cdot \mid Y=1)} \leq \tilde{C}_1
            \end{equation}
            for some constant $\tilde{C}_1$. For the first term, it can be divided into
            \begin{equation}\label{eq:divi_tilde_p_p_X}
                  \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_X} \leq \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\hat{p}^\sigma_\delta} + \mathcal{W}_1\bc{\hat{p}^\sigma_\delta, p^\sigma_\delta} + \mathcal{W}_1\bc{p^\sigma_\delta, \Pb^\sigma_X} + \mathcal{W}_1\bc{\Pb^\sigma_X,\Pb_X},
            \end{equation}
            where $\hat{p}^\sigma_t$ is defined in dynamics (\ref{eq:origfromguass}), $p^\sigma_t$ is the density evolving in the DDPM initialized from $\Pb^\sigma_X$; see (\ref{eq:def_p_sig_t}), and $\Pb^\sigma_X$ is defined in Proposition \ref{prop:linearcondsupp}. For the four terms in (\ref{eq:divi_tilde_p_p_X}):
            \begin{enumerate}[label=(\alph{*})]
                  \item By Proposition \ref{prop:w1_geo_unconddm} and $m_I > 1$ (Corollary \ref{cor:infofm}),
                  \begin{equation}\label{eq:tilde_p_hat_p}
                        \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\hat{p}^{\sigma}_{\delta}} \leq \mathcal{O}(e^{-T}+\eta^{-1}) + \tilde{C}_2
                  \end{equation}
                  for some constant $\tilde{C}_2$.

                  \item By Proposition \ref{prop:w2ofdmfromdiffini}, 
                  \begin{equation}\label{eq:hat_p_p_sig}
                        \mathcal{W}_1\bc{\hat{p}^\sigma_\delta, p^\sigma_\delta} \leq \mathcal{O}(e^{-T}).
                  \end{equation}

                  \item Note that
                  \begin{equation*}
                        \bm{X}^\sigma_\delta = \sqrt{\alpha_\delta}A\bm{Z} + \sqrt{1-\alpha_\delta}\bm{\xi} \sim p^\sigma_\delta,\quad \alpha_\delta = e^{-2\delta}
                  \end{equation*}
                  for $\bm{Z} \sim p^Z_\sigma$ and $\bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_D)$. Moreover, $A\bm{Z} \sim \Pb_X^\sigma$. Therefore,
                  \begin{align*}
                        \mathcal{W}_1\bc{p^{\sigma}_{\delta},\Pb^{\sigma}_X} &\leq \E\bj{\|\bm{X}^\sigma_\delta -A\bm{Z} \|}\\
                        &\leq \E\bj{\|\bm{X}^\sigma_\delta -\sqrt{\alpha_\delta}A\bm{Z} \|} + (1-\sqrt{\alpha_\delta})\E[\|A\bm{Z}\|] \\
                        &= \sqrt{1-\alpha_\delta}\E\bj{\norm{\bm{\xi}}} + (1-\sqrt{\alpha_\delta})\E_{\bm{Z} \sim p^Z_{\sigma}}\bj{\|\bm{Z}\|} \\
                        &\leq \sqrt{2\delta D} + \delta\mathfrak{m}^Z_\sigma,
                  \end{align*}
                  where $\mathfrak{m}^Z_\sigma = \E_{\bm{Z} \sim p^Z_{\sigma}}\bj{\|\bm{Z}\|} < \infty$ by Lemma \ref{lem:bound_moment_latent}. It follows that
                  \begin{equation}\label{p_sigdelta_p_sig}
                        \mathcal{W}_1\bc{p^{\sigma}_{\delta},\Pb^{\sigma}_X} \leq \mathcal{O}(\delta^{1/2}).
                  \end{equation}

                  \item By Proposition \ref{prop:linearcondsupp},
                  \begin{equation}\label{p_sig_p_x}
                         \mathcal{W}_1\bc{\Pb^\sigma_X,\Pb_X} \leq \mathcal{O}(\sigma).
                  \end{equation}
            \end{enumerate}
            Then, combining (\ref{eq:tilde_p_hat_p}) (\ref{eq:hat_p_p_sig}) (\ref{p_sigdelta_p_sig}) (\ref{p_sig_p_x}) with (\ref{eq:divi_tilde_p_p_X}), we have
            \begin{equation}\label{tilde_p_p_x}
                  \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_X} \leq \mathcal{O}(e^{-T} + \delta^{1/2} + \sigma + \eta^{-1}) + \tilde{C}_2.
            \end{equation}
            Combining (\ref{tilde_p_p_x}) (\ref{eq:p_x_condition}) with (\ref{eq:q_1_tilde_p_cond}), it follows
            \begin{equation*}
                  \mathcal{W}_1\bc{(Q_1)_{\#}\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} \leq \mathcal{O}(e^{-T} + \delta^{1/2} + \sigma + \eta^{-1}) + \tilde{C},
            \end{equation*}
            where $\tilde{C} = \tilde{C}_1+\tilde{C}_2$. Therefore, substituting this in (\ref{eq:bound1}), we obtain
            \begin{equation*}
                  \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\Pb_{X \mid Y}(\cdot \mid Y=1)} \leq \mathcal{O}(e^{-T} + \delta^{1/2} + \sigma + \eta^{-1}) + \tilde{C}. \qedhere
            \end{equation*}
      \end{enumerate}
\end{proof}
\begin{rmk}\label{rmk:discussion_of_tildeC}
      For the error floor $\tilde{C}$, we provide two further discussions.
      \begin{enumerate}[label=(\roman*)]
            \item First, it follows from the above proof that $\tilde{C} = \tilde{C}_1+\tilde{C}_2$, where
            \begin{itemize}
                  \item $\tilde{C}_1$ is determined by (\ref{eq:p_x_condition}) and Lemma \ref{lem:wassofpxtopxgiveny},
                  \begin{equation*}
                      \tilde{C}_1 = w_2\mathfrak{m}^Z_1,\quad \mathfrak{m}^Z_1 \defeq \E_{\bm{Z} \sim \Pb^Z_1}[\norm{\bm{Z}}],
                  \end{equation*}
                  which is independent of the parameters $T,\delta,\sigma$.
                  \item $\tilde{C}_2$ is given by (\ref{eq:tilde_p_hat_p}) and Proposition \ref{prop:w1_geo_unconddm},
                  \begin{equation*}
                      \tilde{C}_2 = \frac{M_2}{m_I - 1},
                  \end{equation*}
                  where $M_2$ is defined in (\ref{eq:def_M_2}) and depends on $L_S = \sup_{t \in [\delta,T]}L_t$ and $T$, while $m_I = \inf_{t \in [\delta,T]} m_t$. Since $L_t$ and $m_t$ are specified by Theorem \ref{thm:smoothofscore} through $p_t^\sigma$, $\tilde{C}_2$ depends implicitly on $T$, $\delta$, and $\sigma$.
            \end{itemize} 

            \item We believe the error floor is inherent to the geometric guidance model. Because of the analytical simplicity of the geometric guidance, it cannot provide as much information as the probabilistic guidance term does. More precisely, in Appendix \ref{appen:construction_of_geometric_guidance}, we show that
            \begin{equation*}
                  \norm{\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x}) + \eta_tP_1\bm{x}} \leq \beta_t,\quad \forall~\bm{x} \in \mathcal{M}_1^t,
            \end{equation*}
            for some scalar $\eta_t > 0$, and $\beta_t =\mathcal{O}(\varepsilon_t)$, when $p_t(y=1 \mid \bm{x}) > 1 -\varepsilon_t$ for all $\bm{x} \in \mathcal{M}_1^t$. This shows that the probabilistic guidance $\nabla_{\bm{x}} \log p_t(y = 1 \mid \bm{x})$ is ``almost parallel'' to the geometric guidance $P_1 \bm{x}$, but the norm of the probabilistic guidance carries additional information that the geometric term cannot capture. This is a trade-off made for the sake of analytical tractability. 
      \end{enumerate}
\end{rmk}


\begin{lem}\label{lem:bound_moment_latent}
      Let $\bm{Z}_i \sim p^Z_i$ for $i = 1,2$. If $\mathfrak{m}^Z_i = \E\bj{\norm*{\bm{Z}_i}} < \infty$, then for $p_\sigma^Z$ defined in (\ref{eq:multi_of_latent}) (\ref{eq:component_sigma_latent}), 
      \begin{equation*}
            \mathfrak{m}^Z_\sigma \defeq \E_{\bm{Z} \sim p_\sigma^Z}\bj{\norm*{\bm{Z}}} < \infty.
      \end{equation*}
\end{lem}
\begin{proof}
      By the definition of (\ref{eq:component_sigma_latent}),
      \begin{equation*}
            \E\bj{\norm*{\bm{Z}_{i,\sigma}}} \leq \E\bj{\norm*{\bm{Z}_i}} + \sigma \E\bj{\norm*{\bm{\zeta}_i}} \leq \mathfrak{m}^Z_i + \sigma \sqrt{d} < \infty
      \end{equation*}
      for $\bm{Z}_{i,\sigma} \sim p^Z_{i,\sigma}$, where the second inequality is by Lemma \ref{lem:l2boundl1}. Then, by (\ref{eq:multi_of_latent}),
      \begin{align*}
            \E_{\bm{Z} \sim p_\sigma^Z}\bj{\norm*{\bm{Z}}} &= \int_{\R^d} \norm*{\bm{z}}p^Z_{\sigma}(\bm{z})\mathrm{d}\bm{z} \\
            &= w_1\int_{\R^d} \norm*{\bm{z}}p^Z_{1,\sigma}(\bm{z})\mathrm{d}\bm{z} + w_2\int_{\R^d} \norm*{\bm{z}}p^Z_{2,\sigma}(\bm{z})\mathrm{d}\bm{z} \\
            &= w_1 \E\bj{\norm*{\bm{Z}_{1,\sigma}}} + w_2\E\bj{\norm*{\bm{Z}_{2,\sigma}}} < \infty. \qedhere
      \end{align*}
\end{proof}

\begin{lem}\label{lem:wassofpxtopxgiveny}
      For 
      \begin{equation*}
            \Pb_X = w_1\Pb_{X\mid Y}(\cdot \mid Y=1)+w_2\Pb_{X\mid Y}(\cdot \mid Y=2)
      \end{equation*}
      under Assumption \ref{assum:condlineardata},
      \begin{equation*}
            \mathcal{W}_1((Q_1)_{\#}\Pb_X,\Pb_{X\mid Y}(\cdot \mid Y=1)) \leq w_2\mathfrak{m}^Z_1,
      \end{equation*}
      where $Q_1 = A_1A_1^\top$ and $\mathfrak{m}^Z_1 = \E_{\bm{Z} \sim \Pb^Z_1}[\norm{\bm{Z}}]$.
\end{lem}
\begin{proof}
      First, by Lemma \ref{lem:pushcondprob},
      \begin{equation*}
            (Q_1)_{\#}\Pb_X = w_1(Q_1)_{\#}\Pb_{X\mid Y}(\cdot \mid Y=1)+w_2(Q_1)_{\#}\Pb_{X\mid Y}(\cdot \mid Y=2).
      \end{equation*}
      For the two terms, if $\bm{X} \sim \Pb_{X\mid Y}(\cdot \mid Y=1)$, then $Q_1\bm{X} = \bm{X}$, which implies that
      \begin{equation*}
            (Q_1)_{\#}\Pb_{X\mid Y}(\cdot \mid Y=1) = \Pb_{X\mid Y}(\cdot \mid Y=1).
      \end{equation*}
      On the other hand, $\bm{X} \sim \Pb_{X\mid Y}(\cdot \mid Y=2)$ implies that $Q_1\bm{X} = 0$ so that
      \begin{equation*}
            (Q_1)_{\#}\Pb_{X\mid Y}(\cdot \mid Y=2) = \delta_0,
      \end{equation*}
      the Dirac measure at $0$. Therefore, by Lemma \ref{lem:convexofwasser},
      \begin{equation*}
            \mathcal{W}_1((Q_1)_{\#}\Pb_X,\Pb_{X\mid Y}(\cdot \mid Y=1)) \leq w_2\mathcal{W}_1(\delta_0,\Pb_{X\mid Y}(\cdot \mid Y=1)).
      \end{equation*}
      For any coupling $(\bm{D},\bm{X}) \sim (\delta_0,\Pb_{X\mid Y}(\cdot \mid Y=1))$,
      \begin{align*}
            \mathcal{W}_1(\delta_0,\Pb_{X\mid Y}(\cdot \mid Y=1)) & \leq \E\bj{\norm{\bm{D}-\bm{X}}} \\
            &=\E\bj{\norm{\bm{X}}} = \E_{\bm{Z} \sim \Pb^Z_1}\bj{\norm{A_1\bm{Z}}} = \E_{\bm{Z} \sim \Pb^Z_1}\bj{\norm{\bm{Z}}},
      \end{align*}
      where the last two equalities are because $\Pb_{X\mid Y}(\cdot \mid Y=1) = (A_1)_{\#}\Pb^Z_1$ and $A_1 \in \mathcal{O}^{D \times d_1}$ by Assumption \ref{assum:condlineardata}.
\end{proof}

In the following, unless otherwise specified, we assume that Assumptions \ref{assum:condlineardata}, \ref{assum:logconcave}, \ref{assum:bounddifflog}, and \ref{assum:bound_moment_latent_comp} hold.

\begin{prop}\label{prop:w2ofdmfromdiffini}
      Let $p^\sigma_t$ be defined in (\ref{eq:def_p_sig_t}). Consider the following two dynamics:
      \begin{equation}\label{eq:origfromguass}
            \frac{d\hat{\bm{X}}_t}{\mathrm{d}t} = \hat{\bm{X}}_t + \nabla_{\bm{x}}\log p^{\sigma}_{T-t}(\hat{\bm{X}}_t),\quad \hat{\bm{X}}_0 \sim \mathcal{N}(0,\bm{I}_D)
      \end{equation}
      with the notation $\hat{\bm{X}}_t \sim \hat{p}^{\sigma}_{T-t}$, and
      \begin{equation*}
            \frac{d\bar{\bm{X}}_t}{\mathrm{d}t} = \bar{\bm{X}}_t + \nabla_{\bm{x}}\log p^{\sigma}_{T-t}(\bar{\bm{X}}_t),\quad \bar{\bm{X}}_0 \sim p^{\sigma}_T,
      \end{equation*}
      where note that $\bar{\bm{X}}_t \sim p^{\sigma}_{T-t}$. For $\delta > 0$, we have
      \begin{equation*}
            \mathcal{W}_1(\hat{p}^{\sigma}_\delta,p^{\sigma}_\delta) \leq e^{-m_I(T-\delta)}\bc{\mathfrak{m}^Z_\sigma + \sqrt{D}},
      \end{equation*}
      where $\mathfrak{m}^Z_\sigma = \E_{\bm{Z} \sim p^Z_{\sigma}}\bj{\norm*{\bm{Z}}}$ and $m_I = \inf_{t \in [\delta,T]}m_t$ is defined in Theorem \ref{thm:smoothofscore}.
\end{prop}
\begin{proof}
      First, by Theorem \ref{thm:smoothofscore}, $p^{\sigma}_{T-t}$ is $m_I$-strongly log-concave for $t \in [0,T-\delta]$, from which it follows that
      \begin{align*}
            \inn{\hat{\bm{X}}_t-\bar{\bm{X}}_t,\nabla_{\bm{x}}\log p^{\sigma}_{T-t}(\hat{\bm{X}}_t)-\nabla_{\bm{x}}\log p^{\sigma}_{T-t}(\bar{\bm{X}}_t)} &=\inn{\hat{\bm{X}}_t-\bar{\bm{X}}_t,\nabla_{\bm{x}}^2\log p^{\sigma}_{T-t}(\bm{x})\bc{\hat{\bm{X}}_t-\bar{\bm{X}}_t}} \\
            &\leq -m_I\norm{\hat{\bm{X}}_t-\bar{\bm{X}}_t}^2.
      \end{align*}
      Therefore, we have
      \begin{align*}
            \frac{\mathrm{d}}{\mathrm{d}t}\norm{\hat{\bm{X}}_t-\bar{\bm{X}}_t}^2 &= 2\inn{\hat{\bm{X}}_t-\bar{\bm{X}}_t,\frac{\mathrm{d}}{\mathrm{d}t}\bc{\hat{\bm{X}}_t-\bar{\bm{X}}_t}} \\
            &= 2\norm{\hat{\bm{X}}_t-\bar{\bm{X}}_t}^2+2\inn{\hat{\bm{X}}_t-\bar{\bm{X}}_t,\nabla_{\bm{x}}\log p^{\sigma}_{T-t}(\hat{\bm{X}}_t)-\nabla_{\bm{x}}\log p^{\sigma}_{T-t}(\bar{\bm{X}}_t)} \\
            &\leq -2(m_I-1)\norm{\hat{\bm{X}}_t-\bar{\bm{X}}_t}^2,
      \end{align*}
      which indicates that
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}\norm{\hat{\bm{X}}_t-\bar{\bm{X}}_t} \leq -(m_I-1)\norm{\hat{\bm{X}}_t-\bar{\bm{X}}_t}.
      \end{equation*}
      Then, by Gr\"onwall's Inequality (Lemma \ref{lem:gronwall}),
      \begin{equation*}
            \norm{\hat{\bm{X}}_{T-\delta}-\bar{\bm{X}}_{T-\delta}} \leq e^{-(m_I-1)(T-\delta)}\norm{\hat{\bm{X}}_0-\bar{\bm{X}}_0}.
      \end{equation*}
      Therefore, by the definition of Wasserstein distance,
      \begin{align*}
            \mathcal{W}_1(\hat{p}^{\sigma}_\delta,p^{\sigma}_\delta) &\leq \E\bj{\norm{\hat{\bm{X}}_{T-\delta}-\bar{\bm{X}}_{T-\delta}}} \\
            &\leq e^{-(m_I-1)(T-\delta)}\E\bj{\norm{\hat{\bm{X}}_0-\bar{\bm{X}}_0}}.
      \end{align*}
      By choosing $(\hat{\bm{X}}_0,\bar{\bm{X}}_0)$ as the optimal coupling, we obtain that
      \begin{equation}\label{eq:wassboundofini}
            \mathcal{W}_1(\hat{p}^{\sigma}_\delta,p^{\sigma}_\delta) \leq e^{-(m_I-1)(T-\delta)}\mathcal{W}_1(\mathcal{N}(0,\bm{I}_D),p^{\sigma}_T).
      \end{equation}
      For the right hand side of (\ref{eq:wassboundofini}), by the definition of $p^\sigma_t$ in Equation (\ref{eq:def_p_sig_t}), i.e.,
      \begin{equation*}
            \bm{X}^\sigma_t = \sqrt{\alpha_t}A\bm{Z} + \sqrt{1-\alpha_t}\bm{\xi} \sim p^\sigma_t
      \end{equation*}
      for $\bm{Z} \sim p^Z_\sigma$, $\bm{\xi} \sim \mathcal{N}(0,\bm{I}_D)$, and $\alpha_t = e^{-2t}$, we have
      \begin{equation*}
            \mathcal{W}_1(p^{\sigma}_t,\mathcal{N}(0,(1-\alpha_t)\bm{I}_D)) \leq \sqrt{\alpha_t}\E\bj{\norm*{A\bm{Z}}} = e^{-t}\E_{\bm{Z} \sim p^Z_{\sigma}}\bj{\norm*{\bm{Z}}}.
      \end{equation*}
      Moreover,
      \begin{equation*}
            \mathcal{W}_1(\mathcal{N}(0,(1-\alpha_T)\bm{I}_D),\mathcal{N}(0,\bm{I}_D)) \leq (1-\sqrt{1-\alpha_T})\E\bj{\norm{\bm{\xi}}} \leq e^{-T}\sqrt{D}.
      \end{equation*}
      Therefore,
      \begin{align*}
            \mathcal{W}_1(p^{\sigma}_T,\mathcal{N}(0,\bm{I}_D)) &\leq \mathcal{W}_1(p^{\sigma}_T,\mathcal{N}(0,(1-\alpha_T)\bm{I}_D))+\mathcal{W}_1(\mathcal{N}(0,(1-\alpha_T)\bm{I}_D),\mathcal{N}(0,\bm{I}_D)) \\
            &\leq e^{-T}\bc{\mathfrak{m}^Z_\sigma + \sqrt{D}}.
      \end{align*}
      Substituting this in the inequality (\ref{eq:wassboundofini}) implies that
      \begin{equation*}
            \mathcal{W}_1(\hat{p}^{\sigma}_\delta,p^{\sigma}_\delta) \leq e^{-m_I(T-\delta)-\delta}\bc{\mathfrak{m}^Z_\sigma + \sqrt{D}} \leq e^{-m_I(T-\delta)}\bc{\mathfrak{m}^Z_\sigma + \sqrt{D}}. \qedhere
      \end{equation*}
\end{proof}

\begin{prop}\label{prop:w1_geo_unconddm}
      Consider the geometric guidance model (\ref{eq:geometric_guidance}) and the dynamics (\ref{eq:origfromguass}). For the corresponding generated distributions $\tilde{p}_t$ and $\hat{p}^\sigma_t$, we have
      \begin{equation*}
            \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\hat{p}^{\sigma}_{\delta}} \leq \frac{\eta \sqrt{D-d_1}}{\eta-m_I} e^{-\left(m_I-1\right)(T-\delta)}+\frac{\eta M_2}{\left(m_I-1\right)(\eta-1)},
      \end{equation*}
      where $M_2$ is the constant defined in (\ref{eq:def_M_2}).
\end{prop}
\begin{proof}
      By the $m_I$-strong log-concavity of $p^\sigma_t$ (Theorem \ref{thm:smoothofscore}), we have
      \begin{align*}
            \frac{1}{2}\frac{\mathrm{d}}{\mathrm{d}t}\norm{\hat{\bm{X}}_t-\tilde{\bm{X}}_t}^2 &= \inn{\hat{\bm{X}}_t-\tilde{\bm{X}}_t,\frac{\mathrm{d}}{\mathrm{d}t}\bc{\hat{\bm{X}}_t-\tilde{\bm{X}}_t}} \\
            &= \inn{\hat{\bm{X}}_t-\tilde{\bm{X}}_t, \hat{\bm{X}}_t-\tilde{\bm{X}}_t+\nabla_{\bm{x}}\log p^{\sigma}_{T-t}(\hat{\bm{X}}_t)-\nabla_{\bm{x}}\log p^{\sigma}_{T-t}(\tilde{\bm{X}}_t)+\eta P_1\tilde{\bm{X}}_t} \\
            &\leq -(m_I-1)\norm{\hat{\bm{X}}_t-\tilde{\bm{X}}_t}^2+\eta\norm{\hat{\bm{X}}_t-\tilde{\bm{X}}_t}\norm*{P_1\tilde{\bm{X}}_t}.
      \end{align*}
      Note that $P_1\tilde{\bm{X}}_t = \tilde{\bm{Y}}_t$. It follows that
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}\norm{\hat{\bm{X}}_t-\tilde{\bm{X}}_t} \leq -(m_I-1)\norm{\hat{\bm{X}}_t-\tilde{\bm{X}}_t}+\eta\norm*{\tilde{\bm{Y}}_t}.
      \end{equation*}
      Moreover, by (\ref{eq:momentboundofy}), taking the expectation on both sides yields
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}\E\bj{\norm{\hat{\bm{X}}_t-\tilde{\bm{X}}_t}} \leq -(m_I-1)\E\bj{\norm{\hat{\bm{X}}_t-\tilde{\bm{X}}_t}}+\eta M_\eta(t).
      \end{equation*}
      Then, Gr\"onwall's inequality implies that
      \begin{align*}
            \mathcal{W}_1\bc{\tilde{p}_{T-\delta},\hat{p}^{\sigma}_{\delta}} &\leq \E\bj{\norm{\hat{\bm{X}}_{T-\delta}-\tilde{\bm{X}}_{T-\delta}}} \\
            &\leq \eta\int_0^{T-\delta}M_\eta(s)e^{-(m_I-1)(T-\delta-s)}\mathrm{d}s \eqdef I(\eta),
      \end{align*}
      when the initial coupling is chosen as $\hat{\bm{X}}_0=\tilde{\bm{X}}_0 \sim \mathcal{N}(0,\bm{I}_D)$. For $I(\eta)$, by the definition of $M_\eta(t)$ in (\ref{eq:momentboundofy}), we have
      \begin{align*}
            I(\eta) &\leq \eta\int_0^{T-\delta} \bc{\sqrt{D-d_1}e^{-(\eta-1)s}+\frac{M_2}{\eta-1}}e^{-(m_I-1)(T-\delta-s)}\mathrm{d}s \\
            &= \frac{\eta \sqrt{D-d_1}}{\eta-m_I} e^{-\left(m_I-1\right)(T-\delta)}\left(1-e^{-\left(\eta-m_I\right)(T-\delta)}\right)\\
            &\quad+\frac{\eta M_2}{\left(m_I-1\right)(\eta-1)}\left(1-e^{-\left(m_I-1\right)(T-\delta)}\right) \\
            &\leq \frac{\eta \sqrt{D-d_1}}{\eta-m_I} e^{-\left(m_I-1\right)(T-\delta)}+\frac{\eta M_2}{\left(m_I-1\right)(\eta-1)}.\qedhere
      \end{align*}
\end{proof}

\subsection{Discretization Error}\label{appen:discretization_error}

To clarify why performance degrades in practice when $\eta$ becomes too large, we analyze the discretization error of the geometric guidance model (\ref{eq:geometric_guidance}). In practice, ODEs are typically solved using the Euler method, while SDEs are solved using the Euler–Maruyama (EM) scheme. Since our model is formulated as a deterministic ODE in (\ref{eq:geometric_guidance}), we focus on the Euler approximation; the analysis for the corresponding SDE and the EM scheme is analogous.

More specifically, we partition the interval $[0, T - \delta]$ into $N$ subintervals with step size $h = (T - \delta)/N$, and define $t_k = kh$ for $k = 0, 1, \ldots, N$. The Euler scheme then constructs the sequence $\bb{\bm{X}_k^h}_{k=0}^N$ via
\begin{equation*}
    \bm{X}_{k+1}^h = \bm{X}_k^h + h\bc{\bm{X}_k^h + \nabla_{\bm{x}} \log p^\sigma_{T-t_k}(\bm{X}_k^h) - \eta P_1 \bm{X}^h_k},\quad \bm{X}_0^h \sim \mathcal{N}(\bm{0},\bm{I}_D).
\end{equation*}
Let $\bm{X}_k^h \sim \tilde{p}^h_k$. Our goal is to bound the Wasserstein error $\mathcal{W}_1(\tilde{p}_{T - \delta}, \tilde{p}^h_N)$. Under the Lipschitz continuity of $\nabla_{\bm{x}} \log p^\sigma_t$, standard results yield $\mathcal{W}_1(\tilde{p}_{T - \delta}, \tilde{p}^h_N) \leq \mathcal{O}(he^{\eta})$ \citep[Theorem 2.4]{griffiths2010numerical}. Because of Theorem \ref{thm:smoothofscore}, we not only have the $L_S$-smoothness
\begin{equation*}
    \norm*{\nabla_{\bm{x}}^2 \log p^\sigma_t(\bm{x})}_{\op{op}} \leq L_S, \quad L_S = \sup_{t \in [\delta,T]} L_t,
\end{equation*}
but also the $m_I$-strong log-concavity
\begin{equation*}
    -\nabla_{\bm{x}}^2 \log p^\sigma_t(\bm{x}) \succeq m_I\bm{I}_D,\quad m_I = \inf_{t \in [\delta,T]} m_t.
\end{equation*}
The additional strong log-concavity yields the improved bound
\begin{equation*}
    \mathcal{W}_1(\tilde{p}_{T - \delta}, \tilde{p}^h_N) \leq \mathcal{O}(h\eta^2).
\end{equation*}

\begin{thm}\label{thm:discretization_error}
      Assume that $t \mapsto \nabla_{\bm{x}} \log p^\sigma_t(\bm{x})$ is $C^1$ for each $\bm{x}$, and there exist $A,B \geq 0$ such that
      \begin{equation*}
            \norm*{\partial_t \nabla_{\bm{x}} \log p^\sigma_t(\bm{x})} \leq A + B \norm{\bm{x}}.
      \end{equation*}
      For $\eta > 2$, if $h (\eta - 1) < 1$, we have
      \begin{equation*}
            \mathcal{W}_1(\tilde{p}_{T - \delta}, \tilde{p}^h_N) \leq \mathcal{O}(h\eta^2).
      \end{equation*}
\end{thm}
\begin{proof}
      Let
      \begin{equation*}
            b(t,\bm{x}) = \bm{x} + \nabla_{\bm{x}} \log p^\sigma_{T-t}(\bm{x}) - \eta P_1\bm{x}.
      \end{equation*}
      Let $\Phi_k(\bm{x}) \eqdef \bm{x}_{t_{k+1}}$, where $\bm{x}_t$ is the solution of ODE
      \begin{equation}\label{eq:geometric_guidance_ode_analy}
            \frac{\mathrm{d}\bm{x}_t}{\mathrm{d}t} = b(t,\bm{x}_t),\quad t \in [t_k,t_{k+1}],
      \end{equation}
      with initial value $\bm{x}_{t_k} = \bm{x}$. By (\ref{eq:geometric_guidance}), we can see
      \begin{equation*}
            \tilde{\bm{X}}_{t_{k+1}} = \Phi_k(\tilde{\bm{X}}_{t_k}).
      \end{equation*}
      Moreover, define the Euler one-step map
      \begin{equation*}
            \Psi_k(\bm{x}) = \bm{x} + hb(t_k,\bm{x}),
      \end{equation*}
      so that the Euler scheme is
      \begin{equation*}
            \bm{X}^h_{k+1} = \Psi_k(\bm{X}^h_k).
      \end{equation*}
      Therefore,
      \begin{align*}
            \bm{e}_{k+1} \defeq \tilde{\bm{X}}_{t_{k+1}} - \bm{X}^h_{k+1} &=  \Phi_k(\tilde{\bm{X}}_{t_k}) - \Psi_k(\bm{X}^h_k) \\
            &=\bc{\Phi_k(\tilde{\bm{X}}_{t_k}) - \Phi_k(\bm{X}^h_k)} + \bc{\Phi_k(\bm{X}^h_k) - \Psi_k(\bm{X}^h_k)}.
      \end{align*}
      Next, we analyze these two terms respectively.
      \begin{enumerate}[label = (\roman{*})]
            \item By the $m_I$-strong log-concavity, we have
            \begin{equation*}
                  \inn{\nabla_{\bm{x}}\log p^\sigma_{T-t}(\bm{x}) - \nabla_{\bm{x}}\log p^\sigma_{T-t}(\bm{y}), \bm{x} - \bm{y}} \leq -m_I \norm{\bm{x}-\bm{y}}^2.
            \end{equation*}
            Moreover, since $P_1$ is an orthogonal projection,
            \begin{equation*}
                  \inn{(\bm{I}_D - \eta P_1)(\bm{x} - \bm{y}),\bm{x} - \bm{y}} = \norm{\bm{x} - \bm{y}}^2 - \eta\norm{P_1(\bm{x} - \bm{y})}^2 \leq \norm{\bm{x} - \bm{y}}^2.
            \end{equation*}
            Therefore,
            \begin{equation*}
                  \inn{b(t,\bm{x})-b(t,\bm{y}),\bm{x}-\bm{y}} \leq -(m_I-1) \norm{\bm{x} - \bm{y}}^2
            \end{equation*}
            Let $\bm{x}_t,\bm{y}_t$ be the solution of (\ref{eq:geometric_guidance_ode_analy}) with the initial value $\bm{x}_{t_k}= \bm{x}$ and $\bm{y}_{t_k} = \bm{y}$. So we have
            \begin{equation*}
                  \frac{\mathrm{d}}{\mathrm{d}t} \norm*{\bm{x}_t - \bm{y}_t}^2 = 2 \inn{b(t,\bm{x}_t)-b(t,\bm{y}_t),\bm{x}_t-\bm{y}_t} \leq -2(m_I - 1)\norm{\bm{x}_t - \bm{y}_t}^2.
            \end{equation*}
            Then by Gr\"onwall's inequality (Lemma \ref{lem:gronwall}),
            \begin{equation*}
                  \norm*{\bm{x}_{t_{k+1}}-\bm{y}_{t_{k+1}}} \leq e^{-(m_I - 1)h} \norm{\bm{x} - \bm{y}},
            \end{equation*}
            which implies that
            \begin{equation}\label{eq:bound_of_phi_k}
                  \norm*{\Phi_k(\bm{x}) - \Phi_k(\bm{y})} \leq e^{-(m_I - 1)h} \norm{\bm{x} - \bm{y}}.
            \end{equation}

            \item Fix $\bm{x} \in \R^D$ and let $\bm{x}_t$ be the solution of (\ref{eq:geometric_guidance_ode_analy}) with the initial value $\bm{x}_{t_k}= \bm{x}$. Note that by definition
            \begin{equation*}
                \Phi_k(\bm{x}) = \bm{x} + \int_{t_k}^{t_{k+1}} b(t,\bm{x}_t)\mathrm{d}t.
            \end{equation*}
            Therefore,
            \begin{align*}
                  \Phi_k(\bm{x}) - \Psi_k(\bm{x}) &= \int_{t_k}^{t_{k+1}} \bc{b(t,\bm{x}_t) - b(t_k,\bm{x})}\mathrm{d}t \\
                  &= \int_{t_k}^{t_{k+1}} \bc{b(t,\bm{x}_t) - b(t,\bm{x})}\mathrm{d}t + \int_{t_k}^{t_{k+1}} \bc{b(t,\bm{x}) - b(t_k,\bm{x})}\mathrm{d}t.
            \end{align*}
            We analyze the above two terms separately.
            \begin{enumerate}[label=(\alph{*})]
                  \item Since $\norm{\bm{I}_D - \eta P_1}_{\op{op}} = \eta - 1$ ($\eta > 2$) and $\nabla_{\bm{x}} \log p^\sigma_t$ is $L_S$-Lipschitz continuous, $b(t,\cdot)$ is $K(\eta)$-Lipschitz continuous for $K(\eta) = L_S + \eta - 1$. So
                  \begin{equation*}
                      \int_{t_k}^{t_{k+1}}\norm*{b(t,\bm{x}_t) - b(t,\bm{x})}\mathrm{d}t \leq K(\eta)\int_{t_k}^{t_{k+1}}\norm*{\bm{x}_t - \bm{x}}\mathrm{d}t.
                  \end{equation*}
                  Note that
                  \begin{equation*}
                      \norm*{\bm{x}_t - \bm{x}} = \norm*{\int_{t_k}^{t} b(s,\bm{x}_s)\mathrm{d}s} \leq \int_{t_k}^{t} \norm{b(s,\bm{x}_s)}\mathrm{d}s \leq (t - t_k)\sup_{s \in [t_k,t_{k+1}]}\norm*{b(s,\bm{x}_s)}.
                  \end{equation*}
                  Therefore,
                  \begin{equation}\label{eq:space_bound_b_1}
                        \int_{t_k}^{t_{k+1}}\norm*{b(t,\bm{x}_t) - b(t,\bm{x})}\mathrm{d}t \leq \frac{h^2}{2}K(\eta)\sup_{s \in [t_k,t_{k+1}]}\norm*{b(s,\bm{x}_s)}.
                  \end{equation}
                  For the right hand side, using same notation as (\ref{eq:bound_at_fix_pt}), let
                  \begin{equation*}
                      C = \sup_{t \in [\delta,T]} \norm*{\nabla_{\bm{x}}\log p_t^\sigma(\bm{0})} < \infty.
                  \end{equation*}
                  It implies that $\norm{b(t,\bm{0})} \leq C$ and so
                  \begin{equation*}
                      \norm*{b(s,\bm{x}_s)} \leq \norm*{b(s,\bm{x}_s) - b(s,\bm{0})} + \norm{b(s,\bm{0})} \leq C + K(\eta)\norm{\bm{x}_s}.
                  \end{equation*}
                  Let $S = \sup_{s \in [t_k,t_{k+1}]} \norm{\bm{x}_s} <\infty$. Using the similar idea as in the proof of Theorem \ref{thm:estimmfd} in Appendix \ref{appen:proof_of_theorem_ref_thm_estimmfd},
                  \begin{equation*}
                      S = \sup_{s \in [t_k,t_{k+1}]} \norm{\bm{x}_s} \leq C_1\norm{\bm{x}} + C_2
                  \end{equation*}
                  where $C_1 = \exp\bc{(1+L_S)h}$ and $C_2 = C(\exp((1+L_S)h) - 1) / (1+L_S)$ as shown in (\ref{eq:boud_solution_of_dynamic}) and they are independent of $\eta$. So
                  \begin{equation}\label{eq:space_bound_b_2}
                        \sup_{s \in [t_k,t_{k+1}]}\norm*{b(s,\bm{x}_s)} \leq C + K(\eta)\sup_{s \in [t_k,t_{k+1}]}\norm{\bm{x}_s} \leq C_1K(\eta)\norm{\bm{x}} + C_2K(\eta) + C.
                  \end{equation}
                  Combining (\ref{eq:space_bound_b_1}) and (\ref{eq:space_bound_b_2}), we have
                  \begin{equation}\label{eq:space_bound_b}
                        \int_{t_k}^{t_{k+1}}\norm*{b(t,\bm{x}_t) - b(t,\bm{x})}\mathrm{d}t \leq \frac{h^2}{2} \bc{C_1K(\eta)^2\norm{\bm{x}} + C_2K(\eta)^2 + CK(\eta)}.
                  \end{equation}

                  \item Since $b(t,\bm{x}) - b(t_k,\bm{x}) = \nabla_{\bm{x}} \log p_{T-t}^\sigma(\bm{x}) - \nabla_{\bm{x}} \log p_{T-t_k}^\sigma(\bm{x})$,
                  \begin{equation*}
                        \norm*{b(t,\bm{x}) - b(t_k,\bm{x})} \leq \int_{t_k}^t \norm*{\partial_t \nabla_{\bm{x}} \log p_{T-s}^\sigma(\bm{x})} \mathrm{d}s \leq (t-t_k)\bc{A + B\norm{\bm{x}}}.
                  \end{equation*}
                  It implies that
                  \begin{equation}\label{eq:time_bound_b}
                        \int_{t_k}^{t_{k+1}} \norm*{b(t,\bm{x}) - b(t_k,\bm{x})}\mathrm{d}t \leq \frac{h^2}{2}(A + B\norm{\bm{x}}).
                  \end{equation}
            \end{enumerate}
            Therefore, combining (\ref{eq:space_bound_b}) and (\ref{eq:time_bound_b}),
            \begin{equation}\label{eq:bound_of_psi_k}
                  \begin{aligned}
                        \norm*{\Phi_k(\bm{x}) - \Psi_k(\bm{x})} &\leq \int_{t_k}^{t_{k+1}}\norm*{b(t,\bm{x}_t) - b(t,\bm{x})}\mathrm{d}t + \int_{t_k}^{t_{k+1}} \norm*{b(t,\bm{x}) - b(t_k,\bm{x})}\mathrm{d}t \\
                  &\leq \frac{h^2}{2} \bc{A + C_2K(\eta)^2 + CK(\eta) + (B+C_1K(\eta)^2)\norm{\bm{x}}}.
                  \end{aligned}
            \end{equation}
      \end{enumerate}
      Combining (\ref{eq:bound_of_phi_k}) and (\ref{eq:bound_of_psi_k}), by setting $\bm{x} = \bm{X}_k^h$ and $\bm{y} = \tilde{\bm{X}}_{t_k}$,
      \begin{align*}
          \norm{\bm{e}_{k+1}} &\leq \norm*{\Phi_k(\tilde{\bm{X}}_{t_k}) - \Phi_k(\bm{X}^h_k)} + \norm*{\Phi_k(\bm{X}_k^h) - \Psi_k(\bm{X}_k^h)} \\
          &\leq e^{-(m_I - 1)h}\norm{\bm{e}_k} + \frac{h^2}{2} \bc{A + C_2K(\eta)^2 + CK(\eta) + (B+C_1K(\eta)^2)\norm{\bm{X}^h_k}}.
      \end{align*}
      Let $a_k = \E\bj{\norm{\bm{e}_k}}$. By Lemma \ref{lem:bound_euler_discrete_norm} below, because $h(\eta - 1) < 1$, we have $\E\bj{\norm*{\bm{X}_k^h}} \leq M_e$. Taking the expectation of the above inequality, we have
      \begin{equation*}
          a_{k+1} \leq e^{-(m_I - 1)h}a_k + \frac{h^2}{2} \bc{A + C_2K(\eta)^2 + CK(\eta) + (B+C_1K(\eta)^2)M_e}.
      \end{equation*}
      Therefore, by coupling $\tilde{\bm{X}}_0 = \bm{X}^h_0$, i.e., $a_0 = 0$, we have
      \begin{align*}
          a_N = \E\bj{\norm*{\tilde{\bm{X}}_{T-\delta} - \bm{X}^h_N}} &\leq \frac{h^2}{2} \bc{A + C_2K(\eta)^2 + CK(\eta) + (B+C_1K(\eta)^2)M_e}\sum_{k=0}^{N-1}e^{-(m_I - 1)hk} \\
          &\leq \frac{h}{2}\bc{A + C_2K(\eta)^2 + CK(\eta) + (B+C_1K(\eta)^2)M_e} \frac{e^{(m_I - 1)h}}{m_I-1}.
      \end{align*}
      It follows that as $h \sto 0$ and $\eta \sto \infty$,
      \begin{align*}
            \mathcal{W}_1(\tilde{p}_{T - \delta}, \tilde{p}^h_N) &\leq \E\bj{\norm*{\tilde{\bm{X}}_{T-\delta} - \bm{X}^h_N}} \leq \mathcal{O}(h\eta^2). \qedhere
      \end{align*}
\end{proof}


\begin{lem}\label{lem:bound_euler_discrete_norm}
      For $\eta > 2$, if $h(\eta - 1) < 1$, then
      \begin{equation*}
          \sup_{k}\E\bj{\norm*{\bm{X}_k^h}} \leq M_e,
      \end{equation*}
      where $M_e$ is independent of $\eta$.
\end{lem}
\begin{proof}
      First, by construction,
      \begin{equation*}
            \bm{X}_{k+1}^h = \bc{\bm{I}_D + h(\bm{I}_D -\eta P_1)}\bm{X}_k^h + h\nabla_{\bm{x}} \log p^\sigma_{T-t_k}(\bm{X}_k^h).
      \end{equation*}
      Let $M_h =\bm{I}_D + h(\bm{I}_D -\eta P_1)$. Then because $P_1$ is an orthogonal projection, there are only two eigenvalues of $M_h$: for $\bm{x} \in \ker P_1$, $M_h \bm{x} = (1 + h)\bm{x}$, and for $\bm{x} \in \Img P_1$, $M_h \bm{x} = (1 + h(1 - \eta))\bm{x}$. Because $h(\eta - 1) < 1$, $1 + h(1 - \eta) \in [0,1]$. So
      \begin{equation*}
            \norm*{M_h}_{\op{op}} = 1 + h.
      \end{equation*}
      Similarly, as shown in (\ref{eq:ineqfornormlog}),
      \begin{equation*}
            \norm*{\nabla_{\bm{x}} \log p^\sigma_{T-t_k}(\bm{X}_k^h)} \leq L_S\norm*{\bm{X}^h_k} + C.
      \end{equation*}
      Therefore,
      \begin{align*}
            \norm*{\bm{X}_{k+1}^h} &\leq \norm*{M_h}_{\op{op}}\norm*{\bm{X}_k^h} + h\norm*{\nabla_{\bm{x}} \log p^\sigma_{T-t_k}(\bm{X}_k^h)} \\
            &\leq \bc{1 + h(1+L_S)}\norm*{\bm{X}_k^h} + Ch.
      \end{align*}
      Taking expectations on the both sides, we have
      \begin{align*}
            \E\bj{\norm*{\bm{X}_k^h}} &\leq \bc{1 + h(1+L_S)}^k \E\bj{\norm*{\bm{X}_0^h}} + Ch \sum_{j=0}^{k-1}\bc{1 + h(1+L_S)}^j \\
            &\leq e^{(1 + L_S)t_k}\bc{\E\bj{\norm*{\bm{X}_0^h}} + Ct_k}.
      \end{align*}
      Because $\bm{X}_0^h \sim \mathcal{N}(\bm{0},\bm{I}_D)$, $\E\bj{\norm*{\bm{X}_0^h}} \leq \sqrt{\E\bj{\norm*{\bm{X}_0^h}^2}} = \sqrt{D}$. Therefore, if we let
      \begin{equation*}
            M_e \defeq e^{(1 + L_S)(T-\delta)}\bc{\sqrt{D} + C(T-\delta)},
      \end{equation*}
      which is independent of $\eta$, then
      \begin{equation*}
            \E\bj{\norm*{\bm{X}_k^h}} \leq M_e. \qedhere
      \end{equation*}
\end{proof}


\section{Analysis for Assumptions}

\subsection{More Details for Orthogonality Assumption}\label{appen:more_details_for_orthogonality_assumption}

For Assumption \ref{assum:condlineardata}, consider the case where $A_1^\top A_2 \neq \bm{O}$, i.e., $\mathcal{M}_1$ is not orthogonal to $\mathcal{M}_2$. In this case, $A = (A_1, A_2) \notin \mathcal{O}^{D \times d}$, meaning that $A^\top A \neq \bm{I}_d$ and $AA^\top$ is no longer an orthogonal projection. We claim that this relaxation does not affect our analysis regarding the guidance scale $\eta$. Based on our results, it is necessary to examine its influence from three perspectives: the smoothness and concavity of $\log p^\sigma_t$ (Section \ref{sub:smoothness_and_concavity}), the estimation of the target manifold $\mathcal{M}_1$ (Section \ref{sub:estimating_target_space}), and the distance between generated and target distributions (Section \ref{sub:distance_to_target_distribution}).

\begin{enumerate}[label=(\alph{*})]
      \item Smoothness and Concavity: First, the results on the strong log-concavity of the latent density in Theorem \ref{thm:log_concave_latent} are independent of the orthogonality of $A$. Therefore, to analyze the smoothness and concavity of $\log p_t^\sigma$, it suffices to revisit the proof of Theorem \ref{thm:smoothofscore}. Note that
      \begin{equation*}
            \bm{X}^\sigma_t = \sqrt{\alpha_t}A\bm{Z}_\sigma + \sqrt{1 - \alpha_t}\bm{\xi} \sim p_t^\sigma.
      \end{equation*}
      By Proposition \ref{prop:smoothoflatentdens}, Corollary \ref{cor:upboundoflog}, and the $m_0^Z$-strong log-concavity of the latent density $p^Z_\sigma$ (Theorem \ref{thm:log_concave_latent}), we obtain the following bounds:
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_t^\sigma(\bm{x})}_{\op{op}} \leq L^A_t,\quad L^A_t \defeq \frac{\alpha_t(\Lambda_A+\lambda_A)+(1-\alpha_t)m_0^Z}{(1-\alpha_t)(\alpha_t\lambda_A+m^z_0(1-\alpha_t))},
      \end{equation*}
      and
      \begin{equation*}
             -\nabla^2_{\bm{x}}\log p_t^\sigma(\bm{x}) \succeq m_t^A\bm{I}_D,\quad m_t^A \defeq \frac{(1-\alpha_t)m_0^z - \alpha_t(\Lambda_A -\lambda_A)}{(1-\alpha_t)(\alpha_t\lambda_A+m^z_0(1-\alpha_t))},
      \end{equation*}
      where
      \begin{equation*}
            \Lambda_A = \norm*{A}_{\op{op}}^2 = \lambda_{\max}(A^\top A),\quad \lambda_A = \lambda_{\min}(A^\top A).
      \end{equation*}
      
      Because $A = (A_1,A_2)$ and $A_i$ are orthogonal,
      \begin{equation*}
            A^\top A = \bc{
                  \begin{array}{cc}
                      \bm{I}_{d_1} & C \\
                      C^\top & \bm{I}_{d_2}
                  \end{array}
            } = \bm{I}_d + \bc{
                  \begin{array}{cc}
                      \bm{O} & C \\
                      C^\top & \bm{O}
                  \end{array}
            },\quad C \defeq A_1^\top A_2.
      \end{equation*}
      Let $\sigma_{\max}(C)$ be the maximal singular value of $C$. Then, we have
      \begin{equation*}
            1 - \sigma_{\max}(C) \leq \lambda_A \leq \Lambda_A \leq 1 + \sigma_{\max}(C).
      \end{equation*}
      Moreover, because $\norm*{C}_{\op{op}} \leq \norm*{A_1}_{\op{op}}\norm*{A_2}_{\op{op}} = 1$, $\sigma_{\max}(C) \leq 1$, which implies that $0 \leq \lambda_A \leq \Lambda_A \leq 2$.

      For smoothness, it is clear that $0 < L_t^A < \infty$, so the non-orthogonality of $A$ does not affect the $L$-smoothness of $\log p_t^\sigma$,  except that the constant changes from $L_t$ to $L_t^A$. However, for strong log-concavity, it requires $m^A_t > 1$ (Corollary \ref{cor:infofm}), which holds if
      \begin{equation}\label{eq:lower_t}
            t > \frac{1}{2} \log \frac{m_0^z - \lambda_A}{m_0^z - \Lambda_A},
      \end{equation}
      under the condition $m_0^Z > 2 \geq \Lambda_A$. This requires a modification of Assumption \ref{assum:bounddifflog}, $M \leq 2\sqrt{m-2}$, for the same reason discussed in the proof of Corollary \ref{cor:infofm}.

      \item Estimating Target Manifold: Since Theorem \ref{thm:estimmfd} depends only on the $L$-smoothness of $\log p_t^\sigma$, and the geometric guidance model (\ref{eq:geometric_guidance}) does not involve $A$, the result of Theorem \ref{thm:estimmfd} remains valid even when $A$ is not orthogonal.

      \item Distance to Target Distribution: Because the condition $m^A_t > 1$ requires inequality (\ref{eq:lower_t}), one can set
      \begin{equation*}
            \delta > \frac{1}{2}\log \frac{m_0^z - \lambda_A}{m_0^z - \Lambda_A},
      \end{equation*}
      and consider the geometric guidance model (\ref{eq:geometric_guidance}) on interval $[0,T-\delta]$. With this adjustment, the results in Theorem \ref{thm:wassbound} still hold, up to changes in certain constants.  For instance, the non-orthogonality of $A$ changes the bound on $\mathcal{W}_1((Q_1)_{\#} \Pb_X, \Pb_{X \mid Y}(\cdot \mid Y = 1))$, specifically the constant $\tilde{C}$ in Theorem \ref{thm:wassbound}.
\end{enumerate}

\subsection{More details of Assumption \ref{assum:bounddifflog}}\label{appen:more_details_of_assumption_ref_assum_bounddifflog}

In the following, we demonstrate a family of distributions that satisfy both Assumption \ref{assum:logconcave} and Assumption \ref{assum:bounddifflog}. Consider the density function $p_i^Z$ of the distribution $\Pb_i^Z$, given by the form:
\begin{equation*}
      p_i^Z(\bm{z}) = e^{-V_i(\bm{z})} \chi_{K_i}(\bm{z}),
\end{equation*}
where $K_i \subset \R^{d_i}$ is a convex and compact set, and
\begin{equation*}
      \nabla_{\bm{z}}^2 V_i(\bm{z}) \succeq m \bm{I}_{d_i}.
\end{equation*}
In other words, $p_i^Z$ belongs to the class of strongly log-concave densities supported on convex and compact subsets of $\R^{d_i}$. 

First, for such $p_i^Z$, strong log-concavity on a convex set does not perfectly align with Assumption \ref{assum:logconcave}, which induces a question of whether this property can substitute for Assumption \ref{assum:logconcave} in deriving the strong log-concavity of the mixture latent density $p^Z_\sigma$ defined in Equation (\ref{eq:multi_of_latent}). 

In the proof of Theorem \ref{thm:log_concave_latent}, the strong log-concavity of $p^Z_\sigma$ is inherited from that of the component densities $p^Z_{i,\sigma}$ defined in Equation (\ref{eq:component_sigma_latent}), which are shown to be strongly log-concave via Corollary \ref{cor:upboundoflog}, under Assumption \ref{assum:logconcave}. In other words,  the key question is whether strong log-concavity on a convex set suffices to replace the strong log-concavity condition in Proposition \ref{prop:smoothoflatentdens}, and thereby still allow us to deduce the conclusion of Corollary \ref{cor:upboundoflog}. 

\begin{prop}\label{prop:cpt_logconcave}
      Let $\bm{Z}$ be a random variable on $\R^k$ with the density function $p^Z$ given by
      \begin{equation*}
            p^Z(\bm{z}) = e^{-V(\bm{z})} \chi_{K}(\bm{z}),
      \end{equation*}
      where $K$ is a convex set. Let $B \in \R^{n \times k}$. Assume there are $m_0,\Lambda > 0$ such that
      \begin{equation*}
            \nabla_{\bm{z}}^2 V(\bm{z}) \succeq m_0\bm{I}_k,\quad \norm*{B}^2_{\op{op}} \leq \Lambda,
      \end{equation*}
      and $\lambda \defeq \lambda_{\op{min}}(B^\top B) \geq 0$. For $\alpha \in \R$ and $\beta > 0$, let
      \begin{equation*}
            \bm{X} = \alpha  B\bm{Z}+\beta \bm{\xi},\quad \bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_n)
      \end{equation*}
      with the density function $p_X$ on $\R^n$. We have
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) \preceq \bc{\frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda+m_0\beta^2)} - \frac{1}{\beta^2}}\bm{I}_n.
      \end{equation*}
\end{prop}
\begin{proof}
      By the same calculation, we have
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) = \frac{\alpha^2}{\beta^4}B\op{Cov}_{\mu_x}(\bm{Z})B^\top - \frac{1}{\beta^2}\bm{I}_n,
      \end{equation*}
      Note in this case,
      \begin{equation*}
            \mathrm{d}\mu_x(\bm{z}) = \frac{e^{-U_x(\bm{z})}\chi_{K}(\bm{z})\mathrm{d}\bm{z}}{\int_{K} e^{-U_x(\bm{y})}\mathrm{d}\bm{y}},
      \end{equation*}
      where
      \begin{equation*}
            U_x(\bm{z}) = \frac{1}{2\beta^2}\norm*{\bm{x}-\alpha B \bm{z}}^2 + V(\bm{z}).
      \end{equation*}
      It follows that 
      \begin{equation*}
            \nabla^2_{\bm{z}}U_x(\bm{z}) = \frac{\alpha^2}{\beta^2}B^\top B + \nabla^2_{\bm{z}}V(\bm{z}) \succeq m\bm{I}_k,~m \defeq \frac{\alpha^2 \lambda}{\beta^2} + m_0.
      \end{equation*}
      Instead of applying Lemma \ref{lem:poincare}, by using the Brascamp–Lieb Inequality on a convex set \citep[Proposition 2.1]{bobkov2000brunn}, we still have
      \begin{equation*}
            \op{Var}_{\mu_x}(f) \leq \frac{1}{m}\E_{\mu_x}\bj{\norm*{\nabla f}^2},
      \end{equation*}
      for any $C^1$ function $f \colon \R^k \sto \R$, which also indicates that
      \begin{equation*}
            \norm*{\op{Cov}_{\mu_x}(\bm{Z})}_{\op{op}} \leq \frac{1}{m}.
      \end{equation*}
      Then, the remainder of the proof is the same as in the proofs of Proposition \ref{prop:smoothoflatentdens} and Corollary \ref{cor:upboundoflog}, so that we obtain the same result
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) \preceq \bc{\frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda+m_0\beta^2)} - \frac{1}{\beta^2}}\bm{I}_n. \qedhere
      \end{equation*}
\end{proof}

Therefore, Proposition \ref{prop:cpt_logconcave} shows that, in our settings, Assumption \ref{assum:logconcave} can be replaced by Assumption \ref{ass:IIp}:

\theoremstyle{plain}
\newtheorem{assumIIprime}{Assumption}     % private counter
\renewcommand{\theassumIIprime}{II$^\prime$}
\makeatletter
\renewcommand{\theHassumIIprime}{assumIIprime}
\makeatother
\begin{assumIIprime}\label{ass:IIp}
      For $i=1,2$, $\Pb^Z_{i}$ admits the density function $p^Z_i$ that has the form $p_i^Z(\bm{z}) = e^{-V_i(\bm{z})} \chi_{K_i}(\bm{z})$ such that $K_i \subset \R^{d_i}$ is a convex and compact set, and
      \begin{equation*}
            \nabla_{\bm{z}}^2 V_i(\bm{z}) \succeq m \bm{I}_{d_i}.
      \end{equation*}
\end{assumIIprime}

Next, we verify that $p_i^Z$ in this class satisfies Assumption \ref{assum:bounddifflog}.

\begin{prop}\label{prop:cpt_to_assum}
      For $i=1,2$, let $p_i^Z$ satisfy Assumption \ref{ass:IIp}, and let $p^Z_{i,\sigma}$ be defined by Equation (\ref{eq:component_sigma_latent}). For any fixed $\sigma > 0$, we have
      \begin{equation*}
            \sup_{\bm{x}} \norm*{\nabla_{\bm{x}} \log p^Z_{1,\sigma}(\bm{x}) - \nabla_{\bm{x}} \log p^Z_{2,\sigma}(\bm{x})} \leq \frac{\sqrt{\abs{K_1}^2 + \abs{K_2}^2}}{\sigma^2},
      \end{equation*}
      where $\abs{K_i} = \sup\bb{\norm*{\bm{z}} \colon \bm{z} \in K_i}$.
\end{prop}
\begin{proof}
      First, by the definition (\ref{eq:component_sigma_latent}),
      \begin{equation*}
            p^Z_{1,\sigma}(\bm{z}) = (2\pi\sigma^2)^{-\frac{d}{2}} \int_{K_1} \exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(\bm{z}_1,0)^\top}^2}p^Z_1(\bm{z}_1)\mathrm{d}\bm{z}_1,
      \end{equation*}
      Therefore,
      \begin{equation*}
            \nabla_{\bm{z}} \log p^Z_{1,\sigma}(\bm{z}) = \frac{\nabla_{\bm{z}} p^Z_{1,\sigma}(\bm{z})}{p^Z_{1,\sigma}(\bm{z})} = \frac{-\frac{1}{\sigma^2}\int_{K_1} (\bm{z} - (\bm{z}_1,0)^\top)\exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(\bm{z}_1,0)^\top}^2}p^Z_1(\bm{z}_1)\mathrm{d}\bm{z}_1}{\int_{K_1} \exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(\bm{z}_1,0)^\top}^2}p^Z_1(\bm{z}_1)\mathrm{d}\bm{z}_1}.
      \end{equation*}
      It follows that
      \begin{equation*}
            \nabla_{\bm{z}} \log p^Z_{1,\sigma}(\bm{z}) = \frac{1}{\sigma^2}\bc{(m_1(\bm{z}),0)^\top - \bm{z}},
      \end{equation*}
      where
      \begin{equation*}
            m_1(\bm{z}) = \frac{\int_{K_1}\bm{z}_1 \exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(\bm{z}_1,0)^\top}^2}p^Z_1(\bm{z}_1)\mathrm{d}\bm{z}_1}{\int_{K_1} \exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(\bm{z}_1,0)^\top}^2}p^Z_1(\bm{z}_1)\mathrm{d}\bm{z}_1}.
      \end{equation*}
      Similarly,
      \begin{equation*}
            \nabla_{\bm{z}} \log p^Z_{2,\sigma}(\bm{z}) = \frac{1}{\sigma^2}\bc{(0,m_2(\bm{z}))^\top - \bm{z}},
      \end{equation*}
      for
      \begin{equation*}
            m_2(\bm{z}) = \frac{\int_{K_2}\bm{z}_2 \exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(0,\bm{z}_2)^\top}^2}p^Z_2(\bm{z}_2)\mathrm{d}\bm{z}_2}{\int_{K_2} \exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(0,\bm{z}_2)^\top}^2}p^Z_2(\bm{z}_2)\mathrm{d}\bm{z}_2}.
      \end{equation*}
      Therefore,
      \begin{equation*}
            \norm*{\nabla_{\bm{z}} \log p^Z_{1,\sigma}(\bm{z}) - \nabla_{\bm{z}} \log p^Z_{2,\sigma}(\bm{z})} = \frac{1}{\sigma^2}\norm*{(m_1(\bm{z}),m_2(\bm{z}))^\top} = \frac{1}{\sigma^2}\sqrt{\norm*{m_1(\bm{z})}^2 + \norm*{m_2(\bm{z})}^2}.
      \end{equation*}
      Note that
      \begin{equation*}
            m_i(\bm{z}) = \E_{\bm{Z} \sim \mu^i_z}[\bm{Z}],\quad \mathrm{d}\mu^i_z(\bm{z}_i) \defeq \frac{\exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(\bm{z}_i,0)^\top}^2}p^Z_i(\bm{z}_i)\mathrm{d}\bm{z}_i}{\int_{K_i} \exp\bc{-\frac{1}{2\sigma^2}\norm*{\bm{z}-(\bm{z}_i,0)^\top}^2}p^Z_i(\bm{z}_i)\mathrm{d}\bm{z}_i}.
      \end{equation*}
      By the convexity of $K_i$ and Lemma \ref{lem:expeconv}, $m_i(\bm{z}) \in K_i$. Then, the boundedness of $K_i$ implies
      \begin{equation*}
            \sup_{\bm{x}} \norm*{\nabla_{\bm{x}} \log p^Z_{1,\sigma}(\bm{x}) - \nabla_{\bm{x}} \log p^Z_{2,\sigma}(\bm{x})} \leq \frac{\sqrt{\abs{K_1}^2 + \abs{K_2}^2}}{\sigma^2}. \qedhere
      \end{equation*}
\end{proof}

\paragraph{Sufficient conditions for Assumption \ref{assum:bounddifflog}.} If $p_i^Z$ belongs to the class of distributions
\begin{equation}\label{eq:suffi_class_assump2}
      \bb{e^{-V(\bm{z})}\chi_K(\bm{z}) \colon \nabla^2 V \succeq m\bm{I},~K \text{ is compact and convex.}},
\end{equation}
then $p^Z_{i,\sigma}$ given by Equation (\ref{eq:component_sigma_latent}) is strongly log-concave by Proposition \ref{prop:cpt_logconcave}. Moreover, if we choose $\sigma$ such that 
\begin{equation}\label{eq:suffi_cond_assump2}
      M \leq \frac{\sqrt{\abs{K_1}^2 + \abs{K_2}^2}}{\sigma^2} \leq 2\sqrt{m-1} \quad\Leftrightarrow\quad \sigma^2 \geq \sqrt{\frac{\abs{K_1}^2 + \abs{K_2}^2}{4(m-1)}},
\end{equation}
Proposition \ref{prop:cpt_to_assum} shows that Assumption \ref{assum:bounddifflog} is satisfied. Then the mixture latent distribution $p^Z_\sigma$ given by Equation (\ref{eq:multi_of_latent}) is $m_0^z$-strongly log-concave provided by Theorem \ref{thm:log_concave_latent}, which further implies that $p^\sigma_t$ in the geometric guidance model (\ref{eq:geometric_guidance}) satisfies:
\begin{equation*}
      \norm*{\nabla^2_{\bm{x}}\log p_t^\sigma(\bm{x})}_{\op{op}} \leq L_t,\quad -\nabla^2_{\bm{x}}\log p_t^\sigma(\bm{x}) \succeq m_t\bm{I}_D.
\end{equation*}

\section{Lipschitz Continuity of Score Function}\label{appen:weaker_assumption_l_smoothness}

If we only focus on the Lipschitz continuity of the score function $\nabla_{\bm{x}} \log p_t$, where $p_t$ is obtained by a DDPM initialized from a distribution whose latent distribution admits a smooth density function $p^Z$, then the conditions in Proposition \ref{prop:smoothoflatentdens} can be relaxed. We consider two cases below.

The first case aligns with the setting considered in \citet{debortoli2022convergence}, where $\supp p^Z$ is assumed to be compact. We provide an alternative proof for this case, motivated by the argument used in the proof of Proposition \ref{prop:smoothoflatentdens}.

\begin{prop}\label{prop:cpt_supp_latent}
      Let $\bm{Z}$ be a random variable on $\R^k$ with the density function $p^Z$, and let $\phi \colon \R^k \sto \R^n$ be continuous. Assume $\supp p^Z$ is compact. For $\alpha \in \R$ and $\beta > 0$, let
      \begin{equation*}
            \bm{X} = \alpha  \phi(\bm{Z})+\beta \bm{\xi},\quad \bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_n),
      \end{equation*}
      with the density function $p_X$ on $\R^n$. We have
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_X(\bm{x})}_{\op{op}} \leq \frac{1}{\beta^2} + \frac{\alpha^2R^2}{\beta^4},
      \end{equation*}
      for some constant $R > 0$.
\end{prop}
\begin{proof}
      By the similar calculation as in the proof of Proposition \ref{prop:smoothoflatentdens},
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) = \frac{\alpha^2}{\beta^4}\op{Cov}_{\mu_x}(\phi(\bm{Z})) - \frac{1}{\beta^2}\bm{I}_n,
      \end{equation*}
      from which it follows that
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_X(\bm{x})}_{\op{op}} \leq \frac{1}{\beta^2} + \frac{\alpha^2}{\beta^4}\norm*{\op{Cov}_{\mu_x}(\phi(\bm{Z}))}_{\op{op}}.
      \end{equation*}
      To bound the second term, first by the definition of $\mu_x$, $\supp \mu_x = \supp p^Z$. Because $\supp p^Z$ is compact and $\phi$ is continuous, $\phi(\supp \mu_x)$ is compact, which means that there exists a $R > 0$ such that
      \begin{equation*}
            \sup \bb{\norm*{\phi(\bm{z})} \colon \bm{z} \in \supp \mu_x} \leq R.
      \end{equation*}
      Then, we obtain that for any $\bm{u} \in \R^n$,
      \begin{equation*}
            \bm{u}^\top \op{Cov}_{\mu_x}(\phi(\bm{Z}))\bm{u} = \op{Var}_{\mu_x}\bc{\bm{u}^\top\phi(\bm{Z})} \leq \E_{\mu_x}\bj{\bc{\bm{u}^\top\phi(\bm{Z})}^2} \leq \norm*{\bm{u}}^2\E_{\mu_x}\bj{\norm*{\phi(\bm{Z})}^2} \leq R^2\norm*{\bm{u}}^2,
      \end{equation*}
      which indicates that
      \begin{equation*}
             \norm*{\op{Cov}_{\mu_x}(\phi(\bm{Z}))}_{\op{op}} \leq R^2.
      \end{equation*}
      Therefore, we have
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_X(\bm{x})}_{\op{op}} \leq \frac{1}{\beta^2} + \frac{\alpha^2R^2}{\beta^4}. \qedhere
      \end{equation*}
\end{proof}
\begin{rmk}
      This proposition shows that when the latent density function has compact support, no additional conditions—such as log-concavity or $L$-smoothness—are required for the latent distribution. Moreover, under the compactness assumption, the results of Proposition \ref{prop:smoothoflatentdens} can be extended to the nonlinear case, as shown in \citet{debortoli2022convergence}.
\end{rmk}

Next, in the non-compact case, Proposition \ref{prop:smoothoflatentdens} requires the strong log-concavity of the latent density $p^Z$, as it is used to establish not only the $L$-smoothness but also the concavity of $\log p_X$ (see Corollary \ref{cor:upboundoflog}). However, if we are only interested in the $L$-Lipschitz continuity of the score function, the assumption of concavity can be relaxed to the $L_0$-smoothness of $\log p^Z$, i.e., $\norm*{\nabla_{\bm{z}}^2 \log p^Z(\bm{z})}_{\op{op}} \leq L_0$, or even to the weaker condition $\nabla_{\bm{z}}^2 \log p^Z(\bm{z}) \preceq L_0 \bm{I}_k$; see Proposition \ref{prop:weakersmoothoflatentdens} below.

\begin{prop}\label{prop:weakersmoothoflatentdens}
      Let $\bm{Z}$ be a random variable on $\R^k$ with the density function $p^Z$ and $B \in \R^{n \times k}$. Assume there are $L_0,\Lambda > 0$ such that
      \begin{equation*}
            \nabla_{\bm{z}}^2\log p^Z(\bm{z}) \preceq L_0\bm{I}_k,\quad \norm*{B}^2_{\op{op}} \leq \Lambda,
      \end{equation*}
      and $\lambda \defeq \lambda_{\op{min}}(B^\top B) > 0$, the minimum of all eigenvalues of $B^\top B$. For $\alpha \in \R$ and $\beta > 0$, let
      \begin{equation*}
            \bm{X} = \alpha  B\bm{Z}+\beta \bm{\xi},\quad \bm{\xi} \sim \mathcal{N}(\bm{0},\bm{I}_n),
      \end{equation*}
      with the density function $p_X$ on $\R^n$. If $\alpha^2\lambda-L_0\beta^2 > 0$, we have
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_X(\bm{x})}_{\op{op}} \leq  \frac{1}{\beta^2} + \frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda-L_0\beta^2)}
      \end{equation*}
      and
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) \preceq \bc{\frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda-L_0\beta^2)} - \frac{1}{\beta^2}}\bm{I}_n.
      \end{equation*}
\end{prop}
\begin{proof}
      The main difference between this proof and the proof of Proposition \ref{prop:smoothoflatentdens} lies in how $\norm*{\op{Cov}_{\mu_x}(\bm{Z})}_{\op{op}}$ is bounded.

      Note that
      \begin{equation*}
            \nabla^2_{\bm{z}}U_x(\bm{z}) = \frac{\alpha^2}{\beta^2}B^\top B + \nabla^2_{\bm{z}}V(\bm{z}) \succeq \bc{\frac{\alpha^2 \lambda}{\beta^2} - L_0}\bm{I}_k,
      \end{equation*}
      because $-\nabla_{\bm{z}}^2\log p^Z(\bm{z}) = \nabla^2_{\bm{z}}V(\bm{z}) \succeq -L_0\bm{I}_k$. When $\alpha^2\lambda-L_0\beta^2 > 0$, we similarly obtain
      \begin{equation*}
            \norm*{\op{Cov}_{\mu_x}(\bm{Z})}_{\op{op}} \leq \frac{\beta^2}{\alpha^2\lambda-L_0\beta^2}.
      \end{equation*}
      Therefore,
      \begin{equation*}
            \norm*{\nabla^2_{\bm{x}}\log p_X(\bm{x})}_{\op{op}} \leq  \frac{1}{\beta^2} + \frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda-L_0\beta^2)}.
      \end{equation*}
      On the other hand,
      \begin{equation*}
            B\op{Cov}_{\mu_x}(\bm{Z})B^\top \preceq \frac{\Lambda\beta^2}{\alpha^2\lambda-L_0\beta^2}\bm{I}_n,
      \end{equation*}
      from which it follows that
      \begin{equation*}
            \nabla^2_{\bm{x}}\log p_X(\bm{x}) \preceq \bc{\frac{\alpha^2\Lambda}{\beta^2(\alpha^2\lambda-L_0\beta^2)} - \frac{1}{\beta^2}}\bm{I}_n. \qedhere
      \end{equation*}
\end{proof}

\section{More Details for Nonlinear Extension}

\subsection{Omitted Proofs in Section \ref{sec:experiments_nonlinear_case}}\label{appen:omitted_proofs_in_section_ref_sec_experiments_nonlinear_case}

\begin{proof}[Proof of Lemma \ref{lem:isoencod}]
      The proof consists of the following three steps. First, let $\bm{z} \in \R^d$ be arbitrary.
      \begin{enumerate}[label=(\roman*)]
            \item Local construction: Because $\phi \colon \R^d \sto \mathcal{M} \subset \R^D$ is an isometry, the columns of $J\phi(\bm{z})$ form an orthonormal basis for the tangent space $T_{\phi(\bm{z})}\mathcal{M}$. These vectors can be extended to an orthonormal basis of $\R^D$ by adjoining
            \begin{equation*}
                  \bb{n_1(\bm{z}),n_2(\bm{z}),\ldots,n_{D-d}(\bm{z})},
            \end{equation*}
            where each $n_i$ is a smooth normal vector field along $\mathcal{M}$. For such $n_i$, one can define the Fermi coordinates map as
            \begin{equation*}
                  F \colon \R^d \times \R^{D-d} \longrightarrow \R^D,\quad F(\bm{z},\bm{v}) = \phi(\bm{z}) + \sum_{i=1}^{D-d}v_in_i(\bm{z}).
            \end{equation*}
            Then, by the Tubular Neighborhood Theorem (Theorem \ref{thm:tubular}), there exists a smooth function $\varepsilon \colon \R^d \sto (0,\infty)$ such that for the set
            \begin{equation*}
                  V = \bb{(\bm{z},\bm{v}) \in \R^d \times \R^{D-d} \colon \norm{\bm{v}} < \varepsilon(\bm{z})},
            \end{equation*}
            $F \colon V \sto U=F(V)$ is a diffeomorphism, where the open set $U \subset \R^D$ is a neighborhood of $\mathcal{M}$, called a tubular neighborhood. Let $\pi \colon \R^d \times \R^{D-d} \sto \R^d$ be the projection, i.e., $\pi(\bm{z},\bm{v}) = \bm{z}$. Then, one can construct
            \begin{equation*}
                  \tilde{\phi}^* \colon U \longrightarrow \R^d,\quad \tilde{\phi}^*(\bm{x}) = \pi(F^{-1}(\bm{x})).
            \end{equation*}

            \item Check conditions: First, because $\mathcal{M} \subset U$, and $F$ is a diffeomorphism from $V$ onto $U$ with $F(\bm{z},0) = \phi(\bm{z})$,
            \begin{equation*}
                  \tilde{\phi}^*(\phi(\bm{z})) = \pi(F^{-1}(\phi(\bm{z}))) = \pi(\bm{z},0) = \bm{z},\quad \forall~\bm{z} \in \R^d.
            \end{equation*}
            For the derivative condition, by the definition of $F$, we have
            \begin{equation*}
            JF(\bm{z},0) = \bc{J_{\bm{z}}F(\bm{z},0),J_{\bm{v}}F(\bm{z},0)} = \bc{J\phi(\bm{z}),\bm{n}(\bm{z})},
            \end{equation*}
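            where $\bm{n}(\bm{z}) = \bc{n_1(\bm{z}),\ldots,n_{D-d}(\bm{z})}$. Because $J\phi^\top J\phi =\bm{I}_d$ and the vectors $n_i(\bm{z})$ complete the columns of $J\phi(\bm{z})$ to an orthonormal basis of $\R^D$, $JF(\bm{z},0)$ is orthogonal, from which it follows that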
            \begin{equation*}
                  J(F^{-1})(F(\bm{z},0)) = JF(\bm{z},0)^{-1} = JF(\bm{z},0)^\top = \bc{
                        \begin{array}{c}
                              J\phi(\bm{z})^\top \\
                              \bm{n}(\bm{z})^\top
                        \end{array}
                  }.
            \end{equation*}
            On the other hand, $F^{-1}$ can be written as $F^{-1}(\bm{x}) = \bc{F_1(\bm{x}),F_2(\bm{x})}$, where $F_1 = \pi \circ F^{-1} = \tilde{\phi}^*$ on $U$. It implies that
            \begin{equation*}
                  J(F^{-1})(F(\bm{z},0)) = \bc{
                        \begin{array}{c}
                              J\tilde{\phi}^*(\phi(\bm{z})) \\
                              JF_2(\phi(\bm{z}))
                        \end{array}
                  }.
            \end{equation*}
            Therefore, $J\tilde{\phi}^*(\phi(\bm{z})) = J\phi(\bm{z})^\top$.

            \item Global construction: By the Urysohn Lemma \citep{munkres_topology_2018}, there exists a smooth function $\chi \colon \R^D \sto [0,1]$ such that $\chi |_{\tilde{U}} \equiv 1$ and $\chi|_{\R^D \backslash U} \equiv 0$, where $\tilde{U} \subset U$ is an open neighborhood of $\mathcal{M}$. Let $h \colon \R^D \to \R^d$ be any smooth function—for instance, a constant function $h \equiv \bm{c}$. Define
            \begin{equation*}
                  \phi^*(\bm{x}) = \chi(\bm{x})\tilde{\phi}^*(\bm{x})+(1-\chi(\bm{x}))h(\bm{x}),
            \end{equation*}
            then the desired identities hold:
            \begin{equation*}
                  \phi^* \circ \phi = \op{id}_{\R^d},\quad J\phi^*(\phi(\bm{z})) = J\phi(\bm{z})^\top. \qedhere
            \end{equation*}
      \end{enumerate}
\end{proof}

\begin{proof}[Proof of Theorem \ref{thm:hypersubmfd}]
      Let $\phi \colon \R^d \sto \R^D$ be the isometry for defining $\mathcal{M} = \Img \phi$. Because $\supp \Pb_X \subset \mathcal{M}$, there exists a distribution $\Pb^Z$ on $\R^d$ such that $\bm{X} = \phi(\bm{Z}) \sim \Pb_X$ when $\bm{Z} \sim \Pb^Z$. Let $t$ be fixed in $(0,T]$. By (\ref{eq:sol_DDPM}),
      \begin{equation*}
            \bm{X}_t = \sqrt{\alpha_t}\phi(\bm{Z}) + \sqrt{1-\alpha_t}\bm{\xi}.
      \end{equation*}
      Define $F^t \colon \R^D \sto \R^D$ by
      \begin{equation*}
            F^t(\bm{x}) \defeq \sqrt{\alpha_t}\phi\circ \phi^*\bc{\frac{\bm{x}}{\sqrt{\alpha_t}}},\quad \alpha_t = e^{-2t},
      \end{equation*}
      where $\phi^*$ is defined in Lemma \ref{lem:isoencod}. Then we have
      \begin{equation*}
            F^t(\bm{X}_t) = \sqrt{\alpha_t}\phi\circ \phi^*\bc{\bm{X}_0 + \sqrt{\frac{1-\alpha_t}{\alpha_t}}\bm{\xi}},\quad \bm{X}_0 \defeq \phi(\bm{Z}).
      \end{equation*}
      Now consider the Taylor expansion of $\varphi \defeq \phi\circ \phi^*$ at $\bm{X}_0=\phi(\bm{Z})$, with integral remainder. We obtain
      \begin{equation*}
            F^t(\bm{X}_t) = F^t(\sqrt{\alpha_t}\bm{X}_0) + \sqrt{1-\alpha_t}J\varphi(\bm{X}_0)\bm{\xi} + R(\bm{\xi}),
      \end{equation*}
      where $R(\bm{\xi})$ denotes the remainder term. 

      Next, we analyze the three terms on the right-hand side one by one. For the first term, because $\bm{X}_0 = \phi(\bm{Z}) \in \mathcal{M}$, $\bm{Z} = \phi^*(\bm{X}_0)$ by the definition of $\phi^*$; see the proof of Lemma \ref{lem:isoencod}. It implies that
      \begin{equation*}
            F^t(\sqrt{\alpha_t}\bm{X}_0) = \sqrt{\alpha_t}\phi\circ \phi^*(\bm{X}_0) = \sqrt{\alpha_t}\bm{X}_0.
      \end{equation*}
      For the second term, by Lemma \ref{lem:isoencod},
      \begin{equation*}
            J\varphi(\bm{X}_0) = J\phi(\bm{Z})J\phi^*(\phi(\bm{Z})) = J\phi(\bm{Z})J\phi(\bm{Z})^\top.
      \end{equation*}
      Moreover, because $J\phi^\top J\phi = \bm{I}_d$, $P \defeq J\varphi(\bm{X}_0)$ is an orthogonal projection with rank $d$. For the third term, 
      \begin{equation*}
            R(\bm{\xi}) = \frac{1-\alpha_t}{\sqrt{\alpha_t}}\int_0^1 (1-s)D^2\varphi\bc{\bm{X}_0 + s\sqrt{(1-\alpha_t)/\alpha_t} \bm{\xi}}[\bm{\xi},\bm{\xi}]\mathrm{d}s.
      \end{equation*}
      By the proof of Lemma \ref{lem:isoencod}, $\phi^* \equiv \bm{c}$ on $\R^D \backslash U$ for a tubular neighborhood $U$ of $\mathcal{M}$, which means $J\phi^* = 0$ and $D^2\phi^* = 0$ on $\R^D \backslash U$. It follows that
      \begin{equation*}
            D^2\varphi(\bm{x})[\bm{u},\bm{v}] = D^2\phi(\phi^*(\bm{x}))\bj{J\phi^*(\bm{x})\bm{u},J\phi^*(\bm{x})\bm{v}}+J\phi(\phi^*(\bm{x}))(D^2\phi^*(\bm{x})[\bm{u},\bm{v}]) = 0
      \end{equation*}
      for $\bm{x} \in \R^D \backslash U$. For a chosen $\delta$, we can choose a tubular neighborhood $U$ sufficiently thin such that $\bm{X}_0 + s\sqrt{(1-\alpha_t)/\alpha_t} \bm{\xi} \notin U$ for $s > \delta$. Therefore, we have
      \begin{equation*}
            R(\bm{\xi}) = \frac{1-\alpha_t}{\sqrt{\alpha_t}}\int_0^\delta (1-s)D^2\varphi\bc{\bm{X}_0 + s\sqrt{(1-\alpha_t)/\alpha_t} \bm{\xi}}[\bm{\xi},\bm{\xi}]\mathrm{d}s.
      \end{equation*}
      Assume $D^2\varphi$ is bounded on $U$. Then, for any small $\varepsilon^\prime > 0$, one can choose $\delta$ sufficiently small such that $\norm{R(\bm{\xi})} \leq \varepsilon^\prime$. 

      Combining these analyses, we obtain
      \begin{equation}\label{eq:bound_norm_F_t}
            \sqrt{1-\alpha_t}\norm{(\bm{I}_D - P)\bm{\xi}} - \varepsilon^\prime \leq \norm{\bm{X}_t - F^t(\bm{X}_t)} \leq \sqrt{1-\alpha_t}\norm{(\bm{I}_D - P)\bm{\xi}} + \varepsilon^\prime.
      \end{equation}
      Let $f^t(\bm{x}) \defeq \norm{\bm{x} - F^t(\bm{x})}$. Similarly to the proof of Proposition \ref{prop:data_mfd_linear}, by the Laurent--Massart bound (Lemma \ref{lem:laurent_massart}), (\ref{eq:bound_norm_F_t}) implies that
      \begin{equation*}
            \Pb\bc{r(t)\sqrt{1-2\sqrt{\varepsilon}}- \varepsilon^\prime \leq f^t(\bm{X}_t) \leq r(t)\sqrt{1+2\sqrt{\varepsilon}+2\varepsilon} + \varepsilon^\prime} \geq 1- 2e^{-2(D-d)\varepsilon},
      \end{equation*}
      where $r(t) = \sqrt{(D-d)(1-\alpha_t)}$. Because $d \ll D$, one can choose a small $\varepsilon$ such that $\delta^\prime \defeq 2e^{-2(D-d)\varepsilon}$ is also small. As a result, $\Pb(f^t(\bm{X}_t) \approx r(t)) \geq 1 - \delta^\prime$, i.e., $\bm{X}_t$ concentrates on $\mathcal{M}^t = (f^t)^{-1}(r(t))$ with high probability.
\end{proof}

\subsection{More Results of Experiments}\label{appen:more_experiments}

\paragraph{Comparison of FID.} Table \ref{tab:comp_fid_other} serves as a complement to Table \ref{tab:comp_fid}.
\begin{table}[ht]
\caption{Comparison of FID on CIFAR-10}
\label{tab:comp_fid_other}
\begin{center}
\begin{tabular}{rcccccc}
\toprule
~ & Airplane & Bird & Cat & Deer & Dog  & Overall \\
\midrule
    CGM ($\eta = 1$)   & 17.95 & 21.69 & 20.34 & 19.24 & 23.62   & 4.07 \\
    \midrule
    GeGM ($\eta = 50$)  & 18.98  & 18.39 & 17.35 & 17.38 & 18.45  & 5.15 \\
\bottomrule
\end{tabular}
\end{center}
\end{table}

\paragraph{FID vs.\ guidance scale on CIFAR-10.} By sampling with the nonlinear GeGM (\ref{eq:non_linear_geo_guid}), Figure \ref{fig:fidvsscaleall} shows how the FID varies with the guidance scale $\eta$ across all classes from CIFAR-10, which is consistent with the result of Theorem \ref{thm:wassbound}.
\begin{figure*}[ht]
    \centering
    \includegraphics[width=0.5\textwidth, height=0.33\textwidth]{images/fid_vs_eta_all_classes.pdf}
    \caption{FID vs.\ guidance scale $\eta$ of GeGM on all classes of CIFAR-10}
    \label{fig:fidvsscaleall}
\end{figure*}

\section{Technical Lemmas}

\begin{lem}[\citet{laurent2000adaptive}]\label{lem:laurent_massart}
      Let $X$ be a $\chi^2$-random variable with $n$ degrees of freedom, i.e., $X = \sum_{i=1}^n\xi_i^2$ with $\xi_i \stackrel{i.i.d.}{\sim} \mathcal{N}(0,1)$. Then, for any $\alpha > 0$, we have
      \begin{align*}
            \mathbb{P}(X-n \geq 2 \sqrt{n \alpha}+2 \alpha) & \leq e^{-\alpha}, \\
            \mathbb{P}(X-n \leq-2 \sqrt{n \alpha}) & \leq e^{-\alpha}.
      \end{align*}
\end{lem}

\begin{lem}\label{lem:orth_decomp_density}
      Let $\R^n = V \oplus V^\perp$ be an orthogonal decomposition of $\R^n$, where $V$ is a linear subspace and $V^\perp$ is its orthogonal complement. Let $\bm{X},\bm{Y} \in \R^n$ be random variables such that $\bm{X} \in V$, $\bm{Y} \in V^\perp$, and $\bm{X}$ independent of $\bm{Y}$. Suppose that $\bm{X}$ and $\bm{Y}$ admit densities $p_X$ and $p_Y$ on $V$ and $V^\perp$, respectively, with respect to the canonical volume measures on $V$ and $V^\perp$.  Then the density function of $\bm{Z} = \bm{X} + \bm{Y}$ is given by
      \begin{equation*}
            p_Z(\bm{z}) = p_X(Q\bm{z})p_Y(Q^\perp\bm{z}),
      \end{equation*}
      where $Q$ is the orthogonal projection onto $V$, and $Q^\perp = \bm{I}_n - Q$ is the orthogonal projection onto $V^\perp$.
\end{lem}
\begin{proof}
      Let $m_V$ and $m_{V^\perp}$ be the canonical volume measures on $V$ and $V^\perp$, respectively. Define $\Phi \colon V \times V^\perp \sto \R^n$ by $\Phi(\bm{x},\bm{y}) = \bm{x} + \bm{y}$. Clearly, $\Phi$ is an orthogonal linear map, which indicates $\abs{\det J\Phi} = 1$, so
      \begin{equation*}
            \Phi_{\#}\bc{m_V \otimes m_{V^\perp}} = m_n,
      \end{equation*}
      where $m_n$ is the Lebesgue measure on $\R^n$.

      Let $\Pb_X$ and $\Pb_Y$ be the distributions of $\bm{X}$ and $\bm{Y}$, respectively. Then $\mathrm{d}\Pb_X = p_X\mathrm{d}m_V$ and $\mathrm{d}\Pb_Y = p_Y\mathrm{d}m_{V^\perp}$. By the independence of $\bm{X}$ and $\bm{Y}$, we have
      \begin{equation*}
            \mathrm{d}\bc{\Pb_X \otimes \Pb_Y} = p_X(\bm{x})p_Y(\bm{y}) \mathrm{d}\bc{m_V(\bm{x}) \otimes m_{V^\perp}(\bm{y})}.
      \end{equation*}
      Since $\bm{Z} = \bm{X} + \bm{Y} = \Phi(\bm{X},\bm{Y})$, it follows that $\bm{Z} \sim \Pb_Z = \Phi_{\#}\bc{\Pb_X \otimes \Pb_Y}$, and thus
      \begin{align*}
            \Pb_Z(U) &= \int_{\R^n} \chi_U(\bm{z})\mathrm{d}\Pb_Z(\bm{z}) \\
            &= \int_{V \times V^\perp} \chi_U(\bm{x}+\bm{y}) \mathrm{d}\bc{\Pb_X \otimes \Pb_Y} \\
            &= \int_{V \times V^\perp} \chi_U(\bm{x}+\bm{y}) p_X(\bm{x})p_Y(\bm{y}) \mathrm{d}\bc{m_V(\bm{x}) \otimes m_{V^\perp}(\bm{y})} \\
            &= \int_{\R^n} \chi_U(\bm{z}) p_X(Q\bm{z})p_Y(Q^\perp\bm{z}) \mathrm{d} \Phi_{\#}\bc{m_V \otimes m_{V^\perp}}(\bm{z}) \\
            &= \int_{\R^n} \chi_U(\bm{z}) p_X(Q\bm{z})p_Y(Q^\perp\bm{z}) \mathrm{d}m_n(\bm{z}).
      \end{align*}
      Therefore, we have
      \begin{equation*}
          p_Z(\bm{z}) = p_X(Q\bm{z})p_Y(Q^\perp\bm{z}). \qedhere
      \end{equation*}
\end{proof}

\begin{lem}\label{lem:trans_browanian}
      Let $(\bm{W}_t)_{t \geq 0}$ be a standard Brownian motion on $\R^m$ and $A \in \mathcal{O}^{m \times n}$. Let
      \begin{equation*}
            \bm{B}_t \defeq A^\top \bm{W}_t.
      \end{equation*}
      Then $(\bm{B}_t)_{t \geq 0}$ is a standard Brownian motion on $\R^n$.
\end{lem}
\begin{proof}
      The path continuity of $t \mapsto \bm{B}_t = A^\top \bm{W}_t$ follows directly from the path continuity of $t \mapsto \bm{W}_t$, as does the independence of increments. The initial condition $\bm{B}_0 = A^\top \bm{W}_0 = 0$ is immediate. Moreover, since $A \in \mathcal{O}^{m \times n}$, we have
      \begin{equation*}
          \bm{B}_t - \bm{B}_s = A^\top(\bm{W}_t - \bm{W}_s) \sim \mathcal{N}(\bm{0},(t-s)A^\top A) = \mathcal{N}(\bm{0},(t-s)\bm{I}_n),\quad \forall~ t > s. \qedhere
      \end{equation*}
\end{proof}

\begin{lem}[\citet{jost2008riemannian}]\label{lem:change_var_mfd}
      For a function $g \colon \R^n \sto \R^m$, if $g \colon \R^n \sto \Img g$ is a diffeomorphism, that is, both $g$ and its inverse $g^{-1} \colon \Img g \sto \R^n$ are continuously differentiable, then $g_{\#} p_X$, the density function of $g_{\#} \Pb_X$ on $\Img g$ with respect to the canonical volume measure on $\Img g$, satisfies
      \begin{equation*}
            g_{\#} p_X (\bm{y}) = p_X(\bm{x})\abs{\det \bc{Jg(\bm{x})^\top Jg(\bm{x})}}^{-\frac{1}{2}},\quad \bm{x} = g^{-1}(\bm{y}).
      \end{equation*}
      Moreover, when $g(\bm{x}) = A\bm{x}$ for an $A \in \mathcal{O}^{m \times n}$, $A_{\#}p_X(\bm{y}) = p_X(A^\top\bm{y})$.
\end{lem}
\begin{rmk}
      This result is essentially a general form of the change-of-variables formula, which has been widely used in the context of generative models on manifolds (see, e.g., \citet{loaiza-ganem2024deep}). To rigorously justify this result, some basic knowledge of Riemannian geometry is required. Since $g \colon \R^n \sto \Img g$ is a diffeomorphism, the image $\Img g \subset \R^m$ is a submanifold. When $\Img g$ is equipped with the canonical Riemannian structure induced from the ambient Euclidean space $\R^m$, the canonical volume measure on $\Img g$ coincides with the Riemannian volume measure. Therefore, the relevant results from \citet[Section 1.4]{jost2008riemannian} can be applied to establish the desired formula rigorously.
\end{rmk}

\begin{lem}\label{lem:gaussunif}
      Let $\bm{X} \sim \mathcal{N}(\bm{0}, \bm{I}_n)$ with large $n$. Then, with high probability, $\bm{X}$ is approximately uniformly distributed on the sphere $\mathbb{S}^{n-1}(\sqrt{n})$, i.e., $\bm{X} \sim \op{Unif}(\mathbb{S}^{n-1}(\sqrt{n}))$.
\end{lem}
\begin{proof}
      Consider $\bm{Y} \defeq \frac{\bm{X}}{\norm*{\bm{X}}}$. We first show that $\bm{Y} \sim \op{Unif}(\mathbb{S}^{n-1})$. Note that $\mathbb{S}^{n-1}$ is a compact homogeneous space:
      \begin{equation*}
            \mathbb{S}^{n-1} \cong \op{SO}(n) / \op{SO}(n-1),
      \end{equation*}
      where $\op{SO}(n) \subset \R^{n\times n}$ denotes the special orthogonal group. Consider the natural action of $\op{SO}(n)$ on $\mathbb{S}^{n-1}$ given by $R \colon \mathbb{S}^{n-1} \to \mathbb{S}^{n-1}$, $\bm{x} \mapsto R\bm{x}$ for all $R \in \op{SO}(n)$. Then by the existence and uniqueness of Haar measure \citep[Theorem 2.49]{folland2016course}, $\op{Unif}(\mathbb{S}^{n-1})$ is the unique rotation-invariant probability measure on $\mathbb{S}^{n-1}$. Therefore, it is sufficient to prove that the distribution of $\bm{Y}$ is rotation-invariant, i.e., $\bm{Y} \stackrel{\mathrm{d}}{=} R\bm{Y}$ for all $R \in \op{SO}(n)$. 

      Since $\bm{X} \sim \mathcal{N}(\bm{0}, \bm{I}_n)$ and $R \in \op{SO}(n)$, we have $R\bm{X} \sim \mathcal{N}(\bm{0}, \bm{I}_n)$ and $\norm*{R\bm{X}} = \norm*{\bm{X}}$. Hence,
      \begin{equation*}
            \bm{Y} = \frac{\bm{X}}{\norm*{\bm{X}}} \stackrel{\mathrm{d}}{=} \frac{R\bm{X}}{\norm*{R\bm{X}}} = R\bm{Y},
      \end{equation*}
      which implies that $\bm{Y} \sim \op{Unif}(\mathbb{S}^{n-1})$. Similarly, by the uniqueness of the invariant measure,
      \begin{equation*}
            \sqrt{n}\bm{Y} = \frac{\sqrt{n}}{\norm*{\bm{X}}}\bm{X} \sim \op{Unif}(\mathbb{S}^{n-1}(\sqrt{n})).
      \end{equation*}
      Moreover, as shown in the proof of Proposition \ref{prop:data_mfd_linear}, the Laurent--Massart bound implies that $\norm*{\bm{X}} \approx \sqrt{n}$ with high probability when $n$ is large. Therefore,
      \begin{equation*}
          \bm{X} \approx \frac{\sqrt{n}}{\norm*{\bm{X}}}\bm{X} \sim \op{Unif}(\mathbb{S}^{n-1}(\sqrt{n})). \qedhere
      \end{equation*}
\end{proof}

\begin{lem}[Corollary 4.8.2 of \citet{bakry2013analysis}]\label{lem:poincare}
    Let $U \colon \R^n \sto \R$ be $C^2$ such that $\nabla^2 U \succeq \rho \bm{I}_n$ for some $\rho > 0$. Then the probability measure
    \begin{equation*}
        \mathrm{d}\mu(\bm{x}) = \frac{e^{-U(\bm{x})}}{\int e^{-U(\bm{y})}\mathrm{d}\bm{y}}\mathrm{d}\bm{x}
    \end{equation*}
    on $\R^n$ satisfies the Poincar\'e Inequality with the constant ${1}/{\rho}$.
\end{lem}

\begin{lem}\label{lem:pushcondprob}
      Let $\mu,\nu \in \mathcal{P}(\R^n)$ be two probability measures, and let $f \colon \R^n \sto \R^m$ be measurable. Then
      \begin{equation*}
            f_{\#}(w_1\mu + w_2 \nu) = w_1 f_{\#}\mu + w_2f_{\#}\nu,
      \end{equation*} 
      for any $w_1,w_2 \in [0,1]$ with $w_1 + w_2 = 1$.
\end{lem}
\begin{proof}
      For any $A \in \mathcal{B}(\R^m)$,
      \begin{align*}
            f_{\#}(w_1\mu + w_2 \nu)(A) &= (w_1\mu + w_2 \nu)\bc{f^{-1}(A)} \\
            &= w_1\mu\bc{f^{-1}(A)} +  w_2 \nu\bc{f^{-1}(A)} \\
            &= w_1 f_{\#}\mu(A) + w_2f_{\#}\nu(A). \qedhere
      \end{align*}
\end{proof}

\begin{lem}\label{lem:l2boundl1}
      Let $\mu$ be a probability measure on $\R^n$. Then, we have
      \begin{equation*}
            \E_{\bm{X}\sim \mu}\bj{\norm*{\bm{X}}} \leq \sqrt{\E_{\bm{X}\sim \mu}\bj{\norm*{\bm{X}}^2}}.
      \end{equation*}
      In particular, if $\mu =\mathcal{N}(\bm{0},\bm{I}_n)$,
      \begin{equation*}
            \E\bj{\norm*{\bm{X}}} \leq \sqrt{n}.
      \end{equation*}
\end{lem}
\begin{proof}
      Because $\mu$ is a probability measure, by H\"older's inequality,
      \begin{equation*}
            \int_{\R^n} \norm*{\bm{x}}\cdot 1 \mathrm{d}\mu(\bm{x}) \leq \bc{\int_{\R^n} \norm*{\bm{x}}^2 \mathrm{d}\mu(\bm{x})}^{\frac{1}{2}}\bc{\int_{\R^n} 1 \mathrm{d}\mu(\bm{x})}^{\frac{1}{2}},
      \end{equation*}
      that is, $\E_{\bm{X}\sim \mu}\bj{\norm*{\bm{X}}} \leq \sqrt{\E_{\bm{X}\sim \mu}\bj{\norm*{\bm{X}}^2}}$. In particular, if $\mu =\mathcal{N}(\bm{0},\bm{I}_n)$, $\E[\norm*{\bm{X}}^2] = n$. \qedhere
\end{proof}

\begin{lem}\label{lem:convexofwasser}
      Let $\mu_1,\mu_2,\nu_1,\nu_2$ be probability measures on $\R^n$, and let
      \begin{equation*}
            \mu = w \mu_1 + (1-w) \mu_2,\quad \nu = w \nu_1 + (1-w) \nu_2,\quad w \in [0,1].
      \end{equation*}
      Then, we have
      \begin{equation*}
            \mathcal{W}_1(\mu,\nu) \leq w \mathcal{W}_1(\mu_1,\nu_1) + (1-w)\mathcal{W}_1(\mu_2,\nu_2).
      \end{equation*}
\end{lem}
\begin{proof}
      By the existence of optimal coupling on $\R^n$ \citep{chewi2024statistical}, there is a $\gamma_i \in \Gamma(\mu_i,\nu_i)$ for $i=1,2$ such that
      \begin{equation*}
            \mathcal{W}_1(\mu_i,\nu_i) = \int_{\R^n \times \R^n}\norm*{\bm{x}-\bm{y}}\mathrm{d}\gamma_i(\bm{x},\bm{y}).
      \end{equation*}
      Let
      \begin{equation*}
            \pi = w \gamma_1 + (1-w) \gamma_2.
      \end{equation*}
      Clearly, $\pi$ is a probability measure on $\R^n \times \R^n$. Moreover, by definition,
      \begin{align*}
            \pi(A \times \R^n) &= w \gamma_1(A \times \R^n) + (1-w) \gamma_2(A \times \R^n) = w \mu_1(A) + (1-w) \mu_2(A) = \mu(A),\\
            \pi(\R^n \times B) &= w \gamma_1(\R^n \times B) + (1-w) \gamma_2(\R^n \times B) = w \nu_1(B) + (1-w) \nu_2(B) = \nu(B),
      \end{align*}
      which means $\pi \in \Gamma(\mu,\nu)$. Therefore,
      \begin{align*}
            \mathcal{W}_1(\mu,\nu) &\leq \int_{\R^n \times \R^n}\norm*{\bm{x}-\bm{y}}\mathrm{d}\pi(\bm{x},\bm{y})\\
            &= w \int_{\R^n \times \R^n}\norm*{\bm{x}-\bm{y}}\mathrm{d}\gamma_1(\bm{x},\bm{y}) + (1-w)\int_{\R^n \times \R^n}\norm*{\bm{x}-\bm{y}}\mathrm{d}\gamma_2(\bm{x},\bm{y})\\
            &= w \mathcal{W}_1(\mu_1,\nu_1) + (1-w)\mathcal{W}_1(\mu_2,\nu_2). \qedhere
      \end{align*}
\end{proof}

\begin{lem}\label{lem:expeconv}
      Let $\mu \in \mathcal{P}(\R^n)$ be a probability measure such that its support $K$ is closed and convex. Then
      \begin{equation*}
            \E_{\bm{X} \sim \mu}[\bm{X}] \in K.
      \end{equation*}
\end{lem}
\begin{proof}
      Suppose that $\bm{m} = \E_{\bm{X} \sim \mu}[\bm{X}] \notin K$. By the convexity and closedness of $K$, the strong separation theorem \citep{rockafellar1997convex} implies that there are $\bm{u} \in \R^n \backslash \bb{0}$ and $c \in \R$ such that $\inn{\bm{u},\bm{m}} > c$ and
      \begin{equation*}
            \inn{\bm{u},\bm{x}} \leq c,~ \forall~\bm{x} \in K.
      \end{equation*}
      Let $\bm{X} \sim \mu$. Then $\bm{X} \in K$ almost surely, and so
      \begin{equation*}
            \inn{\bm{u},\bm{X}} \leq c\quad \text{a.s.}
      \end{equation*}
      Then, taking the expectation on both sides, we have
      \begin{equation*}
            \inn{\bm{u},\bm{m}} \leq c,
      \end{equation*}
      which induces a contradiction.
\end{proof}

\begin{lem}[Gr\"onwall's Inequality]\label{lem:gronwall}
      If a differentiable function $u \colon [0,T] \sto \R$ satisfies the linear differential inequality
      \begin{equation*}
            \frac{\mathrm{d}}{\mathrm{d}t}u(t) \leq a(t)u(t) + b(t),
      \end{equation*}
      then
      \begin{equation*}
            u(t) \leq u(0)e^{\int_0^ta(r)\mathrm{d}r} + \int_0^tb(s)e^{\int_s^ta(r)\mathrm{d}r}\mathrm{d}s.
      \end{equation*}
\end{lem}
\begin{proof}
      Let $\Phi(t) = \exp\bc{-\int_0^ta(s)\mathrm{d}s}$. Then, $\Phi^\prime(t) = -a(t)\Phi(t)$ and
      \begin{equation*}
            \Phi(t)\frac{\mathrm{d}}{\mathrm{d}t}u(t) \leq \Phi(t)a(t)u(t) + \Phi(t)b(t) ~\Rightarrow~ \frac{\mathrm{d}}{\mathrm{d}t}\bc{\Phi(t)u(t)} \leq \Phi(t)b(t).
      \end{equation*}
      Integrating both sides of the above inequality over $[0,t]$ and dividing by $\Phi(t) > 0$, we have
      \begin{equation*}
            u(t) \leq u(0)e^{\int_0^ta(r)\mathrm{d}r} + \int_0^tb(s)e^{\int_s^ta(r)\mathrm{d}r}\mathrm{d}s. \qedhere
      \end{equation*}
\end{proof}

\begin{lem}\label{lem:convex_pl}
      If a $C^1$ function $f \colon \R^n \sto \R$ is $\rho$-strongly convex, then it satisfies the $\rho$-Polyak--\L{}ojasiewicz (PL) inequality:
      \begin{equation*}
            \norm*{\nabla_{\bm{x}}f(\bm{x})}^2 \geq 2\rho\bc{f(\bm{x}) - f(\bm{x}_*)},
      \end{equation*}
      where $\bm{x}_*$ is the unique minimizer of $f$.
\end{lem}
\begin{proof}
      Because $f$ is $\rho$-strongly convex,
      \begin{equation*}
            f(\bm{y}) \geq f(\bm{x}) + \inn{\nabla_{\bm{x}}f(\bm{x}),\bm{y} - \bm{x}} + \frac{\rho}{2}\norm{\bm{y}-\bm{x}}^2.
      \end{equation*}
      Minimizing both sides with respect to $\bm{y}$, we obtain
      \begin{equation*}
            f(\bm{x}_*) \geq f(\bm{x}) - \frac{1}{2\rho}\norm*{\nabla_{\bm{x}}f(\bm{x})}^2,
      \end{equation*}
      which is precisely the $\rho$-PL inequality. \qedhere
\end{proof}

\begin{lem}\label{lem:pushwasserstein}
      Let $f \colon \R^k \rightarrow \R^n$ be $L$-Lipschitz continuous. For two probability measures $\mu,\nu \in \mathcal{P}(\R^k)$,
      \begin{equation*}
            \mathcal{W}_1(f_{\#}\mu,f_{\#}\nu) \leq L\mathcal{W}_1(\mu,\nu).
      \end{equation*}
\end{lem}
\begin{proof}
      Let $(\bm{X},\bm{Y})$ be an optimal coupling for $(\mu,\nu)$, that is, $\bm{X} \sim \mu$, $\bm{Y} \sim \nu$, and $\mathcal{W}_1(\mu,\nu) = \E[\norm{\bm{X}-\bm{Y}}]$. Besides, $f(\bm{X}) \sim f_{\#}\mu$ and $f(\bm{Y}) \sim f_{\#}\nu$. Then, by the Lipschitz continuity of $f$,
      \begin{align*}
            \mathcal{W}_1(f_{\#}\mu,f_{\#}\nu) &\leq \E\bj{\norm{f(\bm{X})-f(\bm{Y})}}\\
            &\leq L\E\bj{\norm{\bm{X}-\bm{Y}}}\\
            &= L\mathcal{W}_1(\mu,\nu). \qedhere
      \end{align*}
\end{proof}

\section{Preliminaries for Manifold}\label{appen:preliminaries_of_manifolds}

We provide only the minimal background on smooth manifolds necessary for this work. For a comprehensive treatment, we refer the reader to \citet{lee2012smooth}.

\begin{defn}
      A subset $\mathcal{M} \subset \R^n$ is called an $m$-dimensional (embedded) (sub)manifold of $\R^n$ if there are a family of open sets $\bb{U_\alpha}_{\alpha \in \Gamma}$ in $\R^n$, a family of open sets $\bb{V_\alpha}_{\alpha \in \Gamma}$ in $\R^m$, and a family of smooth ($C^\infty$) maps $\bb{\phi_\alpha}_{\alpha \in \Gamma}$ such that
      \begin{equation*}
            \mathcal{M} \subset \bigcup_{\alpha \in \Gamma} U_\alpha,\text{ and }\phi_\alpha \colon V_\alpha \rightarrow U_\alpha \cap \mathcal{M}
      \end{equation*}
      is a diffeomorphism, i.e., $\phi_\alpha^{-1} \colon U_\alpha \cap \mathcal{M} \sto V_\alpha$ is also smooth.
\end{defn}

Each pair $(\phi_\alpha,V_\alpha)$ is called a chart, and $\bb{(\phi_\alpha,V_\alpha)}_{\alpha \in \Gamma}$ is called an atlas of $\mathcal{M}$. In general, a single chart cannot cover the entire manifold $\mathcal{M}$. However, if $\mathcal{M}$ is closed, then there exists a chart $\phi \colon V \sto \mathcal{M}$ that can almost cover $\mathcal{M}$, in the sense that the volume measure of the set $\mathcal{M} \setminus \phi(V)$ is zero; see \citet{lee2019introduction} for more details.

\begin{defn}
      Let $\mathcal{M} \subset \R^n$ be an $m$-dimensional manifold. For any $\bm{x} \in \mathcal{M}$, the tangent space, denoted $T_{\bm{x}}\mathcal{M}$, is a vector space defined as
      \begin{equation*}
          T_{\bm{x}}\mathcal{M} \defeq \bb{\gamma^\prime(0) \colon \exists~\varepsilon>0,~ \gamma \colon (-\varepsilon,\varepsilon) \sto \mathcal{M} \text{ smooth, } \gamma(0) = \bm{x}}.
      \end{equation*}
\end{defn}

\begin{lem}\label{lem:normal_mfd_const}
      Let $\mathcal{M} \subset \R^n$ be a smooth submanifold. If a $C^1$ function $g \colon \R^n \sto \R$ is constant on $\mathcal{M}$, then for any $\bm{x} \in \mathcal{M}$, $\nabla g(\bm{x})$ is normal to $\mathcal{M}$; that is, $\nabla g(\bm{x}) \perp T_{\bm{x}}\mathcal{M}$.
\end{lem}
\begin{proof}
      For any $\bm{v} \in T_{\bm{x}}\mathcal{M}$, let $\gamma \colon (-\varepsilon,\varepsilon) \sto \mathcal{M}$ be a smooth curve such that $\gamma(0) = \bm{x}$ and $\gamma^\prime(0) = \bm{v}$. Then, because $g(\gamma(t)) \equiv c$ for some constant $c \in \R$,
      \begin{equation*}
            0 = \lv{\frac{\mathrm{d}}{\mathrm{d}t}}_{t=0}g(\gamma(t)) = \inn{\nabla g(\gamma(0)),\gamma^\prime(0)} = \inn{\nabla g(\bm{x}),\bm{v}}.
      \end{equation*}
      Therefore, $\nabla g(\bm{x}) \perp T_{\bm{x}}\mathcal{M}$.
\end{proof}

\begin{thm}[Constant Rank Theorem \citep{lee2012smooth}]\label{lem:const_rank}
      Let $f \colon \R^n \sto \R^r$ be a smooth map and $\bm{c} \in \R^r$. Let
      \begin{equation*}
        \mathcal{M} \defeq \bb{\bm{x} \in \R^n \colon f(\bm{x}) = \bm{c}}.
      \end{equation*}
      If $\rank Jf(\bm{x}) = r$ for any $\bm{x} \in \mathcal{M}$, then $\mathcal{M}$ is an $(n-r)$-dimensional manifold.
\end{thm}

\begin{thm}[Tubular Neighborhood Theorem \citep{lee2012smooth}]\label{thm:tubular}
      Let $\mathcal{M} \subset \R^D$ be a $d$-dimensional submanifold. There is a smooth $\varepsilon \colon \mathcal{M} \sto (0,\infty)$ such that for
      \begin{equation*}
            V \defeq \bb{(\bm{z},\bm{v}) \in \mathcal{M} \times \R^{D-d} \colon \norm{\bm{v}} < \varepsilon(\bm{z})},
      \end{equation*}
      $F \colon V \sto U = F(V)$ is a diffeomorphism and $U \subset \R^D$ is a neighborhood of $\mathcal{M}$.
\end{thm}
\begin{rmk}
      For a given tubular neighborhood $V$ of $\mathcal{M}$, we also call $U = F(V) \subset \R^D$ a tubular neighborhood of $\mathcal{M}$ in $\R^D$. Moreover, we can define the corresponding orthogonal projection $\pi \colon U \sto \mathcal{M}$ as
      \begin{equation*}
            \pi(\bm{x}) = \pi_1(F^{-1}(\bm{x})),
      \end{equation*}
      where $\pi_1 \colon V \sto \mathcal{M}$ is $\pi_1(\bm{z},\bm{v}) = \bm{z}$.
\end{rmk}

\end{document}