We treat the diffusion model purely as a \emph{parametric, differentiable transition model}. From the perspective of the control objective, the entire model reduces to the stochastic map
\begin{align*}
s_{t+1} = f_{\theta}(s_t, a_t, \varepsilon_t),
\quad \varepsilon_t \sim p_0(\varepsilon),
\end{align*}
where $f_{\theta}$ is the \emph{reverse diffusion sampler}, written as a deterministic computation graph, and $\varepsilon_t$ is standard Gaussian noise. Equivalently, the diffusion model specifies a conditional distribution over next states,
\begin{align*}
p_{\theta}(s_{t+1}\mid s_t,a_t),
\end{align*}
together with a reparameterized sampling procedure $s_{t+1}=f_{\theta}(s_t,a_t,\varepsilon_t)$ whose randomness is isolated in $\varepsilon_t$.

\subsection{Reverse diffusion sampler as a computation graph}

We learn $p_{\theta}(s_{t+1}\mid s_t,a_t)$ from an offline dataset of transitions
\begin{align*}
\mathcal{D} = \{(s_t,a_t,r_t,s_{t+1})\}_{t=1}^N.
\end{align*}
The diffusion model is trained to represent the conditional law of $s_{t+1}$ given $(s_t,a_t)$ by introducing an auxiliary \emph{forward noising process} on $s_{t+1}$ and a learned \emph{reverse denoising process} that inverts this noising when conditioned on $(s_t,a_t)$. At sampling time, the reverse process induces the map $f_{\theta}$.

\subsubsection{Forward process}

Fix a diffusion horizon $K\in\mathbb{N}$ and a noise schedule $\{\alpha_k\}_{k=1}^K \subset (0,1)$, where $1-\alpha_k$ is the variance of the noise injected at step $k$. For each transition $(s_t,a_t,r_t,s_{t+1})\in\mathcal{D}$, define a Markovian forward process that progressively corrupts the next state:
\begin{align*}
s_{t+1}^{(0)} &= s_{t+1}, \\
s_{t+1}^{(k)} \mid s_{t+1}^{(k-1)}
&\sim q\big(s_{t+1}^{(k)} \mid s_{t+1}^{(k-1)}\big)
= \mathcal{N}\big(\sqrt{\alpha_k}\, s_{t+1}^{(k-1)}, (1-\alpha_k) I\big),
\qquad k=1,\dots,K.
\end{align*}
Let $\bar\alpha_k := \prod_{j=1}^k \alpha_j$. This choice implies a closed-form marginal for any noise level $k$:
\begin{align*}
q\big(s_{t+1}^{(k)} \mid s_{t+1}\big)
&=
\mathcal{N}\big(\sqrt{\bar\alpha_k}\, s_{t+1}, (1-\bar\alpha_k)I\big).
\end{align*}
In particular, one can write a reparameterized sample from the marginal as
\begin{align*}
s_{t+1}^{(k)} = \sqrt{\bar\alpha_k}\, s_{t+1} + \sqrt{1-\bar\alpha_k}\,\epsilon,
\qquad \epsilon \sim \mathcal{N}(0,I).
\end{align*}

\subsubsection{Conditional reverse (generative) process}

The reverse process aims to sample $s_{t+1}$ conditioned on $(s_t,a_t)$ by iteratively denoising from a Gaussian reference distribution at level $K$. Let the conditioning be
\begin{align*}
c_t := (s_t,a_t).
\end{align*}
At sampling time, initialize from a base noise variable
\begin{align*}
s_{t+1}^{(K)} = z_K, \qquad z_K \sim \mathcal{N}(0,I),
\end{align*}
and then apply a learned reverse transition for $k=K,\dots,1$:
\begin{align*}
s_{t+1}^{(k-1)}
= \frac{1}{\sqrt{\alpha_k}}
  \Big(
    s_{t+1}^{(k)}
    - \frac{1-\alpha_k}{\sqrt{1-\bar\alpha_k}}\, \hat\epsilon_{\theta}\big(s_{t+1}^{(k)}, k, c_t\big)
  \Big)
  + \sigma_k z_{k-1},
\qquad z_{k-1}\sim \mathcal{N}(0,I).
\end{align*}
Here $\hat\epsilon_{\theta}(\cdot,k,c_t)$ is a parametric predictor of the noise component at level $k$ (equivalently, a score-related parameterization), and $\{\sigma_k\}_{k=1}^K$ specifies the standard deviation of the noise injected by the reverse process at each step. The final denoised sample is
\begin{align*}
s_{t+1}^{(0)} \sim p_{\theta}(\cdot \mid s_t,a_t),
\end{align*}
and we define the reverse sampler map $f_{\theta}$ by collecting all Gaussian random variables used by the reverse procedure into
\begin{align*}
\varepsilon_t := (z_K,z_{K-1},\dots,z_0),
\end{align*}
so that the sampled next state can be written as
\begin{align*}
s_{t+1}^{(0)} = f_{\theta}(s_t,a_t,\varepsilon_t).
\end{align*}

\paragraph{Deterministic computation graph for fixed noise.}
For any fixed realization of $\varepsilon_t$, the mapping $(s_t,a_t)\mapsto s_{t+1}^{(0)}$ is a deterministic composition of (i) linear operations, (ii) evaluations of the denoiser $\hat\epsilon_{\theta}(\cdot,k,c_t)$ at each reverse step, and (iii) additive terms determined by the fixed Gaussian draws. Consequently, provided the denoiser $\hat\epsilon_{\theta}$ is differentiable in its noisy-state and conditioning inputs, $f_{\theta}$ is a differentiable computation graph in its inputs $(s_t,a_t)$ for fixed $\varepsilon_t$.

\subsubsection{Learning objective (conditional diffusion for one-step transitions)}

We train $\hat\epsilon_{\theta}$ to invert the forward corruption of $s_{t+1}$, conditioned on $(s_t,a_t)$. Using the marginal reparameterization at a randomly chosen diffusion level $k$, we form
\begin{align*}
s_{t+1}^{(k)} = \sqrt{\bar\alpha_k}\, s_{t+1} + \sqrt{1-\bar\alpha_k}\,\epsilon,
\qquad \epsilon \sim \mathcal{N}(0,I),
\end{align*}
and minimize the conditional noise-prediction error
\begin{align*}
\mathcal{L}(\theta)
=
\mathbb{E}_{(s_t,a_t,s_{t+1})\sim \mathcal{D}}
\,
\mathbb{E}_{k \sim \mathrm{Unif}(\{1,\dots,K\})}
\,
\mathbb{E}_{\epsilon \sim \mathcal{N}(0,I)}
\Big[
\big\|
\epsilon - \hat\epsilon_{\theta}\big(s_{t+1}^{(k)}, k, (s_t,a_t)\big)
\big\|_2^2
\Big].
\end{align*}
Intuitively, this objective teaches the denoiser to recover the injected Gaussian noise at arbitrary noise levels while leveraging $(s_t,a_t)$ as side information. After training, the resulting reverse process defines a conditional generative model $p_{\theta}(s_{t+1}\mid s_t,a_t)$, and its sampling procedure is precisely the transition map
\begin{align*}
s_{t+1} = f_{\theta}(s_t,a_t,\varepsilon_t),
\qquad \varepsilon_t \sim p_0(\varepsilon),
\end{align*}
which we treat as a learned, differentiable simulator of one-step dynamics.


\subsection{Embedding the sampler into the (reward-maximization) control objective}

Define the one-step lookahead objective at state $s_t$:
\begin{align*}
J_t(a)
=
\mathbb{E}_{\varepsilon_t}
\Big[ r(s_t,a) + \gamma\, V_{\phi}\big(f_{\theta}(s_t,a,\varepsilon_t)\big) \Big],
\end{align*}
where $r(s,a)$ denotes the immediate reward (or an immediate reward predictor), $V_{\phi}$ is a learned state-value function (the critic) with parameters $\phi$, and $\gamma \in (0,1)$ is the discount factor.

The reparameterization principle is:
\begin{itemize}
  \item Instead of sampling $s_{t+1} \sim p_{\theta}(\cdot\mid s_t,a)$ directly, sample $\varepsilon_t \sim p_0(\varepsilon)$ and set $s_{t+1} = f_{\theta}(s_t,a,\varepsilon_t)$.
  \item Then the expectation is over $\varepsilon_t$ only, with everything else deterministic.
\end{itemize}

A Monte Carlo estimator with $M$ samples is
\begin{align*}
\hat J_t(a)
=
\frac1M \sum_{m=1}^M
\Big[
r(s_t,a) + \gamma\, V_{\phi}\big( f_{\theta}(s_t,a,\varepsilon_t^{(m)}) \big)
\Big],
\end{align*}
where $\varepsilon_t^{(m)} \sim p_0(\varepsilon)$.

All terms inside the sum are differentiable functions of $a$, because:
\begin{enumerate}
  \item $r(s_t,a)$ is differentiable in $a$,
  \item $V_{\phi}(\cdot)$ is a neural network in $s$, differentiable,
  \item $f_{\theta}(s_t,a,\varepsilon_t^{(m)})$ is the unrolled diffusion sampler, differentiable in $a$.
\end{enumerate}

\subsection{Backpropagation through the diffusion sampler and the critic}

Consider one Monte Carlo sample and denote
\begin{align*}
{}^{(m)}s_{t+1}(a)
= f_{\theta}(s_t,a,\varepsilon_t^{(m)}).
\end{align*}

Define the per-sample objective
\begin{align*}
\ell^{(m)}(a)
= r(s_t,a) + \gamma\, V_{\phi}\big( {}^{(m)}s_{t+1}(a) \big).
\end{align*}

The gradient of $\ell^{(m)}$ with respect to $a$ is
\begin{align*}
\nabla_a \ell^{(m)}(a)
=
\nabla_a r(s_t,a)
+
\gamma\, \underbrace{\nabla_s V_{\phi}\big( {}^{(m)}s_{t+1}(a) \big)}_{\text{critic gradient}}
\underbrace{\nabla_a {}^{(m)}s_{t+1}(a)}_{\text{sampler sensitivity}}.
\end{align*}

The sampler sensitivity $\nabla_a {}^{(m)}s_{t+1}(a)$ is obtained by differentiating through all reverse diffusion steps.

\subsubsection{Chain rule through the reverse diffusion steps}

Write each reverse diffusion step as a deterministic function. For step $k$, the DDPM-style reverse update can be written as
\begin{align*}
s_{t+1}^{(k-1)}
= h_k\big(s_{t+1}^{(k)}, s_t, a, z_{k-1}\big),
\end{align*}
where $h_k$ collects all operations performed at reverse step $k$. Concretely, for the update
\begin{align*}
s_{t+1}^{(k-1)}
&=
\frac{1}{\sqrt{\alpha_k}}
\Big(
s_{t+1}^{(k)}
-
\frac{1-\alpha_k}{\sqrt{1-\bar\alpha_k}}\,\hat\epsilon_{\theta}\big(s_{t+1}^{(k)}, k, c_t\big)
\Big)
+
\sigma_k z_{k-1},
\\
c_t &= (s_t, a),
\end{align*}
we define
\begin{align*}
h_k(s, s_t, a, z)
=
\frac{1}{\sqrt{\alpha_k}}
\Big(
s
-
\frac{1-\alpha_k}{\sqrt{1-\bar\alpha_k}}\,\hat\epsilon_{\theta}(s, k, (s_t, a))
\Big)
+
\sigma_k z.
\end{align*}
Here, $s$ denotes the noisy state at level $k$, $s_t$ is the current environment state, $a$ is the action, and $z$ is the Gaussian noise used at step $k$; $\hat\epsilon_{\theta}$ is the noise-prediction network.

For a fixed noise realization $\varepsilon_t = (z_K,\dots,z_0)$ and fixed state $s_t$, the dependence of $s_{t+1}^{(k)}$ on $a$ is entirely through the recursive application of the functions $h_k$. To track this dependence, define the sensitivity matrices
\begin{align*}
S_k(a)
:= \nabla_a s_{t+1}^{(k)}(a) \in \mathbb{R}^{d_s \times d_a},
\end{align*}
where $d_s$ is the state dimension and $d_a$ is the action dimension. At the top diffusion level,
\begin{align*}
s_{t+1}^{(K)} = z_K,
\end{align*}
which does not depend on $a$, so the initial sensitivity is
\begin{align*}
S_K(a)
= \nabla_a s_{t+1}^{(K)}(a)
= 0.
\end{align*}

For $k = K, \dots, 1$, the next state in the reverse chain is
\begin{align*}
s_{t+1}^{(k-1)}(a)
= h_k\big(s_{t+1}^{(k)}(a), s_t, a, z_{k-1}\big).
\end{align*}
Applying the multivariate chain rule to this composition yields
\begin{align*}
S_{k-1}(a)
&= \nabla_a s_{t+1}^{(k-1)}(a) \\
&= \frac{\partial h_k}{\partial s}\big(s_{t+1}^{(k)}(a), s_t, a, z_{k-1}\big)\,
   \underbrace{\nabla_a s_{t+1}^{(k)}(a)}_{S_k(a)} \\
   & \qquad \qquad + 
   \frac{\partial h_k}{\partial a}\big(s_{t+1}^{(k)}(a), s_t, a, z_{k-1}\big).
\end{align*}
The Jacobian $\partial h_k / \partial s$ captures how the reverse update at step $k$ responds to changes in the noisy state $s_{t+1}^{(k)}$, while $\partial h_k / \partial a$ captures the direct dependence of the update on the action $a$ through the conditioning $c_t = (s_t, a)$. Both Jacobians are well-defined and can be obtained automatically in a deep learning framework.

Iterating this recurrence from $k = K$ down to $k = 1$ produces
\begin{align*}
S_0(a)
= \nabla_a s_{t+1}^{(0)}(a)
= \nabla_a f_{\theta}(s_t, a, \varepsilon_t),
\end{align*}
which is precisely the sampler sensitivity needed to compute the gradient of the one-step objective with respect to the action.

\subsection{Multistep unrolled diffusion and backpropagation through time}

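In this subsection, $k$ denotes the environment time index (not the diffusion level) and $\ell$ denotes the lookahead horizon (not the per-sample objective used earlier). For an $\ell$-step lookahead with terminal value, define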
\begin{align*}
s_{k+1} = f_{\theta}(s_k,a_k,\varepsilon_k),
\end{align*}
\begin{align*}
s_{k+2} = f_{\theta}(s_{k+1},a_{k+1},\varepsilon_{k+1}),
\end{align*}
\begin{align*}
\dots,
\quad
s_{k+\ell} = f_{\theta}(s_{k+\ell-1},a_{k+\ell-1},\varepsilon_{k+\ell-1}),
\end{align*}
and the $\ell$-step reward objective
\begin{align*}
J_k(\mathbf{a})
=
\mathbb{E}\left[
r(s_k,a_k)
+ \sum_{i=k+1}^{k+\ell-1} \gamma^{i-k} r(s_i,a_i)
+ \gamma^{\ell} V_{\phi}(s_{k+\ell})
\right],
\end{align*}
where $\mathbf{a} = (a_k,\dots,a_{k+\ell-1})$.

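For a single Monte Carlo noise sequence $\varepsilon^{(m)} = (\varepsilon_k^{(m)},\dots,\varepsilon_{k+\ell-1}^{(m)})$, where the superscript $(m)$ indexes the sample and $s_k^{(m)} = s_k$ is the given initial state, the computational graph is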
\begin{align*}
\mathbf{a}
\rightarrow s_{k+1}^{(m)} \rightarrow s_{k+2}^{(m)} \rightarrow \dots \rightarrow s_{k+\ell}^{(m)}
\rightarrow V_{\phi}(s_{k+\ell}^{(m)}).
\end{align*}

Each transition $s_{i+1}^{(m)} = f_{\theta}(s_i^{(m)}, a_i, \varepsilon_i^{(m)})$ is itself a multi-step reverse diffusion graph. The overall structure is a \emph{nested composition} of diffusion steps across time and across diffusion depth.

The gradient of $J_k$ with respect to the action sequence $\mathbf{a} = (a_k,\dots,a_{k+\ell-1})$ is computed by backpropagation through time:
\begin{itemize}
  \item Start from the final scalar objective
  \begin{align*}
  L^{(m)}(\mathbf{a})
  =
  r(s_k^{(m)},a_k)
  + \sum_{i=k+1}^{k+\ell-1} \gamma^{i-k} r(s_i^{(m)},a_i)
  + \gamma^{\ell} V_{\phi}(s_{k+\ell}^{(m)}).
  \end{align*}
  \item Autograd computes the derivatives
  $\nabla_{a_j} L^{(m)}(\mathbf{a})$ by:
  \begin{itemize}
    \item differentiating $V_{\phi}(s_{k+\ell}^{(m)})$ with respect to $s_{k+\ell}^{(m)}$,
    \item propagating gradients backward through each $f_{\theta}$ step to earlier states and actions,
    \item accumulating contributions from the running rewards $r(s_i^{(m)},a_i)$.
  \end{itemize}
\end{itemize}

The only requirement is that every component (the diffusion sampler, the reward function, and the critic) is differentiable. The automatic differentiation engine of the deep learning framework then handles the chain-rule composition across:
\begin{enumerate}
  \item diffusion depth (reverse steps inside each $f_{\theta}$),
  \item time horizon (sequence of transitions),
  \item critic evaluation at the terminal state.
\end{enumerate}
