We now describe a receding-horizon model predictive control (MPC) procedure that uses the learned diffusion world model as a differentiable simulator. The world model is given by the reparameterized transition map
\begin{align*}
s_{t+1} = f_{\theta}(s_t,a_t,\varepsilon_t),
\qquad \varepsilon_t \sim p_0(\varepsilon),
\end{align*}
where $f_{\theta}$ denotes the reverse diffusion sampler unrolled as a computation graph. We combine this dynamics model with a learned reward predictor $r_{\xi}(s,a)$ and a learned terminal critic $Q_{\phi}(s,a)$. The role of the critic is to provide a terminal value for truncated rollouts, while $r_{\xi}$ supplies the immediate reward along imagined trajectories.

\subsection{Receding-horizon objective under the diffusion dynamics}

At a real environment state $s_t$, MPC optimizes a horizon-$H$ reward objective over imagined trajectories generated by $f_{\theta}$. Given a fixed noise sequence $\varepsilon_{t:t+H-1}=(\varepsilon_t,\dots,\varepsilon_{t+H-1})$, define the imagined rollout recursively by
\begin{align*}
\tilde{s}_0 &= s_t, \\
\tilde{a}_h &= \pi_{\psi}(\tilde{s}_h), \\
\tilde{s}_{h+1} &= f_{\theta}(\tilde{s}_h,\tilde{a}_h,\varepsilon_{t+h}),
\qquad h=0,\dots,H-1.
\end{align*}
We evaluate the finite-horizon return with a terminal critic as
\begin{align*}
L(\psi;\varepsilon_{t:t+H-1})
=
\sum_{h=0}^{H-1} \gamma^h\, r_{\xi}(\tilde{s}_h,\tilde{a}_h)
+
\gamma^H\, Q_{\phi}\big(\tilde{s}_H,\pi_{\psi}(\tilde{s}_H)\big),
\end{align*}
where $\gamma\in(0,1)$ is the discount factor. The MPC objective is the expectation over the diffusion noise,
\begin{align*}
J_t(\psi)
=
\mathbb{E}_{\varepsilon_{t:t+H-1}\sim p_0}
\Big[
L(\psi;\varepsilon_{t:t+H-1})
\Big].
\end{align*}
We approximate this expectation by Monte Carlo using $K$ imagined rollouts (Algorithm~\ref{alg:mpc_diffusion_world_model}, Lines~\ref{line:mpc_common_noise_flag}--\ref{line:mpc_noise_sample_fresh}).

\subsection{Monte Carlo estimator and common random numbers}

Let $\{\varepsilon_{t:t+H-1}^{(i)}\}_{i=1}^K$ be i.i.d.\ draws from $p_0$. For each $i$, generate an imagined rollout with states $\{\tilde{s}_h^{(i)}\}_{h=0}^{H}$ and actions $\{\tilde{a}_h^{(i)}\}_{h=0}^{H-1}$ by applying, for $h=0,\dots,H-1$,
\begin{align*}
\tilde{s}_0^{(i)} &= s_t,
\qquad
\tilde{a}_h^{(i)} = \pi_{\psi}(\tilde{s}_h^{(i)}),
\qquad
\tilde{s}_{h+1}^{(i)} = f_{\theta}(\tilde{s}_h^{(i)},\tilde{a}_h^{(i)},\varepsilon_{t+h}^{(i)}).
\end{align*}
The Monte Carlo estimator of $J_t(\psi)$ is
\begin{align*}
\widehat{J}_t(\psi)
=
\frac{1}{K}\sum_{i=1}^K
\Bigg[
\sum_{h=0}^{H-1} \gamma^h\, r_{\xi}(\tilde{s}_h^{(i)},\tilde{a}_h^{(i)})
+
\gamma^H\, Q_{\phi}\big(\tilde{s}_H^{(i)},\pi_{\psi}(\tilde{s}_H^{(i)})\big)
\Bigg].
\end{align*}
Optionally, within a single MPC solve at time $t$, we reuse the same sampled noise sequences across all gradient steps (common random numbers). Conditional on the sampled noise sequences, $\widehat{J}_t(\psi)$ is a deterministic, differentiable function of $\psi$, so the inner optimization loop ascends a fixed surrogate objective rather than a freshly resampled one.

% \subsection{Backpropagation through the diffusion sampler and the critic}
% \label{sec:mpc_backprop_diffusion_critic}

% This subsection derives the gradient of $\widehat{J}_t(\psi)$ by explicitly decomposing (i) backpropagation through time across the $H$-step rollout and (ii) backpropagation through diffusion depth inside each transition map $f_{\theta}$.

% \subsubsection{Pathwise gradient for one imagined rollout}

% Fix one noise realization $\varepsilon_{t:t+H-1}$ and consider the per-sample objective
% \begin{align*}
% L(\psi;\varepsilon_{t:t+H-1})
% =
% \sum_{h=0}^{H-1} \gamma^h\, r_{\xi}(\tilde{s}_h,\tilde{a}_h)
% +
% \gamma^H\, Q_{\phi}\big(\tilde{s}_H,\pi_{\psi}(\tilde{s}_H)\big),
% \end{align*}
% with trajectory defined by $\tilde{a}_h=\pi_{\psi}(\tilde{s}_h)$ and $\tilde{s}_{h+1}=f_{\theta}(\tilde{s}_h,\tilde{a}_h,\varepsilon_{t+h})$. For fixed noises, $L(\psi;\varepsilon_{t:t+H-1})$ is a deterministic scalar-valued function of $\psi$. Hence its gradient $\nabla_{\psi}L$ is the pathwise (reparameterization) gradient obtained by differentiating through the deterministic computation graph.

% To make the time-recursion explicit, define the policy Jacobians
% \begin{align*}
% \Pi_s(h) := \nabla_s \pi_{\psi}(s)\big|_{s=\tilde{s}_h} \in \mathbb{R}^{m\times d},
% \qquad
% \Pi_{\psi}(h) := \nabla_{\psi} \pi_{\psi}(\tilde{s}_h) \in \mathbb{R}^{m\times p},
% \end{align*}
% where $d$ is the state dimension, $m$ is the action dimension, and $p$ is the parameter dimension. Also define the transition Jacobians for the diffusion sampler at time $h$,
% \begin{align*}
% F_s(h) := \nabla_s f_{\theta}(s,a,\varepsilon_{t+h})\big|_{s=\tilde{s}_h,a=\tilde{a}_h} \in \mathbb{R}^{d\times d},
% \qquad
% F_a(h) := \nabla_a f_{\theta}(s,a,\varepsilon_{t+h})\big|_{s=\tilde{s}_h,a=\tilde{a}_h} \in \mathbb{R}^{d\times m}.
% \end{align*}
% We will show below how $F_a(h)$ and $F_s(h)$ are computed by differentiating through the reverse diffusion steps.

% Define the state-to-parameter sensitivities
% \begin{align*}
% G_h := \nabla_{\psi}\tilde{s}_h \in \mathbb{R}^{d\times p}.
% \end{align*}
% Since $\tilde{s}_0=s_t$ does not depend on $\psi$, we have $G_0=0$. For each $h=0,\dots,H-1$,
% \begin{align*}
% \tilde{s}_{h+1}
% =
% f_{\theta}\big(\tilde{s}_h,\pi_{\psi}(\tilde{s}_h),\varepsilon_{t+h}\big).
% \end{align*}
% Applying the chain rule gives the recursion
% \begin{align*}
% G_{h+1}
% &=
% \nabla_{\psi}\tilde{s}_{h+1} \\
% &=
% \Big(
% F_s(h) + F_a(h)\,\Pi_s(h)
% \Big)\,G_h
% +
% F_a(h)\,\Pi_{\psi}(h).
% \end{align*}
% This is the explicit backpropagation-through-time sensitivity recursion for the diffusion-generated rollout. It makes clear that the only model-specific derivatives required are the Jacobians $F_s(h)$ and $F_a(h)$ of the diffusion sampler.

% The objective gradient can now be written by differentiating each stage reward and the terminal critic. For a stage reward $r_{\xi}(\tilde{s}_h,\tilde{a}_h)$,
% \begin{align*}
% \nabla_{\psi} r_{\xi}(\tilde{s}_h,\tilde{a}_h)
% &=
% \nabla_s r_{\xi}(\tilde{s}_h,\tilde{a}_h)\,G_h
% +
% \nabla_a r_{\xi}(\tilde{s}_h,\tilde{a}_h)\,
% \Big(
% \Pi_s(h)\,G_h + \Pi_{\psi}(h)
% \Big).
% \end{align*}
% For the terminal critic term $Q_{\phi}(\tilde{s}_H,\pi_{\psi}(\tilde{s}_H))$,
% \begin{align*}
% \nabla_{\psi} Q_{\phi}\big(\tilde{s}_H,\pi_{\psi}(\tilde{s}_H)\big)
% &=
% \nabla_s Q_{\phi}(\tilde{s}_H,\tilde{a}_H)\,G_H
% +
% \nabla_a Q_{\phi}(\tilde{s}_H,\tilde{a}_H)\,
% \Big(
% \Pi_s(H)\,G_H + \Pi_{\psi}(H)
% \Big),
% \end{align*}
% where $\tilde{a}_H=\pi_{\psi}(\tilde{s}_H)$. Combining these,
% \begin{align*}
% \nabla_{\psi} L(\psi;\varepsilon_{t:t+H-1})
% &=
% \sum_{h=0}^{H-1}\gamma^h\,
% \nabla_{\psi} r_{\xi}(\tilde{s}_h,\tilde{a}_h)
% +
% \gamma^H\,
% \nabla_{\psi} Q_{\phi}\big(\tilde{s}_H,\pi_{\psi}(\tilde{s}_H)\big).
% \end{align*}
% The Monte Carlo gradient used by MPC is the average of these pathwise gradients over $K$ noise sequences:
% \begin{align*}
% \nabla_{\psi} \widehat{J}_t(\psi)
% =
% \frac{1}{K}\sum_{i=1}^K
% \nabla_{\psi} L(\psi;\varepsilon_{t:t+H-1}^{(i)}).
% \end{align*}

% \subsubsection{Backpropagation through diffusion depth: computing $\nabla_a f_{\theta}$ and $\nabla_s f_{\theta}$}

% We now derive the sampler Jacobians needed above. Fix a time index $h$ and abbreviate
% \begin{align*}
% s &= \tilde{s}_h,
% \qquad
% a = \tilde{a}_h,
% \qquad
% \varepsilon = \varepsilon_{t+h}.
% \end{align*}
% The diffusion sampler computes $s^+ := f_{\theta}(s,a,\varepsilon)$ by unrolling the reverse diffusion steps. For a fixed realization of $\varepsilon=(z_K,z_{K-1},\dots,z_0)$, write the reverse diffusion recursion as
% \begin{align*}
% s^{(K)} &= g_K(z_K), \\
% s^{(k-1)} &= h_k\big(s^{(k)}, s, a, z_{k-1}\big),
% \qquad k=K,K-1,\dots,1, \\
% s^{(0)} &= s^+ = f_{\theta}(s,a,\varepsilon),
% \end{align*}
% where $h_k$ is the deterministic reverse update at diffusion step $k$. For the DDPM-style parameterization from the world-model section,
% \begin{align*}
% h_k(u, s, a, z)
% =
% \frac{1}{\sqrt{\alpha_k}}
% \Big(
% u
% -
% (1-\alpha_k)\,\hat\epsilon_{\theta}(u,k,(s,a))
% \Big)
% +
% \sigma_k z.
% \end{align*}
% All operations in $h_k$ are differentiable, and the dependence on $(s,a)$ enters only through the conditioning input $(s,a)$ to $\hat\epsilon_{\theta}$.

% \paragraph{Action sensitivity.}
% Define the diffusion-level action sensitivities
% \begin{align*}
% S_k := \nabla_a s^{(k)} \in \mathbb{R}^{d\times m}.
% \end{align*}
% Since $s^{(K)}=g_K(z_K)$ does not depend on $a$, we have $S_K=0$. For $k=K,\dots,1$, apply the chain rule to
% \begin{align*}
% s^{(k-1)} = h_k\big(s^{(k)}, s, a, z_{k-1}\big).
% \end{align*}
% This yields the recursion
% \begin{align*}
% S_{k-1}
% =
% \frac{\partial h_k}{\partial u}\big(s^{(k)}, s, a, z_{k-1}\big)\, S_k
% +
% \frac{\partial h_k}{\partial a}\big(s^{(k)}, s, a, z_{k-1}\big).
% \end{align*}
% Iterating from $k=K$ down to $k=1$ gives
% \begin{align*}
% S_0 = \nabla_a s^{(0)} = \nabla_a f_{\theta}(s,a,\varepsilon).
% \end{align*}
% Therefore, in the rollout notation,
% \begin{align*}
% F_a(h) = \nabla_a f_{\theta}(\tilde{s}_h,\tilde{a}_h,\varepsilon_{t+h}) = S_0.
% \end{align*}

% \paragraph{State sensitivity.}
% Similarly define the diffusion-level state sensitivities
% \begin{align*}
% T_k := \nabla_s s^{(k)} \in \mathbb{R}^{d\times d}.
% \end{align*}
% Again $T_K=0$ because $s^{(K)}$ is independent of $s$. Applying the chain rule to the same recursion yields
% \begin{align*}
% T_{k-1}
% =
% \frac{\partial h_k}{\partial u}\big(s^{(k)}, s, a, z_{k-1}\big)\, T_k
% +
% \frac{\partial h_k}{\partial s}\big(s^{(k)}, s, a, z_{k-1}\big),
% \end{align*}
% and iterating gives
% \begin{align*}
% T_0 = \nabla_s s^{(0)} = \nabla_s f_{\theta}(s,a,\varepsilon).
% \end{align*}
% Thus, in the rollout notation,
% \begin{align*}
% F_s(h) = \nabla_s f_{\theta}(\tilde{s}_h,\tilde{a}_h,\varepsilon_{t+h}) = T_0.
% \end{align*}

% \paragraph{Reverse-step Jacobians.}
% For the DDPM-style $h_k$, the Jacobians appearing above have the form
% \begin{align*}
% \frac{\partial h_k}{\partial u}
% &=
% \frac{1}{\sqrt{\alpha_k}}
% \Big(
% I - (1-\alpha_k)\,\frac{\partial \hat\epsilon_{\theta}(u,k,(s,a))}{\partial u}
% \Big), \\
% \frac{\partial h_k}{\partial a}
% &=
% -\frac{1}{\sqrt{\alpha_k}}
% (1-\alpha_k)\,
% \frac{\partial \hat\epsilon_{\theta}(u,k,(s,a))}{\partial a}, \\
% \frac{\partial h_k}{\partial s}
% &=
% -\frac{1}{\sqrt{\alpha_k}}
% (1-\alpha_k)\,
% \frac{\partial \hat\epsilon_{\theta}(u,k,(s,a))}{\partial s}.
% \end{align*}
% These expressions are a direct consequence of the chain rule applied to the reverse update. In practice, the required products with $S_k$ and $T_k$ are computed efficiently by automatic differentiation through the unrolled sampler; the recursions above formalize the computation performed by backpropagation through diffusion depth.

\subsection{Backpropagation through the diffusion sampler and the critic}
\label{sec:mpc_backprop_diffusion_critic}

The MPC objective $\widehat{J}_t(\psi)$ is computed by simulating length-$H$ rollouts under the learned transition map $f_{\theta}$ and then differentiating the resulting scalar return with respect to the policy parameters $\psi$. The key point is that, after reparameterization, the only randomness in the rollout is the sampled noise variables, and for fixed noises the entire rollout is a deterministic differentiable computation graph.

\subsubsection{Fixing the noise makes the rollout deterministic}

Fix one noise realization $\varepsilon_{t:t+H-1}$ and define the imagined trajectory by
\begin{align*}
\tilde{s}_0 &= s_t,\\
\tilde{a}_h &= \pi_{\psi}(\tilde{s}_h),\\
\tilde{s}_{h+1} &= f_{\theta}(\tilde{s}_h,\tilde{a}_h,\varepsilon_{t+h}),
\qquad h=0,1,\ldots,H-1.
\end{align*}
Given this trajectory, define the per-noise return
\begin{align*}
L(\psi;\varepsilon_{t:t+H-1})
=
\sum_{h=0}^{H-1}\gamma^h\, r_{\xi}(\tilde{s}_h,\tilde{a}_h)
+
\gamma^H\, Q_{\phi}\big(\tilde{s}_H,\pi_{\psi}(\tilde{s}_H)\big).
\end{align*}
For fixed $\varepsilon_{t:t+H-1}$, the map $\psi \mapsto L(\psi;\varepsilon_{t:t+H-1})$ is deterministic. Therefore $\nabla_{\psi} L(\psi;\varepsilon_{t:t+H-1})$ is obtained by ordinary backpropagation through the rollout computation graph.

\subsubsection{Backpropagation through time across the $H$ steps}

To write the gradient in a clean recursive form, introduce the state-to-parameter sensitivity
\begin{align*}
G_h := \nabla_{\psi}\tilde{s}_h \in \mathbb{R}^{d\times p}.
\end{align*}
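Here $d$ is the state dimension and $p$ is the number of policy parameters. Since $\tilde{s}_0=s_t$ is fixed and does not depend on $\psi$, $G_0=0$. Next define the Jacobians of the policy and of the world model along the imagined rollout, where $m$ denotes the action dimension: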
\begin{align*}
\Pi_s(h) &:= \nabla_s \pi_{\psi}(s)\big|_{s=\tilde{s}_h} \in \mathbb{R}^{m\times d},
\qquad
\Pi_{\psi}(h) := \nabla_{\psi}\pi_{\psi}(\tilde{s}_h) \in \mathbb{R}^{m\times p},\\
F_s(h) &:= \nabla_s f_{\theta}(s,a,\varepsilon_{t+h})\big|_{s=\tilde{s}_h,a=\tilde{a}_h} \in \mathbb{R}^{d\times d},
\qquad
F_a(h) := \nabla_a f_{\theta}(s,a,\varepsilon_{t+h})\big|_{s=\tilde{s}_h,a=\tilde{a}_h} \in \mathbb{R}^{d\times m}.
\end{align*}
Differentiate the transition
\begin{align*}
\tilde{s}_{h+1}=f_{\theta}\big(\tilde{s}_h,\pi_{\psi}(\tilde{s}_h),\varepsilon_{t+h}\big)
\end{align*}
to obtain the backpropagation-through-time recursion
\begin{align*}
G_{h+1}
=
\Big(F_s(h)+F_a(h)\,\Pi_s(h)\Big)\,G_h
+
F_a(h)\,\Pi_{\psi}(h),
\qquad h=0,1,\ldots,H-1.
\end{align*}
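This recursion says that the sensitivity of the next state equals the current sensitivity $G_h$ propagated through the closed-loop dynamics (directly through the state argument via $F_s(h)$ and indirectly through the policy's dependence on the state via $F_a(h)\,\Pi_s(h)$), plus a direct term $F_a(h)\,\Pi_{\psi}(h)$ that captures how $\psi$ changes the action at step $h$.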

Now differentiate the scalar return $L(\psi;\varepsilon_{t:t+H-1})$. For each stage reward $r_{\xi}(\tilde{s}_h,\tilde{a}_h)$, the chain rule gives
\begin{align*}
\nabla_{\psi} r_{\xi}(\tilde{s}_h,\tilde{a}_h)
=
\nabla_s r_{\xi}(\tilde{s}_h,\tilde{a}_h)\,G_h
+
\nabla_a r_{\xi}(\tilde{s}_h,\tilde{a}_h)\,
\Big(\Pi_s(h)\,G_h+\Pi_{\psi}(h)\Big).
\end{align*}
For the terminal critic term, with $\tilde{a}_H=\pi_{\psi}(\tilde{s}_H)$,
\begin{align*}
\nabla_{\psi} Q_{\phi}\big(\tilde{s}_H,\pi_{\psi}(\tilde{s}_H)\big)
=
\nabla_s Q_{\phi}(\tilde{s}_H,\tilde{a}_H)\,G_H
+
\nabla_a Q_{\phi}(\tilde{s}_H,\tilde{a}_H)\,
\Big(\Pi_s(H)\,G_H+\Pi_{\psi}(H)\Big).
\end{align*}
Combining the above expressions yields
\begin{align*}
\nabla_{\psi} L(\psi;\varepsilon_{t:t+H-1})
=
\sum_{h=0}^{H-1}\gamma^h\,\nabla_{\psi} r_{\xi}(\tilde{s}_h,\tilde{a}_h)
+
\gamma^H\,\nabla_{\psi} Q_{\phi}\big(\tilde{s}_H,\pi_{\psi}(\tilde{s}_H)\big).
\end{align*}

\subsubsection{Averaging over Monte Carlo particles}

In MPC we estimate the expectation over diffusion noise by sampling $K$ independent noise sequences $\{\varepsilon_{t:t+H-1}^{(i)}\}_{i=1}^K$. This gives
\begin{align*}
\widehat{J}_t(\psi)
=
\frac{1}{K}\sum_{i=1}^K
L\big(\psi;\varepsilon_{t:t+H-1}^{(i)}\big),
\qquad
\nabla_{\psi}\widehat{J}_t(\psi)
=
\frac{1}{K}\sum_{i=1}^K
\nabla_{\psi}L\big(\psi;\varepsilon_{t:t+H-1}^{(i)}\big).
\end{align*}

% --- Diffusion-depth derivation with explicit definitions of the reverse-step maps ---

\subsubsection{Backpropagation through diffusion depth inside $f_{\theta}$}

The only nonstandard derivatives needed above are $F_s(h)$ and $F_a(h)$, the Jacobians of the diffusion transition map $f_{\theta}$. These are obtained by differentiating through the reverse diffusion recursion.

Fix one transition evaluation and abbreviate $s=\tilde{s}_h$, $a=\tilde{a}_h$, and $\varepsilon=\varepsilon_{t+h}$. The sampler outputs
\begin{align*}
s^{+} = f_{\theta}(s,a,\varepsilon),
\end{align*}
by unrolling the reverse diffusion steps. To avoid conflicts with the outer time index $t$ and the particle count $K$, we index diffusion depth by $\tau \in \{0,1,\ldots,\mathcal{T}\}$, where $\mathcal{T}$ is the diffusion horizon.

\paragraph{Explicit definition of the reverse-step maps.}
Let $\{z_{\tau}\}_{\tau=0}^{\mathcal{T}}$ be independent standard Gaussians and define the full noise collection
\begin{align*}
\varepsilon := (z_{\mathcal{T}},z_{\mathcal{T}-1},\ldots,z_0),
\qquad
z_{\tau} \sim \mathcal{N}(0,I)\ \text{independently.}
\end{align*}
Define the initialization map $g_{\mathcal{T}}$ by
\begin{align*}
g_{\mathcal{T}}(z_{\mathcal{T}}) := z_{\mathcal{T}},
\end{align*}
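and for each $\tau \in \{1,2,\ldots,\mathcal{T}\}$ define the reverse update map $h_{\tau}$, in terms of the learned noise predictor $\hat{\epsilon}_{\theta}$ (conditioned on $(s,a)$) and the diffusion schedule coefficients $\alpha_{\tau}$ and $\sigma_{\tau}$, by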
\begin{align*}
h_{\tau}(u,s,a,z)
:=
\frac{1}{\sqrt{\alpha_{\tau}}}
\Big(
u-(1-\alpha_{\tau})\,\hat{\epsilon}_{\theta}(u,\tau,(s,a))
\Big)
+
\sigma_{\tau} z.
\end{align*}
With these definitions, the reverse recursion is
\begin{align*}
s^{(\mathcal{T})} &= g_{\mathcal{T}}(z_{\mathcal{T}}),\\
s^{(\tau-1)} &= h_{\tau}\big(s^{(\tau)}, s, a, z_{\tau-1}\big),
\qquad \tau=\mathcal{T},\mathcal{T}-1,\ldots,1,\\
s^{(0)} &= s^{+}.
\end{align*}
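Provided the noise predictor $\hat{\epsilon}_{\theta}$ is differentiable in its input $u$ and in its conditioning $(s,a)$, $g_{\mathcal{T}}$ and each $h_{\tau}$ are differentiable; hence $f_{\theta}$, being their composition, is differentiable in $(s,a)$ for fixed $\varepsilon$.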

\paragraph{Action and state sensitivities across diffusion depth.}
Define diffusion-level sensitivities with respect to the action and the conditioning state:
\begin{align*}
A_{\tau} &:= \nabla_a s^{(\tau)} \in \mathbb{R}^{d\times m},
\qquad
B_{\tau} := \nabla_s s^{(\tau)} \in \mathbb{R}^{d\times d}.
\end{align*}
Because the initialization $s^{(\mathcal{T})}=g_{\mathcal{T}}(z_{\mathcal{T}})$ depends only on the sampled noise, it is independent of $(s,a)$, so
\begin{align*}
A_{\mathcal{T}}=0,
\qquad
B_{\mathcal{T}}=0.
\end{align*}
Differentiating the reverse step
\begin{align*}
s^{(\tau-1)}=h_{\tau}\big(s^{(\tau)},s,a,z_{\tau-1}\big)
\end{align*}
yields the linear recursions
\begin{align*}
A_{\tau-1}
&=
\frac{\partial h_{\tau}}{\partial u}\,A_{\tau}
+
\frac{\partial h_{\tau}}{\partial a},
\qquad
B_{\tau-1}
=
\frac{\partial h_{\tau}}{\partial u}\,B_{\tau}
+
\frac{\partial h_{\tau}}{\partial s},
\end{align*}
where all partial derivatives are evaluated at $(u,s,a,z)=(s^{(\tau)},s,a,z_{\tau-1})$. Iterating from $\tau=\mathcal{T}$ down to $\tau=1$ yields
\begin{align*}
\nabla_a f_{\theta}(s,a,\varepsilon) = A_{0},
\qquad
\nabla_s f_{\theta}(s,a,\varepsilon) = B_{0}.
\end{align*}
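Therefore, along the rollout, the Jacobians required by the backpropagation-through-time recursion are the depth-zero sensitivities of the transition evaluated at step $h$:
\begin{align*}
F_a(h)=\nabla_a f_{\theta}\big(\tilde{s}_h,\tilde{a}_h,\varepsilon_{t+h}\big)=A_{0},
\qquad
F_s(h)=\nabla_s f_{\theta}\big(\tilde{s}_h,\tilde{a}_h,\varepsilon_{t+h}\big)=B_{0}.
\end{align*}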
\subsection{Gradient-based inner-loop optimization and receding-horizon execution}

The MPC inner loop performs gradient ascent on the Monte Carlo objective $\widehat{J}_t(\psi)$, holding $f_{\theta}$, $r_{\xi}$, and $Q_{\phi}$ fixed. Because $\widehat{J}_t(\psi)$ is differentiable via the compositions derived above, we can update
\begin{align*}
\psi \leftarrow \psi + \alpha \nabla_{\psi}\widehat{J}_t(\psi),
\end{align*}
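for a small number $E$ of gradient steps, where $\alpha>0$ is the step size (not to be confused with the diffusion schedule coefficients $\alpha_{\tau}$). After this inner optimization, MPC executes the first action in the real environment,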
\begin{align*}
a_t = \pi_{\psi}(s_t),
\end{align*}
observes $(r_t,s_{t+1})$, and repeats the procedure at the next real state. This receding-horizon loop leverages the diffusion world model as a learned differentiable simulator, while the terminal critic provides value information beyond the truncated horizon and the reward predictor supplies an immediate reward signal along imagined trajectories.

\subsection{Algorithm description}

Algorithm~\ref{alg:mpc_diffusion_world_model} summarizes the full MPC procedure. Noise sequences are sampled either once per inner-loop solve (common random numbers) or freshly per gradient step (Lines~\ref{line:mpc_common_noise_flag}--\ref{line:mpc_noise_sample_fresh}). Imagined rollouts are generated by repeatedly applying the diffusion sampler (Line~\ref{line:mpc_forward_diffusion}), the objective is accumulated with $r_{\xi}$ and the terminal critic (Lines~\ref{line:mpc_reward_accum}--\ref{line:mpc_terminal}), and gradients are computed by the backpropagation derivation above (Line~\ref{line:mpc_backprop}) to update $\psi$ (Line~\ref{line:mpc_update}).

\begin{algorithm}[t]
\caption{MPC with Diffusion World Model}
\label{alg:mpc_diffusion_world_model}
\begin{algorithmic}[1]
\REQUIRE Diffusion sampler $f_{\theta}$, reward model $r_{\xi}$, terminal critic $Q_{\phi}$, policy $\pi_{\psi}$, horizon $H$, particles $K$, inner steps $E$, step size $\alpha$, discount $\gamma$, flag \texttt{common\_noise}.
\STATE Observe real environment state $s_t$
\IF{\texttt{common\_noise}} \label{line:mpc_common_noise_flag}
    \STATE Sample and cache $\{\varepsilon_{t:t+H-1}^{(i)}\}_{i=1}^K$ with $\varepsilon_{t:t+H-1}^{(i)} \sim p_0$ \label{line:mpc_noise_sample_common}
\ENDIF
\FOR{$e=1,2,\dots,E$}
    \IF{\textbf{not} \texttt{common\_noise}}
        \STATE Sample $\{\varepsilon_{t:t+H-1}^{(i)}\}_{i=1}^K$ with $\varepsilon_{t:t+H-1}^{(i)} \sim p_0$ \label{line:mpc_noise_sample_fresh}
    \ENDIF
    \FOR{$i=1,2,\dots,K$}
        \STATE $\tilde{s}_0^{(i)} \gets s_t$
    \ENDFOR
    \STATE $J \gets 0$
    \FOR{$h=0,1,\dots,H-1$}
        \FOR{$i=1,2,\dots,K$}
            \STATE $\tilde{a}_h^{(i)} \gets \pi_{\psi}(\tilde{s}_h^{(i)})$ \label{line:mpc_policy_action}
        \ENDFOR
        \STATE $J \gets J + \frac{1}{K}\sum_{i=1}^K \gamma^h\, r_{\xi}(\tilde{s}_h^{(i)},\tilde{a}_h^{(i)})$ \label{line:mpc_reward_accum}
        \FOR{$i=1,2,\dots,K$}
            \STATE $\tilde{s}_{h+1}^{(i)} \gets f_{\theta}(\tilde{s}_h^{(i)},\tilde{a}_h^{(i)},\varepsilon_{t+h}^{(i)})$ \label{line:mpc_forward_diffusion}
        \ENDFOR
    \ENDFOR
    \STATE $J \gets J + \frac{1}{K}\sum_{i=1}^K \gamma^H\, Q_{\phi}(\tilde{s}_H^{(i)},\pi_{\psi}(\tilde{s}_H^{(i)}))$ \label{line:mpc_terminal}
    \STATE Compute $\nabla_{\psi} J$ by backpropagation through time and through $f_{\theta}$ \label{line:mpc_backprop}
    \STATE $\psi \gets \psi + \alpha\, \nabla_{\psi} J$ \label{line:mpc_update}
\ENDFOR
\STATE Execute $a_t \gets \pi_{\psi}(s_t)$ in the real environment and observe $(r_t,s_{t+1})$ \label{line:mpc_execute}
\end{algorithmic}
\end{algorithm}

