We now describe a receding-horizon model predictive control (MPC) procedure that uses the learned diffusion world model as a differentiable simulator. The world model is given by the reparameterized transition map $s_{t+1} = f_{\theta}(s_t,a_t,\varepsilon_t)$ with $\varepsilon_t \sim p_0(\varepsilon)$,
% \label{eq:diffusion_transition_map}
where $f_{\theta}$ denotes the reverse diffusion sampler unrolled as a computation graph. We combine this dynamics model with a learned reward predictor $r_{\xi}(s,a)$ and a learned terminal critic $Q_{\phi}(s,a)$. The critic provides a terminal value for truncated rollouts, while $r_{\xi}$ supplies the immediate reward along imagined trajectories at inference time.
\begin{algorithm}[t]
\caption{\,\,\,$\MPCwDWM$\,\, (Model Predictive Control with Differentiable World Model)}
\label{alg:mpc_diffusion_world_model}
\begin{algorithmic}[1]
\REQUIRE Diffusion sampler $f_{\theta}$, noise $\{\varepsilon_{t:t+H-1}^{(i)}\}_{i=1}^M$, reward model $r_{\xi}$, terminal critic $Q_{\phi}$, policy $\pi_{\psi}$, horizon $H$, particles $M$, inner steps $E$, step size $\alpha$, discount $\gamma$.
\FOR{$t=1,2,\dots,T$}
\STATE Observe environment state $s_t$
\FOR{$e=1,2,\dots,E$}
    \STATE $\tilde{s}_0^{(i)} \gets s_t \quad \forall\, i \in \{1,2,\dots,M\}$
    \STATE $J \gets 0$
    \FOR{$j=0,1,\dots,H-1$}
        \FOR{$i=1,2,\dots,M$}
            \STATE $\tilde{a}_j^{(i)} \gets \pi_{\psi} (\tilde{s}_j^{(i)})$ \algc{\color{Green} Action for imagined rollout}
        \ENDFOR
        \STATE $J \gets J + \frac{1}{M}\sum_{i=1}^M \gamma^j\, r_{\xi}(\tilde{s}_j^{(i)},\tilde{a}_j^{(i)})$ \label{line:mpc_reward_accum}
        \FOR{$i=1,2,\dots,M$}
            \STATE $\tilde{s}_{j+1}^{(i)} \gets f_{\theta}(\tilde{s}_j^{(i)},\tilde{a}_j^{(i)},\varepsilon_{t+j}^{(i)})$ \algc{\color{Green} Next state}
        \ENDFOR
    \ENDFOR
    \STATE $J \gets J + \frac{1}{M}\sum_{i=1}^M \gamma^H\, Q_{\phi}(\tilde{s}_H^{(i)},\pi_{\psi}(\tilde{s}_H^{(i)}))$ \label{line:mpc_terminal}
    \vspace{1ex}
    % \STATE Compute $\nabla_{\psi} J$ \label{line:mpc_backprop}
    \STATE $\psi \gets \psi + \alpha\, \nabla_{\psi} J$ \algc{\color{Green} Policy Update}
    \label{line:mpc_update}
\ENDFOR
\STATE Execute $a_t \gets \pi_{\psi}(s_t)$ and observe $(r_t,s_{t+1})$ \label{line:mpc_execute}
\ENDFOR
\end{algorithmic}
\end{algorithm}
\subsection{Receding-horizon Objective}
\label{sec:mpc_objective}

At a real environment state $s_t$, MPC optimizes a horizon-$H$ objective over imagined trajectories generated by $f_{\theta}$. Given a fixed noise sequence
$\varepsilon_{t:t+H-1}=(\varepsilon_t,\ldots,\varepsilon_{t+H-1})$,
we define the imagined rollout recursively by
\begin{align}
\tilde{s}_0 &= s_t,
\,\,
\tilde{a}_j = \pi_{\psi}(\tilde{s}_j),
\,\,
\tilde{s}_{j+1} = f_{\theta}(\tilde{s}_j,\tilde{a}_j,\varepsilon_{t+j}),
\label{eq:mpc_rollout_recursion}
\end{align}
where $j=0,\ldots,H-1$.
We evaluate the finite-horizon return by summing predicted stage rewards and adding a terminal value given by the critic:
\begin{align}
L(\psi;\varepsilon_{t:t+H-1})
&=
\sum_{j=0}^{H-1}\gamma^j\,
r_{\xi}(\tilde{s}_j,\tilde{a}_j)
+
\gamma^H\,
Q_{\phi}\!\Big(
\tilde{s}_H,\pi_{\psi}(\tilde{s}_H)
\Big).
\label{eq:mpc_per_noise_return}
\end{align}
We then define the MPC objective as the expectation of this return over diffusion noise:
\begin{align*}
J_t(\psi)
&=
\mathbb{E}_{\varepsilon_{t:t+H-1}\sim p_0}
\Big[
L(\psi;\varepsilon_{t:t+H-1})
\Big].
% \label{eq:mpc_objective}
\end{align*}
\subsection{Monte Carlo Approximation}
\label{sec:mpc_mc}

We approximate $J_t(\psi)$ with $M$ i.i.d.\ noise sequences
$\{\varepsilon_{t:t+H-1}^{(m)}\}_{m=1}^{M}$, where
$\varepsilon_{t:t+H-1}^{(m)}\sim p_0$. For each $m$, generate an imagined rollout
$\{\tilde{s}^{(m)}_j,\tilde{a}^{(m)}_j\}_{j=0}^{H}$ by
\begin{align*}
\tilde{s}_0^{(m)} &= s_t, \quad
\tilde{a}_j^{(m)} = \pi_{\psi}\!\big(\tilde{s}_j^{(m)}\big),\\
\tilde{s}_{j+1}^{(m)} &=
f_{\theta}\!\Big(
\tilde{s}_j^{(m)},\tilde{a}_j^{(m)},\varepsilon_{t+j}^{(m)}
\Big).
% \label{eq:mpc_rollout_particles}
\end{align*}
The Monte Carlo estimator is
\begin{align*}
\widehat{J}_t(\psi)
&=
\frac{1}{M}\sum_{m=1}^{M}
\Bigg[
\sum_{j=0}^{H-1}\gamma^j\,
r_{\xi}\!\big(\tilde{s}_j^{(m)},\tilde{a}_j^{(m)}\big) \nonumber
\\
& \qquad \qquad \qquad +
\gamma^H\,
Q_{\phi}\!\Big(
\tilde{s}_H^{(m)},\pi_{\psi}(\tilde{s}_H^{(m)})
\Big)
\Bigg].
% \label{eq:mpc_mc_estimator}
\end{align*}
% Optionally, within a single MPC solve at time $t$, we reuse the same sampled noises across multiple inner-loop gradient steps. Conditioning on the chosen noise pack makes $\widehat{J}_t(\psi)$ a deterministic differentiable surrogate during the inner optimization.
\subsection{Gradient-based optimization}
\label{sec:mpc_inner_opt}

At time $t$, we perform $E$ steps of gradient ascent on $\widehat{J}_t(\psi)$ while holding
$f_{\theta}$, $r_{\xi}$, and $Q_{\phi}$ fixed:
\begin{align}
\psi \leftarrow \psi + \alpha \nabla_{\psi}\widehat{J}_t(\psi),
\qquad e=1,\ldots,E,
\label{eq:mpc_inner_update}
\end{align}
where $\alpha>0$ is a step size. After the inner loop, MPC executes the first action $a_t = \pi_{\psi}(s_t)$ in the real environment, observes $(r_t,s_{t+1})$, and repeats the procedure at the next state $s_{t+1}$ (see Algorithm~\ref{alg:mpc_diffusion_world_model}).





\subsection{Gradient recursion through diffusion rollouts}
\label{sec:mpc_backprop_diffusion_critic}

The next theorem states a compact Jacobian recursion for the gradient of the per-noise objective
$L(\psi;\varepsilon_{t:t+H-1})$ in \eqref{eq:mpc_per_noise_return} under the rollout dynamics in
\eqref{eq:mpc_rollout_recursion} (see Appendix~\ref{sec:proof_app} for proof). Define the rollout Jacobians, for $j=0,\ldots,H-1$, 
\begin{align*}
\Pi_s(j) &:= \nabla_s \pi_{\psi}(s)\big|_{s=\tilde{s}_j},
\quad
\Pi_{\psi}(j) := \nabla_{\psi}\pi_{\psi}(\tilde{s}_j),
% \label{eq:mpc_policy_jacobians}
\\
F_s(j) &:= \nabla_s f_{\theta}(s,a,\varepsilon_{t+j})\big|_{s=\tilde{s}_j,a=\tilde{a}_j},
\\
F_a(j) &:= \nabla_a f_{\theta}(s,a,\varepsilon_{t+j})\big|_{s=\tilde{s}_j,a=\tilde{a}_j}.
% \label{eq:mpc_dynamics_jacobians}
\end{align*}
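Let $G_j := \nabla_{\psi}\tilde{s}_j$ and $D_j := \nabla_{\psi}\tilde{a}_j$, where we set $\tilde{a}_H := \pi_{\psi}(\tilde{s}_H)$ and define $\Pi_s(H)$, $\Pi_{\psi}(H)$ as above at $\tilde{s}_H$. Then $G_0=0$, and the following recursions hold, the first for $j=0,\ldots,H$ and the second for $j=0,\ldots,H-1$: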
\begin{align*}
D_j &= \Pi_s(j)\,G_j + \Pi_{\psi}(j),
% \label{eq:mpc_action_sensitivity}
\\
G_{j+1} &= F_s(j)\,G_j + F_a(j)\,D_j.
% \label{eq:mpc_state_sensitivity}
\end{align*}
Further define the gradients of the reward model and the terminal critic:
\begin{gather*}
r_s(j) := \nabla_s r_{\xi}(\tilde{s}_j,\tilde{a}_j),
\;\;
r_a(j) := \nabla_a r_{\xi}(\tilde{s}_j,\tilde{a}_j),\\
Q_s := \nabla_s Q_{\phi}(\tilde{s}_H,\tilde{a}_H),
\;\;
Q_a := \nabla_a Q_{\phi}(\tilde{s}_H,\tilde{a}_H).
% \label{eq:mpc_terminal_grads}
\end{gather*}
\begin{tcolorbox}[
  width=0.48 \textwidth,
  colback=gray!1,        % Light gray background
  colframe=black,         % Frame color
  arc=3pt,               % Rounded corner size
  boxrule=1pt,            % Frame line width
  left=3pt, right=3pt,  % Horizontal padding
  top=3pt,  bottom=2pt  % Vertical padding
]
\begin{theorem}[\textbf{Gradient recursion}]
\label{thm:grad_recursion_diffusion_mpc}
Fix a time $t$, a horizon $H$, and a noise sequence $\varepsilon_{t:t+H-1}$, and let
$\{(\tilde{s}_j,\tilde{a}_j)\}_{j=0}^{H}$ be defined by \eqref{eq:mpc_rollout_recursion}.
Assume $\pi_{\psi}$ is differentiable in $\psi$ and its state input, and $f_{\theta}$, $r_{\xi}$, and $Q_{\phi}$
are differentiable in their state and action arguments.

Then the gradient of the per-noise return in \eqref{eq:mpc_per_noise_return} is
\begin{align*}
\nabla_{\psi} L(\psi;\varepsilon_{t:t+H-1})
&=
\sum_{j=0}^{H-1}\gamma^j\,
\Big(
r_s(j)\,G_j + r_a(j)\,D_j
\Big) \nonumber
\\
&+
\gamma^H\,
\Big(
Q_s\,G_H + Q_a\,D_H
\Big).
% \label{eq:mpc_grad_L_compact}
\end{align*}
Moreover, if $f_{\theta}$ is implemented by a reverse diffusion recursion as given by \eqref{eq:g_h_def}
% \begin{align*}
% s^{(K)} &= g_{K}(z_{K}),
% \qquad
% s^{(k-1)} = h_{k}\big(s^{(k)},s,a,z_{k-1}\big),
% \\
% k&=K,\ldots,1,
% \qquad
% f_{\theta}(s,a,\varepsilon)=s^{(0)},
% \end{align*}
% with $\varepsilon=(z_{K},\ldots,z_0)$ and $g_{K}$ independent of $(s,a)$, 
then
\begin{align*}
\nabla_a f_{\theta}(s,a,\varepsilon) = A_0,
\qquad
\nabla_s f_{\theta}(s,a,\varepsilon) = B_0,
\end{align*}
where $A_{K}=0$, $B_{K}=0$, and for $k=K,\ldots,1$,
\begin{align*}
A_{k-1}
&=
\frac{\partial h_{k}}{\partial u}\,A_{k}
+
\frac{\partial h_{k}}{\partial a},
\qquad
B_{k-1}
=
\frac{\partial h_{k}}{\partial u}\,B_{k}
+
\frac{\partial h_{k}}{\partial s},
\end{align*}
with all partial derivatives evaluated at $(u,s,a,z)=(s^{(k)},s,a,z_{k-1})$.
\end{theorem}
\end{tcolorbox}
\vspace{1.5ex}
\begin{remark}
Theorem~\ref{thm:grad_recursion_diffusion_mpc} provides an explicit chain-rule decomposition of
$\nabla_{\psi}L(\psi;\varepsilon_{t:t+H-1})$ along the rollout \eqref{eq:mpc_rollout_recursion}.
The sensitivities $(G_j,D_j)$ propagate how changing $\psi$ perturbs the imagined state--action sequence,
through the policy Jacobians $(\Pi_s(j),\Pi_{\psi}(j))$ and the diffusion-dynamics Jacobians $(F_s(j),F_a(j))$.
The final expression aggregates these perturbations through the stage rewards and the terminal critic via
$(r_s(j),r_a(j))$ and $(Q_s,Q_a)$, yielding the gradient used in the update \eqref{eq:mpc_inner_update}.
\end{remark}



\begin{figure*}[t]
        \includegraphics[width=\textwidth]{figures/rmse_across_steps_medium-replay_20260129_040908.pdf}
    \caption{One-step state prediction RMSE of diffusion models across training steps (20k–200k) on medium-replay datasets. RMSE decreases with training for all three environments (halfcheetah, hopper, walker2d), with shaded regions showing standard error over 1000 transitions.} 
    \label{fig:rmse_diffusion}
\end{figure*}

\begin{figure*}[t]
        \includegraphics[width=\textwidth]{figures/reward_rmse_across_steps_medium-replay_20260129_035721.pdf}
    \caption{Reward prediction RMSE of reward models across training steps (20k–200k) on medium-replay datasets. All environments show decreasing RMSE with training. Shaded regions indicate standard error over 1000 transitions.} 
    \label{fig:rmse_reward}
\end{figure*}





% \subsection{Backpropagation through diffusion rollouts}
% \label{sec:mpc_backprop}

% The update above relies on differentiating through the imagined rollouts. The key point is that, after reparameterization, the only randomness is the sampled diffusion noises; for fixed noises, the rollout is a deterministic computation graph composed of the policy $\pi_{\psi}$, the learned transition map $f_{\theta}$, the reward model $r_{\xi}$, and the terminal critic $Q_{\phi}$. The following theorem formalizes the resulting pathwise gradient used by MPC.

% \begin{theorem}[Pathwise gradient for diffusion MPC]
% \label{thm:pathwise_grad_diffusion_mpc}
% Fix $t$ and horizon $H$. Assume $\pi_{\psi}$ is differentiable in $\psi$ and in its state input, and that
% $f_{\theta}$, $r_{\xi}$, and $Q_{\phi}$ are differentiable in their state and action arguments.
% Define
% \begin{align*}
% J_t(\psi)
% =
% \mathbb{E}_{\varepsilon_{t:t+H-1}\sim p_0}
% \Big[
% L(\psi;\varepsilon_{t:t+H-1})
% \Big],
% \end{align*}
% with $L(\psi;\varepsilon_{t:t+H-1})$ given in \eqref{sec:mpc_objective}. If differentiation can be interchanged with expectation (e.g., by dominated convergence under standard boundedness conditions), then
% \begin{align*}
% \nabla_{\psi} J_t(\psi)
% =
% \mathbb{E}_{\varepsilon_{t:t+H-1}\sim p_0}
% \Big[
% \nabla_{\psi} L(\psi;\varepsilon_{t:t+H-1})
% \Big],
% \end{align*}
% where $\nabla_{\psi} L(\psi;\varepsilon_{t:t+H-1})$ is obtained by backpropagation through the deterministic rollout recursion
% \begin{align*}
% \tilde{s}_0 &= s_t,\quad
% \tilde{a}_h = \pi_{\psi}(\tilde{s}_h),\quad
% \tilde{s}_{h+1} = f_{\theta}(\tilde{s}_h,\tilde{a}_h,\varepsilon_{t+h}).
% \end{align*}
% Moreover, for i.i.d.\ samples $\{\varepsilon_{t:t+H-1}^{(i)}\}_{i=1}^{K}$, the Monte Carlo gradient
% \begin{align*}
% \nabla_{\psi}\widehat{J}_t(\psi)
% =
% \frac{1}{K}\sum_{i=1}^{K}
% \nabla_{\psi} L\big(\psi;\varepsilon_{t:t+H-1}^{(i)}\big)
% \end{align*}
% is an unbiased estimator of $\nabla_{\psi} J_t(\psi)$.
% \end{theorem}

% We compute $\nabla_{\psi}\widehat{J}_t(\psi)$ using automatic differentiation through the unrolled rollout.
% An explicit Jacobian-level derivation that decomposes (i) backpropagation through time over the horizon $H$
% and (ii) backpropagation through diffusion depth inside each call to $f_{\theta}$ is given in Appendix~\ref{sec:appendix_backprop_diffusion}.

