
% Table: D4RL Gym-MuJoCo results (18 tasks) for the five baselines and MPCwDWM.
% Cell format: "mean \pm std". Bold marks the best mean in each row.
% NOTE(review): the meaning of ($\star$) is not stated in the caption; the starred
%   cell repeats the ReBRAC entry of its row -- explain the symbol in the caption.
% NOTE(review): the MPCwDWM column mixes one to four decimals; round to one decimal
%   from the raw results so it matches the baseline columns.
% NOTE(review): the caption cites tarasov2023minimalist while the body text cites
%   tarasov2023rebrac for ReBRAC -- confirm both keys exist in the .bib file.
\begin{table*}[t] % table* floats cannot be placed "here"; use top placement
  \centering
  \small
  \begin{tabular}{l|cccc|cc}
    \toprule
    Task Name & TD3+BC & IQL & CQL & SAC-RND & ReBRAC & MPCwDWM \\
    \midrule
    halfcheetah-random        & $30.9 \pm 0.4$ & $19.5 \pm 0.8$ & $\textbf{31.1} \pm 3.5$ & $27.6 \pm 2.1$ & $29.5 \pm 1.5$  & $29.9 \pm 1.5$ \\
    halfcheetah-medium        & $54.7 \pm 0.9$ & $50.0 \pm 0.2$ & $46.9 \pm 0.4$ & $66.4 \pm 1.4$ & $65.6 \pm 1.0$  & $\textbf{70.05} \pm 1.8$ \\
    halfcheetah-expert        & $93.4 \pm 0.4$ & $95.5 \pm 2.1$ & $97.3 \pm 1.1$ & $102.6 \pm 4.2$ & $105.9 \pm 1.7$ & $\textbf{106.14} \pm 2.1$ \\
    halfcheetah-medium-expert & $89.1 \pm 5.6$ & $92.7 \pm 2.8$ & $95.0 \pm 1.4$ & $\textbf{108.1} \pm 1.5$ & $101.1 \pm 5.2$ & $105.35 \pm 1.8$ \\
    halfcheetah-medium-replay & $45.0 \pm 1.1$ & $42.1 \pm 3.6$ & $45.3 \pm 0.3$ & $51.2 \pm 3.2$  & $51.0 \pm 0.8$  & $\textbf{59.89} \pm 1.2$ \\
    halfcheetah-full-replay   & $75.0 \pm 2.5$ & $75.0 \pm 0.7$ & $76.9 \pm 0.9$ & $81.2 \pm 1.3$  & $82.1 \pm 1.1$  & $\textbf{85.149} \pm 0.9$ \\
    \midrule
    hopper-random             & $8.5 \pm 0.7$   & $10.1 \pm 5.9$ & $5.3 \pm 0.6$  & $\textbf{19.6} \pm 12.4$ & $8.1 \pm 2.4$   & $8.4 \pm 1.21$ \\
    hopper-medium             & $60.9 \pm 7.6$  & $65.2 \pm 4.2$ & $61.9 \pm 6.4$ & $91.1 \pm 10.1$ & $102.0 \pm 1.0$ & $\textbf{103.38} \pm 0.35$ \\
    hopper-expert             & $109.6 \pm 3.7$ & $108.8 \pm 3.1$ & $106.5 \pm 9.1$ & $\textbf{109.8} \pm 0.5$ & $100.1 \pm 8.3$ & $104.30 \pm 7.03$ \\
    hopper-medium-expert      & $87.8 \pm 10.5$ & $85.5 \pm 29.7$ & $96.9 \pm 15.1$ & $\textbf{109.8} \pm 0.6$ & $107.0 \pm 6.4$ & $107.42 \pm 5.3$ \\
    hopper-medium-replay      & $55.1 \pm 31.7$ & $89.6 \pm 13.2$ & $86.3 \pm 7.3$  & $97.2 \pm 9.0$  & $98.1 \pm 5.3$  & $\textbf{103.14} \pm 0.45$ \\
    hopper-full-replay        & $97.9 \pm 17.5$ & $104.4 \pm 10.8$ & $101.9 \pm 0.6$ & $107.4 \pm 0.8$ & $107.1 \pm 0.4$ & $\textbf{108.77} \pm 0.6434$ \\
    \midrule
    walker2d-random           & $2.0 \pm 3.6$   & $11.3 \pm 7.0$  & $5.1 \pm 1.7$   & $\textbf{18.7} \pm 6.9$  & $18.4 \pm 4.5$  & $18.4 \pm 4.5$ ($\star$) \\
    walker2d-medium           & $77.7 \pm 2.9$  & $80.7 \pm 3.4$  & $79.5 \pm 3.2$  & $\textbf{92.7} \pm 1.2$  & $82.5 \pm 3.6$  & $88.91 \pm 0.6$ \\
    walker2d-expert           & $110.0 \pm 0.6$ & $96.9 \pm 32.3$ & $109.3 \pm 0.1$ & $104.5 \pm 22.8$ & $112.3 \pm 0.2$ & $\textbf{116.65} \pm 0.41$ \\
    walker2d-medium-expert    & $110.4 \pm 0.6$ & $112.1 \pm 0.5$ & $109.1 \pm 0.2$ & $104.6 \pm 11.2$ & $111.6 \pm 0.3$ & $\textbf{115.76} \pm 1.04$ \\
    walker2d-medium-replay    & $68.0 \pm 19.2$ & $75.4 \pm 9.3$  & $76.8 \pm 10.0$ & $89.4 \pm 3.8$  & $77.3 \pm 7.9$  & $\textbf{95.87} \pm 1.19$ \\
    walker2d-full-replay      & $90.3 \pm 5.4$  & $97.5 \pm 1.4$  & $94.2 \pm 1.9$  & $105.3 \pm 3.2$ & $102.2 \pm 1.7$ & $\textbf{105.76} \pm 2.91$ \\
    \midrule
    % MPCwDWM average is the unweighted mean of the 18 MPCwDWM rows above
    % (1533.24 / 18 = 85.2); recompute whenever a row changes.
    Average & $70.3$ & $72.9$ & $73.6$ & $82.6$ & $81.2$ & $\textbf{85.2}$ \\
    \bottomrule
  \end{tabular}
  % \caption and \label must be outside tabular, with \label after \caption.
  \caption{Average normalized score over the final evaluation and ten unseen training seeds on Gym-MuJoCo tasks. TD3+BC, IQL, CQL, SAC-RND, and ReBRAC scores are taken from \citet{tarasov2023minimalist}. The symbol $\pm$ represents the standard deviation across the seeds.}
  \label{tab:rebrac_mujoco}
\end{table*}

% Table: D4RL AntMaze results (6 tasks) for the five baselines and MPCwDWM.
% Cell format: "mean \pm std"; the CQL column lists means only.
% NOTE(review): the meaning of ($\star$) is not stated in the caption; the starred
%   cell repeats the ReBRAC entry of its row -- explain the symbol in the caption.
% NOTE(review): antmaze-umaze-diverse is a tie for the best mean (88.3) and has no
%   bold entry; decide whether tied best values should be bolded.
% NOTE(review): the caption cites tarasov2023minimalist while the body text cites
%   tarasov2023rebrac for ReBRAC -- confirm both keys exist in the .bib file.
\begin{table*}[t]
  \centering
  \small
  \setlength{\tabcolsep}{5pt}
  \renewcommand{\arraystretch}{1.05}
  \begin{tabular}{l|cccc|cc}
    \toprule
    % ReBRAC is a baseline here, so the header must not carry an "our" suffix.
    Task Name & TD3+BC & IQL & CQL & SAC-RND & ReBRAC & MPCwDWM \\
    \midrule
    antmaze-umaze          & $66.3 \pm 6.2$  & $83.3 \pm 4.5$ & $74.0$ & $97.0 \pm 1.5$  & $97.8 \pm 1.0$  & $\textbf{98.3} \pm 1.1$ \\
    antmaze-umaze-diverse  & $53.8 \pm 8.5$  & $70.6 \pm 3.7$ & $84.0$ & $66.0 \pm 25.0$ & $88.3 \pm 13.0$ & $88.3 \pm 13.0$ ($\star$) \\
    antmaze-medium-play    & $26.5 \pm 18.4$ & $64.6 \pm 4.9$ & $61.2$ & $38.5 \pm 29.4$ & $84.0 \pm 4.2$  & $\textbf{86.33} \pm 4.8$ \\
    antmaze-medium-diverse & $25.9 \pm 15.3$ & $61.7 \pm 6.1$ & $53.7$ & $74.7 \pm 10.7$ & $76.3 \pm 13.5$ & $\textbf{84.7} \pm 6.7$ \\
    antmaze-large-play     & $0.0 \pm 0.0$   & $42.5 \pm 6.5$ & $15.8$ & $43.9 \pm 29.2$ & $60.4 \pm 26.1$ & $\textbf{70.0} \pm 18.2$ \\
    antmaze-large-diverse  & $0.0 \pm 0.0$   & $27.6 \pm 7.8$ & $14.9$ & $45.7 \pm 28.5$ & $54.4 \pm 25.1$ & $\textbf{62.4} \pm 26.61$ \\
    \midrule
    % MPCwDWM average is the unweighted mean of the 6 MPCwDWM rows above
    % (490.03 / 6 = 81.7); recompute whenever a row changes.
    Average & $28.7$ & $58.3$ & $50.6$ & $60.9$ & $76.8$ & $\textbf{81.7}$ \\
    \bottomrule
  \end{tabular}
  % \caption and \label must be outside tabular, with \label after \caption.
  \caption{Average normalized score over the final evaluation and ten unseen training seeds on AntMaze tasks. TD3+BC, IQL, CQL, SAC-RND, and ReBRAC scores are taken from \citet{tarasov2023minimalist}. The symbol $\pm$ represents the standard deviation across the seeds.}
  \label{tab:rebrac_antmaze}
\end{table*}



\paragraph{Datasets.}
We evaluate on the D4RL benchmark \citep{fu2020d4rl}, focusing on continuous-control domains where offline learning and long-horizon credit assignment are challenging. Our main results cover all Gym-MuJoCo locomotion datasets (18 tasks spanning random, medium, expert, medium-expert, medium-replay, and full-replay settings across HalfCheetah, Hopper, and Walker2d; Table~\ref{tab:rebrac_mujoco}) and all AntMaze datasets (6 tasks spanning umaze, umaze-diverse, medium-play, medium-diverse, large-play, and large-diverse; Table~\ref{tab:rebrac_antmaze}). For each task, we report D4RL normalized scores computed from the return accumulated by the agent.
\paragraph{Baselines.}
We compare against strong ensemble-free offline RL baselines that are widely used on D4RL locomotion and AntMaze. These include TD3+BC \citep{fujimoto2021td3bc}, which adds behavior cloning regularization to a TD3 actor update; IQL \citep{kostrikov2022iql}, which learns value functions via expectile regression without querying out-of-sample actions and extracts the policy via advantage-weighted regression; CQL \citep{kumar2020cql}, which trains a conservative critic by penalizing high values on out-of-distribution actions; SAC-RND \citep{nikulin2023sacrnd}, which augments SAC with random-network-distillation-based penalties to discourage out-of-distribution actions; and ReBRAC \citep{tarasov2023rebrac}, which revisits behavior-regularized actor-critic with practical improvements and strong performance. We also compare against generative policy and planning approaches in Appendix~\ref{sec:generative_baselines}.

\paragraph{Experimental details.}
Our approach has an offline training stage and an inference-time adaptation stage. Offline, we first train a policy $\pi_{\psi}$ and critic $Q_{\phi}$ on the fixed dataset using ReBRAC \citep{tarasov2023rebrac}. We then train a diffusion transition model $f_{\theta}$ on dataset transitions and a reward predictor $r_{\xi}$ on dataset rewards using supervised learning. Figure~\ref{fig:diffusion_rmse} reports the diffusion model prediction RMSE across training steps, and Figure~\ref{fig:reward_rmse} reports the reward model prediction RMSE across training steps. See Appendix~\ref{sec:world_model_tables} for the full diffusion and reward model results.




At inference time, for each real environment state $s_t$, we solve a receding-horizon problem by sampling $M$ noise sequences, rolling out $H$-step imagined trajectories through $f_{\theta}$ under the current policy, and forming the Monte Carlo objective $\widehat{J}_t(\psi)$ given by discounted predicted rewards plus a terminal value from $Q_{\phi}$. We update $\psi$ for $E$ inner steps using gradients through the full computation graph, then execute the resulting action $a_t = \pi_{\psi}(s_t)$ in the real environment and repeat at the next state. We report mean and standard deviation over multiple evaluation seeds.




