# AntMaze Medium-Diverse Dataset - AntMaze Environment
# MMD-QL configuration aligned with reference/offline.py diffusion hyperparameters

# Run-level settings: which environment/algorithm to use, where to log, and
# the training / evaluation schedule.
# NOTE(review): the consuming loader is not part of this file; notes marked
# "presumably" are inferred from key names and should be confirmed there.
env_name: antmaze-medium-diverse-v0
model: mmd_ql
# NOTE(review): presumably the training mode (offline RL); confirm what "ms"
# abbreviates.
ms: offline
# NOTE(review): presumably an experiment index/tag; confirm how it is used.
exp: 2
seed: 42
device: cuda
# Log directory; the last path component repeats env_name above, so keep the
# two in sync when copying this file for another environment.
dir: ./logs/antmaze-medium-diverse-v0
save_best_model: true
# NOTE(review): presumably "evaluate every 10 epochs"; the unit (epochs vs.
# steps) is not stated here.
eval_freq: 10
# Training length: 1000 steps per epoch for 1000 epochs.
num_steps_per_epoch: 1000
num_epochs: 1000
batch_size: 4096
lr_decay: true
early_stop: false
eval_episodes: 100
# NOTE(review): purpose not evident from this file; model_args also defines a
# separate softmax_topk key, so confirm which consumer reads top_k.
top_k: 4

# Model / algorithm hyperparameters.
# Structure note: this is a SEQUENCE of single-key mappings (each
# `- key: value` line is its own list item), not one mapping. Item order and
# the leading `- ` are therefore part of the data the loader receives.
model_args:
  - model: MLP
  - lr: 0.0001
  - eta: 0.0005
  - T: 0.9994
  - q_norm: true             # AntMaze has sparse rewards, so Q-values must be normalized
  - max_q_backup: true
  - reward_tune: cql_antmaze
  # NOTE(review): both `gn` here and `grad_norm` (last item) look like
  # gradient-norm settings with different values; confirm which one the
  # trainer reads for which purpose.
  - gn: 2.0
  # Adam optimizer tuning (more conservative for the diverse data)
  - adam_beta1: 0.9            # standard Adam beta1
  # NOTE(review): the original comment called 0.99 "standard", but the usual
  # Adam default for beta2 is 0.999; confirm that 0.99 is intentional.
  - adam_beta2: 0.99          # beta2, kept for stability

  # NOTE(review): "fm" / "euler_fm" presumably select a flow-matching schedule
  # with an Euler-style update; confirm against the model code.
  - noise_schedule: "fm"
  - sigma_data: 0.3        # slightly larger scale, to handle the complexity of the diverse data
  - f_type: "euler_fm"
  - eps: 0.0001
  - temb_type: "identity"
  - time_scale: 1000.0
  - mmd_sigma: 1.1          # slightly larger bandwidth, to fit diverse trajectories
  - sample_t_mode: "lognormal"
  - P_mean: -0.5            # higher mean, to balance the complex data distribution
  - P_std: 1.2              # moderate spread, keeps the time sampling diverse
  - matrix_size: 512
  - sample_repeat: 1
  - k: 12              # higher precision; the diverse data is more complex
  - a: 1.5             # slightly higher signal weight, to handle diverse trajectories
  - b: 3               # higher bias, to fit the complexity of the diverse data
  - min_tr_gap: null
  # NOTE(review): "cfg" presumably means classifier-free guidance. With
  # use_cfg false and cfg_scale 0.0, cfg_dropout_prob is presumably unused;
  # confirm in the training code.
  - cfg_scale: 0.0
  - cfg_dropout_prob: 0.1 # State dropout probability during training
  - use_cfg: false
  - action_select: "softmax"      # "greedy" | "softmax"
  - softmax_temperature: 0.6     # temperature for softmax selection
  - softmax_topk: 3           # optionally apply top-k filter before softmax

  # Optimization
  # NOTE(review): presumably EMA of model weights: decay 0.995, starting at
  # step 1000, updated every 5 steps; confirm units against the trainer.
  - ema_decay: 0.995
  - step_start_ema: 1000
  - update_ema_every: 5
  - grad_norm: 1.0