# @package _group_
# Environment identifier for this configuration.
env: 'Humanoid-v3'

# NOTE(review): the exact semantics of p_tremble and reset_prob are defined by
# the consuming code, which is not visible from this file.
p_tremble: 0.025
reset_prob: 0.5

# Total training budget, intended to align with ML-IRL's total training steps.
# NOTE(review): the previous note derived this from
# "600 itrs * 5 epochs * 1000 steps", which equals 3,000,000 rather than the
# 1,000,000 configured here -- confirm which budget is intended.
total_env_steps: 1_000_000

# Interval between evaluation/logging points. With the values in this file,
# 1,000,000 / 2,000 = 500 points over the run (assuming the interval is
# counted in environment steps -- confirm). The previous note
# ("3M steps / 500 = every 6000 steps") did not match these values.
eval_frequency: 2_000

# Expert dataset size, chosen to match the amount of expert data in ML-IRL.
expert_dataset_size: 1000
bc_init_steps: 0

# Sampling schedule: a list of three-element entries.
# NOTE(review): the meaning of each element is defined by the consumer. The
# last element looks like a step boundary; the second entry's 3,000,000 is
# larger than total_env_steps (1,000,000) even though this schedule was
# described as adjusted to the new step budget -- confirm.
sampling_schedule:
  - [0.2, 0.2, 900000]
  - [0.2, 0.1, 3000000]

# Discriminator training toggle and hyperparameters.
train_discriminator: true
discriminator:
  # Learning rate, chosen to match ML-IRL's discriminator learning rate.
  # Written with an explicit decimal point in the mantissa so that YAML 1.1
  # loaders (e.g. plain PyYAML) resolve it as a float instead of the string
  # "3e-4"; the numeric value is unchanged.
  lr: 3.0e-4
  # NOTE(review): presumably the interval between discriminator updates;
  # the unit is not visible from this file -- confirm.
  train_every: 1_000
  num_sample_trajectories: 1
  num_update_steps: 1
  # Batch size, chosen to match ML-IRL.
  batch_size: 100
  clip_output: false
  # An ensemble size of 1 is equivalent to no ensemble.
  ensemble_size: 1

# Actor (policy learner) settings.
# NOTE(review): policy, policy_kwargs, train_freq, gradient_steps, gamma and
# tau look like Stable-Baselines3 off-policy algorithm arguments; presumably
# they are forwarded to the learner's constructor -- confirm against the
# consuming code.
actor:
  # Canonical lowercase boolean, consistent with the other flags in this file.
  bc_reg: false
  # NOTE(review): presumably only relevant when bc_reg is enabled -- confirm.
  bc_weight: 0.5
  policy: "MlpPolicy"
  policy_kwargs:
    # Two hidden layers of 256 units each.
    net_arch: [256, 256]
  train_freq: 64
  gradient_steps: 64
  gamma: 0.98
  tau: 0.02
