# Run-level scalar settings.
# NOTE(review): the meanings noted below are inferred from key names only —
# confirm against the code that loads this file.
obj: maxentirl_sa  # presumably selects the training objective/algorithm — TODO confirm
IS: false  # presumably toggles importance sampling — TODO confirm
seed: 1
cuda: -1  # presumably a device index where -1 means "no GPU" — TODO confirm
experiment_tag: 32  # parsed as an integer; quote it if the consumer expects a string
# Environment settings.
# NOTE(review): per-key notes are inferred from key names — confirm against the consumer.
env:
  env_name: AdroitHandPen-v1
  T: 200  # presumably the episode horizon in steps — TODO confirm
  state_indices: all  # plain string "all"; presumably selects every state dimension — TODO confirm
# Settings for the IRL loop.
# NOTE(review): per-key notes are inferred from key names — confirm against the consumer.
irl:
  training_trajs: 10
  n_itrs: 1000  # presumably the number of outer IRL iterations — TODO confirm
  save_interval: 0  # NOTE(review): 0 may mean "never save" — confirm
  eval_episodes: 20
  expert_episodes: 10  # note: bc.expert_episodes in this file is 16
  resample_episodes: 1
# Settings under `bc` (presumably behavior cloning — TODO confirm).
bc:
  epochs: 10000
  eval_freq: 100  # presumably evaluate every 100 epochs — TODO confirm
  eval_episodes: 20
  # NOTE(review): differs from irl.expert_episodes (10) in this file — confirm intentional.
  expert_episodes: 16
# Settings under `sac` (presumably the Soft Actor-Critic policy learner — TODO confirm).
# NOTE(review): per-key notes are inferred from key names — confirm against the consumer.
sac:
  k: 1
  epochs: 5
  log_step_interval: 5000
  update_every: 1
  random_explore_episodes: 1
  update_num: 1
  batch_size: 256
  # Written with an explicit ".0" mantissa: YAML 1.1 loaders (e.g. PyYAML)
  # require a dot in floats and would otherwise read "3e-5" as a string.
  lr: 3.0e-5
  alpha: 0.2  # presumably the entropy coefficient, fixed because tuning is off below — TODO confirm
  automatic_alpha_tuning: false
  buffer_size: 1000000
  num_test_episodes: 10
  reinitialize: false

# Reward-function settings: network shape, optimizer values and reward-update constraints.
# NOTE(review): meanings are inferred from key names — confirm against the consumer.
reward:
  use_bn: false
  residual: false
  hid_act: relu
  hidden_sizes: [256, 256]
  clamp_magnitude: 10
  # Exponent-form floats carry an explicit ".0" mantissa: YAML 1.1 loaders
  # (e.g. PyYAML) require a dot in floats and would otherwise read them as strings.
  lr: 3.0e-5
  weight_decay: 1.0e-3
  gradient_step: 1
  momentum: 0.9
  # Reward-update constraint parameters
  use_constraints: true
  constraint_type: "simple"  # "simple" or "complex"
  constraint_weight: 0.01  # Weight for simple constraint
  max_reward_change: 1.0  # Maximum allowed reward change
  target_reward_diff: 0.1
  target_ratio_upper: 1.2
  target_ratio_lower: 0.8
  coef_scale_down: 0.9
  coef_scale_up: 1.1
  coef_min: 0.001
  coef_max: 10.0
  target_reward_l2_norm: 1.0
  l2_coef_scale_up: 1.5
  # NOTE(review): same value as l2_coef_scale_up, whereas coef_scale_down (0.9) is
  # below 1 — confirm whether this factor is applied as a divisor.
  l2_coef_scale_down: 1.5
  n_episodes: 10
  discount: 0.99
# Settings under `disc` (presumably a discriminator network and its optimizer — TODO confirm).
# This top-level section is separate from the nested `adv_irl.disc` mapping in this file.
disc:
  reinit: false
  model_type: resnet_disc
  num_layer_blocks: 3
  hid_dim: 128
  hid_act: tanh
  use_bn: false
  clamp_magnitude: 10.0
  batch_size: 800
  lr: 0.0003
  weight_decay: 0.0001
  momentum: 0.9
  iter: 1200  # presumably optimizer iterations per training round — TODO confirm
# Settings under `critic`.
# Architecture, batch size, lr, weight decay and iter match the top-level `disc`
# section of this file; momentum differs (0.0 here, 0.9 there).
critic:
  lam: 0.5  # NOTE(review): meaning not evident from this file — confirm against the consumer
  model_type: resnet_disc
  num_layer_blocks: 3
  hid_dim: 128
  hid_act: tanh
  use_bn: false
  batch_size: 800
  lr: 0.0003
  weight_decay: 0.0001
  momentum: 0.0
  iter: 1200
# Settings under `adv_irl` (presumably an adversarial IRL baseline — TODO confirm).
# The URL on the next line points at the spec file these values appear to be based on.
adv_irl: # https://github.com/KamyarGh/rl_swiss/blob/master/exp_specs/adv_irl.yaml
  normalize: true
  num_epochs: 100
  num_steps_per_epoch: 100000
  num_steps_between_train_calls: 1000
  min_steps_before_training: 5000
  num_update_loops_per_train_call: 100
  num_disc_updates_per_loop_iter: 1
  num_policy_updates_per_loop_iter: 1
  disc_optim_batch_size: 256
  policy_optim_batch_size: 256
  disc_lr: 0.0003
  disc_momentum: 0.0
  use_grad_pen: true
  grad_pen_weight: 4.0  # [2.0, 4.0, 8.0, 16.0]
  # NOTE(review): the list below is identical to the one on grad_pen_weight and does
  # not contain the configured value 0.2 — confirm which values apply to reward_scale.
  reward_scale: 0.2  # [2.0, 4.0, 8.0, 16.0]
  save_interval: 0
  eval_interval: 200
  replay_buffer_size: 200000
  # Nested mapping, distinct from the top-level `disc` section of this file
  # (mlp_disc with 2 blocks here, resnet_disc with 3 blocks there).
  disc:
    model_type: mlp_disc
    num_layer_blocks: 2
    hid_dim: 128
    hid_act: tanh
    use_bn: false
    clamp_magnitude: 10.0
