# Top-level run settings.
# `obj` selects the training objective; the trailing list enumerates the recognised values.
obj: 'maxentirl' # ['fkl', 'rkl', 'js', 'maxentirl', 'airl', 'gail', 'fairl', 'airl-disc']
# NOTE(review): `IS` presumably toggles importance sampling -- confirm against the consumer.
IS: False
seed: 1
# NOTE(review): unclear from this file whether `cuda` is a device index or an on/off flag -- confirm.
cuda: 1

# Task definition and expert-data settings.
task:
  expert_samples_n: 10000 
  task_name: 'mix_gaussian'
  # Two 2-element goal entries; `goal_radius` has one entry per goal
  # (presumably paired by position -- confirm against the consumer).
  goal: [[1, 3], [3, 1]]
  goal_radius: [0.3, 0.3]

# Environment settings.
env:
  env_name: "ContinuousVecGridEnv-v0"
  # NOTE(review): `T` is presumably the episode horizon; it equals
  # sac.steps_per_epoch (30) in this file -- confirm they must stay in sync.
  T: 30 
  state_indices: [0,1] 
  add_time: False # if true, will only add timestep to Q functions

# Settings for the outer IRL training loop.
irl:  
  training_trajs: 1000
  n_itrs: 800
  # NOTE(review): presumably measured in IRL iterations (n_itrs) -- confirm.
  save_interval: 25
  
# SAC policy-optimisation settings. Several inline comments give the values
# to switch to when reproducing the AIRL training schedule.
sac:
  k: 1
  epochs: 10 # to use the AIRL training schedule, change to 60k/30 = 2000
  steps_per_epoch: 30
  log_step_interval: 30 # to use the AIRL training schedule, can change log frequency to be larger, e.g. 300
  update_every: 1 # update frequency. to use AIRL training schedule, change to 300
  update_num: 1 # how many update steps at each update time. to use AIRL training schedule, change to 20.
  random_explore_episodes: 5 # to use AIRL training schedule, change to 35 or 33. roughly 1000 steps.
  batch_size: 256 # 64
  lr: 0.003 # 3e-3
  alpha: 1.0
  automatic_alpha_tuning: False
  reinitialize: False
  buffer_size: 12000 # 6000

# Reward model settings (architecture and optimiser hyperparameters).
reward:
  arch: 'mlp'
  output_activation: 'clamp' # 'no', 'sigmoid', or 'clamp'.
  hidden_sizes: [64, 64]
  lr: 0.001
  weight_decay: 0
  gradient_step: 5

# Settings for the adversarial IRL baselines.
adv_irl: 
  # for comparison with ours: Disc/SAC training iterations:
  # num_epochs * num_steps_per_epoch / num_steps_between_train_calls * num_disc_updates_per_loop_iter
  num_epochs: 300
  num_steps_per_epoch: 300
  num_steps_between_train_calls: 5
  min_steps_before_training: 500 # wait for enough samples to do density fit
  num_update_loops_per_train_call: 1 # fixed one
  num_disc_updates_per_loop_iter: 1 # original 5, change to 1 for fair comparison.
  num_policy_updates_per_loop_iter: 5
  num_initial_disc_iters: 1000

  disc_optim_batch_size: 128 # 800
  disc_lr: 0.0003
  disc_momentum: 0.0
  use_grad_pen: False
  grad_pen_weight: 1.0 # NOTE 10.0
  rew_clip_min: -10.0 # None
  rew_clip_max: 10.0  # None
  reward_scale: 1.0  # NOTE tune this: 1/alpha

# Discriminator network settings.
# NOTE(review): presumably consumed only by the adversarial objectives
# configured under `adv_irl` -- confirm.
disc:
  model_type: 'resnet_disc'
  num_layer_blocks: 6
  hid_dim: 128
  hid_act: 'tanh'
  use_bn: True
  clamp_magnitude: 10.0

# Density-estimator settings.
# NOTE(review): `model` presumably selects which of the sub-sections below
# (kde / gmm / vae) is used -- confirm against the consumer.
density:
  model: 'kde'
  kde:
    bandwidth: 0.2 # may be adaptive by cross validation
    kernel: 'epanechnikov'
  gmm:
    component: 10
    covariance_type: 'full'
  vae:
    code_dim: 4
    mlp_dim: 32

# Evaluation settings.
evaluation:
  interval: 20 # interval in terms of discriminator / reward update
  epochs: 300
  random_explore_episodes: 100