# @package _global_
# Hydra defaults list: swap the `env` and `evaluator` config-group choices
# made by the primary config for their `pointmass` options.
defaults:
  - override /env: pointmass
  - override /evaluator: pointmass

# Rollout sizing. Both values are reused further down in this file through
# the ${num_envs} and ${num_steps} interpolations.
num_envs: 256
num_steps: 5
# Environment-step budget.
# NOTE(review): presumably the total for the whole training run - confirm.
num_env_steps: 5e6

# Periodic actions.
# NOTE(review): the unit of these intervals (updates vs. env steps) is not
# visible from this file - confirm against the training loop.
log_interval: 10
eval_interval: 100
# NOTE(review): value is presumably large enough that periodic checkpointing
# never triggers - confirm that this is intended.
save_interval: 10000000000

# f-IRL updater. Hydra builds the object named by `_target_`; because
# `_recursive_` is disabled, nested `_target_` nodes are passed to it as
# config instead of being instantiated up front.
policy_updater:
  _target_: imitation_learning.f_irl.updater.fIRL
  _recursive_: false

  # Mirrors the top-level environment count.
  num_envs: ${num_envs}

  batch_size: 20  # only 20 demo points.
  num_discrim_batches: 2
  reward_update_freq: 1

  # NOTE(review): 'rkl' presumably selects the reverse-KL divergence - confirm
  # against the values accepted by the fIRL updater.
  div_type: 'rkl'
  importance_sampling: false

  # Loader for the demonstration data; names the callable and its arguments.
  # `env.env_name` comes from the `env` config group selected in `defaults`.
  get_dataset_fn:
    _target_: imitation_learning.common.utils.get_transition_dataset
    env_name: ${env.env_name}
    # NOTE(review): relative path - presumably resolved against the process
    # working directory; confirm how the run directory is configured.
    dataset_path: 'data/traj/pm_100.pth'

  # Learned reward network.
  # `obs_shape` and `action_dim` are not set in this file as shown; they are
  # expected to be supplied by the composed config.
  reward:
    _target_: imitation_learning.common.net.NeuralReward
    obs_shape: ${obs_shape}
    action_dim: ${action_dim}

    # Network size
    n_hidden_layers: 2
    reward_hidden_dim: 128

    # NOTE(review): presumably the reward is a function of the next state
    # only - confirm against NeuralReward.
    reward_type: 'NEXT_STATE'
    # NOTE(review): presumably an upper bound applied to the reward output -
    # confirm whether the clamp is symmetric.
    clamp_max: 10.0

  # Discriminator network used by the f-IRL updater.
  # `obs_shape` and `action_dim` are not set in this file as shown; they are
  # expected to be supplied by the composed config.
  discriminator:
    # Fully qualified like every other project `_target_` in this file, so the
    # import does not depend on `gail` being importable as a top-level package.
    _target_: imitation_learning.gail.discriminator.GailDiscriminator
    obs_shape: ${obs_shape}
    action_dim: ${action_dim}
    reward_hidden_dim: 128
    n_hidden_layers: 2
    # NOTE(review): -1 presumably means "use every observation dimension" -
    # confirm against GailDiscriminator.
    cost_take_dim: -1
    use_actions: false
    reward_type: "GAIL"

  # Inner policy optimizer (PPO). `_recursive_` is off here as well, so
  # `optimizer_params` is passed to the PPO target as config rather than as
  # an already-built optimizer.
  policy_updater:
    _target_: imitation_learning.policy_opt.ppo.PPO
    _recursive_: false

    # Rollout geometry, mirrored from the top-level settings.
    num_envs: ${num_envs}
    num_steps: ${num_steps}

    # Loss terms
    clip_param: 0.2
    use_clipped_value_loss: true
    value_loss_coef: 0.5
    entropy_coef: 0.01
    # NOTE(review): -1 presumably disables gradient-norm clipping - confirm.
    max_grad_norm: -1

    # Update schedule
    num_epochs: 2
    num_mini_batch: 1

    # Returns calculation
    use_gae: true
    gae_lambda: 0.95
    # `gamma` is not set in this file as shown; it is expected to be supplied
    # by the composed config.
    gamma: ${gamma}

    optimizer_params:
      _target_: torch.optim.Adam
      lr: 3e-4

  # Adam settings for the discriminator. The parent sets `_recursive_: False`,
  # so this node is handed to the updater as config, not as a built optimizer.
  discrim_opt:
    _target_: torch.optim.Adam
    lr: 3e-4

  # Adam settings for the reward network; same form and learning rate as
  # `discrim_opt`.
  reward_opt:
    _target_: torch.optim.Adam
    lr: 3e-4

# Evaluation-time settings.
# NOTE(review): how these are merged over the training values is not visible
# in this file - confirm in the evaluation entry point.
eval_args:
  num_env_steps: 5e6
  load_policy: false
  policy_updater:
    # Training uses 1 for this key.
    # NOTE(review): -1 presumably turns reward updates off - confirm.
    reward_update_freq: -1

