# Experiment definition: the "RewardOverwriting" trainable on the
# imitation/PointMazeLeftVel-v0 environment.
pointmazeenv-v0-reward_overwriting:
  env: imitation/PointMazeLeftVel-v0
  run: RewardOverwriting
  local_dir: null

  # Checkpoint every 5 iterations and once more at the end; retain only 2.
  checkpoint_freq: 5
  checkpoint_at_end: true
  keep_checkpoints_num: 2

  # Stop once 8M environment timesteps have been collected.
  stop:
    timesteps_total: 8000000

  config:
    # Reward-model selection consumed by the RewardOverwriting trainable.
    # NOTE(review): `learned` is deliberately left null here (it was a bare
    # empty value); confirm whether the consumer expects null or a mapping.
    reward_models:
      model_name: ground_truth
      rewards:
        learned: null

    framework: torch
    output: null

    # Optimisation hyperparameters. The key names match RLlib's PPO options;
    # presumably RewardOverwriting builds on PPO -- TODO confirm.
    gamma: 0.95
    lambda: 0.95
    lr: 5.0e-4
    num_sgd_iter: 2
    # 20480 / 2048 = 10 minibatches per train batch.
    sgd_minibatch_size: 2048
    # Equals num_workers * num_envs_per_worker * rollout_fragment_length
    # (8 * 40 * 64 = 20480).
    train_batch_size: 20480
    kl_coeff: 0.0
    kl_target: 0.01
    vf_loss_coeff: 0.5
    entropy_coeff: 0.01
    clip_param: 0.2
    vf_clip_param: 0.2
    grad_clip: 1.0

    # Episode-termination handling.
    horizon: null
    soft_horizon: false
    no_done_at_end: false

    # Network: two 256-unit tanh layers; vf_share_layers is enabled.
    model:
      fcnet_hiddens: [256, 256]
      fcnet_activation: tanh
      vf_share_layers: true

    explore: true
    exploration_config:
      type: StochasticSampling

    # Rollout collection and resource allocation.
    num_workers: 8
    num_envs_per_worker: 40
    rollout_fragment_length: 64  # alternative value noted by the author: 16
    batch_mode: truncate_episodes
    num_cpus_per_worker: 1
    num_gpus_per_worker: 0.0
    num_cpus_for_driver: 1
    num_gpus: 1

    # Periodic evaluation is currently disabled; uncomment to enable.
    # evaluation_interval: 20
    # evaluation_num_episodes: 100
    # evaluation_config:
    #   input: sampler
