# Experiment definition for hierarchical PPO with a contextual-bandit
# curriculum teacher on the Google Research Football 5_vs_5 scenario.
# NOTE(review): the layout (run / stop / config) looks like a Ray Tune
# experiment file — confirm against the launcher that loads it.
# The top-level key is the experiment name.
gfootball-5v5-contextual-bandit-hrl-ppo:
  # Trainer to run; presumably a project-registered name rather than a
  # built-in algorithm — verify where "PPO-hrl-curriculum" is registered.
  run: PPO-hrl-curriculum
  # Checkpoint every 1000 units (training iterations under Tune semantics —
  # TODO confirm) and once more when the run ends.
  checkpoint_freq: 1000
  checkpoint_at_end: true
  # The two settings below are intentionally left commented out (disabled).
#  restore: null
#  keep_checkpoints_num: 5
  # Directory under which results are written.
  local_dir: ray_results
  # Stop criterion: end the run after 100M environment timesteps in total.
  stop:
    timesteps_total: 100000000
  # Settings handed to the trainer selected by `run`.
  config:
    # Fixed random seed for reproducibility.
    seed: 123
    # Callback class given as a dotted import path; presumably resolved to a
    # class by the project's config loader — verify against the loader.
    callbacks:
      type: env.gfootball.utils.PvEMetricsCallback

    # Curriculum teacher. The exact meaning of each key is defined by
    # ContextualBanditTeacher, which is not visible from this file.
    teacher_config:
      type: agents.curriculum.teacher.ContextualBanditTeacher
      num_contexts: 3
      # Teacher-specific parameter; distinct from the policies' discount
      # factor (`gamma: 0.99` in the *_policy_config sections).
      gamma: 0.3
      # NOTE(review): unit (iterations vs. episodes) is not stated here — confirm.
      update_interval: 20
      # Presumably the candidate numbers of controlled players the teacher
      # chooses among; the largest value (4) equals
      # `number_of_left_players_agent_controls` in env_config — TODO confirm.
      num_agents: [1, 2, 3, 4]

    # Training environment, given as a dotted import path.
    env: env.gfootball.curriculum_env.FootballCurriculumPvEHierarchicalEnv
    # Options for the training environment. Most keys look like arguments of
    # gfootball's environment factory — verify against the env class.
    env_config:
      env_name: 5_vs_5
      stacked: false
      rewards: scoring
      # All dump/render/video outputs are disabled for training.
      write_goal_dumps: false
      write_full_episode_dumps: false
      render: false
      write_video: false
      dump_frequency: 200
      representation: simple115v2
      # The learning agents control 4 players on the left team.
      number_of_left_players_agent_controls: 4
      logdir: dumps
      other_config_options:
        # "default": action_set_v1 (19 actions);
        # "v2": action_set_v2 (19 actions + 1 built-in AI action).
        action_set: default
      # Marks this as the training (not evaluation) environment; the
      # evaluation_config section sets this to true.
      in_evaluation: false
      # Hierarchical RL settings; semantics are defined by the env class.
      hrl_config:
        context_size: 5
        context_type: discrete
        # Presumably the number of low-level steps between high-level
        # decisions — TODO confirm.
        high_level_interval: 10

    # Rollout workers used for training, plus CPUs reserved for the driver.
    num_workers: 6
    num_cpus_for_driver: 1
    # Separate workers dedicated to evaluation.
    evaluation_num_workers: 2
    # Run evaluation every training iteration.
    evaluation_interval: 1  # iterations
    # Total across all evaluation workers, not per worker (for example, if
    # there are 2 envs to evaluate, each is evaluated for half of the duration).
    evaluation_duration: 4
    evaluation_duration_unit: episodes
    # Evaluation runs concurrently with training rather than blocking it.
    evaluation_parallel_to_training: true
    # Custom evaluation routine, given as a dotted import path.
    custom_eval_function:
      type: env.gfootball.utils.EvalFn

    # Per-worker resources. With `num_workers: 6` and 3 envs per worker,
    # training samples from 18 environment copies in total.
    num_envs_per_worker: 3
    num_cpus_per_worker: 1
    # CPU-only run: no GPUs for the driver or the workers.
    num_gpus: 0
    num_gpus_per_worker: 0

    # Overrides applied to evaluation workers. This env_config repeats the
    # training env_config and differs in exactly three keys:
    #   dump_frequency (10 here vs. 200), logdir (eval_dumps vs. dumps),
    #   in_evaluation (true vs. false).
    # Keep the remaining keys in sync with the training env_config when
    # editing either copy.
    evaluation_config:
      env_config:
        env_name: 5_vs_5
        stacked: false
        rewards: scoring
        write_goal_dumps: false
        write_full_episode_dumps: false
        render: false
        write_video: false
        dump_frequency: 10
        representation: simple115v2
        number_of_left_players_agent_controls: 4
        logdir: eval_dumps
        other_config_options:
          # "default": action_set_v1 (19 actions);
          # "v2": action_set_v2 (19 actions + 1 built-in AI action).
          action_set: default
        in_evaluation: true
        hrl_config:
          context_size: 5
          context_type: discrete
          high_level_interval: 10

    # Enable exploration when computing actions.
    # NOTE(review): evaluation_config does not override this key, so
    # evaluation presumably explores as well — confirm this is intended.
    explore: true

    # PPO hyperparameters for the high-level policy. Key names follow the
    # usual PPO settings; how they are consumed is defined by the project's
    # hierarchical trainer.
    high_level_policy_config:
      # Discount factor.
      gamma: 0.99
      # Presumably the GAE lambda — TODO confirm.
      lambda: 1.0
      kl_coeff: 0.2
      rollout_fragment_length: 100
      # Minibatch size equals the train batch size (1000 each).
      train_batch_size: 1000
      sgd_minibatch_size: 1000
      num_sgd_iter: 10
      lr: 0.0001
      # No entropy bonus.
      entropy_coeff: 0.0
      clip_param: 0.3
      vf_clip_param: 10.0

    # PPO hyperparameters for the low-level policy. The values currently
    # match high_level_policy_config key for key.
    low_level_policy_config:
      # Discount factor.
      gamma: 0.99
      # Presumably the GAE lambda — TODO confirm.
      lambda: 1.0
      kl_coeff: 0.2
      rollout_fragment_length: 100
      # Minibatch size equals the train batch size (1000 each).
      train_batch_size: 1000
      sgd_minibatch_size: 1000
      num_sgd_iter: 10
      lr: 0.0001
      # No entropy bonus.
      entropy_coeff: 0.0
      clip_param: 0.3
      vf_clip_param: 10.0

    # For other configuration options, see agents/ppo/hierarchical/config.py and trainer.py.