# Top-level key; presumably the experiment name used by the launcher.
# NOTE(review): the run / stop / config / checkpoint_* layout looks like a
# Ray Tune experiment spec -- confirm against the code that loads this file.
starcraft-contextual-bandit-hrl-att-com:
  # Trainer identifier; presumably registered by the project -- TODO confirm.
  run: PPO-hrl-curriculum
  # Checkpoint every 1000 (unit not stated here; presumably training
  # iterations -- confirm) and once more when the run ends.
  checkpoint_freq: 1000
  checkpoint_at_end: true
  # The two options below are disabled and kept only for reference.
#  restore: null
#  keep_checkpoints_num: 5
  # Relative directory where results are written.
  local_dir: ray_results
  # Stopping criterion: a budget of 100,000,000 total timesteps.
  stop:
    timesteps_total: 100000000
  # Trainer configuration; everything nested below belongs to this mapping.
  config:
    # Callback class given as a dotted path.
    # Presumably collects PvE metrics during rollouts -- confirm in
    # env/starcraft/utils.
    callbacks:
      type: env.starcraft.utils.PvEMetricsCallback

    # Curriculum teacher settings; `type` is a dotted path to the teacher class.
    teacher_config:
      type: agents.curriculum.teacher.ContextualBanditTeacher
      # Presumably one context per entry of num_agents below (both have three
      # entries) -- confirm in ContextualBanditTeacher.
      num_contexts: 3
      # NOTE(review): differs from the policies' gamma of 0.99; presumably a
      # bandit parameter rather than an RL discount factor -- confirm.
      gamma: 0.3
      # Unit not stated here; presumably training iterations -- TODO confirm.
      update_interval: 20
      # Agent counts offered by the curriculum; the largest (8) equals
      # env_config.max_num_agents.
      num_agents: [3, 5, 8]
      # Presumably the reward range used to scale teacher feedback -- confirm.
      min_rew: 0
      max_rew: 20

    # Training environment, given as a dotted path, and its settings.
    env: env.starcraft.StarCraft2CurriculumPvEHierarchicalComEnv
    env_config:
      # Training map is 3m; evaluation_config overrides this to 8m.
      map_name: 3m
      max_num_agents: 8
      # false for training; evaluation_config sets this to true.
      in_evaluation: false
      # Hierarchical-RL settings; evaluation_config repeats the same values.
      hrl_config:
        context_size: 5
        context_type: discrete
        # Presumably the number of low-level steps between high-level
        # decisions -- confirm in the environment implementation.
        high_level_interval: 10

    # Rollout worker count for training.
    num_workers: 30
    # num_cpus_for_driver: 1
    # Evaluation uses 5 separate workers, is triggered every 20 iterations,
    # and each round lasts 40 episodes (unit set by evaluation_duration_unit).
    evaluation_num_workers: 5
    evaluation_interval: 20  # iterations
    evaluation_duration: 40
    evaluation_duration_unit: episodes
    # Presumably runs evaluation concurrently with training -- confirm
    # against the RLlib version in use.
    evaluation_parallel_to_training: true

    # Per-worker resources: one environment and one CPU each, no GPU.
    num_envs_per_worker: 1
    num_cpus_per_worker: 1
    # One GPU requested here; presumably for the driver/learner process --
    # confirm.
    num_gpus: 1
    num_gpus_per_worker: 0

    # Evaluation-time settings. Compared with the training env_config, only
    # map_name (8m instead of 3m) and in_evaluation (true instead of false)
    # differ; max_num_agents and hrl_config repeat the training values.
    evaluation_config:
      env_config:
        map_name: 8m
        max_num_agents: 8
        in_evaluation: true
        hrl_config:
          context_size: 5
          context_type: discrete
          high_level_interval: 10

    # Exploration enabled. Presumably applies to training rollouts; no
    # override is set under evaluation_config -- confirm whether evaluation
    # should also explore.
    explore: true

    # Hyperparameters for the high-level policy. Every scalar value from
    # gamma through vf_clip_param is identical to low_level_policy_config;
    # only the model section differs.
    high_level_policy_config:
      gamma: 0.99
      lambda: 1.0
      kl_coeff: 0.2
      rollout_fragment_length: 100
      train_batch_size: 4000
      sgd_minibatch_size: 128
      num_sgd_iter: 30
      lr: 0.00005
      # Entropy bonus is set to zero.
      entropy_coeff: 0.0
      clip_param: 0.3
      vf_clip_param: 10.0
      # Custom model and action distribution referenced by name; presumably
      # registered elsewhere in the project -- TODO confirm.
      model:
        custom_model: invariant_att_com_model
        custom_action_dist: hom_multi_action
        custom_model_config:
          encoder_hidden_layers: [256, 256]
          # NOTE(review): 8 heads x head_dim 64 = 512; whether that must
          # relate to the 256-wide encoder depends on the model -- confirm.
          num_heads: 8
          head_dim: 64
          decoder_hidden_layers: [256]

    # Hyperparameters for the low-level policy. Every scalar value from
    # gamma through vf_clip_param is identical to high_level_policy_config;
    # only the model section differs.
    low_level_policy_config:
      gamma: 0.99
      lambda: 1.0
      kl_coeff: 0.2
      rollout_fragment_length: 100
      train_batch_size: 4000
      sgd_minibatch_size: 128
      num_sgd_iter: 30
      lr: 0.00005
      # Entropy bonus is set to zero.
      entropy_coeff: 0.0
      clip_param: 0.3
      vf_clip_param: 10.0
      # Custom model referenced by name, with no custom_model_config or
      # custom action distribution. Presumably masks invalid actions --
      # confirm in the model's implementation.
      model:
        custom_model: action_mask_model

    # for other configurations, see agents/ppo/curriculum/config.py and trainer.py