# Experiment "nmmo-ppo": PPO on the NMMO multi-agent environment, with all
# agents mapped onto one policy ("shared_policy").
nmmo-ppo:
  run: PPO

  # Checkpointing and output location.
  checkpoint_freq: 10
  checkpoint_at_end: true
  # restore: null
  # keep_checkpoints_num: 5
  local_dir: ray_results

  # Stopping criterion.
  stop:
    timesteps_total: 100000000

  config:
    seed: 321

    # Classes are referenced by dotted import path under a `type` key.
    callbacks:
      type: env.nmmo.utils.NMMOMetricsCallback
    env: env.nmmo.multi_agent_env.RLlibNMMO2Env
    env_config:
      num_of_controlled_agents: 4

    # Optional batch sizing. YAML does not evaluate arithmetic, so the product
    # is written out (30720 = 1024 * 30).
    # train_batch_size: 30720
    # rollout_fragment_length: 1024

    # Rollout workers.
    num_workers: 30
    # num_cpus_for_driver: 1

    # Evaluation schedule.
    evaluation_num_workers: 1
    evaluation_interval: 20  # iterations
    # evaluation_duration is the total summed over all evaluation workers
    # (for example, with 2 envs to evaluate, each is evaluated for half of
    # the duration).
    evaluation_duration: 4
    evaluation_duration_unit: episodes
    evaluation_parallel_to_training: true
    # NOTE(review): the commented-out eval function below lives under
    # env.gfootball, not env.nmmo - confirm it is applicable before enabling.
    # custom_eval_function:
    #   type: env.gfootball.utils.EvalFn

    # Resource allocation.
    num_envs_per_worker: 1
    num_cpus_per_worker: 1
    num_gpus: 1
    num_gpus_per_worker: 0

    # Overrides applied for evaluation; currently the same env_config value
    # as the training env_config above.
    evaluation_config:
      env_config:
        num_of_controlled_agents: 4

    # Optional hyperparameter overrides (disabled).
    # explore: true
    # gamma: 0.99
    # lr: 0.0003
    # observation_filter: MeanStdFilter
    # num_sgd_iter: 6
    # vf_loss_coeff: 0.01

    framework: tf
    model:
      # Custom model referenced by name; where that name is registered is
      # not visible in this file.
      custom_model: tf_nmmo_model
      vf_share_layers: true
      custom_model_config:
        input_shape: [15, 15]
        conv_activation: relu
        token_dim: 256
        num_heads: 4
        head_dim: 256

    multiagent:
      policies: ["shared_policy"]
      # YAML-capable policy_mapping_fn definition via providing a callable
      # class here.
      # NOTE(review): this class is imported from env.starcraft although this
      # is an NMMO experiment - presumably a generic parameter-sharing mapper
      # reused across envs; confirm the path is intended.
      policy_mapping_fn:
        type: env.starcraft.utils.MultiAgentParameterSharingPolicyMappingFn
      policies_to_train: ["shared_policy"]
