---
# NOTE: root_dir and model_name are set for all configs in run.train.py
# Algorithm identifier. The &algo anchor is reused below wherever the same
# string is needed (root_dir, model_name, parameter_server, trainer, actor).
algorithm: &algo zero
name: zero
version: 0

precision: 32

n_agents: 1

# model path: root_dir/model_name/name
# tensorboard path: root_dir/model_name/logs
# The two values below are only placeholders; the entry point overrides them.
root_dir: *algo
model_name: *algo

controller:
    # NOTE(review): exponent literals without a dot (5e4, 3e7) resolve to
    # floats under YAML 1.2 but to plain strings under YAML 1.1 loaders such
    # as PyYAML; confirm the consumer converts them to numbers.
    store_period: 5e4
    # NOTE(review): the key is spelled "priod" (likely meant "period"). It is
    # left unchanged because the consuming code may read this exact name.
    restart_runners_priod: null
    max_version_iterations: 1
    max_steps_per_iteration: 3e7
    # Anchored as &irms and reused by actor.rms.normalize_obs.
    initialize_rms: &irms True

parameter_server:
    # Same placeholder values as the top-level root_dir/model_name (alias of &algo).
    root_dir: *algo
    model_name: *algo

    train_from_scratch_frac: 1
    online_frac: .2

    payoff:
        step_size: 1e-2     # step size towards the most recent data; 0 or null averages the payoff over the entire history
        update_interval: 180
        sampling_strategy:
            # presumably "prioritized fictitious self-play" — TODO confirm
            type: pfsp
            p: 1

# CPU/GPU counts per role; presumably forwarded to Ray as resource
# requests — verify against the code that reads ray_config.
ray_config:
    runner:
        num_cpus: 1
    agent:
        num_gpus: 0

# Empty mapping: this file sets no monitor options.
monitor: {}

runner:
    # &nrunners and &nsteps are reused by buffer.n_runners and buffer.n_steps.
    n_runners: &nrunners 8
    n_steps: &nsteps 400
    push_every_episode: False

env:
    # &env_name is anchored here but not aliased anywhere else in this file.
    env_name: &env_name smac-3s5z_vs_3s6z
    # &nenvs is reused by buffer.n_envs.
    n_envs: &nenvs 1
    use_state_agent: True
    use_mustalive: True
    add_center_xy: True
    timeout_done: True
    add_agent_id: True
    do_evaluation: False
    sample_mask_at_done: 0

# Empty mapping: this file sets no agent-level options.
agent: {}

strategy:
    train_loop:
        # &dataset is reused by buffer.use_dataset; the other anchors declared
        # in this mapping are not aliased anywhere else in this file.
        use_dataset: &dataset False
        n_epochs: &nepochs 5
        n_mbs: &nmbs 1
        n_aux_epochs: &naepochs 9
        n_pi: &npi 16
        n_segs: &nsegs 16
        n_aux_mbs_per_seg: &nambs 2
        max_kl: 0            # stop early when max_kl is violated; 0 or null means unbounded

        # value_update accepts one of: once | reuse | null
        # "once" updates values at the end of each epoch.
        # "reuse" updates values using the values from training, which are staler than with "once".
        # null doesn't update values.
        value_update: null

model:
    # &rnn is reused by model.rnn.nn_id and buffer.rnn_type;
    # &ss is reused by trainer.sample_size and buffer.sample_size.
    rnn_type: &rnn null
    sample_size: &ss 64
    # NOTE(review): *K, *L and *ems are aliases, but no &K, &L or &ems anchor
    # is defined anywhere earlier in this file. A YAML loader rejects an alias
    # whose anchor is undefined, so the anchors (with their intended values)
    # must be declared before this point — the values cannot be inferred here.
    K: *K
    L: *L
    extra_meta_step: *ems

    # Sub-network definitions; each nn_id names the network type to build.
    encoder: 
        nn_id: mlp
        units_list: []
        kernel_initializer: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: True
        norm_kwargs:
            epsilon: 1e-5
    rnn:
        # Alias of rnn_type above, which is null in this file.
        nn_id: *rnn
        units: 256
        kernel_initializer: orthogonal
        recurrent_initializer: orthogonal
        use_ln: True
    policy:
        nn_id: policy
        units_list: [64, 64]
        kernel_initializer: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: True
        norm_kwargs:
            epsilon: 1e-5
        eval_act_temp: 1
        out_gain: .01
        init_std: 1
    value:
        nn_id: value
        units_list: [64, 64]
        kernel_initializer: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: True
        norm_kwargs:
            epsilon: 1e-5
    embed: null
    # Per-hyperparameter meta settings. Every entry carries outer/default/
    # init/act; value_coef, gamma and lam additionally carry scale/bias.
    meta:
        nn_id: meta
        pg_coef:
            outer: 1
            default: 1
            init: null
            act: sigmoid
        entropy_coef:
            outer: 0
            default: 0
            init: null
            act: sigmoid
        value_coef:
            outer: 1
            default: 1
            scale: 1
            bias: 0
            init: null
            act: sigmoid
        gamma: 
            # default aliases outer, so both are .995.
            outer: &gamma .995
            default: *gamma
            scale: 1
            bias: 0
            init: null
            act: sigmoid
        lam:
            # default aliases outer, so both are .95.
            outer: &lam .95
            default: *lam
            scale: 1
            bias: 0
            init: null
            act: sigmoid

# Loss settings, split into policy and value parts. Note that sample_mask is
# enabled for the policy part and disabled for the value part.
loss:
    policy:
        ppo_clip_range: .2
        entropy_coef: .01
        sample_mask: True
    value:
        value_loss: mse
        value_clip_range: .2
        value_coef: 1
        sample_mask: False

trainer:
    algorithm: *algo
    sample_size: *ss         # BPTT length
    timeout_done: True
    meta_type: plain
    # NOTE(review): *K, *L and *ems have no matching &K, &L or &ems anchor
    # anywhere in this file; loading fails until those anchors are declared
    # before their first use.
    K: *K
    L: *L
    extra_meta_step: *ems
    store_state: True
    # NOTE(review): same value as strategy.train_loop.n_epochs (&nepochs) but
    # written as a literal, not an alias — confirm whether they must match.
    n_epochs: 5

    optimizer:
        opt_name: adam
        schedule_lr: False
        lr: 5e-4
        clip_norm: 10
        epsilon: 1e-5
        weight_decay: 0
    # Same settings as optimizer above, except that weight_decay is not set.
    meta_opt:
        opt_name: adam
        schedule_lr: False
        lr: 5e-4
        clip_norm: 10
        epsilon: 1e-5

actor:
    algorithm: *algo

    # Normalisation options for observations and rewards.
    rms:
        obs_names: [obs, global_state]
        # Follows controller.initialize_rms through the &irms anchor.
        normalize_obs: *irms
        normalize_reward: True
        obs_normalized_axis: [0, 1]
        reward_normalized_axis: [0, 1]
        update_reward_rms_in_time: True
        # Plain scalar on purpose: the &gamma anchor is already bound to
        # model.meta.gamma.outer (.995). Re-declaring it here would shadow
        # that anchor, and nothing after this line aliases it.
        # NOTE(review): this .99 differs from the model's .995 — confirm
        # that the difference is intended.
        gamma: .99

buffer:
    type: ac
    use_dataset: *dataset

    # Sizes mirror the runner/env sections through their anchors.
    n_runners: *nrunners
    n_envs: *nenvs
    n_steps: *nsteps
    fragment_size: null
    sample_size: *ss
    # NOTE(review): *K, *L and *ems have no matching &K, &L or &ems anchor
    # anywhere in this file; loading fails until those anchors are declared
    # before their first use.
    K: *K
    L: *L
    extra_meta_step: *ems
    queue_size: 2
    timeout_done: True
    rnn_type: *rnn

    # mini-batch size = n_runners * n_envs * epslen / n_mbs
    sample_keys:
        - obs
        - global_state
        - prev_reward
        - prev_action
        - action_mask
        # Unresolved merge conflict settled in favour of sample_mask, the name
        # used by env.sample_mask_at_done and loss.*.sample_mask. The other
        # side listed life_mask here — confirm against the buffer code.
        - sample_mask
        - action
        - reward
        - discount
        - reset
        - value
        - mu_logprob
        - mu
        - mask
        - h
        - c
