---
# NOTE: root_dir and model_name will be set for all configs in run.train.py
# The &algo anchor is aliased further down wherever the algorithm name is reused.
algorithm: &algo sync2-zero_ir
name: zero_ir
version: 0

precision: 32

n_agents: 1

# model path: root_dir/model_name/name
# tensorboard path: root_dir/model_name/logs
# the following values are just placeholders; they will be re-specified in the entry point
root_dir: *algo
model_name: *algo

# Controller settings.
controller:
    # Exponent literals carry an explicit mantissa dot and exponent sign so that
    # YAML 1.1 loaders (e.g. PyYAML) resolve them as floats instead of strings.
    store_period: 1.0e+6
    # NOTE(review): this key looks like a typo of "restart_runners_period"; it is
    # kept as-is because the consumer presumably looks it up under this name.
    restart_runners_priod: null
    max_version_iterations: 1
    max_steps_per_iteration: 2.0e+8
    # Anchored as &irms; aliased by actor.rms.normalize_obs.
    initialize_rms: &irms true

# Parameter-server settings; root_dir/model_name reuse the algorithm name via *algo.
parameter_server:
    root_dir: *algo
    model_name: *algo

    train_from_scratch_frac: 1
    online_frac: 0.2

    payoff:
        # Step size towards the most recent data; 0 or null averages the payoff
        # over the entire history. Written as 1.0e-2 (not 1e-2) so that YAML 1.1
        # loaders such as PyYAML resolve it as a float rather than a string.
        step_size: 1.0e-2
        update_interval: 180
        sampling_strategy:
            type: pfsp
            p: 1

    # Example of a rule-based strategy entry (disabled):
    # rule_strategies:
    #     # agent name / config
    #     random:
    #         aid: 1
    #         vid: 1
    #         path: rule/random   # path to the file that defines Strategy
    #         # other configs

# Resource settings keyed by component (runner / agent).
# NOTE(review): presumably forwarded to Ray as per-actor resource requests — confirm in the entry point.
ray_config:
    runner:
        num_cpus: 1
    agent:
        num_gpus: 1

# Explicit empty mapping: this file sets no monitor options.
monitor: {}

# Runner settings. The &nrunners and &nsteps anchors are aliased by the
# trainer and buffer sections so the three stay in sync.
runner:
    n_runners: &nrunners 60
    n_steps: &nsteps 16
    push_every_episode: false

# Environment settings (env_name selects a grf scenario).
env:
    env_name: &env_name grf-11_vs_11_hard_stochastic
    representation: custom
    # Comma-separated list kept as a single string scalar.
    rewards: scoring,checkpoints
    render: false
    write_full_episode_dumps: false
    write_video: false
    dump_frequency: 1000
    logdir: results/grf
    extra_players: null
    number_of_left_players_agent_controls: 11
    number_of_right_players_agent_controls: 0
    add_role_to_obs: false
    shared_ckpt_reward: false

    max_episode_steps: 3000
    use_action_mask: false
    uid2aid: null

    # Anchored as &nenvs; aliased by trainer.n_envs and buffer.n_envs.
    n_envs: &nenvs 1

    # Optional observation components, all disabled here.
    use_idx: false
    use_hidden: false
    use_event: false
    agentwise_global_state: false

# Explicit empty mapping: this file sets no agent options.
agent: {}

# Strategy settings. The anchors defined under train_loop (&dataset, &K, &L,
# &ems) are aliased by the model, trainer and buffer sections.
strategy:
    algorithm: *algo
    train_loop:
        use_dataset: &dataset false
        K: &K 5
        L: &L 0
        extra_meta_step: &ems 1  # 0/1

# Model settings: network definitions (encoder, rnn, policy, value, meta_reward,
# outer_value) plus the meta-parameter table under `meta`.
#
# Exponent literals are written with an explicit mantissa dot and exponent sign
# (1.0e-5, 5.0e-4) so that YAML 1.1 loaders such as PyYAML resolve them as
# floats; the bare forms 1e-5 / 5e-4 are resolved as strings by those loaders.
model:
    aid: 0
    rnn_type: &rnn null
    K: *K
    L: *L
    rl_reward: &rlreward sum
    gamma: &gamma 0.995
    meta_reward_type: intrinsic
    extra_meta_step: *ems

    encoder:
        nn_id: null
    rnn:
        # nn_id follows rnn_type (null in this file).
        nn_id: *rnn
        units: 256
        kernel_initializer: orthogonal
        recurrent_initializer: orthogonal
        use_ln: true
    policy:
        nn_id: policy
        units_list: [256, 256, 256]
        kernel_initializer: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: true
        norm_kwargs:
            epsilon: 1.0e-5
        eval_act_temp: 1
        out_gain: 0.01
        indexed: null
        indexed_config:
            use_shared_bias: true
            use_bias: true
            kernel_initializer: orthogonal
            activation: relu
    value:
        nn_id: value
        units_list: [256, 256, 256]
        kernel_initializer: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: true
        norm_kwargs:
            epsilon: 1.0e-5
        indexed: null
        indexed_config:
            use_shared_bias: true
            use_bias: true
            kernel_initializer: orthogonal
            activation: relu
    meta_reward:
        nn_id: reward
        units_list: [256, 256, 256]
        kernel_initializer: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: true
        norm_kwargs:
            epsilon: 1.0e-5
        out_gain: 0.01
        out_scale: 0.1
        out_act: atan
        indexed: null
        indexed_config:
            use_shared_bias: true
            use_bias: true
            kernel_initializer: orthogonal
            activation: relu
    outer_value:
        # Same nn_id as `value`.
        nn_id: value
        units_list: [256, 256, 256]
        kernel_initializer: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: true
        norm_kwargs:
            epsilon: 1.0e-5
        indexed: null
        indexed_config:
            # NOTE(review): false here, true in every other indexed_config — confirm intentional.
            use_shared_bias: false
            use_bias: true
            kernel_initializer: orthogonal
            activation: relu
    meta:
        nn_id: meta
        # Each entry below describes one meta-parameter with the fields
        # outer / default / scale / bias / init / act.
        # NOTE(review): pg_coef carries no scale/bias, unlike the other entries — confirm intentional.
        pg_coef:
            outer: 1
            default: 1
            init: null
            act: sigmoid
        entropy_coef:
            outer: 5.0e-4
            default: 5.0e-4
            scale: 5.0e-4
            bias: 5.0e-4
            init: null
            act: tanh
        value_coef:
            outer: 0.5
            default: 0.5
            scale: 1
            bias: 0
            init: null
            act: sigmoid
        gamma:
            # Both values alias the top-level model.gamma (&gamma).
            outer: *gamma
            default: *gamma
            scale: 1
            bias: 0
            init: null
            act: sigmoid
        lam:
            outer: &lam 0.95
            default: *lam
            scale: 1
            bias: 0
            init: null
            act: sigmoid
        reward_scale:
            outer: 1
            default: 1
            scale: 1
            bias: 0
            init: null
            act: sigmoid
        reward_bias:
            outer: 0
            default: 0
            scale: 1
            bias: 0
            init: null
            act: tanh
        reward_coef:
            outer: 1
            default: 1
            scale: 1
            bias: 0
            init: null
            act: sigmoid

# Loss settings, split into the `rl` loss and the `meta` loss. The two
# sub-sections share the same keys; `meta` adds norm_meta_adv and
# meta_reward_coef and enables use_dice.
loss:
    rl:
        # value target and advantage
        target_type: gae
        c_clip: 1
        rho_clip: 1
        adv_type: vtrace
        norm_adv: true

        # policy optimization
        pg_type: pg
        ppo_clip_range: 0.2
        use_dice: false
        dice_axis: null
        dice_lam: 1
        kl: reverse
        kl_coef: 1
        policy_sample_mask: true

        # value learning
        value_loss: mse
        value_clip_range: 0.2
        value_sample_mask: false
        stop_target_grads: false

        joint_objective: false
    meta:
        # value target and advantage
        target_type: gae
        c_clip: 1
        rho_clip: 1
        adv_type: vtrace
        norm_adv: true
        norm_meta_adv: true

        # policy optimization
        pg_type: pg
        ppo_clip_range: 0.2
        use_dice: true
        dice_axis: null
        dice_lam: 1
        kl: reverse
        kl_coef: 1
        policy_sample_mask: true

        # value learning
        value_loss: mse
        value_clip_range: 0.2
        value_sample_mask: false
        stop_target_grads: false

        # meta-learning
        meta_reward_coef: 0
        joint_objective: false

# Trainer settings. Most scalar fields alias anchors defined in the runner,
# env, strategy and model sections so they cannot drift apart.
#
# Exponent literals are written as 1.0e-4 / 1.0e-5 (explicit mantissa dot and
# exponent sign) so that YAML 1.1 loaders such as PyYAML resolve them as floats;
# the bare forms 1e-4 / 1e-5 are resolved as strings by those loaders.
trainer:
    algorithm: *algo
    aid: 0
    n_runners: *nrunners
    n_envs: *nenvs
    n_epochs: &nepochs 1
    n_mbs: &nmbs 1
    n_meta_epochs: 1
    n_steps: *nsteps  # BPTT length
    timeout_done: &td true
    meta_type: plain
    K: *K
    L: *L
    msmg_type: avg
    extra_meta_step: *ems
    store_state: true
    rl_reward: *rlreward

    # Three optimizer configs with identical hyperparameters.
    # NOTE(review): rl_opt has no weight_decay key, unlike the two meta optimizers — confirm intentional.
    rl_opt:
        opt_name: rmsprop
        schedule_lr: false
        lr: 1.0e-4
        clip_norm: 0.5
        epsilon: 1.0e-5
    meta_param_opt:
        opt_name: rmsprop
        schedule_lr: false
        lr: 1.0e-4
        clip_norm: 0.5
        epsilon: 1.0e-5
        weight_decay: null
    meta_reward_opt:
        opt_name: rmsprop
        schedule_lr: false
        lr: 1.0e-4
        clip_norm: 0.5
        epsilon: 1.0e-5
        weight_decay: null

# Actor settings; `rms` holds the observation/reward normalization options.
actor:
    algorithm: *algo

    rms:
        obs_names: [obs, global_state]
        # Follows controller.initialize_rms through the *irms alias.
        normalize_obs: *irms
        normalize_reward: false
        obs_normalized_axis: [0, 1]
        reward_normalized_axis: [0, 1]
        update_reward_rms_in_time: true
        # Same discount as model.gamma.
        gamma: *gamma

# Buffer settings. Apart from `type` and `queue_size`, every scalar field is an
# alias to an anchor defined in the strategy, runner, env, trainer or model section.
buffer:
    type: ac
    use_dataset: *dataset

    n_runners: *nrunners
    n_envs: *nenvs
    n_steps: *nsteps
    queue_size: 2
    K: *K
    L: *L
    extra_meta_step: *ems
    timeout_done: *td
    rnn_type: *rnn

    # mini-batch size = n_runners * n_envs * epslen / n_mbs
    # Keys to sample; the commented-out entries are optional keys currently disabled.
    sample_keys:
        - obs
        # - idx
        # - global_state
        # - hidden_state
        # - prev_reward
        # - prev_action
        # - action_mask
        - action
        - value
        - reward
        - discount
        - reset
        - mu_logprob
        - mu
        - mask
        - h
        - c
