---
# NOTE: root_dir and model_name are set for all configs by run.train.py
# Algorithm identifier. The &algo anchor is aliased further down wherever a
# section repeats this value (root_dir, model_name, parameter_server,
# strategy, trainer, actor).
algorithm: &algo sync-bmg
name: sync-bmg
version: 0

# NOTE(review): presumably the floating-point bit width - confirm against the consumer.
precision: 32

# env.uid2aid below lists two agent ids (0 and 1), matching this count.
n_agents: 2

# model path: root_dir/model_name/name
# tensorboard path: root_dir/model_name/logs
# The two values below are only placeholders; the entry point re-specifies them.
root_dir: *algo
model_name: *algo

# Settings for the controller section.
# NOTE(review): exponent literals written without a decimal point (5e4, 3e6)
# are resolved as strings, not floats, by YAML 1.1 loaders such as PyYAML;
# presumably the consumer converts them to numbers - confirm.
controller:
    store_period: 5e4
    # NOTE(review): the key is spelled "priod"; check which spelling the
    # consumer reads before renaming it to "period".
    restart_runners_priod: null
    eval_period: 5e4
    max_version_iterations: 30
    max_steps_per_iteration: 3e6
        # Commented-out alternative; presumably a schedule of
        # [iteration, max_steps] pairs - confirm.
        # - [0, 1e6]
        # - [20, 3e6]
    # Anchored as &irms and aliased by actor.rms.normalize_obs.
    initialize_rms: &irms False

# Settings for the parameter server section.
parameter_server:
    # Aliases of &algo, i.e. the same placeholder values as the top-level
    # root_dir / model_name.
    root_dir: *algo
    model_name: *algo

    score_metric: dense_score   # stats recorded in payoff tables
    train_from_scratch_frac: 1
    online_frac: .25
        # Commented-out alternative; presumably a schedule of
        # [iteration, fraction] pairs - confirm.
        # - [1, 1]
        # - [5, .75]
        # - [10, .5]
        # - [20, .25]
    update_interval: 1000

    # Payoff-table settings.
    # NOTE(review): 1e-1 has no decimal point, so YAML 1.1 loaders such as
    # PyYAML read it as a string; presumably converted downstream - confirm.
    payoff:
        step_size: 1e-1
        sampling_strategy:
            type: pfsp
            p: 1
            threshold: .3

    # Disabled example showing the shape of a rule-based strategy entry.
    # rule_strategies:
    #     # agent name / config
    #     random:
    #         aid: 1
    #         vid: 1
    #         path: rule/random   # path to the file that defines Strategy
    #         # other configs

# Per-role resource settings: one CPU for the runner role, one GPU for the
# agent role.
# NOTE(review): presumably forwarded as Ray resource requests - confirm.
ray_config:
    runner:
        num_cpus: 1
    agent:
        num_gpus: 1

# Empty mapping: no monitor-specific settings are defined in this file.
monitor: {}

# Runner settings. &nrunners and &nsteps are aliased by the trainer and
# buffer sections so the three sections share the same values.
runner:
    n_runners: &nrunners 8
    n_steps: &nsteps 2
    push_every_episode: False

# Environment settings.
env:
    # &env_name is anchored here but not aliased in the visible part of this file.
    env_name: &env_name spiel-leduc_poker
    # NOTE(review): presumably maps each unit id (list index) to an agent id - confirm.
    uid2aid: [0, 1]
    # &nenvs is aliased by the trainer and buffer sections.
    n_envs: &nenvs 4

# Empty mapping: no agent-specific settings are defined in this file.
agent: {}

# Strategy settings: the algorithm name (alias of &algo) and an empty
# train_loop mapping.
strategy:
    algorithm: *algo
    train_loop: {}

# Model settings: policy/value networks plus the meta-parameter definitions.
# Anchors defined here and aliased elsewhere in this file:
#   &meta  -> loss.meta_type, trainer.meta_type, buffer.meta_type
#   &rnn   -> buffer.rnn_type
#   &gamma -> meta_params.gamma (below), actor.rms.gamma
#   &K, &L -> trainer.K / trainer.L, buffer.K / buffer.L
model:
    aid: 0
    meta_type: &meta bmg
    rnn_type: &rnn null
    gamma: &gamma .99
    eval_act_temp: 1
    K: &K 0
    L: &L 0

    # Policy network: a single hidden layer of 128 units.
    policy:
        nn_id: policy
        units_list: [128]
        w_init: orthogonal
        activation: relu
        out_scale: .01
    # Value network: same hidden-layer settings as the policy, without out_scale.
    value:
        nn_id: value
        units_list: [128]
        w_init: orthogonal
        activation: relu
    # One entry per meta-parameter. Each entry carries outer / default / init /
    # act, and most also carry scale / bias.
    # NOTE(review): init looks like a pre-activation value (sigmoid(-2) ~ 0.12,
    # sigmoid(0) = 0.5, sigmoid(4.6) ~ 0.99, close to the respective defaults
    # of entropy_coef, value_coef and gamma) - confirm against the consumer.
    meta_params:
        nn_id: meta
        pg_coef:
            outer: 1
            default: 1
            init: null
            act: sigmoid
        # NOTE(review): 1e-1 has no decimal point, so YAML 1.1 loaders such as
        # PyYAML read it as a string; presumably converted downstream - confirm.
        entropy_coef:
            outer: 1e-1
            default: 1e-1
            scale: 1
            bias: 0
            init: -2
            act: sigmoid
            # The only meta-parameter here with its own sub-network config;
            # out_b_init (const_-2) uses the same constant as init above.
            net:
                nn_id: mlp
                units_list: [64]
                out_scale: .01
                out_b_init: const_-2
                activation: relu
        value_coef:
            outer: .5
            default: .5
            scale: 1
            bias: 0
            init: 0
            act: sigmoid
        # outer and default both alias model.gamma (&gamma, .99).
        gamma: 
            outer: *gamma
            default: *gamma
            scale: 1
            bias: 0
            init: 4.6
            act: sigmoid
        # &lam is anchored on outer and aliased by default on the next line.
        # NOTE(review): if init is a pre-activation value, sigmoid(4.6) ~ 0.99
        # does not match default .95 (logit(.95) ~ 2.94) - confirm intended.
        lam:
            outer: &lam .95
            default: *lam
            scale: 1
            bias: 0
            init: 4.6
            act: sigmoid

# Loss settings, grouped into value target / advantage, policy optimization,
# value learning and meta-learning.
loss:
    # hyperparams for value target and advantage
    # NOTE(review): target_type is gae while adv_type is vtrace; the
    # c_clip / rho_clip pair presumably applies to the vtrace path - confirm.
    target_type: gae
    c_clip: 1
    rho_clip: 1
    adv_type: vtrace
    norm_adv: False

    # hyperparams for policy optimization
    pg_type: pg 
    ppo_clip_range: .2
    use_dice: True
    dice_axis: null
    dice_lam: 1
    kl: forward
    kl_coef: 1
    policy_sample_mask: True

    # hyperparams for value learning
    value_loss: mse
    value_clip_range: .2
    value_sample_mask: False
    stop_target_grads: False

    # hyperparams for meta-learning
    # Alias of model.meta_type (&meta).
    meta_type: *meta

# Trainer settings. Rollout dimensions alias the runner / env anchors, and
# meta_type, K and L alias the model section.
trainer:
    algorithm: *algo
    aid: 0
    n_runners: *nrunners
    n_envs: *nenvs
    n_steps: *nsteps         # BPTT length
    # &nepochs and &nmbs are anchored here but not aliased in the visible
    # part of this file.
    n_epochs: &nepochs 1
    n_mbs: &nmbs 1
    n_meta_epochs: 1
    # &td is aliased by buffer.timeout_done.
    timeout_done: &td True

    meta_type: *meta
    K: *K
    L: *L
    store_state: True

    # Optimizer settings for the model parameters and for the meta-parameters.
    # NOTE(review): 1e-4, 1e-3 and 1e-5 have no decimal point, so YAML 1.1
    # loaders such as PyYAML read them as strings; presumably converted
    # downstream - confirm.
    theta_opt:
        opt_name: rmsprop
        lr: 1e-4
        clip_norm: 10
        eps: 1e-5
    meta_params_opt:
        opt_name: adam
        lr: 1e-3
        clip_norm: .5
        eps: 1e-5

# Actor settings: the algorithm name plus running-statistics (rms)
# normalization options.
actor:
    algorithm: *algo

    rms:
        obs_names: [obs, global_state]
        # Alias of controller.initialize_rms (&irms), which is False in this file.
        normalize_obs: *irms
        normalize_reward: False
        obs_normalized_axis: [0, 1]
        reward_normalized_axis: [0, 1]
        update_reward_rms_in_time: True
        # Alias of model.gamma (&gamma).
        gamma: *gamma

# Buffer settings. Every aliased value mirrors the runner, env, model or
# trainer section, so the buffer shares those sections' values.
buffer:
    type: ac

    n_runners: *nrunners
    n_envs: *nenvs
    n_steps: *nsteps
    queue_size: 2
    meta_type: *meta
    K: *K
    L: *L
    timeout_done: *td
    rnn_type: *rnn

    # mini-batch size = n_runners * n_envs * epslen / n_mbs
    # (n_mbs is set in the trainer section; this section has no n_mbs key.)
    # NOTE(review): h and c look like recurrent-state keys, while rnn_type is
    # null in this file; presumably they are skipped when absent - confirm.
    sample_keys:
        - obs
        - action
        - value
        - reward
        - discount
        - reset
        - mu_logprob
        - mu_logits
        - mask
        - h
        - c
