---
# NOTE: root_dir and model_name will be specified to all configs in run.train.py
# Run identity. The `&algo` anchor holds the algorithm name and is aliased by
# root_dir/model_name below and by parameter_server, trainer and actor.
algorithm: &algo sync-gpo
name: sync-gpo
version: 0

# NOTE(review): presumably the floating-point bit width — confirm against the consumer.
precision: 32

n_agents: 2

# model path: root_dir/model_name/name
# tensorboard path: root_dir/model_name/logs
# the following names are just examples; they will be re-specified in the entry point
root_dir: *algo
model_name: *algo

# Controller section.
# Floats in scientific notation are written with a decimal point and a signed
# exponent (e.g. 5.0e+4). YAML 1.1 loaders such as PyYAML resolve dotless forms
# like `5e4` as strings, whereas YAML 1.2 loaders resolve them as floats; the
# dotted form is a float under both.
controller:
    store_period: 5.0e+4
    # NOTE(review): "priod" in the next two keys looks like a typo of "period".
    # The keys are left untouched because the consumer presumably looks them up
    # by these exact names — rename only together with the reading code.
    restart_runners_priod: null
    eval_priod: 5.0e+4
    max_version_iterations: 30
    max_steps_per_iteration: 3.0e+6
        # Commented-out alternative given as a list of pairs:
        # - [0, 1.0e+6]
        # - [20, 3.0e+6]
    # Anchored as `&irms`; aliased by actor.rms.normalize_obs.
    initialize_rms: &irms false

# Parameter-server section. root_dir/model_name alias the `&algo` anchor and,
# per the note at the top of the file, are re-specified by the entry point.
parameter_server:
    root_dir: *algo
    model_name: *algo

    score_metric: dense_score   # stats recorded in payoff tables
    train_from_scratch_frac: 1
    online_frac: 0.25
        # Commented-out alternative given as a list of pairs:
        # - [1, 1]
        # - [5, .75]
        # - [10, .5]
        # - [20, .25]
    update_interval: 1000

    payoff:
        # Written as 1.0e-1 rather than 1e-1: without a decimal point and a
        # signed exponent, YAML 1.1 loaders such as PyYAML yield the string
        # "1e-1" instead of a float.
        step_size: 1.0e-1
        sampling_strategy:
            type: pfsp
            p: 1
            threshold: 0.3

    # rule_strategies:
    #     # agent name / config
    #     random:
    #         aid: 1
    #         vid: 1
    #         path: rule/random   # path to the file that defines Strategy
    #         # other configs

# Per-role resource counts: one CPU for `runner`, one GPU for `agent`.
# NOTE(review): presumably forwarded to Ray when the remote workers are created — confirm against the launcher.
ray_config:
    runner:
        num_cpus: 1
    agent:
        num_gpus: 1

# Explicit empty mapping: this file sets no monitor options.
monitor: {}

# Runner section. The two anchors below are reused by the buffer section
# (buffer.n_runners and buffer.n_steps).
runner:
    n_runners: &nrunners 16
    n_steps: &nsteps 2
    push_every_episode: false

# Environment section. `&nenvs` is reused by buffer.n_envs; `&env_name` is
# anchored but not aliased anywhere else in this file.
env:
    env_name: &env_name spiel-leduc_poker
    squeeze_keys: [uid]
    uid2aid: [0, 1]
    n_envs: &nenvs 8

# Explicit empty mapping: this file sets no agent-level options.
agent: {}

# Training-loop settings. Every anchored value in train_loop is aliased by the
# buffer section, so the two sections share one definition of each value.
strategy:
    train_loop:
        use_dataset: &dataset false
        n_epochs: &nepochs 5
        n_mbs: &nmbs 1
        n_aux_epochs: &naepochs 9
        n_pi: &npi 16
        n_segs: &nsegs 16
        n_aux_mbs_per_seg: &nambs 2
        # Early stop when max_kl is violated; 0 or null means unbounded.
        max_kl: 0

        # value_update accepts: once | reuse | null
        #   once  - update values at the end of each epoch
        #   reuse - update values using the values from training, which are
        #           staler than those used by "once"
        #   null  - do not update values
        value_update: null
        n_aux_value_updates: 10

# Network definitions for the policy and value models.
# `&arnn`, `&vrnn` and `&ss` are aliased by the trainer and buffer sections.
# Both RNN type anchors are null in this file.
# NOTE(review): a null nn_id presumably disables the rnn sub-network — confirm against the model builder.
# Floats in scientific notation carry a decimal point and a signed exponent
# (1.0e-5): YAML 1.1 loaders such as PyYAML read the dotless `1e-5` as a string.
model:
    actor_rnn_type: &arnn null
    value_rnn_type: &vrnn null
    sample_size: &ss 16

    policy:
        use_prev_reward: true
        use_prev_action: true
        encoder:
            nn_id: mlp
            units_list: [128]
            kernel_initializer: orthogonal
            activation: relu
            norm: null
            norm_after_activation: true
            norm_kwargs:
                epsilon: 1.0e-5
        rnn:
            nn_id: *arnn
            units: 64
            kernel_initializer: orthogonal
            recurrent_initializer: orthogonal
            use_ln: true
        policy:
            nn_id: policy
            units_list: []
            kernel_initializer: orthogonal
            activation: relu
            norm: null
            eval_act_temp: 1
            out_gain: 0.01
            init_std: 1

    value:
        use_prev_reward: true
        use_prev_action: true

        encoder:
            nn_id: mlp
            units_list: [128]
            kernel_initializer: orthogonal
            activation: relu
            norm: null
            norm_after_activation: true
            norm_kwargs:
                epsilon: 1.0e-5
        rnn:
            nn_id: *vrnn
            units: 64
            kernel_initializer: orthogonal
            recurrent_initializer: orthogonal
            use_ln: true
        value:
            nn_id: value
            units_list: []
            kernel_initializer: orthogonal
            activation: relu
            norm: null


# Loss coefficients and options for the policy and value objectives.
# Most auxiliary coefficients (aux_l1/l2, kl_*, js_*, tsallis_*, new_po) are 0
# in this file.
# NOTE(review): a 0 coefficient presumably switches that term off — confirm in the loss code.
loss:
    policy:
        pg_coef: 1
        ppo_clip_range: 0.2
        # Written as 1.0e-1 rather than 1e-1: without a decimal point and a
        # signed exponent, YAML 1.1 loaders such as PyYAML yield the string
        # "1e-1" instead of a float.
        entropy_coef: 1.0e-1
        aux_l1_coef: 0
        aux_l2_coef: 0
        weighted_l_dist: false
        kl_prior_coef: 0
        kl_prior: reverse
        kl_target_coef: 0
        kl_target: reverse_approx
        new_po_coef: 0
        # Same string as buffer.target_pi.target_prob_key.
        target_prob: target_prob_prime
        js_target_coef: 0
        js_target: approx
        tsallis_target_coef: 0
        tsallis_target: reverse_approx
        tsallis_q: 1.2
        sample_mask: false

        # The three keys below carry the same values as their namesakes in the
        # buffer section; buffer.adv_clip is likewise null.
        normalize_adv: false
        zero_center_adv: true
        process_adv: null
        adv_clip_range: null
    value:
        value_loss: clip
        value_clip_range: 0.2
        value_coef: 1
        sample_mask: false

# Trainer section: recurrent settings aliased from the model section, plus one
# optimizer block each for the policy and the value network.
# All scientific-notation floats use the `1.0e-N` form. The original mixed
# `1e-3` (a string under YAML 1.1 loaders such as PyYAML) with `1.e-5`
# (a float); the form used here is a float under both YAML 1.1 and 1.2.
trainer:
    algorithm: *algo

    actor_rnn_type: *arnn
    value_rnn_type: *vrnn
    sample_size: *ss         # BPTT length
    store_state: true
    policy:
        optimizer:
            opt_name: adam
            schedule_lr: false
            lr: 1.0e-3
            clip_norm: 0.5
            epsilon: 1.0e-5
            weight_decay: 0
    value:
        # Unlike the policy optimizer, no weight_decay key is set here.
        optimizer:
            opt_name: adam
            schedule_lr: false
            lr: 1.0e-3
            clip_norm: 0.5
            epsilon: 1.0e-5

# Actor section. `rms` holds the observation/reward normalization switches.
# NOTE(review): "rms" presumably stands for running mean/std statistics — confirm.
actor:
    algorithm: *algo

    rms:
        obs_names: [obs, global_state]
        # Aliases controller.initialize_rms (`&irms`).
        normalize_obs: *irms
        normalize_reward: false
        obs_normalized_axis: [0, 1]
        reward_normalized_axis: [0, 1]
        normalize_reward_with_return: forward
        update_reward_rms_in_time: false
        # Anchored as `&gamma`; aliased by buffer.gamma.
        gamma: &gamma 0.99

# Buffer section. Most sizes here are aliases of anchors defined in the
# runner, env, strategy, model and actor sections, so they follow those values.
buffer:
    type: ac
    use_dataset: *dataset

    # PPO configs
    adv_type: gae     # nae or gae
    gamma: *gamma
    lam: 0.95
    n_runners: *nrunners
    n_envs: *nenvs
    n_steps: *nsteps
    n_epochs: *nepochs
    n_mbs: *nmbs        # number of minibatches
    fragment_size: null
    actor_rnn_type: *arnn
    value_rnn_type: *vrnn
    sample_size: *ss
    normalize_adv: false
    zero_center_adv: true
    process_adv: null
    adv_clip: null

    # Trust region arguments
    target_pi:
        target_type: pr
        adv_tau: 1
        alpha: 0.2
        pr_lower_clip: 0.2
        pr_upper_clip: 0.2
        exp_lower_clip: 1
        exp_upper_clip: 1
        adv_lower_clip: 1
        adv_upper_clip: 1
        lin_lower_clip: 0.5
        lin_upper_clip: 3
        lin_weights: 1
        normalize_adv: false
        zero_center: true
        process_adv: null
        step_size: 0.1

        # Same string as loss.policy.target_prob.
        target_prob_key: target_prob_prime

    # PPG configs
    n_pi: *npi
    n_segs: *nsegs
    n_aux_mbs_per_seg: *nambs
    n_aux_epochs: *naepochs

    # mini-batch size = n_runners * n_envs * epslen / n_mbs
    # Keys listed for sampling from the buffer.
    sample_keys:
        - obs
        - global_state
        - prev_reward
        - prev_action
        - action_mask
        - action
        - reward
        - value
        - traj_ret
        - advantage
        - target_prob
        - tr_prob
        - target_prob_prime
        - tr_prob_prime
        - logprob
        - pi
        - target_pi
        - mask
        - actor_h
        - actor_c
        - value_h
        - value_c
