---
# NOTE: root_dir and model_name will be set for all configs in run.train.py
# Top-level identifiers. The `algo` anchor defined here is aliased below by
# root_dir/model_name and by the parameter_server, strategy, trainer and
# actor sections, so changing this one value changes all of them.
algorithm: &algo zero_mr
name: zero_mr
version: 0

# presumably the floating-point bit width -- TODO confirm against the consumer
precision: 32

n_agents: 1

# model path: root_dir/model_name/name
# tensorboard path: root_dir/model_name/logs
# the following names are just examples; they will be re-specified in the entry point
root_dir: *algo
model_name: *algo

# Settings read under the `controller` key.
controller:
    store_period: 50000
    # NOTE(review): key is spelled "priod" (likely meant "period"); kept
    # byte-identical because the consumer presumably looks it up by this
    # exact name -- confirm before renaming.
    restart_runners_priod: null
    max_version_iterations: 1
    # Written with an explicit decimal point and signed exponent so that
    # YAML 1.1 loaders (e.g. PyYAML) resolve it as a float; a bare `3e8`
    # is loaded as the string "3e8" by those loaders.
    max_steps_per_iteration: 3.0e+8
    # Anchor `irms` is defined here but not aliased anywhere in this file.
    initialize_rms: &irms True

# Settings read under the `parameter_server` key.
parameter_server:
    # Both paths alias the `algo` anchor; per the note at the top of this
    # file they are expected to be overridden by the entry point.
    root_dir: *algo
    model_name: *algo

    train_from_scratch_frac: 1
    online_frac: 1

    # Payoff settings and the strategy-sampling scheme applied to them.
    payoff:
        step_size: .1
        sampling_strategy:
            # presumably "fictitious self-play" -- TODO confirm
            type: fsp
            p: 1

# Per-role resource requests (key names suggest these are forwarded to Ray
# -- TODO confirm): one CPU per runner, one GPU per agent.
ray_config:
    runner:
        num_cpus: 1

    agent:
        num_gpus: 1

# Explicit empty mapping: no monitor options are set in this file.
monitor: {}

# Settings read under the `runner` key. The `nrunners` and `nsteps` anchors
# defined here are aliased by the trainer and buffer sections, which keeps
# the three sections in agreement.
runner:
    n_runners: &nrunners 30
    n_steps: &nsteps 20
    record_self_play_stats: False
    push_every_episode: False

# Environment settings, including Unity launch options and reward values.
env:
    # Anchor `env_name` is defined here but not aliased anywhere in this file.
    env_name: &env_name unity-combat_fly_control
    max_episode_steps: 500
    frame_skip: 10
    unity_config:
        worker_id: 20000
        # NOTE(review): absolute, user-specific path; this config only works
        # on a machine where this file exists -- confirm or override.
        file_name: '/home/ubuntu/wuyunkun/hm/env/unity_env/data/1_copy/3d.x86_64'
    reward_config:
        detect_reward: 0
        main_dead_reward: -1
        blue_dead_reward: 1
        grid_reward: 0
    # The `nenvs` anchor is aliased by the trainer and buffer sections.
    n_envs: &nenvs 1    # This should be consistent with the Unity environment copies
    # presumably maps each unit id (list index) to an agent id -- TODO confirm
    uid2aid: [0]

    bound_method: null
    to_rescale: False
    action_low: -2
    action_high: 2

# Explicit empty mapping: no agent options are set in this file.
agent: {}

# Settings read under the `strategy` key; `algorithm` aliases the `algo`
# anchor and `train_loop` is an explicit empty mapping.
strategy:
    algorithm: *algo
    train_loop: {}

# Network definitions. Defines two anchors reused elsewhere in this file:
#   rnn   -> model.rnn.nn_id and buffer.rnn_type
#   gamma -> loss.stats.gamma and actor.rms.gamma
model:
    aid: 0
    rnn_type: &rnn null
    gamma: &gamma .99

    # nn_id is null for both encoder and rnn (the latter via the `rnn`
    # anchor); presumably null disables the sub-network -- TODO confirm.
    encoder: 
        nn_id: null
    rnn:
        nn_id: *rnn
    # The policy and value heads share the same hidden-layer settings
    # (units_list, w_init, activation, norm, norm_after_activation,
    # norm_kwargs); only the policy sets out_scale, init_std and out_act.
    policy:
        nn_id: policy
        units_list: [64, 64]
        w_init: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: True
        norm_kwargs:
            axis: -1
            create_scale: True
            create_offset: True
        out_scale: .01
        # NOTE(review): numerically close to exp(-0.5); presumably
        # intentional -- confirm.
        init_std: .6065
        out_act: tanh
    value:
        nn_id: value
        units_list: [64, 64]
        w_init: orthogonal
        activation: relu
        norm: layer
        norm_after_activation: True
        norm_kwargs:
            axis: -1
            create_scale: True
            create_offset: True

# Loss settings, grouped into value target/advantage, policy optimization,
# value learning, and scalar coefficients under `stats`.
loss:
    # hyperparams for value target and advantage
    target_type: gae
    c_clip: 1
    rho_clip: 1
    adv_type: vtrace
    norm_adv: True
    norm_meta_adv: True

    # hyperparams for policy optimization
    pg_type: ppo
    ppo_clip_range: .2
    policy_sample_mask: True

    # hyperparams for value learning
    value_loss: mse
    value_clip_range: .2
    value_sample_mask: False

    # Scalar coefficients; `gamma` aliases the anchor defined in the model
    # section, so the two always agree.
    stats:
        gamma: *gamma
        lam: .95
        pg_coef: 1
        entropy_coef: 0
        value_coef: 1

# Trainer settings. n_runners, n_envs and n_steps alias the anchors defined
# in the runner and env sections so the trainer agrees with them.
trainer:
    algorithm: *algo
    aid: 0
    n_runners: *nrunners
    n_envs: *nenvs
    # Anchors `nepochs` and `nmbs` are defined here but not aliased anywhere
    # in this file.
    n_epochs: &nepochs 1
    n_mbs: &nmbs 1
    n_meta_epochs: 1
    n_steps: *nsteps         # BPTT length
    # The `td` anchor is aliased by buffer.timeout_done.
    timeout_done: &td True

    # Optimizer settings. lr and eps are written with an explicit decimal
    # point so that YAML 1.1 loaders (e.g. PyYAML) resolve them as floats;
    # bare `1e-3` / `1e-5` are loaded as strings by those loaders.
    theta_opt:
        opt_name: adam
        lr: 1.0e-3
        clip_norm: 10
        eps: 1.0e-5

# Actor settings; `algorithm` aliases the `algo` anchor.
actor:
    algorithm: *algo

    # Normalization options for observations and rewards (presumably
    # "rms" = running mean/std -- TODO confirm).
    rms:
        obs_names: [obs, global_state]
        obs_normalized_axis: [0, 1]
        reward_normalized_axis: [0, 1]
        normalize_obs: True
        obs_clip: 10
        normalize_reward: True
        update_reward_rms_in_time: False
        # Aliases the gamma anchor defined in the model section.
        gamma: *gamma

# Buffer settings. n_runners, n_envs, n_steps, timeout_done and rnn_type
# all alias anchors defined in the runner, env, trainer and model sections.
buffer:
    type: ac

    n_runners: *nrunners
    n_envs: *nenvs
    n_steps: *nsteps
    queue_size: 2
    timeout_done: *td
    rnn_type: *rnn

    # mini-batch size = n_runners * n_envs * epslen / n_mbs
    # NOTE(review): `h` and `c` are listed although rnn_type is null in this
    # file; presumably they are ignored when no RNN is used -- confirm.
    sample_keys:
        - obs
        - action
        - value
        - reward
        - discount
        - reset
        - mu_logprob
        - mu_mean
        - mu_std
        - mask
        - h
        - c