# --- VMIX specific parameters ---

# Action selection and exploration schedule.
# NOTE(review): presumably epsilon moves from epsilon_start to epsilon_finish
# over epsilon_anneal_time steps — confirm against the action-selector code.
action_selector: "multinomial"
epsilon_start: 0.5
epsilon_finish: 0.0
epsilon_anneal_time: 100000
mask_before_softmax: true

# Runner type and batch/buffer sizing; all three sizes are set to 8 here.
# NOTE(review): presumably batch_size_run is the number of parallel
# environments used by the "parallel" runner — confirm against the runner code.
runner: "parallel"

buffer_size: 8
batch_size_run: 8
batch_size: 8
# NOTE(review): presumably the total number of environment steps — confirm.
t_max: 10050000

# Overrides passed to the environment.
env_args:
  # Disabled because the critic adds the last action internally.
  state_last_action: false

# Update the target network every `target_update_interval` training steps.
target_update_interval: 200

# Learning rates; `lr` and `critic_lr` are set to the same value here.
# NOTE(review): presumably `lr` is for the policy and `critic_lr` for the
# critic/mixer, and `td_lambda` is the lambda of the TD(lambda) targets —
# confirm against the learner.
lr: 0.001
critic_lr: 0.001
td_lambda: 0.8

# Learner, mixer and loss-coefficient settings for VMIX.
# NOTE(review): this comment used to read "use COMA", but the keys below select
# the "policy_gradient_v2" learner with the "qmix" mixer; it looks like a
# leftover from another config — confirm.
agent_output_type: "pi_logits"
learner: "policy_gradient_v2"
mixer: "qmix"
q_nstep: 0  # 0 corresponds to default Q, 1 is r + gamma*Q, etc
lam: 0.95
# Entropy coefficient; very important to tune per map:
# 0.005 for 3s5z_vs_3s6z, 0.02 for 3s_vs_5z.
ent_coef: 0.01
vf_coef: 0.1
mixing_embed_dim: 32
hypernet_layers: 2
hypernet_embed: 64
abs: true
optim: "rmsprop"  # works better with RMSProp

# Agent network and controller selection.
# NOTE(review): presumably `mac` names the multi-agent controller class and
# `agent` the per-agent network — confirm against the registries that read
# these keys.
mac: "ppo_mac"
agent: "rnn_ppo"

# Run/experiment label.
# NOTE(review): the label contains "adam", but `optim` in this file is set to
# rmsprop — confirm which is intended before relying on the name.
name: "vmix_env=8_adam_td_lambda"
