# --- DOP specific parameters ---

# Action selection settings.
# NOTE(review): the epsilon_* keys look like an exploration schedule annealed
# from epsilon_start to epsilon_finish over epsilon_anneal_time steps —
# confirm against the "multinomial" action selector.
action_selector: "multinomial"
epsilon_start: 0.5
epsilon_finish: 0.05
epsilon_anneal_time: 500000
# NOTE(review): presumably controls whether unavailable actions are masked
# before the softmax is applied — confirm in the controller.
mask_before_softmax: false

# Names of the runner and "mac" implementations to use.
# NOTE(review): "mac" presumably stands for multi-agent controller, and both
# names are presumably looked up in the project's registries — confirm.
runner: "parallel"
mac: "dop_mac"

# Buffer and batch sizes.
# NOTE(review): units (episodes vs. timesteps) are not stated in this file —
# confirm against the runner and learner.
buffer_size: 64
off_buffer_size: 5000 # size of the off-policy buffer
# NOTE(review): the trailing "10" looks like a previously used value; the
# experiment name in this file contains "env=8", matching the current setting.
batch_size_run: 8 #10
batch_size: 32
off_batch_size: 64
# NOTE(review): presumably the total number of environment timesteps to train
# for — confirm.
t_max: 10050000

# Environment arguments (presumably forwarded to the environment constructor —
# confirm against the runner).
env_args:
  state_last_action: false # critic adds last action internally

# Update the target network every `target_update_interval` training steps.
target_update_interval: 600
step: 5 # number of steps for the tree-backup diagram (adjust based on the number of agents)


# Optimisation hyperparameters.
# NOTE(review): ent_coef is presumably the entropy-bonus coefficient (0 would
# disable it) — confirm in the learner.
ent_coef: 0
# NOTE(review): lr presumably applies to the agents/policy and critic_lr to the
# critic — confirm in the learner.
lr: 0.0005
critic_lr: 0.0001
# NOTE(review): given the experiment name ("tb+td_lambda"), these presumably
# weight the TD(lambda) and tree-backup targets respectively — confirm.
td_lambda: 0.8
tb_lambda: 0.93

# QMIX-related setting (original note: "use qmix").
# NOTE(review): presumably the embedding width of the mixing network — confirm
# which module reads this key.
mixing_embed_dim: 32

# COMA-related settings (original note: "use COMA"): agent output type, learner
# selection and critic configuration.
agent_output_type: "pi_logits"
learner: "offpg_learner"
critic_q_fn: "coma"
critic_baseline_fn: "coma"
critic_train_mode: "seq"
critic_train_reps: 1
q_nstep: 0  # 0 corresponds to default Q, 1 is r + gamma*Q, etc.
# NOTE(review): presumably the maximum gradient norm used for clipping —
# confirm in the learner.
grad_norm_clip: 20

# Experiment name.
# NOTE(review): "env=8" appears to mirror batch_size_run; "rmsprop" is not
# configured anywhere in this file, so the optimiser is presumably chosen in
# code — confirm the name still matches the actual setup.
name: "dop_env=8_rmsprop_tb+td_lambda"

# NOTE(review): presumably selects the run/entry routine to execute — confirm.
run: "dop_run"