# --- COMA specific parameters ---

# Action selection for the policy.
# NOTE(review): "soft_policies" presumably names a selector that samples
# from the policy's action distribution — confirm against the selector
# registry.
action_selector: "soft_policies"
# NOTE(review): the name suggests unavailable actions are masked before the
# softmax is applied — confirm in the agent/selector code.
mask_before_softmax: True


# Runner selection.
# NOTE(review): "parallel" presumably runs several environments at once, with
# batch_size_run being how many — confirm against the runner implementation.
runner: "parallel"

# Buffer and batch sizing. buffer_size and batch_size are equal here.
# NOTE(review): presumably both count episodes, so each training batch is the
# whole buffer — confirm.
buffer_size: 10
batch_size_run: 10
batch_size: 10

# Overrides nested under env_args.
# NOTE(review): presumably merged into the environment's constructor
# arguments — confirm where env_args is consumed.
env_args:
  state_last_action: False # critic adds last action internally

# Target-network update setting. As the key name indicates, the value is
# either an update interval (in training steps) or a soft-update
# coefficient tau.
# NOTE(review): presumably values > 1 are read as an interval, so 200 means
# a hard update every 200 training steps — confirm against the learner.
target_update_interval_or_tau: 200

# Learning rate.
# NOTE(review): which optimiser(s) consume it (actor, critic, or both) is not
# visible in this file — confirm in the learner.
lr: 0.0003

# Switches for extra agent-input features.
# NOTE(review): presumably obs_agent_id / obs_last_action append the agent's
# id / previous action to its observation; the meaning of obs_individual_obs
# cannot be determined from this file — confirm where these flags are read.
obs_agent_id: True
obs_last_action: False
obs_individual_obs: False


# Use COMA: select the COMA learner, with the agent emitting policy logits.
agent_output_type: "pi_logits"
learner: "coma_learner"
# NOTE(review): whether critic_q_fn is still read by the learner is not
# visible here — confirm.
critic_q_fn: "coma"
# Standardisation switches: off for returns, on for rewards.
# NOTE(review): the exact statistics used (per batch vs. running) are not
# visible here — confirm in the learner.
standardise_returns: False
standardise_rewards: True

# NOTE(review): presumably the hidden-layer width of the agent network (and
# possibly the critic) — confirm where hidden_dim is consumed.
hidden_dim: 64

# Agent architecture and critic / policy-gradient settings.
# NOTE(review): use_rnn presumably selects a recurrent agent over a
# feed-forward one — confirm in the agent registry.
use_rnn: True
# NOTE(review): whether critic_baseline_fn, critic_train_mode and
# critic_train_reps are all still read by the learner is not visible here —
# confirm before tuning them.
critic_baseline_fn: "coma"
critic_train_mode: "seq"
critic_train_reps: 1
# NOTE(review): presumably the weight of an entropy bonus in the policy
# loss — confirm in the learner.
entropy_coef: 0.001
q_nstep: 10  # 0 corresponds to default Q, 1 is r + gamma*Q, etc
# Quoted to match every other string value in this file; the parsed value is
# unchanged.
critic_type: "coma_critic"

# Name of this algorithm configuration.
# NOTE(review): presumably used to label runs/results — confirm.
name: "coma"
# NOTE(review): presumably the total number of environment timesteps to
# train for — confirm against the training loop.
t_max: 20050000