# --- MAPPO_AR_REVERSE_NS specific parameters (experiment name: mappo_ar_reverse_ns) ---

# Experiment identifier.
name: 'mappo_ar_reverse_ns'

# --- Component selection (string identifiers resolved by the consuming code) ---
mac: 'seqreverse_mac'
agent: 'rnn'
agent_output_type: 'pi_logits'
learner: 'ppo_learner_ar'
critic_type: 'cv_critic_ns'
runner: 'parallel'
# NOTE(review): `agent` is 'rnn' while `use_rnn` is false; presumably the agent
# falls back to a non-recurrent layer -- confirm against the agent class.
use_rnn: false

# --- Action selection ---
action_selector: 'soft_policies'
mask_before_softmax: true

# --- Batching (all three sizes are set to the same value) ---
buffer_size: 10
batch_size_run: 10
batch_size: 10

#env_args:
#  state_last_action: False # critic adds last action internally

# --- Observation augmentation flags (all disabled) ---
obs_agent_id: false
obs_last_action: false
obs_individual_obs: false

# --- Optimisation ---
lr: 0.0003
epochs: 4
eps_clip: 0.2
entropy_coef: 0.01
q_nstep: 5  # 1 corresponds to normal r + gammaV

# Target-network update setting.
# NOTE(review): going by the key name, presumably a hard-update interval in
# training steps or a soft-update tau -- confirm how the learner reads it.
target_update_interval_or_tau: 200

# --- Normalisation ---
standardise_returns: false
standardise_rewards: true

#t_max: 20050000