
# Algorithm identity, action selection, rollout runner and batch sizing.
name: "poam"
on_policy: True
action_selector: "soft_policies"
mask_before_softmax: True

runner: "parallel"

buffer_size: 256
batch_size_run: 8 # presumably the number of environments run by the "parallel" runner — confirm
batch_size: 256 # NOTE(review): equal to buffer_size; presumably intentional since on_policy is True — confirm

save_extra_stats: False

# obs_* flags: optional extras for the agents' inputs (see per-key notes).
obs_agent_id: True
obs_last_action: True # POAM requires the encoder to observe agents' last actions
obs_state: False # required for mappo critic
obs_individual_obs: False # used for cv_critic only
obs_team_composition: False 

# Component selection by name: agent output type, learner, mac, agent and critic.
agent_output_type: "pi_logits"
learner: "poam_learner"
mac: "agent_owned_mac"
agent: "rnn_poam"
critic_type: "poam_critic"

# poam only
# NOTE(review): the "ed" in these keys presumably refers to POAM's
# encoder-decoder (L16 mentions an encoder) — confirm.

ed_pretrain_ts: 0 
ed_pretrain_epochs: 10000
ed_grad_norm_clip: 5
# NOTE(review): the key name implies True -> model uncontrolled agents only,
# False -> model both controlled and uncontrolled agents. Confirm against the
# learner before relying on this.
ed_model_uncontrolled_only: False

ed_epochs: 1
n_ed_minibatch: 1 # 2  (NOTE(review): bare "2" is unexplained; presumably an alternative value — confirm or remove)
n_ed_hidden_layers: 1
ed_hidden_dim: 64
embed_dim: 64
ed_lr: 0.0005 # same value as the main lr in this file
ed_bce_loss: False

# Entropy bonus, reward standardisation and return estimation.
entropy_coef: 0.05
standardise_rewards: True
use_gae: True # if False, use n-step returns
gae_lambda: 0.95
q_nstep: 5 # n for n-step returns; 1 corresponds to the normal r + gamma*V target (presumably only used when use_gae is False — confirm)

# Update-loop and optimiser settings.
epochs: 5
n_minibatch: 1
eps_clip: 0.1
lr: 0.0005
optim_alpha: 0.99
optim_eps: .00001 # i.e. 1e-5

# Value-loss and advantage options.
use_popart: False # use PopArt normalization (NOTE(review): original wording said "normalize rewards"; PopArt is usually applied to value targets — confirm)
clip_value_loss: False # limit how much value is allowed to change during PPO epochs
use_huber_loss: False
huber_delta: 10.0 # presumably only used when use_huber_loss is True — confirm
use_adv_std: True # standardize advantages

# Network input normalization and weight initialization.
use_obs_norm: True # apply layernorm to normalize inputs
use_orthogonal_init: True # if True, use orthogonal initialization; if False, use xavier uniform

trainable_agents_mask_actor: True # applies to open train only
trainable_agents_mask_critic: False # applies to open train only
mask_type: "team" # choices: [team, agent]
# team: multiply default terminated and filled masks
# agent: compute individual mask from available actions
# NOTE(review): this second option has also been labelled "ind"; confirm which
# string the learner actually accepts.

# --- Open train Parameters ---
# One entry per trained agent, each naming its loader and a path.
trained_agents:
  agent_0:
    agent_loader: poam_train_agent_loader
    agent_path: "" # empty string; presumably means no saved agent is loaded — confirm
