# --- MADDPG specific parameters ---
# Conventions in this file: booleans are lowercase true/false, unset values
# are written as an explicit null, and string values are double-quoted.
# The "D" tag on a few lines is carried over from the original file; its
# meaning is not stated here.
action_range: null
action_selector: "gumbel"
epsilon_start: 0.5
epsilon_finish: 0.05
epsilon_anneal_time: 50000
agent: "qmixrnn"
obs_agent_id: true  # Include the agent's one_hot id in the observation
# Include the agent's last action (one_hot) in the observation
obs_last_action: true
agent_output_type: "pi_logits"
batch_size_run: 1
batch_size: 32
buffer_size: 5000
buffer_warmup: 0
discretize_actions: false
double_q: false
epsilon_decay_mode: null
exploration_mode: "gaussian"
# Number of steps for uniform-random action selection, before running the
# real policy. Helps exploration.
start_steps: 0
# Stddev for Gaussian exploration noise added to policy at training time.
act_noise: 0.1
ou_theta: 0.15  # D
ou_sigma: 0.2  # D
ou_noise_scale: 0.3
final_ou_noise_scale: 0.0
gamma: 0.99
grad_norm_clip: 10
learner: "maddpg_learner_discrete"
learn_interval: 1
lr: 0.0025
critic_lr: 0.0005
td_lambda: 0.8
critic_train_reps: 1
q_nstep: 0  # 0 corresponds to default Q, 1 is r + gamma*Q, etc
mac: "basic_mac"
name: "maddpg_smac"
n_runners: null
n_train: 1
optimizer: "adam"  # D
ou_stop_episode: 100  # training noise goes to zero after this episode
rnn_hidden_dim: 64
run_mode: null
runner: "episode"
runner_scope: "episodic"
target_update_interval: 200
target_update_mode: "hard"
target_update_tau: 0.001
test_greedy: true
test_interval: 10000
test_nepisode: 32
testing_on: true
t_max: 2000000
save_model: false
save_model_interval: 10000
verbose: false
weight_decay: true
weight_decay_factor: 0.0001
env_args:
  state_last_action: false  # critic adds last action internally
agent_return_logits: false
q_embed_dim: 1
mask_before_softmax: true