# --- QMIX specific parameters ---

# use epsilon greedy action selector
action_selector: "epsilon_greedy"
# NOTE(review): epsilon_start / epsilon_finish / epsilon_anneal_time are
# defined again in the COMA section further down (0.5 / 0.05 / 500000).
# Duplicate mapping keys are invalid YAML; loaders that tolerate them
# typically keep the last occurrence, so the values below never took effect.
# They are commented out to remove the duplicates without changing the
# loaded config. Confirm which schedule is actually intended.
#epsilon_start: 1.0
#epsilon_finish: 0.05
#epsilon_anneal_time: 50000

runner: "parallel_x"
# NOTE(review): batch_size_run is defined again further down in this file;
# the last definition (10, in the COMA section) is the one a last-wins loader
# keeps, so this entry was dead. Commented out to remove the duplicate key
# without changing the loaded value. Confirm which value is intended.
#batch_size_run: 32 #8
mac: "cate_broadcast_comm_mac_cen"

buffer_size: 10000  # 5000

# update the target network every {} episodes
# NOTE(review): target_update_interval is defined again in the COMA section
# further down (600). A last-wins loader keeps that later value, so this
# entry was dead; it is commented out to remove the duplicate key without
# changing the loaded value. Confirm which interval is intended.
#target_update_interval: 200

# agent, learner and mixer selection
agent: "rnn" # Default rnn agent
# NOTE(review): agent_output_type, learner, mixing_embed_dim and name are all
# defined again in the COMA section further down ("pi_logits",
# "offpg_learner", 32 and "offpg_smac"). A last-wins loader keeps those later
# values, so the entries below were dead; they are commented out to remove
# the duplicate keys without changing the loaded config. agent_output_type
# and name differ from the later values -- confirm which ones are intended.
#agent_output_type: "q"
#learner: "offpg_learner"
double_q: True
mixer: "qmix"
#mixing_embed_dim: 32

#name: "offpg_comm"

# NOTE(review): presumably the support of a distributional value estimate
# (atom bins spanning vmin..vmax) -- confirm against the consumer.
atom: 51
vmin: -10
vmax: 10

# Communication settings
comm: True
comm_embed_dim: 3
comm_method: "information_bottleneck_full"
c_beta: 1.0
comm_beta: 0.0001  # 0.001
comm_entropy_beta: 0.0  # 1e-6
gate_loss_beta: 0.00001
only_downstream: True  # False
use_IB: True
is_print: False

# NOTE(review): duplicate batch_size_run. A later definition in the COMA
# section (10) is the one a last-wins loader keeps, so this entry was dead.
# Commented out to remove the duplicate key without changing the loaded
# value. Confirm which value is intended.
#batch_size_run: 16

# comm_beta decay schedule
is_comm_beta_decay: False
comm_beta_start_decay: 20000000
# Written as 0.01 rather than 1e-2: YAML 1.1 loaders such as PyYAML only
# resolve exponent notation as a float when the mantissa contains a dot and
# the exponent is signed, so a bare 1e-2 is loaded as the string "1e-2".
comm_beta_target: 0.01
comm_beta_end_decay: 50000000

# comm_entropy_beta decay schedule
is_comm_entropy_beta_decay: False
comm_entropy_beta_start_decay: 20000000
# Written as 0.0001 rather than 1e-4: YAML 1.1 loaders such as PyYAML only
# resolve exponent notation as a float when the mantissa contains a dot and
# the exponent is signed, so a bare 1e-4 is loaded as the string "1e-4".
comm_entropy_beta_target: 0.0001
comm_entropy_beta_end_decay: 50000000

# mu cut-off switches and thresholds
is_cur_mu: False
is_rank_cut_mu: False
cut_mu_thres: 1.0
cut_mu_rank_thres: 80.0

# test-time cut settings
test_is_cut: False
test_is_cut_prob: False
test_cut_prob_thres: 0.0
test_cut_prob_list: [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
test_cut_list: [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
# test_cut_list: [0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10.]
test_is_print_once: False

# batch-RL switches and the id of the buffer to load
# NOTE(review): exact semantics of is_from_start are not visible here --
# confirm against the consumer.
is_batch_rl: True
is_from_start: True
load_buffer_id: 10001 # 9208

# --- COMA specific parameters ---

# action_selector: "multinomial"
epsilon_start: 0.5
epsilon_finish: 0.05
epsilon_anneal_time: 500000
mask_before_softmax: False

# runner and batch sizes; commented-out entries below are disabled
#runner: "parallel"

#buffer_size: 32
#off_buffer_size: 70000 # size of the off-policy buffer default: 5000
batch_size_run: 10 # 10
batch_size: 16
#off_batch_size: 32

#env_args:
#  state_last_action: False # critic adds last action internally

# update the target network every {} training steps
target_update_interval: 600
step: 5 # number of steps for the tree-backup diagram (adjustable based on the number of agents)


# step sizes and lambda coefficients
# NOTE(review): names suggest actor/critic learning rates and TD(lambda) /
# tree-backup lambdas -- confirm against the learner.
lr: 0.0005
critic_lr: 0.0001
td_lambda: 0.8
tb_lambda: 0.93

# use qmix
mixing_embed_dim: 32

# use COMA: policy-logit agent output with the off-policy learner and
# COMA critic settings
agent_output_type: "pi_logits"
learner: "offpg_learner"
critic_q_fn: "coma"
critic_baseline_fn: "coma"
critic_train_mode: "seq"
critic_train_reps: 1
q_nstep: 0  # 0 corresponds to default Q, 1 is r + gamma*Q, etc

# name of this configuration
name: "offpg_smac"
