
fc_hidden_dim: 64

egreedy_exploration: 1.0
min_exploration: 0.01
target_update_tau: 0.05
target_update_interval: 100
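# Note: both a soft-update rate (tau) and a hard-update interval are set
# here; which one applies depends on the learner implementation. The Polyak
# soft update is, schematically:
#   theta_target <- tau * theta_online + (1 - tau) * theta_target
# so tau = 0.05 moves the target network 5% of the way toward the online
# weights per update.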
learning_rate: 0.1
min_learning_rate: 0.00001
clip_grad_norm: 10
hypernet_layers: 1
hypernet_embed_dim: 64
mixing_embed_dim: 32
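# These dimensions size a QMIX-style monotonic mixer (MAVEN's mixer is the
# QMIX mixer additionally conditioned on the noise z). Schematically, with
# s the global state and hypernet outputs constrained non-negative:
#   w1, b1, w2, b2 = hypernet(s)
#   Q_tot = w2 . elu(w1 . [Q_1, ..., Q_n] + b1) + b2
# hypernet_layers/hypernet_embed_dim size the hypernetwork itself;
# mixing_embed_dim is the hidden width of the mixing layer.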
learner_update_freq: 3
double_q: True
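# double_q: True uses the double Q-learning target (van Hasselt et al.):
# actions are picked by the online network and evaluated by the target one,
#   y = r + gamma * Q_target(s', argmax_a Q_online(s', a))
# rather than taking the max over the target network directly.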
replay_buffer_size: 1000
batch_size_run: 32 # Off-policy training uses experience replay
optimizer_type: "rmsprop"
memory_warmup_size: 32
nonlinear_key: False
qtran_arch: "qtran_paper"
opt_loss: 1
nopt_min_loss: 0.1
network_size: "small"
batch_size: 32
buffer_size: 1000
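# Assuming a PyMARL-style episode buffer: training draws batch_size whole
# episodes uniformly from the most recent buffer_size episodes, so a
# 1000-episode buffer turns over its contents fairly quickly.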

action_selector: "epsilon_greedy"
epsilon_start: 1.0
epsilon_finish: 0.05
epsilon_anneal_time: 50000
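# Assuming the usual linear decay schedule, epsilon at environment step t is
#   epsilon(t) = max(epsilon_finish,
#                    epsilon_start - t * (epsilon_start - epsilon_finish)
#                                      / epsilon_anneal_time)
# e.g. epsilon(25000) = 1.0 - 25000 * 0.95 / 50000 = 0.525.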


rnn_discrim: False
rnn_agg_size: 32


noise_bandit: True
noise_bandit_lr: 0.1
noise_bandit_epsilon: 0.2
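# In MAVEN the latent noise z that conditions the joint policy can be drawn
# uniformly or from a learned hierarchical policy; noise_bandit: True picks
# the latter, a bandit over z with its own learning rate and epsilon-greedy
# exploration.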


# For pc mode, use buffer_size: 500


# Agents output Q-values; training is driven by the maven_learner below
agent_output_type: "q"


mixer: "maven"

skip_connections: False
hyper_initialization_nonzeros: 0

# --- Noise parameters ---
mac: "noise_mac"
agent: "noise_rnn"


noise_dim: 2

mi_loss: 1

discrim_size: 32
discrim_layers: 1

noise_embedding_dim: 32


mi_intrinsic: False
mi_scaler: 0.1
hard_qs: False
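# MAVEN's diversity objective maximises a variational lower bound on the
# mutual information MI(z; trajectory) via the discriminator sized above
# (discrim_size/discrim_layers); mi_loss weights that term in the learner's
# loss. mi_intrinsic (off here) would instead pay the MI term out as an
# intrinsic reward scaled by mi_scaler.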

bandit_epsilon: 0.1
bandit_iters: 8
bandit_batch: 64
bandit_buffer: 512
bandit_reward_scaling: 20
bandit_use_state: True
bandit_policy: False
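# Rough reading of the bandit knobs (exact semantics live in the bandit
# implementation): train for bandit_iters steps on batches of bandit_batch
# drawn from a buffer of bandit_buffer (state, z, return) entries, with
# returns scaled by bandit_reward_scaling; bandit_use_state conditions the
# choice of z on the initial state.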

gamma: 0.99
lr: 0.0005 # Learning rate for agents
critic_lr: 0.0005 # Learning rate for critics
recurrent_critic: False
optim_alpha: 0.99 # RMSProp alpha
optim_eps: 0.00001 # RMSProp epsilon
grad_norm_clip: 10 # Reduce magnitude of gradients above this L2 norm
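# Gradient clipping rescales the gradient when its global L2 norm exceeds
# the threshold:
#   g <- g * min(1, grad_norm_clip / ||g||_2)
# e.g. a gradient of norm 40 is scaled by 10/40 = 0.25. In PyTorch-based
# trainers optim_alpha and optim_eps typically map to torch.optim.RMSprop's
# alpha and eps arguments.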
entropy_scaling: 0.001

learner: "maven_learner"
name: "maven"
