# Run / environment setup.
# NOTE(review): both `seeds` (list) and `seed` (scalar) are defined; which one
# the consumer actually reads is not visible from this file — confirm.
seeds: [1606, 159, 2186]
seed: 23
env_name: "stochastic-bipartite-matching"
use_legal_actions: false
num_hidden: 64
discount: 1.0
# presumably the CVaR risk level (tail fraction) — confirm against the trainer
cvar_alpha: 0.25
num_quantile_samples: 1024

# Search options: name of the Q-value transform and two feature switches.
qtransform: "qtransform_by_parent_and_siblings"
use_q_value_head: false
is_naive: false

# Training
num_simulations: 32
vf_coeff: 0.25
sc_coeff: 10.0
# Scientific-notation floats carry an explicit decimal point on purpose:
# YAML 1.1 loaders (e.g. PyYAML) resolve a plain `5e-4` as a *string*,
# whereas `5.0e-4` is a float under both YAML 1.1 and 1.2.
lr: 5.0e-4
min_lr: 1.0e-5  # Minimum learning rate
lr_linear_decay: true  # Whether to linearly decay the learning rate
lr_anneal_iterations: 2000  # Number of iterations to decay the learning rate
max_grad_norm: 5.0
optim_eps: 1.0e-5
n_step: 5
target_tau: 1.0
target_update_interval: 5  # Update every X epochs
gumbel_start: 3.0
gumbel_end: 1.0
gumbel_anneal_iterations: 30  # Number of iterations to anneal gumbel scale

# Quantile settings (presumably for a quantile-Huber loss — confirm against
# the trainer).
num_quantiles: 64
huber_param: 1.0

# Buffer: batch sizes, sequence length and replay-buffer sizing.
eval_num_actors: 1024
selfplay_batch_size: 32
train_batch_size: 256
train_epochs_per_iter: 20  # For mountain car, this should be 100+
sample_sequence_length: 6
max_num_steps: 32
# NOTE(review): selfplay_batch_size * max_num_steps is 32 * 32 = 1024, not
# 32768. The configured value equals that product times 32 (the same number
# as learning_start) — confirm the intended sizing rule.
total_buffer_size: 32768  # = 32 * 32 * 32; see NOTE(review) above
learning_start: 32  # Iters of max_num_steps to prefill the buffer

# Placeholders for dynamic values.
# These are not tuned by hand; -1 acts as an "unset" sentinel (presumably
# overwritten at runtime once the environment is known — confirm).
num_actions: -1
is_state_vector: false

# Logging / run length
max_num_iters: 2000
eval_interval: 5  # presumably counted in iterations — confirm