# Run / environment settings.
seeds: [159, 1606, 2186, 3462, 8175]
# NOTE(review): -1 looks like an "unset" sentinel, presumably replaced per run
# by an entry of `seeds` - confirm against the loader.
seed: -1
env_name: 'grid-risk-v2'
use_legal_actions: false
num_hidden: 256
discount: 1.0
# NOTE(review): presumably the CVaR risk level - confirm against the training code.
cvar_alpha: 0.25
num_quantile_samples: 4096

use_q_value_head: false
is_naive: false

# Training
num_simulations: 32
vf_coeff: 0.25
sc_coeff: 2.0
# Exponent floats carry an explicit decimal point: YAML 1.1 resolvers (e.g.
# PyYAML) load `5e-3` as a string, whereas `5.0e-3` is a float in every loader.
lr: 5.0e-3
min_lr: 1.0e-5  # Minimum learning rate
lr_linear_decay: true  # Whether to linearly decay the learning rate
lr_anneal_iterations: 200  # Number of iterations to decay the learning rate
max_grad_norm: 5.0
optim_eps: 1.0e-5
n_step: 5
target_tau: 1.0
gumbel_start: 3.0
gumbel_end: 1.0
gumbel_anneal_iterations: 30  # Number of iterations to anneal gumbel scale
history_length: 10  # Number of previous steps to consider in the history

# Q-value transform and quantile settings.
# NOTE(review): the string presumably selects a transform by name - confirm the
# accepted values in the consuming code.
qtransform: "qtransform_by_parent_and_siblings"
# NOTE(review): presumably the Huber threshold used by the quantile loss - confirm.
huber_param: 1.0
num_quantiles: 64

# Buffer
eval_num_actors: 1024
selfplay_batch_size: 32
train_batch_size: 1024
target_update_interval: 5  # Update every X epochs
train_epochs_per_iter: 20
sample_sequence_length: 6
max_num_steps: 24
# Written without a `_` digit separator: YAML 1.2 loaders read `65_280` as a
# string, while `65280` is an integer in every loader.
# The earlier note described this as a multiple of
# selfplay_batch_size * max_num_steps and gave "32 * 30 * 24".
# NOTE(review): 32 * 30 * 24 = 23040, not 65280 (65280 = 32 * 24 * 85) -
# confirm whether the value or the formula is the intended one.
total_buffer_size: 65280
learning_start: 30  # Iters of max_num_steps to prefill the buffer

# Placeholders for dynamic values: the literals below are stand-ins.
# NOTE(review): presumably overwritten at runtime once the environment is known - confirm.
num_actions: -1
is_state_vector: false

# Logging
# NOTE(review): presumably counted in training iterations, like max_num_iters - confirm.
eval_interval: 5
max_num_iters: 1000