# This is the configuration file for the HAD3QN algorithm.
# Random-seed settings.
seed:
  # whether to use the seed specified by `seed` below
  seed_specify: true
  # seed value
  seed: 1
# Compute-device settings.
device:
  # whether to use CUDA
  cuda: true
  # whether to set CUDA deterministic
  cuda_deterministic: true
  # argument passed to torch.set_num_threads
  torch_threads: 4
# Training settings.
train:
  # number of parallel environments used to collect training data
  n_rollout_threads: 20
  # total number of steps
  num_env_steps: 10000000
  # number of warmup steps
  warmup_steps: 10000
  # number of steps per train
  train_interval: 50
  # ratio of training iterations to train_interval
  update_per_train: 1
  # logging interval (currently unused); left unset
  log_interval: null
  # evaluation interval
  eval_interval: 1000
  # whether to use linear learning rate decay
  use_linear_lr_decay: false
  # whether to consider the case of truncation when an episode is done
  use_proper_time_limits: true
  # directory to load models from; when null, the models are randomly
  # initialised instead
  model_dir: null
# Evaluation settings.
eval:
  # whether to use evaluation
  use_eval: true
  # number of parallel environments used for evaluation
  n_eval_rollout_threads: 10
  # number of episodes per evaluation
  eval_episodes: 20
# Rendering settings.
render:
  # whether to render
  use_render: false
  # number of episodes to render
  render_episodes: 10
# Network and optimizer settings.
model:
  # --- network parameters ---
  # hidden layer sizes of the base MLP module
  base_hidden_sizes: [128, 128]
  # hidden layer sizes of the dueling V MLP module
  dueling_v_hidden_sizes: [128]
  # hidden layer sizes of the dueling A MLP module
  dueling_a_hidden_sizes: [128]
  # activation of the base module; one of:
  # sigmoid, tanh, relu, leaky_relu, selu, hardswish
  base_activation_func: relu
  # activation of the dueling V module; one of:
  # sigmoid, tanh, relu, leaky_relu, selu, hardswish
  dueling_v_activation_func: hardswish
  # activation of the dueling A module; one of:
  # sigmoid, tanh, relu, leaky_relu, selu, hardswish
  dueling_a_activation_func: hardswish

  # --- optimizer parameters ---
  # learning rate of the actor
  lr: 0.0005
  # learning rate of the critic
  critic_lr: 0.001
# Algorithm hyperparameters.
algo:
  # discount factor
  gamma: 0.95
  # off-policy buffer size
  buffer_size: 1000000
  # training batch size
  batch_size: 1000
  # coefficient for the soft update of the target model
  polyak: 0.005
  # epsilon-greedy parameter
  epsilon: 0.05
  # number of steps to look ahead
  n_step: 1
  # whether to share parameters among actors
  share_param: false
  # whether to use a fixed optimisation order
  fixed_order: false
# Logging settings.
logger:
  # directory that logs are written to
  log_dir: './results'
