# This is the configuration file for the HASAC algorithm.
seed:
  # whether to use the specified seed
  seed_specify: True
  # seed
  seed: 1
device:
  # whether to use CUDA
  cuda: True
  # whether to set CUDA deterministic
  cuda_deterministic: True
  # arg to torch.set_num_threads
  torch_threads: 4
train:
  # number of parallel environments for training data collection
  n_rollout_threads: 20
  # number of total steps
  num_env_steps: 20000000
  # number of warmup steps
  warmup_steps: 10000
  # number of steps per train
  train_interval: 50
  # ratio of training iterations to train_interval
  update_per_train: 1
  # logging interval (currently unused)
  log_interval: ~
  # evaluation interval
  eval_interval: 10000
  # whether to use ValueNorm
  use_valuenorm: True
  # whether to use linear learning rate decay
  use_linear_lr_decay: False
  # whether to consider the case of truncation when an episode is done
  use_proper_time_limits: True
  # if set, load models from this directory; otherwise, randomly initialise the models
  model_dir: ~
eval:
  # whether to use evaluation
  use_eval: True
  # number of parallel environments for evaluation
  n_eval_rollout_threads: 20
  # number of episodes per evaluation
  eval_episodes: 40
render:
  # whether to use render
  use_render: False
  # number of episodes to render
  render_episodes: 10
model:
  # network parameters
  # hidden sizes for mlp module in the network
  hidden_sizes: [256, 256]
  # activation function, choose from sigmoid, tanh, relu, leaky_relu, selu
  activation_func: relu
  # whether to use feature normalization
  use_feature_normalization: True
  # final activation function, choose from sigmoid, tanh, relu, leaky_relu, selu
  final_activation_func: tanh
  # initialization method for network parameters, choose from xavier_uniform_, orthogonal_, ...
  initialization_method: orthogonal_
  # gain of the output layer of the network.
  gain: 0.01
  # optimizer parameters
  # actor learning rate
  lr: 0.0005
  # critic learning rate
  critic_lr: 0.0005
algo:
  # whether to auto-tune temperature
  auto_alpha: False
  # temperature parameter
  alpha: 0.001
  # alpha learning rate
  alpha_lr: 0.0003
  # discount factor
  gamma: 0.99
  # off-policy buffer size
  buffer_size: 1000000
  # training batch size
  batch_size: 1000
  # coefficient for target model soft update
  polyak: 0.005
  # the number of steps to look ahead
  n_step: 20
  # whether to use huber loss
  use_huber_loss: True
  # whether to use policy active masks
  use_policy_active_masks: True
  # huber delta
  huber_delta: 10.0
  # whether to share parameter among actors
  share_param: False
  # whether to use a fixed optimisation order
  fixed_order: False
logger:
  # logging directory
  log_dir: "./results"
