# Suffix string for this run's log folder
# (presumably appended to the log directory name — confirm with the launcher).
log_folder_suffix: dev_testfallback
# Compute device selection.
resources:
  device: cpu
# Debug switches.
debug_config:
  # This key used to be listed twice (`False`, then `true`). Duplicate keys
  # are invalid YAML; last-wins loaders resolved the pair to `true`, so that
  # effective value is the single one kept here.
  debug_flag: true
  # NOTE(review): `debugging.show_progress` is defined separately (false);
  # confirm which of the two the consumer reads.
  show_progress: true
# Logging and seeding options.
debugging:
  logdir: log
  logger: tensorboard
  seed: 0
  # NOTE(review): `debug_config.show_progress` is also defined (as true);
  # confirm which of the two the consumer reads.
  show_progress: false
# Resume / watch options.
running:
  # Both resume fields are unset. They are written as an explicit `null`
  # instead of a bare `key:` so the empty value is clearly intentional.
  resume_path: null
  resume_id: null
  watch: false
# Data-collection settings. The epoch/step keys below correspond to the
# off-policy trainer arguments: epoch, step_per_epoch, step_per_collect.
rollouts:
  num_envs_per_worker: 1
  exploration_noise: true
  buffer_size: 1000000
  # Number of initial uniform (random) steps; presumably taken before the
  # policy starts acting — confirm.
  start_timesteps: 10000
  max_epoch: 200
  # Number of transitions collected per epoch.
  step_per_epoch: 5000
  # Number of transitions the collector gathers before each network update:
  # within an epoch the trainer repeatedly collects `step_per_collect`
  # transitions and then performs policy-network gradient step(s).
  step_per_collect: 1
# Environment to train on.
environment:
  env: HalfCheetah-v4
# Evaluation settings.
evaluation:
  # The number of evaluation envs is also the number of episodes evaluated.
  eval_num_envs_per_worker: 10
  # NOTE(review): meaning/units of `render` are not visible here — confirm.
  render: 0.0
# Optimisation hyper-parameters.
training:
  # Off-policy trainer batch size, used as
  # `losses = self.policy.update(self.batch_size, self.train_collector.buffer)`.
  batch_size: 256
  # Gradient steps per collect. In OffpolicyTrainer, `__next__` calls
  # `.policy_update_fn()`, which runs
  # `for _ in range(round(self.update_per_step * result["n/st"]))`.
  gradient_step_per_collect: 1
  # Learning rates. The explicit decimal point makes these resolve as floats
  # under YAML 1.1 loaders as well, without a type tag.
  a_actor_lr: 1.0e-3
  a_critic_lr: 1.0e-3
  o_actor_lr: 1.0e-3
  o_critic_lr: 1.0e-3
  gamma: 0.99
  # Soft-update coefficient of the target network
  # (\rho in OpenAI Spinning Up).
  tau: 0.005
  # Number of look-ahead steps for the TD return; defaults to 1.
  estimation_step: 1
# VMOC-specific training options.
# NOTE(review): the `a_` / `o_` prefixes presumably stand for action-level
# and option-level terms — confirm against the consumer.
vmoc_training:
  use_MI: false
  # Alpha coefficients.
  a_Q_alpha: 0.1
  o_Q_alpha: 0.1
  a_P_alpha: 0.1
  o_P_alpha: 0.1
  auto_alpha: false
  # Alpha learning rates (presumably only relevant when `auto_alpha` is
  # true — confirm). The decimal point keeps them floats under YAML 1.1.
  a_alpha_lr: 1.0e-3
  o_alpha_lr: 1.0e-3
  # Embedding detach switches.
  detach_pO_oembed: false
  detach_pA_oembed: true
  detach_qA_oembed: true
# Network architecture: two hidden-size entries of 256 each.
net:
  hidden_sizes: [256, 256]
# Settings for the `somo` component.
somo:
  num_options: 4
  # NOTE(review): presumably the model/embedding width — confirm.
  dmodel: 20
