# Multi-environment training setup.
training:
  # Currently equal to the number of entries in `envs` below.
  # NOTE(review): presumably the two must stay in sync - confirm against the loader.
  num_envs: 4
  envs:
    - "GetEKey"
    - "GetHKey"
    - "EKeyGoal"
    - "HKeyGoal"
  # One of: q-value | thompson | entropy
  strategy: "q-value"
  episodes_in_each_iter: 10
  run_number: 0

# Environment, logging and action-noise settings.
env:
  render_mode: false
  # true = continuous action space, false = discrete
  has_continuous_action_space: false
  # max timesteps in one episode
  max_ep_len: 500
  # break training loop if timesteps > max_training_timesteps
  max_training_timesteps: 1000000000
  # print avg reward in the interval (in num timesteps)
  print_freq: 5000
  # log avg reward in the interval (in num timesteps)
  log_freq: 1000
  # save model frequency (in num timesteps)
  save_model_freq: 100000

  # NOTE(review): the action_std_* keys below presumably only take effect when
  # has_continuous_action_space is true - confirm against the trainer.
  # starting std for action distribution (Multivariate Normal)
  action_std: 0.6
  # linearly decay action_std (action_std = action_std - action_std_decay_rate)
  action_std_decay_rate: 0.05
  # minimum action_std (stop decay after action_std <= min_action_std)
  min_action_std: 0.1
  # action_std decay frequency (in num timesteps)
  action_std_decay_freq: 250000

  random_seed: 0
# Note: print_freq and log_freq should be greater than max_ep_len.

################ PPO hyperparameters ################
ppo:
  # The policy is updated every `update_timestep` timesteps.
  update_timestep: 3000
  # Number of epochs the policy is trained for in one PPO update.
  K_epochs: 15
  # Clip parameter for PPO.
  eps_clip: 0.2
  # Discount factor.
  gamma: 0.99
  # Learning rates for the actor and critic networks.
  lr_actor: 0.0004
  lr_critic: 0.001
  # Set a random seed if required (0 = no random seed).
  random_seed: 0