# Copyright 2023 OmniSafe Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Default settings. The environment-named top-level keys later in this file
# repeat some of the algo_cfgs keys with other values (presumably
# per-environment overrides -- confirm against the config loader).
defaults:
  # random seed
  seed: 0
  # settings for the training run
  train_cfgs:
    # training device; options: cpu, cuda, cuda:0, cuda:0,1, etc.
    device: cpu
    # thread count for torch
    torch_threads: 16
    # how many vectorized environments to run
    vector_env_nums: 1
    # how many parallel agents to run (similar to A3C)
    parallel: 1
    # total training steps
    total_steps: 10000000
  # settings for the algorithm
  algo_cfgs:
    # steps per epoch, i.e. per policy update
    steps_per_epoch: 20000
    # update iterations per policy update
    update_iters: 40
    # batch size used in each iteration
    batch_size: 64
    # target KL divergence
    target_kl: 0.02
    # coefficient of the entropy term
    entropy_coef: 0.0
    # whether to normalize rewards
    reward_normalize: false
    # whether to normalize costs
    cost_normalize: false
    # whether to normalize observations
    obs_normalize: true
    # stop early once the KL divergence exceeds target_kl
    kl_early_stop: true
    # whether to apply a maximum gradient norm
    use_max_grad_norm: true
    # maximum gradient norm
    max_grad_norm: 40.0
    # whether to use the critic norm
    use_critic_norm: true
    # coefficient of the critic norm
    critic_norm_coef: 0.001
    # discount factor for rewards
    gamma: 0.99
    # discount factor for costs
    cost_gamma: 0.99
    # lambda for GAE
    lam: 0.95
    # lambda for cost GAE
    lam_c: 0.95
    # clip ratio
    clip: 0.2
    # how advantages are estimated; options: gae, retrace
    adv_estimation_method: gae
    # whether to standardize the reward advantage
    standardized_rew_adv: true
    # whether to standardize the cost advantage
    standardized_cost_adv: true
    # penalty coefficient
    penalty_coef: 0.0
    # whether to use the cost
    use_cost: false
    # safety budget (equal to the cost limit)
    safety_budget: 25.0
    # upper bound of the safety budget
    upper_budget: 25.0
    # saute gamma
    saute_gamma: 0.999
    # maximum episode length
    max_ep_len: 1000
    # reward given when the agent is unsafe
    unsafe_reward: -1.0
  # settings for the logger
  logger_cfgs:
    # whether to log to wandb
    use_wandb: true
    # name of the wandb project
    wandb_project: omnisafe
    # whether to log to tensorboard
    use_tensorboard: true
    # how often the model is saved
    save_model_freq: 100
    # path where the logger saves its output
    log_dir: './runs'
    # window length used by the logger (presumably the number of recent
    # values averaged when reporting statistics -- TODO confirm)
    window_lens: 100
  # settings for the models
  model_cfgs:
    # mode used to initialize the weights
    weight_initialization_mode: kaiming_uniform
    # type of actor; options: gaussian, gaussian_learning
    actor_type: gaussian_learning
    # whether the learning rate decays linearly
    linear_lr_decay: true
    # whether the exploration noise is annealed
    exploration_noise_anneal: false
    # std range, written as [upper bound, lower bound]
    std_range: [0.5, 0.1]
    # settings for the actor network
    actor:
      # sizes of the hidden layers
      hidden_sizes: [64, 64]
      # activation function
      activation: tanh
      # out_activation: tanh
      # learning rate
      lr: 0.0003
    # settings for the critic network
    critic:
      # sizes of the hidden layers
      hidden_sizes: [64, 64]
      # activation function
      activation: tanh
      # learning rate
      lr: 0.0003
  # settings for the controller
  control_cfgs:
    # Kp gain of the PID controller
    kp: 0.0005
    # Ki gain of the PID controller
    ki: 0.00001
    # Kd gain of the PID controller
    kd: 0.0
    # polyak coefficient of the target
    polyak: 0.995

# Settings for SafetyCarGoal1-v0; each value below differs from the one
# under defaults.algo_cfgs.
SafetyCarGoal1-v0:
  # settings for the algorithm
  algo_cfgs:
    # update iterations per policy update
    update_iters: 80
    # whether to normalize observations
    obs_normalize: false
    # saute gamma
    saute_gamma: 0.9999
    # reward given when the agent is unsafe
    unsafe_reward: -0.2

# Settings for SafetyCarGoal2-v0; each value below differs from the one
# under defaults.algo_cfgs.
SafetyCarGoal2-v0:
  # settings for the algorithm
  algo_cfgs:
    # update iterations per policy update
    update_iters: 80
    # whether to normalize observations
    obs_normalize: false
    # saute gamma
    saute_gamma: 0.9999
    # reward given when the agent is unsafe
    unsafe_reward: -0.2

# Settings for SafetyPointCircle1-v0; each value below differs from the one
# under defaults.algo_cfgs.
SafetyPointCircle1-v0:
  # settings for the algorithm
  algo_cfgs:
    # update iterations per policy update
    update_iters: 80
    # saute gamma
    saute_gamma: 0.9999
    # reward given when the agent is unsafe
    unsafe_reward: -0.2

# Settings for SafetyPointCircle2-v0; each value below differs from the one
# under defaults.algo_cfgs.
SafetyPointCircle2-v0:
  # settings for the algorithm
  algo_cfgs:
    # update iterations per policy update
    update_iters: 80
    # saute gamma
    saute_gamma: 0.9999
    # reward given when the agent is unsafe
    unsafe_reward: -0.2

# Settings for SafetyPointGoal1-v0; each value below differs from the one
# under defaults.algo_cfgs.
SafetyPointGoal1-v0:
  # settings for the algorithm
  algo_cfgs:
    # update iterations per policy update
    update_iters: 80
    # whether to normalize observations
    obs_normalize: false
    # saute gamma
    saute_gamma: 0.9999
    # reward given when the agent is unsafe
    unsafe_reward: -0.2

# Settings for SafetyPointGoal2-v0; each value below differs from the one
# under defaults.algo_cfgs.
SafetyPointGoal2-v0:
  # settings for the algorithm
  algo_cfgs:
    # update iterations per policy update
    update_iters: 80
    # whether to normalize observations
    obs_normalize: false
    # saute gamma
    saute_gamma: 0.9999
    # reward given when the agent is unsafe
    unsafe_reward: -0.2
