# Copyright 2023 OmniSafe Team. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

# Default hyperparameters. The environment-named sections later in this file
# (e.g. SafetyAntVelocity-v1) list per-environment values for a subset of these
# keys; presumably they are applied on top of these defaults -- confirm against
# the config loader.
defaults:
  # seed for random number generator
  seed: 0
  # training configurations
  train_cfgs:
    # device to use for training, options: cpu, cuda, cuda:0, cuda:0,1, etc.
    device: cpu
    # number of threads for torch
    torch_threads: 16
    # total number of steps to train
    total_steps: 1000000
    # number of vectorized environments
    vector_env_nums: 1
    # number of parallel agents, similar to A3C
    parallel: 1
  # dynamics (learned ensemble model) configurations
  dynamics_cfgs:
    # number of networks in the ensemble model
    num_ensemble: 5
    # elite size for the ensemble model
    # NOTE(review): the previous comment said "output size"; presumably this is
    # the number of ensemble members selected as elites -- confirm.
    elite_size: 5
    # size of hidden layers
    hidden_size: 200
    # whether to use decay loss
    use_decay: True
    # whether to predict reward
    predict_reward: True
    # whether to predict cost
    predict_cost: False
    # training batch size of dynamics
    batch_size: 256
    # maximum number of training epochs of dynamics
    max_epoch: 5
    # the reward size for dynamics prediction
    reward_size: 1
    # the cost size for dynamics prediction
    cost_size: 1
    # whether to compute cost during dynamics imagination
    use_cost: False
    # whether to use the terminal signal during dynamics imagination
    # NOTE(review): the previous comment was a copy of the `use_cost` one;
    # confirm the exact semantics against the dynamics model code.
    use_terminal: False
    # whether to use variance for dynamics imagination
    use_var: False
    # whether to use the reward critic for dynamics imagination
    use_reward_critic: True
    # whether to use the cost critic for dynamics imagination
    use_cost_critic: False
  # planner configurations
  planner_cfgs:
    # planning horizon
    plan_horizon: 7
    # number of planning iterations
    num_iterations: 5
    # the number of particles in planning
    num_particles: 4
    # the number of action samples in planning
    num_samples: 100
    # the number of candidate (elite) actions in planning
    num_elites: 20
    # the momentum coefficient for the mean and variance update in planning
    momentum: 0.1
    # the variance threshold in planning
    epsilon: 0.001
    # the initial variance of planning
    init_var: 0.01
    # mixture coefficient for neural actor and gaussian actor in planning
    mixture_coefficient: 0.05
    # the scale factor for reward in planning
    temperature: 10.0
  # evaluation configurations
  evaluation_cfgs:
    # whether to run evaluation
    use_eval: True
    # evaluation cycle
    eval_cycle: 10000
    # number of evaluation episodes
    num_episode: 1
  # algorithm configurations
  algo_cfgs:
    # number of steps per epoch
    steps_per_epoch: 20000
    # number of action repetitions
    action_repeat: 5
    # update cycle of the dynamics
    update_dynamics_cycle: 1200
    # the actor performs random actions before `start_learning_steps` steps
    start_learning_steps: 10000
    # normalize reward
    reward_normalize: False
    # normalize cost
    cost_normalize: False
    # normalize observation
    obs_normalize: False
    # whether to store done data to policy buffer
    policy_store_done: False
    # reward discount factor
    gamma: 0.99
    # cost discount factor
    cost_gamma: 1.0
    # policy buffer batch size
    policy_batch_size: 256
    # update cycle of the policy
    update_policy_cycle: 250
    # number of iterations to update the policy
    update_policy_iters: 50
    # max gradient norm
    max_grad_norm: 40
    # use critic norm
    use_critic_norm: False
    # critic norm coefficient
    critic_norm_coeff: 0.001
    # the soft update coefficient
    polyak: 0.005
    # the value of alpha
    alpha: 0.2
    # whether to use alpha discount factor
    alpha_discount: True
    # whether to use deterministic actor output in computation of loss pi
    loss_pi_deterministic: True
    # whether to use gradient normalization
    use_grad_norm: False
    # alpha discount factor
    alpha_gamma: 0.99
    # whether to use auto alpha
    auto_alpha: False
    # policy update delay cycle
    policy_delay: 1
    # whether to use cost
    use_cost: False
  # logger configurations
  logger_cfgs:
    # use wandb for logging
    use_wandb: False
    # wandb project name
    wandb_project: omnisafe
    # use tensorboard for logging
    use_tensorboard: True
    # save model frequency
    save_model_freq: 5
    # save logger path
    log_dir: "./runs"
    # window length used by the logger
    # NOTE(review): the previous comment said "save model path", which does not
    # match this integer key; presumably the smoothing window for logged
    # statistics -- confirm against the logger.
    window_lens: 10
  # actor-critic model configurations
  model_cfgs:
    # weight initialization mode
    weight_initialization_mode: "kaiming_uniform"
    # actor type
    actor_type: gaussian_sac
    # linear learning rate decay
    linear_lr_decay: False
    # configuration of the Actor network
    actor:
      # size of hidden layers
      hidden_sizes: [64, 64]
      # activation function
      activation: relu
      # the learning rate of the Actor network
      lr: 0.001
    # configuration of the Critic network
    critic:
      # the number of critic networks
      num_critics: 2
      # size of hidden layers
      hidden_sizes: [64, 64]
      # activation function
      activation: relu
      # the learning rate of the Critic network
      lr: 0.001

# Per-environment values for SafetyAntVelocity-v1. Every key below also exists
# under `defaults`, in the same sub-section.
SafetyAntVelocity-v1:
  algo_cfgs:
    action_repeat: 1
    update_policy_cycle: 1
    update_policy_iters: 1
    policy_delay: 2
    loss_pi_deterministic: False
    use_grad_norm: True
    alpha_discount: False
    policy_store_done: True
    # `alpha` is an algo_cfgs key in `defaults` (and in the HalfCheetah
    # section); it previously sat under planner_cfgs here, where no such
    # default key exists.
    alpha: 0.2
  planner_cfgs:
    plan_horizon: 3
    init_var: 0.1
    temperature: 0.1
  model_cfgs:
    actor:
      # size of hidden layers
      hidden_sizes: [256, 256]
      # the learning rate of the Actor network
      lr: 0.0003
    critic:
      # size of hidden layers
      hidden_sizes: [256, 256]
      # the learning rate of the Critic network
      lr: 0.0003

# Per-environment values for SafetyHalfCheetahVelocity-v1. Every key below also
# exists under `defaults`, in the same sub-section.
SafetyHalfCheetahVelocity-v1:
  algo_cfgs:
    action_repeat: 1
    update_policy_cycle: 1
    update_policy_iters: 1
    policy_delay: 2
    loss_pi_deterministic: False
    use_grad_norm: True
    alpha_discount: False
    policy_store_done: True
    alpha: 0.2
  planner_cfgs:
    plan_horizon: 3
    init_var: 0.1
    # written as a float so the value has the same type as the default (10.0)
    temperature: 1.0
  model_cfgs:
    actor:
      # size of hidden layers
      hidden_sizes: [256, 256]
      # the learning rate of the Actor network
      lr: 0.0003
    critic:
      # size of hidden layers
      hidden_sizes: [256, 256]
      # the learning rate of the Critic network
      lr: 0.0003
