---
# Experiment identity and runtime settings.

# Identifier of this experiment configuration.
task: 'PPO-Lag-highD-velocity'
# Group label; presumably used to group related runs — confirm against the consumer.
group: 'PPO-Lag'
# Compute device string handed to the consumer.
device: 'cuda'
# Verbosity level; presumably higher values log more — confirm.
verbose: 1
# Environment settings: which environment to build, where to save results,
# and how observations / rewards / costs are pre-processed.
env:
  # Relative path to the environment configuration file
  # (base directory is not visible here — TODO confirm).
  config_path: '../config/highD_velocity_constraint/highD_environment_configurations_no_velocity_penalty-40.yaml'
  # Environment ids for training and evaluation (identical in this config).
  train_env_id: 'commonroad-v1'
  eval_env_id: 'commonroad-v1'
  # Relative directory where models are saved.
  save_dir: '../save_model'
  # Presumably the key under which the cost is reported by the environment —
  # verify against the consumer.
  cost_info_str: 'cost'
  use_cost: true
  # Discount factors for reward and cost.
  reward_gamma: 0.99
  cost_gamma: 0.99
  # Negative-logic flags: false means the corresponding normalization stays on.
  dont_normalize_obs: false
  dont_normalize_reward: false
  # NOTE(review): this line was previously annotated "no cost", which
  # conflicts with use_cost being true above — confirm the intended setting.
  dont_normalize_cost: false
  record_info_names: ['ego_velocity_x', 'ego_velocity_y']
  # The dim of each record info in inputs=(obs, action).
  record_info_input_dims: [6, 7]

# Run-loop settings.
running:
  # Number of iterations to run (presumably training iterations — confirm).
  n_iters: 2000
  # Number of episodes used per evaluation.
  n_eval_episodes: 10
  # Save interval; presumably counted in iterations — verify against the consumer.
  save_every: 100

# PPO algorithm settings with separate reward and cost critics, plus
# penalty / PID-control coefficients (the task name indicates PPO-Lag).
PPO:
  policy_name: 'TwoCriticsMlpPolicy'

  # --- Optimization ---
  learning_rate: 0.0005  # 0.0003
  n_steps: 1024  # 2048
  n_epochs: 10
  clip_obs: 20

  # --- Discounting and GAE, reward and cost ---
  # The gammas match env.reward_gamma / env.cost_gamma in this file;
  # NOTE(review): confirm which copy the consumer actually reads.
  reward_gamma: 0.99
  reward_gae_lambda: 0.95
  cost_gamma: 0.99
  cost_gae_lambda: 0.95

  # --- Loss terms and clipping ---
  clip_range: 0.2
  ent_coef: 0.0
  reward_vf_coef: 0.5
  cost_vf_coef: 0.5
  max_grad_norm: 0.5
  use_sde: false
  sde_sample_freq: -1
  target_kl: 0.01

  # --- Network layer sizes ---
  shared_layers: null
  policy_layers: [64, 64]
  reward_vf_layers: [64, 64]
  cost_vf_layers: [64, 64]

  batch_size: null
  eval_every: 2048
  use_curiosity_driven_exploration: false
  # NOTE(review): the name suggests a step count, but the value is a boolean —
  # confirm how the consumer interprets it.
  warmup_timesteps: false
  reset_policy: false
  forward_timesteps: 5000
  clip_range_reward_vf: null
  clip_range_cost_vf: null

  # --- Penalty and PID control ---
  # Presumably these drive the Lagrangian penalty update — verify against
  # the consumer.
  penalty_initial_value: 1
  penalty_learning_rate: 0.01
  budget: 0
  proportional_control_coeff: 10
  integral_control_coeff: 0.0001
  derivative_control_coeff: 0
  pid_delay: 1
  proportional_cost_ema_alpha: 0.5
  derivative_cost_ema_alpha: 0.5
