# Experiment configuration. Top-level keys below identify the run; the
# sections that follow (env, running, PPO, CN, Distributional) hold the
# per-component settings. Key semantics are defined by the loading code,
# which is not part of this file.
task: 'ICRL-Walker'
group: 'CICRL'
# presumably the compute device string handed to the ML framework — confirm
device: 'cuda'
# Verbosity level; what level 2 prints is decided by the consumer.
verbose: 2
# Environment settings. Notes marked "presumably" are assumptions drawn from
# key names only and should be confirmed against the loading code.
env:
  config_path: null
  # Training and evaluation use the same environment id.
  train_env_id : 'WalkerWithPosNoise-v0'
  eval_env_id: 'WalkerWithPosNoise-v0'
  # Relative path; the directory it resolves against depends on the consumer.
  save_dir: '../save_model'
  # presumably the key under which the environment reports cost — confirm
  cost_info_str: 'cost'
  use_cost: True
  # Same values as PPO.reward_gamma / PPO.cost_gamma further down.
  reward_gamma: 0.99
  cost_gamma: 0.99
  # Negatively-named flags: False presumably leaves the corresponding
  # normalization (observations / rewards / costs) enabled — confirm.
  dont_normalize_obs: False
  dont_normalize_reward: False
  dont_normalize_cost: False
  # The three lists below each hold one entry and appear to be parallel
  # (entry i of each describes the same recorded quantity) — confirm.
  record_info_names: ['x_position']
  record_info_input_dims: [ 0 ] # index of each recorded info within inputs=(obs, action)
  visualize_info_ranges: [ [ -10, 10 ] ]
  # presumably the mean / standard deviation of the noise implied by the
  # 'WithPosNoise' environment id — confirm
  noise_mean: 0
  noise_std: 0.01

# Outer training-loop settings: iteration count, evaluation, checkpointing,
# and rollout/data handling. Units and exact meaning are defined by the
# training script, which is not visible from this file.
running:
  n_iters: 40
  n_eval_episodes: 10
  # presumably counted in iterations (would checkpoint 4 times over
  # n_iters: 40) — confirm the unit
  save_every: 10
  expert_rollouts: 50
  sample_rollouts: 50
  store_sample_rollouts: null
  # NOTE(review): the path names 'BlockedWalker' while the env ids above are
  # 'WalkerWithPosNoise-v0' — confirm this is the intended expert dataset.
  expert_path: '../data/expert_data/BlockedWalker/'
  use_buffer: False
  store_by_game: True

# Policy-optimization settings. The key names suggest a PPO learner with
# separate reward and cost critics and a PID-style penalty controller;
# confirm details against the code that reads this section.
PPO:
  policy_name: 'DistributionalTwoCriticsMlpPolicy'
  learning_rate: 0.0001
  n_steps: 2048
  n_epochs: 20
  # Same values as env.reward_gamma / env.cost_gamma above.
  reward_gamma: 0.99
  reward_gae_lambda: 0.9
  cost_gamma: 0.99
  cost_gae_lambda: 0.9
  clip_range: 0.4
  ent_coef: 0.0
  reward_vf_coef: 0.5
  cost_vf_coef: 0.5
  max_grad_norm: 0.5
  use_sde: False
  sde_sample_freq: -1
  target_kl: 0.02
  # Network layer sizes; shared_layers is null while the policy and both
  # value heads each list two layers of 64 units.
  shared_layers: null
  policy_layers: [64, 64]
  reward_vf_layers: [64, 64]
  cost_vf_layers: [64, 64]
  batch_size: 128
  # Equal to n_steps above (2048).
  eval_every: 2048
  use_curiosity_driven_exploration: False
  # NOTE(review): boolean value on a key named like a step count — confirm
  # the consumer treats False as "no warmup".
  warmup_timesteps: False
  reset_policy: False
  forward_timesteps: 200000
  clip_range_reward_vf: null
  clip_range_cost_vf: null
  # Penalty / constraint-controller settings. presumably a Lagrangian
  # penalty driven by a PID controller against `budget` — confirm.
  penalty_initial_value: 0.1
  penalty_learning_rate: 0.05
  budget: 0
  proportional_control_coeff: 10
  integral_control_coeff: 0.0001
  derivative_control_coeff: 0
  pid_delay: 1
  proportional_cost_ema_alpha: 0.5
  derivative_cost_ema_alpha: 0.5
  cost_adv: True
  input_action: False

# Settings prefixed `cn_` plus related sampling/training options.
# presumably "CN" is the constraint network learned from expert data —
# confirm against the code that reads this section.
CN:
  cn_learning_rate: 0.001
  cn_reg_coeff: 0.6
  cn_layers: [64, 64]
  cn_batch_size: null
  # For the two selectors below, null selects every observation / action
  # dimension.
  cn_obs_select_name: null  # null means all
  cn_acs_select_name: null  # null means all
  # Negatively-named flag: False presumably keeps importance sampling
  # enabled — confirm.
  no_importance_sampling: False
  per_step_importance_sampling: True
  clip_obs: 20
  cn_normalize: False
  # presumably KL thresholds between old and new distributions, one per
  # direction, used as a stopping criterion — confirm
  cn_target_kl_old_new: 10
  cn_target_kl_new_old: 2.5
  train_gail_lambda: False
  cn_eps: 0.00001
  backward_iters: 5
  anneal_clr_by_factor: 0.9
  # The three mode selectors below are plain strings; the set of accepted
  # values is defined by the consumer.
  mode: 'sample'
  generated_mode: pair
  train_mode: separate
  generated_weight: 0.01
  sample_flow_num: 50

# Settings for the distributional (quantile-based) critic. Semantics are
# defined by the consumer; notes marked "presumably" need confirmation.
Distributional:
  method: SplineDQN
  N_quantiles: 64
  # presumably an index or count relative to N_quantiles (48 of 64 = 0.75)
  # — confirm
  cost_quantile: 48
  # presumably the soft-update rate for a target network — confirm
  tau_update: 0.01
  LR_QN: 0.0003
  qnet_layers: [256, 256]
  # Risk-measure selector; the set of accepted values is defined by the
  # consumer.
  type: CVaR
  prob_yita: null