# Top-level experiment settings.
# NOTE(review): unless a comment describes what a value literally does,
# key meanings are inferred from names only - confirm against the code
# that consumes this file.
group: ICRL
verbose: false
env_type: 'gym'
env_id: 'CartPole-Middle'
# The three entries below are Python expression strings calling
# tools.environments.create with identical arguments. They repeat the
# env_type / env_id values above verbatim, so keep them in sync.
env: "tools.environments.create('gym', 'CartPole-Middle', normalize_states=False)"
test_env: "tools.environments.create('gym', 'CartPole-Middle', normalize_states=False)"
sampling_env: "tools.environments.create('gym', 'CartPole-Middle', normalize_states=False)"
# NOTE(review): reward_gamma and discount_factor hold the same value here;
# whether they must stay equal is not visible from this file.
reward_gamma: 0.99
discount_factor: 0.99
learning_rate: 0.0005
expert_episodes: 50
window: 25
normalize_func: "max"
# SECURITY NOTE(review): the values below are Python source strings
# (lambdas). They are presumably evaluated by the loader - confirm - so
# this file must only ever come from a trusted source.
#
# True when the first state component is > 1 with action 1, or < -1 with
# action 0.
cost_condition: "lambda s, a: (s[0] > 1. and a == 1) or (s[0] < -1. and a == 0)"
# Calls tools.utils.mse on the two arguments.
cost_comparison: "lambda mc, c: __import__('tools').utils.mse(mc, c)"
# Sums tools.utils.wasserstein_distance2d over row 0 and row 1 of the two
# arguments, each row reshaped to a single-row 2-D array.
accrual_comparison: "lambda ea, a: __import__('tools').utils.wasserstein_distance2d(ea[0, :].reshape(1, -1), a[0, :].reshape(1, -1)) + __import__('tools').utils.wasserstein_distance2d(ea[1, :].reshape(1, -1), a[1, :].reshape(1, -1))"
# Flattens the state components followed by the action into one list.
input_format: "lambda s, a: [*s, a]"
# Concatenates S and A along the last dimension; requires the name `torch`
# to be resolvable wherever this string is evaluated.
vector_input_format: "lambda S, A: torch.cat((S, A), dim=-1)"
# Keeps only the first state component.
state_reduction: "lambda s: s[:1]"
# Same reduction applied on the third axis of S.
vector_state_reduction: "lambda S: S[:, :, :1]"
# Identity: actions are passed through unchanged.
action_reduction: "lambda a: a"
vector_action_reduction: "lambda A: A"
# NOTE(review): the purpose of `i` is not evident from this file.
i: 2
beta: 0.01
plot_interval: 10

# Run-loop sizes.
# NOTE(review): meanings are inferred from the key names only (rollouts
# sampled, evaluation episodes, outer iterations) - confirm against the
# consuming code.
running:
  sample_rollouts: 50
  n_eval_episodes: 100
  n_iters: 200

# Settings grouped under the PPO key.
# NOTE(review): per-key meanings are inferred from names only - confirm
# against the consuming code.
PPO:
  policy_name: 'TwoCriticsMlpPolicy'
  # Same value as the top-level learning_rate in this file.
  learning_rate: 0.0005
  n_steps: 2000
  n_epochs: 25
  # Separate gamma / GAE-lambda pairs are given for reward and for cost.
  reward_gamma: 0.99
  reward_gae_lambda: 0.99
  cost_gamma: 0.99
  cost_gae_lambda: 0.99
  clip_range: 0.1
  ent_coef: 0.01
  reward_vf_coef: 0.5
  cost_vf_coef: 0.5
  max_grad_norm: 0.5
  use_sde: false
  sde_sample_freq: -1
  target_kl: 0.02
  # Layer-size lists; shared_layers is explicitly null.
  shared_layers: null
  policy_layers: [64, 64]
  reward_vf_layers: [64, 64]
  cost_vf_layers: [64, 64]
  batch_size: 64
  use_curiosity_driven_exploration: false
  # NOTE(review): boolean value although the name suggests a step count;
  # presumably false disables warmup - confirm before changing the type.
  warmup_timesteps: false
  reset_policy: false
  forward_timesteps: 10000
  clip_range_reward_vf: null
  clip_range_cost_vf: null
  # Penalty / budget settings followed by proportional, integral and
  # derivative control coefficients (names suggest a PID-style update of
  # the penalty - TODO confirm).
  penalty_initial_value: 0.1
  penalty_learning_rate: 0.01
  budget: 0
  proportional_control_coeff: 10
  integral_control_coeff: 0.0001
  derivative_control_coeff: 0
  pid_delay: 1
  proportional_cost_ema_alpha: 0.5
  derivative_cost_ema_alpha: 0.5

# Settings grouped under the CN key (every tunable carries a cn_ prefix
# or relates to importance sampling / KL targets).
# NOTE(review): per-key meanings are inferred from names only - confirm
# against the consuming code.
CN:
  # Same value as the top-level learning_rate in this file.
  cn_learning_rate: 0.0005
  cn_reg_coeff: 0.6
  cn_batch_size: null
  cn_obs_select_name: null  # null means all
  cn_acs_select_name: null  # null means all
  no_importance_sampling: false
  per_step_importance_sampling: true
  clip_obs: 20
  cn_target_kl_old_new: 10
  cn_target_kl_new_old: 2.5
  train_gail_lambda: false
  # Keep this in plain decimal form: YAML 1.1 loaders such as PyYAML read
  # an exponent literal without a dot (e.g. 1e-5) as a string, not a float.
  cn_eps: 0.00001
  backward_iters: 5
  anneal_clr_by_factor: 0.9