# PPO hyperparameters (single `default` variant visible here).
# NOTE(review): the key names match the Stable-Baselines3 PPO constructor and
# its policy kwargs; presumably the loader forwards them there. Confirm.
ppo:
  default:
    # --- rollout / optimisation ---
    learning_rate: 1.0e-05
    # n_steps is an exact multiple of batch_size (3200 = 100 x 32).
    n_steps: 3200
    batch_size: 32
    n_epochs: 20

    # --- discounting / advantage estimation ---
    gamma: 0.99
    gae_lambda: 0.95

    # --- loss terms and gradient clipping ---
    clip_range: 0.2
    clip_range_vf: 0.2
    ent_coef: 0.05
    vf_coef: 0.5
    max_grad_norm: 0.5

    # --- network ---
    # pi and vf each list three entries of 256; presumably the hidden-layer
    # widths of the policy and value networks respectively (TODO confirm).
    net_arch:
      pi: [256, 256, 256]
      vf: [256, 256, 256]
    # NOTE(review): the leading underscore suggests the loader resolves this
    # name to a class instead of passing the string through; confirm.
    _activation_fn: ReLU
    ortho_init: true

    # --- policy class: exactly one policy_type line is active ---
    # policy_type: MultiInputPolicy
    policy_type: MlpPolicy
    # policy_type: CustomPPOPolicy

# ppo_lrs: the famous "ppo0".
# PPO variant whose learning rate is described by a schedule block instead of
# a single `learning_rate` value.
ppo_lrs:
  default:
    # --- learning-rate schedule ---
    # Linear schedule between initial_value and final_value.
    # NOTE(review): the underscore-prefixed key is presumably expanded by the
    # loader into a schedule object or callable; confirm how it is consumed.
    _learning_rate_schedule:
      type: linear
      initial_value: 0.0003
      final_value: 0.00005

    # --- rollout / optimisation ---
    # n_steps is an exact multiple of batch_size (1600 = 50 x 32).
    n_steps: 1600
    batch_size: 32
    n_epochs: 10

    # --- discounting / advantage estimation ---
    gamma: 0.99
    gae_lambda: 0.95

    # --- loss terms and gradient clipping ---
    clip_range: 0.2
    clip_range_vf: 0.2
    ent_coef: 0.01
    vf_coef: 0.5
    max_grad_norm: 0.5

    # --- network ---
    # pi and vf each list two entries of 256; presumably the hidden-layer
    # widths of the policy and value networks respectively (TODO confirm).
    net_arch:
      pi: [256, 256]
      vf: [256, 256]
    # NOTE(review): the leading underscore suggests the loader resolves this
    # name to a class instead of passing the string through; confirm.
    _activation_fn: ReLU
    ortho_init: true
    policy_type: MlpPolicy
  
# DQN hyperparameters plus goal-relabelling settings.
# NOTE(review): judging by the stanza name and the goal_selection_strategy /
# n_sampled_goal keys, this is presumably DQN with Hindsight Experience
# Replay; confirm how the loader splits model and replay-buffer arguments.
dqn_her:
  default:
    # --- optimisation / replay buffer ---
    learning_rate: 1.0e-04
    buffer_size: 1000000
    learning_starts: 10000
    batch_size: 64

    # --- target network / discounting ---
    tau: 0.005
    gamma: 0.99

    # --- update cadence ---
    train_freq: 1
    # NOTE(review): -1 is presumably a sentinel, not a literal count (in
    # Stable-Baselines3 it means "as many gradient steps as environment steps
    # collected"); confirm the loader passes it through unchanged.
    gradient_steps: -1
    target_update_interval: 100

    # --- epsilon-greedy exploration ---
    # Epsilon goes from exploration_initial_eps to exploration_final_eps;
    # exploration_fraction presumably sets how much of training that takes.
    exploration_fraction: 0.2
    exploration_initial_eps: 1.0
    exploration_final_eps: 0.2

    # --- goal relabelling ---
    goal_selection_strategy: future
    n_sampled_goal: 12

    # --- network ---
    net_arch: [256, 256]

# DQN hyperparameters.
# NOTE(review): the key names match the Stable-Baselines3 DQN constructor;
# presumably the loader forwards them there. Confirm.
dqn:
  default:
    # --- optimisation / replay buffer ---
    learning_rate: 3.0e-04
    buffer_size: 100000
    learning_starts: 5000
    batch_size: 32

    # --- target network / discounting ---
    tau: 0.005
    gamma: 0.99

    # --- update cadence ---
    train_freq: 1
    gradient_steps: 16
    target_update_interval: 100

    # --- epsilon-greedy exploration ---
    # Epsilon goes from exploration_initial_eps to exploration_final_eps;
    # exploration_fraction presumably sets how much of training that takes.
    exploration_fraction: 0.01
    exploration_initial_eps: 1.0
    exploration_final_eps: 0.1

    # --- network ---
    net_arch: [256, 256, 256]

    # --- policy class: exactly one policy_type line is active ---
    # policy_type: MlpPolicy
    policy_type: MultiInputPolicy