# Tuned
MountainCarContinuous-v0:
  n_timesteps: !!float 50000
  policy: 'MlpPolicy'
  learning_rate: !!float 3e-4
  buffer_size: 50000
  batch_size: 512
  ent_coef: 0.1
  train_freq: 32
  gradient_steps: 32
  gamma: 0.9999
  tau: 0.01
  learning_starts: 0
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3.67, net_arch=[64, 64])"

Pendulum-v1:
  n_timesteps: 20000
  policy: 'MlpPolicy'
  learning_rate: !!float 1e-3

LunarLanderContinuous-v2:
  n_timesteps: !!float 5e5
  policy: 'MlpPolicy'
  learning_rate: lin_7.3e-4
  buffer_size: 1000000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.99
  tau: 0.01
  train_freq: 1
  gradient_steps: 1
  learning_starts: 10000
  policy_kwargs: "dict(net_arch=[400, 300])"

BipedalWalker-v3:
  n_timesteps: !!float 5e5
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# Almost tuned
# History wrapper of size 2 for better performances
BipedalWalkerHardcore-v3:
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  learning_rate: lin_7.3e-4
  buffer_size: 1000000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.99
  tau: 0.01
  train_freq: 1
  gradient_steps: 1
  learning_starts: 10000
  policy_kwargs: "dict(net_arch=[400, 300])"

# === Bullet envs ===

# Tuned
HalfCheetahBulletEnv-v0: &pybullet-defaults
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 8
  gradient_steps: 8
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# Tuned
AntBulletEnv-v0:
  <<: *pybullet-defaults

# Tuned
HopperBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: lin_7.3e-4
  top_quantiles_to_drop_per_net: 5

# Tuned
Walker2DBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: lin_7.3e-4

ReacherBulletEnv-v0:
  <<: *pybullet-defaults
  n_timesteps: !!float 3e5

# Almost tuned
HumanoidBulletEnv-v0:
  n_timesteps: !!float 1e7
  policy: 'MlpPolicy'
  learning_rate: lin_7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 8
  gradient_steps: 8
  learning_starts: 10000
  top_quantiles_to_drop_per_net: 5
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

InvertedDoublePendulumBulletEnv-v0:
  <<: *pybullet-defaults
  n_timesteps: !!float 5e5

InvertedPendulumSwingupBulletEnv-v0:
  <<: *pybullet-defaults
  n_timesteps: !!float 3e5

MinitaurBulletEnv-v0:
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 3e-4
  buffer_size: 100000
  batch_size: 256
  ent_coef: 'auto'
  train_freq: 1
  gradient_steps: 1
  learning_starts: 10000
  # top_quantiles_to_drop_per_net: 5

# === Mujoco Envs ===

HalfCheetah-v3: &mujoco-defaults
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_starts: 10000

Ant-v3:
  <<: *mujoco-defaults

Hopper-v3:
  <<: *mujoco-defaults
  top_quantiles_to_drop_per_net: 5

Walker2d-v3:
  <<: *mujoco-defaults

Humanoid-v3:
  <<: *mujoco-defaults
  n_timesteps: !!float 2e6

Swimmer-v3:
  <<: *mujoco-defaults
  gamma: 0.9999

# === HER Robotics GoalEnvs ===
FetchReach-v1:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 20000
  policy: 'MultiInputPolicy'
  buffer_size: 1000000
  ent_coef: 'auto'
  batch_size: 256
  gamma: 0.95
  learning_rate: 0.001
  learning_starts: 1000
  normalize: True
  replay_buffer_class: HerReplayBuffer
  replay_buffer_kwargs: "dict(
    online_sampling=True,
    goal_selection_strategy='future',
    n_sampled_goal=4
  )"
  policy_kwargs: "dict(net_arch=[64, 64], n_critics=1)"

FetchPush-v1: &her-defaults
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MultiInputPolicy'
  buffer_size: 1000000
  batch_size: 2048
  gamma: 0.95
  learning_rate: !!float 1e-3
  tau: 0.05
  replay_buffer_class: HerReplayBuffer
  replay_buffer_kwargs: "dict(
    online_sampling=True,
    goal_selection_strategy='future',
    n_sampled_goal=4,
  )"
  policy_kwargs: "dict(net_arch=[512, 512, 512], n_critics=2)"

FetchSlide-v1:
  <<: *her-defaults
  n_timesteps: !!float 3e6

FetchPickAndPlace-v1:
  <<: *her-defaults

PandaReach-v1:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 20000
  policy: 'MultiInputPolicy'
  buffer_size: 1000000
  ent_coef: 'auto'
  batch_size: 256
  gamma: 0.95
  learning_rate: 0.001
  learning_starts: 1000
  normalize: True
  replay_buffer_class: HerReplayBuffer
  replay_buffer_kwargs: "dict(
    online_sampling=True,
    goal_selection_strategy='future',
    n_sampled_goal=4
  )"
  policy_kwargs: "dict(net_arch=[64, 64], n_critics=1)"

PandaPush-v1:
  <<: *her-defaults

PandaSlide-v1:
  <<: *her-defaults
  n_timesteps: !!float 3e6

PandaPickAndPlace-v1:
  <<: *her-defaults

PandaStack-v1:
  <<: *her-defaults

parking-v0:
  <<: *her-defaults
  n_timesteps: !!float 1e5
  buffer_size: 1000000
  batch_size: 512
  gamma: 0.98
  learning_rate: !!float 1.5e-3
  tau: 0.005
  replay_buffer_class: HerReplayBuffer
  replay_buffer_kwargs: "dict(
    online_sampling=False,
    goal_selection_strategy='episode',
    n_sampled_goal=4,
    max_episode_length=100
  )"

A1Walking-v0:
  <<: *pybullet-defaults
