# Tuned
MountainCarContinuous-v0:
  n_timesteps: 300000
  policy: 'MlpPolicy'
  noise_type: 'ornstein-uhlenbeck'
  noise_std: 0.5

Pendulum-v1:
  n_timesteps: 20000
  policy: 'MlpPolicy'
  # gamma: 0.98
  buffer_size: 200000
  learning_starts: 10000
  noise_type: 'normal'
  noise_std: 0.1
  gradient_steps: -1
  train_freq: [1, "episode"]
  learning_rate: !!float 1e-3
  # policy_kwargs: "dict(net_arch=[400, 300])"

LunarLanderContinuous-v2:
  n_timesteps: !!float 3e5
  policy: 'MlpPolicy'
  gamma: 0.98
  buffer_size: 200000
  learning_starts: 10000
  noise_type: 'normal'
  noise_std: 0.1
  gradient_steps: -1
  train_freq: [1, "episode"]
  learning_rate: !!float 1e-3
  policy_kwargs: "dict(net_arch=[400, 300])"

BipedalWalker-v3:
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  gamma: 0.98
  buffer_size: 200000
  learning_starts: 10000
  noise_type: 'normal'
  noise_std: 0.1
  gradient_steps: -1
  train_freq: [1, "episode"]
  learning_rate: !!float 1e-3
  policy_kwargs: "dict(net_arch=[400, 300])"

# To be tuned
BipedalWalkerHardcore-v3:
  n_timesteps: !!float 1e7
  policy: 'MlpPolicy'
  gamma: 0.99
  buffer_size: 1000000
  learning_starts: 10000
  noise_type: 'normal'
  noise_std: 0.1
  batch_size: 256
  train_freq: 1
  learning_rate: lin_7e-4
  policy_kwargs: "dict(net_arch=[400, 300])"

# Tuned
HalfCheetahBulletEnv-v0: &pybullet-defaults
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  gamma: 0.98
  buffer_size: 200000
  learning_starts: 10000
  noise_type: 'normal'
  noise_std: 0.1
  gradient_steps: -1
  train_freq: [1, "episode"]
  learning_rate: !!float 1e-3
  policy_kwargs: "dict(net_arch=[400, 300])"

AntBulletEnv-v0:
  <<: *pybullet-defaults

HopperBulletEnv-v0:
  <<: *pybullet-defaults

Walker2DBulletEnv-v0:
  <<: *pybullet-defaults


# TO BE tested
HumanoidBulletEnv-v0:
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  gamma: 0.98
  buffer_size: 200000
  learning_starts: 10000
  noise_type: 'normal'
  noise_std: 0.1
  train_freq: 1
  learning_rate: !!float 1e-3
  policy_kwargs: "dict(net_arch=[400, 300])"

# Tuned
ReacherBulletEnv-v0:
  <<: *pybullet-defaults
  n_timesteps: !!float 3e5

# Tuned
InvertedDoublePendulumBulletEnv-v0:
  <<: *pybullet-defaults

# Tuned
InvertedPendulumSwingupBulletEnv-v0:
  <<: *pybullet-defaults
  n_timesteps: !!float 3e5


MinitaurBulletEnv-v0:
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  gamma: 0.99
  buffer_size: 1000000
  noise_type: 'normal'
  noise_std: 0.1
  learning_starts: 10000
  batch_size: 100
  learning_rate: !!float 1e-3
  train_freq: 1
  gradient_steps: 1
  policy_kwargs: "dict(net_arch=[400, 300])"

# === Mujoco Envs ===

HalfCheetah-v3: &mujoco-defaults
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  learning_starts: 10000
  noise_type: 'normal'
  noise_std: 0.1

Ant-v3:
  <<: *mujoco-defaults

Hopper-v3:
  <<: *mujoco-defaults
  # SAC Hyperparams
  n_timesteps: !!float 2e6
  train_freq: 1
  gradient_steps: 1
  learning_rate: !!float 3e-4
  batch_size: 256

Walker2d-v3:
  <<: *mujoco-defaults

Humanoid-v3:
  <<: *mujoco-defaults
  n_timesteps: !!float 2e6
  # SAC Hyperparams
  train_freq: 1
  gradient_steps: 1
  learning_rate: !!float 3e-4
  batch_size: 256

# Tuned
Swimmer-v3:
  <<: *mujoco-defaults
  gamma: 0.9999
  train_freq: 1
  gradient_steps: 1

FetchPush-v1: &her-defaults
  n_timesteps: !!float 1e6
  policy: 'MultiInputPolicy'
  buffer_size: 1000000
  batch_size: 2048
  gamma: 0.95
  learning_rate: !!float 1e-3
  noise_type: 'normal'
  noise_std: 0.1
  replay_buffer_class: HerReplayBuffer
  replay_buffer_kwargs: "dict(
    online_sampling=True,
    goal_selection_strategy='future',
    n_sampled_goal=4
  )"
  policy_kwargs: "dict(net_arch=[512, 512, 512])"

FetchSlide-v1:
  n_timesteps: !!float 1e6
  policy: 'MultiInputPolicy'
  buffer_size: 1000000
  batch_size: 2048
  gamma: 0.95
  learning_rate: !!float 1e-3
  noise_type: 'normal'
  noise_std: 0.1
  replay_buffer_class: HerReplayBuffer
  replay_buffer_kwargs: "dict(
    online_sampling=True,
    goal_selection_strategy='future',
    n_sampled_goal=4
  )"
  policy_kwargs: "dict(net_arch=[512, 512, 512])"

FetchPickAndPlace-v1:
  n_timesteps: !!float 1e6
  policy: 'MultiInputPolicy'
  buffer_size: 1000000
  batch_size: 1024
  gamma: 0.95
  learning_rate: !!float 1e-3
  noise_type: 'normal'
  noise_std: 0.1
  replay_buffer_class: HerReplayBuffer
  replay_buffer_kwargs: "dict(
    online_sampling=True,
    goal_selection_strategy='future',
    n_sampled_goal=4
  )"
  policy_kwargs: "dict(net_arch=[512, 512, 512])"
