# === MuJoCo Envs ===

# Base settings for the inverted-pendulum entries. Anchored as
# `inverted-pendulum` so that other entries can pull these keys in
# through a `<<` merge.
InvertedPendulum-v4: &inverted-pendulum
  # Policy
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[256, 256])'
  # Training budget and optimisation
  n_timesteps: !!float 1e6
  learning_rate: !!float 1e-4
  gamma: 0.9999
  # Buffer and update schedule
  buffer_size: 200000
  learning_starts: 10000
  train_freq: 256
  gradient_steps: -1
  # Noise settings
  noise_type: 'normal'
  noise_std: 0.1

# Takes every setting from the `inverted-pendulum` anchor; no overrides.
InvertedPendulumWide-v4:
  <<: *inverted-pendulum

# Starts from the `inverted-pendulum` anchor. `gamma` is additionally
# pinned here as an explicit key; it matches the value the anchor provides.
InvertedDoublePendulum-v4:
  <<: *inverted-pendulum
  gamma: 0.9999

# Takes every setting from the `inverted-pendulum` anchor; no overrides.
InvertedDoublePendulumWide-v4:
  <<: *inverted-pendulum

# Shared baseline settings, anchored as `mujoco-defaults` so that other
# entries can merge them in with `<<` and override individual keys.
HalfCheetah-v4: &mujoco-defaults
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  learning_starts: 10000
  # Noise settings
  noise_std: 0.1
  noise_type: 'normal'

# Uses the `mujoco-defaults` settings unchanged.
Ant-v4:
  <<: *mujoco-defaults

# Uses the `mujoco-defaults` settings unchanged.
Hopper-v4:
  <<: *mujoco-defaults

# Uses the `mujoco-defaults` settings unchanged.
Walker2d-v4:
  <<: *mujoco-defaults

# `mujoco-defaults` plus the per-environment overrides listed below.
Humanoid-v4:
  <<: *mujoco-defaults
  # Pinned explicitly; same value as the merged default.
  n_timesteps: !!float 1e6
  # SAC Hyperparams
  # NOTE(review): presumably carried over from a SAC configuration --
  # confirm the origin.
  batch_size: 256
  learning_rate: !!float 3e-4
  train_freq: 1
  gradient_steps: 1

# `mujoco-defaults` plus the per-environment overrides listed below.
HumanoidStandup-v4:
  <<: *mujoco-defaults
  # Pinned explicitly; same value as the merged default.
  n_timesteps: !!float 1e6
  # SAC Hyperparams
  # NOTE(review): presumably carried over from a SAC configuration --
  # confirm the origin.
  batch_size: 256
  learning_rate: !!float 3e-4
  train_freq: 1
  gradient_steps: 1

# `mujoco-defaults` with additional keys for discounting, learning rate,
# batch size and update schedule.
Swimmer-v4:
  <<: *mujoco-defaults
  learning_rate: !!float 1e-4
  gamma: 0.9999
  batch_size: 256
  # One gradient step per training step
  train_freq: 1
  gradient_steps: 1

# Fully specified entry (no merge). Anchored as `predator_prey` so it can
# be reused through an alias or `<<` merge.
Goal2D-v0: &predator_prey
  # Policy
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[64,64])'
  # Training budget and optimisation
  n_timesteps: !!float 1e6
  learning_rate: !!float 1e-3
  gamma: 0.99
  batch_size: 128
  # Buffer and update schedule
  buffer_size: 1000000
  learning_starts: 10000
  train_freq: 64
  gradient_steps: 32
  # Noise settings
  noise_type: 'normal'
  noise_std: 0.1

#####################################################
# Fully specified entry (no merge); note that no `tau` is set here.
PandaReach-v3:
  # Policy
  policy: 'MlpPolicy'
  # Disabled alternative:
  # policy: 'MultiInputPolicy'
  policy_kwargs: 'dict(n_critics=1, net_arch=[256,256,256])'
  # Training budget and optimisation
  n_timesteps: !!float 100e3
  learning_rate: 0.001
  gamma: 0.95
  batch_size: 256
  # Buffer
  buffer_size: 1000000
  learning_starts: 1000
  # Disabled HerReplayBuffer settings, kept for reference:
  # replay_buffer_class: HerReplayBuffer
  # replay_buffer_kwargs: "dict(
  #   online_sampling=True,
  #   goal_selection_strategy='future',
  #   n_sampled_goal=4
  # )"
  # Noise settings
  noise_type: 'normal'
  noise_std: 0.2

# Fully specified entry (no merge).
PandaPush-v3:
  # Policy
  policy: 'MlpPolicy'
  # Disabled alternative:
  # policy: 'MultiInputPolicy'
  policy_kwargs: 'dict(n_critics=1, net_arch=[256,256,256])'
  # Training budget and optimisation
  n_timesteps: !!float 1e6
  learning_rate: 0.001
  gamma: 0.95
  tau: 0.05
  batch_size: 256
  # Buffer
  buffer_size: 1000000
  learning_starts: 1000
  # Disabled HerReplayBuffer settings, kept for reference:
  # replay_buffer_class: HerReplayBuffer
  # replay_buffer_kwargs: "dict(
  #   online_sampling=True,
  #   goal_selection_strategy='future',
  #   n_sampled_goal=4
  # )"
  # Noise settings
  noise_type: 'normal'
  noise_std: 0.2

# Fully specified entry (no merge).
PandaSlide-v3:
  # Policy
  policy: 'MlpPolicy'
  # Disabled alternative:
  # policy: 'MultiInputPolicy'
  policy_kwargs: 'dict(n_critics=1, net_arch=[256,256,256])'
  # Training budget and optimisation
  n_timesteps: !!float 1e6
  learning_rate: 0.001
  gamma: 0.95
  tau: 0.05
  batch_size: 256
  # Buffer
  buffer_size: 1000000
  learning_starts: 1000
  # Disabled HerReplayBuffer settings, kept for reference:
  # replay_buffer_class: HerReplayBuffer
  # replay_buffer_kwargs: "dict(
  #   online_sampling=True,
  #   goal_selection_strategy='future',
  #   n_sampled_goal=4
  # )"
  # Noise settings
  noise_type: 'normal'
  noise_std: 0.2

# Fully specified entry (no merge).
PandaPickAndPlace-v3:
  # Policy
  policy: 'MlpPolicy'
  # Disabled alternative:
  # policy: 'MultiInputPolicy'
  policy_kwargs: 'dict(n_critics=1, net_arch=[256,256,256])'
  # Training budget and optimisation
  n_timesteps: !!float 1e6
  learning_rate: 0.001
  gamma: 0.95
  tau: 0.05
  batch_size: 256
  # Buffer
  buffer_size: 1000000
  learning_starts: 1000
  # Disabled HerReplayBuffer settings, kept for reference:
  # replay_buffer_class: HerReplayBuffer
  # replay_buffer_kwargs: "dict(
  #   online_sampling=True,
  #   goal_selection_strategy='future',
  #   n_sampled_goal=4
  # )"
  # Noise settings
  noise_type: 'normal'
  noise_std: 0.2

# Fully specified entry (no merge).
PandaFlip-v3:
  # Policy
  policy: 'MlpPolicy'
  # Disabled alternative:
  # policy: 'MultiInputPolicy'
  policy_kwargs: 'dict(n_critics=1, net_arch=[256,256,256])'
  # Training budget and optimisation
  n_timesteps: !!float 1e6
  learning_rate: 0.001
  gamma: 0.95
  tau: 0.05
  batch_size: 256
  # Buffer
  buffer_size: 1000000
  learning_starts: 1000
  # Disabled HerReplayBuffer settings, kept for reference:
  # replay_buffer_class: HerReplayBuffer
  # replay_buffer_kwargs: "dict(
  #   online_sampling=True,
  #   goal_selection_strategy='future',
  #   n_sampled_goal=4
  # )"
  # Noise settings
  noise_type: 'normal'
  noise_std: 0.2

# Fully specified entry (no merge).
PandaStack-v3:
  # Policy
  policy: 'MlpPolicy'
  # Disabled alternative:
  # policy: 'MultiInputPolicy'
  policy_kwargs: 'dict(n_critics=1, net_arch=[256,256,256])'
  # Training budget and optimisation
  n_timesteps: !!float 1e6
  learning_rate: 0.001
  gamma: 0.95
  tau: 0.05
  batch_size: 256
  # Buffer
  buffer_size: 1000000
  learning_starts: 1000
  # Disabled HerReplayBuffer settings, kept for reference:
  # replay_buffer_class: HerReplayBuffer
  # replay_buffer_kwargs: "dict(
  #   online_sampling=True,
  #   goal_selection_strategy='future',
  #   n_sampled_goal=4
  # )"
  # Noise settings
  noise_type: 'normal'
  noise_std: 0.2