# Pendulum-v1 settings. Both schedules below are flat: the temperature and
# the exploration epsilon each use the same value for `initial` and `final`.
Pendulum-v1:
  policy: 'MlpPolicy'
  n_timesteps: 50000  # 20000
  # gamma: 0.98
  # policy_kwargs: "dict(net_arch=[400, 300])"

  # Replay buffer and update cadence
  buffer_size: 200000
  learning_starts: 10000
  train_freq: [1, 'episode']
  gradient_steps: -1
  # The explicit !!float tag forces a float; exponent notation without a
  # decimal point is not resolved as a float by every YAML parser.
  learning_rate: !!float 1e-3

  # Action noise
  noise_type: 'normal'
  noise_std: 0.1

  # Actor / critic counts
  n_actors: 4
  n_critics: 2

  # Temperature schedule (initial == final)
  temperature_initial: 0.5
  temperature_final: 0.5
  temperature_fraction: 0.3

  # Exploration epsilon schedule (initial == final)
  exploration_initial_eps: 0.1
  exploration_final_eps: 0.1
  exploration_fraction: 0.3

# LunarLanderContinuous-v2 settings. The temperature goes from 0.1 to 0.01,
# while the exploration epsilon has the same initial and final value (0.03).
LunarLanderContinuous-v2:
  policy: 'MlpPolicy'
  policy_kwargs: "dict(net_arch=[400, 300])"
  # The explicit !!float tag forces a float for exponent notation.
  n_timesteps: !!float 3e5
  gamma: 0.98

  # Replay buffer and update cadence
  buffer_size: 200000
  learning_starts: 10000
  train_freq: [1, 'episode']
  gradient_steps: -1
  learning_rate: !!float 1e-3

  # Action noise
  noise_type: 'normal'
  noise_std: 0.1

  # Actor / critic counts
  n_actors: 4
  n_critics: 2

  # Temperature schedule
  temperature_initial: 0.1
  temperature_final: 0.01
  temperature_fraction: 0.3

  # Exploration epsilon schedule (initial == final)
  exploration_initial_eps: 0.03
  exploration_final_eps: 0.03
  exploration_fraction: 0.3

# === MuJoCo Envs (v3) ===

# HalfCheetah-v3 settings, anchored as `mujoco-defaults`. The other MuJoCo
# entries in this file merge this mapping with `<<: *mujoco-defaults`, so any
# change here also changes every entry that does not override the same key.
HalfCheetah-v3: &mujoco-defaults
  policy: 'MlpPolicy'
  # The explicit !!float tag forces a float for exponent notation.
  n_timesteps: !!float 1e6
  learning_starts: 10000

  # Action noise
  noise_type: 'normal'
  noise_std: 0.1

  # Delayed policy update and target-noise settings
  policy_delay: 2
  target_policy_noise: 0.2
  target_noise_clip: 0.5

  # Actor / critic counts
  n_actors: 5
  n_critics: 2

  # Temperature schedule (initial == final)
  temperature_initial: 0.1
  temperature_final: 0.1
  temperature_fraction: 0.3

  # Exploration epsilon schedule
  exploration_initial_eps: 0.2
  exploration_final_eps: 0.02
  exploration_fraction: 0.5


# Hopper-v3:
#   <<: *mujoco-defaults
#   # SAC Hyperparams
#   train_freq: 1
#   gradient_steps: 1
#   learning_rate: !!float 3e-4
#   batch_size: 256
#   noise_std: 0.1
#   n_actors: 4
#   n_critics: 2
#   temperature_initial: 0.1 # 0.5
#   temperature_final: 0.1 # 0.5
#   temperature_fraction: 0.5
#   exploration_fraction: 0.5
#   exploration_initial_eps: 0.1
#   exploration_final_eps: 1.0

# Hopper-v3 settings: starts from `mujoco-defaults`; every key listed
# explicitly below takes precedence over the merged value.
Hopper-v3:
  <<: *mujoco-defaults

  # SAC Hyperparams
  train_freq: 1
  gradient_steps: 1
  # The explicit !!float tag forces a float for exponent notation.
  learning_rate: !!float 3e-4
  batch_size: 256

  # Action noise
  noise_std: 0.1

  # Actor / critic counts
  n_actors: 2
  n_critics: 2

  # Temperature schedule
  temperature_initial: 0.1
  temperature_final: 0.01
  temperature_fraction: 1.0

  # Exploration epsilon schedule (initial == final)
  exploration_initial_eps: 0.1
  exploration_final_eps: 0.1
  exploration_fraction: 1.0

# Walker2d-v3 settings: starts from `mujoco-defaults`; keys listed explicitly
# below take precedence over the merged values.
Walker2d-v3:
  <<: *mujoco-defaults
  # The explicit !!float tag forces a float for exponent notation.
  n_timesteps: !!float 1e6

  # Action noise
  noise_std: 0.1

  # Actor / critic counts
  n_actors: 3
  n_critics: 2

  # Temperature schedule (initial == final); trailing comments keep the
  # previously tried values.
  temperature_initial: 0.1
  temperature_final: 0.1  # 0.01
  temperature_fraction: 1.0  # 0.3

  # Exploration epsilon schedule
  exploration_initial_eps: 0.3
  exploration_final_eps: 0.03
  exploration_fraction: 0.5

# Walker2d-v3:
#   <<: *mujoco-defaults
#   n_timesteps: !!float 1e6
#   learning_starts: 10000
#   noise_std: 0.1
#   temperature_initial: 0.3 # 0.1
#   temperature_final: 0.1 # 0.01
#   temperature_fraction: 0.3
#   n_actors: 3
#   n_critics: 2
#   learning_rate: !!float 3e-4
#   exploration_fraction: 1.0
#   exploration_initial_eps: 0.67 # 0.03
#   exploration_final_eps: 0.03 # 0.03

# Ant-v3:
#   <<: *mujoco-defaults
#   noise_std: 0.1
#   temperature_initial: 0.5
#   temperature_final: 0.5
#   temperature_fraction: 1.0
#   n_actors: 3 
#   n_critics: 2
#   exploration_fraction: 0.5
#   exploration_initial_eps: 0.3
#   exploration_final_eps: 0.03

# Ant-v3 settings: starts from `mujoco-defaults`; keys listed explicitly
# below take precedence over the merged values.
Ant-v3:
  <<: *mujoco-defaults
  # The explicit !!float tag forces a float for exponent notation.
  learning_rate: !!float 3e-4

  # Action noise
  noise_std: 0.1

  # Actor / critic counts
  n_actors: 2
  n_critics: 2

  # Temperature schedule (initial == final)
  temperature_initial: 0.1
  temperature_final: 0.1
  temperature_fraction: 1.0

  # Exploration epsilon schedule (initial == final)
  exploration_initial_eps: 0.1
  exploration_final_eps: 0.1
  exploration_fraction: 1.0

# Humanoid-v3:
#   <<: *mujoco-defaults
#   n_timesteps: !!float 2e6
#   # SAC Hyperparams
#   train_freq: 1
#   gradient_steps: 1
#   learning_rate: !!float 3e-4
#   batch_size: 256
#   temperature_initial: 0.1
#   temperature_final: 0.01
#   temperature_fraction: 0.3
#   n_actors: 3
#   n_critics: 2
#   exploration_fraction: 0.3
#   exploration_initial_eps: 0.03
#   exploration_final_eps: 0.03

# Humanoid-v3:
#   <<: *mujoco-defaults
#   n_timesteps: !!float 2e6
#   # SAC Hyperparams
#   train_freq: 1
#   gradient_steps: 1
#   learning_rate: !!float 3e-4
#   batch_size: 256
#   temperature_initial: 0.1
#   temperature_final: 0.01
#   temperature_fraction: 0.3
#   n_actors: 3
#   n_critics: 2
#   exploration_fraction: 0.3
#   exploration_initial_eps: 0.03
#   exploration_final_eps: 0.03

# Humanoid-v3 settings: starts from `mujoco-defaults`; keys listed explicitly
# below take precedence over the merged values.
Humanoid-v3:
  <<: *mujoco-defaults
  # The explicit !!float tag forces a float for exponent notation.
  n_timesteps: !!float 2e6

  # SAC Hyperparams
  train_freq: 1
  gradient_steps: 1
  learning_rate: !!float 3e-4
  batch_size: 256

  # Actor / critic counts
  n_actors: 4  # 3
  n_critics: 2

  # Temperature schedule
  temperature_initial: 0.3
  temperature_final: 0.1
  temperature_fraction: 0.3

  # Exploration epsilon schedule
  exploration_initial_eps: 0.3
  exploration_final_eps: 0.1
  exploration_fraction: 0.3


# Swimmer-v3 settings: starts from `mujoco-defaults`; keys listed explicitly
# below take precedence over the merged values. This is the only active
# MuJoCo entry in the file that sets `gamma`.
Swimmer-v3:
  <<: *mujoco-defaults
  gamma: 0.9999

  # Update cadence
  train_freq: 1
  gradient_steps: 1

  # Actor / critic counts
  n_actors: 3
  n_critics: 2

  # Temperature schedule (initial == final)
  temperature_initial: 0.1
  temperature_final: 0.1
  temperature_fraction: 1.0

  # Exploration epsilon schedule
  exploration_initial_eps: 0.3
  exploration_final_eps: 0.03
  exploration_fraction: 0.5

# === MuJoCo Envs (v2) — entire section below is commented out ===

# HalfCheetah-v2: &mujoco-defaults
#   n_timesteps: !!float 1e6
#   policy: 'MlpPolicy'
#   learning_starts: 10000
#   noise_type: 'normal'
#   noise_std: 0.1
#   temperature_initial: 0.1
#   temperature_final: 0.01
#   temperature_fraction: 0.3
#   n_actors: 3 # 5
#   n_critics: 2
#   policy_delay: 2
#   target_policy_noise: 0.2
#   target_noise_clip: 0.5
#   exploration_fraction: 1.0
#   exploration_initial_eps: 0.02
#   exploration_final_eps: 0.02

# Hopper-v2:
#   <<: *mujoco-defaults
#   # SAC Hyperparams
#   train_freq: 1
#   gradient_steps: 1
#   learning_rate: !!float 3e-4
#   batch_size: 256
#   n_actors: 2
#   n_critics: 2
#   exploration_fraction: 1.0
#   exploration_initial_eps: 0.05
#   exploration_final_eps: 0.05
#   temperature_initial: 0.0
#   temperature_final: 0.0
#   temperature_fraction: 0.3 # 1.0

# Walker2d-v2:
#   <<: *mujoco-defaults
#   temperature_initial: 0.1
#   temperature_final: 0.01
#   temperature_fraction: 0.3
#   n_actors: 3
#   n_critics: 2
#   exploration_fraction: 0.3
#   exploration_initial_eps: 0.03
#   exploration_final_eps: 0.03

# Ant-v2:
#   <<: *mujoco-defaults
#   temperature_initial: 0.01
#   temperature_final: 0.01
#   temperature_fraction: 0.3
#   n_actors: 3
#   n_critics: 2
#   exploration_fraction: 0.3
#   exploration_initial_eps: 0.03
#   exploration_final_eps: 0.03

# Humanoid-v2:
#   <<: *mujoco-defaults
#   n_timesteps: !!float 2e6
#   # SAC Hyperparams
#   train_freq: 1
#   gradient_steps: 1
#   learning_rate: !!float 3e-4
#   batch_size: 256
#   temperature_initial: 0.1
#   temperature_final: 0.01
#   temperature_fraction: 0.3
#   n_actors: 3
#   n_critics: 2
#   exploration_fraction: 0.3
#   exploration_initial_eps: 0.03
#   exploration_final_eps: 0.03

# Swimmer-v2:
#   <<: *mujoco-defaults
#   gamma: 0.9999
#   train_freq: 1
#   gradient_steps: 1
#   temperature_initial: 0.1
#   temperature_final: 0.01
#   temperature_fraction: 0.3
#   n_actors: 3
#   n_critics: 2
#   exploration_fraction: 0.3
#   exploration_initial_eps: 0.03
#   exploration_final_eps: 0.03

# maze2d-v0 settings: a short run (2e4 steps) that is fully self-contained
# and does not merge any shared defaults.
maze2d-v0:
  policy: 'MlpPolicy'
  # The explicit !!float tag forces a float for exponent notation.
  n_timesteps: !!float 2e4
  learning_starts: 200

  # Update cadence
  train_freq: 1
  gradient_steps: 1
  learning_rate: !!float 3e-4

  # Action noise
  noise_type: 'normal'
  noise_std: 0.1

  # Actor / critic counts
  n_actors: 4
  n_critics: 2

  # Temperature schedule
  temperature_initial: 0.3
  temperature_final: 0.1
  temperature_fraction: 1.0

  # Exploration epsilon schedule
  exploration_initial_eps: 0.1
  exploration_final_eps: 0.01
  exploration_fraction: 1.0

# FetchPush-v1 settings; this mapping also defines the `her-defaults` anchor.
FetchPush-v1: &her-defaults
  policy: 'MultiInputPolicy'
  policy_kwargs: "dict(net_arch=[512, 512, 512])"
  # The explicit !!float tag forces a float for exponent notation.
  n_timesteps: !!float 1e6
  gamma: 0.95
  learning_rate: !!float 1e-3
  batch_size: 2048

  # Replay buffer. The kwargs value is a single double-quoted string that
  # spans several lines; YAML folds the line breaks into spaces.
  buffer_size: 1000000
  replay_buffer_class: HerReplayBuffer
  replay_buffer_kwargs: "dict(
    online_sampling=True,
    goal_selection_strategy='future',
    n_sampled_goal=4
  )"

  # Action noise
  noise_type: 'normal'
  noise_std: 0.1

  # Actor / critic counts
  n_actors: 4
  n_critics: 2

  # Temperature schedule
  temperature_initial: 0.3
  temperature_final: 0.1
  temperature_fraction: 1.0

  # Exploration epsilon schedule
  exploration_initial_eps: 0.1
  exploration_final_eps: 0.01
  exploration_fraction: 1.0

# FetchSlide-v1 settings: identical to FetchPush-v1 (merged through the
# `her-defaults` anchor defined on that entry) except for the actor count.
# Keys listed explicitly here take precedence over the merged values.
FetchSlide-v1:
  <<: *her-defaults
  n_actors: 8

# FetchPickAndPlace-v1 settings: identical to FetchPush-v1 (merged through
# the `her-defaults` anchor defined on that entry) except for the batch size
# and the actor count. Keys listed explicitly here take precedence over the
# merged values.
FetchPickAndPlace-v1:
  <<: *her-defaults
  batch_size: 1024
  n_actors: 8
