atari:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'CnnPolicy'
  n_envs: 8
  n_steps: 128
  n_epochs: 4
  batch_size: 256
  n_timesteps: !!float 1e7
  learning_rate: lin_2.5e-4
  clip_range: lin_0.1
  vf_coef: 0.5
  ent_coef: 0.01

# Tuned
Pendulum-v1:
  n_envs: 4
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 1024
  gae_lambda: 0.95
  gamma: 0.9
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: !!float 1e-3
  clip_range: 0.2
  use_sde: True
  sde_sample_freq: 4

# Tuned
CartPole-v1:
  n_envs: 8
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 32
  batch_size: 256
  gae_lambda: 0.8
  gamma: 0.98
  n_epochs: 20
  ent_coef: 0.0
  learning_rate: lin_0.001
  clip_range: lin_0.2

MountainCar-v0:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 16
  gae_lambda: 0.98
  gamma: 0.99
  n_epochs: 4
  ent_coef: 0.0

# Tuned
MountainCarContinuous-v0:
  normalize: "dict(norm_obs=True, norm_reward=False)"
  n_envs: 1
  n_timesteps: !!float 20000
  policy: 'MlpPolicy'
  batch_size: 256
  n_steps: 8
  gamma: 0.9999
  learning_rate: !!float 7.77e-05
  ent_coef: 0.00429
  clip_range: 0.1
  n_epochs: 10
  gae_lambda: 0.9
  max_grad_norm: 5
  vf_coef: 0.19
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3.29, ortho_init=False)"

Acrobot-v1:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 256
  gae_lambda: 0.94
  gamma: 0.99
  n_epochs: 4
  ent_coef: 0.0

BipedalWalker-v3:
  normalize: true
  n_envs: 32
  n_timesteps: !!float 5e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.999
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: !!float 3e-4
  clip_range: 0.18

BipedalWalkerHardcore-v3:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 10e7
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.001
  learning_rate: lin_2.5e-4
  clip_range: lin_0.2

LunarLander-v2:
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 1024
  batch_size: 64
  gae_lambda: 0.98
  gamma: 0.999
  n_epochs: 4
  ent_coef: 0.01

LunarLanderContinuous-v2:
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 1024
  batch_size: 64
  gae_lambda: 0.98
  gamma: 0.999
  n_epochs: 4
  ent_coef: 0.01

# Tuned
HalfCheetahBulletEnv-v0: &pybullet-defaults
  normalize: true
  n_envs: 16
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  batch_size: 128
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.9
  n_epochs: 20
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: !!float 3e-5
  use_sde: True
  clip_range: 0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

# Tuned
AntBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: !!float 3e-5
  policy_kwargs: "dict(log_std_init=-1,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

# Tuned
Walker2DBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: !!float 3e-5
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

# Tuned
HopperBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: !!float 3e-5
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

# Tuned
ReacherBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  batch_size: 64
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.9
  n_epochs: 20
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: !!float 3e-5
  use_sde: True
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2.7,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

MinitaurBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

MinitaurBulletDuckEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

# To be tuned
HumanoidBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 1e7
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

InvertedDoublePendulumBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

InvertedPendulumSwingupBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

# Following https://github.com/lcswillems/rl-starter-files
# requires --gym-packages gym_minigrid
MiniGrid-DoorKey-5x5-v0:
  # Dict Observations are now supported
  # env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
  normalize: true
  n_envs: 8 # number of environment copies running in parallel
  n_timesteps: !!float 1e5
  policy: MlpPolicy
  n_steps: 128 # batch size is n_steps * n_env
  batch_size: 64 # Number of training minibatches per update
  gae_lambda: 0.95 #  Factor for trade-off of bias vs variance for Generalized Advantage Estimator
  gamma: 0.99
  n_epochs: 10 #  Number of epoch when optimizing the surrogate
  ent_coef: 0.0 # Entropy coefficient for the loss caculation
  learning_rate: 2.5e-4 # The learning rate, it can be a function
  clip_range: 0.2 # Clipping parameter, it can be a function

# requires --gym-packages gym_minigrid
MiniGrid-FourRooms-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 4e6
  policy: 'MlpPolicy'
  n_steps: 512
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

MiniGrid-Dynamic-Obstacles-6x6-v0:
  # Dict Observations are now supported
  # env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
  env_wrapper:
    - minigrid.wrappers.ImgObsWrapper
    - src.utils.wrappers.TransposedObservation
    - src.utils.wrappers.LavaNegRewWrapper
  normalize: true
  n_envs: 8 # number of environment copies running in parallel
  n_timesteps: !!float 8e5
  policy: src.utils.policies_forked.ActorCriticCnnPolicyGrid
  n_steps: 256 # batch size is n_steps * n_env
  batch_size: 256 # Number of training minibatches per update
  gae_lambda: 0.95 #  Factor for trade-off of bias vs variance for Generalized Advantage Estimator
  gamma: 0.99
  n_epochs: 10 #  Number of epoch when optimizing the surrogate
  ent_coef: 0.1 # Entropy coefficient for the loss caculation
  learning_rate: 0.001 # The learning rate, it can be a function
  clip_range: 0.2 # Clipping parameter, it can be a function
  max_grad_norm: 0.1
  vf_coef: 0.5
  target_kl: 0.01
  policy_kwargs: "dict(ortho_init=True)"

MiniGrid-LavaGapS6-v0:
  # Dict Observations are now supported
  env_wrapper:
    - minigrid.wrappers.ImgObsWrapper
    - src.utils.wrappers.TransposedObservation
    - src.utils.wrappers.RescaleWrapper
  normalize: true
  n_envs: 8 # number of environment copies running in parallel
  n_timesteps: !!float 6e5
  policy: src.utils.policies_forked.ActorCriticCnnPolicyGrid
  n_steps: 256 # batch size is n_steps * n_env
  batch_size: 256 # Number of training minibatches per update
  gae_lambda: 0.95 #  Factor for trade-off of bias vs variance for Generalized Advantage Estimator
  gamma: 0.95
  n_epochs: 10 #  Number of epoch when optimizing the surrogate
  clip_range: 0.2 # Clipping parameter, it can be a function
  ent_coef: 0
  learning_rate: 0.001 # The learning rate, it can be a function
  max_grad_norm: 1
  vf_coef: 0.1
  target_kl: 0.01

MiniGrid-LavaCrossingS9N1-v0:
  # Dict Observations are now supported
  # env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
  env_wrapper:
    - minigrid.wrappers.ImgObsWrapper
    - src.utils.wrappers.TransposedObservation
    - src.utils.wrappers.RescaleWrapper
  normalize: true
  n_envs: 8 # number of environment copies running in parallel
  n_timesteps: !!float 2e6
  policy: src.utils.policies_forked.ActorCriticCnnPolicyGrid
  n_steps: 512 # batch size is n_steps * n_env
  batch_size: 1024 # Number of training minibatches per update
  gae_lambda: 0.95 #  Factor for trade-off of bias vs variance for Generalized Advantage Estimator
  gamma: 0.97
  vf_coef: 0.1
  n_epochs: 10 #  Number of epoch when optimizing the surrogate
  ent_coef: 0.01 # Entropy coefficient for the loss caculation
  learning_rate: 0.001 # The learning rate, it can be a function
  clip_range: 0.2 # Clipping parameter, it can be a function
  max_grad_norm: 1
  target_kl: 0.01

CarRacing-v0:
  env_wrapper:
    - rl_zoo3.wrappers.FrameSkip:
        skip: 2
    - gym.wrappers.resize_observation.ResizeObservation:
        shape: 64
    - gym.wrappers.gray_scale_observation.GrayScaleObservation:
        keep_dim: true
  frame_stack: 2
  normalize: "{'norm_obs': False, 'norm_reward': True}"
  n_envs: 8
  n_timesteps: !!float 4e6
  policy: 'CnnPolicy'
  batch_size: 128
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.95
  n_epochs: 10
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: lin_1e-4
  use_sde: True
  clip_range: 0.2
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.GELU,
                       net_arch=dict(pi=[256], vf=[256]),
                       )"


# === Mujoco Envs ===
# HalfCheetah-v3: &mujoco-defaults
#   normalize: true
#   n_timesteps: !!float 1e6
#   policy: 'MlpPolicy'

Ant-v3: &mujoco-defaults
  normalize: true
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'

# Hopper-v3:
#   <<: *mujoco-defaults
#
# Walker2d-v3:
#   <<: *mujoco-defaults
#
# Humanoid-v3:
#   <<: *mujoco-defaults
#   n_timesteps: !!float 2e6
#
# tuned
Swimmer-v3:
  <<: *mujoco-defaults
  gamma: 0.9999
  n_envs: 4
  n_steps: 1024
  batch_size: 256
  learning_rate: !!float 6e-4
  gae_lambda: 0.98

# Tuned
# 10 mujoco envs

HalfCheetah-v3:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 64
  n_steps: 512
  gamma: 0.98
  learning_rate: 2.0633e-05
  ent_coef: 0.000401762
  clip_range: 0.1
  n_epochs: 20
  gae_lambda: 0.92
  max_grad_norm: 0.8
  vf_coef: 0.58096
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=dict(pi=[256, 256], vf=[256, 256])
                  )"

# Ant-v3:
#   normalize: true
#   n_envs: 1
#   policy: 'MlpPolicy'
#   n_timesteps: !!float 1e7
#   batch_size: 32
#   n_steps: 512
#   gamma: 0.98
#   learning_rate: 1.90609e-05
#   ent_coef: 4.9646e-07
#   clip_range: 0.1
#   n_epochs: 10
#   gae_lambda: 0.8
#   max_grad_norm: 0.6
#   vf_coef: 0.677239

Hopper-v3:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.999
  learning_rate: 9.80828e-05
  ent_coef: 0.00229519
  clip_range: 0.2
  n_epochs: 5
  gae_lambda: 0.99
  max_grad_norm: 0.7
  vf_coef: 0.835671
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=dict(pi=[256, 256], vf=[256, 256])
                  )"

HumanoidStandup-v3:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e7
  batch_size: 32
  n_steps: 512
  gamma: 0.99
  learning_rate: 2.55673e-05
  ent_coef: 3.62109e-06
  clip_range: 0.3
  n_epochs: 20
  gae_lambda: 0.9
  max_grad_norm: 0.7
  vf_coef: 0.430793
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=dict(pi=[256, 256], vf=[256, 256])
                  )"

Humanoid-v3:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e7
  batch_size: 256
  n_steps: 512
  gamma: 0.95
  learning_rate: 3.56987e-05
  ent_coef: 0.00238306
  clip_range: 0.3
  n_epochs: 5
  gae_lambda: 0.9
  max_grad_norm: 2
  vf_coef: 0.431892
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=dict(pi=[256, 256], vf=[256, 256])
                  )"

InvertedDoublePendulum-v3:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 512
  n_steps: 128
  gamma: 0.98
  learning_rate: 0.000155454
  ent_coef: 1.05057e-06
  clip_range: 0.4
  n_epochs: 10
  gae_lambda: 0.8
  max_grad_norm: 0.5
  vf_coef: 0.695929

InvertedPendulum-v3:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 64
  n_steps: 32
  gamma: 0.999
  learning_rate: 0.000222425
  ent_coef: 1.37976e-07
  clip_range: 0.4
  n_epochs: 5
  gae_lambda: 0.9
  max_grad_norm: 0.3
  vf_coef: 0.19816

Reacher-v2:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.9
  learning_rate: 0.000104019
  ent_coef: 7.52585e-08
  clip_range: 0.3
  n_epochs: 5
  gae_lambda: 1.0
  max_grad_norm: 0.9
  vf_coef: 0.950368

Walker2d-v3:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.99
  learning_rate: 5.05041e-05
  ent_coef: 0.000585045
  clip_range: 0.1
  n_epochs: 20
  gae_lambda: 0.95
  max_grad_norm: 1
  vf_coef: 0.871923
