atari:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'CnnPolicy'
  policy_kwargs: "dict(
    features_extractor_class=impala_policy.ImpalaCNN
    )"
  n_envs: 8
  n_steps: 128
  n_epochs: 4
  batch_size: 256
  n_timesteps: !!float 2.5e7
  learning_rate: lin_2.5e-4
  clip_range: lin_0.1
  vf_coef: 0.5
  ent_coef: 0.01

# Tuned
Pendulum-v1:
  n_envs: 4
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 1024
  gae_lambda: 0.95
  gamma: 0.9
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: !!float 1e-3
  clip_range: 0.2
  use_sde: True
  sde_sample_freq: 4

# Tuned
CartPole-v1:
  n_envs: 8
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 32
  batch_size: 256
  gae_lambda: 0.8
  gamma: 0.98
  n_epochs: 20
  ent_coef: 0.0
  learning_rate: lin_0.001
  clip_range: lin_0.2

MountainCar-v0:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 16
  gae_lambda: 0.98
  gamma: 0.99
  n_epochs: 4
  ent_coef: 0.0

# Tuned
MountainCarContinuous-v0:
  normalize: true
  n_envs: 1
  n_timesteps: !!float 20000
  policy: 'MlpPolicy'
  batch_size: 256
  n_steps: 8
  gamma: 0.9999
  learning_rate: !!float 7.77e-05
  ent_coef: 0.00429
  clip_range: 0.1
  n_epochs: 10
  gae_lambda: 0.9
  max_grad_norm: 5
  vf_coef: 0.19
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3.29, ortho_init=False)"

Acrobot-v1:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 256
  gae_lambda: 0.94
  gamma: 0.99
  n_epochs: 4
  ent_coef: 0.0

BipedalWalker-v3:
  normalize: true
  n_envs: 32
  n_timesteps: !!float 5e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.999
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: !!float 3e-4
  clip_range: 0.18

BipedalWalkerHardcore-v3:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 10e7
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.001
  learning_rate: lin_2.5e-4
  clip_range: lin_0.2

LunarLander-v2:
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 1024
  batch_size: 64
  gae_lambda: 0.98
  gamma: 0.999
  n_epochs: 4
  ent_coef: 0.01

LunarLanderContinuous-v2:
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 1024
  batch_size: 64
  gae_lambda: 0.98
  gamma: 0.999
  n_epochs: 4
  ent_coef: 0.01

# Tuned
HalfCheetahBulletEnv-v0: &pybullet-defaults
  normalize: true
  n_envs: 16
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  batch_size: 128
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.9
  n_epochs: 20
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: !!float 3e-5
  use_sde: True
  clip_range: 0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

# Tuned
AntBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: !!float 3e-5
  policy_kwargs: "dict(log_std_init=-1,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

# Tuned
Walker2DBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: !!float 3e-5
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

# Tuned
HopperBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: !!float 3e-5
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

# Tuned
ReacherBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  batch_size: 64
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.9
  n_epochs: 20
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: !!float 3e-5
  use_sde: True
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2.7,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=dict(pi=[256, 256], vf=[256, 256])
                       )"

MinitaurBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

MinitaurBulletDuckEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

# To be tuned
HumanoidBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 1e7
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

InvertedDoublePendulumBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

InvertedPendulumSwingupBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2


# Following https://github.com/lcswillems/rl-starter-files
MiniGrid-Empty-Random-5x5-v0: &minigrid-defaults
  env_wrapper: minigrid.wrappers.FlatObsWrapper # See GH/1320#issuecomment-1421108191
  normalize: true
  n_envs: 8 # number of environment copies running in parallel
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 128 # batch size is n_steps * n_env
  batch_size: 64 # Number of training minibatches per update
  gae_lambda: 0.95 #  Factor for trade-off of bias vs variance for Generalized Advantage Estimator
  gamma: 0.99
  n_epochs: 10 #  Number of epoch when optimizing the surrogate
  ent_coef: 0.0 # Entropy coefficient for the loss calculation
  learning_rate: 2.5e-4 # The learning rate, it can be a function
  clip_range: 0.2 # Clipping parameter, it can be a function

MiniGrid-FourRooms-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 5e6
  n_steps: 512

MiniGrid-DoorKey-5x5-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e5

MiniGrid-MultiRoom-N4-S5-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e7 # Unsolved

MiniGrid-Fetch-5x5-N2-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 5e6

MiniGrid-GoToDoor-5x5-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 5e6

MiniGrid-PutNear-6x6-N2-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e7

MiniGrid-RedBlueDoors-6x6-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e6
  n_steps: 512

MiniGrid-LockedRoom-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e7 # Unsolved

MiniGrid-KeyCorridorS3R1-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 5e5

MiniGrid-Unlock-v0:
  <<: *minigrid-defaults

MiniGrid-ObstructedMaze-2Dlh-v0:
  <<: *minigrid-defaults
  n_timesteps: !!float 1e7 # Unsolved


CarRacing-v2:
  env_wrapper:
    - rl_zoo3.wrappers.FrameSkip:
        skip: 2
    - gymnasium.wrappers.resize_observation.ResizeObservation:
        shape: 64
    - gymnasium.wrappers.gray_scale_observation.GrayScaleObservation:
        keep_dim: true
  frame_stack: 2
  normalize: "{'norm_obs': False, 'norm_reward': True}"
  n_envs: 8
  n_timesteps: !!float 4e6
  policy: 'CnnPolicy'
  batch_size: 128
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.95
  n_epochs: 10
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: lin_1e-4
  use_sde: True
  clip_range: 0.2
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.GELU,
                       net_arch=dict(pi=[256], vf=[256]),
                       )"


# === Mujoco Envs ===
# HalfCheetah-v4: &mujoco-defaults
#   normalize: true
#   n_timesteps: !!float 1e6
#   policy: 'MlpPolicy'

Ant-v4: &mujoco-defaults
  normalize: true
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'

# Hopper-v4:
#   <<: *mujoco-defaults
#
# Walker2d-v4:
#   <<: *mujoco-defaults
#
# Humanoid-v4:
#   <<: *mujoco-defaults
#   n_timesteps: !!float 2e6
#
# tuned
Swimmer-v5:
  <<: *mujoco-defaults
  gamma: 0.9999
  n_envs: 4
  n_steps: 1024
  batch_size: 256
  learning_rate: !!float 6e-4
  gae_lambda: 0.98

# Tuned
# 10 mujoco envs

HalfCheetah-v5:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 64
  n_steps: 512
  gamma: 0.98
  learning_rate: 2.0633e-05
  ent_coef: 0.000401762
  clip_range: 0.1
  n_epochs: 20
  gae_lambda: 0.92
  max_grad_norm: 0.8
  vf_coef: 0.58096
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=dict(pi=[256, 256], vf=[256, 256])
                  )"

Ant-v5:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 5e6
  batch_size: 32
  n_steps: 512
  gamma: 0.98
  learning_rate: 1.90609e-05
  ent_coef: 4.9646e-07
  clip_range: 0.1
  n_epochs: 10
  gae_lambda: 0.8
  max_grad_norm: 0.6
  vf_coef: 0.677239

Hopper-v5:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 5e6
  batch_size: 32
  n_steps: 512
  gamma: 0.999
  learning_rate: 9.80828e-05
  ent_coef: 0.00229519
  clip_range: 0.2
  n_epochs: 5
  gae_lambda: 0.99
  max_grad_norm: 0.7
  vf_coef: 0.835671
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=dict(pi=[256, 256], vf=[256, 256])
                  )"

HumanoidStandup-v2:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e7
  batch_size: 32
  n_steps: 512
  gamma: 0.99
  learning_rate: 2.55673e-05
  ent_coef: 3.62109e-06
  clip_range: 0.3
  n_epochs: 20
  gae_lambda: 0.9
  max_grad_norm: 0.7
  vf_coef: 0.430793
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=dict(pi=[256, 256], vf=[256, 256])
                  )"

Humanoid-v5:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 5e6
  batch_size: 256
  n_steps: 512
  gamma: 0.95
  learning_rate: 3.56987e-05
  ent_coef: 0.00238306
  clip_range: 0.3
  n_epochs: 5
  gae_lambda: 0.9
  max_grad_norm: 2
  vf_coef: 0.431892
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=dict(pi=[256, 256], vf=[256, 256])
                  )"

InvertedDoublePendulum-v2:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 512
  n_steps: 128
  gamma: 0.98
  learning_rate: 0.000155454
  ent_coef: 1.05057e-06
  clip_range: 0.4
  n_epochs: 10
  gae_lambda: 0.8
  max_grad_norm: 0.5
  vf_coef: 0.695929

InvertedPendulum-v2:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 64
  n_steps: 32
  gamma: 0.999
  learning_rate: 0.000222425
  ent_coef: 1.37976e-07
  clip_range: 0.4
  n_epochs: 5
  gae_lambda: 0.9
  max_grad_norm: 0.3
  vf_coef: 0.19816

Reacher-v4:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.9
  learning_rate: 0.000104019
  ent_coef: 7.52585e-08
  clip_range: 0.3
  n_epochs: 5
  gae_lambda: 1.0
  max_grad_norm: 0.9
  vf_coef: 0.950368

Walker2d-v5:
  normalize: true
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.99
  learning_rate: 5.05041e-05
  ent_coef: 0.000585045
  clip_range: 0.1
  n_epochs: 20
  gae_lambda: 0.95
  max_grad_norm: 1
  vf_coef: 0.871923

procgen-coinrun-v0: &procgen-defaults
    policy: 'CnnPolicy'
    policy_kwargs: "dict(
        features_extractor_class=impala_policy.ImpalaCNN
    )"
    n_envs: 64
    gamma: 0.999
    gae_lambda: 0.95
    n_steps: 256
    n_epochs: 3
    batch_size: 2048
    clip_range: 0.2
    ent_coef: 0.01
    normalize: true
    learning_rate: !!float 5e-4
    n_timesteps: !!float 25e6

procgen-maze-v0:
    <<: *procgen-defaults

procgen-miner-v0:
    <<: *procgen-defaults
    
    
    
