# Hyperparameters stored under the `atari` key.
atari:
  policy: CnnPolicy
  # Python expression kept as a string; it is evaluated by the consumer,
  # not by the YAML parser.
  policy_kwargs: 'dict(optimizer_class=RMSpropTFLike, optimizer_kwargs=dict(eps=1e-5))'
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  # `frame_stack: 4` is equivalent to
  # vec_env_wrapper:
  #   - stable_baselines3.common.vec_env.VecFrameStack:
  #         n_stack: 4
  frame_stack: 4
  n_envs: 16
  # Explicit !!float tag: a bare `1e7` has no decimal point and would not be
  # resolved as a float by YAML 1.1 loaders.
  n_timesteps: !!float 1e7
  ent_coef: 0.01
  vf_coef: 0.2

# MiniGrid LavaGap (S6) with a project-local CNN policy class.
MiniGrid-LavaGapS6-v0:
  policy: src.utils.policies_forked.ActorCriticCnnPolicyGrid
  # Python expression kept as a string; evaluated by the consumer.
  policy_kwargs: 'dict(ortho_init=True, activation_fn=nn.ReLU)'
  # Note carried over from the original config: Dict observations are now
  # supported. The wrappers below are applied in the listed order.
  env_wrapper:
    - minigrid.wrappers.ImgObsWrapper
    - src.utils.wrappers.TransposedObservation
    # Disabled:
    # - src.utils.wrappers.RescaleWrapper
  normalize: true
  n_envs: 16  # number of environment copies running in parallel
  n_steps: 64  # batch size is n_steps * n_envs
  n_timesteps: !!float 1e6
  learning_rate: 0.00176065  # the learning rate; it can also be a function
  gamma: 0.99
  # Bias vs. variance trade-off factor for the Generalized Advantage Estimator
  gae_lambda: 0.95
  ent_coef: 0.01
  vf_coef: 0.04932
  max_grad_norm: 0.9

# MiniGrid LavaCrossing (S9N1) with the MlpPolicy.
MiniGrid-LavaCrossingS9N1-v0:
  policy: MlpPolicy
  # Python expression kept as a string; evaluated by the consumer.
  policy_kwargs: 'dict(ortho_init=True, activation_fn=nn.ReLU)'
  # Note carried over from the original config: Dict observations are now
  # supported.
  env_wrapper:
    - minigrid.wrappers.ImgObsWrapper
    # Disabled:
    # - src.utils.wrappers.RescaleWrapper
  normalize: true
  n_envs: 16  # number of environment copies running in parallel
  n_steps: 64  # batch size is n_steps * n_envs
  n_timesteps: !!float 2e6
  learning_rate: 0.00176065  # the learning rate; it can also be a function
  gamma: 0.99
  # Bias vs. variance trade-off factor for the Generalized Advantage Estimator
  gae_lambda: 0.95
  ent_coef: 0.01495
  vf_coef: 0.04932
  max_grad_norm: 0.9

# MiniGrid Dynamic-Obstacles (6x6) with the MlpPolicy.
MiniGrid-Dynamic-Obstacles-6x6-v0:
  policy: MlpPolicy
  # Python expression kept as a string; evaluated by the consumer.
  policy_kwargs: 'dict(activation_fn=nn.Tanh, ortho_init=True)'
  # Note carried over from the original config: Dict observations are now
  # supported.
  # Disabled alternative:
  # env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
  # NOTE(review): a Lava-named reward wrapper is applied to a
  # Dynamic-Obstacles environment -- confirm this is intended.
  env_wrapper:
    - minigrid.wrappers.ImgObsWrapper
    - src.utils.wrappers.LavaNegRewWrapper
  normalize: true
  normalize_advantage: true
  n_envs: 16  # number of environment copies running in parallel
  n_steps: 64  # batch size is n_steps * n_envs
  n_timesteps: !!float 1e6
  learning_rate: 0.001805
  gamma: 0.98
  # Bias vs. variance trade-off factor for the Generalized Advantage Estimator
  gae_lambda: 0.95
  ent_coef: 0.01161
  vf_coef: 0.87874
  max_grad_norm: 0.5

# CartPole: only policy, parallelism, budget and entropy coefficient are set.
CartPole-v1:
  policy: MlpPolicy
  n_envs: 8
  n_timesteps: !!float 5e5  # tag forces a float; `5e5` has no decimal point
  ent_coef: 0.0

# LunarLander (discrete) settings.
LunarLander-v2:
  policy: MlpPolicy
  n_envs: 8
  n_steps: 5
  n_timesteps: !!float 2e5
  # String value, not a number. The `lin_` prefix is presumably expanded into
  # a linear schedule by the consumer -- TODO confirm.
  learning_rate: 'lin_0.00083'
  gamma: 0.995
  ent_coef: 0.00001

# MountainCar (discrete) settings.
MountainCar-v0:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 1e6  # tag forces a float; `1e6` has no decimal point
  policy: 'MlpPolicy'
  # Written as 0.0 (was `.0`): same value, canonical float spelling that
  # matches the other entries in this file.
  ent_coef: 0.0

# Acrobot settings.
Acrobot-v1:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 5e5  # tag forces a float; `5e5` has no decimal point
  policy: 'MlpPolicy'
  # Written as 0.0 (was `.0`): same value, canonical float spelling that
  # matches the other entries in this file.
  ent_coef: 0.0

# Tuned
# Pendulum settings. Booleans use the canonical lowercase spelling.
Pendulum-v1:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  ent_coef: 0.0
  max_grad_norm: 0.5
  n_steps: 8
  gae_lambda: 0.9
  vf_coef: 0.4
  gamma: 0.9
  use_rms_prop: true
  normalize_advantage: false
  # String value, not a number. The `lin_` prefix is presumably expanded into
  # a linear schedule by the consumer -- TODO confirm.
  learning_rate: lin_7e-4
  use_sde: true
  # Python expression kept as a string, so `False` here is Python syntax and
  # must keep its capital letter.
  policy_kwargs: "dict(log_std_init=-2, ortho_init=False)"

# Tuned
# LunarLander (continuous) settings. Booleans use the canonical lowercase
# spelling.
LunarLanderContinuous-v2:
  normalize: true
  n_envs: 4
  n_timesteps: !!float 5e6
  policy: 'MlpPolicy'
  ent_coef: 0.0
  max_grad_norm: 0.5
  n_steps: 8
  gae_lambda: 0.9
  vf_coef: 0.4
  gamma: 0.99
  use_rms_prop: true
  normalize_advantage: false
  # String value, not a number. The `lin_` prefix is presumably expanded into
  # a linear schedule by the consumer -- TODO confirm.
  learning_rate: lin_7e-4
  use_sde: true
  # Python expression kept as a string, so `False` here is Python syntax and
  # must keep its capital letter.
  policy_kwargs: "dict(log_std_init=-2, ortho_init=False)"

# Tuned
# MountainCar (continuous) settings. Booleans use the canonical lowercase
# spelling.
MountainCarContinuous-v0:
  normalize: true
  n_envs: 4
  n_steps: 100
  n_timesteps: !!float 1e5  # tag forces a float; `1e5` has no decimal point
  policy: 'MlpPolicy'
  ent_coef: 0.0
  use_sde: true
  sde_sample_freq: 16
  # Python expression kept as a string, so `False` here is Python syntax and
  # must keep its capital letter.
  policy_kwargs: "dict(log_std_init=0.0, ortho_init=False)"

# Tuned
# BipedalWalker settings. Booleans use the canonical lowercase spelling.
BipedalWalker-v3:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 5e6
  policy: 'MlpPolicy'
  ent_coef: 0.0
  max_grad_norm: 0.5
  n_steps: 8
  gae_lambda: 0.9
  vf_coef: 0.4
  gamma: 0.99
  use_rms_prop: true
  normalize_advantage: false
  # String value, not a number. The `lin_` prefix is presumably expanded into
  # a linear schedule by the consumer -- TODO confirm.
  learning_rate: lin_0.00096
  use_sde: true
  # Python expression kept as a string, so `False` here is Python syntax and
  # must keep its capital letter.
  policy_kwargs: "dict(log_std_init=-2, ortho_init=False)"

# Tuned
# BipedalWalker (hardcore) settings. Booleans use the canonical lowercase
# spelling.
BipedalWalkerHardcore-v3:
  normalize: true
  n_envs: 32
  # 2e8 is the same value as the previous spelling `20e7`, written in
  # normalised scientific notation.
  n_timesteps: !!float 2e8
  policy: 'MlpPolicy'
  ent_coef: 0.001
  max_grad_norm: 0.5
  n_steps: 8
  gae_lambda: 0.9
  vf_coef: 0.4
  gamma: 0.99
  use_rms_prop: true
  normalize_advantage: false
  # String value, not a number. The `lin_` prefix is presumably expanded into
  # a linear schedule by the consumer -- TODO confirm.
  learning_rate: lin_0.0008
  use_sde: true
  # Python expression kept as a string, so `False` here is Python syntax and
  # must keep its capital letter.
  policy_kwargs: "dict(log_std_init=-2, ortho_init=False)"

# Tuned
# HalfCheetah (PyBullet) settings. This mapping is also the
# `pybullet-defaults` anchor that other entries in this file merge in with
# `<<:`, so any change here propagates to them unless they override the key.
HalfCheetahBulletEnv-v0: &pybullet-defaults
  normalize: true
  n_envs: 4
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  ent_coef: 0.0
  max_grad_norm: 0.5
  n_steps: 8
  gae_lambda: 0.9
  vf_coef: 0.4
  gamma: 0.99
  use_rms_prop: true
  normalize_advantage: false
  # Both learning-rate settings work; the second is kept as a disabled
  # alternative.
  learning_rate: lin_0.00096
  # learning_rate: !!float 3e-4
  use_sde: true
  # Python expression kept as a string, so `False` here is Python syntax and
  # must keep its capital letter.
  policy_kwargs: "dict(log_std_init=-2, ortho_init=False)"

Walker2DBulletEnv-v0:
  # Merge in every key of the `pybullet-defaults` anchor; no overrides.
  <<: *pybullet-defaults

# Tuned
AntBulletEnv-v0:
  # Merge in every key of the `pybullet-defaults` anchor; no overrides.
  <<: *pybullet-defaults

# Tuned
HopperBulletEnv-v0:
  # Merge in every key of the `pybullet-defaults` anchor; no overrides.
  <<: *pybullet-defaults

# Tuned but unstable
# Not working without SDE?
ReacherBulletEnv-v0:
  # Merge in every key of the `pybullet-defaults` anchor ...
  <<: *pybullet-defaults
  # ... then override the learning rate (an explicit key takes precedence
  # over the merged-in one).
  learning_rate: lin_0.0008

# === MuJoCo Envs ===

# HalfCheetah settings. This mapping is also the `mujoco-defaults` anchor
# that the entries below merge in with `<<:`.
HalfCheetah-v3: &mujoco-defaults
  policy: MlpPolicy
  normalize: true
  n_timesteps: !!float 1e6  # tag forces a float; `1e6` has no decimal point

Ant-v3:
  # Merge in every key of the `mujoco-defaults` anchor; no overrides.
  <<: *mujoco-defaults

Hopper-v3:
  # Merge in every key of the `mujoco-defaults` anchor; no overrides.
  <<: *mujoco-defaults

Walker2d-v3:
  # Merge in every key of the `mujoco-defaults` anchor; no overrides.
  <<: *mujoco-defaults

Humanoid-v3:
  # Merge in every key of the `mujoco-defaults` anchor ...
  <<: *mujoco-defaults
  # ... then override the step budget (the anchor sets 1e6).
  n_timesteps: !!float 2e6

Swimmer-v3:
  # Merge in every key of the `mujoco-defaults` anchor ...
  <<: *mujoco-defaults
  # ... and add a gamma value, which the anchor itself does not set.
  gamma: 0.9999
