atari:
  # --- Environment construction ---
  # Every environment is wrapped with AtariWrapper.
  env_wrapper:
    - 'stable_baselines3.common.atari_wrappers.AtariWrapper'
  # `frame_stack: 4` is equivalent to declaring:
  #   vec_env_wrapper:
  #     - stable_baselines3.common.vec_env.VecFrameStack:
  #         n_stack: 4
  frame_stack: 4
  n_envs: 16

  # --- Policy and training budget ---
  policy: "CnnPolicy"
  # Kept as a string holding a Python `dict(...)` expression.
  policy_kwargs: 'dict(optimizer_class=RMSpropTFLike, optimizer_kwargs=dict(eps=1e-5))'
  # The explicit !!float tag forces `1e7` to be read as a float.
  n_timesteps: !!float 1e7

  # --- Loss coefficients ---
  ent_coef: 0.01
  vf_coef: 0.25

CartPole-v1:
  # Policy identifier looked up by the training code.
  policy: "QMLP"
  n_envs: 8
  # The explicit !!float tag forces `5e5` to be read as a float.
  n_timesteps: !!float 5e5
  # Entropy coefficient set to zero.
  ent_coef: 0.0

LunarLander-v2:
  policy: "QMLP"

  # --- Rollout / budget ---
  n_envs: 8
  n_steps: 5
  # The explicit !!float tag forces `2e5` to be read as a float.
  n_timesteps: !!float 2e5

  # --- Optimisation ---
  gamma: 0.995
  ent_coef: 0.00001
  # String value, not a number; the `lin_` prefix is presumably read by the
  # loader as a schedule marker -- TODO confirm against the consumer.
  learning_rate: 'lin_0.00083'

MountainCar-v0:
  policy: "QMLP"
  # Normalization is switched on for this environment.
  normalize: true
  n_envs: 16
  # The explicit !!float tag forces `1e6` to be read as a float.
  n_timesteps: !!float 1e6
  # Entropy coefficient set to zero (written with a leading digit).
  ent_coef: 0.0

Acrobot-v1:
  policy: "QMLP"
  # Normalization is switched on for this environment.
  normalize: true
  n_envs: 16
  # The explicit !!float tag forces `5e5` to be read as a float.
  n_timesteps: !!float 5e5
  # Entropy coefficient set to zero (written with a leading digit).
  ent_coef: 0.0

# Tuned
Pendulum-v1:
  # Booleans in this block use the canonical lowercase `true`/`false`
  # spelling, matching the other entries in this file.
  normalize: true
  n_envs: 8
  # The explicit !!float tag forces `1e6` to be read as a float.
  n_timesteps: !!float 1e6
  policy: 'QMLP'
  ent_coef: 0.0
  max_grad_norm: 0.5
  n_steps: 8
  gae_lambda: 0.9
  vf_coef: 0.4
  gamma: 0.9
  use_rms_prop: true
  normalize_advantage: false
  # String value, quoted so it cannot be mistaken for a number; the `lin_`
  # prefix is presumably a schedule marker -- TODO confirm against the loader.
  learning_rate: 'lin_7e-4'
  use_sde: true
  # Kept as a string holding a Python `dict(...)` expression; the `False`
  # inside it is Python source text, not a YAML boolean.
  policy_kwargs: "dict(log_std_init=-2, ortho_init=False)"


MiniGrid-LavaGapS6-v0:
  # --- Environment construction ---
  # Dict observations are now supported.
  env_wrapper:
    - 'minigrid.wrappers.ImgObsWrapper'
    # Disabled wrapper, kept for reference:
    # - src.component.envs.RescaleWrapper
  normalize: true
  # Number of environment copies running in parallel.
  n_envs: 8

  # --- Policy and training budget ---
  policy: "QMLP"
  # Kept as a string holding a Python `dict(...)` expression.
  policy_kwargs: 'dict(ortho_init=True, activation_fn=nn.ReLU)'
  # The explicit !!float tag forces `1e6` to be read as a float.
  n_timesteps: !!float 1e6
  # Disabled override (batch size is n_steps * n_envs):
  # n_steps: 64

  # --- Optimisation ---
  # The learning rate; it can be a function.
  learning_rate: 0.00176065
  gamma: 0.99
  # Factor for the bias-vs-variance trade-off of the
  # Generalized Advantage Estimator.
  gae_lambda: 0.95
  ent_coef: 0.01495
  vf_coef: 0.04932
  max_grad_norm: 0.9

MiniGrid-LavaCrossingS9N1-v0:
  # --- Environment construction ---
  # Dict observations are now supported.
  env_wrapper:
    - 'minigrid.wrappers.ImgObsWrapper'
    # Disabled wrapper, kept for reference:
    # - src.component.envs.RescaleWrapper
  normalize: true
  # Number of environment copies running in parallel.
  n_envs: 16

  # --- Policy and training budget ---
  policy: "QMLP"
  # Kept as a string holding a Python `dict(...)` expression.
  policy_kwargs: 'dict(ortho_init=True, activation_fn=nn.ReLU)'
  # The explicit !!float tag forces `2e6` to be read as a float.
  n_timesteps: !!float 2e6
  # Batch size is n_steps * n_envs.
  n_steps: 64

  # --- Optimisation ---
  # The learning rate; it can be a function.
  learning_rate: 0.00176065
  gamma: 0.99
  # Factor for the bias-vs-variance trade-off of the
  # Generalized Advantage Estimator.
  gae_lambda: 0.95
  ent_coef: 0.01495
  vf_coef: 0.04932
  max_grad_norm: 0.9

MiniGrid-Dynamic-Obstacles-6x6-v0:
  # --- Environment construction ---
  # Dict observations are now supported.
  # Disabled alternative, kept for reference:
  #   env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
  env_wrapper:
    - 'minigrid.wrappers.ImgObsWrapper'
    - 'src.component.envs.LavaNegRewWrapper'
  normalize: true
  # Number of environment copies running in parallel.
  n_envs: 16

  # --- Policy and training budget ---
  policy: "QMLP"
  # Kept as a string holding a Python `dict(...)` expression.
  policy_kwargs: 'dict(activation_fn=nn.Tanh, ortho_init=True)'
  # The explicit !!float tag forces `1e6` to be read as a float.
  n_timesteps: !!float 1e6
  # Batch size is n_steps * n_envs.
  n_steps: 64

  # --- Optimisation ---
  learning_rate: 0.001805
  gamma: 0.98
  # Factor for the bias-vs-variance trade-off of the
  # Generalized Advantage Estimator.
  gae_lambda: 0.95
  normalize_advantage: true
  ent_coef: 0.01161
  vf_coef: 0.87874
  max_grad_norm: 0.5