
MsPacmanNoFrameskip-v4:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'EnsembleCnnPolicy'
  n_envs: 8
  n_steps: 128
  n_epochs: 4
  batch_size: 256
  n_timesteps: !!float 1.5e7
  learning_rate: 0.00025
  clip_range: 0.2
  vf_coef: 0.5
  ent_coef: 0.01
  
SeaquestNoFrameskip-v4:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'MasksemblesCnnPolicy'
  n_envs: 8
  n_steps: 128
  n_epochs: 4
  batch_size: 256
  n_timesteps: !!float 1.5e7
  learning_rate: 0.00025
  clip_range: 0.2
  vf_coef: 0.5
  ent_coef: 0.2
  
BreakoutNoFrameskip-v4:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'MasksemblesCnnPolicy'
  n_envs: 8
  n_steps: 128
  n_epochs: 4
  batch_size: 256
  n_timesteps: !!float 1.5e7
  learning_rate: 0.00025
  clip_range: 0.2
  vf_coef: 0.5
  ent_coef: 0.01
  
BerzerkNoFrameskip-v4:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'CnnPolicy'
  n_envs: 8
  n_steps: 128
  n_epochs: 4
  batch_size: 256
  n_timesteps: !!float 1.5e7
  learning_rate: 0.00025
  clip_range: 0.2
  vf_coef: 0.5
  ent_coef: 0.01
  
VizdoomCorridor-v0:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.WarpFrame
  frame_stack: 4
  policy: 'CnnPolicy'
  n_envs: 8
  n_steps: 128
  n_epochs: 4
  batch_size: 256
  n_timesteps: !!float 1.5e7
  learning_rate: 0.00025
  clip_range: 0.1
  vf_coef: 0.5
  ent_coef: 0.1
  # use_conf_matrix_sampling: true
  # conf_matrix_copy_freq: 0.01
  # conf_matrix_k: 2
  

# Tuned
Pendulum-v0:
  n_envs: 4
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 1024
  gae_lambda: 0.95
  gamma: 0.9
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: !!float 1e-3
  clip_range: 0.2
  use_sde: True
  sde_sample_freq: 4

# Tuned
CartPole-v1:
  n_envs: 8
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  n_steps: 32
  batch_size: 256
  gae_lambda: 0.8
  gamma: 0.98
  n_epochs: 20
  ent_coef: 0.0
  learning_rate: lin_0.001
  clip_range: lin_0.2

MountainCar-v0:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 16
  gae_lambda: 0.98
  gamma: 0.99
  n_epochs: 4
  ent_coef: 0.0

# Tuned
MountainCarContinuous-v0:
  normalize: true
  n_envs: 1
  n_timesteps: !!float 20000
  policy: 'MlpPolicy'
  batch_size: 256
  n_steps: 8
  gamma: 0.9999
  learning_rate: !!float 7.77e-05
  ent_coef: 0.00429
  clip_range: 0.1
  n_epochs: 10
  gae_lambda: 0.9
  max_grad_norm: 5
  vf_coef: 0.19
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3.29, ortho_init=False)"

Acrobot-v1:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 256
  gae_lambda: 0.94
  gamma: 0.99
  n_epochs: 4
  ent_coef: 0.0

BipedalWalker-v3:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 5e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.001
  learning_rate: !!float 2.5e-4
  clip_range: 0.2

BipedalWalkerHardcore-v3:
  normalize: true
  n_envs: 16
  n_timesteps: !!float 10e7
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.001
  learning_rate: lin_2.5e-4
  clip_range: lin_0.2

LunarLander-v2:
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 1024
  batch_size: 64
  gae_lambda: 0.98
  gamma: 0.999
  n_epochs: 4
  ent_coef: 0.01

LunarLanderContinuous-v2:
  n_envs: 16
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  n_steps: 1024
  batch_size: 64
  gae_lambda: 0.98
  gamma: 0.999
  n_epochs: 4
  ent_coef: 0.01

# Tuned
HalfCheetahBulletEnv-v0: &pybullet-defaults
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  normalize: true
  n_envs: 16
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  batch_size: 128
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.9
  n_epochs: 20
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: !!float 3e-5
  use_sde: True
  clip_range: 0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                       )"

# Tuned
AntBulletEnv-v0:
  <<: *pybullet-defaults
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  learning_rate: !!float 3e-5
  policy_kwargs: "dict(log_std_init=-1,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                       )"

# Tuned
Walker2DBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: !!float 3e-5
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                       )"

# Tuned
HopperBulletEnv-v0:
  <<: *pybullet-defaults
  learning_rate: !!float 3e-5
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                       )"

# Tuned
ReacherBulletEnv-v0:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  normalize: true
  n_envs: 8
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  batch_size: 64
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.9
  n_epochs: 20
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: !!float 3e-5
  use_sde: True
  clip_range: lin_0.4
  policy_kwargs: "dict(log_std_init=-2.7,
                       ortho_init=False,
                       activation_fn=nn.ReLU,
                       net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                       )"

MinitaurBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

MinitaurBulletDuckEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

# To be tuned
HumanoidBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 1e7
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

InvertedDoublePendulumBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

InvertedPendulumSwingupBulletEnv-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 2e6
  policy: 'MlpPolicy'
  n_steps: 2048
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

# Following https://github.com/lcswillems/rl-starter-files
# requires --gym-packages gym_minigrid
MiniGrid-DoorKey-5x5-v0:
  # Dict Observations are now supported
  # env_wrapper: gym_minigrid.wrappers.FlatObsWrapper
  normalize: true
  n_envs: 8 # number of environment copies running in parallel
  n_timesteps: !!float 1e5
  policy: MlpPolicy
  n_steps: 128 # batch size is n_steps * n_env
  batch_size: 64 # Number of training minibatches per update
  gae_lambda: 0.95 #  Factor for trade-off of bias vs variance for Generalized Advantage Estimator
  gamma: 0.99
  n_epochs: 10 #  Number of epoch when optimizing the surrogate
  ent_coef: 0.0 # Entropy coefficient for the loss caculation
  learning_rate: 2.5e-4 # The learning rate, it can be a function
  clip_range: 0.2 # Clipping parameter, it can be a function

# requires --gym-packages gym_minigrid
MiniGrid-FourRooms-v0:
  normalize: true
  n_envs: 8
  n_timesteps: !!float 4e6
  policy: 'MlpPolicy'
  n_steps: 512
  batch_size: 64
  gae_lambda: 0.95
  gamma: 0.99
  n_epochs: 10
  ent_coef: 0.0
  learning_rate: 2.5e-4
  clip_range: 0.2

CarRacing-v0:
  env_wrapper:
    - gym.wrappers.resize_observation.ResizeObservation:
        shape: 64
    - gym.wrappers.gray_scale_observation.GrayScaleObservation:
        keep_dim: true
  frame_stack: 4
  n_envs: 8
  n_timesteps: !!float 1e6
  policy: 'CnnPolicy'
  batch_size: 128
  n_steps: 512
  gamma: 0.99
  gae_lambda: 0.9
  n_epochs: 20
  ent_coef: 0.0
  sde_sample_freq: 4
  max_grad_norm: 0.5
  vf_coef: 0.5
  learning_rate: !!float 3e-5
  use_sde: True
  clip_range: 0.4
  policy_kwargs: "dict(log_std_init=-2,
                       ortho_init=False,
                       )"

# Tuned
# 10 mujoco envs
Ant-v2:
  n_envs: 8
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e7
  batch_size: 32
  n_steps: 512
  gamma: 0.98
  learning_rate: 1.90609e-05
  ent_coef: 4.9646e-07
  clip_range: 0.1
  n_epochs: 10
  gae_lambda: 0.8
  max_grad_norm: 0.6
  vf_coef: 0.677239
  
Ant-v3:
  n_envs: 8
  policy: 'EnsembleMlpPolicy'
  n_timesteps: !!float 1e7
  batch_size: 32
  n_steps: 512
  gamma: 0.98
  learning_rate: 1.90609e-05
  ent_coef: 4.9646e-07
  clip_range: 0.1
  n_epochs: 10
  gae_lambda: 0.8
  max_grad_norm: 0.6
  vf_coef: 0.677239

HalfCheetah-v3:
  n_envs: 8
  policy: 'MasksemblesMlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 64
  n_steps: 512
  gamma: 0.98
  learning_rate: 2.0633e-05
  ent_coef: 0.000401762
  clip_range: 0.1
  n_epochs: 20
  gae_lambda: 0.92
  max_grad_norm: 0.8
  vf_coef: 0.58096
  #policy_kwargs: "dict(
  #  log_std_init=-2,
  #  ortho_init=False,
  #  net_arch=dict(hidden_dim=384,
  #  last_layer_dim_pi=384,
  #  last_layer_dim_vf=384,
  #  num_masks=4
  #  )
  #)"
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                   net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                  )"

Hopper-v2:
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.999
  learning_rate: 9.80828e-05
  ent_coef: 0.00229519
  clip_range: 0.2
  n_epochs: 5
  gae_lambda: 0.99
  max_grad_norm: 0.7
  vf_coef: 0.835671
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                  )"

HumanoidStandup-v2:
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e7
  batch_size: 32
  n_steps: 512
  gamma: 0.99
  learning_rate: 2.55673e-05
  ent_coef: 3.62109e-06
  clip_range: 0.3
  n_epochs: 20
  gae_lambda: 0.9
  max_grad_norm: 0.7
  vf_coef: 0.430793
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                  )"

Humanoid-v2:
  n_envs: 8
  policy: 'MasksemblesMlpPolicy'
  n_timesteps: !!float 1e7
  batch_size: 256
  n_steps: 512
  gamma: 0.95
  learning_rate: 3.56987e-05
  ent_coef: 0.00238306
  clip_range: 0.3
  n_epochs: 5
  gae_lambda: 0.9
  max_grad_norm: 2
  vf_coef: 0.431892
  
  #policy_kwargs: "dict(
  #                  log_std_init=-2,
  #                  ortho_init=False,
  #                  activation_fn=nn.ReLU,
  #                  net_arch=[dict(pi=[256, 256], vf=[256, 256])]
  #                )"

InvertedDoublePendulum-v2:
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 512
  n_steps: 128
  gamma: 0.98
  learning_rate: 0.000155454
  ent_coef: 1.05057e-06
  clip_range: 0.4
  n_epochs: 10
  gae_lambda: 0.8
  max_grad_norm: 0.5
  vf_coef: 0.695929

InvertedPendulum-v2:
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 64
  n_steps: 32
  gamma: 0.999
  learning_rate: 0.000222425
  ent_coef: 1.37976e-07
  clip_range: 0.4
  n_epochs: 5
  gae_lambda: 0.9
  max_grad_norm: 0.3
  vf_coef: 0.19816

Reacher-v2:
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.9
  learning_rate: 0.000104019
  ent_coef: 7.52585e-08
  clip_range: 0.3
  n_epochs: 5
  gae_lambda: 1.0
  max_grad_norm: 0.9
  vf_coef: 0.950368

Swimmer-v2:
  n_envs: 1
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.9999
  learning_rate: 5.49717e-05
  ent_coef: 0.0554757
  clip_range: 0.3
  n_epochs: 10
  gae_lambda: 0.95
  max_grad_norm: 0.6
  vf_coef: 0.38782
  policy_kwargs: "dict(
                    log_std_init=-2,
                    ortho_init=False,
                    activation_fn=nn.ReLU,
                    net_arch=[dict(pi=[256, 256], vf=[256, 256])]
                  )"

Walker2d-v2:
  normalize: true
  n_envs: 8
  policy: 'MasksemblesMlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.99
  learning_rate: 5.05041e-05
  ent_coef: 0.000585045
  clip_range: 0.1
  n_epochs: 20
  gae_lambda: 0.95
  max_grad_norm: 1
  vf_coef: 0.871923
  
Walker2d-v3:
  normalize: true
  n_envs: 8
  policy: 'MlpPolicy'
  n_timesteps: !!float 1e6
  batch_size: 32
  n_steps: 512
  gamma: 0.99
  learning_rate: 5.05041e-05
  ent_coef: 0.000585045
  clip_range: 0.1
  n_epochs: 20
  gae_lambda: 0.95
  max_grad_norm: 1
  vf_coef: 0.871923


atari:
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'MasksemblesCnnPolicy'
  n_envs: 8
  n_steps: 128
  n_epochs: 4
  batch_size: 256
  n_timesteps: !!float 1.5e7
  learning_rate: 0.00025
  clip_range: 0.2
  vf_coef: 0.5
  ent_coef: 0.01
  # use_conf_matrix_sampling: true
  # conf_matrix_copy_freq: 0.01
  # conf_matrix_k: confmatrixschedule_9
  