# Hyperparameter set for the `FrozenLake-v1` entry.
FrozenLake-v1:
  # Policy and network
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[16])'
  # Training budget and optimisation.
  # `!!float` forces a number: some YAML 1.1 loaders would otherwise read
  # a bare `3e4` (no decimal point) as a string.
  n_timesteps: !!float 3e4
  # Kept as a string; the `lin_` prefix is presumably decoded by the
  # training script (linear schedule?) - confirm against the consumer.
  learning_rate: lin_7.3e-4
  batch_size: 32
  gamma: 0.98
  train_freq: 100  # commented-out alternative: [1, "episode"]
  gradient_steps: 1
  # Replay buffer and warm-up
  buffer_size: 1000
  learning_starts: 100
  # Target network
  initial_target_update_interval: 50
  # Exploration schedule
  exploration_fraction: 0.2
  exploration_final_eps: 0.07

# Hyperparameter set for the `ConstructedMaxBias` entry.
ConstructedMaxBias:
  # Policy and network
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[256, 256])'
  # Training budget and optimisation.
  # `!!float` makes sure `3e3` is loaded as a number rather than a string.
  n_timesteps: !!float 3e3
  # String value; the `lin_` prefix is presumably interpreted by the
  # training script - confirm.
  learning_rate: lin_7.3e-4
  batch_size: 128
  gamma: 0.98
  train_freq: 16
  gradient_steps: 8
  # Replay buffer and warm-up
  buffer_size: 10000
  learning_starts: 100
  # Target network
  initial_target_update_interval: 50
  # Exploration schedule
  exploration_fraction: 0.2
  exploration_final_eps: 0.07

# Hyperparameter set for the `MountainCar-v0` entry.
MountainCar-v0:
  # Policy and network
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[128, 128])'
  # Training budget and optimisation.
  # `!!float` makes sure `1e5` is loaded as a number rather than a string.
  n_timesteps: !!float 1e5
  # String value; the `lin_` prefix is presumably interpreted by the
  # training script - confirm.
  learning_rate: lin_1.5e-3
  batch_size: 128
  gamma: 0.99
  train_freq: 16
  gradient_steps: 8
  # Replay buffer and warm-up
  buffer_size: 10000
  learning_starts: 1000
  # Target network
  initial_target_update_interval: 500
  # Exploration schedule
  exploration_fraction: 0.2
  exploration_final_eps: 0.07

# Hyperparameter set for the `LunarLander-v2` entry.
LunarLander-v2:
  # Policy and network
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[128, 128])'
  # Training budget and optimisation.
  # `!!float` makes sure `1e5` is loaded as a number rather than a string.
  n_timesteps: !!float 1e5
  # String value; the `lin_` prefix is presumably interpreted by the
  # training script - confirm.
  learning_rate: lin_1.5e-3
  batch_size: 128
  gamma: 0.99
  train_freq: 16
  gradient_steps: 8
  # Replay buffer and warm-up
  buffer_size: 10000
  learning_starts: 1000
  # Target network
  initial_target_update_interval: 500
  # Exploration schedule
  exploration_fraction: 0.2
  exploration_final_eps: 0.07

# Hyperparameter set for the `LunarLander-v3` entry.
LunarLander-v3:
  # Policy and network
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[256, 256])'
  # Training budget and optimisation.
  # The `!!float` tags pin both values to numbers regardless of how the
  # loader resolves exponent notation.
  n_timesteps: !!float 1e5
  learning_rate: !!float 6.3e-4
  batch_size: 128
  gamma: 0.99
  train_freq: 4
  # NOTE(review): -1 looks like a sentinel handled by the consumer;
  # its exact meaning is not visible in this file - confirm.
  gradient_steps: -1
  # Replay buffer and warm-up
  buffer_size: 50000
  learning_starts: 0
  # Target network
  initial_target_update_interval: 250
  # Exploration schedule
  exploration_fraction: 0.12
  exploration_final_eps: 0.1

# Hyperparameter set for the `atari` entry (image-based: uses `CnnPolicy`,
# an environment wrapper and frame stacking).
atari:
  # Wrapper classes referenced by their dotted import path.
  env_wrapper:
    - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'CnnPolicy'
  # `!!float` makes sure exponent notation such as `1e7` / `1e-4` is loaded
  # as a number; some YAML 1.1 loaders read these bare forms as strings.
  n_timesteps: !!float 1e7
  buffer_size: 100000
  learning_rate: !!float 1e-4
  batch_size: 32
  learning_starts: 100000
  initial_target_update_interval: 1000
  train_freq: 4
  gradient_steps: 1
  # NOTE(review): every other entry visible in this file sets
  # exploration_fraction / exploration_final_eps at the top level; here they
  # are nested under `exploration_kwargs`. Confirm the consumer expects this.
  exploration_kwargs:
    exploration_fraction: 0.1
    exploration_final_eps: 0.01
  # If true, you need to deactivate handle_timeout_termination
  # in the replay_buffer_kwargs
  optimize_memory_usage: false

# Almost Tuned
# Hyperparameter set for the `CartPole-v1` entry.
CartPole-v1:
  # Policy and network
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[256, 256])'
  # Training budget and optimisation.
  # The `!!float` tags pin both values to numbers regardless of how the
  # loader resolves exponent notation.
  n_timesteps: !!float 5e4
  learning_rate: !!float 2.3e-3
  batch_size: 64
  gamma: 0.99
  train_freq: 256
  gradient_steps: 128
  # Replay buffer and warm-up
  buffer_size: 100000
  learning_starts: 1000
  # Target network
  initial_target_update_interval: 10
  # Exploration schedule
  exploration_fraction: 0.16
  exploration_final_eps: 0.04


# Tuned
# Hyperparameter set for the `Acrobot-v1` entry.
Acrobot-v1:
  # Policy and network
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(net_arch=[256, 256])'
  # Training budget and optimisation.
  # The `!!float` tags pin both values to numbers regardless of how the
  # loader resolves exponent notation.
  n_timesteps: !!float 1e5
  learning_rate: !!float 6.3e-4
  batch_size: 128
  gamma: 0.99
  train_freq: 4
  # NOTE(review): -1 looks like a sentinel handled by the consumer;
  # its exact meaning is not visible in this file - confirm.
  gradient_steps: -1
  # Replay buffer and warm-up
  buffer_size: 50000
  learning_starts: 0
  # Target network
  initial_target_update_interval: 250
  # Exploration schedule
  exploration_fraction: 0.12
  exploration_final_eps: 0.1
