atari:
  # Hyperparameters for the `atari` entry.
  # NOTE(review): presumably applied to every Atari environment — confirm
  # against the code that loads this file.
  #
  # Optional environment wrapper, currently disabled:
  # env_wrapper:
  #   - stable_baselines3.common.atari_wrappers.AtariWrapper
  frame_stack: 4
  policy: 'CnnPolicy'
  # `!!float` forces a float: without the tag, exponent notation lacking a
  # decimal point (e.g. 1e7) is read as a string by YAML 1.1 loaders.
  n_timesteps: !!float 1e7
  buffer_size: 100000
  learning_rate: !!float 1e-5
  batch_size: 32
  learning_starts: 100000
  target_update_interval: 1
  train_freq: 5
  gradient_steps: 1
  exploration_fraction: 0.1
  exploration_final_eps: 0.01
  # When set to true, handle_timeout_termination must be deactivated
  # in the replay_buffer_kwargs.
  optimize_memory_usage: false
  policy_kwargs: {}
  # The tags below also turn integer literals (5, 1) into floats.
  LKTD_kwargs:
    sgld_temperature: !!float 1e-2
    prior_sd: !!float 5
    obs_sd: !!float 1
    alpha: !!float 0.9

# Almost Tuned
CartPole-v1:
  # Hyperparameters for the CartPole-v1 environment.
  # `!!float` forces a float: without the tag, exponent notation lacking a
  # decimal point (e.g. 5e4) is read as a string by YAML 1.1 loaders.
  n_timesteps: !!float 5e4
  policy: 'MlpPolicy'
  learning_rate: !!float 2.5e-5
  batch_size: 64
  buffer_size: 100000
  learning_starts: 1000
  gamma: 0.99
  target_update_interval: 1
  train_freq: 5
  gradient_steps: 1
  exploration_fraction: 0.16
  exploration_final_eps: 0.04
  policy_kwargs:
    net_arch: [256, 256]
  # The tags below also turn integer literals (5, 1) into floats.
  LKTD_kwargs:
    sgld_temperature: !!float 1e-2
    prior_sd: !!float 5
    obs_sd: !!float 1
    alpha: !!float 0.9

# Tuned
MountainCar-v0:
  # Hyperparameters for the MountainCar-v0 environment.
  # `!!float` forces a float: without the tag, exponent notation lacking a
  # decimal point (e.g. 2e5) is read as a string by YAML 1.1 loaders.
  n_timesteps: !!float 2e5
  policy: 'MlpPolicy'
  learning_rate: !!float 1e-4
  batch_size: 128
  buffer_size: 10000
  learning_starts: 1000
  gamma: 0.98
  target_update_interval: 1
  train_freq: 16
  gradient_steps: 8
  exploration_fraction: 0.2
  exploration_final_eps: 0.07
  # Block style keeps the mandatory space after `net_arch:`; the compact
  # `key:[...]` form inside a flow mapping is not portable across parsers.
  policy_kwargs:
    net_arch: [256, 256]
  # The tags below also turn integer literals (20, 1) into floats.
  LKTD_kwargs:
    sgld_temperature: !!float 1e-2
    prior_sd: !!float 20
    obs_sd: !!float 1
    alpha: !!float 0.9

# Tuned
LunarLander-v2:
  # Hyperparameters for the LunarLander-v2 environment.
  # `!!float` forces a float: without the tag, exponent notation lacking a
  # decimal point is read as a string by YAML 1.1 loaders.
  n_timesteps: !!float 1.2e5
  policy: 'MlpPolicy'
  learning_rate: !!float 5e-6
  batch_size: 128
  buffer_size: 50000
  learning_starts: 0
  gamma: 0.99
  target_update_interval: 1
  train_freq: 4
  # NOTE(review): -1 presumably follows the Stable-Baselines3 convention
  # (as many gradient steps as environment steps collected) — confirm.
  gradient_steps: -1
  exploration_fraction: 0.12
  exploration_final_eps: 0.25
  # Block style keeps the mandatory space after `net_arch:`; the compact
  # `key:[...]` form inside a flow mapping is not portable across parsers.
  policy_kwargs:
    net_arch: [256, 256]
  # The tags below also turn integer literals (5, 1) into floats.
  LKTD_kwargs:
    sgld_temperature: !!float 1e-2
    prior_sd: !!float 5
    obs_sd: !!float 1
    alpha: !!float 0.9

# Tuned
Acrobot-v1:
  # Hyperparameters for the Acrobot-v1 environment.
  # `!!float` forces a float: without the tag, exponent notation lacking a
  # decimal point (e.g. 1e5) is read as a string by YAML 1.1 loaders.
  n_timesteps: !!float 1e5
  policy: 'MlpPolicy'
  learning_rate: !!float 5e-5
  batch_size: 128
  buffer_size: 50000
  learning_starts: 0
  gamma: 0.99
  target_update_interval: 1
  train_freq: 5
  gradient_steps: 1
  exploration_fraction: 0.12
  exploration_final_eps: 0.1
  # Block style keeps the mandatory space after `net_arch:`; the compact
  # `key:[...]` form inside a flow mapping is not portable across parsers.
  policy_kwargs:
    net_arch: [256, 256]
  # The tags below also turn integer literals (5, 1) into floats.
  LKTD_kwargs:
    sgld_temperature: !!float 1e-2
    prior_sd: !!float 5
    obs_sd: !!float 1
    alpha: !!float 0.9
