# === Real Robot envs
NeckEnvRelative-v2:
  env_wrapper:
    - utils.wrappers.HistoryWrapper:
        horizon: 2
    - sb3_contrib.common.wrappers.TimeFeatureWrapper:
        test_mode: False
    # - utils.wrappers.LowPassFilterWrapper:
    #     freq: 2.0
    #     df: 25.0
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 100000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.99
  tau: 0.02
  n_episodes_rollout: 1
  gradient_steps: -1
  train_freq: -1
  # 10 episodes of warm-up
  learning_starts: 3000
  use_sde_at_warmup: True
  use_sde: True
  sde_sample_freq: 64
  policy_kwargs: "dict(log_std_init=-2, net_arch=[256, 256])"

# Tuned
MountainCarContinuous-v0:
  # env_wrapper: utils.wrappers.PlotActionWrapper
  n_timesteps: !!float 50000
  policy: 'MlpPolicy'
  learning_rate: !!float 3e-4
  buffer_size: 50000
  batch_size: 512
  ent_coef: 0.1
  train_freq: 32
  gradient_steps: 32
  gamma: 0.9999
  tau: 0.01
  learning_starts: 0
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3.67, net_arch=[64, 64])"
# Note: the hyperparams for HalfCheetah also work:
# MountainCarContinuous-v0:
#   # env_wrapper:
#   #   - utils.wrappers.PlotActionWrapper:
#   #       plot_freq: 1 # Every 1 episode
#   callback:
#     - utils.callbacks.PlotNoiseRatioCallback:
#         display_freq: 500
#   n_timesteps: !!float 50000
#   policy: 'MlpPolicy'
#   learning_rate: !!float 7.3e-4
#   buffer_size: 300000
#   batch_size: 256
#   ent_coef: 'auto'
#   gamma: 0.98
#   tau: 0.02
#   train_freq: 64
#   gradient_steps: 64
#   learning_starts: 0
#   use_sde: True
#   policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

Pendulum-v0:
  n_timesteps: 20000
  policy: 'MlpPolicy'
  learning_rate: !!float 1e-3
  use_sde: True
  n_episodes_rollout: 1
  gradient_steps: -1
  train_freq: -1
  policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])"

LunarLanderContinuous-v2:
  n_timesteps: !!float 5e5
  policy: 'MlpPolicy'
  batch_size: 256
  learning_starts: 1000

# Tuned
BipedalWalker-v3:
  n_timesteps: !!float 5e5
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# Almost tuned
BipedalWalkerHardcore-v3:
  n_timesteps: !!float 3e7
  policy: 'MlpPolicy'
  learning_rate: lin_7.3e-4
  buffer_size: 1000000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.99
  tau: 0.01
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300], use_expln=True)"

# === Bullet envs ===

# Tuned
HalfCheetahBulletEnv-v0:
  # env_wrapper:
  #   - sb3_contrib.common.wrappers.TimeFeatureWrapper
  #   - utils.wrappers.DelayedRewardWrapper:
  #       delay: 10
  #   - utils.wrappers.HistoryWrapper:
  #       horizon: 10
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# Tuned
AntBulletEnv-v0:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

HopperBulletEnv-v0:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: lin_7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

Walker2DBulletEnv-v0:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: lin_7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"


# Tuned
ReacherBulletEnv-v0:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 3e5
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

HumanoidBulletEnv-v0:
  normalize: "{'norm_obs': True, 'norm_reward': False}"
  n_timesteps: !!float 2e7
  policy: 'MlpPolicy'
  learning_rate: lin_3e-4
  buffer_size: 1000000
  batch_size: 64
  ent_coef: 'auto'
  train_freq: 1
  gradient_steps: 1
  learning_starts: 1000

# Tuned
InvertedDoublePendulumBulletEnv-v0:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 5e5
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# Tuned
InvertedPendulumSwingupBulletEnv-v0:
  env_wrapper: sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 3e5
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# To be tuned
MinitaurBulletEnv-v0:
  normalize: "{'norm_obs': True, 'norm_reward': False}"
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: lin_3e-4
  buffer_size: 1000000
  batch_size: 64
  ent_coef: 'auto'
  # ent_coef: 0.0003
  train_freq: 1
  gradient_steps: 1
  learning_starts: 1000

# To be tuned
MinitaurBulletDuckEnv-v0:
  # normalize: "{'norm_obs': True, 'norm_reward': False}"
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: lin_3e-4
  buffer_size: 1000000
  batch_size: 256
  ent_coef: 'auto'
  train_freq: 1
  gradient_steps: 1
  learning_starts: 1000

# To be tuned
CarRacing-v0:
  env_wrapper:
    - gym.wrappers.resize_observation.ResizeObservation:
        shape: 64
    - gym.wrappers.gray_scale_observation.GrayScaleObservation:
        keep_dim: true
  frame_stack: 4
  n_envs: 1
  n_timesteps: !!float 1e6
  # TODO: try with MlpPolicy and low res (MNIST: 28) pixels
  # policy: 'MlpPolicy'
  policy: 'CnnPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 50000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  # n_episodes_rollout: 1
  gradient_steps: 64
  # sde_sample_freq: 64
  learning_starts: 1000
  use_sde: True
  use_sde_at_warmup: True
  policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])"

# ==== Custom Envs ===

donkey-generated-track-v0:
  env_wrapper:
    - gym.wrappers.time_limit.TimeLimit:
        max_episode_steps: 500
    - utils.wrappers.HistoryWrapper:
        horizon: 5
    - sb3_contrib.common.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.99
  tau: 0.02
  # train_freq: 64
  train_freq: -1
  n_episodes_rollout: 1
  # gradient_steps: -1
  gradient_steps: 64
  learning_starts: 500
  use_sde_at_warmup: True
  use_sde: True
  sde_sample_freq: 64
  policy_kwargs: "dict(log_std_init=-2, net_arch=[64, 64])"
