# Tuned
MountainCarContinuous-v0:
  # uncomment to plot the taken actions
  # env_wrapper: utils.wrappers.PlotActionWrapper
  n_timesteps: !!float 50000
  policy: 'MlpPolicy'
  learning_rate: !!float 3e-4
  buffer_size: 50000
  batch_size: 512
  ent_coef: 0.1
  train_freq: 32
  gradient_steps: 32
  gamma: 0.9999
  tau: 0.01
  learning_starts: 0
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3.67, net_arch=[64, 64])"
# Note: the hyperparams for HalfCheetah also work:
# MountainCarContinuous-v0:
#   # env_wrapper:
#   #   - utils.wrappers.PlotActionWrapper:
#   #       plot_freq: 1 # Every 1 episode
#   callback:
#     - utils.callbacks.PlotNoiseRatioCallback:
#         display_freq: 500
#   n_timesteps: !!float 50000
#   policy: 'MlpPolicy'
#   learning_rate: !!float 7.3e-4
#   buffer_size: 300000
#   batch_size: 256
#   ent_coef: 'auto'
#   gamma: 0.98
#   tau: 0.02
#   train_freq: 64
#   gradient_steps: 64
#   learning_starts: 0
#   use_sde: True
#   policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# === Bullet envs ===

# Tuned
HalfCheetahBulletEnv-v0:
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# Tuned
AntBulletEnv-v0:
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

HopperBulletEnv-v0:
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: lin_7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

Walker2DBulletEnv-v0:
  env_wrapper: utils.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: lin_7.3e-4
  buffer_size: 300000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.98
  tau: 0.02
  train_freq: 64
  gradient_steps: 64
  learning_starts: 10000
  use_sde: True
  policy_kwargs: "dict(log_std_init=-3, net_arch=[400, 300])"

# === Real Robot envs
NeckEnvRelative-v2:
  env_wrapper:
    - utils.wrappers.HistoryWrapper:
        horizon: 2
    - utils.wrappers.TimeFeatureWrapper
  n_timesteps: !!float 1e6
  policy: 'MlpPolicy'
  learning_rate: !!float 7.3e-4
  buffer_size: 100000
  batch_size: 256
  ent_coef: 'auto'
  gamma: 0.99
  tau: 0.02
  n_episodes_rollout: 1
  gradient_steps: -1
  train_freq: -1
  # 10 episodes of warm-up
  learning_starts: 3000
  use_sde_at_warmup: True
  use_sde: True
  sde_sample_freq: 64
  policy_kwargs: "dict(log_std_init=-2, net_arch=[256, 256])"
