############################################################
# NOTE: Starting with SB3 >= 1.1.0, HER is provided as HerReplayBuffer,
# so this file is no longer used.
# It is kept here only as a reference.
#############################################################

# Hyperparameters for the parking-v0 environment (model class: TQC).
parking-v0:
  # Run length limits
  n_timesteps: !!float 1e5
  max_episode_length: 100
  # Model and policy network
  model_class: 'tqc'
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(n_critics=2, net_arch=[512, 512, 512])'
  # Learning hyperparameters
  learning_rate: !!float 1e-3
  batch_size: 1024
  gamma: 0.95
  tau: 0.05
  # Replay buffer and goal sampling
  buffer_size: 1000000
  n_sampled_goal: 4
  goal_selection_strategy: 'future'
  online_sampling: true
  # normalize: True

# MuJoCo robotics envs

# Hyperparameters for the FetchPush-v1 environment (model class: TQC).
FetchPush-v1:
  # Wrappers applied to the environment
  env_wrapper:
    - sb3_contrib.common.wrappers.TimeFeatureWrapper
  # Run length
  n_timesteps: !!float 1e6
  # Model and policy network
  model_class: 'tqc'
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(n_critics=2, net_arch=[512, 512, 512])'
  # Learning hyperparameters
  learning_rate: !!float 1e-3
  batch_size: 2048
  gamma: 0.95
  tau: 0.05
  # Replay buffer and goal sampling
  buffer_size: 1000000
  n_sampled_goal: 4
  goal_selection_strategy: 'future'
  online_sampling: true

# Hyperparameters for the FetchSlide-v1 environment (model class: TQC).
FetchSlide-v1:
  # Wrappers applied to the environment
  env_wrapper:
    - sb3_contrib.common.wrappers.TimeFeatureWrapper
  # Run length
  n_timesteps: !!float 2.5e6
  # Model and policy network
  model_class: 'tqc'
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(n_critics=2, net_arch=[512, 512, 512])'
  # Learning hyperparameters
  learning_rate: !!float 1e-3
  batch_size: 2048
  gamma: 0.95
  tau: 0.05
  # ent_coef: 0.01
  # Replay buffer and goal sampling
  buffer_size: 1000000
  n_sampled_goal: 4
  goal_selection_strategy: 'future'
  online_sampling: true


# Hyperparameters for the FetchPickAndPlace-v1 environment (model class: TQC).
FetchPickAndPlace-v1:
  # Wrappers applied to the environment (disabled alternatives kept below)
  env_wrapper:
    - sb3_contrib.common.wrappers.TimeFeatureWrapper
    # - utils.wrappers.DoneOnSuccessWrapper:
    #     reward_offset: 0
    #     n_successes: 4
    # - stable_baselines3.common.monitor.Monitor
  # Run length
  n_timesteps: !!float 1e6
  # Model and policy network
  model_class: 'tqc'
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(n_critics=2, net_arch=[512, 512, 512])'
  # Learning hyperparameters
  learning_rate: !!float 1e-3
  batch_size: 1024
  gamma: 0.95
  tau: 0.05
  # Replay buffer and goal sampling
  buffer_size: 1000000
  n_sampled_goal: 4
  goal_selection_strategy: 'future'
  online_sampling: true

# SAC hyperparams
# Hyperparameters for the FetchReach-v1 environment (model class: SAC).
FetchReach-v1:
  # Run length
  n_timesteps: !!float 20000
  # Model and policy
  model_class: 'sac'
  policy: 'MlpPolicy'
  # Learning hyperparameters
  learning_rate: 0.001
  learning_starts: 1000
  batch_size: 256
  gamma: 0.95
  ent_coef: 'auto'
  # Replay buffer and goal sampling
  buffer_size: 1000000
  n_sampled_goal: 4
  goal_selection_strategy: 'future'
  online_sampling: true
  # Normalization flag
  normalize: true

# === Real Robot envs
# Hyperparameters for the NeckGoalEnvRelativeSparse-v2 environment
# (model class: SAC). No environment wrapper is active; the disabled
# wrapper configuration is kept below.
NeckGoalEnvRelativeSparse-v2:
  # env_wrapper:
  #   - utils.wrappers.HistoryWrapper:
  #       horizon: 2
  #   - sb3_contrib.common.wrappers.TimeFeatureWrapper
  # Run length
  n_timesteps: !!float 1e6
  # Model and policy network
  model_class: 'sac'
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(log_std_init=-2, net_arch=[256, 256])'
  # Learning hyperparameters
  learning_rate: !!float 7.3e-4
  batch_size: 256
  gamma: 0.99
  tau: 0.02
  ent_coef: 'auto'
  # Update schedule
  train_freq: [1, 'episode']
  gradient_steps: -1
  # 10 episodes of warm-up
  learning_starts: 1500
  # SDE options
  use_sde: true
  use_sde_at_warmup: true
  sde_sample_freq: 64
  # Replay buffer and goal sampling
  buffer_size: 100000
  n_sampled_goal: 4
  goal_selection_strategy: 'future'
  online_sampling: false

# Hyperparameters for the NeckGoalEnvRelativeDense-v2 environment
# (model class: SAC).
NeckGoalEnvRelativeDense-v2:
  # Wrappers applied to the environment (disabled alternative kept below)
  env_wrapper:
    - utils.wrappers.HistoryWrapperObsDict:
        horizon: 2
    # - sb3_contrib.common.wrappers.TimeFeatureWrapper
  # Run length
  n_timesteps: !!float 1e6
  # Model and policy network
  model_class: 'sac'
  policy: 'MlpPolicy'
  policy_kwargs: 'dict(log_std_init=-2, net_arch=[256, 256])'
  # Learning hyperparameters
  learning_rate: !!float 7.3e-4
  batch_size: 256
  gamma: 0.99
  tau: 0.02
  ent_coef: 'auto'
  # Update schedule
  train_freq: [1, 'episode']
  gradient_steps: -1
  # 10 episodes of warm-up
  learning_starts: 1500
  # SDE options
  use_sde: true
  use_sde_at_warmup: true
  sde_sample_freq: 64
  # Replay buffer and goal sampling
  buffer_size: 200000
  n_sampled_goal: 4
  goal_selection_strategy: 'future'
  online_sampling: false
