# Baseline value for every option. The named presets further down in this
# file override subsets of these keys.
defaults:

  # Train Script
  # NOTE(review): logdir defaults to /dev/null; presumably it is meant to be
  # overridden for any real run — confirm.
  logdir: /dev/null
  seed: 0
  task: dmc_walker_walk
  envs: 1
  envs_parallel: none
  render_size: [64, 64]
  dmc_camera: -1
  atari_grayscale: True
  time_limit: 0
  action_repeat: 1
  steps: 1e8
  log_every: 1e4
  eval_every: 1e5
  eval_eps: 1
  prefill: 10000
  pretrain: 1
  train_every: 5
  train_steps: 1
  expl_until: 0
  replay: {capacity: 2e6, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: True, sample_recent: False, recent_episode_threshold: 0, initial_buffer_path: '', initial_buffer_capacity: 1e6, shuffle_blocks: False}
  dataset: {batch: 16, length: 50}
  goal_dataset: {batch: 150, length: 50}
  log_keys_video: ['image']
  # '^$' only matches the empty string.
  # NOTE(review): presumably these three are regexes over metric keys, so
  # nothing is selected by default — confirm against the logger.
  log_keys_sum: '^$'
  log_keys_mean: '^$'
  log_keys_max: '^$'
  precision: 16
  jit: True
  state_key: none
  goal_key: none
  image_input: True
  no_render: False
  slurm_preempt: False
  gcp_train_factor: 1

  # Agent
  clip_rewards: tanh
  expl_behavior: greedy
  expl_noise: 0.0
  epsilon_expl_noise: 0.0
  eval_noise: 0.0
  eval_state_mean: False

  # World Model
  grad_heads: [decoder, reward, discount]
  pred_discount: True
  pred_reward: True
  rssm: {ensemble: 1, hidden: 1024, deter: 1024, stoch: 32, discrete: 32, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  # Both key patterns default to '.*'; presets narrow them to specific
  # observation keys (e.g. 'image', 'qpos') or to '$^'.
  encoder: {mlp_keys: '.*', cnn_keys: '.*', act: elu, norm: none, cnn_depth: 48, cnn_kernels: [4, 4, 4, 4], mlp_layers: [400, 400, 400, 400]}
  decoder: {mlp_keys: '.*', cnn_keys: '.*', act: elu, norm: none, cnn_depth: 48, cnn_kernels: [5, 5, 6, 6], mlp_layers: [400, 400, 400, 400]}
  reward_head: {layers: 4, units: 400, act: elu, norm: none, dist: mse}
  discount_head: {layers: 4, units: 400, act: elu, norm: none, dist: binary}
  loss_scales: {kl: 1.0, reward: 1.0, discount: 1.0, proprio: 1.0}
  kl: {free: 0.0, forward: False, balance: 0.8, free_avg: True}
  model_opt: {opt: adam, lr: 1e-4, eps: 1e-5, clip: 100, wd: 1e-6}

  # Actor Critic
  actor: {layers: 4, units: 400, act: elu, norm: none, dist: auto, min_std: 0.1}
  critic: {layers: 4, units: 400, act: elu, norm: none, dist: mse}
  actor_opt: {opt: adam, lr: 8e-5, eps: 1e-5, clip: 100, wd: 1e-6}
  critic_opt: {opt: adam, lr: 2e-4, eps: 1e-5, clip: 100, wd: 1e-6}
  discount: 0.99
  p2e_discount: 0.99
  discount_lambda: 0.95
  imag_horizon: 15
  actor_grad: auto
  actor_grad_mix: 0.1
  actor_ent: 2e-3
  slow_target: True
  slow_target_update: 100
  slow_target_fraction: 1
  slow_baseline: True
  reward_norm: {momentum: 1.0, scale: 1.0, eps: 1e-8}

  # Goal Conditioning Stuff
  gc_input: 'embed'
  gc_reward: 'dynamical_distance'
  # NOTE(review): the original trailing note "sum_diff" presumably names an
  # alternative value; one preset in this file also uses 'sum' — confirm the
  # accepted set.
  gc_reward_shape: 'sum_dist' # sum_diff
  pred_embed: True
  embed_head: {layers: 3, units: 400, act: elu, norm: none, dist: mse}
  training_goals: 'batch'
  train_env_goal_percent: 0.0
  labelled_env_multiplexing: False
  subgoal_threshold: 5.0
  goal_policy_rollout_percentage: 0.3
  goal_strategy: 'SampleReplay'
  gcp_rollout_every: 1 # run gcp rollout every X algo update.
  exp_rollout_every: 1 # run exp rollout every X algo update.
  two_policy_rollout_every: 1 # run two policy rollout every X algo update
  goal_update_every: 0 # how often to update goal picker
  go_expl_rand_ac: False
  # NOTE(review): init_candidates [123456789.0] and the [0.0] goal bounds look
  # like placeholder values that goal-planning presets replace — confirm.
  planner: {planner_type: shooting_cem, horizon: 50, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 1.0, init_candidates: [123456789.0], goal_min: [0.0], goal_max: [0.0], cost_use_p2e_value: False, final_step_cost: False, sample_replay: False, evaluate_only: False, repeat_samples: 0, mega_prior: False, mppi_gamma: 10.0, sample_env_goal_percent: 0.0, init_env_goal_percent: 0.0}

  # Dynamical Distance
  # Keys are written without a space before the colon, matching the rest of
  # the file.
  dd_inp: 'embed'
  dd_num_positives: 256
  dd_neg_sampling_factor: 0.0
  dd_norm_inp: False
  dd_norm_reg_label: True
  dd_train_imag: True
  dd_train_off_policy: False
  dd_distance: 'steps_to_go'
  dd_loss: 'regression'
  dd_prob_balance: 1.0
  dd_opt: {opt: adam, lr: 8e-5, eps: 1e-5, clip: 100, wd: 1e-6}

  # Exploration
  expl_intr_scale: 1.0
  expl_extr_scale: 0.0
  expl_opt: {opt: adam, lr: 3e-4, eps: 1e-5, clip: 100, wd: 1e-6}
  expl_head: {layers: 4, units: 400, act: elu, norm: none, dist: mse}
  expl_reward_norm: {momentum: 1.0, scale: 1.0, eps: 1e-8}
  disag_target: stoch
  disag_log: False
  disag_models: 10
  disag_offset: 1
  disag_action_cond: True
  expl_model_loss: kl

  # noisy env hyperparams
  noise_dim: 5
  noise_low: -1.0
  noise_high: 1.0

# Atari preset (task atari_pong): image-only encoder/decoder and a 600-unit
# RSSM, with the step/time budgets and learning rates listed below.
atari:

  task: atari_pong

  # cnn_keys selects 'image'; mlp_keys is set to '$^'.
  encoder:
    mlp_keys: '$^'
    cnn_keys: 'image'
  decoder:
    mlp_keys: '$^'
    cnn_keys: 'image'

  # Environment and schedule.
  time_limit: 27000
  action_repeat: 4
  steps: 5e7
  eval_every: 2.5e5
  log_every: 1e4
  prefill: 50000
  train_every: 16
  clip_rewards: tanh

  # Model size and optimisation.
  rssm:
    hidden: 600
    deter: 600
  model_opt.lr: 2e-4
  actor_opt.lr: 4e-5
  critic_opt.lr: 1e-4
  actor_ent: 1e-3
  discount: 0.999
  loss_scales.kl: 0.1
  loss_scales.discount: 5.0

# Crafter preset (task crafter_reward): image-only encoder/decoder, 1024-unit
# RSSM, and logging patterns for achievement and reward keys.
crafter:

  task: crafter_reward

  encoder:
    mlp_keys: '$^'
    cnn_keys: 'image'
  decoder:
    mlp_keys: '$^'
    cnn_keys: 'image'

  log_keys_max: '^log_achievement_.*'
  log_keys_sum: '^log_reward$'

  rssm:
    hidden: 1024
    deter: 1024
  discount: 0.999
  model_opt.lr: 1e-4
  actor_opt.lr: 1e-4
  critic_opt.lr: 1e-4
  actor_ent: 3e-3

  # NOTE(review): this key looks like a regex over option names (anything
  # ending in ".norm"); presumably the loader expands it — confirm.
  '.*\.norm': layer

# DMC-from-pixels preset (task dmc_walker_walk): image-only encoder/decoder,
# 200-unit RSSM, no discount prediction.
dmc_vision:

  task: dmc_walker_walk

  encoder:
    mlp_keys: '$^'
    cnn_keys: 'image'
  decoder:
    mlp_keys: '$^'
    cnn_keys: 'image'

  action_repeat: 2
  eval_every: 1e4
  prefill: 1000
  pretrain: 100
  clip_rewards: identity

  # Discount prediction is off, so grad_heads omits 'discount'.
  pred_discount: False
  replay.prioritize_ends: False
  grad_heads: [decoder, reward]

  rssm:
    hidden: 200
    deter: 200
  model_opt.lr: 3e-4
  actor_opt.lr: 8e-5
  critic_opt.lr: 8e-5
  actor_ent: 1e-4
  kl.free: 1.0

# DMC proprioceptive preset (task dmc_walker_walk): same values as the
# dmc_vision preset except that mlp_keys is '.*' and cnn_keys is '$^'.
dmc_proprio:

  task: dmc_walker_walk

  encoder:
    mlp_keys: '.*'
    cnn_keys: '$^'
  decoder:
    mlp_keys: '.*'
    cnn_keys: '$^'

  action_repeat: 2
  eval_every: 1e4
  prefill: 1000
  pretrain: 100
  clip_rewards: identity

  # Discount prediction is off, so grad_heads omits 'discount'.
  pred_discount: False
  replay.prioritize_ends: False
  grad_heads: [decoder, reward]

  rssm:
    hidden: 200
    deter: 200
  model_opt.lr: 3e-4
  actor_opt.lr: 8e-5
  critic_opt.lr: 8e-5
  actor_ent: 1e-4
  kl.free: 1.0

# Preset for task dmc_walker_walk_proprio: Plan2Explore exploration with the
# SubgoalPlanner goal strategy on the 'qpos' observation key.
lexa_dmc_proprio:
  action_repeat: 2
  actor_opt.lr: 8e-5
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  decoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [400,400,400]}
  dd_neg_sampling_factor: 0.1
  disag_models: 10
  encoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [400, 400, 400]}
  eval_every: 66 # 66 episodes = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 50 # every 50 episodes, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  # log_keys_mean: 'metric_'
  # log_keys_max: 'metric_'
  log_keys_video: ['none']
  model_opt.lr: 3e-4
  p2e_discount: 0.996
  prefill: 33 # 2500 steps.
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  # goal_min / goal_max each hold 9 entries (one per goal dimension).
  planner: {planner_type: shooting_cem, horizon: 75, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 3.0, goal_min: [-1.3, -16.0, -3.14, -3.14, -3.14, -3.14, -3.14, -3.14, -3.14], goal_max: [0.5, 16.0, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14, 3.14], mega_prior: True}
  replay: {capacity: 2e6, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: False, sample_recent: True, recent_episode_threshold: 666 } # 100K steps
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 1.0
  state_key: 'qpos'
  steps: 2000000
  time_limit: 150
  train_every: 5
  task: dmc_walker_walk_proprio
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.

# Preset for task dmc_humanoid_walk_proprio: Plan2Explore exploration with the
# SubgoalPlanner goal strategy on the 'qpos' observation key.
lexa_humanoid_proprio:
  action_repeat: 2
  actor_opt.lr: 8e-5
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  decoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [400,400,400]}
  dd_neg_sampling_factor: 0.1
  disag_models: 10
  encoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [400, 400, 400]}
  eval_every: 66 # 66 episodes = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 50 # every 50 episodes, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  # log_keys_mean: 'metric_'
  # log_keys_max: 'metric_'
  log_keys_video: ['none']
  model_opt.lr: 3e-4
  p2e_discount: 0.996
  prefill: 33 # 2500 steps.
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  # goal_min / goal_max each hold 28 entries (one per goal dimension).
  planner: {planner_type: shooting_cem, horizon: 75, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 0.5, goal_min: [-0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1, -0.1], goal_max: [1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1], mega_prior: False}
  replay: {capacity: 2e6, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: False, sample_recent: True, recent_episode_threshold: 666 } # 100K steps
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 1.0
  state_key: 'qpos'
  steps: 2000000
  time_limit: 150
  train_every: 5
  task: dmc_humanoid_walk_proprio
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.

# Preset for task robobin_proprio: Plan2Explore exploration with the
# SampleReplay goal strategy on the 'qpos' observation key, using small
# [128, 128] encoder/decoder MLPs.
lexa_robobin_proprio:
  action_repeat: 2
  actor_opt.lr: 1e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  decoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [128, 128]}
  disag_models: 10
  disag_log: True
  encoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [128, 128]}
  eval_every: 66 # 66 episodes = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  gc_reward: 'dynamical_distance'
  # Reward/discount prediction are disabled below, so only the decoder head
  # is listed here.
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.3
  goal_strategy: 'SampleReplay'
  goal_key: 'goal'
  kl.free: 1.0
  log_keys_mean: 'metric_'
  model_opt.lr: 3e-4
  prefill: 2500
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  replay.prioritize_ends: False
  rssm: {hidden: 200, deter: 200}
  subgoal_threshold: 2
  state_key: 'qpos'
  time_limit: 150
  task: robobin_proprio

# Preset for task robobin_vision: Plan2Explore exploration with the
# SampleReplay goal strategy on image observations (goal_key 'image_goal').
lexa_robobin_vision:
  action_repeat: 2
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  decoder: {mlp_keys: '$^', cnn_keys: 'image', act: elu, norm: none, cnn_depth: 32, cnn_kernels: [5, 5, 6, 6], mlp_layers: [400, 400, 400, 400]}
  disag_models: 10
  disag_log: True
  encoder: {mlp_keys: '$^', cnn_keys: 'image', act: elu, norm: none, cnn_depth: 32, cnn_kernels: [4, 4, 4, 4], mlp_layers: [400, 400, 400, 400]}
  eval_every: 665 # 665 episodes = 5e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  gc_reward: 'dynamical_distance'
  # Reward/discount prediction are disabled below, so only the decoder head
  # is listed here.
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.3
  goal_strategy: 'SampleReplay'
  goal_key: 'image_goal'
  kl.free: 1.0
  log_keys_mean: 'metric_'
  model_opt.lr: 3e-4
  prefill: 2500
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  replay.prioritize_ends: False
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 2
  state_key: 'image'
  time_limit: 75
  task: robobin_vision

# Preset for task mtmw_sawyer_SawyerReachEnv: Plan2Explore exploration with
# the SampleReplay goal strategy on the 'qpos' observation key. Unlike the
# other presets it uses precision 32, action_repeat 5 and 3 disag models.
lexa_reach_proprio:
  action_repeat: 5
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  decoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [400,400,400,400]}
  disag_models: 3
  disag_log: True
  encoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [400,400,400,400]}
  eval_every: 66 # 66 episodes = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  gc_reward: 'dynamical_distance'
  # Reward/discount prediction are disabled below, so only the decoder head
  # is listed here.
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.3
  goal_strategy: 'SampleReplay'
  goal_key: 'goal'
  kl.free: 1.0
  log_keys_mean: 'metric_'
  model_opt.lr: 3e-4
  prefill: 2500
  precision: 32
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  replay.prioritize_ends: False
  rssm: {hidden: 200, deter: 200}
  subgoal_threshold: 2
  state_key: 'qpos'
  time_limit: 150
  task: mtmw_sawyer_SawyerReachEnv

# Preset for task pointmaze: Plan2Explore exploration with the MEGA goal
# strategy on the 'observation' key, with a 2-D goal box of [0, 10] x [0, 10].
mega_pointmaze_proprio:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor: 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_every: 200 # 200 episodes = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 50 # every 50 episodes, update goal picker
  goal_strategy: 'MEGA'
  goal_key: 'goal'
  go_expl_rand_ac: True # use rand ac for exploration?
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_mean: 'metric_'
  log_keys_sum: 'metric_|subgoal_dist'
  log_keys_max: 'metric_|subgoal_success'
  model_opt.lr: 3e-4
  prefill: 50
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  planner: {planner_type: shooting_cem, horizon: 25, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 4.0, init_candidates: [5.0, 5.0], goal_min: [0.0, 0.0], goal_max: [10.0, 10.0], cost_use_p2e_value: True, final_step_cost: False}
  replay.prioritize_ends: False
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 5.0
  state_key: 'observation'
  time_limit: 50
  train_every: 5
  task: pointmaze
  two_policy_rollout_every: 1 # run 2 policy rollout every X algo updates

# Preset for task umazefull: Plan2Explore exploration with the SubgoalPlanner
# goal strategy on the 'observation' key, using small [32, 32]
# encoder/decoder MLPs.
mega_umazefull_proprio:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor: 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [32,32]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [32, 32]}
  eval_every: 20 # 20 episodes x time_limit 500 = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 50 # every 50 episodes, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_mean: 'metric_'
  log_keys_max: 'metric_'
  log_keys_video: ['observation']
  model_opt.lr: 3e-4
  prefill: 5
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  # The planner mapping spans several lines; init_candidates, goal_min and
  # goal_max each hold one entry per goal dimension.
  planner: {planner_type: shooting_cem, horizon: 40, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 4.0, init_candidates: [0.0, 0.0, 8.19890515e-01,  9.95145602e-01,
        3.48547286e-02, -6.19350100e-02,  6.80766655e-02, -4.42372144e-02,
       -4.81461428e-02,  1.45511675e-02, -7.75746132e-02,  5.00618279e-02,
        3.65949561e-02,  4.99939194e-02,  1.02664477e-02, -2.27597298e-01,
        8.01758031e-02, -8.81610163e-02,  7.77121806e-02,  3.36722131e-02,
        2.27648027e-02, -2.24103019e-02, -3.77942221e-02, -6.56355237e-02,
        1.35722257e-01,  6.93039877e-02, -1.71162114e-01, -1.12083335e-01,
        1.76819156e-02], goal_min: [-3.0, -2.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
        goal_max: [20.0, 20.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] }
  replay.prioritize_ends: False
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 5.0
  state_key: 'observation'
  time_limit: 500
  train_every: 5
  task: umazefull
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.


# Preset for task umazefulldownscale: Plan2Explore exploration with the
# SubgoalPlanner goal strategy on the 'observation' key.
mega_umazefulldownscale_proprio:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor: 0.1
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  discount: 0.996
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_every: 20 # 20 episodes = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 50 # every 50 episodes, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_video: ['observation']
  model_opt.lr: 3e-4
  p2e_discount: 0.996
  prefill: 5
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  # The planner mapping spans several lines; init_candidates, goal_min and
  # goal_max each hold one entry per goal dimension.
  planner: {planner_type: shooting_cem, horizon: 100, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 10.0, init_candidates: [0.0, 0.0, 8.19890515e-01,  9.95145602e-01,
        3.48547286e-02, -6.19350100e-02,  6.80766655e-02, -4.42372144e-02,
       -4.81461428e-02,  1.45511675e-02, -7.75746132e-02,  5.00618279e-02,
        3.65949561e-02,  4.99939194e-02,  1.02664477e-02, -2.27597298e-01,
        8.01758031e-02, -8.81610163e-02,  7.77121806e-02,  3.36722131e-02,
        2.27648027e-02, -2.24103019e-02, -3.77942221e-02, -6.56355237e-02,
        1.35722257e-01,  6.93039877e-02, -1.71162114e-01, -1.12083335e-01,
        1.76819156e-02], goal_min: [-0.75, -0.5, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
        goal_max: [5.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], mega_prior: True}
  replay: {capacity: 2e6, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: False, sample_recent: True, recent_episode_threshold: 200 } # 100K steps
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 1.0
  state_key: 'observation'
  steps: 1000000
  time_limit: 500
  train_every: 5
  task: umazefulldownscale
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.

# Overlay that changes only the task, the evaluation interval and the planner
# goal upper bound.
# NOTE(review): the goal_max length and the "40 x 500 = 2e4" arithmetic match
# mega_umazefulldownscale_proprio (time_limit 500); presumably this is applied
# on top of that preset — confirm.
hardmaze:
  task: hardumazefulldownscale
  eval_every: 40 # 40 episodes = 2e4 steps.
  planner.goal_max: [5.0, 9.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


# Preset for task a1umazefulldownscale: Plan2Explore exploration with the
# SubgoalPlanner goal strategy on the 'observation' key.
mega_a1umazefulldownscale_proprio:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor: 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  discount: 0.996 # effective horizon 1 / (1 - 0.996) = 250 timesteps.
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_every: 20 # 20 episodes x time_limit 500 = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 50 # every 50 episodes, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_video: ['observation']
  model_opt.lr: 3e-4
  p2e_discount: 0.996
  prefill: 5
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  # The planner mapping spans several lines. Continuation lines are indented
  # deeper than the `planner` key so they cannot be mistaken for sibling keys
  # and stay valid for parsers that enforce flow-line indentation.
  planner: {planner_type: shooting_cem, horizon: 100, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 10.0,
    init_candidates: [0.0, 0.0, 0.24556014, 0.986648, 0.09023235, -0.09100603, 0.10050705, -0.07250207, -0.01489305, 0.09989551, -0.05246516, -0.05311238, -0.01864055, -0.05934234, 0.03910208, -0.08356607, 0.05515265, -0.00453086, -0.01196933, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    goal_min: [-0.75, -0.5, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
    goal_max: [2.5, 2.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], mega_prior: False}
  replay.prioritize_ends: False
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 1.0
  state_key: 'observation'
  steps: 2000000
  time_limit: 500
  train_every: 5
  task: a1umazefulldownscale
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.

# Preset for task fetchpnp: Plan2Explore exploration with the SubgoalPlanner
# goal strategy on the 'observation' key. gc_input is 'state' and pred_embed
# is disabled here, unlike most other presets in this file.
mega_fetchpnp_proprio:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor: 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_every: 200 # 200 episodes = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  gc_input: 'state'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 1 episode, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_sum: 'metric_|log_subgoal_dist'
  log_keys_max: 'is_success|log_subgoal_success'
  log_keys_video: ['none']
  model_opt.lr: 3e-4
  prefill: 50 # number of episodes of random exploration.
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: False
  # Disabled alternative: planner bounds over a 25-dimensional goal.
  # planner: {planner_type: shooting_cem, horizon: 25, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 1.0, goal_min: [1.1, 0.4, 0.41, 1.1, 0.4, 0.41, -1.0, -1.0, -1.0, -1.0, -1.0, -4.0, -4.0, -4.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0], goal_max: [1.5, 0.8, 0.8, 1.5, 0.8, 0.8, 1.0, 1.0, 1.0, 1.0, 1.0, 4.0, 4.0, 4.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0] }
  # Active planner: bounds over a 3-dimensional goal.
  planner: {planner_type: shooting_cem, horizon: 25, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 1.0, goal_min: [1.1, 0.4, 0.41], goal_max: [1.5, 0.8, 0.8], evaluate_only: False, sample_replay: False, repeat_samples: 0, mega_prior: False}
  replay: {capacity: 2e6, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: False, sample_recent: True, recent_episode_threshold: 2000 } # 100K steps
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 5.0
  state_key: 'observation'
  time_limit: 50
  train_every: 5
  task: fetchpnp
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.

# Overlay that changes only the task name.
# NOTE(review): presumably layered on top of mega_fetchpnp_proprio — confirm.
pnp_easy:
  task: fetchpnpeasy

# Overlay that switches gc_reward from 'dynamical_distance' to 'l2' and
# tightens subgoal_threshold to 0.02.
# NOTE(review): presumably layered on top of a fetchpnp preset — confirm.
pnp_oraclerew:
  gc_reward: l2
  subgoal_threshold: 0.02

# Preset for task demofetchpnp: Plan2Explore exploration with the
# SubgoalPlanner goal strategy on the 'observation' key, with an
# 11-dimensional planner goal and a 1e7-capacity replay buffer.
mega_demofetchpnp_proprio:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor: 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_every: 200  # 200 episodes = 1e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  gc_input: 'embed'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 1 episode, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_sum: 'metric_|log_subgoal_dist'
  log_keys_max: 'is_success|log_subgoal_success'
  log_keys_video: ['none']
  model_opt.lr: 3e-4
  prefill: 50 # number of episodes of random exploration.
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  planner: {planner_type: shooting_cem, horizon: 25, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 1.0, goal_min: [1.25, 0.5, 0.42, 0.0, 0.0, 1.25, 0.5, 0.42, 1.25, 0.5, 0.42], goal_max: [1.6, 1.0, 0.6, 0.05, 0.05, 1.6, 1.0, 0.6, 1.6, 1.0, 0.6], evaluate_only: False, sample_replay: False, repeat_samples: 0, mega_prior: False, sample_env_goal_percent: 0.5, init_env_goal_percent: 0.5}
  replay: {capacity: 1e7, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: False, sample_recent: True, recent_episode_threshold: 2000 } # 100K steps
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 5.0
  state_key: 'observation'
  time_limit: 50
  train_every: 5
  task: demofetchpnp
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.

# Overlay for task wallsdemofetchpnp2: replaces the planner with unit-box goal
# bounds over 11 dimensions and sets time_limit to 100.
# NOTE(review): presumably layered on top of mega_demofetchpnp_proprio, whose
# planner goal also has 11 dimensions — confirm.
walls_demofetchpnp:
  eval_every: 200  # 200 episodes x time_limit 100 = 2e4 steps.
  prefill: 25 # number of episodes of random exploration.
  planner: {planner_type: shooting_cem, horizon: 50, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 0.5, goal_min: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], goal_max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], evaluate_only: False, sample_replay: False, repeat_samples: 0, mega_prior: False, sample_env_goal_percent: 0.0, init_env_goal_percent: 0.0}
  replay.shuffle_blocks: False
  task: wallsdemofetchpnp2
  time_limit: 100

# Overlay for task wallsdemofetchpnp3: unit-box planner goal bounds over 14
# dimensions, time_limit 150, gc_reward_shape 'sum', and replay block
# shuffling enabled.
# NOTE(review): presumably layered on top of a demofetchpnp preset — confirm.
walls_demofetchpnp_3blocks:
  eval_every: 133  # 133 episodes = 2e4 steps.
  gc_reward_shape: sum
  prefill: 25 # number of episodes of random exploration.
  planner: {planner_type: shooting_cem, horizon: 75, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 0.5, goal_min: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], goal_max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], evaluate_only: False, sample_replay: False, repeat_samples: 0, mega_prior: False, sample_env_goal_percent: 0.0, init_env_goal_percent: 0.0}
  replay.shuffle_blocks: True
  task: wallsdemofetchpnp3
  time_limit: 150

# Overlay for task wallsdemofetchpnp3 with time_limit 100, planner horizon 50,
# env-goal sampling percentages of 0.5, and replay block shuffling disabled.
# NOTE(review): presumably layered on top of a demofetchpnp preset — confirm.
walls_demofetchpnp_3blocks_2blockgoals:
  # NOTE(review): the original comment said "133 episodes = 2e4 steps", which
  # matches time_limit 150; with this preset's time_limit of 100 it is
  # 133 x 100 = 1.33e4 steps — confirm which is intended.
  eval_every: 133  # 133 episodes x time_limit 100 = 1.33e4 steps.
  gc_reward_shape: sum
  prefill: 25 # number of episodes of random exploration.
  planner: {planner_type: shooting_cem, horizon: 50, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 0.5, goal_min: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], goal_max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], evaluate_only: False, sample_replay: False, repeat_samples: 0, mega_prior: False, sample_env_goal_percent: 0.5, init_env_goal_percent: 0.5}
  replay.shuffle_blocks: False
  task: wallsdemofetchpnp3
  time_limit: 100

# Overlay that sets 5 noise dimensions in [0.0, 1.0] and widens the planner
# goal bounds to 16 entries.
# NOTE(review): 16 = 11 + noise_dim, and 11 is the goal size used by the
# two-block demofetchpnp presets; presumably layered on one of them — confirm.
noisy_blocks_5dim:
  # noisy env hyperparams
  noise_dim: 5
  noise_low: 0.0
  noise_high: 1.0
  planner.goal_min: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
  planner.goal_max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

# Overlay that sets 10 noise dimensions in [0.0, 1.0] and widens the planner
# goal bounds to 21 entries.
# NOTE(review): 21 = 11 + noise_dim, and 11 is the goal size used by the
# two-block demofetchpnp presets; presumably layered on one of them — confirm.
noisy_blocks_10dim:
  # noisy env hyperparams
  noise_dim: 10
  noise_low: 0.0
  noise_high: 1.0
  planner.goal_min: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
  planner.goal_max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]

# Overlay that sets 20 noise dimensions in [0.0, 10.0] and widens the planner
# goal bounds to 31 entries. In goal_max the first 11 entries are 1.0 and the
# remaining 20 are 10.0, matching noise_high.
# NOTE(review): 31 = 11 + noise_dim, and 11 is the goal size used by the
# two-block demofetchpnp presets; presumably layered on one of them — confirm.
noisy_blocks_20dim:
  # noisy env hyperparams
  noise_dim: 20
  noise_low: 0.0
  noise_high: 10.0
  planner.goal_min: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
  planner.goal_max: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0]


# Overlay that changes only the task name.
# NOTE(review): presumably layered on top of a walls_demofetchpnp preset —
# confirm.
disc_walls_demofetchpnp:
  task: discwallsdemofetchpnp

# Preset for task dmc_walker_walk_vision: Plan2Explore exploration with the
# SubgoalPlanner goal strategy on image observations (goal_key 'image_goal').
# NOTE(review): no planner override is given here, so the planner values come
# from `defaults` — confirm that is intended with SubgoalPlanner.
lexa_dmc_vision:
  action_repeat: 2
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  decoder: {mlp_keys: '$^', cnn_keys: 'image', act: elu, norm: none, cnn_depth: 32, cnn_kernels: [5, 5, 6, 6], mlp_layers: [400, 400, 400, 400]}
  disag_models: 10
  disag_log: True
  encoder: {mlp_keys: '$^', cnn_keys: 'image', act: elu, norm: none, cnn_depth: 32, cnn_kernels: [4, 4, 4, 4], mlp_layers: [400, 400, 400, 400]}
  eval_every: 100 # 100 episodes = 5e4 steps.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  gc_reward: 'dynamical_distance'
  # Reward/discount prediction are disabled below, so only the decoder head
  # is listed here.
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.3
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'image_goal'
  kl.free: 1.0
  log_keys_mean: 'metric_'
  model_opt.lr: 3e-4
  prefill: 5
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  replay.prioritize_ends: False
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 2
  state_key: 'image'
  time_limit: 500
  # train_steps: 10
  task: dmc_walker_walk_vision


# Goal-conditioned preset for task dmc_walker_walk on the 'qpos' observation
# key, with a [128, 128] encoder MLP.
gc_dmc_proprio:
  task: dmc_walker_walk
  encoder: {mlp_keys: 'qpos', cnn_keys: '$^', mlp_layers: [128, 128]}
  decoder: {mlp_keys: 'qpos', cnn_keys: '$^'}
  action_repeat: 2
  eval_every: 66 # 66 episodes = 1e4 steps.
  prefill: 16
  pretrain: 100
  clip_rewards: identity
  pred_discount: False
  pred_reward: False
  replay.prioritize_ends: False
  # Only the decoder head is listed: pred_reward is False in this preset, and
  # every other preset in this file that disables pred_reward uses [decoder].
  # NOTE(review): the previous value was [decoder, reward] — confirm no run
  # relied on it.
  grad_heads: [decoder]
  rssm: {hidden: 200, deter: 200}
  model_opt.lr: 3e-4
  actor_opt.lr: 8e-5
  critic_opt.lr: 8e-5
  actor_ent: 1e-4
  kl.free: 1.0
  pred_embed: True
  time_limit: 150
  subgoal_threshold: 2
  goal_policy_rollout_percentage: 0.3

# Preset for task kitchen: Plan2Explore exploration with the SubgoalPlanner
# goal strategy on the 'state' observation key, with a 20-dimensional planner
# goal bounded to [-2.5, 2.5].
lexa_kitchen_proprio:
  action_repeat: 2
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor: 0.0
  decoder: {mlp_keys: 'state', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'state', cnn_keys: '$^', mlp_layers: [400,400,400]}
  # NOTE(review): the original comment said "133 episodes = 1e4 steps", but
  # 133 x time_limit 200 = 2.66e4 — confirm which is intended.
  eval_every: 133  # 133 episodes.
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  gc_input: 'embed'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 1 episode, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_sum: 'log_subgoal_dist'
  log_keys_max: 'metric_success_task_relevant/goal_|log_subgoal_success'
  log_keys_video: ['none']
  model_opt.lr: 3e-4
  prefill: 50 # number of episodes of random exploration.
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  planner: {planner_type: shooting_cem, horizon: 25, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 1.0, goal_min: [-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5], goal_max: [2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5], evaluate_only: False, sample_replay: False, repeat_samples: 0, mega_prior: False, sample_env_goal_percent: 0.0, init_env_goal_percent: 0.0}
  # NOTE(review): the original comment said "100K steps", but
  # 2000 episodes x time_limit 200 = 4e5 — confirm which is intended.
  replay: {capacity: 1e7, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: False, sample_recent: True, recent_episode_threshold: 2000 } # 2000 episodes
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 1.0
  state_key: 'state'
  time_limit: 200
  train_every: 5
  task: kitchen
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.

# Debug overlay: disables jit and shrinks the evaluation, logging, prefill and
# pretrain settings; train_every is raised to 100000.
# NOTE(review): presumably combined with a task preset for quick smoke runs —
# confirm.
debug:
  jit: False
  # time_limit: 100
  eval_every: 5
  log_every: 500
  prefill: 1
  pretrain: 1
  train_steps: 1
  train_every: 100000
  # Disabled overrides kept for reference:
  # replay: {minlen: 10, maxlen: 30}
  # dataset: {batch: 1, length: 50}