defaults:

  # Train Script
  # Run-level settings: logging, environment, step budgets, replay and batch shapes.
  # NOTE: the unquoted `none` values below (envs_parallel, state_key, goal_key)
  # are the string 'none' in YAML, not a null.
  logdir: /dev/null
  seed: 0
  task: dmc_walker_walk
  envs: 1
  envs_parallel: none
  eval_env_distance_threshold: 0.2
  render_size: [64, 64]
  dmc_camera: -1
  atari_grayscale: True
  time_limit: 0
  action_repeat: 1
  # NOTE(review): exponent literals without a decimal point (1e8, 1e4, 1e5, 2e6)
  # are read as strings by YAML 1.1 loaders such as PyYAML; confirm the loader
  # in use resolves them as numbers.
  steps: 1e8
  log_every: 1e4
  ckpt_every: 0
  eval_every: 1e5
  eval_eps: 1
  prefill: 10000
  pretrain: 1
  train_every: 5
  train_steps: 1
  expl_until: 0
  replay: {capacity: 2e6, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: True, sample_recent: False, recent_episode_threshold: 0, initial_buffer_path: '', initial_buffer_capacity: 1e6, shuffle_blocks: False, delete_old_trajectories: True}
# Earlier, shorter replay settings kept for reference:
#replay: {capacity: 2e6, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: True, delete_old_trajectories: True}
  dataset: {batch: 16, length: 50}
  goal_dataset: {batch: 150, length: 50}
  log_keys_video: ['image']
  # '^$' matches only the empty string, so presumably no keys are selected by
  # default (assuming these values are used as regex patterns — confirm).
  log_keys_sum: '^$'
  log_keys_mean: '^$'
  log_keys_max: '^$'
  precision: 16
  jit: True
  state_key: none
  goal_key: none
  image_input: True
  no_render: False
  slurm_preempt: False
  gcp_train_factor: 1
  nth_gpu: 0
  episodic: False

  # Agent
  # Reward clipping, exploration behavior, and action-noise settings.
  # All noise terms are 0.0 by default.
  clip_rewards: tanh
  expl_behavior: greedy
  expl_noise: 0.0
  epsilon_expl_noise: 0.0
  eval_noise: 0.0
  eval_state_mean: False

  # World Model
  # RSSM, encoder/decoder, prediction heads, loss scales, and the model optimizer.
  grad_heads: [decoder, reward, discount]
  pred_discount: True
  pred_reward: True
  rssm: {ensemble: 1, hidden: 1024, deter: 1024, stoch: 32, discrete: 32, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  encoder: {mlp_keys: '.*', cnn_keys: '.*', act: elu, norm: none, cnn_depth: 48, cnn_kernels: [4, 4, 4, 4], mlp_layers: [400, 400, 400, 400]}
  decoder: {mlp_keys: '.*', cnn_keys: '.*', act: elu, norm: none, cnn_depth: 48, cnn_kernels: [5, 5, 6, 6], mlp_layers: [400, 400, 400, 400]}
  reward_head: {layers: 4, units: 400, act: elu, norm: none, dist: mse}
  reward_type: 'onehot' # 'prob' or 'onehot'; when the reward dist is binary, 'prob' outputs a probability while 'onehot' outputs 0 or 1.
  discount_head: {layers: 4, units: 400, act: elu, norm: none, dist: binary}
  loss_scales: {kl: 1.0, reward: 1.0, discount: 1.0, proprio: 1.0}
  kl: {free: 0.0, forward: False, balance: 0.8, free_avg: True}
  model_opt: {opt: adam, lr: 1e-4, eps: 1e-5, clip: 100, wd: 1e-6}

  # Actor Critic
  # Actor/critic networks, their optimizers, discounting, imagination horizon,
  # and slow-target settings. p2e_discount defaults to the same value as discount.
  actor: {layers: 4, units: 400, act: elu, norm: none, dist: auto, min_std: 0.1}
  critic: {layers: 4, units: 400, act: elu, norm: none, dist: mse}
  actor_opt: {opt: adam, lr: 8e-5, eps: 1e-5, clip: 100, wd: 1e-6}
  critic_opt: {opt: adam, lr: 2e-4, eps: 1e-5, clip: 100, wd: 1e-6}
  discount: 0.99
  p2e_discount: 0.99
  discount_lambda: 0.95
  imag_horizon: 15
  actor_grad: auto
  actor_grad_mix: 0.1
  actor_ent: 2e-3
  slow_target: True
  slow_target_update: 100
  slow_target_fraction: 1
  slow_baseline: True
  reward_norm: {momentum: 1.0, scale: 1.0, eps: 1e-8}

  # Goal Conditioning
  # Goal inputs and rewards, rollout scheduling, and the goal planner defaults.
  gc_input: 'embed'
  gc_reward: 'dynamical_distance'
  # Alternative value: sum_diff
  gc_reward_shape: 'sum'
  pred_embed: True
  embed_head:
    layers: 3
    units: 400
    act: elu
    norm: none
    dist: mse
  training_goals: 'batch'
  train_env_goal_percent: 0.0
  labelled_env_multiplexing: False
  subgoal_threshold: 5.0
  goal_policy_rollout_percentage: 0.3
  goal_strategy: 'SampleReplay'
  # Run a gcp rollout every X algo updates.
  gcp_rollout_every: 1
  # Run an exp rollout every X algo updates.
  exp_rollout_every: 1
  # Run a two-policy rollout every X algo updates.
  two_policy_rollout_every: 1
  # How often to update the goal picker.
  goal_update_every: 0
  go_expl_rand_ac: False
  # Planner defaults, written as a block mapping; keys and values are unchanged.
  planner:
    batch: 500
    cem_elite_ratio: 0.2
    cost_use_p2e_value: True
    evaluate_only: False
    final_step_cost: True
    goal_min: [0.0]
    goal_max: [0.0]
    horizon: 50
    init_candidates: [123456789.0]
    init_env_goal_percent: 0.0
    mega_prior: False
    mppi_gamma: 10.0
    optimization_steps: 5
    planner_type: shooting_mppi
    repeat_samples: 0
    std_scale: 1.0
    sample_replay: False
    sample_env_goal_percent: 0.0

  # Dynamical Distance
  # Settings for the dynamical-distance (dd_*) component and its optimizer.
  dd_inp: 'embed'
  dd_num_positives: 256
  dd_neg_sampling_factor: 0.0
  dd_norm_inp: False
  dd_norm_reg_label: True
  dd_train_imag: True
  dd_train_off_policy: False
  dd_distance: 'steps_to_go'
  dd_loss: 'regression'
  dd_prob_balance: 1.0
  dd_opt:
    opt: adam
    lr: 8e-5
    eps: 1e-5
    clip: 100
    wd: 1e-6

  # Exploration
  # Exploration reward scales, optimizer/head settings, and disagreement (disag_*)
  # options. By default the intrinsic scale is 1.0 and the extrinsic scale is 0.0.
  expl_intr_scale: 1.0
  expl_extr_scale: 0.0
  expl_opt: {opt: adam, lr: 3e-4, eps: 1e-5, clip: 100, wd: 1e-6}
  expl_head: {layers: 4, units: 400, act: elu, norm: none, dist: mse}
  expl_reward_norm: {momentum: 1.0, scale: 1.0, eps: 1e-8}
  disag_target: stoch
  disag_log: False
  disag_models: 10
  disag_offset: 1
  disag_action_cond: True
  expl_model_loss: kl

  # Reward weights
  # Demos
  # Kept as a quoted string: a commented-out override elsewhere in this file uses
  # a schedule expression, linear(0.75, 0.25, 50000), for this key.
  train_demo_percent: '0'
  w_td_reward: 0.0
  w_re_reward: 1.0
  # sawyer_door velocity
  # NOTE: this is the string 'None', not a YAML null.
  add_velocity_info: 'None'
  augmented_env_goals: False
  demos_goals: False

  # Policy-priority replay settings.
  do_policy_priority: False
  policy_priority_params: {metric: 'TD', a: 0.7, b: 0.0, c: 1e3, running_min: False, temporal: 'add', maxval: 1e5,  e: 0.01 }
  policy_priority_weight: 0.0  # Make negative to train policy separately from world model (using policy_priority for policy). Make positive (and less than 1) to train together, but mixing policy_priority with priority according to this weight.
  replay_buffer_source: /dev/null
  rescan_priority_every: 0

  eval_scenario_match: False
  sample2goal: True  # Selects radians vs. xyz for goal optimization. NOTE(review): confirm which one True selects.
  pretrained_wm: False

  # Pseudo-episodic setting
  pseudo_episodic_train_gcp: False
  pseudo_episodic_train_img_hor: 15
  only_pseudo_train: False
  only_pseudo_train_steps: 10000
  pseudo_goal_sampling: True
  fb_p: 0.0

# ant_maze: settings for the `mumazefulldownscale` task (see `task` below).
# Presumably applied on top of `defaults` by the config loader — confirm.
# Vector-observation setup: encoder/decoder read only the `observation` key;
# cnn_keys '$^' cannot match a non-empty key (assuming regex matching — confirm).
ant_maze:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 0.00001
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor : 0.1
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  discount: 0.996
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_every: 40 # 40 episodes = 2e4 steps (time_limit: 500).
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
  gc_reward: 'dynamical_distance'
  gc_input: 'state'
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.3
  goal_update_every: 1 # every 50 episodes, update goal picker
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_video: ['observation']
  model_opt.lr: 3e-4
  p2e_discount: 0.996
  prefill: 5
  precision: 16
  pretrain: 100
  pred_discount: False
  pred_reward: False
  pred_embed: True
  # Planner with a 2-D goal range; the commented-out lines are earlier 29-D and
  # 4-D variants of goal_min / goal_max / init_candidates.
  planner: {
    batch: 500,
    cem_elite_ratio: 0.2,
    cost_use_p2e_value: True,
    evaluate_only: False,
    final_step_cost: True,
#goal_min: [-0.75, -0.5, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0],
#goal_min: [-4.0, -4.0, -4.0, -4.0],
    goal_min: [-4.0, -4.0],
#goal_max: [5.0, 9.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
#goal_max: [4.0, 4.0, 4.0, 4.0],
    goal_max: [4.0, 4.0],
    horizon: 250,
#init_candidates: [0.0, 0.0, 8.19890515e-01,  9.95145602e-01,
#        3.48547286e-02, -6.19350100e-02,  6.80766655e-02, -4.42372144e-02,
#       -4.81461428e-02,  1.45511675e-02, -7.75746132e-02,  5.00618279e-02,
#        3.65949561e-02,  4.99939194e-02,  1.02664477e-02, -2.27597298e-01,
#        8.01758031e-02, -8.81610163e-02,  7.77121806e-02,  3.36722131e-02,
#        2.27648027e-02, -2.24103019e-02, -3.77942221e-02, -6.56355237e-02,
#        1.35722257e-01,  6.93039877e-02, -1.71162114e-01, -1.12083335e-01,
#        1.76819156e-02],
#init_candidates: [0.0, 0.0, 0.0, 0.0],
    init_candidates: [0.0, 0.0],
    init_env_goal_percent: 0.0,
    mega_prior: False,
    mppi_gamma: 2.0,
    optimization_steps: 5,
    planner_type: shooting_mppi,
    repeat_samples: 0,
    std_scale: 10.0,
    sample_replay: False,
    sample_env_goal_percent: 0.0,
  }
  replay: {capacity: 2e6, ongoing: False, minlen: 50, maxlen: 50, prioritize_ends: False, sample_recent: True, recent_episode_threshold: 200 } # 200 episodes * time_limit 500 = 100K steps
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  subgoal_threshold: 2.0
  state_key: 'observation'
  steps: 1000000
  time_limit: 500
  train_every: 5
# Alternative maze tasks:
#task: hardumazefulldownscale
#task: sumazefulldownscale
  task: mumazefulldownscale
#  task: emptyumazefulldownscale
  two_policy_rollout_every: 1 # run 2 policy rollout every X expl rollouts.

# earl_tabletop: settings for the `tabletop` task (see `task` below).
# Presumably applied on top of `defaults` by the config loader — confirm.
# Vector-observation setup: encoder/decoder read only the `observation` key;
# cnn_keys '$^' cannot match a non-empty key (assuming regex matching — confirm).
# Unlike the other task sections in this file, gc_reward is 'td_env' here.
earl_tabletop:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-4
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor : 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_env_distance_threshold: 0.2
  eval_every: 100 # 100 episodes = 2e4 steps (time_limit: 200).
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
#gc_reward: 'learned_env_reward'
#gc_reward: 'dynamical_distance'
#gc_input: 'embed'
  gc_input: 'state'
  gc_reward: 'td_env'
#grad_heads: [decoder, reward]
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 50 episodes, update goal picker
#goal_strategy: 'SampleEnv'
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  go_expl_rand_ac: False
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_mean: 'metric_'
  log_keys_sum: 'metric_|subgoal_dist'
  log_keys_max: 'metric_|subgoal_success'
  model_opt.lr: 3e-4
  prefill: 50
  precision: 16
  pretrain: 100
  pred_discount: False
#pred_reward: True
  pred_reward: False
  pred_embed: True
  planner: {planner_type: shooting_cem, horizon: 100, batch: 500, cem_elite_ratio: 0.2, optimization_steps: 5, std_scale: 4.0, init_candidates: [0.0, 0.0, 0.0, 0.0, -1.0, -1.0], goal_min: [-2.8, -2.8, -2.8, -2.8, -1.0, -1.0], goal_max: [2.8, 2.8, 2.8, 2.8, 1.0, 1.0], cost_use_p2e_value: True, final_step_cost: True }
  replay.prioritize_ends: False
  replay.sample_recent: True
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  reward_head: {layers: 4, units: 400, act: elu, norm: none, dist: binary} # binary here, but use dist.mean() instead of dist.mode(), which will be the Bernoulli probability
  reward_type: 'onehot'
  subgoal_threshold: 2.0
  state_key: 'observation'
  steps: 400000
  time_limit: 200
  train_every: 5
  train_env_goal_percent: 0.0
  task: tabletop
  two_policy_rollout_every: 1 # run 2 policy rollout every X algo updates

# earl_sawyer_door: settings for the `sawyer_door` task (see `task` below).
# Presumably applied on top of `defaults` by the config loader — confirm.
# Vector-observation setup: encoder/decoder read only the `observation` key;
# cnn_keys '$^' cannot match a non-empty key (assuming regex matching — confirm).
# Uses the larger RSSM (hidden/deter 1024); the 200-unit variant is commented out.
earl_sawyer_door:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-5
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor : 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_env_distance_threshold: 0.02
  eval_every: 100 # 100 episodes = 3e4 steps (time_limit: 300).
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
#gc_reward: 'learned_env_reward'
  gc_reward: 'dynamical_distance'
#gc_input: 'embed'
  gc_input: 'state'
#gc_reward: 'td_env'
#grad_heads: [decoder, reward]
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 50 episodes, update goal picker
#goal_strategy: 'SampleEnv'
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  go_expl_rand_ac: False
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_mean: 'metric_'
  log_keys_sum: 'metric_|subgoal_dist'
  log_keys_max: 'metric_|subgoal_success'
  model_opt.lr: 3e-4
  prefill: 50
  precision: 16
  pretrain: 100
  pred_discount: False
#pred_reward: True
  pred_reward: False
  pred_embed: True
#planner: {planner_type: shooting_cem, horizon: 150, batch: 500, mppi_gamma: 2.0, optimization_steps: 5, std_scale: 4.0, init_candidates: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], goal_min: [-0.5, 0.4, 0.05, -1.0, -0.25, 0.45, 0.0999], goal_max: [0.5, 1.0, 0.5, 1.0, 0.35, 0.85, 0.15], cost_use_p2e_value: True, final_step_cost: True }
# Variant: goal optimization range formed from the goal max/min.
#planner: {planner_type: shooting_cem, horizon: 150, batch: 500, mppi_gamma: 2.0, optimization_steps: 5, std_scale: 4.0, init_candidates: [0.0, 0.0, 0.0, 0.0, 0.0], goal_min: [-0.5, 0.4, 0.05, -1.0, -1.31], goal_max: [0.5, 1.0, 0.5, 1.0, -0.26], cost_use_p2e_value: True, final_step_cost: True }
# Active: goal optimization range formed from the goal initial/target values.
  planner: {planner_type: shooting_cem, horizon: 150, batch: 500, mppi_gamma: 2.0, optimization_steps: 5, std_scale: 0.5, init_candidates: [0.0, 0.0, 0.0, 0.0, 0.0], goal_min: [-0.5, 0.4, 0.05, -1.0, -1.06], goal_max: [0.5, 1.0, 0.5, 1.0, -0.016], cost_use_p2e_value: True, final_step_cost: True }
  replay.prioritize_ends: False
  replay.sample_recent: True
#rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  rssm: {ensemble: 1, hidden: 1024, deter: 1024, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  reward_head: {layers: 4, units: 400, act: elu, norm: none, dist: binary} # binary here, but use dist.mean() instead of dist.mode(), which will be the Bernoulli probability
  reward_type: 'onehot'
  subgoal_threshold: 2.0
  state_key: 'observation'
  steps: 1000000
  time_limit: 300
  train_every: 5
  train_env_goal_percent: 0.0
  task: sawyer_door
  two_policy_rollout_every: 1 # run 2 policy rollout every X algo updates
#train_demo_percent: linear(0.75, 0.25, 50000) # sample batches with 75% of the data from demos, gradually decaying to 25% over 50000 steps.
  train_demo_percent: 0 # presumably no demo data in sampled batches (the schedule form above is disabled) — confirm.

# fetch_reach: settings for the `fetch_reach_ergodic` task (see `task` below).
# Presumably applied on top of `defaults` by the config loader — confirm.
# Vector-observation setup: encoder/decoder read only the `observation` key;
# cnn_keys '$^' cannot match a non-empty key (assuming regex matching — confirm).
fetch_reach:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-5
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor : 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_env_distance_threshold: 0.02
  eval_every: 100 # 100 episodes = 1e4 steps (time_limit: 100).
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
#gc_reward: 'learned_env_reward'
  gc_reward: 'dynamical_distance'
#gc_input: 'embed'
  gc_input: 'state'
#gc_reward: 'td_env'
#grad_heads: [decoder, reward]
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 50 episodes, update goal picker
#goal_strategy: 'SampleEnv'
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  go_expl_rand_ac: False
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_mean: 'metric_'
  log_keys_sum: 'metric_|subgoal_dist'
  log_keys_max: 'metric_|subgoal_success'
  model_opt.lr: 3e-4
  prefill: 50
  precision: 16
  pretrain: 100
  pred_discount: False
#pred_reward: True
  pred_reward: False
  pred_embed: True
# Goal optimization range formed from the goal max/min.
# NOTE(review): init_candidates[1] = 1.7 lies outside [goal_min[1], goal_max[1]]
# = [0.55, 0.9]; how the planner treats out-of-range candidates is not visible
# here — confirm this value is intended.
  planner: {planner_type: shooting_cem, horizon: 50, batch: 500, mppi_gamma: 2.0, optimization_steps: 5, std_scale: 0.5, init_candidates: [1.3, 1.7, 0.55], goal_min: [1.2, 0.55, 0.4], goal_max: [1.5, 0.9, 0.7], cost_use_p2e_value: True, final_step_cost: True }
  replay.prioritize_ends: False
  replay.sample_recent: True
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
#rssm: {ensemble: 1, hidden: 1024, deter: 1024, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  reward_head: {layers: 4, units: 400, act: elu, norm: none, dist: binary} # binary here, but use dist.mean() instead of dist.mode(), which will be the Bernoulli probability
  reward_type: 'onehot'
  subgoal_threshold: 2.0
  state_key: 'observation'
  steps: 1000000
  time_limit: 100
  train_every: 5
  train_env_goal_percent: 0.0
  task: fetch_reach_ergodic
  two_policy_rollout_every: 1 # run 2 policy rollout every X algo updates
  train_demo_percent: 0

# fetch_push: settings for the `boxpush` task (see `task` below; the
# `fetch_push_ergodic` alternative is commented out).
# Presumably applied on top of `defaults` by the config loader — confirm.
# Vector-observation setup: encoder/decoder read only the `observation` key;
# cnn_keys '$^' cannot match a non-empty key (assuming regex matching — confirm).
fetch_push:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-5
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor : 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_env_distance_threshold: 0.02
  eval_every: 100 # 100 episodes = 1e4 steps (time_limit: 100).
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
#gc_reward: 'learned_env_reward'
  gc_reward: 'dynamical_distance'
#gc_input: 'embed'
  gc_input: 'state'
#gc_reward: 'td_env'
#grad_heads: [decoder, reward]
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 50 episodes, update goal picker
#goal_strategy: 'SampleEnv'
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  go_expl_rand_ac: False
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_mean: 'metric_'
  log_keys_sum: 'metric_|subgoal_dist'
  log_keys_max: 'metric_|subgoal_success'
  model_opt.lr: 3e-4
  prefill: 50
  precision: 16
  pretrain: 100
  pred_discount: False
#pred_reward: True
  pred_reward: False
  pred_embed: True
# Goal optimization range formed from the goal max/min; the third coordinate is
# pinned (goal_min[2] == goal_max[2] == 0.424).
# NOTE(review): init_candidates = [1.3, 1.7, 0.42] falls outside that range in
# the second ([0.55, 0.9]) and third (0.424) coordinates; how the planner treats
# out-of-range candidates is not visible here — confirm these values are intended.
  planner: {planner_type: shooting_cem, horizon: 50, batch: 500, mppi_gamma: 2.0, optimization_steps: 5, std_scale: 0.5, init_candidates: [1.3, 1.7, 0.42], goal_min: [1.2, 0.55, 0.424], goal_max: [1.5, 0.9, 0.424], cost_use_p2e_value: True, final_step_cost: True }
  replay.prioritize_ends: False
  replay.sample_recent: True
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
#rssm: {ensemble: 1, hidden: 1024, deter: 1024, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  reward_head: {layers: 4, units: 400, act: elu, norm: none, dist: binary} # binary here, but use dist.mean() instead of dist.mode(), which will be the Bernoulli probability
  reward_type: 'onehot'
  subgoal_threshold: 2.0
  state_key: 'observation'
  steps: 650000
  time_limit: 100
  train_every: 5
  train_env_goal_percent: 0.0
#task: fetch_push_ergodic
  task: boxpush
  two_policy_rollout_every: 1 # run 2 policy rollout every X algo updates
  train_demo_percent: 0

# fetch_pick: settings for the `boxpick` task (see `task` below; the
# `fetch_pickandplace_ergodic` alternative is commented out).
# Presumably applied on top of `defaults` by the config loader — confirm.
# Vector-observation setup: encoder/decoder read only the `observation` key;
# cnn_keys '$^' cannot match a non-empty key (assuming regex matching — confirm).
fetch_pick:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-5
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor : 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_env_distance_threshold: 0.02
  eval_every: 100 # 100 episodes = 1e4 steps (time_limit: 100).
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
#gc_reward: 'learned_env_reward'
  gc_reward: 'dynamical_distance'
#gc_input: 'embed'
  gc_input: 'state'
#gc_reward: 'td_env'
#grad_heads: [decoder, reward]
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 50 episodes, update goal picker
#goal_strategy: 'SampleEnv'
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  go_expl_rand_ac: False
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_mean: 'metric_'
  log_keys_sum: 'metric_|subgoal_dist'
  log_keys_max: 'metric_|subgoal_success'
  model_opt.lr: 3e-4
  prefill: 50
  precision: 16
  pretrain: 100
  pred_discount: False
#pred_reward: True
  pred_reward: False
  pred_embed: True
# Goal optimization range formed from the goal max/min.
  planner: {planner_type: shooting_cem, horizon: 50, batch: 500, mppi_gamma: 2.0, optimization_steps: 5, std_scale: 0.5, init_candidates: [1.3, 1.0, 0.4], goal_min: [1.0, 0.4, 0.4], goal_max: [1.7, 1.2, 0.7], cost_use_p2e_value: True, final_step_cost: True }
  replay.prioritize_ends: False
  replay.sample_recent: True
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
#rssm: {ensemble: 1, hidden: 1024, deter: 1024, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  reward_head: {layers: 4, units: 400, act: elu, norm: none, dist: binary} # binary here, but use dist.mean() instead of dist.mode(), which will be the Bernoulli probability
  reward_type: 'onehot'
  subgoal_threshold: 2.0
  state_key: 'observation'
  steps: 1000000
  time_limit: 100
  train_every: 5
  train_env_goal_percent: 0.0
#task: fetch_pickandplace_ergodic
  task: boxpick
  two_policy_rollout_every: 1 # run 2 policy rollout every X algo updates
  train_demo_percent: 0

# point_umaze: settings for the `point_umaze` task (see `task` below; the
# `point_emptymaze` alternative is commented out).
# Presumably applied on top of `defaults` by the config loader — confirm.
# Vector-observation setup: encoder/decoder read only the `observation` key;
# cnn_keys '$^' cannot match a non-empty key (assuming regex matching — confirm).
point_umaze:
  action_repeat: 1
  actor_opt.lr: 8e-5
  actor_grad: 'auto'
  actor_ent: 1e-5
  clip_rewards: identity
  critic_opt.lr: 8e-5
  dataset: {batch: 45, length: 50}
  dd_neg_sampling_factor : 0.0
  decoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  disag_models: 10
  disag_log: False
  encoder: {mlp_keys: 'observation', cnn_keys: '$^', mlp_layers: [400,400,400]}
  eval_env_distance_threshold: 0.02
#eval_every: 100 # 100 episodes = 1e4 steps.
  eval_every: 10 # 10 episodes = 1e3 steps (time_limit: 100).
  log_every: 1e3
  expl_behavior: 'Plan2Explore'
  expl_extr_scale: 0.0
  exp_rollout_every: 0 # run exp rollout every X algo updates
#gc_reward: 'learned_env_reward'
  gc_reward: 'dynamical_distance'
#gc_input: 'embed'
  gc_input: 'state'
#gc_reward: 'td_env'
#grad_heads: [decoder, reward]
  grad_heads: [decoder]
  goal_policy_rollout_percentage: 0.5
  goal_update_every: 1 # every 50 episodes, update goal picker
#goal_strategy: 'SampleEnv'
  goal_strategy: 'SubgoalPlanner'
  goal_key: 'goal'
  go_expl_rand_ac: False
  gcp_rollout_every: 0 # run gcp rollout every X algo updates
  kl.free: 1.0
  log_keys_mean: 'metric_'
  log_keys_sum: 'metric_|subgoal_dist'
  log_keys_max: 'metric_|subgoal_success'
  model_opt.lr: 3e-4
  prefill: 50
  # Unlike the other task sections in this file, which set precision to 16.
  precision: 32
  pretrain: 100
  pred_discount: False
#pred_reward: True
  pred_reward: False
  pred_embed: True
# Goal optimization range formed from the goal max/min.
  planner: {planner_type: shooting_cem, horizon: 50, batch: 500, mppi_gamma: 2.0, optimization_steps: 5, std_scale: 4.0, init_candidates: [4.0, 4.0], goal_min: [-10.0, -10.0], goal_max: [10.0, 10.0], cost_use_p2e_value: True, final_step_cost: True }
  replay.prioritize_ends: False
  replay.sample_recent: True
  rssm: {ensemble: 1, hidden: 200, deter: 200, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
#rssm: {ensemble: 1, hidden: 1024, deter: 1024, stoch: 50, discrete: 0, act: elu, norm: none, std_act: sigmoid2, min_std: 0.1}
  reward_head: {layers: 4, units: 400, act: elu, norm: none, dist: binary} # binary here, but use dist.mean() instead of dist.mode(), which will be the Bernoulli probability
  reward_type: 'onehot'
  subgoal_threshold: 2.0
  state_key: 'observation'
  steps: 1000000
  time_limit: 100
  train_every: 5
  train_env_goal_percent: 0.0
  task: point_umaze
#task: point_emptymaze
  two_policy_rollout_every: 1 # run 2 policy rollout every X algo updates
  train_demo_percent: 0
