# Dataset and data-loading settings.
data:
  dataset_type: preference_based
  # `???` is presumably the OmegaConf/Hydra "mandatory value" marker, meaning
  # both paths must be supplied by an override at launch -- TODO confirm loader.
  train_dataset_filepath: ???
  val_dataset_filepath: ???
  env_name: CustomReacherEnv-v0
  segment_length: 15
  max_num_pairs: 20
  # Explicit null; presumably disables debug subsampling -- verify with consumer.
  debug_size: null
  debug_size_mode: "shuffled"
  batch_size: 2048
  num_workers: 8
  # Canonical lowercase boolean (yamllint `truthy`); parses the same as `True`.
  shuffle: true
# Reward-model architecture and optimisation settings.
model:
  reward_type: preference_based
  model_type: learned_reacher
  hidden_sizes: [256, 256]
  # Exponent floats are written with an explicit fractional part: YAML 1.1
  # resolvers (e.g. plain PyYAML) load `2e-4` as the string "2e-4", whereas
  # `2.0e-4` is read as a float by every loader. Numeric values are unchanged.
  learning_rate: 2.0e-4
  reward_reg_weight: 1.0e-3
  target_reward_model_class: CustomReacherEnvRewardModel
  # Presumably forwarded as keyword arguments to the class named above --
  # TODO confirm against the code that instantiates it.
  target_reward_model_kwargs:
    reward_dist_factor: 1
    reward_ctrl_factor: 1
    reward_goal_factor: 1
    shaping_factor: 10.0
    shaping_discount: 0.95
# Training-run settings.
training:
  # `???` is presumably the OmegaConf/Hydra "mandatory value" marker, meaning
  # the output directory must be supplied by an override -- TODO confirm loader.
  output_dir: ???
  # Presumably passed through to the trainer constructor -- verify with caller.
  # NOTE(review): if this targets PyTorch Lightning, `gpus` is deprecated in
  # newer releases in favour of `accelerator`/`devices` -- confirm the version.
  trainer_args:
    gradient_clip_val: 1.0
    gpus: 1
    max_epochs: 50
