# Top-level run settings.
exp_base: 'results/'
exp_name: 'hot_gp_coverage_task'
system: 'Halluc_DDPG'
gpu_device: 0
num_workers: 0
seed: 1
total_env_steps: 200000
# No concrete value is given in this file.
# NOTE(review): `???` is presumably the OmegaConf/Hydra mandatory-value
# marker, so this must be supplied by an override at launch -- confirm.
total_halluc_env_steps: ???
pretrain_dir: null
# Scenario definition.
scenario:
  name: 'sampling'
  use_continuous_actions: true
  n_agents: 1
  n_gaussians: 3
  # NOTE(review): presumably these must agree with the scenario's observation
  # and action sizes -- confirm against the environment code.
  state_dim: 19
  action_dim: 2
  # NOTE(review): presumably a covariance for the Gaussians counted by
  # n_gaussians -- confirm.
  cov: 0.05
# Settings for the `env` environment group.
env:
  max_steps: 150  # same value as halluc_env.max_steps and eval_env.max_steps
  num_envs: 1
# Settings for the `halluc_env` environment group.
# NOTE(review): judging by the name and `system: Halluc_DDPG`, presumably the
# model-generated ("hallucinated") rollout environment -- confirm.
halluc_env:
  max_steps: 150  # same value as env.max_steps and eval_env.max_steps
  num_envs: 1
# Evaluation settings. Unlike env and halluc_env, no num_envs is set here.
eval_env:
  max_steps: 150  # same value as env.max_steps and halluc_env.max_steps
  evaluation_episodes: 200
  # NOTE(review): the unit of this interval (iterations, episodes or steps)
  # is not visible in this file -- confirm against the training loop.
  evaluation_interval: 20
# Policy optimisation settings.
policy:
  shared_parameters: true
  num_epochs: 40
  minibatch_size: 150
  lr: 0.00005
  max_grad_norm: 40.0
  # NOTE(review): presumably the discount factor and the target-network
  # soft-update rate -- confirm against the training code.
  gamma: 0.9
  tau: 0.005
# Replay buffer settings.
replay_buffer:
  buffer_size: 5000
  # NOTE(review): presumably toggles prioritised sampling -- confirm.
  use_priority: true
# Learned-model settings. Both GP- and NN-prefixed options are listed;
# model_type is set to GP in this file.
model:
  model_type: 'GP'

  # Reward-learning and structure switches.
  learn_smoothed_reward: true
  learn_unsmoothed_reward: false
  use_separate_reward_cov: false
  use_k_branching: true
  use_coregionalization: true

  # Dataset size caps, one per model family.
  max_nn_dataset_size: 10000
  max_gp_dataset_size: 1000

  # Optimisation settings; the GP and NN learning rates are set separately.
  num_epochs: 100
  minibatch_size: 150
  gp_lr: 0.00005
  nn_lr: 0.00005

  # NOTE(review): presumably num_inducing_points applies to the GP and
  # hidden_layer_width to the NN -- confirm against the model code.
  num_inducing_points: 100
  hidden_layer_width: 200
  num_pretrain_steps: 150
# Optimism settings.
optimism:
  use_optimism: true
  use_thompson_sampling: false
  # Use with Thompson sampling when learning reward.
  use_hucrl_approx: false
  optimism_after_iter: 0

  # H-UCRL settings (initial_beta and final_beta are equal in this file).
  num_samples: 5
  initial_beta: 0.001
  final_beta: 0.001

  # HOT-GP settings.
  # NOTE(review): upper_percentile is written as an integer while the two
  # lower percentiles are floats -- confirm the consumer accepts both.
  initial_lower_percentile: 0.1
  final_lower_percentile: 0.5
  upper_percentile: 1
# Logging settings.
logger:
  backend: 'wandb'
  project_name: 'optimistic_rl_sampling'
  save_data: false
  output_dir: 'training_logs'