# Hydra defaults list: composes the `sac_diverse` option of the `agent`
# config group into this config.
defaults:
    - agent: sac_diverse
    
# Label for this experiment.
experiment: DPB2

# wandb settings (presumably Weights & Biases logging; confirm against the
# training script that reads them).
wandb: true
wandb_project: RLHB
# Explicit null rather than a bare `wandb_entity:`. Both load as null, but the
# explicit form shows the value was left unset on purpose.
wandb_entity: null

# reward learning
# Settings grouped under reward learning. Per-key meanings are not visible in
# this file; the notes below are inferred from key names. TODO: confirm
# against the code that reads them.
segment: 50
activation: tanh
num_seed_steps: 1000
num_unsup_steps: 9000
# num_interact is also embedded in the run directory name (hydra.run.dir).
num_interact: 20000
reward_lr: 0.0003
reward_batch: 10
reward_update: 2000
reset_update: 100
topK: 5
ensemble_size: 1
# max_feedback is also embedded in the run directory name (hydra.run.dir).
max_feedback: 100
large_batch: 10
label_margin: 0.0
# NOTE(review): teacher_beta is the only negative value in this section and
# looks like a sentinel; confirm how the consumer treats -1.
teacher_beta: -1
teacher_gamma: 1
teacher_eps_mistake: 0
teacher_eps_skip: 0
teacher_eps_equal: 0
# rm_* keys: presumably the reward model's network depth and width; verify.
rm_num_layers: 3
rm_hidden_size: 256

# logging
ground_truth_rm_num_samples: 10000
# Interpolation: resolves to the top-level `num_train_steps` value, so the two
# stay in sync when that key is overridden.
log_buffer_capacity: ${num_train_steps}

# scheduling
reward_schedule: 0
gradient_update: 1
# Written as a plain integer rather than `1e6`. The exponent form without a
# decimal point loads as a float under OmegaConf and as a string under YAML 1.1
# loaders such as PyYAML, so any consumer that uses it as a count or a size
# would need a cast. This value also feeds the buffer capacities through
# interpolation.
num_train_steps: 1000000
# Interpolation: follows num_train_steps.
replay_buffer_capacity: ${num_train_steps}

# evaluation config
eval_frequency: 2000
num_eval_episodes: 10
# Presumably the compute device for the whole run, not only for evaluation;
# confirm against the consumer.
device: cuda

# setup
# `seed` is interpolated into the run directory (hydra.run.dir).
seed: 5

# environment
# `env` is interpolated into hydra.name and the run directory (hydra.run.dir).
env: walker_walk

# hyperparameters of QPA
# Per-key meanings are not visible in this file. TODO: confirm against the
# code that reads them.
data_aug_ratio: 20
max_reward_buffer_size: 10
explore: false
her_ratio: 0.5

# hyperparameters of the population
# Per-key meanings are not visible in this file. TODO: confirm against the
# code that reads them.
population_size: 4
tpa: true
copy_agent: false
reset_feedback_done: false

# hyperparameters of the discriminator
disc:
    batch_size: 256
    on_policy: true
    # Decimal form instead of `3e-4`. It is the same float value, but the
    # exponent form without a decimal point loads as a string under YAML 1.1
    # loaders such as PyYAML. This also matches how `reward_lr` is written.
    lr: 0.0003
    hidden_size: 256
    ema_alpha: 0.99
    layernorm: true

# hydra configuration
hydra:
    # NOTE(review): the job name lives at `hydra.job.name` in Hydra 1.x;
    # confirm that a top-level `hydra.name` key is accepted by the Hydra
    # version this project pins.
    name: ${env}
    run:
        # Output directory for a single run. The path is built only from env,
        # max_feedback, num_interact and seed, so runs that differ in any other
        # setting resolve to the same directory.
        dir: ./exp/${env}/maxFeedback${max_feedback}_numInteract${num_interact}_seed${seed}