# Hydra defaults list: composes the `sac` option of the `agent` config group
# into this configuration.
defaults:
    - agent: sac
    
# Experiment label. NOTE(review): not referenced by any interpolation visible
# in this file; presumably read by the training script - confirm.
experiment: QPA

# wandb logging (these values need to be specified manually)
wandb: false
wandb_project: RLHB
# No entity configured; written as an explicit null rather than a bare empty
# value so the intent is unambiguous.
wandb_entity: null


# reward learning
# NOTE(review): the per-key notes below are inferred from key names only;
# confirm each against the code that reads this config.
# presumably the length of a trajectory segment - TODO confirm
segment: 50
activation: tanh
num_seed_steps: 1000
num_unsup_steps: 9000
# Interpolated into the Hydra run directory name (`hydra.run.dir`).
num_interact: 20000
# presumably the learning rate of the reward model - TODO confirm
reward_lr: 0.0003
reward_batch: 10
reward_update: 2000
reset_update: 100
topK: 5
ensemble_size: 1
# Interpolated into the Hydra run directory name (`hydra.run.dir`).
max_feedback: 100
large_batch: 10
label_margin: 0.0
# teacher_* keys: presumably parameters of a simulated feedback teacher -
# TODO confirm their semantics (including the -1 value of teacher_beta).
teacher_beta: -1
teacher_gamma: 1
teacher_eps_mistake: 0
teacher_eps_skip: 0
teacher_eps_equal: 0

# scheduling
reward_schedule: 0
gradient_update: 1
# Written as a plain integer rather than `1e6`: YAML 1.1 loaders (e.g. PyYAML)
# read `1e6` as a string, while OmegaConf reads it as a float. A step count
# should be an integer regardless of which loader parses this file.
num_train_steps: 1000000
# Follows num_train_steps through interpolation, so it resolves to the same
# integer value.
replay_buffer_capacity: ${num_train_steps}

# evaluation config
eval_frequency: 2000
num_eval_episodes: 10
# NOTE(review): `device` sits under the evaluation heading but presumably
# selects the compute device for the whole run - confirm.
device: cuda

# setups
# Presumably the random seed (TODO confirm). Interpolated into the Hydra run
# directory name (`hydra.run.dir`).
seed: 5

# environment
# Interpolated into `hydra.name` and into the Hydra run directory path.
env: walker_walk

# hyperparameters of QPA
# NOTE(review): the semantics of these keys are not visible in this file;
# confirm against the training code before changing them.
data_aug_ratio: 20
max_reward_buffer_size: 10
explore: false
her_ratio: 0.5

# hydra configuration
hydra:
    # Takes its value from the top-level `env` key.
    name: ${env}
    run:
        # Run directory built from `env`, `max_feedback`, `num_interact`, and
        # `seed`; runs that share all four values resolve to the same path.
        dir: ./exp/${env}/maxFeedback${max_feedback}_numInteract${num_interact}_seed${seed}