# Run-level settings.
# NOTE(review): the meaning of each key is defined by the consuming script,
# which is not visible here; confirm the notes below against it.
env: metaworld_dial-turn-v2
# Loaded as a float (3.0); presumably selects a dataset quality level -- TODO confirm valid values.
data_quality: 3.0
seed: 0
use_reward_model: true
# Loaded as the integer 0, not a boolean; NOTE(review): confirm whether the consumer expects a flag or a mode id.
trivial_reward: 0

# algorithm
# Learning rates are written with an explicit decimal point and a signed
# exponent. YAML 1.1 loaders (e.g. PyYAML) only resolve scientific notation
# as a float when the mantissa contains a '.', so the bare form `3e-4` would
# be loaded as the string "3e-4" instead of the number 0.0003.
actor_lr: 3.0e-4
critic_lr: 3.0e-4
# NOTE(review): tau / alpha / policy_noise / noise_clip look like
# TD3+BC-style hyperparameters -- confirm against the training code.
tau: 0.005
alpha: 2.5
policy_noise: 0.2
noise_clip: 0.5
batch_size: 256
buffer_size: 10000000
discount: 0.99
eval_freq: 5000
# Empty string; presumably means "do not load a checkpoint" -- TODO confirm.
load_model: ''
max_timesteps: 250000
n_episodes: 25
normalize: true
normalize_reward: true

# preference learning
# NOTE(review): the meaning of each key is defined by the consuming script,
# which is not visible here; confirm the notes below against it.
feedback_num: 1000
threshold: 0.5
# The plain scalar `none` is loaded as the string "none", not as YAML null
# (only `null`, `Null`, `NULL`, `~` or an empty value resolve to null).
data_aug: none
segment_size: 25
# Single integer, not a list; NOTE(review): confirm the consumer expects a scalar width.
hidden_sizes: 128
ensemble_num: 3
ensemble_method: mean
q_budget: 1
# Loaded as plain strings; the set of accepted values is not visible here.
feedback_type: RLT
model_type: BT
noise: 0.0
human: false