# experiment
# NOTE(review): the meaning and valid range of these keys are defined by the
# code that loads this file, which is not visible here — confirm there.
env: metaworld_dial-turn-v2
data_quality: 3.0
seed: 0
use_reward_model: true
trivial_reward: 0

# algorithm
# Learning rates are written with an explicit decimal point (3.0e-5, not 3e-5).
# YAML 1.1 loaders such as PyYAML only resolve exponent notation as a float
# when the mantissa contains a "."; a bare 3e-5 loads as the string "3e-5".
actor_lr: 3.0e-5
critic_lr: 3.0e-4
tau: 0.001
alpha: 0.2
auto_alpha: true
alpha_lr: 3.0e-4
batch_size: 256
traj_batch_size: 16
buffer_size: 10000000
discount: 0.99
eval_freq: 5000
# Empty string; presumably means "no checkpoint to load" — confirm with loader.
load_model: ''
max_timesteps: 250000
n_episodes: 50
normalize: true
normalize_reward: true

# tuning parameter
# Written as 3.0e-2 (with a decimal point) so YAML 1.1 loaders such as PyYAML
# resolve it as a float; a bare 3e-2 loads as the string "3e-2" there.
lam: 3.0e-2

# preference learning
feedback_num: 1000
threshold: 0.5
# The literal string 'none' (not YAML null); quoted so the type is unambiguous.
data_aug: 'none'
segment_size: 25
hidden_sizes: 128
ensemble_num: 3
ensemble_method: mean
q_budget: 1
feedback_type: RLT
model_type: BT
noise: 0.0
human: false