---
# Training configuration for the run named `reward_pair-hh-t5-xxl`.
# All keys are flat, top-level options; their exact meaning is defined by the
# script that loads this file (not visible from here).

# --- Model and task ---------------------------------------------------------
model_name_or_path: google/flan-t5-xxl
task: assistant

# --- Dataset ----------------------------------------------------------------
dataset_name: trl-lib/hh-rlhf-helpful-base
dataset_train_split: train
dataset_test_split: test
dataset_num_proc: 32
max_seq_length: 4096
# NOTE(review): `dataset_text_field` and `packing` look like SFT-style options;
# confirm the consuming script actually reads them for this run.
dataset_text_field: text
packing: false
remove_unused_columns: true

# --- Run identity and outputs -----------------------------------------------
# `run_name` and the last path component of `output_dir` are kept identical.
run_name: reward_pair-hh-t5-xxl
output_dir: outputs/reward/reward_pair-hh-t5-xxl
report_to: wandb

# --- Optimisation -----------------------------------------------------------
per_device_train_batch_size: 8
per_device_eval_batch_size: 8
gradient_accumulation_steps: 1
num_train_epochs: 1
# Keep the decimal point and the signed exponent: some YAML 1.1 loaders only
# recognise this spelling as a float and would read e.g. `1e-4` as a string.
learning_rate: 1.0e-4
lr_scheduler_type: cosine
warmup_ratio: 0.03

# --- Precision, memory and distributed settings -----------------------------
bf16: true
gradient_checkpointing: true
ddp_find_unused_parameters: false

# --- Logging, evaluation and checkpointing (all step-based) -----------------
logging_steps: 25
eval_strategy: steps
eval_steps: 100
save_strategy: steps
save_steps: 1000

# --- PEFT / LoRA ------------------------------------------------------------
use_peft: true
# `lora_alpha` equals `lora_r`, i.e. an alpha/r ratio of 1.
lora_r: 512
lora_alpha: 512
# NOTE(review): presumably the q/k/v projections of the T5 self-attention
# layers; no other attention modules are listed here — confirm that is intended.
lora_target_modules:
  - SelfAttention.q
  - SelfAttention.k
  - SelfAttention.v

