---
# Training configuration for the "mixed_llama3" run.
# NOTE(review): the exact meaning of every key is defined by the training
# script that loads this file, which is not visible here. Comments that
# interpret a key are marked as assumptions to confirm against that script.

# Name of the project/run.
project_name: mixed_llama3

# Checkpoint file path (whether it is read or written is up to the consumer).
# The file name appears to encode learning_rate (6e-7) and mask_rate (0.25);
# keep it in sync when either value below changes.
ckpt_path: saved_models/llama3-aligned-6e-7-0.25.pt

# Dataset identifier -- looks like a Hugging Face Hub dataset id; confirm.
data_path: princeton-nlp/llama3-ultrafeedback-armorm

# Presumably the number of micro-batches accumulated per optimizer step;
# with batch_size 1 that would give an effective batch of 16 per process.
gradient_accumulation: 16

# Mixed-precision mode, passed through as a plain string.
mixed_precision: bf16

# Policy model, reference model and tokenizer all use the same identifier.
policy_model_path: NousResearch/Meta-Llama-3-8B-Instruct

ref_model_path: NousResearch/Meta-Llama-3-8B-Instruct

policy_tokenizer_name: NousResearch/Meta-Llama-3-8B-Instruct

# Booleans are written in canonical lowercase form (true/false).
# Presumably toggles gradient (activation) checkpointing -- confirm.
use_grad_ckpt: true

# Presumably selects on-policy vs. pre-collected (off-policy) data -- confirm.
on_policy: false

# Presumably the maximum sequence length in tokens -- confirm.
max_length: 4096

batch_size: 1

# Presumably the max-norm threshold used for gradient clipping -- confirm.
clip_grad_norm: 1.0

warmup_steps: 500

# Written with an explicit decimal point on purpose: YAML 1.1 resolvers
# (e.g. PyYAML) load a bare `6e-7` as the STRING "6e-7", not a float.
# `6.0e-7` resolves to the same float under both YAML 1.1 and YAML 1.2.
learning_rate: 6.0e-7

optimizer: AdamW

# Presumably enables length normalization in the loss -- confirm.
length_norm: true

# Presumably the checkpoint-saving interval, in training steps -- confirm.
save_per_step: 1000

# Presumably the number of training epochs -- confirm.
epoch: 1

seed: 666

# NOTE(review): semantics of the masking rate are not visible from this
# file; the value is mirrored in the ckpt_path file name above.
mask_rate: 0.25