# Runs code/main.py under python with CUDA_VISIBLE_DEVICES=3 set for that
# process only, and with -u so stdout/stderr are unbuffered (useful when the
# output is piped or redirected to a log).
#
# Every remaining key=value word is passed to main.py as a separate argument;
# how each one is interpreted is decided by main.py, which is not shown here
# (presumably Hydra-style config overrides -- TODO confirm).
#
# The datasets argument is single-quoted on purpose: an unquoted [...] is a
# glob bracket expression, so the shell could replace the word with a matching
# filename from the current directory (or reject it outright under zsh or
# bash's failglob). Quoting guarantees main.py receives it literally.
CUDA_VISIBLE_DEVICES=3 python -u code/main.py reward_model=pythia1 'datasets=[oai_summary]' \
    loss=reward_loss gradient_accumulation_steps=16 batch_size=64 \
    eval_batch_size=4 trainer=RewardTrainer sample_during_eval=false \
    model.fsdp_policy_mp=bfloat16 n_epochs=1 debug=false \
    optimizer_eps=0.00001 optimizer=AdamW lr=3e-6 reward_model.archive=pythia1b-oai-summary-rm-1ep.pt \
    warmup_steps=0