#!/usr/bin/env bash
# Launches code/main.py with a fixed set of key=value overrides:
# reward_model=pythia1, datasets=[oai_summary_rew_gap], loss=reward_gap,
# trainer=RewardTrainer, n_epochs=1, sample_during_eval=false, and the
# batch-size / accumulation / precision settings listed below.
#
# code/main.py is a relative path, so run this from the directory that
# contains the code/ folder.
#
# HF_HOME is exported so the Python process (and any children) inherit it;
# Hugging Face libraries use it as their cache root.
#
# The datasets override is single-quoted on purpose: an unquoted [...] is a
# shell glob pattern, so without quotes the argument could be rewritten by a
# matching filename, dropped under nullglob, or rejected under failglob/zsh.
#
# NOTE(review): the overrides look like Hydra-style key=value syntax --
# confirm against code/main.py.
# NOTE(review): the final line ends with a continuation backslash. If no
# further overrides follow it, drop that backslash so a command added later
# is not absorbed into this one.
export HF_HOME=/workspace/rlhf-code/.cache/root
python -u code/main.py \
    reward_model=pythia1 \
    'datasets=[oai_summary_rew_gap]' \
    loss=reward_gap gradient_accumulation_steps=16 batch_size=256 \
    eval_batch_size=256 trainer=RewardTrainer sample_during_eval=false \
    model.fsdp_policy_mp=bfloat16 n_epochs=1 debug=false \