#!/usr/bin/env bash
# Launches code/main.py with loss=ppo and trainer=FSDP.PPOTrainer, using
# gpt2-large for both `model` and `reward_model`, with the process restricted
# to GPUs 0 and 1 through CUDA_VISIBLE_DEVICES.
#
# Required settings (export them in the environment, or replace the two
# assignments below with literal values):
#   policy_path        value passed to the program as model.archive
#   reward_model_path  value passed to the program as reward_model.archive
# NOTE(review): both are presumably paths to previously saved checkpoints --
# confirm against code/main.py.
#
# Usage:
#   policy_path=/path/to/policy reward_model_path=/path/to/reward bash <this script>
set -eu

# Take the values from the environment when present; otherwise leave them
# empty so the checks below can report exactly which one is missing.
policy_path=${policy_path:-}
reward_model_path=${reward_model_path:-}

# Fail fast with a readable message instead of launching a multi-GPU job with
# an empty archive argument.
: "${policy_path:?set policy_path to the value for model.archive}"
: "${reward_model_path:?set reward_model_path to the value for reward_model.archive}"

# 'datasets=[shp]' is quoted so the shell never treats [shp] as a glob bracket
# expression; the archive arguments are quoted so paths containing spaces or
# glob characters reach the program as a single word each.
CUDA_VISIBLE_DEVICES=0,1 python -u code/main.py model=gpt2-large 'datasets=[shp]' loss=ppo \
    gradient_accumulation_steps=2 batch_size=4 eval_batch_size=4 trainer=FSDP.PPOTrainer \
    model.fsdp_policy_mp=bfloat16 debug=false \
    reward_model=gpt2-large ppo_epochs=5 max_prompt_length=128 max_length=256 \
    "model.archive=${policy_path}" "reward_model.archive=${reward_model_path}"
