# Launch code/main.py with only GPUs 0 and 1 exposed to the process through
# CUDA_VISIBLE_DEVICES. `python -u` keeps stdout/stderr unbuffered so log
# output shows up immediately.
#
# The key=value words are handed to main.py as plain arguments
# (presumably Hydra-style config overrides -- confirm against main.py).
#
# 'datasets=[shp]' is quoted on purpose: unquoted, [shp] is a glob bracket
# expression. Bash would rewrite the argument if a file named datasets=s,
# datasets=h or datasets=p existed in the working directory, drop it under
# nullglob, and abort under failglob (zsh aborts by default on no match).
#
# NOTE(review): the last line ends with a line continuation, so any line
# placed directly after it becomes part of this same command.
CUDA_VISIBLE_DEVICES=0,1 python -u code/main.py model=gpt2-large 'datasets=[shp]' \
    loss=sft gradient_accumulation_steps=8 batch_size=64 \
    eval_batch_size=8 trainer=FSDP.BasicTrainer sample_during_eval=true \