# Launch train_gpt2.py through torchrun on a single node (--standalone) with
# 8 worker processes (--nproc_per_node=8), starting from the saved state at
# step 5572 of the "size_base" run and reading data from
# unbalanced_pretrain/sft_power_train.
#
# Everything after the script name is forwarded to train_gpt2.py unchanged;
# the exact meaning of each flag is defined by that script, not here.
# NOTE(review): 8 processes x --device_batch_size 16 = 128, which equals
# --batch_size; presumably this means no gradient accumulation -- confirm
# against train_gpt2.py.
torchrun \
  --standalone \
  --nproc_per_node=8 \
  train_gpt2.py \
  --input_folder unbalanced_pretrain/sft_power_train \
  --load_checkpoint logs/size_base_20250128_161308_7af5545b-2947-4a79-a918-6c3874093fd8/state_step005572.pt \
  --save_every 50 \
  --wandb_run_name sft_power \
  --warmup_ratio 0.05 \
  --warmdown_ratio 0.9 \
  --sequence_length 512 \
  --device_batch_size 16 \
  --num_epochs 2 \
  --weight_decay 0.1 \
  --learning_rate 0.00003 \
  --batch_size 128 \
  --bf16 \
  --val_loss_every 50