# torchrun --standalone --nproc_per_node=8 train_gpt2.py --input_folder unbalanced_pretrain/powernew --save_every 50 --wandb_run_name pretrain_power_new --warmup_ratio 0.05 --warmdown_ratio 0.9 --sequence_length 512 --device_batch_size 16 --num_epochs 1 --weight_decay 0.1 --learning_rate 0.0003 --batch_size 128
# torchrun --standalone --nproc_per_node=8 train_gpt2.py --input_folder unbalanced_pretrain/powernew --save_every 50 --wandb_run_name pretrain_power_new --warmup_ratio 0.05 --warmdown_ratio 0.9 --sequence_length 512 --device_batch_size 16 --num_epochs 1 --weight_decay 0.1 --learning_rate 0.001 --batch_size 128
# torchrun --standalone --nproc_per_node=8 train_gpt2.py --input_folder unbalanced_pretrain/powernew --save_every 50 --wandb_run_name pretrain_power_new --warmup_ratio 0.05 --warmdown_ratio 0.9 --sequence_length 512 --device_batch_size 16 --num_epochs 1 --weight_decay 0.2 --learning_rate 0.001 --batch_size 128


## 2025/01/28 -- single-card run.
# Exposes only GPU 0 through CUDA_VISIBLE_DEVICES and has torchrun start one
# worker process (--nproc_per_node=1) in standalone mode. One option per line
# so that individual hyperparameters are easy to diff between runs.
CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nproc_per_node=1 \
  train_gpt2.py \
  --input_folder unbalanced_pretrain/powernew \
  --save_every 500 \
  --wandb_run_name pretrain_power_new \
  --warmup_ratio 0.05 \
  --warmdown_ratio 0.9 \
  --sequence_length 512 \
  --device_batch_size 16 \
  --num_epochs 1 \
  --weight_decay 0.1 \
  --learning_rate 0.0003 \
  --batch_size 128

## 2025/01/28 -- 8-card run.
# Same training options as the single-card launch, but torchrun starts eight
# worker processes (--nproc_per_node=8) and no CUDA_VISIBLE_DEVICES
# restriction is applied here.
torchrun --standalone --nproc_per_node=8 \
  train_gpt2.py \
  --input_folder unbalanced_pretrain/powernew \
  --save_every 500 \
  --wandb_run_name pretrain_power_new \
  --warmup_ratio 0.05 \
  --warmdown_ratio 0.9 \
  --sequence_length 512 \
  --device_batch_size 16 \
  --num_epochs 1 \
  --weight_decay 0.1 \
  --learning_rate 0.0003 \
  --batch_size 128

## 2025/01/28 -- single-card run used to validate the load-checkpoint function.
# Exposes only GPU 0 and starts one worker process; the saved state file is
# handed to the trainer through --load_checkpoint.
# NOTE(review): the trailing `--bf` looks like a truncated or abbreviated
# option name -- confirm the intended flag against train_gpt2.py's argument
# parser before relying on this command.
CUDA_VISIBLE_DEVICES=0 torchrun --standalone --nproc_per_node=1 \
  train_gpt2.py \
  --input_folder unbalanced_pretrain/powernew \
  --load_checkpoint logs/pretrain_power_new_20250128_112012_bbd3e6e4-4286-44f8-9fa5-675850e3a64f/state_step005500.pt \
  --save_every 500 \
  --wandb_run_name pretrain_power_new \
  --warmup_ratio 0.05 \
  --warmdown_ratio 0.9 \
  --sequence_length 512 \
  --device_batch_size 16 \
  --num_epochs 1 \
  --weight_decay 0.1 \
  --learning_rate 0.0003 \
  --batch_size 128 \
  --bf
