# for pretraining on bioS_single/pretrain (disabled; kept for reference)
# torchrun --standalone --nproc_per_node=1 train_gpt2.py --input_folder bioS_single/pretrain --save_every 975 --load_checkpoint ckpt/pretrained.ckpt --wandb_run_name pretrain --warmup_ratio 0.1 --warmdown_ratio 0.2 --sequence_length 256 --device_batch_size 32 --num_epochs 4
# for finetuning on bioS_single/SFT_mix_pretrain_10x from ckpt/pretrained.ckpt (disabled; kept for reference)
# torchrun --standalone --nproc_per_node=1 train_gpt2.py --input_folder bioS_single/SFT_mix_pretrain_10x --save_every 300 --load_checkpoint ckpt/pretrained.ckpt --wandb_run_name SFT_mix_pretrain_10x --warmup_ratio 0.1 --warmdown_ratio 0.8 --sequence_length 256 --device_batch_size 32 --num_epochs 1 --learning_rate 0.001
# for pretraining on bioS_single/pretrain_sft10k_mix (disabled; kept for reference)
# torchrun --standalone --nproc_per_node=1 train_gpt2.py --input_folder bioS_single/pretrain_sft10k_mix --save_every 975 --wandb_run_name pretrain_sft10k_mix --warmup_ratio 0.2 --warmdown_ratio 0 --sequence_length 256 --device_batch_size 32 --num_epochs 3
# Finetuning on bioS_single/SFT (wandb run name: SFT_new).
# Resumes from state_step002925.pt under the pretrain_sft10k_mix logs directory.
# One flag per line; the argument list and its order are unchanged.
torchrun --standalone --nproc_per_node=1 train_gpt2.py \
  --input_folder bioS_single/SFT \
  --save_every 125 \
  --load_checkpoint logs/pretrain_sft10k_mix_20241108_115839_ae565786-93c7-463a-8368-f1ce70598bd8/state_step002925.pt \
  --wandb_run_name SFT_new \
  --warmup_ratio 0.1 \
  --warmdown_ratio 0.8 \
  --sequence_length 256 \
  --device_batch_size 32 \
  --num_epochs 1 \
  --learning_rate 0.001 \
  --batch_size 128
# Finetuning on bioS_single/SFT_addition_20 (wandb run name: SFT_addition_20).
# Resumes from state_step002925.pt under the pretrain_sft10k_mix logs directory.
# One flag per line; the argument list and its order are unchanged.
torchrun --standalone --nproc_per_node=1 train_gpt2.py \
  --input_folder bioS_single/SFT_addition_20 \
  --save_every 125 \
  --load_checkpoint logs/pretrain_sft10k_mix_20241108_115839_ae565786-93c7-463a-8368-f1ce70598bd8/state_step002925.pt \
  --wandb_run_name SFT_addition_20 \
  --warmup_ratio 0.1 \
  --warmdown_ratio 0.8 \
  --sequence_length 256 \
  --device_batch_size 32 \
  --num_epochs 1 \
  --learning_rate 0.001 \
  --batch_size 128
# New pretraining run using adam, on bioS_single/pretrain_sft10k_mix.
# No --load_checkpoint is passed; wandb run name is "pretrain".
# One flag per line; the argument list and its order are unchanged.
torchrun --standalone --nproc_per_node=1 train_gpt2.py \
  --input_folder bioS_single/pretrain_sft10k_mix \
  --save_every 3000 \
  --wandb_run_name pretrain \
  --warmup_ratio 0.05 \
  --warmdown_ratio 0.5 \
  --sequence_length 512 \
  --device_batch_size 16 \
  --num_epochs 5 \
  --weight_decay 0.1 \
  --learning_rate 0.001 \
  --batch_size 96
# New pretraining run using adam, on the fullname dataset
# (bioS_single/pretrain_fullname_sft10k_mix). No --load_checkpoint is passed.
# NOTE(review): --wandb_run_name is "pretrain", the same name used by the other
# active pretraining commands in this file -- confirm the runs stay distinguishable.
torchrun --standalone --nproc_per_node=1 train_gpt2.py \
  --input_folder bioS_single/pretrain_fullname_sft10k_mix \
  --save_every 3000 \
  --wandb_run_name pretrain \
  --warmup_ratio 0.05 \
  --warmdown_ratio 0.5 \
  --sequence_length 512 \
  --device_batch_size 16 \
  --num_epochs 5 \
  --weight_decay 0.1 \
  --learning_rate 0.001 \
  --batch_size 96
# New pretraining run on the perturbed dataset
# (bioS_single/pretrain_perturbed_sft10k_independent_mix).
# Unlike the other pretraining commands in this file, this one also passes
# --val_loss_every 1000. No --load_checkpoint is passed.
# NOTE(review): --wandb_run_name is "pretrain", the same name used by the other
# active pretraining commands in this file -- confirm the runs stay distinguishable.
torchrun --standalone --nproc_per_node=1 train_gpt2.py \
  --input_folder bioS_single/pretrain_perturbed_sft10k_independent_mix \
  --save_every 3000 \
  --val_loss_every 1000 \
  --wandb_run_name pretrain \
  --warmup_ratio 0.05 \
  --warmdown_ratio 0.5 \
  --sequence_length 512 \
  --device_batch_size 16 \
  --num_epochs 5 \
  --weight_decay 0.1 \
  --learning_rate 0.001 \
  --batch_size 96