#!/usr/bin/env bash
#
# Launches train/scalable_soft.py under torchrun with FSDP, starting one worker
# process per line reported by `nvidia-smi -L`.
#
# Usage: run from the directory that contains `train/` (all paths below are
# relative). Set --train_file_path to your tokenized dataset before launching.
set -euo pipefail

# Print an error message to stderr and abort the launch.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------
# NOTE(review): uid, min_lr and max_steps are assigned but never passed to the
# trainer below; kept so any external tooling that reads them keeps working.
# shellcheck disable=SC2034
uid="$(date +%Y%m%d_%H%M%S)"
base_model="Qwen/Qwen2.5-32B-Instruct"
lr=1e-5
# shellcheck disable=SC2034
min_lr=0
epochs=6
weight_decay=1e-4 # -> the same training pipe as slurm_training
# Per-GPU batch size; samples per forward/backward pass across all workers is
# micro_batch_size * gpu_count (e.g. 16 on 16 GPUs), before accumulation.
micro_batch_size=1
# Original note: requires more GPU memory, can try 2.
gradient_accumulation_steps=4
# shellcheck disable=SC2034
max_steps=-1
push_to_hub=false

# ---------------------------------------------------------------------------
# GPU discovery
# ---------------------------------------------------------------------------
# Fail early with a clear message instead of handing torchrun a bogus worker
# count when nvidia-smi is missing, errors out, or prints nothing.
command -v nvidia-smi >/dev/null 2>&1 \
  || die "nvidia-smi not found; cannot determine the number of GPUs"
gpu_listing="$(nvidia-smi -L)" \
  || die "'nvidia-smi -L' failed; cannot determine the number of GPUs"
[[ -n "${gpu_listing}" ]] || die "'nvidia-smi -L' reported no devices"
# One worker per output line, same counting rule as `nvidia-smi -L | wc -l`.
gpu_count="$(wc -l <<<"${gpu_listing}")"
gpu_count=$((gpu_count)) # normalise any whitespace padding emitted by wc
((gpu_count > 0)) || die "no GPUs detected"

# ---------------------------------------------------------------------------
# Trainer arguments
# ---------------------------------------------------------------------------
# Kept in an array so each flag can carry its own comment and the command does
# not depend on fragile trailing-backslash continuations.
train_args=(
  --block_size=32768
  --per_device_train_batch_size="${micro_batch_size}"
  --per_device_eval_batch_size="${micro_batch_size}"
  --gradient_accumulation_steps="${gradient_accumulation_steps}"
  --num_train_epochs="${epochs}"
  # Placeholder: replace with the path to your tokenized data.
  --train_file_path=".....put your tokenized data after using our sot_tokenization.py to tokenize the data we provided....."
  --model_name="${base_model}"
  --warmup_ratio=0.05
  --fsdp="full_shard auto_wrap"
  --fsdp_config="train/fsdp_config_qwen.json"
  --bf16=True
  --eval_strategy="no"
  --logging_steps=1
  --save_strategy="no"
  --lr_scheduler_type="cosine"
  --learning_rate="${lr}"
  --weight_decay="${weight_decay}"
  --adam_beta1=0.9
  --adam_beta2=0.95
  --output_dir="ckpts/ssft_32b"
  --push_to_hub="${push_to_hub}"
  --save_only_model=True
  --use-liger=True
  --use-liger-kernel=True
  # Original note: enable gradient checkpointing for efficient memory usage
  # with 8 H100 GPUs.
  --gradient_checkpointing=True
  --accelerator_config='{"gradient_accumulation_kwargs": {"sync_each_batch": true}}'
  --N_num_sot_tokens=6
  --L_first_matching_tokens=1000
  # NOTE(review): presumably one entry per epoch (6 entries, epochs=6) -- keep
  # in sync with `epochs`; confirm against train/scalable_soft.py.
  --precomputed_num_update_steps_per_epoch_list 126 126 126 126 126 126
  --instantaneous_pad_batch_size=1
  --bmsft_phase_train=True
  --logging_dir="./debug"
  --qwen_liger_version="qwen_soft"
  --debug_randomized_matching=False
)

torchrun --nproc-per-node "${gpu_count}" --master_port 12345 \
  train/scalable_soft.py \
  "${train_args[@]}"
