
# Launch a full-parameter SFT run through `llamafactory-cli train`.
#
# Environment:
#   MODEL_PATH  Value passed to --model_name_or_path. Defaults to the
#               placeholder "xxx"; export a real value (or edit the default
#               below) before running.
#
# The command pins CUDA_VISIBLE_DEVICES to devices 0-7 and sets
# FORCE_TORCHRUN=1 for the launched process only; neither is exported to the
# rest of the shell.
#
# NOTE(review): the DeepSpeed config and output_dir are relative paths, so the
# script presumably has to be started from the LLaMA-Factory repository root —
# confirm.
# NOTE(review): with 8 visible devices, per-device batch 2 and 8 accumulation
# steps, the global batch is presumably 2 * 8 * 8 = 128 sequences per optimizer
# step — verify against the launcher's actual world size.

# Abort on command failure and on use of an unset variable.
set -eu

# Allow the caller to supply the model path without editing this file; the
# original placeholder stays as the fallback.
MODEL_PATH="${MODEL_PATH:-xxx}"

# "$MODEL_PATH" is quoted so a path containing spaces or glob characters is
# passed as a single argument.
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 FORCE_TORCHRUN=1 llamafactory-cli train \
  --stage sft \
  --model_name_or_path "$MODEL_PATH" \
  --do_train true \
  --finetuning_type full \
  --deepspeed examples/deepspeed/ds_z3_config.json \
  --dataset agent_trace \
  --template qwen3 \
  --cutoff_len 12288 \
  --max_samples 100000 \
  --overwrite_cache true \
  --preprocessing_num_workers 16 \
  --output_dir output/models/Qwen3-8B-SFT \
  --logging_steps 1 \
  --save_strategy epoch \
  --plot_loss true \
  --per_device_train_batch_size 2 \
  --gradient_accumulation_steps 8 \
  --learning_rate 2.0e-5 \
  --num_train_epochs 2.0 \
  --lr_scheduler_type cosine \
  --warmup_ratio 0.1 \
  --bf16 true \
  --report_to none \
  --ddp_timeout 180000000