#!/usr/bin/env bash
# qwen2.5-vl-3b
#
# Launches `swift sft` with `--train_type full` on the model stored at
# /dfs/data/model/Qwen2.5-VL-3B-Instruct, using the JSONL dataset passed via
# --dataset and the output directory passed via --output_dir.
#
# NOTE(review): going by the flag names, only the LLM weights are trained
# (--freeze_llm False) while the ViT and the aligner stay frozen
# (--freeze_vit True, --freeze_aligner True) -- confirm against the ms-swift
# command-line documentation.
#
# `swift sft` is the last command in this file, so its exit status becomes
# the exit status of the script.

# Make GPUs 0-7 visible and export NPROC_PER_NODE=8 (presumably read by the
# swift launcher to start one worker per listed device -- confirm).
# Each export is a complete statement with no trailing line continuation, so
# it cannot swallow the command that follows if the blank line is removed.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export NPROC_PER_NODE=8

# Every continued line ends in " \" (space + backslash) so that joining the
# lines never depends on the indentation of the next line.
swift sft \
    --model /dfs/data/model/Qwen2.5-VL-3B-Instruct \
    --dataset "/dfs/data/CR/sft_train_1/sft_train_1.jsonl" \
    --split_dataset_ratio 0.05 \
    --model_type qwen2_5_vl \
    --dataloader_num_workers 4 \
    --train_type full \
    --torch_dtype bfloat16 \
    --num_train_epochs 2 \
    --per_device_train_batch_size 4 \
    --per_device_eval_batch_size 8 \
    --learning_rate 1e-5 \
    --freeze_vit True \
    --freeze_llm False \
    --freeze_aligner True \
    --gradient_accumulation_steps 8 \
    --eval_steps 50 \
    --save_steps 50 \
    --save_total_limit 5 \
    --logging_steps 10 \
    --max_length 4096 \
    --max_pixels 262144 \
    --deepspeed zero3 \
    --output_dir /dfs/data/CR/ChartVR/3b/921 \
    --warmup_ratio 0.05