# Limit the CUDA devices visible to programs started after this point to
# device indices 0 through 3.
CUDA_VISIBLE_DEVICES='0,1,2,3'
export CUDA_VISIBLE_DEVICES

# Launch script/train/SFT_train.py through torchrun with 4 processes per node
# (one per device index listed in CUDA_VISIBLE_DEVICES) and master port 29503,
# passing recipe/fuse_mc_concat_tokenwise.json as the script's --config value.
# Both paths are relative — presumably the script is meant to be run from the
# repository root; confirm before invoking it from elsewhere.
# NOTE(review): the last line of this command ends with a line-continuation
# backslash, so whatever line follows it is joined onto the command. Keep any
# further arguments directly below it, or drop the backslash if none follow.
torchrun --nproc_per_node=4 --master_port=29503 script/train/SFT_train.py \
    --config recipe/fuse_mc_concat_tokenwise.json \