# Single-node torchrun launcher for internvl/train/internvl_chat_pretrain.py.
#
# Environment:
#   WANDB_PROJECT, WANDB_NAME  required; used to build the output directory
#                              work_dirs/<WANDB_PROJECT>/<WANDB_NAME>
#   GPUS                       processes per node (default: 8)
#   MASTER_PORT                torchrun master port (default: 32210)
#   CUDA_VISIBLE_DEVICES       visible devices (default: 0,1,2,3,4,5,6,7)
#
# Combined stdout/stderr of the run is appended to
# ${OUTPUT_DIR}/training_log.txt.
set -x

# Without pipefail the exit status of `torchrun ... | tee` is tee's, so a
# crashed run would look successful. Enable it only where the shell supports
# it, since this script has no shebang and may be run by a plain POSIX sh.
if (set -o pipefail) 2>/dev/null; then
  set -o pipefail
fi

# Fail fast rather than silently writing to "work_dirs//".
: "${WANDB_PROJECT:?WANDB_PROJECT must be set (used for the output directory)}"
: "${WANDB_NAME:?WANDB_NAME must be set (used for the output directory)}"

GPUS=${GPUS:-8}

# Append the current directory without creating an empty leading entry when
# PYTHONPATH is unset.
export PYTHONPATH="${PYTHONPATH:+${PYTHONPATH}:}$(pwd)"
export MASTER_PORT="${MASTER_PORT:-32210}"
export TF_CPP_MIN_LOG_LEVEL=3
export LAUNCHER=pytorch

# Respect a device list provided by the caller or scheduler; fall back to the
# original eight-GPU default otherwise.
export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0,1,2,3,4,5,6,7}"

OUTPUT_DIR="work_dirs/${WANDB_PROJECT}/${WANDB_NAME}"
# mkdir -p is a no-op when the directory already exists.
mkdir -p "${OUTPUT_DIR}" || exit 1

# NOTE: --max_num_frame used to be passed twice (32, then 12). With
# argparse-style parsing the last occurrence wins, so the effective value was
# 12; that value is kept here as the single setting.
# NOTE(review): confirm 12 (and not 32) is the intended maximum.
torchrun \
  --nnodes=1 \
  --node_rank=0 \
  --master_addr=127.0.0.1 \
  --nproc_per_node="${GPUS}" \
  --master_port="${MASTER_PORT}" \
  internvl/train/internvl_chat_pretrain.py \
  --model_is_pe True \
  --vision_path "PE-Lang-L14-448/PE-Lang-L14-448.pt" \
  --pe_model_type "PE-Lang-L14-448" \
  --llm_path "llama31_8b" \
  --conv_style "internvl2_5" \
  --use_fast_tokenizer False \
  --output_dir "${OUTPUT_DIR}" \
  --meta_path "datameta.json" \
  --overwrite_output_dir True \
  --force_image_size 448 \
  --down_sample_ratio 0.5 \
  --drop_path_rate 0.0 \
  --min_num_frame 8 \
  --max_num_frame 12 \
  --freeze_llm True \
  --freeze_mlp False \
  --freeze_backbone True \
  --vision_select_layer -1 \
  --use_swigffn True \
  --dataloader_num_workers 8 \
  --bf16 True \
  --max_steps 8000 \
  --per_device_train_batch_size 4 \
  --gradient_accumulation_steps 4 \
  --evaluation_strategy "no" \
  --save_strategy "steps" \
  --save_steps 500 \
  --save_total_limit 5 \
  --learning_rate 2e-5 \
  --weight_decay 0.05 \
  --warmup_ratio 0.03 \
  --lr_scheduler_type "cosine" \
  --logging_steps 1 \
  --max_seq_length 16384 \
  --do_train True \
  --grad_checkpoint True \
  --group_by_length False \
  --dynamic_image_size True \
  --use_thumbnail True \
  --ps_version 'v2' \
  --deepspeed "zero_stage3_config.json" \
  --report_to "wandb" \
  --use_packed_ds True \
  --num_images_expected 96 \
  --max_packed_tokens 16384 \
  --max_buffer_size 20 \
  --log_freq 1000 \
  --strict_mode True \
  --replacement False \
  --allow_overflow False \
  --remove_unused_columns False \
  --loss_reduction "square" \
  --loss_reduction_all_gather True \
  2>&1 | tee -a "${OUTPUT_DIR}/training_log.txt"