

# Default values for the ${NAME} placeholders referenced in search.job_template.
# NOTE(review): presumably these can be overridden from the caller's
# environment at submit time — confirm with the submitting tool.
env_defaults:
  # Used in the job name, the sku string and torchrun's --nnodes.
  NODES: 2
  # Used in the job name, the sku string and torchrun's --nproc_per_node.
  GPUS: 8
  # Used only in the sku string (${NODES}x${MEM}G${GPUS}-IB).
  MEM: 32
  # Feeds job_template.sla_tier. One of: Premium, Standard, Basic.
  SLA: 'Premium'
  # Feeds job_template.priority.
  PRIORITY: 'high'

search:
  # Template from which each job of the search is built.
  #   ${NAME} placeholders use the names defined under env_defaults.
  #   {name}  placeholders use the names listed under search.params.
  #   $${...} appears only around MASTER_ADDR / MASTER_PORT.
  # NOTE(review): the substitution is performed by the consuming tool;
  # presumably "$$" is its escape for a literal "$" so the shell on the node
  # expands those two variables — confirm.
  job_template:
    name: 'qwen_unkVQA_finetune_${NODES}x${GPUS}GPU_bsz{bsz}_gacc{grad_accu}_lr{lr}'
    sku: '${NODES}x${MEM}G${GPUS}-IB'
    sla_tier: '${SLA}'
    priority: '${PRIORITY}'
    # mpi: False
    command:
      # A single launch line. It is written as a folded scalar (>-) so every
      # line break below becomes one space and no trailing newline is kept.
      # All lines must stay at the same indentation and free of trailing
      # whitespace, otherwise the folded result changes.
      # NOTE(review): <DATA_FOLDER> is followed directly by the file name
      # while <OUTPUT_FOLDER> is followed by "/" — confirm that the value
      # substituted for <DATA_FOLDER> ends with a path separator.
      - >-
        torchrun --nnodes=${NODES} --nproc_per_node=${GPUS}
        --rdzv-id={job_id}
        --rdzv-backend=c10d
        --rdzv-endpoint=$${MASTER_ADDR}:$${MASTER_PORT}
        finetune.py
        --model_name_or_path models/Qwen/Qwen-VL-Chat
        --data_path <DATA_FOLDER>unk_v1+gqa+docci_train_for_qwenvl.json
        --bf16 False
        --fp16 True
        --fix_vit True
        --output_dir <OUTPUT_FOLDER>/qwen-vl/qwen-vl-chat-lora-finetune_unk+gqa_idk+docci_idk-_ep{ep}
        --num_train_epochs {ep}
        --per_device_train_batch_size {bsz}
        --per_device_eval_batch_size {bsz}
        --gradient_accumulation_steps {grad_accu}
        --evaluation_strategy "no"
        --save_strategy "steps"
        --save_steps 5000
        --save_total_limit 10
        --learning_rate {lr}
        --weight_decay 0.1
        --adam_beta2 0.95
        --warmup_ratio 0.01
        --lr_scheduler_type "cosine"
        --logging_steps 1
        --report_to "none"
        --model_max_length 2048
        --lazy_preprocess True
        --use_lora
        --gradient_checkpointing
        --deepspeed finetune/ds_config_zero2.json
    # One launcher per node; the command above passes --nproc_per_node=${GPUS}
    # to torchrun, which presumably starts the per-GPU workers itself.
    process_count_per_node: 1
    submit_args:
      env:
        NCCL_IB_DISABLE: 0
        NCCL_DEBUG: INFO
        NCCL_IB_TIMEOUT: 60
        NCCL_ASYNC_ERROR_HANDLING: 0
        MKL_THREADING_LAYER: GNU
        # NOTE(review): max_attempts is the only lower-case key in this
        # mapping and does not look like an environment variable. It may have
        # been intended as a sibling of `env` under submit_args — confirm
        # against the submitter's schema before moving it.
        max_attempts: 1
  # Search strategy and trial cap. Every parameter below currently lists a
  # single value, so the value lists describe exactly one combination.
  type: grid
  max_trials: 2
  # Each entry's name matches a {name} placeholder in job_template.
  params:
    # Used as torchrun's --rdzv-id.
    - name: job_id
      spec: discrete
      values:
        - 4242
    # Used as --learning_rate and in the job name.
    # NOTE(review): 1e-5 has no decimal point, so YAML 1.1 loaders resolve it
    # as a string while YAML 1.2 loaders resolve it as a float — confirm which
    # form the consumer expects before quoting or rewriting it.
    - name: lr
      spec: discrete
      values:
        - 1e-5
    # Used as --gradient_accumulation_steps and in the job name.
    - name: grad_accu
      spec: discrete
      values:
        - 8
    # Used for both the per-device train and eval batch sizes and in the
    # job name.
    - name: bsz
      spec: discrete
      values:
        - 2
      # Previously tried alternative:
      # values: [1]
    # Used as --num_train_epochs and in the output directory suffix.
    - name: ep
      spec: discrete
      values:
        - 1
