#!/bin/bash
#
# Launches grouter_ep_optimizer/tools/finetune_grouter.py with torchrun,
# using 8 processes on the local node.
#
# Every code/model path below is relative, so run this script from the
# directory that contains Megatron-LM/, grouter_ep_optimizer/ and model_home/.

# Prepend the local packages to the Python module search path. The
# ${VAR:+...} form appends the previous value only when it is non-empty, so
# no empty trailing entry is left behind when PYTHONPATH was unset.
export PYTHONPATH="Megatron-LM:grouter_ep_optimizer${PYTHONPATH:+:${PYTHONPATH}}"

# Must be exported: a plain assignment stays local to this shell and is not
# placed in the environment of the torchrun child process.
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Directory holding the tokenized dataset shards.
C4_HOME="/workspace/Megatron-LM-router/qwen3_dataset"

# Arguments for --data-prefix as "<weight> <path-prefix>" pairs. An array
# keeps each weight/path a separate argument without relying on unquoted
# word-splitting.
# To blend several shards instead, append one pair per shard, e.g.:
#   for i in 0040; do
#     DATA_BLEND+=(0.04 "${C4_HOME}/qwen3-c4-${i}_text_document")
#   done
DATA_BLEND=(1.0 "${C4_HOME}/qwen3-c4-0040_text_document")

torchrun --nproc-per-node 8 grouter_ep_optimizer/tools/finetune_grouter.py \
    --batch-size 4 \
    --gradient-accumulation-steps 1 \
    --max-length 4096 \
    --bf16 \
    --random-seed 1423 \
    --tokenizer-type HuggingFaceTokenizer \
    --tokenizer-model model_home/qwen3-30b-a3b \
    --data-prefix "${DATA_BLEND[@]}" \
    --grouter-config-path grouter_ep_optimizer/grouter/qwen3_30b/cvt32_mapping_affinity.json \
    --grouter-checkpoint-path grouter_ep_optimizer/grouter/qwen3_30b/grouter_30b.pth \
    --output-dir grouter_ep_optimizer/grouter/qwen3_30b \
    --learning-rate 1e-3 \
    --max-steps 400 \
    --loss-type aux_loss \
    --finetune-optim gradient \
    --warmup-steps 0 \
    --log-interval 1 \
    --finetune-mode last_layer \
    --verbose
