
# JSON profile that supplies the model name (.policy.model), the server
# ports (.policy.port) and the GPU ids (.policy.cuda) used by this script.
CONFIG_FILE=profiles/rapid_profile/agent/toolbench_for_llama/llama_31_corvus_G1cate.json

# Model identifier passed to the server as both --model and --tokenizer.
# `jq -e` turns a missing/null key into a non-zero exit status, so a broken
# or absent config aborts here instead of yielding the literal string "null".
# The variable keeps its existing spelling (lowercase "l") because the rest
# of the script refers to it by this exact name.
if ! MODEl_NAME=$(jq -er '.policy.model' "$CONFIG_FILE"); then
    echo "error: could not read .policy.model from $CONFIG_FILE" >&2
    exit 1
fi

echo "$MODEl_NAME"

# Collect every entry of .policy.port from the config into the port_list array.
# A jq failure is reported on stderr and leaves the list empty.
if ! port_text=$(jq -r '.policy.port[]' "$CONFIG_FILE"); then
    echo "warning: could not read .policy.port from $CONFIG_FILE" >&2
fi
port_list=()
while IFS= read -r number; do
    # A here-string always supplies at least one line, even when jq printed
    # nothing; skip empty lines so the array never gains a bogus "" entry.
    [[ -n "$number" ]] || continue
    echo "$number"
    port_list+=("$number")
done <<< "$port_text"

# Print the parsed ports once more, one per line.
for port in "${port_list[@]}"; do
    echo "$port"
done

# Collect every entry of .policy.cuda from the config into the cuda_list array.
# A jq failure is reported on stderr and leaves the list empty.
if ! cuda_text=$(jq -r '.policy.cuda[]' "$CONFIG_FILE"); then
    echo "warning: could not read .policy.cuda from $CONFIG_FILE" >&2
fi
cuda_list=()
while IFS= read -r cuda; do
    # A here-string always supplies at least one line, even when jq printed
    # nothing; skip empty lines so the array length reflects real entries only.
    [[ -n "$cuda" ]] || continue
    echo "$cuda"
    cuda_list+=("$cuda")
done <<< "$cuda_text"

# Print the parsed GPU ids once more, one per line.
for cuda in "${cuda_list[@]}"; do
    echo "$cuda"
done

# How many consecutive cuda_list entries are grouped into one server launch.
parallel=1
# "true": iterate over every entry of cuda_list; anything else: a single launch.
EVAL_PARALLEL=true
# Point the dynamic linker at the CUDA 12.2 lib64 directory
# (this replaces any value LD_LIBRARY_PATH had before).
export LD_LIBRARY_PATH=/usr/local/cuda-12.2/lib64

printf '%s\n' "run vllm server parallel..."

# Upper bound for the launch loop index: default to one, widen to the number
# of configured GPU ids when parallel evaluation is enabled.
length=1
if [[ $EVAL_PARALLEL == "true" ]]; then
    length=${#cuda_list[@]}
fi

# Start one vllm.entrypoints.openai.api_server process in the background for
# each group of `parallel` consecutive GPU ids from cuda_list.
# Reads: length, parallel, port_list, cuda_list, MODEl_NAME.
# The servers are left running in the background; this loop does not wait
# for them.
# NOTE(review): the port is taken from port_list[i], so with parallel > 1 only
# every parallel-th port is used — confirm this matches the config layout.
for ((i = 0; i < length; i += parallel)); do
    port=${port_list[i]}
    if [[ -z "$port" ]]; then
        # Without a port the command line would be malformed ("--port --model").
        echo "warning: no port configured for server index $i, skipping" >&2
        continue
    fi

    # Join the GPU ids of this group with commas. Indices past the end of
    # cuda_list are ignored so the value never ends in stray commas.
    cuda=""
    for ((j = 0; j < parallel && i + j < ${#cuda_list[@]}; j++)); do
        cuda+="${cuda:+,}${cuda_list[i + j]}"
    done

    CUDA_VISIBLE_DEVICES="$cuda" python -m vllm.entrypoints.openai.api_server \
        --port "$port" \
        --model "$MODEl_NAME" \
        --tokenizer "$MODEl_NAME" \
        --trust-remote-code &
    # Currently disabled server options:
    # --max-model-len 8192 \
    # --enforce-eager &
    # --gpu-memory-utilization 0.9 &
    # Pause for a second before handling the next group.
    sleep 1
    echo "run server on cuda $cuda, port $port "

done
