#!/bin/bash

# Benchmark configuration.
#   file_path: directory (relative to the current working directory) that
#              receives one log file per benchmark run.
#   device_id: value exported as CUDA_VISIBLE_DEVICES for each run.
file_path=swap_perf_DS_Q_32B_chunked_prefill
device_id=0

# Model under test; the alternatives are kept commented out for quick switching.
#llm_model=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
llm_model=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
#llm_model=meta-llama/Llama-3.3-70B-Instruct
#llm_model=codellama/CodeLlama-34b-hf
#llm_model=tiiuae/falcon-40b
#llm_model=Qwen/Qwen2.5-32B-Instruct
#hf_dataset=anon8231489123/ShareGPT_Vicuna_unfiltered

# Create the log directory. Abort early if that fails, because every benchmark
# run redirects its output into this directory.
if ! mkdir -p -- "./${file_path}"; then
    echo "error: cannot create log directory ./${file_path}" >&2
    exit 1
fi

#input_len=None
#output_len=None

#input_len=5000
#output_len=5000
#max_model_len=$((output_len + input_len))
#quantization=None
#batch_token=$max_model_len
        #--random-len \
        #--disable-log-stats \
        #--tensor-parallel-size 2 \
        #--gpu-memory-utilization 0.9 \
        #--distributed-executor-backend mp \
#--max-model-len $max_model_len \
# Sweep over prompt lengths and run one throughput benchmark per length.
#
# For each input_len the output length is set equal to it, so max_model_len
# (passed as --max-model-len) and batch_tokens (passed as
# --max_num_batched_tokens) are both input_len + output_len.
# stdout of each run is written to ${file_path}/fc_<max_model_len>.log;
# stderr is not redirected.
#
# A failing run is reported on stderr and counted, but the sweep continues
# with the next length. The script exits non-zero if any run failed.
#
# NOTE(review): CUDA_VISIBLE_DEVICES is taken from $device_id while
# --tensor-parallel-size is 2. If device_id names a single GPU, the two
# tensor-parallel ranks presumably cannot be placed -- confirm the intended
# device list.
#
# Other lengths previously used for this sweep:
#   2500 3000 3500 4000 4500 5000 10000
failed_runs=0
for input_len in 15000 20000 25000 30000
do
    output_len=$input_len
    max_model_len=$((output_len + input_len))
    batch_tokens=$max_model_len
    num_prompts=10

    # Build the argument list as an array so every value stays a single word.
    benchmark_args=(
        --backend vllm
        --model "$llm_model"
        --tokenizer "$llm_model"
        --preemption-mode swap
        --swap-space 0
        --swap-fc-space 100
        --num-thread 8
        --io-size-mb 32
        --dtype auto
        --enable_chunked_prefill true
        --max_num_batched_tokens "$batch_tokens"
        --num-prompts "$num_prompts"
        --input-len "$input_len"
        --output-len "$output_len"
        --block-size 128
        --max-model-len "$max_model_len"
        --gpu-memory-utilization 0.9
        --tensor-parallel-size 2
        --distributed-executor-backend mp
    )

    echo "Running benchmark with max_model_len: $max_model_len"
    # The script path is relative, so this must be launched from the directory
    # the path was written for.
    if CUDA_VISIBLE_DEVICES="$device_id" python ../../benchmark_throughput_rand_length.py \
        "${benchmark_args[@]}" \
        > "${file_path}/fc_${max_model_len}.log"
    then
        echo "Completed benchmark with max_model_len: $max_model_len"
    else
        run_status=$?
        echo "Benchmark FAILED (exit status $run_status) with max_model_len: $max_model_len" >&2
        failed_runs=$((failed_runs + 1))
    fi
done

if (( failed_runs > 0 )); then
    echo "$failed_runs benchmark run(s) failed; see logs in ${file_path}/" >&2
    exit 1
fi

echo "All benchmarks completed!"