#!/bin/bash

# Run configuration.
# file_path : directory (relative to the current working directory) that
#             receives the per-run log files written by the benchmark loop.
# device_id : value exported as CUDA_VISIBLE_DEVICES for each benchmark run.
# llm_model : model identifier passed to both --model and --tokenizer.
file_path=swap_perf_DS_Q_32B_max_seq_overhead
#file_path=swap_perf_LM_34B_len_test
device_id=0

# Alternative models; uncomment exactly one llm_model assignment.
#llm_model=deepseek-ai/DeepSeek-R1-Distill-Llama-8B
llm_model=deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
#llm_model=meta-llama/Llama-3.3-70B-Instruct
#llm_model=codellama/CodeLlama-34b-hf
#llm_model=tiiuae/falcon-40b
#llm_model=Qwen/Qwen2.5-32B-Instruct
#hf_dataset=anon8231489123/ShareGPT_Vicuna_unfiltered

# Create the log directory. Abort early if that fails: every later log
# redirection targets this directory, so continuing would be pointless.
if ! mkdir -p -- "./${file_path}"; then
    echo "Failed to create log directory: ./${file_path}" >&2
    exit 1
fi

#input_len=None
#output_len=None
#input_len=5000
#output_len=5000
#max_model_len=$((output_len + input_len))
#quantization=None
#batch_token=$max_model_len
        #--random-len \
        #--disable-log-stats \
        #--tensor-parallel-size 2 \
        #--gpu-memory-utilization 0.9 \
        #--distributed-executor-backend mp \
#--max-model-len $max_model_len \

# Benchmark sweep over max_num_seqs.
#
# For each max_num_seqs value the benchmark script is launched once with
# output_len equal to input_len, max_model_len equal to their sum, and
# --max_num_batched_tokens equal to max_model_len. Stdout of each run is
# written to ${file_path}/fc_<max_num_seqs>.log; stderr is not redirected.
#
# Reads file_path, device_id and llm_model, which are set earlier in the file.
# Exit status: 0 if every run succeeded, 1 if at least one run failed.
input_len=2500
#max_num_seqs=1

# Count of runs whose command returned a non-zero status.
failed_runs=0

# Full sweep values: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
for max_num_seqs in 8
do
    output_len=$input_len
    max_model_len=$((output_len + input_len))
    batch_tokens=$max_model_len
    num_prompts=20
    log_file="${file_path}/fc_${max_num_seqs}.log"

    echo "Running benchmark with max_num_seqs: $max_num_seqs"
    # The benchmark path is relative, so the script has to be started from a
    # directory where ../../benchmark_throughput_rand_length.py resolves.
    # Testing the command in the `if` keeps a failed run (or a failed log
    # redirection) from being reported as completed.
    if CUDA_VISIBLE_DEVICES="$device_id" python ../../benchmark_throughput_rand_length.py \
        --backend vllm \
        --model "$llm_model" \
        --tokenizer "$llm_model" \
        --preemption-mode swap \
        --swap-space 0 \
        --swap-fc-space 100 \
        --num-thread 8 \
        --io-size-mb 32 \
        --dtype auto \
        --enable_chunked_prefill true \
        --max_num_batched_tokens "$batch_tokens" \
        --num-prompts "$num_prompts" \
        --input-len "$input_len" \
        --output-len "$output_len" \
        --block-size 128 \
        --max-model-len "$max_model_len" \
        --max-num-seqs "$max_num_seqs" \
        --log-stats-interval 1 \
        --scheduler-delay-factor 0 \
        > "$log_file"
    then
        echo "Completed benchmark with max_num_seqs: $max_num_seqs"
    else
        status=$?
        echo "Benchmark FAILED with max_num_seqs: $max_num_seqs (exit status $status, log: $log_file)" >&2
        # Keep going so one bad configuration does not cancel the rest of
        # the sweep; the failure is reflected in the final exit status.
        failed_runs=$((failed_runs + 1))
    fi
done

if (( failed_runs > 0 )); then
    echo "Benchmarks finished with $failed_runs failed run(s)." >&2
    exit 1
fi

echo "All benchmarks completed!"