#!/bin/bash

# ---------------------------------------------------------------------------
# Benchmark configuration. Every value below is forwarded to
# performance_benchmark/ttft_prefill.py as the command-line flag of the same
# name, except where noted.
# ---------------------------------------------------------------------------

# Placeholder: replace with the path of the model to benchmark (--model).
model='your/model/path'
# Exported to the benchmark process as CUDA_VISIBLE_DEVICES.
device='0'

# Repetition counts (--num_warmups / --num_runs).
num_warmups='2'
num_runs='10'

# Passed as --version.
tao_version='1.0'

# KV-cache related settings (--kv_budget, --task_query_len,
# --kv_warmup_budget, --kv_prune_trigger_size).
kv_budget='512'
task_query_len='15'
kv_warmup_budget='10000'
kv_prune_trigger_size='4096'

# Chunking / layer settings (--chunk_size, --warmup_layers, --alpha,
# --chunk_window_size, --chunk_sink).
chunk_size='4096'
warmup_layers='16'
alpha='0.32'
chunk_window_size='8'
chunk_sink='4'

# Pooling settings (--kernel_size, --pooling).
kernel_size='15'
pooling='avg'

# Passed as the literal strings "true" (--use_task_cache, --test_performance).
use_task_cache='true'
test_performance='true'


# Directory created before the benchmark runs (relative to the current
# working directory). It is not passed to the benchmark script here.
# NOTE(review): presumably ttft_prefill.py writes its results here — confirm.
output_dir="output/ttft"
# Quoted and terminated with `--` so a path containing spaces, glob
# characters, or a leading dash is still treated as a single directory name.
mkdir -p -- "$output_dir"

# Run performance_benchmark/ttft_prefill.py in `take` mode with a fixed
# sequence length of 131072 and seed 42, using the settings defined above.
# NOTE(review): the original comment called this "TAO mode" while the flag
# passed is `--mode take` — confirm the intended naming.
#
# The arguments are collected in an array and every expansion is quoted, so a
# value that is empty or contains whitespace (e.g. a model path with spaces)
# stays attached to its flag instead of being split or dropped.
benchmark_args=(
  --model "$model"
  --mode take
  --version "$tao_version"
  --seqlen 131072
  --num_warmups "$num_warmups"
  --num_runs "$num_runs"
  --kv_budget "$kv_budget"
  --task_query_len "$task_query_len"
  --kv_warmup_budget "$kv_warmup_budget"
  --kv_prune_trigger_size "$kv_prune_trigger_size"
  --chunk_size "$chunk_size"
  --warmup_layers "$warmup_layers"
  --alpha "$alpha"
  --chunk_window_size "$chunk_window_size"
  --chunk_sink "$chunk_sink"
  --kernel_size "$kernel_size"
  --pooling "$pooling"
  --use_task_cache "$use_task_cache"
  --test_performance "$test_performance"
  --seed 42
)

# CUDA_VISIBLE_DEVICES is set only for this one command.
CUDA_VISIBLE_DEVICES="$device" python performance_benchmark/ttft_prefill.py \
  "${benchmark_args[@]}"




