#!/bin/bash

# python perplexity_yarn.py --tokenized output/govreport-test-tokenized \
# --dataset-min-tokens 16384 --samples 50 --output-file data/govreport.csv \
# --min-tokens 16384 --max-tokens 16384 -m /home/yeq6/Research_project/llama/llama-2-7b-chat_hf \
# --ntk 8 --aggressive-memory --quant_weight_bit_width int4

# python perplexity_yarn.py --tokenized output/govreport-test-tokenized \
# --dataset-min-tokens 20384 --samples 10 --output-file ppl_results/govreport_fp16.csv \
# --min-tokens 256 --max-tokens 20384 -m /home/yeq6/Research_project/llama/llama-2-7b-chat_hf \
# --ntk 8 --sliding-window 2048 --tokens-step 384 --aggressive-memory

# INT8 run: invoke perplexity_yarn.py with --load-in-8bit and send the
# results to ppl_results/govreport_int8.csv.
int8_args=(
  # Input data and sampling.
  --tokenized output/govreport-test-tokenized
  --dataset-min-tokens 20384
  --samples 10
  # Where the results are written.
  --output-file ppl_results/govreport_int8.csv
  # Token range passed to the evaluation.
  --min-tokens 256
  --max-tokens 20384
  # Model checkpoint directory.
  -m /home/yeq6/Research_project/llama/llama-2-7b-chat_hf
  # Remaining evaluation options.
  --ntk 8
  --sliding-window 2048
  --tokens-step 384
  --aggressive-memory
  --load-in-8bit
)
python perplexity_yarn.py "${int8_args[@]}"

# NF4 run: same settings as the INT8 invocation in this script except that
# --load-in-4bit is passed and results go to ppl_results/govreport_nf4.csv.
nf4_args=(
  # Input data and sampling.
  --tokenized output/govreport-test-tokenized
  --dataset-min-tokens 20384
  --samples 10
  # Where the results are written.
  --output-file ppl_results/govreport_nf4.csv
  # Token range passed to the evaluation.
  --min-tokens 256
  --max-tokens 20384
  # Model checkpoint directory.
  -m /home/yeq6/Research_project/llama/llama-2-7b-chat_hf
  # Remaining evaluation options.
  --ntk 8
  --sliding-window 2048
  --tokens-step 384
  --aggressive-memory
  --load-in-4bit
)
python perplexity_yarn.py "${nf4_args[@]}"


# "no_pi" run: no --load-in-* or --awq option is passed; results go to
# ppl_results/govreport_no_pi.csv.
# NOTE(review): the output name says "no_pi" but --ntk 8 is still passed,
# exactly as in the quantized runs -- confirm that this is intended.
no_pi_args=(
  # Input data and sampling.
  --tokenized output/govreport-test-tokenized
  --dataset-min-tokens 20384
  --samples 10
  # Where the results are written.
  --output-file ppl_results/govreport_no_pi.csv
  # Token range passed to the evaluation.
  --min-tokens 256
  --max-tokens 20384
  # Model checkpoint directory.
  -m /home/yeq6/Research_project/llama/llama-2-7b-chat_hf
  # Remaining evaluation options.
  --ntk 8
  --sliding-window 2048
  --tokens-step 384
  --aggressive-memory
)
python perplexity_yarn.py "${no_pi_args[@]}"

# AWQ run on the --tokenized dataset: passes --awq together with a
# --quant_path checkpoint; no --ntk option is given. Results go to
# ppl_results/govreport_awq_int4.csv.
awq_tokenized_args=(
  # Input data and sampling.
  --tokenized output/govreport-test-tokenized
  --dataset-min-tokens 20384
  --samples 10
  # Where the results are written.
  --output-file ppl_results/govreport_awq_int4.csv
  # Token range passed to the evaluation.
  --min-tokens 256
  --max-tokens 20384
  # Model checkpoint directory and the AWQ quantization cache file.
  -m /home/yeq6/Research_project/llama/llama-2-7b-chat_hf
  --quant_path /home/yeq6/Research_project/llama/llm-awq/quant_cache/llama-2-7b-chat-w4-g128-awq-v2.pt
  # Remaining evaluation options.
  --sliding-window 2048
  --tokens-step 384
  --aggressive-memory
  --awq
)
python perplexity_yarn.py "${awq_tokenized_args[@]}"


# AWQ run without a --tokenized argument; every other option matches the
# AWQ invocation that does pass --tokenized.
# NOTE(review): this uses the same --output-file
# (ppl_results/govreport_awq_int4.csv) as the other AWQ invocation in this
# script, so both runs target one file -- confirm this is intentional or
# give this run its own output path.
awq_default_args=(
  # Sampling.
  --dataset-min-tokens 20384
  --samples 10
  # Where the results are written.
  --output-file ppl_results/govreport_awq_int4.csv
  # Token range passed to the evaluation.
  --min-tokens 256
  --max-tokens 20384
  # Model checkpoint directory and the AWQ quantization cache file.
  -m /home/yeq6/Research_project/llama/llama-2-7b-chat_hf
  --quant_path /home/yeq6/Research_project/llama/llm-awq/quant_cache/llama-2-7b-chat-w4-g128-awq-v2.pt
  # Remaining evaluation options.
  --sliding-window 2048
  --tokens-step 384
  --aggressive-memory
  --awq
)
python perplexity_yarn.py "${awq_default_args[@]}"


# Dynamic YaRN test without AWQ: passes --yarn 8 with --original,
# --custom-model and --original-max-position-embeddings 2048; results go to
# ppl_results/govreport_yarn.csv.
# NOTE(review): --max-tokens is 19456 here while the other runs in this
# script use 20384 -- confirm the difference is deliberate.
yarn_args=(
  # Input data and sampling.
  --tokenized output/govreport-test-tokenized
  --dataset-min-tokens 20384
  --samples 10
  # Where the results are written.
  --output-file ppl_results/govreport_yarn.csv
  # Token range passed to the evaluation.
  --min-tokens 256
  --max-tokens 19456
  # Model checkpoint directory.
  -m /home/yeq6/Research_project/llama/llama-2-7b-chat_hf
  # Remaining evaluation options.
  --sliding-window 2048
  --tokens-step 384
  --aggressive-memory
  # YaRN-specific options.
  --yarn 8
  --original
  --custom-model
  --original-max-position-embeddings 2048
)
python perplexity_yarn.py "${yarn_args[@]}"