#!/bin/bash

# Two-stage AWQ pipeline (4-bit weights, quantization group size 128):
#   1. run the AWQ search with the yarn-related flags below and dump the
#      search results to AWQ_RESULTS;
#   2. reload those search results and dump real INT4-quantized weights to
#      QUANT_OUT.
# MODEL_PATH may be overridden from the environment; it defaults to the
# checkpoint location this script has always used.

# Abort on the first failure so stage 2 never runs after a failed search
# (which would otherwise load a missing or stale results file); also treat
# unset variables as errors.
set -euo pipefail

readonly MODEL_PATH="${MODEL_PATH:-/home/yeq6/Research_project/llama/llama-2-7b-chat_hf}"
readonly W_BIT=4
readonly Q_GROUP_SIZE=128

# Single source of truth for the search-result file: stage 1 writes it and
# stage 2 reads it. Stage 2 previously loaded a differently named file
# (awq_cache/llama2-7b-w4-g128_yarn_mine.pt) that stage 1 never produces.
readonly AWQ_RESULTS="awq_cache/llama2-7b-w4-g128_mine_preyarn.pt"
readonly QUANT_OUT="quant_cache/llama-2-7b-chat-w4-g128_mine_preyarn.pt"

# Stage 1: perform AWQ search after yarn interpolation and save the results.
python -m awq.entry --model_path "${MODEL_PATH}" \
    --original-max-position-embeddings 2048 \
    --w_bit "${W_BIT}" --q_group_size "${Q_GROUP_SIZE}" \
    --run_awq --dump_awq "${AWQ_RESULTS}" \
    --yarn 8 --yarn-aware-search \
    --custom-model --no-use-cache

# Stage 2: generate real quantized weights (INT4) from the stage-1 results.
python -m awq.entry --model_path "${MODEL_PATH}" \
    --w_bit "${W_BIT}" --q_group_size "${Q_GROUP_SIZE}" \
    --load_awq "${AWQ_RESULTS}" \
    --q_backend real --dump_quant "${QUANT_OUT}"