# Root of the repository checkout; replace the placeholder before running.
BASE_PATH="[PATH_to_THIS_REPO]"

# 1. data
export LONGBENCH_LOCAL_PATH="${BASE_PATH}/evaluation/eval_by_LongBenchV1/LongBench"
# Relative paths in this script (./eval_longbench_qa.py, ./results) resolve
# against this directory, so abort rather than continue from the wrong cwd.
cd "${BASE_PATH}/evaluation/eval_by_LongBenchV1" || {
    printf 'error: cannot cd to %s\n' "${BASE_PATH}/evaluation/eval_by_LongBenchV1" >&2
    exit 1
}

export VLLM_WORKER_MULTIPROC_METHOD=spawn
# Four devices are exposed; the runs in this script pass --tensor_parallel_size 4.
export CUDA_VISIBLE_DEVICES=0,1,2,3

# 2. output_dir (relative to the directory entered above)
OUTPUT_DIR_PATH="./results"
# 3. model_dir
MODEL_DIR_PATH="${BASE_PATH}/output"
# Values forwarded as --max_tokens, --max_model_input_len and --temperature.
MAX_TOKENS=4096
MAX_MODEL_INPUT_LEN=28000
TEMPERATURE=0.0
# NOTE (translated from the original comment): the prompt text is already
# included in short2long_qa, so this setting has no practical effect on
# prompting. It is still passed as --prompt and used as an output sub-directory.
PROMPT=step_by_step

# 1. for CKP
# For every (checkpoint, model) pair: run inference, then score the predictions.
# Reads MODEL_DIR_PATH, OUTPUT_DIR_PATH, PROMPT, MAX_TOKENS, TEMPERATURE and
# MAX_MODEL_INPUT_LEN, which are set earlier in this script.
MODEL_NAMES=("[MODEL_NAME]")
CKP_LIST=("60")
for CKP in "${CKP_LIST[@]}"; do
    for ROOT_MODEL_NAME in "${MODEL_NAMES[@]}"; do
        MODEL_NAME="${ROOT_MODEL_NAME}/checkpoint-${CKP}"
        MODEL_NAME_OR_PATH="${MODEL_DIR_PATH}/${MODEL_NAME}"
        # Per-run output location shared by both stages.
        RUN_DIR="${OUTPUT_DIR_PATH}/${MODEL_NAME}/${PROMPT}"

        # (1). inference
        # All expansions are quoted so paths containing spaces or glob
        # characters (e.g. '[' and ']') reach python as single arguments.
        if ! python ./eval_longbench_qa.py \
            --stage inference \
            --prompt "$PROMPT" \
            --model_name_or_path "$MODEL_NAME_OR_PATH" \
            --tensor_parallel_size 4 \
            --max_tokens "$MAX_TOKENS" \
            --temperature "$TEMPERATURE" \
            --max_model_input_len "$MAX_MODEL_INPUT_LEN" \
            --output_path "${RUN_DIR}/predictions"; then
            # Do not score stale or missing predictions from a failed run.
            printf 'error: inference failed for %s; skipping evaluation\n' "$MODEL_NAME" >&2
            continue
        fi

        # (2). compute metrics
        python ./eval_longbench_qa.py \
            --stage evaluation \
            --eval_strategy all \
            --dataset "${RUN_DIR}/predictions" \
            --output_path "${RUN_DIR}/metrics.json"
    done
done

# 2. for instruct models (disabled): evaluates each model directory directly, without a checkpoint-<N> suffix
# MODEL_NAMES=("Qwen2.5-7B-Instruct-wo-yarn")
# for MODEL_NAME in "${MODEL_NAMES[@]}"; do
#     MODEL_NAME_OR_PATH="${MODEL_DIR_PATH}/${MODEL_NAME}"
#     # (1). inference
#     python ./eval_longbench_qa.py \
#         --stage inference \
#         --prompt $PROMPT \
#         --model_name_or_path $MODEL_NAME_OR_PATH \
#         --tensor_parallel_size 4 \
#         --max_tokens $MAX_TOKENS \
#         --temperature $TEMPERATURE \
#         --max_model_input_len $MAX_MODEL_INPUT_LEN \
#         --output_path ${OUTPUT_DIR_PATH}/$MODEL_NAME/$PROMPT/predictions

#     # (2). compute metrics
#     python ./eval_longbench_qa.py \
#         --stage evaluation \
#         --eval_strategy all \
#         --dataset ${OUTPUT_DIR_PATH}/$MODEL_NAME/$PROMPT/predictions \
#         --output_path ${OUTPUT_DIR_PATH}/$MODEL_NAME/$PROMPT/metrics.json
# done