# Strict mode: stop at the first failing command (-e) and treat any
# expansion of an unset variable as an error (-u).
set -eu

# --- Dataset locations --------------------------------------------------
# Point PROJECT_ROOT at your project root; the two paths below derive from it.
PROJECT_ROOT="/path/to/data"
BOOK_BASE_COPYRIGHT="$PROJECT_ROOT/NovelQA_Fullset/Books/CopyrightProtected"
TEST_BASE_COPYRIGHT="$PROJECT_ROOT/NovelQA_Fullset/Data/CopyrightProtected"

# --- Run identity and output locations ----------------------------------
# Replace the placeholders before launching. RUN_ID becomes the prefix of
# each generations file name.
RUN_ID="your_run_id"
TRAJECTORIES_DIR="your_trajectories_directory"
RESULTS_DIR="your_results_directory"
RESULT_OUTPUT="StateLM/novelqa_results_v3_8b.txt"

# --- Model and prompt selection -----------------------------------------
MODEL_NAME="Qwen3-8B-SLM"
TOOL_CONFIG_PATH="StateLM/tools_qwen_with_search.json"
SYSTEM_PROMPT_NAME="TRAIN_SYSTEM_PROMPT"

# --- Sampling parameters and run limits ---------------------------------
TEMP=0.7
TOP_P=0.8
TOP_K=20
MAX_CONTEXT=32000
MAX_TURNS_EXP=150
MAX_TURNS_TO_FAIL=200
MAX_OUTPUT_TOKENS=2048

# Run the full generate-then-score cycle three times. Each pass:
#   1. writes generations to a fresh, timestamped JSONL file under PROJECT_ROOT,
#   2. runs the scoring script on that file with --results_output RESULT_OUTPUT,
#   3. appends the generations file path to RESULT_OUTPUT so the run can be
#      traced back to its predictions.
# Because the script runs under `set -e`, a failing step aborts the remaining
# passes. The loop variable only counts passes and is otherwise unused.
for i in {1..3}; do
    # Second-resolution timestamp gives each pass its own output file.
    OUTPUT_FP="${PROJECT_ROOT}/${RUN_ID}_generations_$(date +%Y%m%d_%H%M%S).jsonl"

    # Every expansion is quoted so values containing spaces or glob
    # characters reach the runner as a single argument.
    # --max_context_exp takes MAX_CONTEXT, the name the configuration defines;
    # the previous MAX_CONTEXT_EXP was never assigned and tripped `set -u`.
    python -m StateLM.inference.nqa_test_runner_full eval_nqa_openai \
        --cfg_path StateLM/inference/openai_endpoint.json \
        --temperature "$TEMP" \
        --topp "$TOP_P" \
        --topk "$TOP_K" \
        --tool_config_path "$TOOL_CONFIG_PATH" \
        --system_prompt_name "$SYSTEM_PROMPT_NAME" \
        --max_turns_exp "$MAX_TURNS_EXP" \
        --max_context_exp "$MAX_CONTEXT" \
        --max_output_tokens "$MAX_OUTPUT_TOKENS" \
        --books_base_dir "$BOOK_BASE_COPYRIGHT" \
        --questions_base_dir "$TEST_BASE_COPYRIGHT" \
        --log_dir "$TRAJECTORIES_DIR" \
        --result_dir "$RESULTS_DIR" \
        --output_fp "$OUTPUT_FP" \
        --model_name "$MODEL_NAME" \
        --max_turns_to_fail "$MAX_TURNS_TO_FAIL"

    # Score the generations that were just produced.
    python StateLM/evaluation/compute_scores.py compute_scores \
        --preds_path "$OUTPUT_FP" \
        --results_output "$RESULT_OUTPUT" \
        --task_name "longbook_choice_eng" \
        --model_name "$MODEL_NAME" \
        --label_key "correct_answer" \
        --pred_key "final_answer"

    # Record which predictions file this pass used.
    printf "\nPREDICTIONS_FILE=%s\n" "$OUTPUT_FP" >> "$RESULT_OUTPUT"
done
