#!/usr/bin/env bash
# Abort on the first failing command (errexit) and treat any expansion of an
# unset variable as an error (nounset).
set -o errexit -o nounset

# --- Placeholders: edit these before running -------------------------------
# PROJECT_ROOT and RUN_ID appear in the example directory layouts shown below.
PROJECT_ROOT='/path/to/your/project/agent_data'
MODEL_NAME='Qwen3-8B-SLM'
RUN_ID='your-run-id'

# --- Dataset and score log --------------------------------------------------
DATASET='path/to/InfiniteBench_QA'
declare -a SPLITS=('longbook_choice_eng')
RESULT_TXT='StateLM/infbench_qa_choice_results.txt'

# --- Output directories -----------------------------------------------------
# e.g., ${PROJECT_ROOT}/qwen3-agent/trajectories/InfiniteBench_QA/${RUN_ID}
TRAJECTORIES_DIR='your_trajectories_directory'
# e.g., ${PROJECT_ROOT}/qwen3-agent/evals/InfiniteBench_QA/${RUN_ID}
RESULTS_DIR='your_results_directory'

# --- Sampling settings (forwarded as --temperature / --top_p / --top_k) -----
TEMP='0.7'
TOP_P='0.8'
TOP_K='20'

# --- Context and turn budgets (forwarded under the matching --max_* flags) --
MAX_CONTEXT='32000'
MAX_CONTEXT_EXP='32000'
MAX_TURNS_EXP='150'
MAX_TURNS_TO_FAIL='200'

# --- Tooling and prompt selection -------------------------------------------
TOOL_CONFIG_PATH='StateLM/tools_qwen_with_search.json'
SYSTEM_PROMPT_NAME='TRAIN_SYSTEM_PROMPT'

# Repeat the whole evaluation three times. For every configured split, run the
# inference entry point and then score the generations file it was told to
# write. Every expansion is quoted so that paths or names containing
# whitespace or glob characters are passed through as single arguments.
for repeat in {1..3}; do
    for SPLIT in "${SPLITS[@]}"; do
        printf 'Repeat %s/3, split %s\n' "$repeat" "$SPLIT" >&2

        # The timestamp has one-second resolution and is taken once per
        # inference invocation; the same path is later handed to the scorer.
        OUTPUT_FP="${RESULTS_DIR}/${SPLIT}_generations_$(date +%Y%m%d_%H%M%S).jsonl"

        # Common "<file>:<function-prefix>" part of the three per-split
        # item-processing function specs (question / context / answer).
        process_fn_prefix="StateLM/inference/hf_process_fns.py:infinitebench_${SPLIT}"

        python -m StateLM.inference.hf_test_runner eval_hfds_openai \
            --vllm_cfg StateLM/inference/openai_endpoint.json \
            --model_name "$MODEL_NAME" \
            --temperature "$TEMP" \
            --top_p "$TOP_P" \
            --top_k "$TOP_K" \
            --max_turns_exp "$MAX_TURNS_EXP" \
            --max_context_exp "$MAX_CONTEXT_EXP" \
            --max_context "$MAX_CONTEXT" \
            --dataset_name "$DATASET" \
            --dataset_split "$SPLIT" \
            --item_to_question "${process_fn_prefix}_i2q" \
            --item_to_context "${process_fn_prefix}_i2c" \
            --item_to_answer "${process_fn_prefix}_i2a" \
            --trajectory_dir "$TRAJECTORIES_DIR" \
            --output_fp "$OUTPUT_FP" \
            --tokenizer_path Qwen/Qwen3-4B \
            --max_turns_to_fail "$MAX_TURNS_TO_FAIL" \
            --tool_config_path "$TOOL_CONFIG_PATH" \
            --system_prompt_name "$SYSTEM_PROMPT_NAME"

        # Score the generations just produced; results go to RESULT_TXT.
        python StateLM/evaluation/compute_scores.py compute_scores \
            --preds_path "$OUTPUT_FP" \
            --task_name "$SPLIT" \
            --model_name "$MODEL_NAME" \
            --label_key "correct_answer" \
            --pred_key "final_answer" \
            --results_output "$RESULT_TXT"
    done
done
