#!/usr/bin/env bash
# Settings for the inference + evaluation sweep that follows.
# Replace the placeholder values before running.
set -eu  # stop on the first failing command and on any unset variable

# --- Run identity -----------------------------------------------------------
# PROJECT_ROOT is only used in the example paths shown in the comments below;
# none of the commands visible in this script read it.
PROJECT_ROOT="/path/to/your/project/agent_data"
MODEL_NAME="Qwen3-8B-SLM"   # forwarded as --model_name
RUN_ID="your_run_id"        # forwarded to evaluate.py as --results_dir

# --- Dataset ----------------------------------------------------------------
DATASET="path_to_your_dataset"  # e.g., ruler_niah_single
# One inference + evaluation pass is made per entry; each entry is forwarded
# as --dataset_split. (Presumably context lengths -- confirm against the
# dataset's split names.)
SPLITS=(
    32768 65536 131072 262144
    524288 786432 1048576 2097152
)
BENCHMARK="synthetic"           # forwarded to evaluate.py as --benchmark

# --- Output locations -------------------------------------------------------
# e.g., ${PROJECT_ROOT}/qwen3-agent/trajectories/Ruler_Niah/${RUN_ID}
TRAJECTORIES_DIR="your_trajectories_directory"
# e.g., ${PROJECT_ROOT}/qwen3-agent/evals/Ruler_Niah/${RUN_ID}
# Parent of the per-split, timestamped output directories.
RESULTS_DIR="your_results_directory"

# --- Sampling and budget knobs ----------------------------------------------
TEMP="0.7"                 # --temperature
TOP_P="0.8"                # --top_p
TOP_K="20"                 # --top_k
MAX_CONTEXT_EXP="32000"    # --max_context_exp
MAX_TURNS_EXP="150"        # --max_turns_exp
MAX_TURNS_TO_FAIL="200"    # --max_turns_to_fail

# --- Agent setup ------------------------------------------------------------
TOOL_CONFIG_PATH="StateLM/tools_qwen_without_search.json"  # --tool_config_path
SYSTEM_PROMPT_NAME="TRAIN_SYSTEM_PROMPT"                   # --system_prompt_name

# Repeat the full sweep three times. On each pass, every split is first run
# through the inference runner and then scored by the evaluation script.
#
# Each (pass, split) pair gets its own output directory under RESULTS_DIR,
# named "<split>_<YYYYmmdd_HHMMSS>" (one-second timestamp resolution), and the
# generations are written to "<split>_generations.jsonl" inside it.
#
# All expansions are quoted so that paths containing spaces or glob characters
# reach Python as single arguments.
for i in {1..3}; do
  for SPLIT in "${SPLITS[@]}"; do
    OUTPUT_DIR="${RESULTS_DIR}/${SPLIT}_$(date +%Y%m%d_%H%M%S)"
    OUTPUT_FP="${OUTPUT_DIR}/${SPLIT}_generations.jsonl"

    # Progress marker on stderr so long sweeps can be followed in the log.
    printf '[pass %s/3] split=%s output_dir=%s\n' \
        "${i}" "${SPLIT}" "${OUTPUT_DIR}" >&2

    # Step 1: generate model answers for this split.
    # The last argument deliberately has no trailing backslash, so the command
    # ends here regardless of what follows it in the file.
    python -m StateLM.inference.hf_test_runner eval_hfds_openai \
        --vllm_cfg StateLM/inference/openai_endpoint.json \
        --model_name "${MODEL_NAME}" \
        --temperature "${TEMP}" \
        --top_p "${TOP_P}" \
        --top_k "${TOP_K}" \
        --max_turns_exp "${MAX_TURNS_EXP}" \
        --max_context_exp "${MAX_CONTEXT_EXP}" \
        --max_output_tokens 1024 \
        --dataset_name "${DATASET}" \
        --dataset_split "${SPLIT}" \
        --item_to_question StateLM/inference/hf_process_fns.py:ruler_niah_i2q \
        --item_to_context StateLM/inference/hf_process_fns.py:ruler_niah_i2c \
        --item_to_answer StateLM/inference/hf_process_fns.py:ruler_niah_i2a \
        --item_to_meta StateLM/inference/hf_process_fns.py:ruler_niah_i2meta \
        --output_postprocess StateLM/inference/hf_process_fns.py:ruler_niah_postprocess \
        --correct_answer_key "outputs" \
        --model_answer_key "pred" \
        --trajectory_dir "${TRAJECTORIES_DIR}" \
        --output_fp "${OUTPUT_FP}" \
        --tokenizer_path Qwen/Qwen3-8B \
        --max_turns_to_fail "${MAX_TURNS_TO_FAIL}" \
        --tool_config_path "${TOOL_CONFIG_PATH}" \
        --system_prompt_name "${SYSTEM_PROMPT_NAME}"

    # Step 2: score the generations that were just written to OUTPUT_DIR.
    # NOTE(review): --results_dir receives RUN_ID rather than RESULTS_DIR;
    # confirm this is what evaluate.py expects.
    python StateLM/evaluation/evaluate.py \
        --data_dir "${OUTPUT_DIR}" \
        --benchmark "${BENCHMARK}" \
        --results_dir "${RUN_ID}"
  done
done
