#!/usr/bin/env bash
# run_evaluations.sh
# ────────────────────────────────────────────────────────────
# Hard‑coded list of report *prefixes* to process.
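# For each prefix the pipeline runs:
#   1) run_information_verification.py
#   2) run_report_evaluation.py        (step currently commented out below)
#   3) run_score_integration.py
# Earlier revisions of this header listed the steps as
# make_verification_report.py (→ creates <prefix>_n.verify.md) and
# make_report_score.py (→ produces output_eval/<prefix>/summary.json).
# NOTE(review): confirm those output paths still apply to the run_*.py scripts.
# After all prefixes are processed, a single call to make_tableandchart_v6.py
# builds combined radar PNGs + the Excel workbook (currently commented out below).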
# --------------------------------------------------------------------
# Export the CA bundle path under both variable names so that every child
# process started by this script inherits it. The path is written once and
# reused so the two variables cannot drift apart.
# NOTE(review): REQUESTS_CA_BUNDLE is the name documented by the Python
# `requests` library and SSL_CERT_FILE the one documented by OpenSSL — confirm
# the pipeline scripts actually depend on them.
export SSL_CERT_FILE=/etc/ssl/certs/ca-certificates.crt
export REQUESTS_CA_BUNDLE="$SSL_CERT_FILE"


# ✏️  EDIT PREFIXES AS NEEDED
# Work list of report prefixes. Only entries that are NOT commented out end up
# in the array; to enable a prefix, remove the leading "#" from its line.
# Entries are grouped by run mode purely for readability.
declare -a PREFIXES=(
  # ---- fast ----
  #   qwen3-235b-fast
  #   gemini-2.5-pro-fast
  #   gemini-3.0-flash-fast
  #   claude_opus4.5_fast
  #   gpt5.2_fast
  # ---- think ----
  #   qwen3-235b-think
  #   gemini-2.5-pro-think
  #   gemini-3.0-pro-think
  #   claude_opus4.5_think
  #   gpt5.2_think
  # ---- think_search ----
  #   qwen3-235b-think_search
  #   gemini-2.5-pro-think_search
  #   claude_opus4.5_think_search
  #   gpt5.2_think_search
  # ---- deep research ----
  #   webthinker
  #   qwen3-235b-deep
  #   gemini-2.5-pro_deep
  #   gemini-3-pro-deep
  claude_opus4.5_deep
  #   gpt5_deep
)
# --------------------------------------------------------------------
# Configurable options
# Input tree (where 1/,2/,3/… live); passed as --root to the verification step.
ROOT='data/micro1_math'

# Output location; passed as --output_root to the verification step and as
# --output_dir to the score-integration step.
OUT='output/math'

# Model name passed as --eval_model to run_information_verification.py.
VERIFIER_MODEL='gpt-4.1-mini'

# Model name for run_report_evaluation.py; only referenced by the step that is
# currently commented out. Alternatives noted by the author:
# claude-sonnet-4-20250514, o3, o4-mini, gpt-5.
EVAL_MODEL='gpt-5.2'

# Sample selector passed verbatim as --samples to each active step.
# NOTE(review): presumably a range over the numeric folders under ROOT
# (1 through 6) — the exact syntax is defined by the Python scripts; confirm.
SAMPLES='1-6'

# Radar chart resolution; only referenced by the chart step that is currently
# commented out at the end of the file.
DPI=150
# --------------------------------------------------------------------
# Strict mode from here on: abort on an unhandled command failure, on use of
# an unset variable, and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Guard clause: with every entry of PREFIXES commented out there is nothing to
# run, so report that on stderr and stop with status 1.
[[ ${#PREFIXES[@]} -gt 0 ]] || {
  printf '%s\n' "❌  PREFIXES array is empty. Edit run_evaluations.sh first." >&2
  exit 1
}

# Announce the work list; "${PREFIXES[*]}" joins the entries into one string
# using the first character of IFS (a space unless IFS was changed).
printf '%s\n' "▶️  Running pipeline for prefixes: ${PREFIXES[*]}"

# # ────────────────────────────────────────────────────────────
# Per-prefix pipeline: for every entry of PREFIXES run the verification step
# and then the score-integration step, in that order.
#
# Reads:   PREFIXES, ROOT, SAMPLES, VERIFIER_MODEL, OUT (set earlier in this file)
# Output:  a banner per step on stdout; failure diagnostics on stderr
# Exit:    the whole script stops with status 1 at the first step that fails,
#          so later prefixes are not processed after a failure.
for P in "${PREFIXES[@]}"; do

  # printf with %s keeps the prefix literal; `echo -e` would also interpret
  # backslash escapes that happen to appear inside the prefix itself.
  printf '\n=== [%s] 1️⃣ run_information_verification.py ===\n' "$P"
  if ! python run_information_verification.py \
    --root "$ROOT" \
    --prefix "$P" \
    --samples "$SAMPLES" \
    --eval_model "$VERIFIER_MODEL" \
    --output_root "$OUT"; then
    # Diagnostics belong on stderr so they are not mixed into captured stdout.
    printf 'verification failed for %s\n' "$P" >&2
    exit 1
  fi

  # Step 2 (report evaluation) is intentionally disabled; uncomment to re-enable.
  # printf '\n=== [%s] 2️⃣ run_report_evaluation.py ===\n' "$P"
  # if ! python run_report_evaluation.py \
  #   --root "$ROOT" \
  #   --prefix "$P" \
  #   --eval_model "$EVAL_MODEL" \
  #   --samples "$SAMPLES" \
  #   --output_dir "$OUT"; then
  #   printf 'scoring failed for %s\n' "$P" >&2
  #   exit 1
  # fi

  printf '\n=== [%s] 3️⃣ run_score_integration.py ===\n' "$P"
  if ! python run_score_integration.py \
    --prefix "$P" \
    --samples "$SAMPLES" \
    --output_dir "$OUT"; then
    printf 'integration failed for %s\n' "$P" >&2
    exit 1
  fi
done

# # ────────────────────────────────────────────────────────────
# # # Combined radar charts + Excel summary
# TARGET="webthinker"                 # baseline for GAP rows
# MODEL_LIST=$(IFS=,; echo "${PREFIXES[*]}")

# python make_tableandchart_v6.py \
#   --root     "$OUT" \
#   --models   "$MODEL_LIST" \
#   --target   "$TARGET" \
#   --outdir   "visualize_bio_draft_mini/radar_imgs" \
#   --excel    "visualize_bio_draft_mini/excel_tables/evaluation_summary.xlsx" \
#   --dpi      "$DPI"
# echo "✅  Pipeline complete. Results → $OUT , radar_imgs/, excel_tables/"
