#!/usr/bin/env python3
from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Dict, Any, List

# Make the project root importable: the parent of this file's directory is put
# at the front of sys.path (only if it is not already listed) so that the
# `src.a4s` import below can resolve — presumably for when this file is run
# directly as a script rather than as part of an installed package.
THIS_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = THIS_DIR.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from src.a4s.llm_client import LLMClient


def ensure_env() -> None:
    """Best-effort load of a ``.env`` file into the process environment.

    Uses ``dotenv`` when it can be imported: the file located by
    ``find_dotenv()`` is passed to ``load_dotenv(..., override=False)``.
    Any failure (package missing, lookup or load error) is deliberately
    ignored so the script still runs without ``dotenv``.
    """
    try:
        import dotenv  # type: ignore

        located = dotenv.find_dotenv()
        dotenv.load_dotenv(located, override=False)
    except Exception:
        # Optional convenience only: never let .env handling stop the run.
        return


def _parse_json_object(text: str) -> Dict[str, Any] | None:
    """Return the JSON object contained in *text*, or ``None`` if there is none.

    The whole string is tried first. If that fails to decode, the span from
    the first ``{`` to the last ``}`` is tried, which tolerates replies that
    wrap the object in prose or markdown fences. A reply that decodes to
    something other than a JSON object (list, number, ...) yields ``None``.
    """
    # Imported here because the module does not import ``re`` at top level.
    import re

    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; fall back to substring extraction.
        match = re.search(r"\{[\s\S]*\}", text)
        if match is None:
            return None
        try:
            parsed = json.loads(match.group(0))
        except (ValueError, RecursionError):
            return None
    return parsed if isinstance(parsed, dict) else None


def score_report(client: LLMClient, model_report_md: str, eval_model: str = "doubao-seed-1-6-thinking-250715") -> Dict[str, Any]:
    """Ask the evaluator model to score one report against the 5-dimension rubric.

    The evaluator is called via ``client.chat`` with ``temperature=0.0`` and
    ``max_tokens=900``. If the first reply does not contain a JSON object, the
    request is repeated once with a stricter "raw JSON only" instruction.

    Args:
        client: Object exposing ``chat(messages, model=..., temperature=...,
            max_tokens=...)`` whose result is indexed as
            ``resp["choices"][0]["message"]["content"]``.
        model_report_md: Markdown text of the report to evaluate.
        eval_model: Name of the evaluator model passed to ``client.chat``.

    Returns:
        The decoded JSON object. Each rubric key that is present
        (``rigor_traceability``, ``integration_causality``,
        ``feasibility_minimality``, ``uncertainty_adaptation``,
        ``decisionability``, ``overall``) is converted to ``float``.
        Keys the evaluator omitted stay absent; other keys pass through as-is.

    Raises:
        RuntimeError: If neither reply contains a JSON object, or a rubric
            value cannot be converted to ``float``.
    """
    system = (
        "You are an independent evaluator of cross-disciplinary counterfactual reasoning quality. "
        "Score fairly, avoid verbosity, and return strict JSON only."
    )
    # New task-aligned rubric (0-100 total), five dimensions + overall
    user = (
        "Evaluate the report using a 5-DIMENSION RUBRIC (0-100 total).\n"
        "Return STRICT JSON with numeric scores (floats) for EXACT keys:\n"
        "- rigor_traceability (0-25): clarity of assumptions, data/source grounding, traceable reasoning and checks.\n"
        "- integration_causality (0-25): cross-domain causal links, mechanism coherence, synthesis quality.\n"
        "- feasibility_minimality (0-20): realism under constraints, minimal additional assumptions.\n"
        "- uncertainty_adaptation (0-15): calibrated ranges, sensitivity, substitution/adaptation framing.\n"
        "- decisionability (0-15): actionable indicators, thresholds, branch decision rules.\n"
        "- overall (0-100) = sum of the five dimensions.\n\n"
        f"Report to evaluate:\n\n{model_report_md}\n\n"
        "Respond with ONLY a single JSON object with those keys."
    )
    strict_user = (
        user
        + "\n\nIMPORTANT: Respond with ONLY a single raw JSON object. "
          "No preface, no markdown, no backticks, no comments."
    )

    # First attempt uses the normal prompt; the stricter prompt is only sent
    # when the first reply yields no JSON object.
    data = None
    for prompt in (user, strict_user):
        resp = client.chat([
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ], model=eval_model, temperature=0.0, max_tokens=900)
        text = resp["choices"][0]["message"]["content"] or ""
        data = _parse_json_object(text)
        if data is not None:
            break
    if data is None:
        raise RuntimeError("Evaluator did not return JSON")

    # Normalize numeric fields to float to avoid formatting errors when models return strings
    expected_keys = [
        "rigor_traceability",
        "integration_causality",
        "feasibility_minimality",
        "uncertainty_adaptation",
        "decisionability",
        "overall",
    ]
    for key in expected_keys:
        if key not in data:
            continue
        try:
            data[key] = float(data[key])
        except (TypeError, ValueError, OverflowError) as exc:
            raise RuntimeError(f"Evaluator returned non-numeric for {key}: {data[key]!r}") from exc
    return data


def _format_score_cell(value: Any) -> str:
    """Render one rubric score as a table cell; ``n/a`` when it is not a number."""
    if isinstance(value, (int, float)):
        return f"{value:.1f}"
    return "n/a"


def evaluate_run_dir(run_dir: Path, client: LLMClient, eval_model: str = "doubao-seed-1-6-thinking-250715") -> Dict[str, Any]:
    """Score every model report found under *run_dir* and write the summaries.

    For each of the four model sub-directories (``agents4sci_v2``,
    ``baseline_single``, ``baseline_tree``, ``baseline_debate``) the file
    ``report.md`` is read and scored with ``score_report``. A missing report
    is recorded as ``{"error": ...}`` and left out of the table.

    Writes two files into *run_dir*:
        * ``evaluation.json`` - the per-model results mapping.
        * ``evaluation.md`` - a markdown table plus the best overall model.

    Args:
        run_dir: Directory holding one sub-directory per model.
        client: LLM client forwarded to ``score_report``.
        eval_model: Evaluator model name forwarded to ``score_report``.

    Returns:
        Mapping of model name to its score dict (or error dict).
    """
    model_names = ("agents4sci_v2", "baseline_single", "baseline_tree", "baseline_debate")
    # Column order of the markdown table; must match the header row below.
    score_columns = (
        "rigor_traceability",
        "integration_causality",
        "feasibility_minimality",
        "uncertainty_adaptation",
        "decisionability",
        "overall",
    )

    results: Dict[str, Any] = {}
    table_lines: List[str] = [
        "| Model | Rigor/Trace (0-25) | Integration/Causality (0-25) | Feasibility/Minimality (0-20) | Uncertainty/Adaptation (0-15) | Decisionability (0-15) | Overall (0-100) |",
        "|---|---:|---:|---:|---:|---:|---:|",
    ]

    for name in model_names:
        path = run_dir / name / "report.md"
        if not path.exists():
            results[name] = {"error": f"Missing report: {path}"}
            continue
        md = path.read_text(encoding="utf-8")
        score = score_report(client, md, eval_model=eval_model)
        results[name] = score
        # The evaluator may omit a rubric key; show "n/a" instead of raising
        # KeyError and losing the results gathered so far.
        cells = " | ".join(_format_score_cell(score.get(column)) for column in score_columns)
        table_lines.append(f"| {name} | {cells} |")

    (run_dir / "evaluation.json").write_text(json.dumps(results, ensure_ascii=False, indent=2), encoding="utf-8")

    summary_md = [
        f"## LLM Evaluation Summary for {run_dir.name}",
        "",
        *table_lines,
        "",
    ]
    # Only entries carrying a numeric overall score can be ranked.
    overall_items = [
        (name, entry["overall"])
        for name, entry in results.items()
        if isinstance(entry, dict) and isinstance(entry.get("overall"), (int, float))
    ]
    if overall_items:
        best = max(overall_items, key=lambda kv: kv[1])
        summary_md.append(f"**Best overall**: `{best[0]}` with score {best[1]:.2f}.")
    summary_md.append("")
    # No textual justifications requested in the new rubric; keep summary concise.

    (run_dir / "evaluation.md").write_text("\n".join(summary_md), encoding="utf-8")
    return results


def main() -> None:
    """Command-line entry point: score one run directory and report where results went.

    Loads optional ``.env`` settings, parses ``--run`` (required) and
    ``--model`` (evaluator model name), then evaluates the resolved run
    directory with a freshly constructed ``LLMClient``.

    Raises:
        SystemExit: If the resolved run directory does not exist (or on
            argument parsing errors raised by ``argparse``).
    """
    import argparse

    ensure_env()

    cli = argparse.ArgumentParser(
        description="Evaluate a single run directory with LLM-based scoring",
    )
    cli.add_argument(
        "--run",
        required=True,
        help="Path to topic run directory (e.g., experiments/0819/t01_...)",
    )
    cli.add_argument(
        "--model",
        required=False,
        default="doubao-seed-1-6-thinking-250715",
        help="Evaluator model name",
    )
    options = cli.parse_args()

    target = Path(options.run).resolve()
    if target.exists():
        evaluate_run_dir(target, LLMClient(), eval_model=options.model)
        print(f"Saved evaluation to {target}/evaluation.*")
        return
    raise SystemExit(f"Run directory not found: {target}")


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


