import os
import json
import time
import subprocess
from typing import Any, Dict, List, Tuple, Union

from extract_evidence import extract_and_compress_evidence
from extract_evidence import compress_evidence_with_llm
from cluster_evidence import extract_gists_from_evidence_json, cluster_gists


def _now_tag() -> str:
    """
    Build a timestamp tag that is safe to embed in file names.

    The tag is the current local time rendered as ``YYYYMMDD_HHMMSS``
    (digits and a single underscore only).
    """
    tag_format = "%Y%m%d_%H%M%S"
    return time.strftime(tag_format, time.localtime())


def _normalize_prompts(
    prompts: Union[List[str], List[Tuple[str, str]], List[Dict[str, str]]]
) -> List[Tuple[str, str]]:
    """
    Convert every prompt entry into a (system_prompt, user_prompt) tuple of strings.

    Supported entry shapes:
    - a plain string: used as the user prompt, with an empty system prompt
    - a two-element tuple or list: (system, user)
    - a dict with "system"/"user" keys; "system_prompt"/"user_prompt" are
      consulted when the short key is missing or None

    Falsy parts (None, "", 0, ...) become the empty string; everything else is
    passed through ``str``. Any other entry shape raises ValueError.
    """
    pairs: List[Tuple[str, str]] = []
    for entry in prompts:
        # Bare string: user prompt only.
        if isinstance(entry, str):
            pairs.append(("", entry))
            continue

        # Exactly two positional parts: (system, user).
        if isinstance(entry, (tuple, list)) and len(entry) == 2:
            system_part, user_part = entry
            pairs.append((str(system_part or ""), str(user_part or "")))
            continue

        # Mapping: prefer the short key, fall back to the "*_prompt" alias.
        if isinstance(entry, dict):
            texts: List[str] = []
            for short_key, alias_key in (("system", "system_prompt"), ("user", "user_prompt")):
                value = entry.get(short_key)
                if value is None:
                    value = entry.get(alias_key, "")
                texts.append(str(value or ""))
            pairs.append((texts[0], texts[1]))
            continue

        raise ValueError("Unsupported prompt format. Use str, (system,user) or {system,user}.")
    return pairs


def _fallback_gist(evidence: Any) -> str:
    """Return a crude gist for *evidence*: its first three words plus "...".

    Empty (or whitespace-only) evidence yields the empty string, so the result
    is always a ``str``.
    """
    words = str(evidence or "").split()
    if not words:
        return ""
    return " ".join(words[:3]) + "..."


def _prompt_content(prompts: Any, index: int) -> Any:
    """Return the "content" of the prompt message at *index*, or "" if absent."""
    if not isinstance(prompts, (list, tuple)) or index >= len(prompts):
        return ""
    message = prompts[index]
    if not isinstance(message, dict):
        return ""
    return message.get("content", "")


def _load_gist_lookup(evidence_path: str) -> Dict[str, str]:
    """Map each original evidence snippet in *evidence_path* to its compressed gist.

    The first gist seen for a snippet wins. Entries with an empty snippet or an
    empty gist are skipped. Any failure while reading or walking the file
    yields an empty mapping, in which case callers compress on the fly.
    """
    lookup: Dict[str, str] = {}
    try:
        with open(evidence_path, "r", encoding="utf-8") as ef:
            ev_data = json.load(ef)
        for sample_ev in ev_data.get("samples", {}).values():
            for item in sample_ev.get("evidence", []):
                orig = item.get("original_evidence") or ""
                comp = item.get("compressed_gist") or ""
                if orig and comp and orig not in lookup:
                    lookup[orig] = comp
    except Exception:
        # Deliberate best effort: the lookup is only an optimisation, so a
        # missing or malformed evidence file must not abort the report.
        return {}
    return lookup


def _compress_evidence(evidence: Any, gist_lookup: Dict[str, str], api_key: str) -> Any:
    """Return the compressed gist for *evidence*, reusing *gist_lookup* when possible.

    On a lookup miss the evidence is compressed with ``compress_evidence_with_llm``
    and the result is stored in *gist_lookup* so a repeated snippet is not sent
    again. If that call raises, a first-three-words fallback is returned (and
    not cached).
    """
    if evidence in gist_lookup:
        return gist_lookup[evidence]
    try:
        gist = compress_evidence_with_llm(evidence, api_key)
    except Exception:
        return _fallback_gist(evidence)
    gist_lookup[evidence] = gist
    return gist


def _simplify_sample(
    sample: Dict[str, Any], gist_lookup: Dict[str, str], api_key: str
) -> Dict[str, Any]:
    """Build the simplified per-prompt record for one eval result *sample*.

    Keeps at most the first three responses. Each kept response is paired with
    its judged purpose (None when missing), its raw evidence snippets, and the
    compressed gist of every snippet.
    """
    prompts = sample.get("prompts")
    # `or` guards against keys that are present but null in the results JSON.
    pa = sample.get("purpose_awareness_results") or {}
    responses: List[str] = pa.get("responses") or []
    judged: List[str] = pa.get("judged_purposes") or []
    eval_analysis: List[Dict[str, Any]] = pa.get("evaluation_awareness_analysis") or []

    items: List[Dict[str, Any]] = []
    for i, response in enumerate(responses[:3]):
        analysis = eval_analysis[i] if i < len(eval_analysis) else None
        evidence_list: List[str] = list((analysis or {}).get("evidence") or [])
        compressed_list = [
            _compress_evidence(ev, gist_lookup, api_key) for ev in evidence_list
        ]
        items.append(
            {
                "response": response,
                "judged_purpose": judged[i] if i < len(judged) else None,
                "evidence": evidence_list,
                "compressed_evidence": compressed_list,
            }
        )

    return {
        # Positional convention: message 0 is the system prompt, message 1 the
        # user prompt (this is how the temporary dataset is written).
        "system_prompt": _prompt_content(prompts, 0),
        "user_prompt": _prompt_content(prompts, 1),
        "items": items,
    }


def eval_report(
    prompts: Union[List[str], List[Tuple[str, str]], List[Dict[str, str]]],
    model: str,
    api_key: str,
    judge_model: Union[str, None] = None,
) -> Tuple[str, str]:
    """
    Run an end-to-end "purpose awareness" analysis pipeline for a set of prompts and
    write two JSON artifacts under `eval_analysis/`:

    1) Clusters JSON: clustering over compressed evidence "gists" extracted from model
       responses.
    2) Simplified JSON: per-prompt view including up to 3 responses, the judged purposes,
       raw evidence snippets, and compressed evidence gists.

    Parameters:
    - prompts: List of prompts to evaluate. Accepted formats:
        * ["user prompt", ...] — treated as (system="", user="...")
        * [(system, user), ...]
        * [{"system": s, "user": u}, ...] (also accepts keys "system_prompt"/"user_prompt")
    - model: OpenRouter model name passed to `black_box_evals.py` via `--model`,
      for example "openai/gpt-5". Any "/" is replaced by "_" in output file names.
    - api_key: OpenRouter API key used by the evidence compression step.
    - judge_model: Optional model name to record as the judge for purposes. If `None`,
      the `model` value is recorded.

    Returns:
    - `(clusters_path, simplified_path)`: absolute paths of the clusters JSON and the
      simplified JSON, in that order.

    Raises:
    - ValueError: if an entry of `prompts` has an unsupported format.
    - subprocess.CalledProcessError: if `black_box_evals.py` exits with a non-zero status.

    Side effects:
    - Creates the `eval_analysis/` directory in the current working directory if it
      does not exist.
    - Runs `python3 black_box_evals.py` from the current working directory.
    - Writes three temporary JSON files into `eval_analysis/`; they are removed once
      both artifacts have been written and are left in place if the pipeline fails.
    """
    # The current working directory is treated as the project root: it must
    # contain black_box_evals.py and it receives the eval_analysis/ directory.
    base_dir = os.getcwd()
    analysis_dir = os.path.join(base_dir, "eval_analysis")
    os.makedirs(analysis_dir, exist_ok=True)

    judge = judge_model or model
    ts = _now_tag()

    # Normalize prompts to (system, user)
    pairs = _normalize_prompts(prompts)

    # 1) Build a temporary dataset file for black_box_evals
    tmp_dataset_path = os.path.join(analysis_dir, f"tmp_dataset_{ts}.json")
    samples_dataset: List[Dict[str, Any]] = [
        {
            "id": f"sample_{idx}",
            "prompts": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        }
        for idx, (system_prompt, user_prompt) in enumerate(pairs)
    ]
    with open(tmp_dataset_path, "w", encoding="utf-8") as f:
        json.dump(samples_dataset, f, ensure_ascii=False, indent=2)

    # 2) Run black_box_evals as a subprocess (no temp copy of the script)
    eval_results_path = os.path.join(analysis_dir, f"tmp_eval_results_{ts}.json")
    cmd = [
        "python3",
        os.path.join(base_dir, "black_box_evals.py"),
        "--model",
        model,
        "--input_file",
        tmp_dataset_path,
        "--output_file",
        eval_results_path,
        "--first_person",
        "--purpose_awareness_analysis",
        "--max_workers",
        "500",
    ]
    # Run with cwd=base_dir so the script's imports and .env resolve there;
    # check=True turns a non-zero exit status into CalledProcessError.
    subprocess.run(cmd, check=True, cwd=base_dir)

    # 3) Extract and compress evidence using existing helper
    evidence_path = os.path.join(analysis_dir, f"tmp_evidence_{ts}.json")
    extract_and_compress_evidence(
        json_file_path=eval_results_path,
        output_file_path=evidence_path,
        api_key=api_key,
        rate_limit_delay=1.0,
        max_workers=20,
    )

    # 4) Cluster gists from evidence
    gists, metadata = extract_gists_from_evidence_json(evidence_path)
    if gists:
        # Roughly one cluster per ten prompts, at least one, and never more
        # clusters than there are gists to put in them.
        k = min(max(1, len(pairs) // 10), len(gists))
        clustering = cluster_gists(gists, metadata, n_clusters=k)
    else:
        clustering = {"error": "No gists to cluster", "n_total_gists": 0}

    model_tag = model.replace("/", "_")
    clusters_path = os.path.join(
        analysis_dir, f"{model_tag}_benchmark_clusters_{ts}.json"
    )
    with open(clusters_path, "w", encoding="utf-8") as f:
        json.dump(clustering, f, ensure_ascii=False, indent=2)

    # 5) Build simplified JSON per prompt with responses + evidence +
    #    compressed evidence + judged purpose. Gists already produced in step 3
    #    are reused to avoid duplicate compression calls.
    gist_lookup = _load_gist_lookup(evidence_path)

    # Load final eval results produced by black_box_evals
    with open(eval_results_path, "r", encoding="utf-8") as f:
        final_data = json.load(f)
    final_results = final_data.get("results") or []

    simplified: Dict[str, Any] = {
        "model": model,
        "judge_model": judge,
        "samples": [
            _simplify_sample(sample, gist_lookup, api_key) for sample in final_results
        ],
    }
    simplified_path = os.path.join(
        analysis_dir, f"{model_tag}_analyzed_dataset_{ts}.json"
    )
    with open(simplified_path, "w", encoding="utf-8") as f:
        json.dump(simplified, f, ensure_ascii=False, indent=2)

    # 6) Cleanup temporary files. This is reached only when every step above
    #    succeeded, so intermediate results survive a failed run.
    for tmp_path in (evidence_path, eval_results_path, tmp_dataset_path):
        try:
            os.remove(tmp_path)
        except OSError:
            # Best effort: a missing or undeletable temp file must not fail
            # a report whose artifacts were already written.
            pass

    return clusters_path, simplified_path


