"""
LLM-as-Judge for CKM hypothesis validation and novelty scoring.

Two judges:
  1. run_hit_verify   — does a candidate paper validate a hypothesis? (HIT/MISS)
  2. run_novelty_judge — how original is the hypothesis itself? (standalone score)

Both use a 10-point scale with explicit, dimension-level rubrics.

Supporting helper:
  run_keyword_extract — extract arXiv search keywords from a hypothesis
"""
import re
import asyncio
import logging

from openai import AsyncOpenAI
from config import config
from core.engines import async_retry

logger = logging.getLogger("ckm.judge")


def _build_judge_client(section: str) -> AsyncOpenAI:
    """Create an AsyncOpenAI client from the ``config["api"][section]`` settings.

    Reads ``api_key``, ``base_url`` and ``timeout_s`` from that section and
    passes ``max_retries=0`` to the SDK constructor.
    """
    settings = config["api"][section]
    return AsyncOpenAI(
        api_key=settings["api_key"],
        base_url=settings["base_url"],
        timeout=settings["timeout_s"],
        max_retries=0,
    )


# Strong judge — novelty scoring (few calls, quality matters most)
judge_client = _build_judge_client("judge")

# Lite judge — hit verify, keyword extract, profiles (many calls, speed matters)
judge_lite_client = _build_judge_client("judge_lite")

# Mini judge — cheap first-stage screen for the two-stage hit verification
judge_mini_client = _build_judge_client("judge_mini")
JUDGE_MINI_MODEL = config["api"]["judge_mini"]["model"]
MINI_PREFILTER_THRESHOLD = config["api"]["hit_prefilter_threshold"]


# ============================================================================
# Keyword Extraction
# ============================================================================

# System prompt for run_keyword_extract: casts the model as a retrieval
# specialist and steers it towards narrow, arXiv-searchable technical terms.
KEYWORD_EXTRACT_SYSTEM = (
    "You are a research retrieval specialist. "
    "Extract specific, searchable technical keywords from a research hypothesis. "
    "Prioritise novel method names, architectural components, and precise problem "
    "formulations — terms specific enough to retrieve the relevant arXiv papers."
)

# User prompt for run_keyword_extract. `{hypothesis_content}` is substituted
# with str.format(). The final "one keyword or phrase per line" instruction is
# load-bearing: run_keyword_extract splits the reply on newlines to build its
# keyword list.
KEYWORD_EXTRACT_TEMPLATE = """\
Hypothesis:
{hypothesis_content}

Extract 3–5 specific technical keywords or short phrases (2–4 words each) \
that best capture the CORE TECHNICAL IDEA of this hypothesis for arXiv search.

Rules:
- Specific over generic: "cross-lingual prompt tuning" not "prompt tuning"
- Include the research domain: "low-resource ASR adaptation" not just "ASR"
- Avoid: stop words, author names, broad terms ("deep learning", "transformer", "LLM")
- Each phrase should narrow the search, not widen it

Output one keyword or phrase per line, nothing else."""


@async_retry(max_retries=3, delay=5)
async def run_keyword_extract(hypothesis_content: str) -> list[str]:
    """Extract up to five searchable arXiv keywords from a hypothesis (lite judge).

    The prompt asks for one bare phrase per line, but chat models frequently
    answer with a bulleted or numbered list, wrap phrases in quotes, or repeat
    a phrase. Each line is therefore normalised before it is accepted, so that
    list markers and quotes do not end up inside the search query.

    Args:
        hypothesis_content: Full text of the hypothesis to mine for keywords.

    Returns:
        At most five cleaned, de-duplicated keyword phrases, in the order the
        model produced them. Empty if the reply contained no usable line.
    """
    response = await judge_lite_client.chat.completions.create(
        model=config["api"]["judge_lite"]["model"],
        messages=[
            {"role": "system", "content": KEYWORD_EXTRACT_SYSTEM},
            {"role": "user", "content": KEYWORD_EXTRACT_TEMPLATE.format(
                hypothesis_content=hypothesis_content,
            )},
        ],
        temperature=0.1,
    )
    content = response.choices[0].message.content or ""

    keywords: list[str] = []
    seen: set[str] = set()
    for raw_line in content.splitlines():
        # Remove a leading list marker ("- ", "* ", "• ", "1. ", "2) "). The
        # marker must be followed by whitespace, so phrases such as
        # "3D object detection" or "2.5D segmentation" are left untouched.
        phrase = re.sub(r"^\s*(?:[-*•]+|\d+[.)])\s+", "", raw_line)
        phrase = phrase.strip().strip("\"'`").strip()
        # Skip blank lines and pure-punctuation lines (e.g. "---" or a code fence).
        if not any(ch.isalnum() for ch in phrase):
            continue
        dedupe_key = phrase.casefold()
        if dedupe_key in seen:
            continue
        seen.add(dedupe_key)
        keywords.append(phrase)

    # Truncate before logging so the log reflects what the caller receives.
    keywords = keywords[:5]
    logger.info("[Judge/Keywords] %d extracted: %s", len(keywords), keywords)
    return keywords


# ============================================================================
# Hit Verification Judge  (10-point scale, threshold avg >= 6.0)
# ============================================================================

# System prompt shared by both hit-verify stages (sent by _call_hit_judge).
# It explicitly tells the judge that topical overlap alone is not validation.
HIT_VERIFY_SYSTEM = (
    "You are a rigorous scientific reviewer judging whether a published paper "
    "substantively validates a previously generated research hypothesis. "
    "Score each dimension strictly according to the rubric. "
    "Superficial topic overlap does NOT constitute validation — "
    "you must find specific, concrete alignment in the paper's actual content."
)

# User prompt for hit verification, rendered by run_hit_verify via str.format().
# Placeholders: hypothesis_content, paper_title, paper_published,
# paper_arxiv_id, paper_content. Doubled braces ({{ ... }}) come out of
# str.format() as single literal braces and mark the slots the model fills in.
#
# The labels in the "Verdict" section ("Avg Score:", "Matched Core Ideas:",
# "Verdict:", "Reasoning:") are matched literally by the regexes in
# _parse_hit_response and run_hit_verify — rename them here only together
# with those patterns.
HIT_VERIFY_TEMPLATE = """\
## Research Hypothesis
{hypothesis_content}

## Candidate Paper
- Title: {paper_title}
- Published: {paper_published}
- ArXiv ID: {paper_arxiv_id}

Paper Evidence:
{paper_content}

---

Judge whether this paper substantively validates the hypothesis.
Score each dimension 1–10 using the rubric below. You MUST quote or closely
paraphrase specific text from the paper as evidence for each score.

────────────────────────────────────────────────────────────────────────────
D1 — Research Direction Alignment                                     (1–10)
Does the paper address the same research gap or problem motivation the
hypothesis identified?

  1–2  Completely different problem or motivation; no meaningful connection.
  3–4  Shares the broad topic area but pursues a different goal or question.
  5–6  Same general research direction; some shared motivation but divergent framing.
  7–8  Same research problem and motivation; hypothesis clearly anticipated this gap.
  9–10 Hypothesis precisely identified the gap this paper was written to fill.

Score: {{1-10}}
Evidence: {{direct quote or close paraphrase from the paper}}
────────────────────────────────────────────────────────────────────────────
D2 — Technical Approach Overlap                                       (1–10)
Does the paper employ a similar method, architecture, or algorithm to what
the hypothesis proposed?

  1–2  Completely different technical approach; no methodological similarity.
  3–4  Shares surface-level terminology but fundamentally different method.
  5–6  Related methodology; similar design principles, different execution.
  7–8  Similar architecture or algorithm; hypothesis anticipated the key design choice.
  9–10 Paper implements essentially the same technical approach the hypothesis proposed.

Score: {{1-10}}
Evidence: {{direct quote or close paraphrase}}
────────────────────────────────────────────────────────────────────────────
D3 — Problem Formulation Match                                        (1–10)
Do the hypothesis and paper share the same task definition, input/output
structure, dataset type, or experimental constraints?

  1–2  Entirely different task or experimental setting.
  3–4  Related task with substantially different constraints or scope.
  5–6  Same broad task; different dataset, language pair, or evaluation protocol.
  7–8  Same task with similar input/output and constraints; minor setup differences.
  9–10 Identical problem formulation; hypothesis fully predicted the experimental setting.

Score: {{1-10}}
Evidence: {{direct quote or close paraphrase}}
────────────────────────────────────────────────────────────────────────────
D4 — Contribution Anticipation                                        (1–10)
Does the paper's reported result or contribution match what the hypothesis
predicted would be achievable or valuable?

  1–2  Hypothesis predicted something entirely different; no contribution overlap.
  3–4  Partially overlapping contribution type; hypothesis missed the key finding.
  5–6  Hypothesis predicted the right direction of improvement; some detail mismatch.
  7–8  Hypothesis correctly anticipated the specific type and scale of contribution.
  9–10 Hypothesis precisely predicted this paper's reported contribution.

Score: {{1-10}}
Evidence: {{direct quote or close paraphrase}}
────────────────────────────────────────────────────────────────────────────

### Verdict
Avg Score: {{(D1+D2+D3+D4)/4, one decimal place}}
Matched Core Ideas:
- {{Concrete idea present in BOTH hypothesis and paper — be specific, not generic}}
- {{Second matched idea if applicable; omit this line if none}}
Verdict: {{HIT if avg score >= 6.0 else MISS}}
Reasoning: {{2–3 sentences citing specific evidence from both hypothesis and paper \
that justify the verdict; do not repeat the scores}}"""


async def _call_hit_judge(client, model, prompt, paper_arxiv_id):
    """Send the hit-verify prompt to ``model``, re-asking when the reply is blank.

    Up to three requests are made. The first reply with non-whitespace content
    wins; a blank reply triggers a warning and a two-second pause before the
    next request (no warning or pause after the final one).

    Returns:
        ``(content, tokens)`` — the last reply text (possibly blank if every
        attempt came back empty) and the total tokens reported across all
        requests made.
    """
    max_attempts = 3
    reply = ""
    used_tokens = 0
    attempt_no = 0
    while attempt_no < max_attempts:
        attempt_no += 1
        response = await client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": HIT_VERIFY_SYSTEM},
                {"role": "user", "content": prompt},
            ],
            temperature=0.1,
        )
        reply = response.choices[0].message.content or ""
        # Usage may be absent on some responses; count those as zero tokens.
        if response.usage:
            used_tokens += response.usage.total_tokens
        if reply.strip():
            return reply, used_tokens
        if attempt_no < max_attempts:
            logger.warning("[Judge/Hit] %s attempt %d (%s) returned empty, retrying...",
                           paper_arxiv_id, attempt_no, model)
            await asyncio.sleep(2)
    return reply, used_tokens


def _parse_hit_response(content, paper_arxiv_id, paper_title, paper_published):
    """Parse hit-verify judge response into structured dict."""
    verdict_match = re.search(r"Verdict:\s*(HIT|MISS)", content)
    is_hit = (verdict_match.group(1) == "HIT") if verdict_match else False

    score_match = re.search(r"Avg Score:\s*([\d.]+)", content)
    avg_score = float(score_match.group(1)) if score_match else 0.0

    ideas_block = re.search(r"Matched Core Ideas:([\s\S]*?)(?=\nVerdict:|\Z)", content)
    matched_ideas: list[str] = []
    if ideas_block:
        matched_ideas = [
            ln.lstrip("-• ").strip()
            for ln in ideas_block.group(1).splitlines()
            if ln.strip().startswith(("-", "•")) and ln.strip().lstrip("-• ")
        ]

    reasoning_match = re.search(r"Reasoning:\s*([\s\S]*?)$", content)
    reasoning = reasoning_match.group(1).strip() if reasoning_match else ""

    return {
        "is_hit": is_hit,
        "avg_score": avg_score,
        "matched_ideas": matched_ideas,
        "reasoning": reasoning,
        "full_eval": content,
        "paper_arxiv_id": paper_arxiv_id,
        "paper_title": paper_title,
        "paper_published": paper_published,
    }


@async_retry(max_retries=3, delay=5)
async def run_hit_verify(
    hypothesis_content: str,
    paper_title: str,
    paper_published: str,
    paper_arxiv_id: str,
    paper_content: str,
) -> dict:
    """
    Two-stage hit verification:
      Stage 1: mini judge (JUDGE_MINI_MODEL) as a cheap, fast pre-filter
      Stage 2: lite judge (config["api"]["judge_lite"]["model"]) re-judges the
               paper when the mini score is >= MINI_PREFILTER_THRESHOLD, or
               when the mini reply contains no parseable score

    A mini reply without a parseable "Avg Score" is escalated to stage 2 rather
    than recorded as a MISS: a formatting failure of the cheap model is not
    evidence that the paper fails to validate the hypothesis.

    Args:
      hypothesis_content  text of the hypothesis being validated
      paper_title         candidate paper title
      paper_published     candidate paper publication date (as given by caller)
      paper_arxiv_id      candidate paper arXiv identifier
      paper_content       paper evidence text inserted into the prompt

    Returns:
      is_hit          bool   (judge verdict; always False when pre-filtered out)
      avg_score       float  (1–10; 0.0 when the reply had no parseable score)
      matched_ideas   list[str]
      reasoning       str
      full_eval       str
      paper_arxiv_id  str
      paper_title     str
      paper_published str
      tokens          int    (total across both stages)
    """
    prompt = HIT_VERIFY_TEMPLATE.format(
        hypothesis_content=hypothesis_content,
        paper_title=paper_title,
        paper_published=paper_published,
        paper_arxiv_id=paper_arxiv_id,
        paper_content=paper_content,
    )
    total_tokens = 0

    # Stage 1: mini pre-filter
    mini_content, mini_tokens = await _call_hit_judge(
        judge_mini_client, JUDGE_MINI_MODEL, prompt, paper_arxiv_id)
    total_tokens += mini_tokens

    # Match a well-formed number only: a loose `[\d.]+` would capture "7.5."
    # (score ending a sentence) and make float() raise. `[\s*]*` tolerates
    # Markdown bold around the label or the value.
    mini_score_match = re.search(r"Avg Score:[\s*]*(\d+(?:\.\d+)?)", mini_content)
    mini_score = float(mini_score_match.group(1)) if mini_score_match else None

    # A parsed mini score below the threshold is a definite MISS: skip stage 2.
    if mini_score is not None and mini_score < MINI_PREFILTER_THRESHOLD:
        result = _parse_hit_response(mini_content, paper_arxiv_id, paper_title, paper_published)
        # The pre-filter decision overrides whatever verdict string the mini
        # model wrote, so the returned flag agrees with the MISS logged below.
        result["is_hit"] = False
        result["tokens"] = total_tokens
        logger.info("[Judge/Hit] %s → MISS (mini=%.1f/10, skipped 4o)",
                    paper_arxiv_id, mini_score)
        return result

    # Stage 2: stronger re-judge for borderline/high (or unscored) candidates
    if mini_score is None:
        logger.warning("[Judge/Hit] %s mini reply had no parseable score, re-judging with 4o",
                       paper_arxiv_id)
    else:
        logger.info("[Judge/Hit] %s mini=%.1f/10 >= %.1f, re-judging with 4o",
                    paper_arxiv_id, mini_score, MINI_PREFILTER_THRESHOLD)
    content_4o, tokens_4o = await _call_hit_judge(
        judge_lite_client, config["api"]["judge_lite"]["model"], prompt, paper_arxiv_id)
    total_tokens += tokens_4o

    result = _parse_hit_response(content_4o, paper_arxiv_id, paper_title, paper_published)
    result["tokens"] = total_tokens
    logger.info("[Judge/Hit] %s → %s (4o=%.1f/10)",
                paper_arxiv_id, "HIT" if result["is_hit"] else "MISS", result["avg_score"])
    return result


# ============================================================================
# Novelty Judge  (10-point scale, standalone hypothesis quality score)
# ============================================================================

# System prompt for run_novelty_judge: asks for rubric-strict scoring of an
# AI-generated hypothesis.
# NOTE(review): the prompt refers to "the source papers it cites", while
# NOVELTY_JUDGE_TEMPLATE only injects the hypothesis text — presumably the
# citations are embedded in that text; confirm with the caller.
NOVELTY_JUDGE_SYSTEM = (
    "You are an expert scientific evaluator assessing the originality and quality "
    "of a research hypothesis generated by an AI system. "
    "Score each dimension strictly according to the rubric. "
    "Base your scores solely on the hypothesis content and the source papers it cites — "
    "do not reward vagueness or penalise the system for ideas later proven correct."
)

# User prompt for the novelty judge, rendered by run_novelty_judge via
# str.format() with the single placeholder `{hypothesis_content}`. Doubled
# braces ({{ ... }}) come out as single literal braces and mark the slots the
# model fills in.
#
# run_novelty_judge parses the reply with regexes keyed on these literals:
# a "D<n>" heading followed by a "Score:" line for each dimension, and the
# summary labels "Avg Novelty Score:", "Strengths:" and "Weaknesses:".
# Rename them here only together with those patterns.
NOVELTY_JUDGE_TEMPLATE = """\
## Research Hypothesis
{hypothesis_content}

---

Evaluate the originality and scientific quality of this hypothesis across four
dimensions. Score each 1–10 using the rubric. Quote from the hypothesis text
to justify each score.

────────────────────────────────────────────────────────────────────────────
D1 — Conceptual Originality                                           (1–10)
Does the hypothesis propose a genuinely new research concept, or does it
merely restate ideas already explicit in the cited source papers?

  1–2  Directly restates one or more source papers; no new synthesis.
  3–4  Minor variation on existing work; predictable next step.
  5–6  Combines existing ideas in a non-obvious way; moderate originality.
  7–8  Introduces a genuinely novel conceptual contribution not in source papers.
  9–10 Breakthrough-level original idea; would surprise domain experts.

Score: {{1-10}}
Justification: {{quote from hypothesis + comparison to source paper ideas}}
────────────────────────────────────────────────────────────────────────────
D2 — Cross-field Synthesis                                            (1–10)
Does the hypothesis connect insights from multiple distinct sub-fields,
domains, or research communities?

  1–2  Stays entirely within one narrow sub-field; no cross-domain connection.
  3–4  Cites papers from adjacent areas but makes no meaningful synthesis.
  5–6  Meaningfully bridges two related sub-fields with a concrete link.
  7–8  Synthesises ideas from distinct domains; connection is non-trivial.
  9–10 Creates a novel research bridge across fundamentally disparate fields.

Score: {{1-10}}
Justification: {{which domains are bridged and how}}
────────────────────────────────────────────────────────────────────────────
D3 — Gap Identification Precision                                     (1–10)
Does the hypothesis identify a specific, concrete gap in the literature,
or does it describe a vague or commonly acknowledged weakness?

  1–2  Gap is too vague to act on, or is already the dominant open problem.
  3–4  Identifies a general area of weakness without pinpointing the gap.
  5–6  Specific gap with clear evidence from cited papers.
  7–8  Precisely motivated gap with strong, multi-source evidence.
  9–10 Identifies a gap that was non-obvious even to domain experts at the time.

Score: {{1-10}}
Justification: {{describe the gap and cite supporting evidence in the hypothesis}}
────────────────────────────────────────────────────────────────────────────
D4 — Specificity and Falsifiability                                   (1–10)
Is the hypothesis specific enough to be tested and falsified as a concrete
research claim, or is it too vague to guide an experiment?

  1–2  So vague it could describe any paper in the field; untestable.
  3–4  Direction is clear but lacks the detail needed to design an experiment.
  5–6  Mostly testable; a researcher could design a study with some clarification.
  7–8  Clearly testable: proposed method, expected metric, and expected gain are stated.
  9–10 Fully specified research agenda; could be directly converted to a paper outline.

Score: {{1-10}}
Justification: {{cite the specific claim and explain what makes it testable or not}}
────────────────────────────────────────────────────────────────────────────

### Summary
Avg Novelty Score: {{(D1+D2+D3+D4)/4, one decimal place}}
Strengths: {{1–2 specific strengths of this hypothesis}}
Weaknesses: {{1–2 specific weaknesses or vague points}}"""


def _parse_novelty_response(content: str) -> dict:
    """Extract scores and summary text from a novelty-judge reply.

    Returns a dict with ``avg_score`` (float, 0.0 when absent), ``d_scores``
    (``{"d1".."d4"}`` ints, 0 when absent), ``strengths`` and ``weaknesses``
    (str, "" when absent). `[\\s*]*` / `\\**` in the patterns tolerate Markdown
    bold around labels and values.
    """
    # Match a well-formed number only: `[\d.]+` would also capture "7.5."
    # (score ending a sentence) and make float() raise ValueError.
    avg_match = re.search(r"Avg Novelty Score:[\s*]*(\d+(?:\.\d+)?)", content)
    avg_score = float(avg_match.group(1)) if avg_match else 0.0

    d_scores = {}
    for i in range(1, 5):
        # The lookbehind stops a dimension whose own "Score:" line is missing
        # from picking up the integer part of the "Avg Novelty Score:" summary.
        m = re.search(
            rf"D{i}[^\n]*\n.*?(?<!Novelty )Score:[\s*]*(\d+)", content, re.DOTALL)
        d_scores[f"d{i}"] = int(m.group(1)) if m else 0

    strengths_match = re.search(
        r"Strengths:\**\s*([\s\S]*?)(?=\n[\s*]*Weaknesses:|\Z)", content)
    weaknesses_match = re.search(r"Weaknesses:\**\s*([\s\S]*)", content)

    return {
        "avg_score": avg_score,
        "d_scores": d_scores,
        "strengths": strengths_match.group(1).strip() if strengths_match else "",
        "weaknesses": weaknesses_match.group(1).strip() if weaknesses_match else "",
    }


@async_retry(max_retries=3, delay=5)
async def run_novelty_judge(hypothesis_content: str, max_retries: int = 3) -> dict:
    """
    Score the originality and quality of a hypothesis across 4 dimensions.
    Re-asks the judge (with a 2-second pause) while the reply has no parseable
    average score, up to max_retries requests. At least one request is always
    made, so max_retries <= 0 behaves like 1 instead of failing on unset locals.

    Args:
      hypothesis_content  text of the hypothesis to evaluate
      max_retries         maximum number of judge requests for this call

    Returns:
      avg_score    float  (1–10; 0.0 if no attempt produced a parseable score)
      d_scores     dict   {d1, d2, d3, d4}  (0 for a dimension not found)
      strengths    str
      weaknesses   str
      full_eval    str    (raw text of the last reply)
      tokens       int    (total across all attempts)
    """
    attempts = max(1, max_retries)
    total_tokens = 0
    content = ""
    parsed = _parse_novelty_response(content)

    for attempt in range(attempts):
        response = await judge_client.chat.completions.create(
            model=config["api"]["judge"]["model"],
            messages=[
                {"role": "system", "content": NOVELTY_JUDGE_SYSTEM},
                {"role": "user", "content": NOVELTY_JUDGE_TEMPLATE.format(
                    hypothesis_content=hypothesis_content,
                )},
            ],
            temperature=0.1,
        )
        content = response.choices[0].message.content or ""
        total_tokens += response.usage.total_tokens if response.usage else 0

        parsed = _parse_novelty_response(content)

        # If we got a valid score, stop retrying
        if parsed["avg_score"] > 0:
            break
        if attempt < attempts - 1:
            logger.warning("[Judge/Novelty] attempt %d/%d returned empty/unparseable, retrying...",
                           attempt + 1, attempts)
            await asyncio.sleep(2)

    d_scores = parsed["d_scores"]
    logger.info("[Judge/Novelty] avg=%.1f/10 (d1=%s d2=%s d3=%s d4=%s)",
                parsed["avg_score"], d_scores.get("d1"), d_scores.get("d2"),
                d_scores.get("d3"), d_scores.get("d4"))

    return {
        "avg_score": parsed["avg_score"],
        "d_scores": d_scores,
        "strengths": parsed["strengths"],
        "weaknesses": parsed["weaknesses"],
        "full_eval": content,
        "tokens": total_tokens,
    }
