import re
import math
import json
import logging
import asyncio
import functools
from typing import Union

from openai import AsyncOpenAI
from config import config

logger = logging.getLogger("ckm.engines")

# --- Clients ---

# Chat-completion client used by the engine coroutines in this file.
# Credentials, endpoint and timeout are read from config["api"]["llm"].
# max_retries=0 turns off the SDK's own retry layer — presumably so that the
# async_retry decorator defined in this file is the only retry mechanism
# (TODO confirm).
llm_client = AsyncOpenAI(
    api_key=config["api"]["llm"]["api_key"],
    base_url=config["api"]["llm"]["base_url"],
    timeout=config["api"]["llm"]["timeout_s"],
    max_retries=0,
)

# Embedding client used by get_embedding(); configured from
# config["api"]["embedding"], again with SDK-level retries disabled.
embed_client = AsyncOpenAI(
    api_key=config["api"]["embedding"]["api_key"],
    base_url=config["api"]["embedding"]["base_url"],
    timeout=config["api"]["embedding"]["timeout_s"],
    max_retries=0,
)

# --- Retry decorators ---


def async_retry(max_retries: int = 3, delay: float = 5):
    """Build a decorator that re-runs a failing coroutine function with backoff.

    Args:
        max_retries: Total number of attempts, the first call included.
            Values below 1 are treated as 1 so the wrapped function always
            runs at least once.
        delay: Seconds to sleep after the first failed attempt. The sleep
            doubles after every further failure and is capped at 120 seconds.

    Returns:
        A decorator for ``async def`` functions. The wrapped coroutine returns
        the first successful result; when the final attempt fails, its
        exception is logged and re-raised.
    """
    # range(1, max_retries + 1) is empty for max_retries <= 0, which would make
    # the wrapper return None without ever calling the wrapped function.
    total_attempts = max(1, max_retries)

    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            current_delay = delay
            for attempt in range(1, total_attempts + 1):
                try:
                    return await func(*args, **kwargs)
                except Exception as e:
                    if attempt == total_attempts:
                        logger.error(
                            "Async API call failed after %d attempts: %s: %s",
                            total_attempts,
                            type(e).__name__,
                            e,
                        )
                        raise
                    logger.warning(
                        "Async API call failed (attempt %d/%d), retrying in %ds: %s: %s",
                        attempt,
                        total_attempts,
                        current_delay,
                        type(e).__name__,
                        e,
                    )
                    await asyncio.sleep(current_delay)
                    current_delay = min(current_delay * 2, 120)  # exponential backoff, cap at 2min
        return wrapper
    return decorator


# --- Embedding & similarity ---

# In-memory memo of embeddings, keyed by the (truncated) text that was embedded.
_embedding_cache: dict[str, list[float]] = {}


@async_retry(max_retries=4, delay=10)
async def get_embedding(text: str) -> list[float]:
    """Return the embedding vector for ``text``, reusing cached results.

    Only the first 8000 characters of ``text`` are sent to the embedding
    endpoint; that same truncated string is the cache key. Failures are
    retried by the ``async_retry`` decorator.
    """
    truncated = text[:8000]
    try:
        return _embedding_cache[truncated]
    except KeyError:
        pass  # cache miss: fall through to the API call

    result = await embed_client.embeddings.create(
        model=config["api"]["embedding"]["model"],
        input=truncated,
        encoding_format="float",
    )
    embedding = result.data[0].embedding
    _embedding_cache[truncated] = embedding
    return embedding


def cosine_similarity(vec_a: list[float], vec_b: list[float]) -> float:
    """Return the cosine of the angle between two equal-length vectors.

    Vectors of different lengths, and vectors whose Euclidean norm is zero
    (empty vectors included), yield ``0.0`` instead of raising.
    """
    if len(vec_a) == len(vec_b):
        inner = sum(x * y for x, y in zip(vec_a, vec_b))
        magnitude_a = math.sqrt(sum(x * x for x in vec_a))
        magnitude_b = math.sqrt(sum(y * y for y in vec_b))
        if magnitude_a != 0 and magnitude_b != 0:
            return inner / (magnitude_a * magnitude_b)
    return 0.0


# --- Prompts ---

# System message for the per-paper extraction call made by run_read_engine().
READ_SYSTEM = (
    "You are a research paper analyst. "
    "Extract structured information strictly from the provided paper content — no hallucination. "
    "Be concise and precise. Cite the arxiv_id for every claim."
)

# User-message template for run_read_engine(), filled with str.format().
# Placeholders: title, arxiv_id, published, content.
# Doubled braces ({{ ... }}) are format escapes and reach the model as single
# literal braces around the per-section instructions.
READ_TEMPLATE = """\
Paper: {title}
ArXiv ID: {arxiv_id}
Published: {published}

{content}

Extract the following from this paper only:

## Core Method
{{The main technical contribution or approach (2-3 sentences: what was proposed, how it works)}}

## Key Findings
{{Main results and conclusions with quantitative details where available [arxiv_id]}}

## Field Status
{{What problem this paper addresses, what gap it fills, how it relates to prior work}}

## Open Questions
{{Limitations acknowledged or future directions suggested by the authors}}"""

# System message for the baseline-construction call made by run_init_engine().
INIT_SYSTEM = (
    "You are a research knowledge base builder executing Day 0 baseline construction. "
    "Read and extract from the provided paper content only — no hallucination. "
    "Cite arxiv_id for every factual claim. "
    "Do not ask questions; operate autonomously. "
    "Return valid JSON only."
)

# User-message template for run_init_engine(), filled with str.format().
# Placeholders: topic, n_papers, papers.
# The JSON schema is written with doubled braces so that str.format() emits
# literal { } characters. run_init_engine() reads the "research_goal",
# "cross_topic_links" and "topics" keys of the reply.
INIT_TEMPLATE = """\
Research topic: {topic}

The following {n_papers} papers cover foundational and recent work in this area:

{papers}

Extract core methods, key findings, and field status from each paper. \
Then construct the initial knowledge state as JSON using exactly this schema:

{{
  "research_goal": "One grounded paragraph",
  "cross_topic_links": ["link 1", "link 2"],
  "topics": [
    {{
      "name": "short topic name",
      "status": "Active",
      "key_papers": ["arxiv_id_1", "arxiv_id_2"],
      "known_methods": ["method summary with [arxiv_id] citation"],
      "findings": ["finding summary with [arxiv_id] citation"],
      "open_questions": ["open question"]
    }}
  ]
}}

Rules:
- Return valid JSON only, no markdown fences
- topics must be a non-empty list unless the papers truly contain no coherent subtopics
- Use 3 to 8 topics when possible
- status must be one of: Active, Emerging, Foundational
- key_papers should contain arXiv IDs only
- known_methods, findings, open_questions should each be concise lists
- Never use ArXiv IDs as topic names
- Do not fabricate any fact not present in the provided papers"""

# System message for the single-topic update call made by run_topic_update_engine().
TOPIC_UPDATE_SYSTEM = (
    "You are a research knowledge base updater. "
    "Update a single topic file based on new papers. "
    "Work strictly from the provided content — no hallucination. "
    "Cite arxiv_id for every claim."
)

# User-message template for run_topic_update_engine(), filled with str.format().
# Placeholders: topic_name, period (used twice), topic_content, papers_text.
# The model is asked to return the whole updated file body as plain text;
# run_topic_update_engine() passes that text through without parsing it.
TOPIC_UPDATE_TEMPLATE = """\
Topic file: {topic_name}
Period: {period}

Current content:
{topic_content}

New papers this period:
{papers_text}

Update this topic file by applying the following principles to EACH new paper:
- New finding/method absent from current content → append to the relevant section with full detail and citation [arxiv_id]
- Confirms an existing conclusion → add the new evidence source, note increased confidence [arxiv_id]
- Contradicts or refines an existing conclusion → rewrite that conclusion, preserve both positions with citations, mark [Revised: {period}]
- Cross-domain connection → note the link and suggest adjacent search terms

Size control: keep the file under 200 lines. When approaching the limit, compress older entries \
(merge similar conclusions, remove low-value items) — never drop key conclusions or citations.

If none of the new papers are relevant to this topic, output the file content unchanged.
Output the complete updated file content only — no commentary, no filename header."""

# System message for the new-topic detection call made by run_topic_discover_engine().
TOPIC_DISCOVER_SYSTEM = (
    "You are a research knowledge base curator. "
    "Identify whether new papers introduce subtopics genuinely absent from all existing topic files. "
    "Be conservative — only create a new file if the subtopic is clearly distinct. "
    "If all papers fit within existing topics, return an empty JSON topics list. "
    "Return valid JSON only."
)

# User-message template for run_topic_discover_engine(), filled with str.format().
# Placeholders: existing_topics (used twice), period, papers_text.
# Doubled braces are format escapes for the literal JSON braces; the reply's
# "topics" list is what run_topic_discover_engine() consumes.
TOPIC_DISCOVER_TEMPLATE = """\
Existing topic files: {existing_topics}

New papers this period ({period}):
{papers_text}

For each paper NOT adequately covered by any existing topic, propose ONE new topic entry. \
If all papers fit within existing topics, return {{"topics": []}}.

Use exactly this JSON schema:
{{
  "topics": [
    {{
      "name": "short topic name",
      "known_methods": ["method summary with [arxiv_id] citation"],
      "findings": ["finding summary with [arxiv_id] citation"],
      "open_questions": ["open question"]
    }}
  ]
}}

Rules:
- Return valid JSON only, no markdown fences
- Only create a topic if it is clearly distinct from ALL of: {existing_topics}
- Do not duplicate existing topics
- Keep topic names short and descriptive"""

# System message for the hypothesis-generation call made by
# run_baseline_hypothesis_engine().
BASELINE_HYPOTHESIS_SYSTEM = (
    "You are a research hypothesis generator for scientific forecasting. "
    "Generate only narrow, paper-matchable hypotheses that a single future paper could plausibly validate. "
    "A valid hypothesis must specify a concrete problem, method delta, target setting, baseline, "
    "observable outcome, and failure mode. "
    "Return 0 to N hypotheses depending on the evidence; do not pad the list. "
    "If the evidence supports only broad directions or vague ideas, return an empty list."
)

# User-message template for run_baseline_hypothesis_engine(), filled with
# str.format(). Placeholders: knowledge, period, papers_text, max_hypotheses.
# Doubled braces are format escapes for the literal JSON braces. The keys of
# each hypothesis object ("statement", "research_claim", "reasoning",
# "abstract", "source_papers", "trigger", "self_assessment") are the ones
# _render_hypothesis_markdown() reads back from the parsed reply.
BASELINE_HYPOTHESIS_TEMPLATE = """\
Current knowledge state:
{knowledge}

New papers ingested this period ({period}):
{papers_text}

Review the accumulated knowledge and this period's new papers, then ask yourself:
1. Are there recurring patterns or trends that remain unvalidated?
2. Do two or more independent findings combine to suggest a new direction?
3. Is there an obvious methodological gap that no existing work addresses?

Reminder:
- Prefer one coherent sub-problem from the provided evidence.
- Do not stitch together unrelated topics just because they co-occur in the context.

Only output a hypothesis if ALL of the following are true:
- It is narrow enough that one future paper could validate it.
- It names a concrete task/problem setting.
- It specifies a method delta relative to an explicit baseline or comparator.
- It states an observable outcome that could be measured in a paper.
- It includes a realistic failure mode or boundary condition.

Reject vague claims such as "improve safety", "enhance reliability", "improve usability",
or "improve performance" unless you also specify:
- on what task or benchmark,
- compared with what baseline,
- and by what measurable observable.

Return valid JSON only using exactly this schema:
{{
  "hypotheses": [
    {{
      "statement": "1-2 sentences. Must name the problem setting, method delta, baseline/comparator, and expected observable outcome.",
      "research_claim": {{
        "problem": "The exact research problem or task",
        "method_delta": "What new mechanism / architectural change / training strategy is introduced",
        "target_setting": "Dataset / benchmark / environment / deployment setting",
        "baseline": "What existing method, workflow, or class of systems this is compared against",
        "expected_observable": "What measurable outcome should improve, remain stable, or trade off",
        "evaluation_plan": "How a future paper would test this claim",
        "failure_mode": "When this hypothesis would likely fail or not generalize"
      }},
      "reasoning": "2-3 paragraphs explaining the evidence chain",
      "abstract": "One arXiv-style abstract paragraph in plain English, 100-200 words",
      "source_papers": [
        {{
          "arxiv_id": "arxiv_id_1",
          "title": "paper title",
          "insight": "what insight it contributed to this hypothesis"
        }},
        {{
          "arxiv_id": "arxiv_id_2",
          "title": "paper title",
          "insight": "what insight it contributed to this hypothesis"
        }}
      ],
      "trigger": {{
        "type": "GAP / BRIDGE / TREND / CONTRADICTION",
        "source": "The specific knowledge point or diff entry that triggered this hypothesis"
      }},
      "self_assessment": {{
        "novelty": {{"score": 1, "justification": "brief justification"}},
        "feasibility": {{"score": 1, "justification": "brief justification"}},
        "impact": {{"score": 1, "justification": "brief justification"}}
      }}
    }}
  ]
}}

Rules:
- Return 0 to {max_hypotheses} hypotheses, strongest first
- Return fewer hypotheses if only fewer are truly well-grounded
- Do not output placeholders, duplicates, or minor variants of the same idea
- source_papers must cite at least 2 real arXiv IDs drawn from the provided context
- Return JSON only, no markdown fences, no prose outside the JSON"""

# "## <name>" headings that _validate_hypothesis_content() requires to be
# present and non-empty in a rendered hypothesis document.
HYPOTHESIS_REQUIRED_SECTIONS = [
    "Statement",
    "Research Claim",
    "Reasoning",
    "Abstract",
    "Source Papers",
    "Trigger",
    "Self-Assessment",
]
# "- <Field>: value" bullets that _validate_hypothesis_content() requires
# inside the "Research Claim" section.
HYPOTHESIS_REQUIRED_FIELDS = [
    "Problem",
    "Method Delta",
    "Target Setting",
    "Baseline",
    "Expected Observable",
    "Evaluation Plan",
    "Failure Mode",
]
# Regexes for generic phrasing. _validate_hypothesis_content() searches the
# statement for them case-insensitively and reports the statement as too broad
# when one matches and no measurement anchor is found.
HYPOTHESIS_VAGUE_PATTERNS = [
    r"\bimprov(?:e|es|ing)\s+(?:safety|reliability|usability|performance|trust|quality)\b",
    r"\benhanc(?:e|es|ing)\s+(?:safety|reliability|usability|performance|trust|quality)\b",
    r"\bpromising\s+(?:direction|avenue)\b",
    r"\bresponsible evolution\b",
    r"\buser trust\b",
]


def _extract_markdown_section(content: str, section_name: str) -> str:
    pattern = rf"(?m)^## {re.escape(section_name)}\s*([\s\S]*?)(?=^## |\Z)"
    match = re.search(pattern, content)
    return match.group(1).strip() if match else ""


def _extract_markdown_key_values(section_content: str) -> dict[str, str]:
    values = {}
    for line in section_content.splitlines():
        match = re.match(r"^\s*-\s*([^:]+):\s*(.+?)\s*$", line)
        if match:
            values[match.group(1).strip()] = match.group(2).strip()
    return values


def _has_measurement_anchor(text: str) -> bool:
    lowered = text.lower()
    patterns = [
        r"\b(compared with|compared to|vs\.?|against|relative to|over baseline|under)\b",
        r"\b(accuracy|f1|f-1|bleu|rouge|pass@k|success rate|win rate|latency|throughput|precision|recall|auc|auroc|score|benchmark|dataset|exact match|jailbreak success|task success)\b",
        r"(>=|<=|>|<|±)",
        r"\d",
    ]
    return any(re.search(pattern, lowered) for pattern in patterns)


def _validate_hypothesis_content(content: str, finish_reason: str) -> list[str]:
    """Collect the reasons a rendered hypothesis document is unacceptable.

    Args:
        content: Hypothesis markdown with ``## <Section>`` headings.
        finish_reason: Finish reason reported for the model output;
            ``"length"`` is reported as a truncation error.

    Returns:
        Human-readable error strings, in the order the checks run: truncation,
        missing sections, missing research-claim fields, missing
        comparator/observable, overly generic statement, too few source
        references. An empty list means every check passed.
    """
    problems: list[str] = []

    if finish_reason == "length":
        problems.append("model output was truncated")

    problems.extend(
        f"missing section: {heading}"
        for heading in HYPOTHESIS_REQUIRED_SECTIONS
        if not _extract_markdown_section(content, heading)
    )

    statement = _extract_markdown_section(content, "Statement")
    claim_fields = _extract_markdown_key_values(
        _extract_markdown_section(content, "Research Claim")
    )
    problems.extend(
        f"missing claim field: {required}"
        for required in HYPOTHESIS_REQUIRED_FIELDS
        if not claim_fields.get(required)
    )

    # The statement plus the three claim fields that should carry a comparator
    # or metric form the text searched for a measurement anchor.
    evidence_parts = [statement]
    for claim_key in ("Baseline", "Expected Observable", "Evaluation Plan"):
        evidence_parts.append(claim_fields.get(claim_key, ""))
    evidence_text = " ".join(part for part in evidence_parts if part)
    lacks_anchor = not _has_measurement_anchor(evidence_text)

    if evidence_text and lacks_anchor:
        problems.append("missing explicit comparator or measurable observable")

    if statement and lacks_anchor:
        for vague_pattern in HYPOTHESIS_VAGUE_PATTERNS:
            if re.search(vague_pattern, statement, flags=re.IGNORECASE):
                problems.append("statement is too broad or generic")
                break

    # Every bracketed token in the Source Papers section counts as a reference.
    source_section = _extract_markdown_section(content, "Source Papers")
    if len(re.findall(r"\[[^\]]+\]", source_section)) < 2:
        problems.append("need at least two source paper references")

    return problems


def _sanitize_text(value: object) -> str:
    return str(value or "").strip()


def _coerce_score(value: object) -> int:
    try:
        score = int(value)
    except (TypeError, ValueError):
        return 1
    return max(1, min(5, score))


def _normalize_statement_key(text: str) -> str:
    return re.sub(r"\W+", " ", (text or "").lower()).strip()


def _render_source_papers(source_papers: list[dict]) -> str:
    lines = []
    for paper in source_papers:
        if not isinstance(paper, dict):
            continue
        arxiv_id = _sanitize_text(paper.get("arxiv_id"))
        title = _sanitize_text(paper.get("title"))
        insight = _sanitize_text(paper.get("insight"))
        if not arxiv_id:
            continue
        line = f"- [{arxiv_id}]"
        if title:
            line += f": {title}"
        if insight:
            line += f" — {insight}"
        lines.append(line)
    return "\n".join(lines) if lines else "- [missing-source-1]: source paper missing"


def _render_self_assessment(self_assessment: dict) -> str:
    """Render the novelty/feasibility/impact ratings as three bullet lines.

    Each line reads ``- <Label>: <score> — <justification>``. Scores go
    through ``_coerce_score``; a missing or malformed entry renders with the
    justification "not provided".
    """
    ratings = self_assessment if isinstance(self_assessment, dict) else {}
    bullets = []
    for dimension in ("novelty", "feasibility", "impact"):
        rating = ratings.get(dimension, {})
        if not isinstance(rating, dict):
            rating = {}
        score = _coerce_score(rating.get("score"))
        justification = _sanitize_text(rating.get("justification")) or "not provided"
        bullets.append(f"- {dimension.capitalize()}: {score} — {justification}")
    return "\n".join(bullets)


def _render_hypothesis_markdown(candidate: dict, hyp_id_string: str) -> str:
    """Render one parsed hypothesis candidate as a markdown document.

    The output starts with ``# Hypothesis H<hyp_id_string>`` followed by the
    sections Statement, Research Claim, Reasoning, Abstract, Source Papers,
    Trigger and Self-Assessment. Missing values render as empty text.
    """
    def _dict_field(key: str) -> dict:
        # Nested objects that are absent or not dicts are treated as empty.
        value = candidate.get(key)
        return value if isinstance(value, dict) else {}

    research_claim = _dict_field("research_claim")
    trigger = _dict_field("trigger")
    self_assessment = _dict_field("self_assessment")

    claim_rows = (
        ("Problem", "problem"),
        ("Method Delta", "method_delta"),
        ("Target Setting", "target_setting"),
        ("Baseline", "baseline"),
        ("Expected Observable", "expected_observable"),
        ("Evaluation Plan", "evaluation_plan"),
        ("Failure Mode", "failure_mode"),
    )

    parts = [
        f"# Hypothesis H{hyp_id_string}",
        "## Statement",
        _sanitize_text(candidate.get("statement")),
        "## Research Claim",
    ]
    parts.extend(
        f"- {label}: {_sanitize_text(research_claim.get(key))}"
        for label, key in claim_rows
    )
    parts.extend([
        "## Reasoning",
        _sanitize_text(candidate.get("reasoning")),
        "## Abstract",
        _sanitize_text(candidate.get("abstract")),
        "## Source Papers",
        _render_source_papers(candidate.get("source_papers", [])),
        "## Trigger",
        f"- Type: {_sanitize_text(trigger.get('type'))}",
        f"- Source: {_sanitize_text(trigger.get('source'))}",
        "## Self-Assessment",
        _render_self_assessment(self_assessment),
    ])
    return "\n".join(parts).strip()


def _extract_hypothesis_candidates(content: str) -> list[dict]:
    """Parse model output and return the dict entries of its ``hypotheses`` list.

    Raises:
        ValueError: If the output cannot be parsed as JSON (raised by
            ``_extract_json_payload``) or ``hypotheses`` is not a list.
    """
    raw_hypotheses = _extract_json_payload(content).get("hypotheses", [])
    if isinstance(raw_hypotheses, list):
        # Non-dict entries are dropped silently.
        return [entry for entry in raw_hypotheses if isinstance(entry, dict)]
    raise ValueError("HypothesisEngine hypotheses field must be a list")


# --- Filename parsing helper ---


def parse_file_operations(content: str) -> list:
    """Split LLM output into ``{"fileName", "fileContent"}`` operations.

    Two layouts are recognised. If any ``<file name="...">...</file>`` element
    is present, only those elements are used. Otherwise every line consisting
    of ``[filename]`` opens a block that runs until the next such line.

    File names are lower-cased, characters outside ``a-z0-9_-.`` become
    hyphens (runs collapsed), and ``.md`` is appended when missing. Names that
    look like arXiv ids or consist only of digits, underscores and ``v`` are
    replaced by ``misc_unnamed_concepts.md``. Blocks with empty content are
    dropped.
    """
    xml_blocks = list(
        re.finditer(r"<file\s+name=\"([^\"]+)\">\s*([\s\S]*?)\s*</file>", content)
    )
    if xml_blocks:
        raw_pairs = [(block.group(1), block.group(2)) for block in xml_blocks]
    else:
        headers = list(re.finditer(r"(?m)^\[([^\]\n]+)\]\s*$", content))
        # Each header's body ends where the next header starts (or at EOF).
        body_ends = [header.start() for header in headers[1:]] + [len(content)]
        raw_pairs = [
            (header.group(1), content[header.end():stop])
            for header, stop in zip(headers, body_ends)
        ]

    operations = []
    for raw_name, raw_body in raw_pairs:
        body = raw_body.strip()

        name = raw_name.strip().lower()
        name = re.sub(r"[^a-z0-9_\-.]", "-", name)
        name = re.sub(r"-+", "-", name)
        if not name.endswith(".md"):
            name += ".md"

        looks_like_id = re.search(r"\d{4}_\d{4,5}", name) or re.match(
            r"^[\d_v]+(\.md)?$", name
        )
        if looks_like_id:
            name = "misc_unnamed_concepts.md"

        if name and body:
            operations.append({"fileName": name, "fileContent": body})

    return operations


def _fix_json_escapes(text: str) -> str:
    """Fix invalid JSON escape sequences (e.g. \\n inside strings that should be \\\\n)."""
    # Replace invalid \x escapes with \\x (but keep valid ones: \n \t \r \b \f \\ \" \/)
    return re.sub(r'\\(?![ntrfb\\/"])', r'\\\\', text)


def _extract_json_payload(content: str) -> dict:
    text = (content or "").strip()
    if not text:
        raise ValueError("Empty model output")

    if text.startswith("```"):
        fence_match = re.search(r"```(?:json)?\s*([\s\S]*?)```", text)
        if fence_match:
            text = fence_match.group(1).strip()

    # Try parsing as-is first, then with escape fixes
    for attempt_text in [text, _fix_json_escapes(text)]:
        try:
            return json.loads(attempt_text)
        except json.JSONDecodeError:
            start = attempt_text.find("{")
            end = attempt_text.rfind("}")
            if start != -1 and end != -1 and end > start:
                try:
                    return json.loads(attempt_text[start:end + 1])
                except json.JSONDecodeError:
                    continue
    raise ValueError(f"Failed to parse JSON from model output ({len(text)} chars)")


def _slugify_topic_name(name: str) -> str:
    slug = name.strip().lower()
    slug = re.sub(r"[^a-z0-9]+", "-", slug)
    slug = re.sub(r"-+", "-", slug).strip("-")
    return slug or "unnamed-topic"


def _normalize_status(value: str) -> str:
    normalized = (value or "").strip().lower()
    mapping = {
        "active": "Active",
        "emerging": "Emerging",
        "foundational": "Foundational",
    }
    return mapping.get(normalized, "Active")


def _dedupe_preserve_order(items: list[str]) -> list[str]:
    if not isinstance(items, list):
        return []
    seen = set()
    results = []
    for item in items:
        clean = str(item or "").strip()
        if clean and clean not in seen:
            seen.add(clean)
            results.append(clean)
    return results


def _format_bullet_list(items: list[str]) -> str:
    """Render ``items`` as deduplicated ``- item`` lines, or ``- None`` when empty."""
    bullets = [f"- {entry}" for entry in _dedupe_preserve_order(items)]
    return "\n".join(bullets) if bullets else "- None"


def _render_index_markdown(topic: str, research_goal: str, cross_topic_links: list[str], topics: list[dict], n_papers: int) -> str:
    """Render the knowledge-base index document.

    The document contains the title, the research goal (with a fallback
    sentence when empty), a table with one row per topic (name, key papers,
    status), the cross-topic links as bullets and a one-entry timeline naming
    ``n_papers``.

    Each element of ``topics`` must provide ``"name"``; ``"key_papers"`` and
    ``"status"`` default to ``None`` (shown as text) and ``Active``.
    """
    goal_text = str(research_goal or "").strip() or "No research goal extracted."

    table_rows = []
    for entry in topics:
        papers_cell = ", ".join(entry.get("key_papers", [])) or "None"
        status_cell = entry.get("status", "Active")
        table_rows.append(f"| {entry['name']} | {papers_cell} | {status_cell} |")

    document = [
        f"# Knowledge Base: {topic}",
        "## Research Goal",
        goal_text,
        "## Topics",
        "| Topic | Key papers | Status |",
        "|-------|-----------|--------|",
        *table_rows,
        "## Cross-topic Links",
        _format_bullet_list(cross_topic_links),
        "## Timeline",
        f"- Day 0: Baseline constructed from {n_papers} papers",
    ]
    return "\n".join(document).strip()


def _render_topic_markdown(topic_obj: dict) -> str:
    """Render a topic dict as three bullet-list sections separated by blank lines.

    The sections are Known Methods, Key Papers & Findings and Open Questions,
    read from the ``known_methods``, ``findings`` and ``open_questions`` keys.
    """
    layout = (
        ("## Known Methods", "known_methods"),
        ("## Key Papers & Findings", "findings"),
        ("## Open Questions", "open_questions"),
    )
    sections = [
        f"{heading}\n{_format_bullet_list(topic_obj.get(key, []))}"
        for heading, key in layout
    ]
    return "\n\n".join(sections).strip()


def _build_topic_operations(topic_payloads: list[dict], include_status: bool = True) -> list[dict]:
    """Turn model-supplied topic objects into topic-file write operations.

    Args:
        topic_payloads: Topic dicts from parsed model JSON. Entries that are
            not dicts or have no name are skipped.
        include_status: When true the status comes from the payload (via
            ``_normalize_status``); otherwise every topic is ``"Emerging"``.

    Returns:
        One dict per kept topic with ``fileName`` (``topic-<slug>.md``, with a
        numeric suffix added when the name is already taken in this batch),
        ``fileContent`` (rendered markdown) and ``topic`` (the normalized
        topic dict).
    """
    list_fields = ("key_papers", "known_methods", "findings", "open_questions")
    taken_names = set()
    built = []

    for raw_topic in topic_payloads:
        if not isinstance(raw_topic, dict):
            continue
        name = (raw_topic.get("name") or "").strip()
        if not name:
            continue

        # Pick the first free file name: topic-x.md, topic-x-2.md, topic-x-3.md, ...
        slug = _slugify_topic_name(name)
        file_name = f"topic-{slug}.md"
        next_suffix = 2
        while file_name in taken_names:
            file_name = f"topic-{slug}-{next_suffix}.md"
            next_suffix += 1
        taken_names.add(file_name)

        if include_status:
            status = _normalize_status(raw_topic.get("status", "Active"))
        else:
            status = "Emerging"

        normalized_topic = {"name": name, "status": status}
        for field in list_fields:
            normalized_topic[field] = _dedupe_preserve_order(raw_topic.get(field, []))

        built.append({
            "fileName": file_name,
            "fileContent": _render_topic_markdown(normalized_topic),
            "topic": normalized_topic,
        })

    return built


# --- Engine functions ---


@async_retry(max_retries=3, delay=5)
async def run_read_engine(title: str, arxiv_id: str, published: str, content: str) -> str:
    """Extract core method, key findings, field status and open questions from one paper.

    Sends ``READ_TEMPLATE`` (filled with the arguments) to the chat model and
    returns the model's text prefixed with a small header carrying the title,
    arXiv id and publication date. A truncated reply is logged and returned
    as-is.
    """
    chat_messages = [
        {"role": "system", "content": READ_SYSTEM},
        {
            "role": "user",
            "content": READ_TEMPLATE.format(
                title=title, arxiv_id=arxiv_id, published=published, content=content,
            ),
        },
    ]
    completion = await llm_client.chat.completions.create(
        model=config["api"]["llm"]["model"],
        messages=chat_messages,
        temperature=0.2,
    )

    first_choice = completion.choices[0]
    if first_choice.finish_reason == "length":
        logger.warning("[ReadEngine] %s: Output truncated (finish_reason=length).", arxiv_id)

    extraction = first_choice.message.content or ""
    logger.info("[ReadEngine] Extracted %s (%d chars)", arxiv_id, len(extraction))
    return f"### {title}\n- arxiv_id: {arxiv_id}\n- published: {published}\n\n{extraction}"


@async_retry(max_retries=5, delay=10)
async def run_init_engine(topic: str, papers_text: str, n_papers: int) -> dict:
    """Phase 1: synthesize the papers into a structured knowledge base from scratch.

    Args:
        topic: Research topic; used in the prompt and as the index title.
        papers_text: Text of the papers, inserted into ``INIT_TEMPLATE``.
        n_papers: Number of papers, used in the prompt and the index timeline.

    Returns:
        A dict with ``operations`` (the ``_index.md`` operation followed by
        one operation per topic file, each with ``fileName`` and
        ``fileContent``), ``tokens`` (total tokens reported by the API, 0 when
        absent), ``raw_content`` (the unparsed model output) and ``error``
        (always ``None`` on this success path).

    Raises:
        ValueError: If the model output yields no topic files. This and any
            parse/render exception propagate so that ``async_retry`` repeats
            the whole call.
    """
    prompt = INIT_TEMPLATE.format(
        topic=topic, papers=papers_text, n_papers=n_papers,
    )
    response = await llm_client.chat.completions.create(
        model=config["api"]["llm"]["model"],
        messages=[
            {"role": "system", "content": INIT_SYSTEM},
            {"role": "user", "content": prompt},
        ],
        temperature=0.2,
    )

    choice = response.choices[0]
    content = choice.message.content or ""
    tokens = response.usage.total_tokens if response.usage else 0
    if choice.finish_reason == "length":
        logger.warning("[InitEngine] Output truncated (finish_reason=length).")

    try:
        payload = _extract_json_payload(content)
        topic_payloads = payload.get("topics", [])
        if not isinstance(topic_payloads, list):
            raise ValueError("InitEngine topics field must be a list")

        topic_ops = _build_topic_operations(topic_payloads, include_status=True)
        index_content = _render_index_markdown(
            topic=topic,
            research_goal=payload.get("research_goal", ""),
            cross_topic_links=payload.get("cross_topic_links", []),
            topics=[op["topic"] for op in topic_ops],
            n_papers=n_papers,
        )
    except Exception as exc:
        logger.warning("[InitEngine] JSON parse/render failed: %s (will retry)", exc)
        raise  # Let @async_retry handle it

    # An index without any topic file is useless; fail so the call is retried.
    if not topic_ops:
        logger.warning("[InitEngine] No topic files produced (will retry)")
        raise ValueError("InitEngine returned no topic files")

    operations = [{"fileName": "_index.md", "fileContent": index_content}] + [
        {"fileName": op["fileName"], "fileContent": op["fileContent"]} for op in topic_ops
    ]

    logger.info("[InitEngine] topic=%s, papers=%d, files=%d, tokens=%d",
                topic, n_papers, len(operations), tokens)

    return {"operations": operations, "tokens": tokens, "raw_content": content, "error": None}



@async_retry(max_retries=5, delay=10)
async def run_topic_update_engine(
    topic_name: str, topic_content: str, papers_text: str, time_period: Union[str, int]
) -> dict:
    """Ask the model to rewrite one topic file in light of this period's papers.

    Returns:
        A dict with ``fileName`` (``topic_name`` unchanged), ``fileContent``
        (the model's reply, ``""`` when it returned nothing) and ``tokens``
        (total tokens reported by the API, 0 when absent). A truncated reply
        is logged and still returned.
    """
    chat_messages = [
        {"role": "system", "content": TOPIC_UPDATE_SYSTEM},
        {
            "role": "user",
            "content": TOPIC_UPDATE_TEMPLATE.format(
                topic_name=topic_name, topic_content=topic_content,
                papers_text=papers_text, period=time_period,
            ),
        },
    ]
    completion = await llm_client.chat.completions.create(
        model=config["api"]["llm"]["model"],
        messages=chat_messages,
        temperature=0.2,
    )

    first_choice = completion.choices[0]
    updated_text = first_choice.message.content or ""
    tokens = completion.usage.total_tokens if completion.usage else 0
    if first_choice.finish_reason == "length":
        logger.warning("[TopicUpdate] %s period=%s: Output truncated.", topic_name, time_period)
    logger.info("[TopicUpdate] %s period=%s, tokens=%d", topic_name, time_period, tokens)

    return {"fileName": topic_name, "fileContent": updated_text, "tokens": tokens}


@async_retry(max_retries=3, delay=5)
async def run_topic_discover_engine(
    existing_topics: list[str], papers_text: str, time_period: Union[str, int]
) -> dict:
    """Ask the model whether the new papers call for topic files that do not exist yet.

    Returns:
        A dict with ``operations`` (``fileName``/``fileContent`` dicts for
        proposed topics whose file name is not in ``existing_topics``) and
        ``tokens`` (total tokens reported by the API, 0 when absent). If the
        reply cannot be parsed or rendered, the failure is logged and
        ``operations`` is empty; it is not raised.
    """
    chat_messages = [
        {"role": "system", "content": TOPIC_DISCOVER_SYSTEM},
        {
            "role": "user",
            "content": TOPIC_DISCOVER_TEMPLATE.format(
                existing_topics=", ".join(existing_topics),
                papers_text=papers_text,
                period=time_period,
            ),
        },
    ]
    completion = await llm_client.chat.completions.create(
        model=config["api"]["llm"]["model"],
        messages=chat_messages,
        temperature=0.2,
    )

    first_choice = completion.choices[0]
    raw_output = first_choice.message.content or ""
    tokens = completion.usage.total_tokens if completion.usage else 0
    if first_choice.finish_reason == "length":
        logger.warning("[TopicDiscover] period=%s: Output truncated.", time_period)
    logger.info("[TopicDiscover] period=%s, tokens=%d", time_period, tokens)

    proposed_ops = []
    try:
        proposed_topics = _extract_json_payload(raw_output).get("topics", [])
        if not isinstance(proposed_topics, list):
            raise ValueError("TopicDiscover topics field must be a list")
        # Discovered topics carry no status from the model (include_status=False).
        proposed_ops = [
            {"fileName": built["fileName"], "fileContent": built["fileContent"]}
            for built in _build_topic_operations(proposed_topics, include_status=False)
        ]
    except Exception as exc:
        # Best effort: a malformed reply means "no new topics" for this period.
        logger.warning("[TopicDiscover] JSON parse/render failed: %s", exc)

    new_ops = [
        candidate for candidate in proposed_ops
        if candidate["fileName"] not in existing_topics
    ]
    logger.info("[TopicDiscover] period=%s: %d new topic(s)", time_period, len(new_ops))
    return {"operations": new_ops, "tokens": tokens}


@async_retry(max_retries=3, delay=5)
async def run_baseline_hypothesis_engine(
    knowledge_content: str, papers_text: str, time_period: Union[str, int], max_hypotheses: int
) -> dict:
    """Generate hypotheses from the knowledge state and new papers in one LLM call.

    When the API rejects the request with ``context_length_exceeded``, the longer
    of ``knowledge_content`` / ``papers_text`` is cut to 80% and the call is
    repeated, for at most three truncations. Any other ``BadRequestError``, or an
    overflow on the fourth attempt, is re-raised.

    Args:
        knowledge_content: Current knowledge text placed in the prompt.
        papers_text: Text of the new papers placed in the prompt.
        time_period: Label of the period (prompt and log lines).
        max_hypotheses: Upper bound on how many candidates are examined.

    Returns:
        A dict with ``hypotheses`` (accepted candidates), ``rejections``
        (candidates that failed validation, with the reason), ``tokens``,
        ``raw_content`` (the unparsed model reply) and ``error`` (the parse
        failure message, or ``""``).
    """
    from openai import BadRequestError

    for overflow_round in range(4):
        user_prompt = BASELINE_HYPOTHESIS_TEMPLATE.format(
            knowledge=knowledge_content,
            papers_text=papers_text,
            period=time_period,
            max_hypotheses=max_hypotheses,
        )
        try:
            completion = await llm_client.chat.completions.create(
                model=config["api"]["llm"]["model"],
                messages=[
                    {"role": "system", "content": BASELINE_HYPOTHESIS_SYSTEM},
                    {"role": "user", "content": user_prompt},
                ],
                temperature=0.2,
            )
        except BadRequestError as err:
            overflowed = "context_length_exceeded" in str(err)
            if not (overflowed and overflow_round < 3):
                raise
            logger.warning("[Baseline-Hypothesis] Context overflow (round %d), truncating to 80%%", overflow_round + 1)
            knowledge_content, papers_text = _truncate_to_fit(knowledge_content, papers_text, 0.8)
        else:
            break

    first_choice = completion.choices[0]
    raw_text = first_choice.message.content or ""
    total_tokens = completion.usage.total_tokens if completion.usage else 0
    finish_reason = first_choice.finish_reason
    if finish_reason == "length":
        logger.warning("[HypothesisEngine] period=%s: Output truncated (finish_reason=length).", time_period)
    logger.info("[HypothesisEngine] period=%s, tokens=%d", time_period, total_tokens)

    parse_error = ""
    try:
        # A reply that starts with NULL means the model chose to propose nothing.
        declined = raw_text.strip().upper().startswith("NULL")
        candidates = [] if declined else _extract_hypothesis_candidates(raw_text)
    except Exception as exc:
        parse_error = str(exc)
        logger.warning("[HypothesisEngine] period=%s: JSON parse failed: %s", time_period, exc)
        candidates = []

    kept = []
    dropped = []
    seen_keys = set()

    for position, candidate in enumerate(candidates[:max_hypotheses], start=1):
        statement = _sanitize_text(candidate.get("statement"))
        key = _normalize_statement_key(statement)
        markdown = _render_hypothesis_markdown(candidate, f"{position:03d}")
        problems = _validate_hypothesis_content(markdown, finish_reason or "")

        # Statement-level checks are added on top of the content validation.
        if not key:
            problems.append("missing hypothesis statement")
        elif key in seen_keys:
            problems.append("duplicate hypothesis statement")
        else:
            seen_keys.add(key)

        if not problems:
            kept.append({
                "candidate_index": position,
                "statement": statement,
                "content": markdown,
            })
            continue

        reason = "; ".join(problems)
        logger.warning(
            "[HypothesisEngine] period=%s candidate=%d: rejected low-quality hypothesis: %s",
            time_period,
            position,
            reason,
        )
        dropped.append({
            "candidate_index": position,
            "content": markdown,
            "rejection_reason": reason,
        })

    return {
        "hypotheses": kept,
        "rejections": dropped,
        "tokens": total_tokens,
        "raw_content": raw_text,
        "error": parse_error,
    }


# --- Trigger detection engine ---

# System message sent by run_trigger_detection. It frames the model as an
# analyst describing the latest knowledge update and demands JSON-only output.
TRIGGER_SYSTEM = (
    "You are a research intelligence analyst monitoring a knowledge base for changes. "
    "Your job is to characterize what changed in the latest knowledge update — "
    "what type of change occurred and what is notable. Be factual and precise. "
    "Return valid JSON only."
)

# User prompt for run_trigger_detection, filled in with str.format().
# Placeholders: {knowledge_before}, {knowledge_after}, {period}, {papers_text},
# {windows_since_trigger}. The doubled braces around the JSON schema are
# str.format escapes and reach the model as single literal braces.
TRIGGER_TEMPLATE = """\
## Knowledge State BEFORE This Window
{knowledge_before}

## Knowledge State AFTER This Window
{knowledge_after}

## New Papers This Window ({period})
{papers_text}

## Cumulative Windows Processed: {windows_since_trigger} windows since last hypothesis generation

Analyze the knowledge change and characterize it. Classify using these types:
1. **INCREMENTAL**: Routine extension — new results consistent with existing knowledge
2. **CONTRADICTION**: A new paper contradicts or significantly revises an established finding
3. **CONVERGENCE**: Multiple independent papers (possibly across windows) now point to the same conclusion
4. **BRIDGE**: A connection between two previously unrelated topics has emerged
5. **TREND_CONFIRMED**: A pattern observed in earlier windows is now validated by new evidence

Return JSON using this schema:
{{
  "change_type": "INCREMENTAL / CONTRADICTION / CONVERGENCE / BRIDGE / TREND_CONFIRMED",
  "reason": "1-2 sentences: what specifically changed and why it matters",
  "key_changes": ["change 1", "change 2"]
}}

Rules:
- Be factual — report what actually changed, do not speculate
- If nothing notable changed, use INCREMENTAL
- Return JSON only, no markdown fences"""


@async_retry(max_retries=3, delay=5)
async def run_trigger_detection(
    knowledge_before: str,
    knowledge_after: str,
    papers_text: str,
    period: str,
    windows_since_trigger: int,
) -> dict:
    """Characterize what changed in this window's knowledge update.

    Always returns a change type: when the model's reply cannot be parsed, or
    names a type outside the five the prompt defines, the result falls back to
    ``INCREMENTAL``.

    On a ``context_length_exceeded`` ``BadRequestError`` all three text inputs
    are cut to 80% of their length and the call is repeated, for at most three
    truncations. Other ``BadRequestError``s, or an overflow on the fourth
    attempt, are re-raised.

    Args:
        knowledge_before: Knowledge text before this window.
        knowledge_after: Knowledge text after this window.
        papers_text: Text of the papers ingested in this window.
        period: Label of the window (prompt and log lines).
        windows_since_trigger: Number of windows since the last hypothesis
            generation, shown to the model in the prompt.

    Returns:
        ``{"change_type": str, "reason": str, "key_changes": list[str],
        "tokens": int}``. ``change_type`` is one of INCREMENTAL, CONTRADICTION,
        CONVERGENCE, BRIDGE or TREND_CONFIRMED.
    """
    from openai import BadRequestError

    # The five labels TRIGGER_TEMPLATE asks the model to choose from.
    valid_change_types = (
        "INCREMENTAL",
        "CONTRADICTION",
        "CONVERGENCE",
        "BRIDGE",
        "TREND_CONFIRMED",
    )

    for truncation_round in range(4):
        prompt = TRIGGER_TEMPLATE.format(
            knowledge_before=knowledge_before,
            knowledge_after=knowledge_after,
            papers_text=papers_text,
            period=period,
            windows_since_trigger=windows_since_trigger,
        )
        try:
            response = await llm_client.chat.completions.create(
                model=config["api"]["llm"]["model"],
                messages=[
                    {"role": "system", "content": TRIGGER_SYSTEM},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
            )
            break
        except BadRequestError as e:
            if "context_length_exceeded" in str(e) and truncation_round < 3:
                logger.warning("[TriggerDetection] Context overflow (round %d), truncating", truncation_round + 1)
                knowledge_before = knowledge_before[:int(len(knowledge_before) * 0.8)]
                knowledge_after = knowledge_after[:int(len(knowledge_after) * 0.8)]
                papers_text = papers_text[:int(len(papers_text) * 0.8)]
                continue
            raise

    choice = response.choices[0]
    content = choice.message.content or ""
    tokens = response.usage.total_tokens if response.usage else 0

    # Defaults double as the fallback for anything the reply gets wrong.
    result = {
        "change_type": "INCREMENTAL",
        "reason": "",
        "key_changes": [],
        "tokens": tokens,
    }

    try:
        payload = _extract_json_payload(content)
        if not isinstance(payload, dict):
            raise ValueError("trigger payload must be a JSON object")

        # Accept only a known label (tolerating case, padding and space/hyphen
        # separators); a JSON null or an unknown label keeps the default
        # instead of leaking strings such as "None" to callers.
        raw_type = payload.get("change_type")
        normalized_type = ""
        if raw_type is not None:
            normalized_type = str(raw_type).strip().upper().replace("-", "_").replace(" ", "_")
        if normalized_type in valid_change_types:
            result["change_type"] = normalized_type
        else:
            logger.warning(
                "[TriggerDetection] %s: unrecognized change_type %r, defaulting to INCREMENTAL",
                period, raw_type,
            )

        raw_reason = payload.get("reason")
        result["reason"] = "" if raw_reason is None else str(raw_reason)

        # key_changes is documented as a list of strings; coerce the common
        # near-misses (a bare string, null entries) into that shape.
        raw_changes = payload.get("key_changes")
        if isinstance(raw_changes, str):
            raw_changes = [raw_changes]
        if isinstance(raw_changes, list):
            result["key_changes"] = [str(item) for item in raw_changes if item is not None]
    except Exception as exc:
        logger.warning("[TriggerDetection] %s: JSON parse failed: %s", period, exc)

    logger.info(
        "[TriggerDetection] %s: change_type=%s reason=%s",
        period, result["change_type"], result["reason"][:80],
    )
    return result


# --- Evolution-aware hypothesis engine ---

# System message sent by run_ckm_hypothesis_engine. It tells the model it has
# tracked the field across periods and must answer with JSON only.
CKM_HYPOTHESIS_SYSTEM = (
    "You are a research hypothesis generator embedded in a continuous knowledge metabolism system. "
    "Unlike a one-shot analyst, you have been tracking this field over multiple time periods. "
    "You have access to the knowledge evolution trajectory — how the field has changed over time — "
    "and your own previously generated hypotheses. "
    "Generate hypotheses grounded in the evidence you have accumulated. "
    "A hypothesis can come from a single breakthrough paper or from patterns spanning many periods — "
    "what matters is that it is specific, testable, and well-grounded. "
    "Return valid JSON only."
)

# User prompt for run_ckm_hypothesis_engine, filled in with str.format().
# Placeholders: {topic}, {n_windows}, {evolution_trajectory}, {knowledge},
# {period}, {papers_text}, {trigger_type}, {trigger_reason},
# {existing_hypotheses}, {max_hypotheses}. The doubled braces in the JSON
# schema are str.format escapes and reach the model as single literal braces.
# NOTE(review): the rule "cite at least 2 real arXiv IDs" may be in tension with
# the SINGLE_BREAKTHROUGH trigger type and with CKM_HYPOTHESIS_SYSTEM allowing a
# hypothesis from a single paper; confirm which is intended.
CKM_HYPOTHESIS_TEMPLATE = """\
You have been tracking the field of "{topic}" for {n_windows} time periods.

## Knowledge Evolution Trajectory
{evolution_trajectory}

## Current Knowledge State
{knowledge}

## This Period's New Papers ({period})
{papers_text}

## What Changed This Period
Type: {trigger_type}
Detail: {trigger_reason}

## Your Previously Generated Hypotheses (avoid repetition, build upon them if possible)
{existing_hypotheses}

---

Based on everything you have accumulated, generate research hypotheses.
Aim for hypotheses that are **conceptually novel** — not just combining existing tools, but proposing new mechanisms, revealing non-obvious connections, or challenging current assumptions.

Consider all types of insights:

1. **Contradictions & tensions**: earlier assumptions challenged by recent evidence — what new direction does this open?
2. **Non-obvious bridges**: connect ideas from different subfields or periods in a way that would surprise domain experts
3. **Trend extrapolation**: where is the field heading, and what problems will emerge next?
4. **Gap exploitation**: a capability gap revealed by recent work that no current method addresses
5. **Cross-paper synthesis**: combine findings from multiple papers, but the combination must yield insight beyond the sum of parts

Avoid hypotheses that merely integrate existing methods into an engineering pipeline.
A good hypothesis should make a reader think "I hadn't considered that connection" rather than "that's a natural next step."

Make each hypothesis **highly specific and testable**:
- Name exact models, datasets, or benchmarks
- State a quantitative expected effect (e.g., "at least 15% improvement", "reduction by half")
- Define a clear evaluation protocol a future paper could follow

Only output a hypothesis if ALL of the following are true:
- It is narrow enough that one future paper could validate it
- It names a concrete task/problem setting
- It specifies a method delta relative to an explicit baseline
- It states an observable outcome that could be measured
- It includes a realistic failure mode

Reject vague claims such as "improve safety", "enhance reliability", "improve performance"
unless you specify on what task, compared with what baseline, and by what measurable observable.

Return valid JSON using exactly this schema:
{{
  "hypotheses": [
    {{
      "statement": "1-2 sentences with problem, method delta, baseline, expected outcome.",
      "research_claim": {{
        "problem": "The exact research problem or task",
        "method_delta": "What new mechanism is introduced",
        "target_setting": "Dataset / benchmark / deployment setting",
        "baseline": "What this is compared against",
        "expected_observable": "Measurable outcome",
        "evaluation_plan": "How a future paper would test this",
        "failure_mode": "When this would fail or not generalize"
      }},
      "reasoning": "2-3 paragraphs explaining the evidence chain. Reference specific papers and periods when relevant.",
      "abstract": "One arXiv-style abstract paragraph in plain English, 100-200 words",
      "source_papers": [
        {{
          "arxiv_id": "id",
          "title": "title",
          "insight": "what insight it contributed",
          "period": "which time period this paper was ingested (if known)"
        }}
      ],
      "trigger": {{
        "type": "GAP / BRIDGE / TREND / CONTRADICTION / CONVERGENCE / SINGLE_BREAKTHROUGH",
        "source": "The specific evidence or pattern that triggered this hypothesis"
      }},
      "evidence_span": {{
        "earliest_period": "earliest period referenced (or current if single-period)",
        "latest_period": "latest period referenced",
        "n_periods": 1
      }},
      "builds_on": "H_NNN or null — ID of a prior hypothesis this refines/extends",
      "self_assessment": {{
        "novelty": {{"score": 1, "justification": "brief"}},
        "feasibility": {{"score": 1, "justification": "brief"}},
        "impact": {{"score": 1, "justification": "brief"}},
        "temporal_depth": {{"score": 1, "justification": "how much this relies on sustained observation vs single-period insight"}}
      }}
    }}
  ]
}}

Rules:
- Return 0 to {max_hypotheses} hypotheses, strongest first
- source_papers must cite at least 2 real arXiv IDs from the provided context
- Do not repeat or trivially rephrase your existing hypotheses
- If a hypothesis builds on a prior one, set builds_on to its ID
- Return JSON only, no markdown fences"""


def get_max_input_chars() -> int:
    """Return the input budget, in characters, for the configured LLM model.

    The budget is 80% of the model's context window (the remaining 20% is left
    for output), converted from tokens at roughly 4 characters per token.
    """
    from config import get_model_context_window

    window_tokens = get_model_context_window(config["api"]["llm"]["model"])
    input_share = 0.8  # the other 20% of the window is reserved for output
    chars_per_token = 4  # rough average used for the token-to-char conversion
    return int(window_tokens * input_share * chars_per_token)


def _truncate_to_fit(knowledge_content: str, papers_text: str, ratio: float = 0.8) -> tuple[str, str]:
    """Shrink whichever of the two texts is longer, keeping its leading `ratio` share.

    Only one text is cut per call; when both have the same length the knowledge
    text is the one shortened. Returns ``(knowledge_content, papers_text)``.
    """
    percent = ratio * 100

    if len(papers_text) > len(knowledge_content):
        keep = int(len(papers_text) * ratio)
        papers_text = papers_text[:keep]
        logger.warning("[ContextTruncate] Truncated papers_text to %d chars (%.0f%%)", len(papers_text), percent)
        return knowledge_content, papers_text

    keep = int(len(knowledge_content) * ratio)
    knowledge_content = knowledge_content[:keep]
    logger.warning("[ContextTruncate] Truncated knowledge_content to %d chars (%.0f%%)", len(knowledge_content), percent)
    return knowledge_content, papers_text


@async_retry(max_retries=3, delay=5)
async def run_ckm_hypothesis_engine(
    topic: str,
    knowledge_content: str,
    papers_text: str,
    time_period: str,
    max_hypotheses: int,
    evolution_trajectory: str,
    existing_hypotheses: str,
    trigger_type: str,
    trigger_reason: str,
    n_windows: int,
) -> dict:
    """Generate hypotheses with the field's evolution history as extra context.

    The prompt combines the current knowledge, this period's papers, the
    evolution trajectory, the change that triggered generation and the
    previously generated hypotheses (``"(none yet)"`` when that text is empty).

    When the API rejects the request with ``context_length_exceeded``, the
    longer of ``knowledge_content`` / ``papers_text`` is cut to 80% and the call
    is repeated, for at most three truncations; the other prompt inputs are
    never shortened. Any other ``BadRequestError``, or an overflow on the fourth
    attempt, is re-raised.

    Returns:
        A dict with ``hypotheses`` (accepted candidates), ``rejections``
        (candidates that failed validation, with the reason), ``tokens``,
        ``raw_content`` (the unparsed model reply) and ``error`` (the parse
        failure message, or ``""``).
    """
    from openai import BadRequestError

    # Four attempts in total: the first call plus up to three after truncation.
    for shrink_round in range(4):
        user_prompt = CKM_HYPOTHESIS_TEMPLATE.format(
            topic=topic,
            knowledge=knowledge_content,
            papers_text=papers_text,
            period=time_period,
            max_hypotheses=max_hypotheses,
            evolution_trajectory=evolution_trajectory,
            existing_hypotheses=existing_hypotheses or "(none yet)",
            trigger_type=trigger_type,
            trigger_reason=trigger_reason,
            n_windows=n_windows,
        )
        chat_messages = [
            {"role": "system", "content": CKM_HYPOTHESIS_SYSTEM},
            {"role": "user", "content": user_prompt},
        ]
        try:
            completion = await llm_client.chat.completions.create(
                model=config["api"]["llm"]["model"],
                messages=chat_messages,
                temperature=0.3,
            )
        except BadRequestError as err:
            overflowed = "context_length_exceeded" in str(err)
            if not (overflowed and shrink_round < 3):
                raise
            logger.warning(
                "[CKM-Hypothesis] Context overflow (round %d), truncating to 80%%",
                shrink_round + 1,
            )
            knowledge_content, papers_text = _truncate_to_fit(knowledge_content, papers_text, 0.8)
        else:
            break

    first_choice = completion.choices[0]
    raw_text = first_choice.message.content or ""
    total_tokens = completion.usage.total_tokens if completion.usage else 0
    finish_reason = first_choice.finish_reason
    if finish_reason == "length":
        logger.warning("[HypothesisV2] period=%s: Output truncated.", time_period)
    logger.info("[HypothesisV2] period=%s, tokens=%d", time_period, total_tokens)

    parse_error = ""
    try:
        # A reply that starts with NULL means the model chose to propose nothing.
        declined = raw_text.strip().upper().startswith("NULL")
        candidates = [] if declined else _extract_hypothesis_candidates(raw_text)
    except Exception as exc:
        parse_error = str(exc)
        logger.warning("[HypothesisV2] period=%s: JSON parse failed: %s", time_period, exc)
        candidates = []

    kept = []
    dropped = []
    seen_keys = set()

    for position, candidate in enumerate(candidates[:max_hypotheses], start=1):
        statement = _sanitize_text(candidate.get("statement"))
        key = _normalize_statement_key(statement)
        markdown = _render_hypothesis_markdown(candidate, f"{position:03d}")
        problems = _validate_hypothesis_content(markdown, finish_reason or "")

        # Statement-level checks are added on top of the content validation.
        if not key:
            problems.append("missing hypothesis statement")
        elif key in seen_keys:
            problems.append("duplicate hypothesis statement")
        else:
            seen_keys.add(key)

        if not problems:
            kept.append({
                "candidate_index": position,
                "statement": statement,
                "content": markdown,
            })
            continue

        reason = "; ".join(problems)
        logger.warning(
            "[HypothesisV2] period=%s candidate=%d: rejected: %s",
            time_period, position, reason,
        )
        dropped.append({
            "candidate_index": position,
            "content": markdown,
            "rejection_reason": reason,
        })

    return {
        "hypotheses": kept,
        "rejections": dropped,
        "tokens": total_tokens,
        "raw_content": raw_text,
        "error": parse_error,
    }


# --- Hypothesis deduplication engine ---

# System message sent by run_dedup_engine. It defines a duplicate as the same
# problem + method delta + expected outcome, and demands JSON-only output.
DEDUP_SYSTEM = (
    "You are a research hypothesis deduplication judge. "
    "Given a list of hypotheses, identify groups of duplicates or near-duplicates "
    "that make essentially the same claim (same problem + same method delta + same expected outcome). "
    "Minor wording differences do not matter — judge by semantic equivalence of the core claim. "
    "Return valid JSON only."
)

# User prompt for run_dedup_engine, filled in with str.format().
# Placeholders: {n} (number of hypotheses) and {hypotheses_list} (the rendered
# ID + statement entries). The doubled braces in the JSON examples are
# str.format escapes and reach the model as single literal braces.
DEDUP_TEMPLATE = """\
Below are {n} hypotheses, each with an ID and its statement:

{hypotheses_list}

Identify which hypotheses are duplicates or near-duplicates of each other.
For each group of duplicates, select the ONE best-written representative to KEEP.

Return valid JSON using exactly this schema:
{{
  "keep": ["H001", "H003", "H007"],
  "remove": [
    {{"id": "H002", "duplicate_of": "H001", "reason": "same core claim about X"}},
    {{"id": "H005", "duplicate_of": "H003", "reason": "same method delta and setting"}}
  ]
}}

Rules:
- "keep" lists the IDs of hypotheses to retain (unique ones + best representative of each duplicate group)
- "remove" lists the IDs to discard, with the ID they duplicate and a brief reason
- If ALL hypotheses are unique, return {{"keep": [...all IDs...], "remove": []}}
- Return JSON only, no markdown fences"""


@async_retry(max_retries=3, delay=5)
async def run_dedup_engine(hypotheses: list[dict]) -> dict:
    """
    Deduplicate hypotheses via LLM judge.

    The judge's verdict is applied only when it is usable: "keep" and "remove"
    must both be lists, "keep" must name at least one known ID, and no ID may
    appear in both. Otherwise every hypothesis is kept. IDs the judge mentions
    in neither list are kept as well, so a hypothesis is only ever dropped by an
    explicit removal entry and ``keep_ids | remove_ids`` always covers all IDs.

    Args:
        hypotheses: list of {"id": "H001", "statement": "...", "file_name": "..."}

    Returns:
        {"keep_ids": set[str], "remove_ids": set[str], "removals": list[dict], "tokens": int}
    """
    all_ids = {h["id"] for h in hypotheses}

    # Nothing to compare with zero or one hypothesis; skip the LLM call.
    if len(hypotheses) <= 1:
        return {
            "keep_ids": all_ids,
            "remove_ids": set(),
            "removals": [],
            "tokens": 0,
        }

    hyp_list_text = "\n\n".join(
        f"### {h['id']}\n{h['statement']}" for h in hypotheses
    )

    prompt = DEDUP_TEMPLATE.format(n=len(hypotheses), hypotheses_list=hyp_list_text)

    response = await llm_client.chat.completions.create(
        model=config["api"]["llm"]["model"],
        messages=[
            {"role": "system", "content": DEDUP_SYSTEM},
            {"role": "user", "content": prompt},
        ],
        temperature=0.0,
    )

    choice = response.choices[0]
    content = choice.message.content or ""
    tokens = response.usage.total_tokens if response.usage else 0

    # Safe default: keep everything unless a usable verdict says otherwise.
    keep_ids = set(all_ids)
    remove_ids = set()
    removals = []

    try:
        payload = _extract_json_payload(content)
        keep_list = payload.get("keep", [])
        remove_list = payload.get("remove", [])

        if isinstance(keep_list, list) and isinstance(remove_list, list):
            # Ignore IDs the judge invented; only known IDs count.
            parsed_keep = {str(kid) for kid in keep_list if str(kid) in all_ids}
            parsed_remove = set()
            parsed_removals = []
            for entry in remove_list:
                if isinstance(entry, dict):
                    rid = str(entry.get("id", ""))
                    if rid in all_ids:
                        parsed_remove.add(rid)
                        parsed_removals.append(entry)

            conflicting = parsed_keep & parsed_remove
            if not parsed_keep:
                logger.warning("[DedupEngine] Verdict keeps no known IDs — keeping all hypotheses")
            elif conflicting:
                logger.warning(
                    "[DedupEngine] Verdict lists %s as both keep and remove — keeping all hypotheses",
                    sorted(conflicting, key=str),
                )
            else:
                # IDs the judge left out of both lists must not vanish:
                # without an explicit removal they stay in the keep set.
                unmentioned = all_ids - parsed_keep - parsed_remove
                if unmentioned:
                    logger.warning(
                        "[DedupEngine] Verdict omitted %s — keeping them",
                        sorted(unmentioned, key=str),
                    )
                keep_ids = parsed_keep | unmentioned
                remove_ids = parsed_remove
                removals = parsed_removals
        else:
            logger.warning("[DedupEngine] keep/remove fields are not lists — keeping all hypotheses")

    except Exception as exc:
        logger.warning("[DedupEngine] JSON parse failed: %s — keeping all hypotheses", exc)

    logger.info(
        "[DedupEngine] %d hypotheses → keep=%d, remove=%d, tokens=%d",
        len(hypotheses), len(keep_ids), len(remove_ids), tokens,
    )
    return {
        "keep_ids": keep_ids,
        "remove_ids": remove_ids,
        "removals": removals,
        "tokens": tokens,
    }
