"""
CKM Validation Phase — LLM-as-Judge hit verification + novelty scoring.

For each generated hypothesis:
  1. Extract core keywords (LLM)                         → local reranking signals
  2. Run novelty judge (LLM, 4 dimensions, 10-pt scale)  → hypothesis quality score
  3. Rank already-fetched validation papers by embedding cosine similarity (top-K per hypothesis)
  4. Judge the top candidate papers (LLM, 4 dimensions)  → HIT if avg >= 6.0
  5. Reference: compute cosine similarity to closest validation paper (embedding)

Metrics reported:
  Hypothesis Yield          total hypotheses generated
  Predictive Hit Count      hypotheses with a verified HIT
  Predictive Hit Rate       hit_count / yield * 100
  Avg Temporal Lead         (hits only) days between hypothesis date and matched paper
  Avg Novelty Judge Score   average LLM novelty score across all hypotheses (1–10)
  Avg Cross-domain Score    category diversity of papers cited in each hypothesis
  Ref: Avg Cosine Similarity  max cosine sim to any validation paper (reference only)
"""
import re
import asyncio
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any, Optional

from core.judge import run_keyword_extract, run_hit_verify, run_novelty_judge
from core.engines import get_embedding, cosine_similarity, run_read_engine
from core.store import FileSystemKnowledgeStore
from tools.arxiv_fulltext import get_paper_content_record
from config import config

logger = logging.getLogger("ckm.metrics")


# ---------------------------------------------------------------------------
# Cross-domain score (based on arXiv categories of papers cited in hypothesis)
# ---------------------------------------------------------------------------

def _cat_distance(cat_a: str, cat_b: str) -> float:
    if cat_a == cat_b:
        return 0.0
    return 0.2 if cat_a.split(".")[0] == cat_b.split(".")[0] else 0.8


def calculate_cross_domain_score(categories: List[str]) -> float:
    """Score how widely a collection of arXiv categories is spread.

    Duplicate categories are ignored. With fewer than two distinct
    categories the score is 0.0; otherwise it is the number of distinct
    categories multiplied by the mean pairwise ``_cat_distance`` between
    them.
    """
    # Sort the de-duplicated categories: set iteration order for strings can
    # differ between interpreter runs and float addition is not associative,
    # so a fixed order keeps the score reproducible to the last bit.
    unique = sorted(set(categories))
    if len(unique) < 2:
        return 0.0
    distances = [
        _cat_distance(first, second)
        for idx, first in enumerate(unique)
        for second in unique[idx + 1:]
    ]
    return len(unique) * (sum(distances) / len(distances))


def _extract_abstract_text(hyp_content: str) -> str:
    """Pull the ## Abstract section for embedding; fall back to ## Statement."""
    m = re.search(r"## Abstract\s*([\s\S]*?)(?=##|\Z)", hyp_content)
    if m:
        return m.group(1).strip()
    m = re.search(r"## Statement\s*([\s\S]*?)(?=##|\Z)", hyp_content)
    return m.group(1).strip() if m else hyp_content[:300]


def _find_cross_experiment_cache(cache_dir: Optional[Path], arxiv_id: str) -> Optional[str]:
    """Search other experiments' fulltext caches for an already-downloaded paper."""
    if not cache_dir:
        return None
    # cache_dir is like results/<exp>/topics/<topic>/metabolism/fulltext_cache
    # Walk up to results/ and scan sibling experiments
    results_dir = cache_dir.parent.parent.parent.parent  # results/
    if not results_dir.exists() or results_dir.name != "results":
        # Try one more level (in case of different nesting)
        results_dir = results_dir.parent
        if not results_dir.exists() or results_dir.name != "results":
            return None
    clean_id = arxiv_id.replace("/", "_")
    for exp_dir in results_dir.iterdir():
        if not exp_dir.is_dir() or exp_dir.name.startswith("_archived"):
            continue
        topics_dir = exp_dir / "topics"
        if not topics_dir.is_dir():
            continue
        for topic_dir in topics_dir.iterdir():
            candidate = topic_dir / "metabolism" / "fulltext_cache" / f"{clean_id}.txt"
            if candidate.exists():
                text = candidate.read_text(encoding="utf-8")
                if len(text.strip()) > 100:  # not a stub
                    return text
    return None


async def _prefetch_validation_fulltexts(
    val_papers: list[Dict[str, Any]],
    cache_dir: Optional[Path],
    fallback_cache: Dict[str, str],
    concurrency: int,
    timeout_s: int,
    retries: int,
    retry_delay_s: float,
):
    if not val_papers:
        return {}, {}, {}

    semaphore = asyncio.Semaphore(max(1, concurrency))
    paper_text_cache: Dict[str, str] = {}
    paper_source_cache: Dict[str, str] = {}
    paper_counted_cache: Dict[str, bool] = {}

    async def load_paper(paper: Dict[str, Any]) -> None:
        arxiv_id = paper["arxiv_id"]
        if arxiv_id in fallback_cache:
            paper_text_cache[arxiv_id] = fallback_cache[arxiv_id]
            paper_source_cache[arxiv_id] = "fallback-cache"
            paper_counted_cache[arxiv_id] = False
            return
        # Try cross-experiment cache first (no API call needed)
        cross_text = _find_cross_experiment_cache(cache_dir, arxiv_id)
        if cross_text:
            paper_text_cache[arxiv_id] = cross_text
            paper_source_cache[arxiv_id] = "cross-experiment-cache"
            paper_counted_cache[arxiv_id] = True
            # Also save locally so future runs find it immediately
            if cache_dir:
                cache_dir.mkdir(parents=True, exist_ok=True)
                local_path = cache_dir / f"{arxiv_id.replace('/', '_')}.txt"
                if not local_path.exists():
                    local_path.write_text(cross_text, encoding="utf-8")
            return
        async with semaphore:
            record = await get_paper_content_record(
                arxiv_id,
                paper["abstract"],
                cache_dir,
                timeout_s=timeout_s,
                retries=retries,
                retry_delay_s=retry_delay_s,
            )
        paper_text_cache[arxiv_id] = record["content"]
        paper_source_cache[arxiv_id] = record["source"]
        paper_counted_cache[arxiv_id] = bool(record["counted_fulltext"])

    await asyncio.gather(*[load_paper(paper) for paper in val_papers])
    return paper_text_cache, paper_source_cache, paper_counted_cache


async def _build_validation_profiles(
    val_papers: list[Dict[str, Any]],
    paper_text_cache: Dict[str, str],
    concurrency: int,
) -> Dict[str, str]:
    if not val_papers:
        return {}

    semaphore = asyncio.Semaphore(max(1, concurrency))
    paper_profile_cache: Dict[str, str] = {}

    async def build_profile(paper: Dict[str, Any]) -> None:
        arxiv_id = paper["arxiv_id"]
        content = paper_text_cache.get(arxiv_id, paper["abstract"])
        async with semaphore:
            profile = await run_read_engine(
                paper["title"],
                arxiv_id,
                paper["published"],
                content,
            )
        paper_profile_cache[arxiv_id] = profile

    await asyncio.gather(*[build_profile(paper) for paper in val_papers])
    return paper_profile_cache


# ---------------------------------------------------------------------------
# Per-hypothesis hit verification
# ---------------------------------------------------------------------------

async def _verify_hit(
    hyp_file: Path,
    hyp_content: str,
    t_hyp: float,
    candidate_papers: list[Dict[str, Any]],
    paper_profile_cache: Dict[str, str],
    judge_concurrency: int,
) -> dict:
    """
    Judge every candidate paper in the validation corpus for this hypothesis.
    Returns hit metadata across the full validation set.
    """
    if not candidate_papers:
        logger.info("[Metrics] %s: no validation papers after hypothesis date", hyp_file.name)
        return {
            "is_hit": False,
            "lead_days": 0.0,
            "matched_paper": None,
            "matched_papers": [],
            "matched_hit_count": 0,
            "best_miss": None,
            "candidate_count": 0,
            "judged_count": 0,
        }

    logger.info(
        "[Metrics] %s: judging all %d validation papers (concurrency=%d)",
        hyp_file.name,
        len(candidate_papers),
        max(1, judge_concurrency),
    )

    semaphore = asyncio.Semaphore(max(1, judge_concurrency))

    async def judge_paper(paper: Dict[str, Any]) -> dict:
        async with semaphore:
            content = paper_profile_cache.get(paper["arxiv_id"], paper["abstract"])
            return await run_hit_verify(
                hyp_content,
                paper["title"],
                paper["published"][:10],
                paper["arxiv_id"],
                content,
            )

    judged_results = await asyncio.gather(*[judge_paper(paper) for paper in candidate_papers])
    hits = [result for result in judged_results if result["is_hit"]]
    misses = [result for result in judged_results if not result["is_hit"]]

    hits.sort(key=lambda result: (result["paper_published"], -result["avg_score"]))
    matched_paper = hits[0] if hits else None
    best_miss = max(misses, key=lambda result: result["avg_score"]) if misses else None

    # Temporal lead: hypothesis date vs matched paper's actual publication date
    lead_days = 0.0
    if matched_paper:
        paper_ts = datetime.fromisoformat(matched_paper["paper_published"]).timestamp()
        lead_days = max(0.0, (paper_ts - t_hyp) / 86400)

    return {
        "is_hit": matched_paper is not None,
        "lead_days": lead_days,
        "matched_paper": matched_paper,
        "matched_papers": hits[:5],
        "matched_hit_count": len(hits),
        "best_miss": best_miss,
        "candidate_count": len(candidate_papers),
        "judged_count": len(judged_results),
    }


# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------

async def calculate_metrics(
    all_papers: List[Dict[str, Any]],
    validation_start: str,
    validation_end: str,
    topic_keyword: str,
    ablation_mode: str = "none",
    store: Optional[FileSystemKnowledgeStore] = None,
    cache_dir: Optional[Path] = None,
) -> None:
    """Evaluate every generated hypothesis and write a Markdown report.

    Pipeline:
      1. Read each ``*.md`` hypothesis file and derive its date (from the
         ``hyp-YYYY-MM`` part of the file name) and cross-domain score.
      2. Embed hypothesis abstracts and validation-paper abstracts, and keep
         the 30 most similar later-published papers per hypothesis.
      3. Prefetch full texts and build profiles for the union of those
         candidates.
      4. Per hypothesis, run keyword extraction, the novelty judge and hit
         verification on the candidates that have full text.
      5. Aggregate the metrics, log them, and write the report.

    Args:
        all_papers: Paper metadata dicts; the keys ``arxiv_id``, ``abstract``,
            ``published`` and ``categories`` are read here.
        validation_start: Start of the validation window; only the first 10
            characters are compared (inclusive).
        validation_end: End of the validation window; only the first 10
            characters are compared (exclusive).
        topic_keyword: Topic name used in the report title and file name.
        ablation_mode: Added to the report file name unless it is ``"none"``.
        store: When given, hypotheses are read from
            ``store.dirs["hypotheses"]`` and the report is written to
            ``store.base_dir.parent / "reports"``; otherwise the locations in
            ``config["paths"]`` are used.
        cache_dir: Fulltext cache directory passed to the prefetch step.

    Returns:
        None. Returns early, without writing a report, when the hypotheses
        directory is missing or holds no ``*.md`` files.
    """
    # Path() accepts both str and Path, so string-valued config entries work.
    hyp_dir = Path(store.dirs["hypotheses"] if store else config["paths"]["hypotheses"])
    if not hyp_dir.exists():
        logger.warning("[Metrics] Hypotheses directory not found: %s", hyp_dir)
        return

    files = sorted(hyp_dir.glob("*.md"))
    if not files:
        logger.info("[Metrics] No hypotheses to evaluate.")
        return

    logger.info("[Metrics] Evaluating %d hypotheses | validation: %s ~ %s",
                len(files), validation_start, validation_end)

    # ── Pass 1: parse hypothesis metadata (no API calls) ──────────────────
    val_start = validation_start[:10]
    val_end = validation_end[:10]
    exp_cfg = config["experiment"]
    validation_fulltext_concurrency = max(1, exp_cfg.get("validation_fulltext_concurrency", 8))
    validation_profile_concurrency = max(1, exp_cfg.get("validation_profile_concurrency", 4))
    validation_judge_concurrency = max(1, exp_cfg.get("validation_judge_concurrency", 4))
    fulltext_timeout_s = max(1, exp_cfg.get("fulltext_timeout_s", 30))
    fulltext_retries = max(1, exp_cfg.get("fulltext_retries", 2))
    fulltext_retry_delay_s = max(0, exp_cfg.get("fulltext_retry_delay_s", 2))

    hyp_records = []
    for fp in files:
        content = fp.read_text(encoding="utf-8")

        # File names without a hyp-YYYY-MM part fall back to 2020-01-01.
        date_match = re.search(r"hyp-(\d{4}-\d{2})", fp.name)
        hyp_date_str = (date_match.group(1) + "-01") if date_match else "2020-01-01"
        t_hyp = datetime.strptime(hyp_date_str, "%Y-%m-%d").timestamp()

        # A paper counts as cited when its arXiv id appears in the text.
        linked_cats = [
            cat
            for p in all_papers if p["arxiv_id"] in content
            for cat in p["categories"]
        ] or ["cs.AI"]

        hyp_records.append({
            "file_path": fp,
            "content": content,
            "hyp_date_str": hyp_date_str,
            "t_hyp": t_hyp,
            "cd_score": calculate_cross_domain_score(linked_cats),
            "abstract_text": _extract_abstract_text(content),
        })

    # ── Pass 2: embeddings for reference cosine similarity ────────────────
    # Embed hypothesis abstracts + validation paper abstracts concurrently
    val_papers = [p for p in all_papers if val_start <= p["published"][:10] < val_end]
    logger.info("[Metrics] Validation metadata papers found: %d", len(val_papers))
    logger.info(
        "[Metrics] Validation exhaustive mode: fulltext_concurrency=%d, profile_concurrency=%d, judge_concurrency=%d",
        validation_fulltext_concurrency,
        validation_profile_concurrency,
        validation_judge_concurrency,
    )

    hyp_texts = [rec["abstract_text"] for rec in hyp_records]
    val_texts = [p["abstract"] for p in val_papers]

    logger.info("[Metrics] Fetching %d hyp + %d val-paper embeddings (reference cosine sim)",
                len(hyp_texts), len(val_texts))

    all_vecs = await asyncio.gather(
        *[get_embedding(t) for t in hyp_texts],
        *[get_embedding(t) for t in val_texts],
    )
    hyp_vecs = list(all_vecs[:len(hyp_texts)])
    val_vecs = list(all_vecs[len(hyp_texts):])

    for rec, hyp_vec in zip(hyp_records, hyp_vecs):
        # One similarity per validation paper, computed once and reused by
        # the pre-filter below.
        rec["val_sims"] = [cosine_similarity(hyp_vec, vv) for vv in val_vecs]
        rec["ref_cosine_sim"] = max(rec["val_sims"], default=0.0)

    # ── Pass 2b: Embedding pre-filter — find which papers need fulltext ────
    TOP_K_CANDIDATES = 30  # Per hypothesis, rank by cosine similarity
    val_id_to_idx = {p["arxiv_id"]: i for i, p in enumerate(val_papers)}

    # Pre-compute top-K candidates per hypothesis and collect the union
    needed_arxiv_ids: set[str] = set()
    per_hyp_candidates: list[list[dict]] = []

    for rec in hyp_records:
        # Only papers published after the hypothesis date can be predicted.
        candidate_papers = [
            paper for paper in val_papers
            if paper["published"][:10] > rec["hyp_date_str"]
        ]
        rec["candidate_count"] = len(candidate_papers)

        val_sims = rec["val_sims"]
        scored = []
        for paper in candidate_papers:
            idx = val_id_to_idx.get(paper["arxiv_id"])
            scored.append((val_sims[idx] if idx is not None else 0.0, paper))
        scored.sort(key=lambda x: x[0], reverse=True)
        top_k = [paper for _, paper in scored[:TOP_K_CANDIDATES]]
        per_hyp_candidates.append(top_k)
        needed_arxiv_ids.update(p["arxiv_id"] for p in top_k)

    # Only fetch fulltext & build profiles for papers in the union of all top-K sets
    needed_val_papers = [p for p in val_papers if p["arxiv_id"] in needed_arxiv_ids]
    logger.info(
        "[Metrics] Embedding pre-filter: %d total val papers → %d needed by any hypothesis top-%d",
        len(val_papers), len(needed_val_papers), TOP_K_CANDIDATES,
    )

    logger.info("[Metrics] Prefetching fulltexts for %d needed validation papers", len(needed_val_papers))
    paper_text_cache, paper_source_cache, paper_counted_cache = await _prefetch_validation_fulltexts(
        needed_val_papers,
        cache_dir,
        {},
        validation_fulltext_concurrency,
        fulltext_timeout_s,
        fulltext_retries,
        fulltext_retry_delay_s,
    )
    abstract_fallback = sum(1 for counted in paper_counted_cache.values() if not counted)
    val_fulltext_papers = [
        paper for paper in needed_val_papers
        if paper_counted_cache.get(paper["arxiv_id"], False)
    ]
    logger.info(
        "[Metrics] Validation fulltext ready: %d papers (abstract_only=%d)",
        len(val_fulltext_papers), abstract_fallback,
    )

    logger.info("[Metrics] Building validation profiles for %d fulltext papers", len(val_fulltext_papers))
    paper_profile_cache = await _build_validation_profiles(
        val_fulltext_papers,
        paper_text_cache,
        validation_profile_concurrency,
    )
    logger.info("[Metrics] Built validation profiles for %d papers", len(paper_profile_cache))

    # Build set of papers with fulltext for filtering top-K lists
    fulltext_ids = {p["arxiv_id"] for p in val_fulltext_papers}

    # ── Pass 3: LLM judges on pre-filtered top-K candidates ───────────────
    for rec, top_k_all in zip(hyp_records, per_hyp_candidates):
        logger.info("[Metrics] Judging: %s", rec["file_path"].name)

        # Keyword extraction + novelty judge run in parallel (both need only hyp content)
        keywords, novelty = await asyncio.gather(
            run_keyword_extract(rec["content"]),
            run_novelty_judge(rec["content"]),
        )
        rec["keywords"] = keywords
        rec["novelty"] = novelty

        # Filter to papers that actually have fulltext
        top_k_papers = [p for p in top_k_all if p["arxiv_id"] in fulltext_ids]
        rec["fulltext_read_count"] = len(top_k_papers)

        logger.info(
            "[Metrics] %s: %d candidates → top-%d by embedding → %d with fulltext",
            rec["file_path"].name, rec["candidate_count"], len(top_k_all), len(top_k_papers),
        )

        # Hit verification against top-K validation papers only
        rec["hit"] = await _verify_hit(
            rec["file_path"],
            rec["content"],
            rec["t_hyp"],
            top_k_papers,
            paper_profile_cache,
            validation_judge_concurrency,
        )

    # ── Aggregate ─────────────────────────────────────────────────────────
    n = len(hyp_records)  # > 0: the empty case returned early above
    hit_count = sum(1 for rec in hyp_records if rec["hit"]["is_hit"])
    hit_rate = (hit_count / n) * 100

    hit_leads = [rec["hit"]["lead_days"] for rec in hyp_records if rec["hit"]["is_hit"]]
    avg_lead = sum(hit_leads) / len(hit_leads) if hit_leads else 0.0

    avg_cross = sum(rec["cd_score"] for rec in hyp_records) / n
    avg_novelty = sum(rec["novelty"]["avg_score"] for rec in hyp_records) / n
    avg_ref_sim = sum(rec["ref_cosine_sim"] for rec in hyp_records) / n
    avg_candidate_pool = sum(rec.get("candidate_count", 0) for rec in hyp_records) / n
    avg_fulltext_read = sum(rec.get("fulltext_read_count", 0) for rec in hyp_records) / n
    avg_judged = sum(rec["hit"]["judged_count"] for rec in hyp_records) / n
    avg_hit_matches = sum(rec["hit"]["matched_hit_count"] for rec in hyp_records) / n

    # ── New metrics (zero-cost, from existing judge data) ─────────────
    # Best Match Score: highest judge score per hypothesis (even if not a
    # hit). The primary matched paper is the *earliest* hit, not necessarily
    # the best-scoring one, so take the maximum over the retained results.
    # Avg Judge Score (all): mean over the same retained results only, i.e.
    # the hits kept in "matched_papers" plus the single best miss.
    all_judge_scores = []
    for rec in hyp_records:
        hit = rec["hit"]
        retained_scores = [mp["avg_score"] for mp in hit.get("matched_papers", [])]
        if hit.get("best_miss"):
            retained_scores.append(hit["best_miss"]["avg_score"])
        rec["best_match_score"] = max(retained_scores, default=0.0)
        all_judge_scores.extend(retained_scores)
    avg_best_match = sum(rec["best_match_score"] for rec in hyp_records) / n
    avg_all_judge = sum(all_judge_scores) / len(all_judge_scores) if all_judge_scores else 0.0

    # Precision@K: hit rate among top-K hypotheses (ranked by novelty score)
    sorted_by_novelty = sorted(hyp_records, key=lambda r: r["novelty"]["avg_score"], reverse=True)
    precision_at_3 = 0.0
    precision_at_5 = 0.0
    if n >= 3:
        top3_hits = sum(1 for r in sorted_by_novelty[:3] if r["hit"]["is_hit"])
        precision_at_3 = (top3_hits / 3) * 100
    if n >= 5:
        top5_hits = sum(1 for r in sorted_by_novelty[:5] if r["hit"]["is_hit"])
        precision_at_5 = (top5_hits / 5) * 100

    # Hit Paper Diversity: unique arXiv categories across all hit papers.
    # Index categories by id once instead of rescanning all_papers per hit.
    categories_by_id: Dict[str, set] = {}
    for p in all_papers:
        categories_by_id.setdefault(p["arxiv_id"], set()).update(p.get("categories", []))
    hit_paper_cats = set()
    for rec in hyp_records:
        if rec["hit"]["is_hit"]:
            for mp in rec["hit"].get("matched_papers", []):
                hit_paper_cats.update(categories_by_id.get(mp.get("paper_arxiv_id", ""), ()))
    hit_diversity = len(hit_paper_cats)

    # Novelty sub-dimension averages
    d1_scores = [rec["novelty"]["d_scores"].get("d1", 0) for rec in hyp_records]
    d2_scores = [rec["novelty"]["d_scores"].get("d2", 0) for rec in hyp_records]
    d3_scores = [rec["novelty"]["d_scores"].get("d3", 0) for rec in hyp_records]
    d4_scores = [rec["novelty"]["d_scores"].get("d4", 0) for rec in hyp_records]
    avg_d1 = sum(d1_scores) / n
    avg_d2 = sum(d2_scores) / n
    avg_d3 = sum(d3_scores) / n
    avg_d4 = sum(d4_scores) / n

    # Total hit papers: absolute count of unique papers matched. Results
    # without an arXiv id are skipped rather than counted as one paper.
    all_hit_paper_ids = {
        mp["paper_arxiv_id"]
        for rec in hyp_records
        for mp in rec["hit"].get("matched_papers", [])
        if mp.get("paper_arxiv_id")
    }
    total_unique_hits = len(all_hit_paper_ids)

    logger.info("=" * 60)
    logger.info("[Metrics] Hypothesis Yield:              %d", n)
    logger.info("[Metrics] Validation Metadata Papers:    %d", len(val_papers))
    logger.info("[Metrics] Validation Papers Available:   %d", len(val_fulltext_papers))
    logger.info("[Metrics] Validation Fulltexts Available:%d", len(val_fulltext_papers))
    logger.info("[Metrics] Predictive Hits:               %d / %d (%.1f%%)", hit_count, n, hit_rate)
    logger.info("[Metrics] Total Unique Hit Papers:       %d", total_unique_hits)
    logger.info("[Metrics] Avg Validation Papers Checked: %.1f", avg_candidate_pool)
    logger.info("[Metrics] Avg Fulltexts Read:           %.1f", avg_fulltext_read)
    logger.info("[Metrics] Avg Candidates Judged:        %.1f", avg_judged)
    logger.info("[Metrics] Avg Hit Matches Found:        %.1f", avg_hit_matches)
    logger.info("[Metrics] Avg Temporal Lead (hits):      %.0f days", avg_lead)
    logger.info("[Metrics] Avg Best Match Score:          %.2f / 10", avg_best_match)
    logger.info("[Metrics] Avg All Judge Score:           %.2f / 10", avg_all_judge)
    logger.info("[Metrics] Avg Novelty Judge Score:       %.2f / 10", avg_novelty)
    logger.info("[Metrics] Avg D1 Originality:            %.2f / 10", avg_d1)
    logger.info("[Metrics] Avg D2 Cross-field:            %.2f / 10", avg_d2)
    logger.info("[Metrics] Avg D3 Gap Precision:          %.2f / 10", avg_d3)
    logger.info("[Metrics] Avg D4 Falsifiability:         %.2f / 10", avg_d4)
    logger.info("[Metrics] Avg Cross-domain Score:        %.2f", avg_cross)
    logger.info("[Metrics] Precision@3:                   %.1f%%", precision_at_3)
    logger.info("[Metrics] Precision@5:                   %.1f%%", precision_at_5)
    logger.info("[Metrics] Hit Paper Diversity (cats):    %d", hit_diversity)
    logger.info("[Metrics] Ref Avg Cosine Sim (val):      %.4f", avg_ref_sim)
    logger.info("=" * 60)

    # ── Build report ──────────────────────────────────────────────────────
    md_lines = []
    for rec in hyp_records:
        fname = rec["file_path"].name
        nov = rec["novelty"]
        hit = rec["hit"]

        md_lines.append(f"### {fname}")
        md_lines.append(f"- Generated: {rec['hyp_date_str']}")
        md_lines.append(f"- Keywords: {', '.join(rec['keywords'])}")
        md_lines.append(f"- Validation papers checked: {rec.get('candidate_count', 0)}")
        md_lines.append(f"- Candidate fulltexts read: {rec.get('fulltext_read_count', 0)}")
        md_lines.append(f"- Candidates judged: {hit['judged_count']}")
        md_lines.append(f"- Hit papers found: {hit['matched_hit_count']}")
        md_lines.append(f"- Cross-domain Score: {rec['cd_score']:.2f}")
        md_lines.append(f"- Best Match Score: {rec['best_match_score']:.1f} / 10")
        md_lines.append(f"- Ref Cosine Sim (closest val paper): {rec['ref_cosine_sim']:.4f}")
        md_lines.append("")

        # Novelty judge breakdown
        ds = nov["d_scores"]
        md_lines.append("**Novelty Judge**")
        md_lines.append(
            "| D1 Originality | D2 Cross-field | D3 Gap Precision | D4 Falsifiability | Avg |"
        )
        md_lines.append("| --- | --- | --- | --- | --- |")
        md_lines.append(
            f"| {ds.get('d1',0)}/10 | {ds.get('d2',0)}/10 "
            f"| {ds.get('d3',0)}/10 | {ds.get('d4',0)}/10 | **{nov['avg_score']:.1f}/10** |"
        )
        if nov["strengths"]:
            md_lines.append(f"- Strengths: {nov['strengths']}")
        if nov["weaknesses"]:
            md_lines.append(f"- Weaknesses: {nov['weaknesses']}")
        md_lines.append("")

        # Hit verdict
        if hit["is_hit"] and hit["matched_paper"]:
            mp = hit["matched_paper"]
            md_lines.append("**Predictive Hit: ✓**")
            md_lines.append(f"- Earliest matched: [{mp['paper_arxiv_id']}] {mp['paper_title']}")
            md_lines.append(f"- Paper published: {mp['paper_published']}")
            md_lines.append(f"- Temporal lead: {hit['lead_days']:.0f} days")
            md_lines.append(f"- Judge avg score: {mp['avg_score']:.1f} / 10")
            if hit["matched_hit_count"] > 1:
                other_hits = ", ".join(
                    f"[{paper['paper_arxiv_id']}]"
                    for paper in hit["matched_papers"][1:]
                )
                if other_hits:
                    md_lines.append(f"- Additional matched papers: {other_hits}")
            for idea in mp["matched_ideas"]:
                md_lines.append(f"- Matched idea: {idea}")
            md_lines.append(f"- Reasoning: {mp['reasoning']}")
        else:
            bm = hit.get("best_miss")
            if bm:
                md_lines.append(
                    f"**Predictive Hit: ✗** "
                    f"(closest: [{bm['paper_arxiv_id']}] score={bm['avg_score']:.1f}/10)"
                )
            else:
                md_lines.append("**Predictive Hit: ✗** (no local candidates found in validation corpus)")
        md_lines.append("")
        md_lines.append("---")
        md_lines.append("")

    # ── Write report ──────────────────────────────────────────────────────
    report_dir = Path((store.base_dir.parent / "reports") if store else config["paths"]["reports"])
    report_dir.mkdir(parents=True, exist_ok=True)

    # Path separators in the topic would otherwise point the report into a
    # (probably missing) sub-directory.
    safe_title = topic_keyword.replace(" ", "_").replace("/", "_").replace("\\", "_")
    suffix = f"_Ablation_{ablation_mode}" if ablation_mode != "none" else ""
    report_path = report_dir / f"{safe_title}{suffix}_Evaluation_Report.md"

    report = [
        f"# CKM Evaluation Report: {topic_keyword}",
        f"> Ablation: {ablation_mode} | Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        "## Metrics Summary",
        "",
        "| Metric | Value |",
        "| ------ | ----- |",
        f"| Hypothesis Yield | {n} |",
        f"| Validation Metadata Papers Seen | {len(val_papers)} |",
        f"| Validation Papers Available | {len(val_fulltext_papers)} |",
        f"| Validation Fulltexts Available | {len(val_fulltext_papers)} |",
        f"| Predictive Hit Count | {hit_count} |",
        f"| Predictive Hit Rate | {hit_rate:.1f}% |",
        f"| Total Unique Hit Papers | {total_unique_hits} |",
        f"| Avg Validation Papers Checked / Hypothesis | {avg_candidate_pool:.1f} |",
        f"| Avg Candidate Fulltexts Read / Hypothesis | {avg_fulltext_read:.1f} |",
        f"| Avg Candidates Judged / Hypothesis | {avg_judged:.1f} |",
        f"| Avg Hit Papers Found / Hypothesis | {avg_hit_matches:.1f} |",
        f"| Avg Temporal Lead (hits only) | {avg_lead:.0f} days |",
        f"| Avg Best Match Score | {avg_best_match:.2f} / 10 |",
        f"| Avg All Judge Score | {avg_all_judge:.2f} / 10 |",
        f"| Avg Novelty Judge Score | {avg_novelty:.2f} / 10 |",
        f"| Avg D1 Originality | {avg_d1:.2f} / 10 |",
        f"| Avg D2 Cross-field Synthesis | {avg_d2:.2f} / 10 |",
        f"| Avg D3 Gap Precision | {avg_d3:.2f} / 10 |",
        f"| Avg D4 Falsifiability | {avg_d4:.2f} / 10 |",
        f"| Avg Cross-domain Score | {avg_cross:.2f} |",
        f"| Precision@3 (by novelty) | {precision_at_3:.1f}% |",
        f"| Precision@5 (by novelty) | {precision_at_5:.1f}% |",
        f"| Hit Paper Diversity (categories) | {hit_diversity} |",
        f"| Ref: Avg Cosine Sim to Val Papers | {avg_ref_sim:.4f} |",
        "",
        "> **Note — Temporal Lead**: measured only for hits; defined as the number of days",
        "> between the hypothesis generation date and the matched paper's publication date.",
        "> **Note — Best Match Score**: highest judge score per hypothesis, even if below hit threshold.",
        "> **Note — Precision@K**: hit rate among top-K hypotheses ranked by novelty score.",
        "> **Note — Ref Cosine Sim**: max cosine similarity between hypothesis abstract",
        "> and any validation-period paper abstract; reported as a reference only.",
        "",
        "## Hypothesis Breakdown",
        "",
    ]
    report.extend(md_lines)
    report_path.write_text("\n".join(report), encoding="utf-8")
    logger.info("[Metrics] Report saved: %s", report_path)
