"""Build the per-topic validation pool index.

Reads the frozen ``data/arxiv_ids/<slug>.json`` files (produced by
``ckm_benchmark.extract_arxiv_ids``) and emits a flat ``data/validation_pool.json``
that lists every validation arXiv ID across all 50 topics, with topic
attribution.

This index is the canonical "future ground truth" set against which every
submission is judged. Submitters can either:

    1. Use the IDs directly to fetch paper content from arXiv themselves, or
    2. Wait for v0.2 which will ship a frozen content snapshot
       (``data/validation_papers/<slug>/<arxiv_id>.json``).

Usage:
    python -m ckm_benchmark.build_validation_pool \\
        --arxiv-ids-dir data/arxiv_ids \\
        --output data/validation_pool.json
"""

from __future__ import annotations

import argparse
import json
import logging
from collections import defaultdict
from pathlib import Path


logger = logging.getLogger("ckm_benchmark.build_validation_pool")
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")


def main() -> None:
    parser = argparse.ArgumentParser(description="Build the per-topic validation pool index.")
    parser.add_argument("--arxiv-ids-dir", type=Path, required=True,
                        help="Directory of per-topic arXiv ID JSON files.")
    parser.add_argument("--output", type=Path, required=True,
                        help="Path to write the consolidated validation pool index.")
    args = parser.parse_args()

    by_topic: dict[str, list[str]] = {}
    by_arxiv_id: dict[str, list[str]] = defaultdict(list)
    total_papers = 0

    for path in sorted(args.arxiv_ids_dir.glob("*.json")):
        with open(path) as fh:
            record = json.load(fh)
        slug = record["slug"]
        validation = record.get("validation", [])
        by_topic[slug] = validation
        total_papers += len(validation)
        for arxiv_id in validation:
            by_arxiv_id[arxiv_id].append(slug)

    out = {
        "version": "v0.1",
        "total_validation_papers": total_papers,
        "unique_validation_papers": len(by_arxiv_id),
        "topics": {
            slug: {
                "validation_arxiv_ids": validation,
                "count": len(validation),
            }
            for slug, validation in by_topic.items()
        },
        "papers_by_arxiv_id": {
            arxiv_id: {"appears_in_topics": topics}
            for arxiv_id, topics in by_arxiv_id.items()
        },
        "fetch_instructions": (
            "Each arxiv_id can be fetched as full text via "
            "https://arxiv.org/abs/<arxiv_id> (HTML/PDF) or via the arXiv API "
            "(see https://info.arxiv.org/help/api). The benchmark v0.2 release "
            "will ship a frozen content snapshot keyed by arxiv_id."
        ),
    }

    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w") as fh:
        json.dump(out, fh, indent=2)

    logger.info(
        "Wrote validation pool: %d topics, %d total validation papers, %d unique IDs → %s",
        len(by_topic), total_papers, len(by_arxiv_id), args.output,
    )


# Run the CLI only when this file is executed as a script (or via
# ``python -m``); importing the module does not call main().
if __name__ == "__main__":
    main()
