#!/usr/bin/env python3
import argparse
import json
from pathlib import Path
from typing import Sequence

from datasets import load_dataset  # type: ignore
from sweagent.run.batch_instances import SimpleBatchInstance

# Exact instances to run (the "nebius50" subset): 50 SWE-bench instance IDs.
# main() passes this list to write_instances_jsonl(), which matches each entry
# against the ``instance_id`` field of the loaded dataset rows.
INSTANCES: list[str] = [
    "sympy__sympy-22080",
    "django__django-15315",
    "django__django-11333",
    "matplotlib__matplotlib-20826",
    "django__django-11532",
    "django__django-16642",
    "django__django-14855",
    "sphinx-doc__sphinx-8721",
    "pylint-dev__pylint-4604",
    "sympy__sympy-13615",
    "django__django-13089",
    "django__django-15987",
    "django__django-14725",
    "sympy__sympy-14248",
    "pytest-dev__pytest-7982",
    "django__django-15280",
    "scikit-learn__scikit-learn-13142",
    "pytest-dev__pytest-5809",
    "matplotlib__matplotlib-23299",
    "django__django-16560",
    "django__django-15103",
    "sympy__sympy-16792",
    "django__django-14007",
    "psf__requests-2317",
    "django__django-11880",
    "django__django-16136",
    "django__django-16661",
    "sympy__sympy-17139",
    "sympy__sympy-14531",
    "sphinx-doc__sphinx-8595",
    "django__django-10880",
    "sympy__sympy-19346",
    "sphinx-doc__sphinx-9229",
    "django__django-11265",
    "matplotlib__matplotlib-25332",
    "scikit-learn__scikit-learn-13135",
    "pydata__xarray-6744",
    "pydata__xarray-6461",
    "sympy__sympy-15017",
    "django__django-13417",
    "matplotlib__matplotlib-24870",
    "django__django-15368",
    "django__django-11095",
    "django__django-15554",
    "pydata__xarray-6992",
    "django__django-15863",
    "django__django-13363",
    "sympy__sympy-13852",
    "django__django-14017",
    "pylint-dev__pylint-4661",
]


def write_instances_jsonl(output_path: Path, instance_ids: Sequence[str]) -> int:
    """Write the selected SWE-bench Verified instances to a JSONL file.

    Loads the ``test`` split of ``princeton-nlp/SWE-Bench_Verified``, keeps the
    rows whose ``instance_id`` is in *instance_ids*, converts each row with
    ``SimpleBatchInstance.from_swe_bench(row).model_dump()`` and writes one
    JSON object per line. Rows are written in the order the dataset yields
    them, not in the order of *instance_ids*.

    Args:
        output_path: Destination file. Missing parent directories are created.
        instance_ids: Instance IDs to select; duplicates are ignored.

    Returns:
        The number of instances written. This is smaller than the number of
        requested IDs when some are absent from the dataset; in that case a
        warning naming the missing IDs is emitted.
    """
    import warnings

    output_path.parent.mkdir(parents=True, exist_ok=True)
    dataset = load_dataset("princeton-nlp/SWE-Bench_Verified", split="test")  # type: ignore
    wanted = set(instance_ids)
    selected = [row for row in dataset if row["instance_id"] in wanted]  # type: ignore[index]

    # Surface requested IDs that matched nothing; otherwise the only symptom
    # is a smaller count with no hint about which instances are absent.
    missing = sorted(wanted - {row["instance_id"] for row in selected})  # type: ignore[index]
    if missing:
        warnings.warn(
            f"{len(missing)} requested instance(s) not found in dataset: {', '.join(missing)}",
            stacklevel=2,
        )

    objs = [SimpleBatchInstance.from_swe_bench(row).model_dump() for row in selected]
    # Terminate every record (including the last) with a newline so the file
    # is well-formed JSONL, and pin the encoding instead of using the locale's.
    output_path.write_text("".join(json.dumps(o) + "\n" for o in objs), encoding="utf-8")
    return len(objs)


def parse_args() -> argparse.Namespace:
    """Parse the script's command-line options.

    Returns:
        A namespace with one attribute, ``output`` (a ``Path``), which
        defaults to ``tool_gen/evaluation/data/nebius50.jsonl`` when
        ``--output`` is not given.
    """
    default_destination = Path("tool_gen/evaluation/data/nebius50.jsonl")
    cli = argparse.ArgumentParser(description="Write nebius50 SWE-bench instances to JSONL")
    cli.add_argument(
        "--output",
        help="Path to write JSONL (SimpleBatchInstance per line)",
        default=default_destination,
        type=Path,
    )
    options = cli.parse_args()
    return options


def main() -> None:
    """Script entry point: export ``INSTANCES`` to the ``--output`` path.

    Reads the destination from the command line, writes the instances there
    and prints how many were written.
    """
    destination = parse_args().output
    written = write_instances_jsonl(destination, INSTANCES)
    print(f"Wrote {written} instances to {destination}")


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
