import json
from datasets import load_dataset
from pathlib import Path



# Filter the SWE-smith dataset down to a chosen set of instances.
#
# Reads the wanted instance IDs from a JSONL file (one JSON object per line,
# each carrying an "instance_id" key), loads the SWE-bench/SWE-smith "train"
# split, and writes every matching record as one JSON line to the output file.

# Path to the JSONL file listing the desired instance IDs (from the timing file).
ids_jsonl = Path("../SWE-smith/old_execution_times_from_logs_top_100_1_per_repo.jsonl")
with open(ids_jsonl, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. an extra trailing newline) so json.loads does not
    # raise on an empty string.
    wanted_ids = {json.loads(line)["instance_id"] for line in f if line.strip()}

# Load the full SWE-smith dataset from HuggingFace
print("Loading SWE-bench/SWE-smith from HuggingFace...")
ds = load_dataset("SWE-bench/SWE-smith", split="train")

filtered_jsonl = Path("tool_gen/filtered_swe_smith.jsonl")
# Create the output directory up front; otherwise open() fails with
# FileNotFoundError when "tool_gen" does not exist yet.
filtered_jsonl.parent.mkdir(parents=True, exist_ok=True)

count = 0
found_ids = set()  # IDs actually seen in the dataset, used to report misses
with open(filtered_jsonl, "w", encoding="utf-8") as fout:
    for obj in ds:
        instance_id = obj["instance_id"]
        if instance_id in wanted_ids:
            fout.write(json.dumps(obj) + "\n")
            found_ids.add(instance_id)
            count += 1

# Surface requested IDs that never appeared in the dataset instead of
# silently producing a shorter file.
missing_count = len(wanted_ids - found_ids)
if missing_count:
    print(f"Warning: {missing_count} requested instance IDs were not found in the dataset.")

print(f"Filtered file written to {filtered_jsonl} with {count} instances.")