import json
import os  # used for path handling, directory creation and file-existence checks
from datasets import load_from_disk, load_dataset
from get_repo_structure.get_repo_structure import get_project_structure_from_scratch

# Load the dataset.
print("加载数据")
# Alternative data sources, kept for reference:
# swe_bench_data = load_from_disk("./datasets/SWE-bench_Lite_test")
# swe_bench_data = load_dataset("princeton-nlp/SWE-bench_Lite")["test"]
# swe_bench_data = load_dataset("GMagoLi/FEA-Bench-v0.1-lite-standard")["test"]
DATA_FILE = "task_samples.jsonl"  # JSON Lines input: one JSON record per line
STRUCTURE_DIR = "repo_structures"  # directory receiving one JSON file per instance

# Read through a context manager so the file handle is closed deterministically,
# decode explicitly as UTF-8 instead of the locale default, and skip blank lines
# (e.g. a trailing empty line) that json.loads would otherwise reject.
with open(DATA_FILE, "r", encoding="utf-8") as data_file:
    swe_bench_data = [json.loads(line) for line in data_file if line.strip()]

# Make sure the output directory exists before anything is written into it.
os.makedirs(STRUCTURE_DIR, exist_ok=True)

# Process every record of the dataset, one instance at a time.
for bug in swe_bench_data:

    # Instance id: the repo name with "/" replaced by "__", a dash, and the last
    # path segment of the PR URL (presumably the PR number; verify against data).
    instance_id = bug['repo'].replace("/", "__") + "-" + bug["pr_url"].split("/")[-1]
    # Build the path of the JSON file that caches this instance's structure.
    json_file_path = os.path.join(STRUCTURE_DIR, f"{instance_id}.json")

    # Skip instances whose structure file has already been produced.
    if os.path.exists(json_file_path):
        print(f"文件 {json_file_path} 已存在, 跳过该实例.")
        continue

    # No cached file yet: generate the project structure and save it.
    print(f"处理实例 {instance_id}...")
    # NOTE(review): "playground" is presumably the scratch directory used by the
    # helper while checking out the repository; confirm against its definition.
    d = get_project_structure_from_scratch(
        bug["repo"], bug["base_sha"], instance_id, "playground"
    )

    # Write to a temporary sibling file first and rename it into place, so an
    # interrupted run never leaves a truncated JSON file that the existence
    # check above would later mistake for a finished result.
    # UTF-8 is set explicitly because ensure_ascii=False emits raw non-ASCII
    # characters, which a non-UTF-8 locale default encoding may fail to encode.
    tmp_file_path = json_file_path + ".tmp"
    with open(tmp_file_path, "w", encoding="utf-8") as json_file:
        json.dump(d, json_file, indent=4, ensure_ascii=False)
    os.replace(tmp_file_path, json_file_path)

    print(f"已保存 {json_file_path}")
