import os
import json
from tqdm import tqdm
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import generate_repo_id


def load_json(in_file):
    """Read the UTF-8 encoded JSON file at ``in_file`` and return the decoded object."""
    with open(in_file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)


def load_jsonl(in_file):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Each non-blank line is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        in_file: Path of the ``.jsonl`` file to read.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    datas = []
    with open(in_file, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            # json.loads rejects empty input, so ignore separator/trailing blank lines.
            if not line.strip():
                continue
            datas.append(json.loads(line))
    return datas


def save_jsonl(datas, out_file):
    """Write every item of ``datas`` to ``out_file`` as one JSON document per line.

    The file is opened in write mode (existing content is replaced), encoded
    as UTF-8, and non-ASCII characters are written unescaped.
    """
    with open(out_file, mode="w", encoding="utf-8") as sink:
        for record in tqdm(datas):
            serialized = json.dumps(record, ensure_ascii=False)
            sink.write(serialized + "\n")


def has_duplicate_api_endpoints(api_endpoints):
    """Return True if two endpoints share the same HTTP method and path.

    Each endpoint is a mapping that is read through its ``"path"`` and
    ``"method"`` keys. The method is compared case-insensitively and falls
    back to ``"GET"`` when the key is missing or its value is ``None``; a
    missing path is treated as the empty string.

    Args:
        api_endpoints: Iterable of endpoint mappings.

    Returns:
        True as soon as a repeated ``(method, path)`` pair is found,
        otherwise False (including for an empty iterable).
    """
    # A set gives O(1) membership checks, keeping the whole scan linear.
    seen = set()
    for endpoint in api_endpoints:
        path = endpoint.get("path", "")
        method = endpoint.get("method", "GET")
        if method is None:
            # An explicit null method would otherwise crash on .upper().
            method = "GET"
        identifier = f"{method.upper()} {path}"
        if identifier in seen:
            return True
        seen.add(identifier)
    return False


def filter_nestjs_github_repos(orig_jsonl_file, out_jsonl_file, log_root_dir, quality_thresh, page_thresh, api_thresh):
    """Select analysed repositories that pass the thresholds and save them as JSONL.

    Every sub-directory of ``log_root_dir`` is treated as one sample whose id
    is the directory name. A sample is considered only if the directory
    contains a ``finished.json`` file. It is kept when its quality score,
    number of frontend pages and number of backend API endpoints all reach
    the given thresholds and no API endpoint is listed twice.

    Kept samples are then enriched with the matching record from
    ``orig_jsonl_file``: a record matches when ``generate_repo_id`` applied to
    its ``"url"`` equals the sample id. Fields of the original record
    overwrite same-named fields of the sample. Kept samples without a
    matching original record are still written.

    Args:
        orig_jsonl_file: JSONL file with the original repository records;
            each record must have a ``"url"`` key.
        out_jsonl_file: Destination JSONL file (overwritten).
        log_root_dir: Directory holding one sub-directory per sample.
        quality_thresh: Minimum ``summary["qualityScore"]`` (inclusive).
        page_thresh: Minimum number of frontend pages (inclusive).
        api_thresh: Minimum number of API endpoints (inclusive).

    Raises:
        KeyError: If a ``finished.json`` lacks ``"summary"`` or
            ``"qualityScore"``, or an original record lacks ``"url"``.
    """
    log_dirs = [
        candidate
        for candidate in (os.path.join(log_root_dir, name) for name in os.listdir(log_root_dir))
        if os.path.isdir(candidate)
    ]
    print(f"Total num of samples: {len(log_dirs)}")
    filtered_samples = []
    for log_dir in tqdm(log_dirs):
        result_file = os.path.join(log_dir, "finished.json")
        if not os.path.isfile(result_file):
            # Sample has no finished result file; nothing to evaluate.
            continue
        data = load_json(result_file)
        data["id"] = os.path.basename(log_dir)
        # Drop the "content" field (if any) so it is not carried into the output.
        data.pop("content", None)
        summary = data["summary"]
        quality_score = summary["qualityScore"]
        # Use `or` fallbacks: a key that is present with a JSON null value
        # bypasses dict.get defaults and would break the chained lookups.
        frontend_plan = summary.get("frontendPlan") or {}
        backend_plan = summary.get("backendPlan") or {}
        page_num = len(frontend_plan.get("pages") or [])
        api_endpoints = backend_plan.get("apiEndpoints") or []
        api_num = len(api_endpoints)
        has_duplicate = has_duplicate_api_endpoints(api_endpoints)
        print(f"{data['id']}: quality score {quality_score}; page num {page_num}; api num {api_num}; duplicate api: {has_duplicate}")
        if quality_score >= quality_thresh and page_num >= page_thresh and api_num >= api_thresh and not has_duplicate:
            filtered_samples.append(data)
    print(f"Filtered samples: {len(filtered_samples)}")

    # Index by sample id so original records can be matched in O(1).
    filtered_samples_dict = {data["id"]: data for data in filtered_samples}

    orig_datas = load_jsonl(orig_jsonl_file)
    for orig_data in tqdm(orig_datas):
        sample_id = generate_repo_id(orig_data["url"])
        if sample_id in filtered_samples_dict:
            # Original record fields take precedence over same-named sample fields.
            filtered_samples_dict[sample_id].update(orig_data)

    final_samples = list(filtered_samples_dict.values())

    save_jsonl(final_samples, out_jsonl_file)


if __name__ == "__main__":
    # Script entry point: filter the Qwen3-Coder analysis logs of the NestJS
    # repositories and write the samples that pass the thresholds below.
    filter_nestjs_github_repos(
        orig_jsonl_file="src/run_process_data/jsonl_files/nestjs_github-repos.jsonl",
        out_jsonl_file="src/run_process_data/jsonl_files/nestjs_github-repos_filtered-with-info_Qwen3-Coder-30B-A3B-Instruct.jsonl",
        log_root_dir="logs_root/model-Qwen3-Coder-30B-A3B-Instruct_hist-100_iter-200_compress-0.5_sum-5_nestjs_info",
        quality_thresh=3,
        page_thresh=1,
        api_thresh=2,
    )


