import os
import json
from tqdm import tqdm
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from utils import generate_repo_id


def load_json(in_file):
    """Read the UTF-8 encoded JSON file at ``in_file`` and return the decoded value."""
    with open(in_file, mode="r", encoding="utf-8") as handle:
        return json.load(handle)


def load_jsonl(in_file):
    """Load a JSON Lines file into a list.

    Each non-blank line of ``in_file`` (read as UTF-8) is parsed with
    ``json.loads`` and appended to the result in file order. Blank or
    whitespace-only lines are skipped instead of raising a decode error.

    Args:
        in_file: Path of the JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    datas = []
    with open(in_file, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            # A stray empty line (e.g. a doubled trailing newline) carries no
            # record; json.loads would reject it, so skip it.
            if not line.strip():
                continue
            datas.append(json.loads(line))
    return datas


def save_jsonl(datas, out_file):
    """Write every item of ``datas`` to ``out_file`` as one JSON document per line.

    The file is opened in write mode with UTF-8 encoding, and non-ASCII
    characters are written as-is rather than escaped.
    """
    with open(out_file, mode="w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in tqdm(datas)
        )


def filter_nextjs_github_repos(orig_jsonl_file, out_jsonl_file, log_root_dir, quality_thresh, page_thresh):
    """Keep the analysed repos that pass the thresholds and write them as JSONL.

    Every sub-directory of ``log_root_dir`` is one sample whose id is the
    directory name. A sample is kept when its ``finished.json`` exists,
    ``summary["qualityScore"] >= quality_thresh`` and the number of entries in
    ``summary["frontendPlan"]["pages"]`` is ``>= page_thresh``. The ``content``
    key of each result is dropped. Kept samples are then updated with the
    record from ``orig_jsonl_file`` whose ``generate_repo_id(record["url"])``
    equals the sample id (fields of that record overwrite same-named fields),
    and all kept samples are written to ``out_jsonl_file``.

    Args:
        orig_jsonl_file: JSONL file of original repo records; each must have a "url".
        out_jsonl_file: Destination JSONL file for the filtered samples.
        log_root_dir: Directory containing one sub-directory per sample.
        quality_thresh: Minimum ``qualityScore`` (inclusive).
        page_thresh: Minimum number of frontend pages (inclusive).
    """
    # os.listdir returns names in arbitrary order; sort them so the output
    # file has a reproducible order across runs and machines.
    candidate_dirs = (os.path.join(log_root_dir, name) for name in sorted(os.listdir(log_root_dir)))
    log_dirs = [path for path in candidate_dirs if os.path.isdir(path)]
    print(f"Total num of samples: {len(log_dirs)}")

    # Directory names are unique, so the id -> sample mapping can be built directly.
    filtered_samples_dict = {}
    for log_dir in tqdm(log_dirs):
        result_file = os.path.join(log_dir, "finished.json")
        if not os.path.isfile(result_file):
            continue
        data = load_json(result_file)
        data["id"] = os.path.basename(log_dir)
        data.pop("content", None)

        # One malformed result must not abort the whole run: skip it loudly.
        summary = data.get("summary")
        quality_score = summary.get("qualityScore") if isinstance(summary, dict) else None
        if not isinstance(quality_score, (int, float)):
            print(f"{data['id']}: skipped, no numeric summary.qualityScore in {result_file}")
            continue

        # dict.get(key, default) only falls back when the key is absent, so an
        # explicit JSON null for "frontendPlan"/"pages" needs the `or` fallback.
        frontend_plan = summary.get("frontendPlan") or {}
        page_num = len(frontend_plan.get("pages") or [])

        print(f"{data['id']}: quality score {quality_score}; page num {page_num}")
        if quality_score >= quality_thresh and page_num >= page_thresh:
            filtered_samples_dict[data["id"]] = data
    print(f"Filtered samples: {len(filtered_samples_dict)}")

    # Enrich the kept samples with the fields of their original repo record.
    for orig_data in tqdm(load_jsonl(orig_jsonl_file)):
        sample = filtered_samples_dict.get(generate_repo_id(orig_data["url"]))
        if sample is not None:
            sample.update(orig_data)

    save_jsonl(list(filtered_samples_dict.values()), out_jsonl_file)


def main():
    """Run the repo filter with the hard-coded input/output paths and thresholds."""
    filter_nextjs_github_repos(
        orig_jsonl_file="src/run_process_data/jsonl_files/nextjs_github-repos.jsonl",
        out_jsonl_file="src/run_process_data/jsonl_files/nextjs_github-repos_filtered-with-info_Qwen3-Coder-30B-A3B-Instruct.jsonl",
        log_root_dir="logs_root/model-Qwen3-Coder-30B-A3B-Instruct_hist-100_iter-200_compress-0.5_sum-5_nextjs_info",
        quality_thresh=3,
        page_thresh=1,
    )


# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()