import json
import os
from unidiff import PatchSet
from collections import Counter
from tqdm import tqdm
from localize.language_specify import Language, LanguageManager, LANGUAGE_TO_EXTENSIONS

def show_counter(counter):
    """Print every entry of a count mapping, followed by the sum of all counts.

    :param counter: Mapping from item to its count (e.g. a ``collections.Counter``).
    """
    total_items = 0
    for item in counter:
        count = counter[item]
        print(f"{item}: {count}.")
        # Accumulate while printing so the mapping is only walked once.
        total_items += count
    print(f"Total: {total_items}.")

def extract_patches(patch, language) -> tuple[str, str]:
    """
    Split a unified diff into a fix (gold) patch and a test patch.

    Every file in the diff is assigned to exactly one of the two outputs. A
    file goes to the test patch when ``LanguageManager(language).is_test_path``
    accepts its path, or when its path does not end with the extension(s)
    registered for ``language`` in ``LANGUAGE_TO_EXTENSIONS``. All remaining
    files go to the fix patch.

    Args:
        patch (str): unified diff text, parsed with ``unidiff.PatchSet``
        language: key into ``LANGUAGE_TO_EXTENSIONS``, also handed to
            ``LanguageManager`` (the caller in this file passes a
            ``Language`` member)
    Return:
        patch_fix (str): concatenated diffs of the non-test source files
        patch_test (str): concatenated diffs of test files and of files whose
            extension does not match ``language``
    """
    fix_parts = []
    test_parts = []
    language_attr = LanguageManager(language)
    # Iterating a PatchSet yields one entry per changed file (not per hunk).
    for patched_file in PatchSet(patch):
        path = patched_file.path
        # TODO: files with a non-matching extension are put into the test patch
        # for now. During code parsing both test files and non-matching files
        # end up excluded anyway (test-file filtering / parsing errors).
        if language_attr.is_test_path(path) or not path.endswith(LANGUAGE_TO_EXTENSIONS[language]):
            test_parts.append(str(patched_file))
        else:
            fix_parts.append(str(patched_file))
    # Join once at the end instead of growing two strings inside the loop.
    return "".join(fix_parts), "".join(test_parts)

def load_crawled_data_to_swe(jsonl_path):
    """Load crawled data from a JSONL file and convert it to the SWE format.

    Each non-blank line is parsed as one JSON object. A record is converted
    only when all of the following hold:

    * it has an ``info`` entry containing ``diff``;
    * its ``patch_analysis.language`` is one of the supported languages;
    * it has a ``repo`` entry;
    * the fix patch returned by ``extract_patches`` is not blank.

    Per-language record counts before and after filtering are printed to
    stdout.

    :param jsonl_path: Path to the JSONL file containing crawled data.
    :return: A list of dictionaries in the SWE format.
    :raises KeyError: If a record has no ``patch_analysis.language``, or a
        record that passes the filters lacks ``pr_url``, ``base_sha`` or
        ``problem_statement``.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    swe_data = []

    supported_language = {
        'Python': Language.PYTHON,
        'Go': Language.GO,
        'JavaScript': Language.JAVASCRIPT,
        'Ruby': Language.RUBY,
        'PHP': Language.PHP,
        'Java': Language.JAVA,
        'TypeScript': Language.TYPESCRIPT,
        'C#': Language.C_SHARP,
        'C++': Language.CPP,
        'C': Language.C,
    }

    original_count = Counter()
    return_count = Counter()

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(jsonl_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing in json.loads.
                continue
            data = json.loads(line)

            language_name = data["patch_analysis"]["language"]
            # Counted before any filtering so the report covers every record.
            original_count[language_name] += 1

            if "info" not in data:
                continue
            if "diff" not in data["info"]:
                continue
            if language_name not in supported_language:
                continue
            if 'repo' not in data:
                # Checked before parsing the diff: the record would be
                # dropped anyway, so skip the expensive patch split.
                continue

            lang = supported_language[language_name]
            patch, test_patch = extract_patches(data["info"]["diff"], lang)
            if not patch.strip():
                # Nothing left for the fix patch (only test/unmatched files).
                continue

            return_count[language_name] += 1
            swe_data.append({
                "repo": data["repo"],
                "language": lang.value,
                # "<owner>__<name>-<last path segment of the PR URL>"
                "instance_id": data['repo'].replace("/", "__") + "-" + data["pr_url"].split("/")[-1],
                "base_commit": data["base_sha"],
                "problem_statement": data["problem_statement"],
                "hints_text": "",
                "patch": patch,
                "test_patch": test_patch,
            })

    print("Original data:")
    show_counter(original_count)
    print("Data after initial filtering")
    show_counter(return_count)
    return swe_data
