import nltk
from nltk.corpus import words
import os
import json
from concurrent.futures import ProcessPoolExecutor

# Download the English word corpus once when the script starts
# nltk.download('words', quiet=True)
nltk.data.path.append(r'C:\Users\Administrator\AppData\Roaming\nltk_data')  # replace with the actual path

# Load the English word list and convert it to a set; it is used for
# membership tests in delete_ordinaryWord.
english_words = set(words.words())


def find_type_name_pairs(node):
    """Collect every ``type``/``name`` pair found in a nested structure.

    Walks ``node`` depth-first in pre-order. Every dict that contains both a
    ``'type'`` key and a ``'name'`` key contributes one
    ``{'type': ..., 'name': ...}`` entry; the walk then continues into all of
    the dict's values. Lists are walked element by element. Any other value
    (strings, numbers, ``None``, ...) is ignored.

    Args:
        node: Arbitrarily nested combination of dicts and lists, e.g. the
            result of ``json.load``.

    Returns:
        list[dict]: The collected pairs, in the order the dicts are
        encountered (parent before its children, children in their
        original order).
    """
    pairs = []
    # Use an explicit stack rather than recursion so that very deeply nested
    # input cannot exceed the interpreter's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, dict):
            if 'type' in current and 'name' in current:
                pairs.append({'type': current['type'], 'name': current['name']})
            # Push children reversed so they are popped in original order.
            pending.extend(reversed(list(current.values())))
        elif isinstance(current, list):
            pending.extend(reversed(current))

    return pairs


# def get_parameterName():
#     def delete_ordinaryWord(parameter_names):
#         # Download the English word corpus
#         nltk.download('words')
#         string_list = list(parameter_names)
#
#         # Get the English word list
#         english_words = set(words.words())
#         filtered_list = [word for word in string_list if word.lower() not in english_words]
#         return set(filtered_list)
#
#     # def get_ast_parameter_function():
#     ast_json_dir = r'E:\2024\experiment_code_clone\total4\source_code_no_comments\ast'
#     excluded_types = {'ElementaryTypeName', 'Identifier', 'PragmaDirective', 'ElementaryTypeNameExpression'}
#     parameter_names = set()
#     final_pairs = []
#
#     for filename in os.listdir(ast_json_dir):
#         if filename.endswith('.json'):
#             with open(os.path.join(ast_json_dir, filename), 'r', encoding='utf-8') as f:
#                 ast_data = json.load(f)
#             type_name_pairs = find_type_name_pairs(ast_data)
#             for pair in type_name_pairs:
#                 if pair['type'] not in excluded_types:
#                     final_pairs.append(pair)
#                     parameter_names.add(pair['name'])
#
#     parameter_names = delete_ordinaryWord(parameter_names)
#
#     with open('./ast_parameterName.json', 'w') as f:
#         json.dump(list(parameter_names), f)


def delete_ordinaryWord(word):
    """Tell whether ``word`` should be kept, i.e. is not an ordinary English word.

    Despite its name this function deletes nothing: it is a predicate.

    Args:
        word: String to check; it is lower-cased before the lookup.

    Returns:
        bool: ``True`` when the lower-cased word is absent from the
        module-level ``english_words`` set, ``False`` when it is present.
    """
    lowered = word.lower()
    is_ordinary = lowered in english_words
    return not is_ordinary


def process_words_parallel(parameter_names):
    """Filter out ordinary English words using a pool of worker processes.

    Each name is checked with ``delete_ordinaryWord`` in a
    ``ProcessPoolExecutor``; names for which it returns ``True`` are kept.

    Args:
        parameter_names: Iterable of strings. It may be a one-shot iterable
            (e.g. a generator); it is materialised internally.

    Returns:
        set: The names that ``delete_ordinaryWord`` reported as not being
        ordinary English words. Empty input yields an empty set without
        starting any worker process.
    """
    # Materialise once: the names are consumed twice below (by the pool and
    # by zip), which would silently drop everything for a one-shot iterable.
    names = list(parameter_names)
    if not names:
        return set()

    # Send the work in batches; with the default chunksize of 1 every single
    # word becomes its own inter-process task.
    chunksize = max(1, len(names) // (4 * (os.cpu_count() or 1)))
    with ProcessPoolExecutor() as executor:
        keep_flags = executor.map(delete_ordinaryWord, names, chunksize=chunksize)
        # executor.map yields results in input order, so zip pairs correctly.
        return {name for name, keep in zip(names, keep_flags) if keep}


def get_parameterName_no_simple_words(
        ast_json_dir=r'E:\2024\experiment_code_clone\total4\source_code_no_comments\ast',
        output_path='./ast_parameterName_no_simple_word.json',
        min_name_length=9):
    """Collect long, non-dictionary names from JSON AST files and save them.

    Reads every ``*.json`` file in ``ast_json_dir``, gathers the ``name`` of
    each type/name pair whose ``type`` is not in the excluded set, keeps only
    string names of at least ``min_name_length`` characters, removes ordinary
    English words via ``process_words_parallel`` and writes the remaining
    names to ``output_path`` as a JSON list.

    Args:
        ast_json_dir: Directory containing the JSON files to scan.
        output_path: File the resulting JSON list is written to.
        min_name_length: Minimum number of characters a name must have to be
            kept. The default of 9 matches the original ``len(name) > 8``.

    Returns:
        None. The result is written to ``output_path``.

    Raises:
        OSError: If the directory or a file cannot be read, or the output
            cannot be written.
        json.JSONDecodeError: If a ``*.json`` file is not valid JSON.
    """
    excluded_types = {'ElementaryTypeName', 'Identifier', 'PragmaDirective', 'ElementaryTypeNameExpression'}
    parameter_names = set()

    for filename in os.listdir(ast_json_dir):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(ast_json_dir, filename), 'r', encoding='utf-8') as f:
            ast_data = json.load(f)
        for pair in find_type_name_pairs(ast_data):
            node_type = pair['type']
            # Only string types can match the excluded set; checking first
            # avoids a TypeError on unhashable values such as dicts.
            if isinstance(node_type, str) and node_type in excluded_types:
                continue
            name = pair['name']
            # Skip None and any non-string name: they cannot be added to the
            # set or lower-cased later, and would crash the length check.
            if isinstance(name, str) and len(name) >= min_name_length:
                parameter_names.add(name)

    parameter_names = process_words_parallel(parameter_names)

    with open(output_path, 'w', encoding='utf-8') as f:
        # Sort so the output file is reproducible between runs.
        json.dump(sorted(parameter_names), f)


def replace_substrings(contents, substrings, replacement):
    """Replace every occurrence of each given substring with ``replacement``.

    The substrings are processed one after another, in iteration order, each
    on the result of the previous replacement. A later substring can
    therefore also match text that an earlier replacement inserted.

    Args:
        contents: Text to process.
        substrings: Iterable of substrings to replace.
        replacement: Text substituted for each occurrence.

    Returns:
        The text after all replacements have been applied.
    """
    updated = contents
    for target in substrings:
        updated = updated.replace(target, replacement)
    return updated


if __name__ == '__main__':
    # Script entry point: collect the names and write the filtered JSON list.
    get_parameterName_no_simple_words()
    # get_parameterName()

    # contents = "a bcd e f g h I de fip"
    # substrings = ['i', 'f', 'e', 'b']
    #
    # # Perform the replacement
    # updated_contents = replace_substrings(contents, substrings, "parameterName")
    #
    # print(updated_contents)
