import os
import json
import re

# Load the list of substrings that should be replaced in the source code.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding.
with open('./ast_parameterName_no_simple_word.json', 'r', encoding='utf-8') as f:
    substrings = json.load(f)

# Key the substrings in a dict to speed up lookups; every value is the
# "parameterName" placeholder.
substrings_dict = {s: "parameterName" for s in substrings}


def format_code(content):
    """Pad delimiter characters with spaces and collapse whitespace runs.

    Each of ``{ } , ( ) ! ^ [ ] ;`` is surrounded by a space on both sides,
    then every run of whitespace (including newlines and tabs) is reduced to
    a single space. Leading/trailing spaces are not stripped.
    """
    delimiter_pattern = re.compile(r'([\{\}\,\(\)\!\^\[\]\;])')
    whitespace_pattern = re.compile(r'\s+')

    # Step 1: isolate every delimiter so it becomes its own token.
    padded = delimiter_pattern.sub(r' \1 ', content)
    # Step 2: squeeze the extra whitespace introduced above (and any original).
    return whitespace_pattern.sub(' ', padded)


# Earlier, superseded version of format_code (one substitution per character),
# kept for reference only:
# def format_code(content):
#     # Insert a space before and after each of the listed characters
#     content = re.sub(r'(\{)', r' \1 ', content)
#     content = re.sub(r'(\})', r' \1 ', content)
#     content = re.sub(r'(\,)', r' \1 ', content)
#     content = re.sub(r'(\))', r' \1 ', content)
#     content = re.sub(r'(\()', r' \1 ', content)
#     content = re.sub(r'(\!)', r' \1 ', content)
#     content = re.sub(r'(\^)', r' \1 ', content)
#     content = re.sub(r'(\[)', r' \1 ', content)
#     content = re.sub(r'(\])', r' \1 ', content)
#     content = re.sub(r'(\;)', r' \1 ', content)
#
#     # Collapse redundant whitespace
#     content = re.sub(r'\s+', ' ', content)
#
#     return content

def replace_substrings(content, substrings_dict, replacement):
    """Replace every standalone occurrence of a known substring in ``content``.

    Matching is case-insensitive and only succeeds when the occurrence is not
    directly preceded or followed by a word character, so ``amount`` is
    replaced but ``amounts`` is not.

    Args:
        content: Text to rewrite.
        substrings_dict: Collection whose iteration yields the substrings to
            look for (for a dict, its keys). Empty strings are ignored.
        replacement: Replacement passed to ``re.sub`` (a template string or a
            callable).

    Returns:
        The rewritten text; ``content`` unchanged when there is nothing to
        search for.
    """
    # Regex alternation is leftmost-first, so try longer candidates first;
    # otherwise a short key that is a prefix of a longer one (e.g. "a" vs
    # "a$b") would win and leave a partial replacement behind.
    candidates = sorted((s for s in substrings_dict if s), key=len, reverse=True)

    # With no candidates the alternation would be empty and match the empty
    # string, inserting the replacement between non-word characters.
    if not candidates:
        return content

    # Build one alternation of the escaped candidates, guarded on both sides
    # so that matches cannot start or end inside a longer word.
    alternation = '|'.join(re.escape(s) for s in candidates)
    pattern = re.compile(r'(?<!\w)(' + alternation + r')(?!\w)', re.IGNORECASE)

    # Substitute every match with the replacement.
    return pattern.sub(replacement, content)


def process_file_content(content, substrings_dict, replacement):
    """Normalize ``content`` with ``format_code``, then apply ``replace_substrings``.

    The formatting step runs first so that the substitution operates on the
    space-separated form of the text.
    """
    return replace_substrings(format_code(content), substrings_dict, replacement)


def extract_words_from_files(content):
    """Return the unique whitespace-separated tokens of one file's content.

    Despite the name, this takes the text of a single file. The text is
    processed with ``process_file_content`` using the module-level
    ``substrings_dict`` and the ``"parameterName"`` placeholder, then split
    on whitespace.

    Returns:
        A list of the distinct tokens in sorted order. Sorting makes the
        result reproducible: iterating a set of strings depends on hash
        randomization, which would otherwise change the order (and the JSON
        written from it) between runs.
    """
    updated_contents = process_file_content(content, substrings_dict, "parameterName")
    return sorted(set(updated_contents.split()))


if __name__ == "__main__":
    # Script entry point: for every ``.sol`` file in the source directory,
    # extract its word list and write it as JSON into ./source_code_words
    # under the same base name.
    source_code_dir = r'E:\2024\experiment_code_clone\total4\all_features\source_code_no_comments'
    for file in os.listdir(source_code_dir):
        # Only Solidity sources are processed; skip everything else.
        if not file.endswith('.sol'):
            continue

        with open(os.path.join(source_code_dir, file), 'r', encoding='utf-8') as src:
            contents = src.read()
        updated_words = extract_words_from_files(contents)

        # Output directory is created lazily, once there is something to write.
        os.makedirs('./source_code_words', exist_ok=True)
        # Swap the 4-character ".sol" suffix for ".json".
        json_name = file[:-4] + '.json'
        print("file[:-4] + '.json'", json_name)
        with open(os.path.join('./source_code_words', json_name), 'w') as dst:
            json.dump(updated_words, dst)
