import re
import html
import os
from gensim.models import Word2Vec
import numpy as np
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
from tqdm import tqdm

# Tokenization helper
def tokenize_expression(expression: str) -> list:
    """Split an expression string into tokens.

    The text is first cut into runs of word characters (``\\w+``) and single
    non-whitespace characters; commas, parentheses, square brackets and
    double quotes are then discarded.

    Args:
        expression: The text to tokenize.

    Returns:
        The remaining tokens, in their original order.
    """
    # Punctuation that is dropped from the result: , ( ) [ ] "
    dropped = frozenset(',()[]"')
    kept = []
    for piece in re.findall(r'\w+|\S', expression):
        if piece in dropped:
            continue
        kept.append(piece)
    return kept

def parse_dot_file(dot_content):
    """Extract nodes, edges and per-node token lists from DOT graph text.

    Nodes are recognised as ``"<digits>" [label = <...>`` and edges as
    ``"<digits>" -> "<digits>" [ label = "..."]``.

    Args:
        dot_content: Full text of a ``.dot`` file.

    Returns:
        A 4-tuple ``(nodes, edges, node_alltokens, node_id_to_index)``:

        * ``nodes`` -- list of ``(node_id, clean_label, first_word, node_text)``.
          ``first_word`` is the upper-case word found before the first comma
          of the label, or ``"METHOD"`` when there is none.
        * ``edges`` -- list of ``(source, target, label, edge_type, edge_text)``.
          ``edge_type`` is the upper-case word found before the first colon
          of the label, or ``"UNKNOWN"`` when there is none.
        * ``node_alltokens`` -- one token list per node (produced by
          ``tokenize_expression``), in the same order as ``nodes``.
        * ``node_id_to_index`` -- maps a node id to its position in ``nodes``
          (the last position if an id occurs more than once).
    """
    nodes = []
    edges = []
    node_alltokens = []
    node_id_to_index = {}

    # Regular expressions matching node and edge declarations.
    node_pattern = r'"(\d+)"\s+\[label\s*=\s*<([^>]+)>'
    edge_pattern = r'"(\d+)"\s*->\s*"(\d+)"\s*\[ label\s*=\s*"([^"]+)"\]'

    print("Starting parse")

    # --- nodes ---
    for match in re.finditer(node_pattern, dot_content):
        node_id = match.group(1)
        label = match.group(2)
        # Decode HTML entities. The capture group stops at the first '>', so
        # an opening <SUB> tag can only appear as the fragment '<SUB'; drop it.
        clean_label = html.unescape(label)
        clean_label = re.sub(r'<SUB', '', clean_label)
        # Look for an upper-case word (e.g. IDENTIFIER) before the first comma.
        first_word_match = re.search(r'\b[A-Z_]+\b', clean_label.split(',')[0])
        if first_word_match:
            first_word = first_word_match.group(0)
            # Skip the word plus one extra character; for labels shaped like
            # "(TYPE,code)" that extra character is the opening parenthesis.
            # NOTE(review): assumes the word starts at index 1 -- confirm for
            # other label shapes.
            remaining_code = clean_label[len(first_word) + 1:].strip()
        else:
            # No upper-case word found: default the type and keep the whole
            # label as the node text.
            first_word = "METHOD"
            remaining_code = clean_label
        # Strip leading '(' / ',' and a single trailing ')'.
        remaining_code = re.sub(r'^[,(]+', '', remaining_code)
        # `count` is passed by keyword: positional use is deprecated in 3.13.
        node_text = re.sub(r'\)$', '', remaining_code, count=1)

        # Record where this node sits in `nodes`.
        node_id_to_index[node_id] = len(nodes)
        # Collect the node's tokens (used later for word2vec training).
        node_alltokens.append(tokenize_expression(node_text))
        nodes.append((node_id, clean_label, first_word, node_text))

    # --- edges ---
    for match in re.finditer(edge_pattern, dot_content):
        source, target, label = match.groups()
        edge_type_match = re.search(r'\b[A-Z_]+\b', label.split(':')[0])
        if edge_type_match:
            edge_type = edge_type_match.group(0)
            # Skip the type word plus one extra character (the ':' when the
            # label is shaped like "TYPE: text").
            edge_code = label[len(edge_type) + 1:]
        else:
            # Previously this case crashed with AttributeError on None.
            # Mirror the node fallback: default type, whole label as text.
            edge_type = "UNKNOWN"
            edge_code = label
        # Strip surrounding whitespace and any leading ':' / '('.
        edge_text = re.sub(r'^[:(]+', '', edge_code.strip())
        edges.append((source, target, label, edge_type, edge_text))

    return nodes, edges, node_alltokens, node_id_to_index

def save_to_file(unique_words, file_path):
    """Write a word-to-index mapping to a text file.

    Each entry becomes one line of the form ``"<word>": <index>``, in the
    mapping's iteration order. An existing file is overwritten.

    Args:
        unique_words: Mapping of word to index.
        file_path: Destination path.
    """
    with open(file_path, 'w') as out:
        out.writelines(
            f'"{token}": {idx}\n' for token, idx in unique_words.items()
        )

def collect_unique_words(directory_path, unique_words, counter):
    """Add the node and edge type words of every ``.dot`` file in a directory.

    Each file is parsed with ``parse_dot_file``; every node ``first_word``
    and edge ``edge_type`` not yet present in ``unique_words`` is assigned
    the next free index.

    Args:
        directory_path: Directory whose ``.dot`` files are scanned
            (non-recursive).
        unique_words: Mapping of word to index; updated in place.
        counter: Next index to hand out.

    Returns:
        ``(unique_words, counter)`` -- the same mapping object and the
        updated next free index.
    """
    # os.listdir returns names in arbitrary order; sort them so the indices
    # assigned to new words do not depend on filesystem enumeration order.
    files = sorted(f for f in os.listdir(directory_path) if f.endswith('.dot'))
    for filename in tqdm(files, desc=f"Processing {directory_path}"):
        file_path = os.path.join(directory_path, filename)

        # Read the file's contents.
        with open(file_path, 'r') as file:
            dot_content = file.read()
        # Parse the .dot text.
        nodes, edges, _, _ = parse_dot_file(dot_content)

        # Register node type words.
        for _, _, first_word, _ in nodes:
            if first_word not in unique_words:
                unique_words[first_word] = counter
                counter += 1

        # Register edge type words.
        for _, _, _, edge_type, _ in edges:
            if edge_type not in unique_words:
                unique_words[edge_type] = counter
                counter += 1

    return unique_words, counter

def load_from_file(file_path):
    """Load a word-to-index mapping written as ``"<word>": <index>`` lines.

    Blank lines are ignored. Each remaining line is split at its last
    ``': '`` so that a word which itself contains ``': '`` is still read
    correctly. Surrounding double quotes are stripped from the word.

    Args:
        file_path: Path of the file to read.

    Returns:
        ``(unique_words, counter)`` where ``unique_words`` maps word to
        index and ``counter`` is one more than the largest index seen
        (``0`` if the file has no entries).

    Raises:
        ValueError: If a non-blank line has no ``': '`` separator or its
            index is not an integer.
    """
    unique_words = {}
    counter = 0
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing blank line).
                continue
            # Split on the LAST separator only; the index never contains ': '.
            word, index = line.rsplit(': ', 1)
            index = int(index)
            unique_words[word.strip('"')] = index
            counter = max(counter, index + 1)
    return unique_words, counter

# Root directory holding the folder_<n> inputs, and the vocabulary file to update.
base_folder_path = './data/split-file/'
out_file_path = './data/token.txt'

# Resume from a previously saved vocabulary when one exists; otherwise start empty.
unique_words, counter = (
    load_from_file(out_file_path) if os.path.exists(out_file_path) else ({}, 0)
)

# Process folder_1 through folder_24 (range(1, 25)), extending the vocabulary.
for i in range(1, 25):
    folder_path = os.path.join(base_folder_path, f'folder_{i}')
    unique_words, counter = collect_unique_words(folder_path, unique_words, counter)
    print("一处理一个文件夹")

# Write the merged vocabulary back once every folder has been processed.
save_to_file(unique_words, out_file_path)
