import os
import subprocess
import re
from collections import OrderedDict
import shutil

def is_valid_dot_line(line):
    """
    Tell whether a line looks like a .dot statement worth writing out.

    Empty (or None) lines and lines starting with '#' are rejected. Any other
    line is accepted only if it contains the text 'label' or the edge
    arrow '->'.
    """
    is_blank_or_comment = not line or line.startswith('#')
    if is_blank_or_comment:
        return False
    return any(marker in line for marker in ('label', '->'))

def balance_brackets(line):
    """
    Return `line` with every unclosed bracket closed.

    Handles the four pairs (), [], {} and <>. Rules:
    - an opener is copied and remembered;
    - a closer matching the most recent opener is copied and closes it;
    - a closer met while nothing is open is copied unchanged;
    - a closer that does not match the most recent opener is dropped, and
      the opener stays open;
    - at the end, closers for all still-open brackets are appended,
      innermost first.

    Only openers are ever kept on the stack. Remembering a mismatched closer
    there would block every later valid closer from matching, pushing those
    closers to the end of the line.
    """
    closer_for = {'(': ')', '[': ']', '{': '}', '<': '>'}
    closers = set(closer_for.values())

    open_stack = []
    pieces = []
    for char in line:
        if char in closer_for:
            open_stack.append(char)
            pieces.append(char)
        elif char in closers:
            if not open_stack:
                # Stray closer with nothing open: keep it as-is.
                pieces.append(char)
            elif closer_for[open_stack[-1]] == char:
                open_stack.pop()
                pieces.append(char)
            # Otherwise: mismatched closer, dropped; the opener stays open.
        else:
            pieces.append(char)

    # Close whatever is still open, innermost first.
    pieces.extend(closer_for[opener] for opener in reversed(open_stack))
    return ''.join(pieces)

def should_delete_node(label):
    """
    Decide whether a node should be dropped, based on its label text.

    Returns True only when the label contains 'empty' and also contains
    the marker '<SUB>1</SUB>'; otherwise returns False.
    """
    if 'empty' not in label:
        return False
    return re.search(r'<SUB>1</SUB>', label) is not None
 
def _parse_dot_lines(lines, nodes, edges, block_nodes, parent_map, nodes_to_remove):
    """Parse the body lines of one .dot file into the shared collections.

    `lines` are the raw lines of the file without its first and last line.
    The other arguments are mutated in place:
    - nodes: ordered mapping used as an ordered set of node statements;
    - edges: list of (parent, child, label) tuples in file order;
    - block_nodes: names of BLOCK nodes whose line contains 'empty';
    - parent_map: parent name -> list of child names, one entry per edge;
    - nodes_to_remove: names of nodes flagged by should_delete_node.
    """
    total = len(lines)
    in_block = False        # inside a multi-line (non-empty) BLOCK node
    block_node_label = ""   # text collected for that BLOCK node
    node_label = ""         # text collected for a multi-line ordinary node
    idx = 0
    while idx < total:
        line = lines[idx].strip()
        idx += 1  # from here on, idx is the index of the next unread line

        if '->' in line:
            # Edge definition. Split on the first arrow only: the label text
            # may itself contain '->', e.g. "DDG: p->x".
            parent, child_info = line.split('->', 1)
            parent = parent.strip()
            child_info = child_info.strip()

            if '[' in child_info:
                child, label = child_info.split('[', 1)
                child = child.strip()
                label = '[' + label.strip()
                # Only an unclosed label continues on the following lines.
                # Lines merged into it are consumed, so they are not parsed
                # again as separate statements.
                if not label.endswith(']'):
                    while idx < total:
                        next_line = lines[idx].strip()
                        if '[ label =' in next_line:
                            # A new labelled edge starts here: stop joining.
                            break
                        label += " " + next_line
                        idx += 1
                        if next_line.endswith(']'):
                            break
            else:
                child = child_info
                label = ''

            edges.append((parent, child, label.strip()))
            parent_map.setdefault(parent, []).append(child)
            continue

        if 'BLOCK' in line:
            if 'empty' not in line:
                # Start of a BLOCK node whose text is collected from the
                # following lines.
                in_block = True
                block_node_label = line
                continue
            # BLOCK line containing 'empty': remember it so its edges can be
            # re-routed later. It is not stored as a node.
            node_name = line.split(' ')[0].strip()
            block_nodes[node_name] = node_name
            # partition() gives '' when 'label =' is missing, instead of
            # raising IndexError and aborting the file.
            label_text = line.partition('label =')[2].strip()
            if should_delete_node(label_text):
                nodes_to_remove.add(node_name)
            continue

        if in_block:
            if '[label = <' in line:
                # The next node starts here: the BLOCK is complete. Store the
                # collected BLOCK text, then this line as its own node.
                in_block = False
                nodes[block_node_label] = None
                block_node_label = ""
                nodes[line] = None
            else:
                block_node_label += " " + line
            continue

        # Ordinary node text. A node ends when the next line starts a new
        # node or a labelled edge. End of file also counts as a boundary so
        # the last node is not lost.
        next_line = lines[idx].strip() if idx < total else None
        at_boundary = (
            next_line is None
            or '[label = <' in next_line
            or '[ label =' in next_line
        )
        if not at_boundary:
            node_label += " " + line
        elif node_label == "":
            nodes[line] = None
        else:
            # Multi-line node: store the text collected from earlier lines.
            # The current (final) line is not included, as before.
            nodes[node_label] = None
            node_label = ""


def merge_dot_files_from_folder(folder_path, min_size=2000):
    """
    Merge the node and edge statements of all .dot files in a folder.

    Every file in `folder_path` ending in '.dot' and at least `min_size`
    bytes long is read; its first and last lines are discarded and the rest
    is parsed. A file that fails to read or parse is reported and skipped.

    After parsing, edges are rewritten:
    - an edge into an 'empty' BLOCK node is replaced by edges from its parent
      to each child of that BLOCK node, keeping the original edge label;
    - an edge out of an 'empty' BLOCK node, or out of a node flagged for
      removal, is dropped;
    - every other edge is kept unchanged.

    Parameters:
        folder_path: directory containing the .dot files.
        min_size: minimum file size in bytes; smaller files are skipped.

    Returns:
        (nodes, edges): a keys view of the unique node statements in
        insertion order, and a list of (parent, child, label) tuples.
        Returns None (after printing a message) if `folder_path` is not a
        directory.
    """
    nodes = OrderedDict()    # ordered set of node statements
    edges = []               # (parent, child, label) in file order
    block_nodes = {}         # names of 'empty' BLOCK nodes
    parent_map = {}          # parent name -> list of child names
    nodes_to_remove = set()  # names of nodes flagged for removal

    if not os.path.isdir(folder_path):
        print(f"The folder path '{folder_path}' does not exist.")
        return None

    for file_name in os.listdir(folder_path):
        if not file_name.endswith('.dot'):
            continue
        file_path = os.path.join(folder_path, file_name)
        if os.path.getsize(file_path) < min_size:
            print(f"Skipping file '{file_name}' as its size is less than {min_size} bytes.")
            continue
        print(f"Processing file: {file_path}")
        try:
            with open(file_path, 'r') as f:
                # Drop the first and last line of the file.
                lines = f.readlines()[1:-1]
            _parse_dot_lines(lines, nodes, edges, block_nodes, parent_map, nodes_to_remove)
        except Exception as e:
            # Best effort: report the file and carry on with the next one.
            print(f"Error reading file {file_path}: {e}")

    # Re-route edges around 'empty' BLOCK nodes and drop edges that start at
    # a removed node.
    new_edges = []
    for parent, child, label in edges:
        if child in block_nodes:
            for child_of_block in parent_map.get(child, ()):
                new_edges.append((parent, child_of_block, label))
            continue
        if parent in nodes_to_remove or parent in block_nodes:
            continue
        new_edges.append((parent, child, label))

    return nodes.keys(), new_edges


    
def write_dot_file(output_file, nodes, edges):
    """
    Write nodes and edges to `output_file` as a single 'digraph G' graph.

    Node statements that pass is_valid_dot_line are written after being run
    through balance_brackets. Each (parent, child, label) edge is written as
    'parent -> child label' if that text passes is_valid_dot_line. Any
    exception raised while writing is caught and reported on stdout.
    """
    try:
        with open(output_file, 'w') as out:
            out.write("digraph G {\n")
            # Node statements first.
            for raw_node in nodes:
                if not is_valid_dot_line(raw_node):
                    continue
                out.write(f"    {balance_brackets(raw_node)}\n")
            # Then the edges, with the label kept exactly as parsed.
            for src, dst, attrs in edges:
                edge_text = f"{src} -> {dst} {attrs}"
                if is_valid_dot_line(edge_text):
                    out.write(f"    {edge_text}\n")
            out.write("}\n")
        print(f"Combined .dot file created as {output_file}")
    except Exception as err:
        print(f"Error writing to file {output_file}: {err}")

def read_file_paths(file_with_paths):
    """Read a list file and return its non-blank lines, stripped of surrounding whitespace."""
    paths = []
    with open(file_with_paths, 'r') as listing:
        for raw in listing:
            cleaned = raw.strip()
            if cleaned:
                paths.append(cleaned)
    return paths
        
def process_files_with_joern(file_with_paths, output_folder, combine_folder_name, file_counter):
    """
    Run Joern on every file listed in `file_with_paths` and save one merged
    .dot file per input.

    For each listed path that exists as a file:
    1. a path ending in '.i' is renamed in place to '.c' with `mv` (the input
       tree is modified; nothing is compiled or copied);
    2. `joern-parse` is run on the file, then `joern-export cpg.bin -o <dir>`
       with <dir> = <output_folder>/<basename>.cpg;
    3. the exported .dot files are merged with merge_dot_files_from_folder
       and written with write_dot_file to
       <output_folder>/<combine_folder_name>/<flattened relative path>.dot;
    4. the export folder is deleted and file_counter['count'] is incremented.

    Parameters:
        file_with_paths: text file with one source path per line.
        output_folder: root folder for exports and merged results (created
            if missing).
        combine_folder_name: sub-folder of `output_folder` for merged files.
        file_counter: dict with a 'count' key, incremented per processed file.

    A failing external command (CalledProcessError) is reported and the file
    skipped. A missing export folder is reported and skipped the same way.
    """
    file_paths = read_file_paths(file_with_paths)

    # Make sure the output root and the merged-results sub-folder exist.
    os.makedirs(output_folder, exist_ok=True)
    combine_folder = os.path.join(output_folder, combine_folder_name)
    os.makedirs(combine_folder, exist_ok=True)

    for file_path in file_paths:
        if not os.path.isfile(file_path):
            continue
        print(f"Processing file: {file_path}")
        output_file_path = os.path.join(output_folder, f"{os.path.basename(file_path)}.cpg")

        # Flatten the path relative to the benchmark root into one file name
        # ('/' becomes '-') and swap the extension for '.dot'.
        relative_path = os.path.relpath(file_path, start='./data/sv-benchmarks/c')
        relative_path = relative_path.replace('/', '-')
        combined_file_name = os.path.splitext(relative_path)[0] + '.dot'
        combined_file_path = os.path.join(combine_folder, combined_file_name)

        # Make sure the target folder exists.
        os.makedirs(os.path.dirname(combined_file_path), exist_ok=True)

        try:
            if file_path.endswith('.i'):
                # Rename the .i file to .c in place; the renamed file is the
                # one handed to Joern.
                c_file_path = file_path[:-2] + '.c'
                subprocess.run(['mv', file_path, c_file_path], check=True)
                file_path = c_file_path
            subprocess.run(['joern-parse', file_path], check=True)
            # NOTE(review): assumes joern-parse wrote 'cpg.bin' into the
            # current working directory - confirm against the Joern version used.
            subprocess.run(['joern-export', 'cpg.bin', '-o', output_file_path], check=True)

            merged = merge_dot_files_from_folder(output_file_path)
            if merged is None:
                # The export folder is missing. Unpacking None would raise an
                # uncaught TypeError and stop the whole batch.
                print(f"Error processing file {file_path}: no export folder at {output_file_path}")
                continue
            largest_nodes, largest_edges = merged

            write_dot_file(combined_file_path, largest_nodes, largest_edges)
            print(f"File processed and saved to: {combined_file_path}")

            # Remove the intermediate .dot files; only the merged one is kept.
            dot_files = [f for f in os.listdir(output_file_path) if f.endswith('.dot')]
            for dot_file in dot_files:
                file_to_remove = os.path.join(output_file_path, dot_file)
                if os.path.isfile(file_to_remove):
                    os.remove(file_to_remove)
                    print(f"Removed intermediate .dot file: {file_to_remove}")

            # Remove the intermediate export folder itself.
            shutil.rmtree(output_file_path)
            print(f"Removed middleware folder: {output_file_path}")

            file_counter['count'] += 1
        except subprocess.CalledProcessError as e:
            print(f"Error processing file {file_path}: {e}")

# Shared counter; process_files_with_joern increments 'count' once per processed file
file_counter = {'count': 0}

# Every run writes under the same output root
output_folder = './data/split-file'

# Process ./data/part_1.txt through ./data/part_24.txt (the range end is exclusive)
for i in range(1, 25):
    file_with_paths = f'./data/part_{i}.txt'
    combine_folder_name = f'folder_{i}'

    # Merged results for list i go to <output_folder>/folder_<i>
    process_files_with_joern(file_with_paths, output_folder, combine_folder_name, file_counter)

# Report the total number of files processed
print(f"Total number of files processed: {file_counter['count']}")


