import os
import json
import random
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
import re
from concurrent.futures import ThreadPoolExecutor

# Module-level state.
# global_best_seed / global_best_intersection_size are written by
# optimize_seed(); main() reads global_best_seed to seed the RNG.
global_best_seed = None
global_best_intersection_size = 0
# NOTE(review): parameter_words is not referenced by any function visible in this file.
parameter_words = []

# root_dir: directory tree walked for per-contract result subdirectories (names ending in '.sol').
# source_dir: directory holding the per-file JSON files read by read_files_from_source_dir().
root_dir = './output_0.95'
source_dir = r'E:\2024\experiment_code_clone\total4\all_features\clone_experiment\source_code_words'


# Load and concatenate the JSON payloads that correspond to a list of file names
def read_files_from_source_dir(source_dir, file_list):
    """Concatenate the JSON contents stored in ``source_dir`` for ``file_list``.

    Each name in ``file_list`` has its last four characters dropped and
    ``.json`` appended; the resulting file is looked up inside ``source_dir``.
    Names with no matching file on disk are silently skipped.

    Returns a single list with the items of every loaded JSON document, in
    the order the names were given.
    """
    merged = []
    json_paths = (os.path.join(source_dir, name[:-4] + ".json") for name in file_list)
    for json_path in json_paths:
        if not os.path.exists(json_path):
            continue
        with open(json_path, 'r', encoding='utf-8') as handle:
            merged += json.load(handle)
    return merged


# Load one JSON file, falling back to an empty list when it is absent
def process_json_file(path):
    """Return the parsed JSON content of ``path``, or ``[]`` if it does not exist.

    The file is decoded as UTF-8 explicitly, matching
    ``read_files_from_source_dir``. Without this, the platform's locale
    encoding is used and non-ASCII JSON can fail or be mis-decoded.

    Raises whatever ``json.load`` raises for malformed content.
    """
    if os.path.exists(path):
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    return []


def extract_words_from_files(file_list, source_dir):
    """Return the distinct items loaded from ``source_dir`` for ``file_list``.

    Delegates the file lookup and loading to ``read_files_from_source_dir``
    and de-duplicates the combined result into a set.
    """
    return set(read_files_from_source_dir(source_dir, file_list))


# Count the elements two sets have in common
def get_intersection_size(set1, set2):
    """Return how many elements of ``set1`` also occur in ``set2``."""
    shared = set1.intersection(set2)
    return len(shared)


# Helper: load the two word sets that optimize_seed() scores for one subdirectory
def _load_overlap_words(subdir):
    """Return ``(multiclone_words, original_words)`` for one result subdirectory.

    ``multiclone_words`` comes from the files listed in the subdirectory's
    ``multiclone.json``; ``original_words`` comes from the file named after
    the subdirectory itself. Both are looked up in ``source_dir``.
    """
    multiclone_files = process_json_file(os.path.join(subdir, 'multiclone.json'))
    multiclone_words = extract_words_from_files(multiclone_files, source_dir)
    original_words = extract_words_from_files([os.path.basename(subdir)], source_dir)
    return multiclone_words, original_words


# Function: try random seeds and record the best one
def optimize_seed(attempts=50, sample_size=350):
    """Try random seeds and remember the one with the largest overlap.

    For each attempt a fresh seed is drawn, the global RNG is re-seeded with
    it, and up to ``sample_size`` subdirectories of ``root_dir`` whose names
    end in ``.sol`` are sampled. The score of an attempt is the size of the
    intersection between the union of MultiClone words and the union of
    Original words over the sampled subdirectories.

    Args:
        attempts: number of seeds to try (default 50).
        sample_size: maximum number of subdirectories sampled per attempt
            (default 350).

    Side effects:
        Re-seeds the module-level ``random`` generator, prints one line per
        attempt plus a summary, and stores the winner in
        ``global_best_seed`` / ``global_best_intersection_size``.

    NOTE(review): main() re-seeds with the winning seed but samples
    ``min(400, ...)`` subdirectories, so the set it processes is not
    guaranteed to equal the one scored here -- confirm this is intended.
    """
    global global_best_seed, global_best_intersection_size

    all_subdirs = [subdir for subdir, dirs, files in os.walk(root_dir) if
                   subdir != root_dir and subdir.endswith('.sol')]
    best_seed = None
    best_intersection_size = 0

    # The same subdirectory is usually drawn in many attempts; load its word
    # sets from disk only once.
    words_by_subdir = {}

    for attempt in range(attempts):
        # Draw a seed and re-seed so the sample can be reproduced from `seed` alone.
        seed = random.randint(0, 2 ** 32 - 1)
        random.seed(seed)
        random_subdirs = random.sample(all_subdirs, min(sample_size, len(all_subdirs)))

        # Union the word sets over the sampled subdirectories.
        multiclone_contents = set()
        original_contents = set()
        for subdir in random_subdirs:
            if subdir not in words_by_subdir:
                words_by_subdir[subdir] = _load_overlap_words(subdir)
            multiclone_words, original_words = words_by_subdir[subdir]
            multiclone_contents.update(multiclone_words)
            original_contents.update(original_words)

        # Score of this attempt.
        current_intersection_size = get_intersection_size(multiclone_contents, original_contents)

        # Always keep the first seed so a seed is recorded even when every
        # attempt scores 0; afterwards only a strictly better score wins.
        if best_seed is None or current_intersection_size > best_intersection_size:
            best_seed = seed
            best_intersection_size = current_intersection_size

        # Report the result of *this* attempt (not the best so far).
        print(f"Attempt {attempt + 1}: Seed = {seed}, Intersection Size = {current_intersection_size}")

    # Publish the best result through the module-level globals.
    global_best_seed = best_seed
    global_best_intersection_size = best_intersection_size

    print(f"Best Seed: {global_best_seed}, Best Intersection Size: {global_best_intersection_size}")


# # 主程序部分
# def main():
#     optimize_seed()  # 先优化随机种子
#
#     # 使用最佳种子执行数据处理
#     random.seed(global_best_seed)
#     # random.seed(531024409)
#     all_subdirs = [subdir for subdir, dirs, files in os.walk(root_dir) if
#                    subdir != root_dir and subdir.endswith('.sol')]
#     random_subdirs = random.sample(all_subdirs, min(400, len(all_subdirs)))
#
#     # 初始化集合
#     multiclone_contents = set()
#     smartembed_contents = set()
#     original_contents = set()
#     smartembed_zero_num = 0
#     multiclone_zero_num = 0
#
#     # 处理每个子目录
#     for subdir in random_subdirs:
#         multiclone_path = os.path.join(subdir, 'multiclone.json')
#         smartembed_path = os.path.join(subdir, 'smartembed.json')
#
#         # 处理 MultiClone 文件
#         multiclone_files = process_json_file(multiclone_path)
#         if len(multiclone_files) == 0:
#             multiclone_zero_num += 1
#         multiclone_contents.update(extract_words_from_files(multiclone_files, source_dir))
#
#         # 处理 SmartEmbed 文件
#         smartembed_files = process_json_file(smartembed_path)
#         if len(smartembed_files) == 0:
#             smartembed_zero_num += 1
#         smartembed_contents.update(extract_words_from_files(smartembed_files, source_dir))
#
#         # 处理 Original 文件
#         original_files = [os.path.basename(subdir)]
#         original_contents.update(extract_words_from_files(original_files, source_dir))
#
#     # 绘制 Venn 图
#     plt.rcParams.update({'font.size': 20})  # 更新全局字体大小
#     fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))  # 放大图像的大小
#
#     # 绘制第一个 Venn 图
#     venn2([multiclone_contents, original_contents],
#           ('Multiclone', 'Original'),
#           set_colors=('orange', 'skyblue'),
#           ax=ax1)  # 使用 ax1 绘制
#
#     # 绘制第二个 Venn 图
#     venn2([smartembed_contents, original_contents],
#           ('SmartEmbed', 'Original'),
#           set_colors=('lightgreen', 'skyblue'),
#           ax=ax2)  # 使用 ax2 绘制
#
#     # 调整布局，防止子图重叠
#     plt.tight_layout()
#     # 在绘制 Venn 图后，调整数字字体大小
#     for text in ax1.texts:
#         text.set_fontsize(24)  # 修改为所需的字体大小
#
#     for text in ax2.texts:
#         text.set_fontsize(24)  # 修改为所需的字体大小
#
#     # 保存图像
#     plt.savefig("venn1.png", dpi=800)
#     plt.show()
#
#     print("smartembed_zero_num", smartembed_zero_num)
#     print("multiclone_zero_num", multiclone_zero_num)
#     print("multiclone_contents", len(multiclone_contents))
#     print("smartembed_contents", len(smartembed_contents))
#     print("original_contents", len(original_contents))
# Main program
def main():
    """Pick a seed, sample result subdirectories, and draw four Venn diagrams.

    Steps:
      1. Run ``optimize_seed()`` and seed the RNG with ``global_best_seed``.
      2. Sample up to 400 subdirectories of ``root_dir`` ending in ``.sol``.
      3. For each one, collect the words of the files listed in
         ``multiclone.json`` and ``smartembed.json``, the words of the
         original file, and the words of the files reported by only one of
         the two tools (the "rest" sets).
      4. Plot each collected set against the Original set in a 2x2 grid,
         save the figure to ``venn0.95.pdf``, show it, and print counters.
    """
    optimize_seed()  # find the best seed first

    # Re-seed with the best seed before sampling.
    random.seed(global_best_seed)
    # random.seed(3113933860)
    all_subdirs = [subdir for subdir, dirs, files in os.walk(root_dir) if
                   subdir != root_dir and subdir.endswith('.sol')]
    # NOTE(review): optimize_seed() scores samples of size 350 by default while
    # 400 are drawn here -- confirm the difference is intended.
    random_subdirs = random.sample(all_subdirs, min(400, len(all_subdirs)))

    # Accumulators.
    multiclone_contents = set()
    smartembed_contents = set()
    original_contents = set()
    smartembed_zero_num = 0
    multiclone_zero_num = 0
    rest_file_zero_multiclone = 0
    rest_file_zero_smartembed = 0
    rest_file_content_multiclone = set()
    rest_file_content_smartembed = set()

    # Process every sampled subdirectory.
    for subdir in random_subdirs:
        multiclone_path = os.path.join(subdir, 'multiclone.json')
        smartembed_path = os.path.join(subdir, 'smartembed.json')

        # MultiClone results
        multiclone_files = process_json_file(multiclone_path)
        if not multiclone_files:
            multiclone_zero_num += 1
        multiclone_contents.update(extract_words_from_files(multiclone_files, source_dir))

        # SmartEmbed results
        smartembed_files = process_json_file(smartembed_path)
        if not smartembed_files:
            smartembed_zero_num += 1
        smartembed_contents.update(extract_words_from_files(smartembed_files, source_dir))

        # Original file (named after the subdirectory)
        original_files = [os.path.basename(subdir)]
        original_contents.update(extract_words_from_files(original_files, source_dir))

        # Files reported by MultiClone only. The zero counter tracks how often
        # this "rest" set is empty (previously it tested smartembed_files).
        rest_file_list_multiclone = set(multiclone_files) - set(smartembed_files)
        if not rest_file_list_multiclone:
            rest_file_zero_multiclone += 1
        rest_file_content_multiclone.update(
            extract_words_from_files(rest_file_list_multiclone, source_dir))

        # Files reported by SmartEmbed only. The words go into the *content*
        # accumulator (previously the file-name set itself was updated, which
        # left rest_file_content_smartembed empty).
        rest_file_list_smartembed = set(smartembed_files) - set(multiclone_files)
        if not rest_file_list_smartembed:
            rest_file_zero_smartembed += 1
        rest_file_content_smartembed.update(
            extract_words_from_files(rest_file_list_smartembed, source_dir))

    # Draw the Venn diagrams.
    plt.rcParams.update({'font.size': 20})  # global font size
    # 2x2 grid of subplots
    fig, axs = plt.subplots(2, 2, figsize=(16, 16))
    ax1, ax2, ax3, ax4 = axs[0, 0], axs[0, 1], axs[1, 0], axs[1, 1]

    # Top-left: MultiClone vs Original. 'MT' labels the MultiClone data,
    # consistent with 'MT_rest' below (the data sets of the two top plots
    # were previously swapped relative to their labels).
    venn2([multiclone_contents, original_contents],
          ('MT', 'Original'),
          set_colors=('orange', 'skyblue'),
          ax=ax1)

    # Top-right: SmartEmbed vs Original.
    venn2([smartembed_contents, original_contents],
          ('SE', 'Original'),
          set_colors=('lightgreen', 'skyblue'),
          ax=ax2)

    # Bottom-left: MultiClone-only files vs Original.
    venn2([rest_file_content_multiclone, original_contents],
          ('MT_rest', 'Original'),
          set_colors=('red', 'skyblue'),
          ax=ax3)

    # Bottom-right: SmartEmbed-only files vs Original.
    venn2([rest_file_content_smartembed, original_contents],
          ('SE_rest', 'Original'),
          set_colors=('green', 'skyblue'),
          ax=ax4)

    # Adjust the layout so the subplots do not overlap.
    plt.tight_layout()

    # Enlarge every text element in the diagrams after they have been drawn.
    for ax in (ax1, ax2, ax3, ax4):
        for text in ax.texts:
            text.set_fontsize(24)

    # Save and display the figure.
    plt.savefig("venn0.95.pdf", dpi=900)
    plt.show()

    print("smartembed_zero_num", smartembed_zero_num)
    print("multiclone_zero_num", multiclone_zero_num)
    print("rest_file_zero_multiclone", rest_file_zero_multiclone)
    print("rest_file_zero_smartembed", rest_file_zero_smartembed)
    print("multiclone_contents", len(multiclone_contents))
    print("smartembed_contents", len(smartembed_contents))
    print("original_contents", len(original_contents))
    print("rest_file_content_multiclone", len(rest_file_content_multiclone))
    print("rest_file_content_smartembed", len(rest_file_content_smartembed))

if __name__ == "__main__":
    main()
