import os
import re

# for root, dirs, files in os.walk(wild_clean_input_folder):
#     content1=[]
#     content2=[]
#     for file in files:
#         if 'all' in file:
#             file_num = int(file[:-4])
#             input_file_name = ''
#             if file_num <= 1527:
#                 input_file_name = "all1.txt"
#                 input_all_file = os.path.join(wild_clean_input_folder, input_file_name)
#                 with open(os.path.join(wild_clean_input_folder,file), 'r') as f:
#                     content = f.readlines()
#                 unique_content=list(set(content))
#                 content1.extend(unique_content)
#
#             elif 1527 < file_num <= 1988:
#                 input_file_name = "all2.txt"
#                 input_all_file = os.path.join(wild_clean_input_folder, input_file_name)
#                 with open(os.path.join(wild_clean_input_folder, file), 'r') as f:
#                     content = f.readlines()
#                 unique_content = list(set(content))
#                 content2.extend(unique_content)
#
#             elif 1988 < file_num <= 2455:
#                 input_file_name = "all3.txt"
#             elif 2455 < file_num:
#                 input_file_name = "all4.txt"


import os

# Root folder that is searched for directories whose name contains 'reentrancy'.
cfg_folder = r'E:\2024\experiment_code_clone\total4\binary_cfg_code'
# Destination folder for the generated plain-text files.
wild_clean_input_folder = r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\reentrancy'

# Compiled once here instead of being looked up again for every line.
BLOCK_NAME_RE = re.compile(r"block_(\w+)\s*\[")
BLOCK_LABEL_RE = re.compile(r'label="(.*)\\l')
# Matches "<word>:" prefixes (presumably instruction offsets such as "0:").
LABEL_PREFIX_RE = re.compile(r'\b\w*:\s*')
WHITESPACE_RE = re.compile(r'\s+')


def clean_block_line(line):
    """Return the cleaned label text of one CFG node line, or None.

    A node line starts with ``block_``, contains no ``->`` (which marks an
    edge) and carries a ``label="...\\l"`` attribute.  The label text has its
    ``\\l`` line breaks replaced by spaces, every ``<word>:`` prefix removed
    and runs of whitespace collapsed to a single space.

    Lines that are not node definitions, or that lack a label, yield None
    instead of raising, so one malformed line no longer aborts the run.
    """
    if not line.strip().startswith("block_") or "->" in line:
        return None
    # Require the "block_<name> [" shape before trusting the label.
    if BLOCK_NAME_RE.search(line) is None:
        return None
    label_match = BLOCK_LABEL_RE.search(line)
    if label_match is None:
        return None

    features = label_match.group(1)
    # The greedy pattern already consumed the last "\l"; a label that ended
    # in a doubled "\l\l" still has one left over.
    if features.endswith('\\l'):
        features = features[:-2]
    features = features.replace('\\l', ' ')
    features = LABEL_PREFIX_RE.sub('', features)
    return WHITESPACE_RE.sub(' ', features)


def convert_cfg_file(cfg_file, write_file):
    """Write the cleaned node labels of ``cfg_file`` to ``write_file``.

    One output line is produced per node line of the input, in input order.
    ``write_file`` is created or overwritten.
    """
    with open(cfg_file, 'r') as f:
        cleaned = [clean_block_line(line) for line in f]
    with open(write_file, 'w') as f:
        f.writelines(text + '\n' for text in cleaned if text is not None)


# Find CFG files that have no counterpart in the input folder yet and write one.
for root, dirs, files in os.walk(cfg_folder):
    for dir_name in dirs:
        if 'reentrancy' not in dir_name:
            continue
        cfg_folder_path = os.path.join(root, dir_name)

        for file in os.listdir(cfg_folder_path):
            cfg_file = os.path.join(cfg_folder_path, file)
            write_file = os.path.join(wild_clean_input_folder, file)
            # Existing outputs are never regenerated; sub-directories returned
            # by listdir cannot be opened as files, so they are skipped.
            if os.path.exists(write_file) or not os.path.isfile(cfg_file):
                continue
            print("new write_file", write_file)
            convert_cfg_file(cfg_file, write_file)


# Create a new combined input file (all.txt) from the files written above
def write_to_file(file_path, content_list):
    """Overwrite ``file_path`` with the strings in ``content_list``.

    The items are written back to back exactly as given; no separator or
    newline is added, so each item must carry its own line ending.
    """
    with open(file_path, 'w') as out:
        out.writelines(content_list)


#
# file_paths = [
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\1106.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\1168.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\1320.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\1522.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\1936.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\2137.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\2151.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\2463.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\2648.txt',
#     r'E:\2024\experiment_code_clone\total4\BertPretrainFinetune-main\feature\input\wild-clean\2793.txt'
# ]
# all_content = []
# for path in file_paths:
#     with open(path, 'r') as f:
#         contents = f.readlines()
#     all_content.extend(contents)
#
# unique_all_content = list(set(all_content))
#
# write_to_file(os.path.join(wild_clean_input_folder, "all5.txt"), unique_all_content)


def process_file(file_path, content_list):
    """Append the distinct lines of ``file_path`` to ``content_list``.

    Lines are deduplicated within the file and appended in the order they
    first appear, so repeated runs produce the same result.  Every appended
    line ends with a newline: a final line that lacks one is given one, so it
    is not treated as different from the same text elsewhere in the file and
    does not run into the next item when the list is written out verbatim.

    ``content_list`` is modified in place; nothing is returned.
    """
    with open(file_path, 'r') as f:
        lines = [line if line.endswith('\n') else line + '\n' for line in f]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    content_list.extend(dict.fromkeys(lines))


# Check the number of files
# file_count = sum(len(files) for _, _, files in os.walk(wild_clean_input_folder))
# batch_size = file_count / 2
# print("batch_size is {}".format(batch_size))
# Split all the txt files into four all*.txt files to feed to BERT

#
# Merge the lines of every non-aggregate file under the input folder into
# all1.txt, with duplicates removed.
content1 = []

for root, dirs, files in os.walk(wild_clean_input_folder):
    # Sorted so the merged file is assembled in the same order on every run.
    for file in sorted(files):
        # Names containing 'all' are skipped; this keeps a previously written
        # all1.txt from being read back into itself.
        if 'all' not in file:
            # Join with `root`, the directory os.walk is currently in; joining
            # with the top folder points at the wrong path for nested files.
            process_file(os.path.join(root, file), content1)
# Drop lines repeated across files while keeping first-seen order.
content1 = list(dict.fromkeys(content1))
write_to_file(os.path.join(wild_clean_input_folder, "all1.txt"), content1)

# Every CFG txt file gets a corresponding file generated in the input folder

# python extract_features.py --input_file=./feature/input/wild-clean/all1.txt --output_file=./feature/output/wild-clean/all1.json --init_checkpoint=./models/finetune/wild-clean/model.ckpt-9975 --vocab_file=./vocab.txt --bert_config_file=./bert_config.json --layers=-1 --max_seq_length=64 --batch_size=8 --use_gpu=False

# python3 extract_features.py --input_file=./feature/input/delegatecall/all1.txt --output_file=./feature/output/delegatecall/all1.json --init_checkpoint=./models/finetune/delegatecall/model.ckpt-478 --vocab_file=./vocab.txt --bert_config_file=./bert_config.json --layers=-1 --max_seq_length=64 --batch_size=8 --use_gpu=False

# python extract_features.py --input_file=./feature/input/reentrancy/all1.txt --output_file=./feature/output/reentrancy/all1.json --init_checkpoint=./models/finetune/reentrancy/model.ckpt-4812 --vocab_file=./vocab.txt --bert_config_file=./bert_config.json --layers=-1 --max_seq_length=64 --batch_size=8 --use_gpu=False

#python extract_features.py --input_file=./feature/input/access_control/all1.txt --output_file=./feature/output/access_control/all1.json --init_checkpoint=./models/finetune/access_control/model.ckpt-11048 --vocab_file=./vocab.txt --bert_config_file=./bert_config.json --layers=-1 --max_seq_length=64 --batch_size=8 --use_gpu=False

#python extract_features.py --input_file=./feature/input/external_call/all1.txt --output_file=./feature/output/external_call/all1.json --init_checkpoint=./models/finetune/external_call/model.ckpt-4557 --vocab_file=./vocab.txt --bert_config_file=./bert_config.json --layers=-1 --max_seq_length=64 --batch_size=8 --use_gpu=False
