# def get_dirname(file_num):
#     model_dirname = ''
#     if file_num <= 41:
#         model_dirname = "access_control"
#     elif file_num <= 1060 and file_num >= 299:
#         model_dirname = "reentrancy"
#     elif file_num <= 2918 and file_num >= 1061:
#         model_dirname = "wild-clean"
#     elif file_num <= 3336 and file_num >= 2919:
#         model_dirname = "external_call"
#     elif file_num <= 298 and file_num >= 42:
#         model_dirname = "delegatecall"
#     return model_dirname
#
#
# import json
# import numpy as np
# from collections import Counter
# from sklearn.metrics.pairwise import cosine_similarity
# import os
#
# import time
#
#
# def read_data(file_path):
#     with open(file_path, 'r') as f:
#         return json.load(f)
#
#
# def process_tokens(all_tokens):
#     processed_tokens = []
#     for token in all_tokens:
#         if token == 'math':
#             processed_tokens.append('safe math')
#         elif token == 'operations':
#             processed_tokens.append('math operations')
#         elif token not in ['title', 'safe']:
#             processed_tokens.append(token)
#     return processed_tokens
#
#
# def plot_token_frequency(data):
#     all_tokens = [token for item in data for token in item['token']]
#     processed_tokens = process_tokens(all_tokens)
#     return Counter(processed_tokens)
#
#
# def calculate_attention_weights(data, token_freq):
#     token_freq['safe'] = token_freq.get('safe math', 0)
#     token_freq['title'] = 1
#     token_freq['math'] = token_freq.get('safe math', 0)
#     token_freq['operations'] = token_freq.get('math operations', 0)
#
#     total_freq = sum(token_freq.values())
#     token_to_weight = {token: freq / total_freq for token, freq in token_freq.items()}
#
#     return [[token_to_weight.get(token, 0) for token in item['token']] for item in data]
#
#
# def calculate_weighted_embedding(data, attention_weights):
#     return [
#         np.dot(weights, np.array(item['comments_feature']))
#         if weights else np.zeros(512, dtype=np.float32)
#         for item, weights in zip(data, attention_weights)
#     ]
#
#
# def preprocess_item(item, attention_weights):
#     weighted_embedding = np.array(attention_weights, dtype=np.float32).reshape(1, -1)
#     ast_features = np.array(item['ast_features'], dtype=np.float32).reshape(1, -1)
#     cfg_features = np.array(item['cfg_features'], dtype=np.float32).reshape(1, -1)
#     return np.vstack([weighted_embedding, ast_features, cfg_features])
#
#
# def compute_cosine_similarity(matrix1, matrix2):
#     return cosine_similarity(matrix1.flatten().reshape(1, -1), matrix2.flatten().reshape(1, -1))[0][0]
#
#
# def main():
#     dataset_path = r'E:\2024\experiment_code_clone\total4\all_features\dataset\all_features.json'
#     evaluate_path = r'E:\2024\experiment_code_clone\total4\all_features\dataset_clone\evaluate_clone.json'
#
#     dataset = read_data(dataset_path)
#     evaluate_data = read_data(evaluate_path)
#
#     token_freq = plot_token_frequency(dataset)
#     attention_weights = calculate_attention_weights(dataset, token_freq)
#     weighted_embeddings = calculate_weighted_embedding(dataset, attention_weights)
#
#     dataset_features = [preprocess_item(item, embedding) for item, embedding in zip(dataset, weighted_embeddings)]
#     contract_names = [item['contract_name'] for item in dataset]
#     start_time = time.time()
#
#     for item in evaluate_data:
#         itemb_attention_weights = calculate_attention_weights([item], token_freq)
#         itemb_weighted_embedding = calculate_weighted_embedding([item], itemb_attention_weights)
#         itemb_features = preprocess_item(item, itemb_weighted_embedding[0])
#         itemb_name = item['contract_name']
#
#         # Compute the similarity to every dataset entry; filtering happens below
#         similarities = [compute_cosine_similarity(itemb_features, dataset_feature) for dataset_feature in
#                         dataset_features]
#
#         output_filename = []
#
#         max_similarity = max(similarities)
#
#         for idx, similarity in enumerate(similarities):
#             if max_similarity > 1:
#                 if similarity / max_similarity > 0.95:
#                     output_filename.append(contract_names[idx])
#                     name_num = int(contract_names[idx][:-4])
#                     type_name = get_dirname(name_num)
#                     print(
#                         f"Contract Name: {contract_names[idx]}, Similarity: {similarity / max_similarity}, Vulnerable type: {type_name}")
#             else:
#                 if similarity > 0.95:
#                     output_filename.append(contract_names[idx])
#                     name_num = int(contract_names[idx][:-4])
#                     type_name = get_dirname(name_num)
#                     print(
#                         f"Contract Name: {contract_names[idx]}, Similarity: {similarity}, Vulnerable type: {type_name}")
#         output_filename = [x for x in output_filename if x != itemb_name]
#
#         # file_path = os.path.join(r"E:\2024\experiment_code_clone\total4\all_features\source_code_no_comments",
#         #                          itemb_name)
#         # num_lines = sum(1 for line in open(file_path, 'r', encoding='utf-8'))
#         # print(f"The number of lines in '{file_path}' is: {num_lines}")
#
#         output_dir = os.path.join(r'E:\2024\experiment_code_clone\total4\all_features\clone_experiment\output_0.95',
#                                   itemb_name)
#         os.makedirs(output_dir, exist_ok=True)
#         with open(os.path.join(output_dir, 'multiclone.json'), 'w', encoding='utf-8') as f:
#             json.dump(output_filename, f, ensure_ascii=False)
#
#     end_time = time.time()  # record the end time
#     elapsed_time = end_time - start_time  # compute the elapsed run time
#     print(f"Elapsed time: {elapsed_time} seconds")  # print the elapsed run time
#
#
# if __name__ == "__main__":
#     main()


# def get_dirname(file_num):
#     model_dirname = ''
#     if file_num <= 41:
#         model_dirname = "access_control"
#     elif file_num <= 1060 and file_num >= 299:
#         model_dirname = "reentrancy"
#     elif file_num <= 2918 and file_num >= 1061:
#         model_dirname = "wild-clean"
#     elif file_num <= 3336 and file_num >= 2919:
#         model_dirname = "external_call"
#     elif file_num <= 298 and file_num >= 42:
#         model_dirname = "delegatecall"
#     return model_dirname
#
#
# import json
# import numpy as np
# from collections import Counter
# from sklearn.metrics.pairwise import cosine_similarity
# import os
#
# import time
#
#
# def read_data(file_path):
#     with open(file_path, 'r') as f:
#         return json.load(f)
#
#
# def process_tokens(all_tokens):
#     processed_tokens = []
#     for token in all_tokens:
#         if token == 'math':
#             processed_tokens.append('safe math')
#         elif token == 'operations':
#             processed_tokens.append('math operations')
#         elif token not in ['title', 'safe']:
#             processed_tokens.append(token)
#     return processed_tokens
#
#
# def plot_token_frequency(data):
#     all_tokens = [token for item in data for token in item['token']]
#     processed_tokens = process_tokens(all_tokens)
#     return Counter(processed_tokens)
#
#
# def calculate_attention_weights(data, token_freq):
#     token_freq['safe'] = token_freq.get('safe math', 0)
#     token_freq['title'] = 1
#     token_freq['math'] = token_freq.get('safe math', 0)
#     token_freq['operations'] = token_freq.get('math operations', 0)
#
#     total_freq = sum(token_freq.values())
#     token_to_weight = {token: freq / total_freq for token, freq in token_freq.items()}
#
#     return [[token_to_weight.get(token, 0) for token in item['token']] for item in data]
#
#
# def calculate_weighted_embedding(data, attention_weights):
#     return [
#         np.dot(weights, np.array(item['comments_feature']))
#         if weights else np.zeros(512, dtype=np.float32)
#         for item, weights in zip(data, attention_weights)
#     ]
#
#
# def preprocess_item(item, attention_weights):
#     weighted_embedding = np.array(attention_weights, dtype=np.float32).reshape(1, -1)
#     ast_features = np.array(item['ast_features'], dtype=np.float32).reshape(1, -1)
#     cfg_features = np.array(item['cfg_features'], dtype=np.float32).reshape(1, -1)
#     return np.vstack([weighted_embedding, ast_features, cfg_features])
#
#
# def compute_cosine_similarity(matrix1, matrix2):
#     return cosine_similarity(matrix1.flatten().reshape(1, -1), matrix2.flatten().reshape(1, -1))[0][0]
#
#
# def main():
#     dataset_path = r'E:\2024\experiment_code_clone\total4\all_features\dataset\all_features.json'
#     evaluate_path = r'E:\2024\experiment_code_clone\total4\all_features\dataset_clone\evaluate_clone.json'
#
#     dataset = read_data(dataset_path)
#     evaluate_data = read_data(evaluate_path)
#
#     token_freq = plot_token_frequency(dataset)
#     attention_weights = calculate_attention_weights(dataset, token_freq)
#     weighted_embeddings = calculate_weighted_embedding(dataset, attention_weights)
#
#     dataset_features = [preprocess_item(item, embedding) for item, embedding in zip(dataset, weighted_embeddings)]
#     contract_names = [item['contract_name'] for item in dataset]
#     start_time = time.time()
#
#     for item in evaluate_data:
#         itemb_attention_weights = calculate_attention_weights([item], token_freq)
#         itemb_weighted_embedding = calculate_weighted_embedding([item], itemb_attention_weights)
#         itemb_features = preprocess_item(item, itemb_weighted_embedding[0])
#         itemb_name = item['contract_name']
#
#         # Compute the similarity to every dataset entry; filtering happens below
#         similarities = [compute_cosine_similarity(itemb_features, dataset_feature) for dataset_feature in
#                         dataset_features]
#
#         # Take the indices of the 15 highest similarities (the slice below is [:15])
#         top_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)[:15]
#         # top_similarities = [similarities[i] for i in top_indices]
#         output_filename = []
#
#         max_similarity = max(similarities)
#
#         # Walk through those top-ranked indices and their similarities
#         for idx in top_indices:
#             similarity = similarities[idx]
#             if max_similarity > 1:
#                 if similarity / max_similarity > 0.95:
#                     output_filename.append(contract_names[idx])
#                     name_num = int(contract_names[idx][:-4])
#                     type_name = get_dirname(name_num)
#                     print(
#                         f"Contract Name: {contract_names[idx]}, Similarity: {similarity / max_similarity}, Vulnerable type: {type_name}")
#             else:
#                 if similarity > 0.95:
#                     output_filename.append(contract_names[idx])
#                     name_num = int(contract_names[idx][:-4])
#                     type_name = get_dirname(name_num)
#                     print(
#                         f"Contract Name: {contract_names[idx]}, Similarity: {similarity}, Vulnerable type: {type_name}")
#
#         output_filename = [x for x in output_filename if x != itemb_name]
#
#         output_dir = os.path.join(r'E:\2024\experiment_code_clone\total4\all_features\clone_experiment\output_0.95',
#                                   itemb_name)
#         os.makedirs(output_dir, exist_ok=True)
#         with open(os.path.join(output_dir, 'multiclone.json'), 'w', encoding='utf-8') as f:
#             json.dump(output_filename, f, ensure_ascii=False)
#
#     end_time = time.time()  # record the end time
#     elapsed_time = end_time - start_time  # compute the elapsed run time
#     print(f"Elapsed time: {elapsed_time} seconds")  # print the elapsed run time
#
#
# if __name__ == "__main__":
#     main()


import json
import numpy as np
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import os
import time


def get_dirname(file_num):
    """Return the category label associated with a numeric file id.

    Mapping (inclusive bounds):
      * ``file_num <= 41``  -> ``"access_control"``
      * ``42 .. 298``       -> ``"delegatecall"``
      * ``299 .. 1060``     -> ``"reentrancy"``
      * ``1061 .. 2918``    -> ``"wild-clean"``
      * ``2919 .. 3336``    -> ``"external_call"``

    Any value that falls in none of these ranges (for example anything
    above 3336) yields the empty string.
    """
    # The lowest bucket has no lower bound, so it is handled on its own.
    if file_num <= 41:
        return "access_control"

    bounded_ranges = (
        (42, 298, "delegatecall"),
        (299, 1060, "reentrancy"),
        (1061, 2918, "wild-clean"),
        (2919, 3336, "external_call"),
    )
    for lower, upper, label in bounded_ranges:
        if lower <= file_num <= upper:
            return label
    return ''


def read_data(file_path):
    """Load and return the JSON document stored at ``file_path``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale encoding, which matches how this script writes its
    own JSON output (``encoding='utf-8'``) and keeps non-ASCII content
    readable regardless of the machine's default code page.

    Raises whatever ``open`` or ``json.load`` raise (missing file,
    malformed JSON, undecodable bytes).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def process_tokens(all_tokens):
    """Normalise a flat sequence of tokens and return the result as a list.

    * ``'math'`` is replaced by ``'safe math'``.
    * ``'operations'`` is replaced by ``'math operations'``.
    * ``'title'`` and ``'safe'`` are dropped.
    * Every other token is kept unchanged.

    Input order is preserved.
    """
    normalised = []
    for tok in all_tokens:
        if tok == 'math':
            replacement = 'safe math'
        elif tok == 'operations':
            replacement = 'math operations'
        elif tok in ['title', 'safe']:
            # These two tokens are discarded entirely.
            continue
        else:
            replacement = tok
        normalised.append(replacement)
    return normalised


def plot_token_frequency(data):
    """Count how often each normalised token occurs across ``data``.

    Despite the name, nothing is plotted: the tokens found under each
    item's ``'token'`` key are gathered into one flat list, passed through
    ``process_tokens`` and tallied in a ``collections.Counter``, which is
    returned.
    """
    flat_tokens = []
    for entry in data:
        flat_tokens.extend(entry['token'])
    return Counter(process_tokens(flat_tokens))


def calculate_attention_weights(data, token_freq):
    """Turn token frequencies into per-token weights for every item.

    Args:
        data: iterable of items; each item exposes its tokens under the
            ``'token'`` key.
        token_freq: mapping from token to frequency (a ``Counter`` in this
            script). It is read but never modified.

    Returns:
        One list per item, holding for each of the item's tokens its
        frequency divided by the sum of all frequencies; tokens absent
        from the table get ``0``.

    Before normalising, four entries are (re)defined on a private copy of
    the table: ``'safe'`` and ``'math'`` take the count of ``'safe math'``,
    ``'operations'`` takes the count of ``'math operations'`` and
    ``'title'`` is fixed to 1.
    """
    # Work on a copy: this function is called repeatedly with the same
    # table, and writing into the caller's object would be a hidden side
    # effect.
    freq = dict(token_freq)
    freq['safe'] = freq.get('safe math', 0)
    freq['title'] = 1
    freq['math'] = freq.get('safe math', 0)
    freq['operations'] = freq.get('math operations', 0)

    total_freq = sum(freq.values())
    token_to_weight = {token: count / total_freq for token, count in freq.items()}

    return [[token_to_weight.get(token, 0) for token in item['token']] for item in data]


def calculate_weighted_embedding(data, attention_weights, *, embedding_dim=512):
    """Combine each item's comment features using its token weights.

    Args:
        data: iterable of items; each item provides ``'comments_feature'``.
        attention_weights: one list of weights per item, paired with
            ``data`` positionally.
        embedding_dim: length of the all-zero float32 vector used for an
            item whose weight list is empty. Defaults to 512, the value
            that used to be hard-coded.

    Returns:
        A list with one entry per (item, weights) pair: the result of
        ``np.dot(weights, comments_feature)`` when weights are present,
        otherwise a zero vector of length ``embedding_dim``.

    NOTE(review): presumably ``comments_feature`` holds one feature row per
    token so the dot product yields a single vector -- confirm against the
    feature-extraction code.
    """
    embeddings = []
    for item, weights in zip(data, attention_weights):
        if weights:
            embeddings.append(np.dot(weights, np.array(item['comments_feature'])))
        else:
            # No tokens means nothing to weight; fall back to a zero vector.
            embeddings.append(np.zeros(embedding_dim, dtype=np.float32))
    return embeddings


def preprocess_item(item, attention_weights):
    """Stack three feature vectors of one item into a single float32 matrix.

    Args:
        item: mapping providing ``'ast_features'`` and ``'cfg_features'``.
        attention_weights: the first vector to stack. In this script the
            callers pass the item's weighted embedding here.

    Returns:
        A float32 array with three rows, in the order: the
        ``attention_weights`` argument, the AST features, the CFG features.
        Each input is flattened into one row, so all three must contain the
        same number of values for ``np.vstack`` to succeed.
    """
    def _as_row(values):
        # Convert to float32 and collapse into a single row.
        return np.array(values, dtype=np.float32).reshape(1, -1)

    return np.vstack([
        _as_row(attention_weights),
        _as_row(item['ast_features']),
        _as_row(item['cfg_features']),
    ])


def compute_cosine_similarity(matrix1, matrix2):
    """Return the cosine similarity between two arrays as a scalar.

    Both arrays are flattened into a single row each before being handed
    to scikit-learn's ``cosine_similarity``; the only entry of the
    resulting 1x1 matrix is returned.
    """
    row_a = matrix1.flatten().reshape(1, -1)
    row_b = matrix2.flatten().reshape(1, -1)
    pairwise = cosine_similarity(row_a, row_b)
    return pairwise[0][0]


def rbf_kernel_similarity(x, x_prime, sigma=1.0):
    """Return the Gaussian (RBF) kernel value between ``x`` and ``x_prime``.

    Computes ``exp(-0.5 * d**2 / sigma**2)`` where ``d`` is
    ``np.linalg.norm(x - x_prime)``. Identical inputs give 1.0 and the
    value decays towards 0 as the inputs move apart; ``sigma`` controls
    how fast.
    """
    gap = np.linalg.norm(x - x_prime)
    squared_gap = gap ** 2
    squared_sigma = sigma ** 2
    exponent = -0.5 * squared_gap / squared_sigma
    return np.exp(exponent)


def compute_similarity_plus(matrix1, matrix2, sigma=1):
    """Return the product of the cosine similarity and the RBF kernel value.

    The cosine term comes from ``compute_cosine_similarity`` on the two
    matrices; the RBF term comes from ``rbf_kernel_similarity`` on their
    flattened forms with the given ``sigma``.
    """
    return (
        compute_cosine_similarity(matrix1, matrix2)
        * rbf_kernel_similarity(matrix1.flatten(), matrix2.flatten(), sigma)
    )

# NOTE(review): alpha=0.3, beta=0.7 is an alternative weighting recorded here,
# presumably a setting that was tried at some point -- confirm before relying on it.
def compute_similarity_add(matrix1, matrix2, alpha=0.5, beta=0.5, sigma=1.0):
    """Return a weighted sum of the cosine similarity and the RBF kernel value.

    The result is ``alpha * cosine + beta * rbf`` where the cosine term
    comes from ``compute_cosine_similarity`` on the two matrices and the
    RBF term from ``rbf_kernel_similarity`` on their flattened forms with
    the given ``sigma``.
    """
    return (
        alpha * compute_cosine_similarity(matrix1, matrix2)
        + beta * rbf_kernel_similarity(matrix1.flatten(), matrix2.flatten(), sigma)
    )


def main():
    """Run the clone-detection experiment and write one result file per item.

    Steps:
      1. Load the reference dataset and the items to evaluate from JSON.
      2. Build the token-frequency table from the reference dataset and
         derive a 3-row feature matrix for every reference item.
      3. For each evaluated item, score it against every reference item
         with ``compute_similarity_plus``, keep the 15 best scores, and
         accept those above the 0.99 threshold.
      4. Write the accepted contract names (excluding the evaluated item
         itself) to ``multiclone.json`` in a directory named after the item.

    NOTE(review): the output directory is called ``output_0.95`` while the
    threshold applied here is 0.99 -- confirm which one is intended.
    NOTE(review): the score is a cosine similarity times an RBF value, so
    the ``max_similarity > 1`` branch looks reachable only through
    floating-point rounding -- confirm whether it is still needed.
    """
    dataset_path = r'E:\2024\experiment_code_clone\total4\all_features\dataset\all_features.json'
    evaluate_path = r'E:\2024\experiment_code_clone\total4\all_features\dataset_clone\evaluate_clone.json'
    output_root = r'E:\2024\experiment_code_clone\total4\all_features\clone_experiment\output_0.95'

    reference_items = read_data(dataset_path)
    query_items = read_data(evaluate_path)

    # Frequencies come from the reference dataset only; the same table is
    # reused when weighting the evaluated items.
    token_freq = plot_token_frequency(reference_items)
    reference_weights = calculate_attention_weights(reference_items, token_freq)
    reference_embeddings = calculate_weighted_embedding(reference_items, reference_weights)

    dataset_features = []
    for entry, embedding in zip(reference_items, reference_embeddings):
        dataset_features.append(preprocess_item(entry, embedding))
    contract_names = [entry['contract_name'] for entry in reference_items]

    started_at = time.time()

    for query in query_items:
        query_weights = calculate_attention_weights([query], token_freq)
        query_embedding = calculate_weighted_embedding([query], query_weights)[0]
        query_features = preprocess_item(query, query_embedding)
        itemb_name = query['contract_name']

        # Score the evaluated item against every reference item.
        similarities = [compute_similarity_plus(query_features, reference)
                        for reference in dataset_features]

        # Keep the 15 highest scores together with their dataset indices.
        ranked = sorted(enumerate(similarities), key=lambda pair: pair[1], reverse=True)[:15]

        max_similarity = max(similarities)
        # Scores are rescaled by the maximum only when that maximum exceeds 1.
        rescale = max_similarity > 1

        matches = []
        for idx, similarity in ranked:
            if rescale:
                score = similarity / max_similarity
                accepted = score >= 0.99
            else:
                score = similarity
                accepted = score > 0.99
            if not accepted:
                continue

            candidate = contract_names[idx]
            matches.append(candidate)
            # The contract name is a number followed by a 4-character suffix;
            # the number selects the category label.
            type_name = get_dirname(int(candidate[:-4]))
            print(
                f"Contract Name: {candidate}, Similarity: {score}, Vulnerable type: {type_name}")

        print("itemb_name", itemb_name)
        # An item is never reported as a clone of itself.
        matches = [name for name in matches if name != itemb_name]
        print("len out put file", len(matches))

        output_dir = os.path.join(output_root, itemb_name)
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, 'multiclone.json'), 'w', encoding='utf-8') as f:
            json.dump(matches, f, ensure_ascii=False)

    # Report how long the evaluation loop took.
    elapsed_time = time.time() - started_at
    print(f"Elapsed time: {elapsed_time} seconds")


# Run the experiment only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
