import json
import os


def process_comments_features(comments_dir, ast_features, cfg_features, check_dir=None):
    """Merge comment embeddings with AST and CFG features, one record per contract.

    Every ``<number>.json`` file in ``comments_dir`` produces one record. The
    comment embeddings of a file are only loaded when its number is not in the
    "missing" set, i.e. the numbers 1..3336 that have no ``.json`` file in
    ``check_dir``. Numbers outside 1..3336 are never in that set, so their
    embeddings are always loaded.

    Args:
        comments_dir: Directory holding the per-contract comment JSON files.
            Each non-empty file must decode to an iterable of mappings with
            the keys ``'token'`` and ``'features'``.
        ast_features: Mapping from ``"<number>.sol"`` to the AST feature
            vector of that contract.
        cfg_features: Mapping from ``"<number>.sol"`` to the CFG feature
            vector of that contract.
        check_dir: Directory whose ``.json`` file names define which contract
            numbers are available. Defaults to the path that used to be
            hard-coded in this function, so existing callers are unaffected.

    Returns:
        A list of dicts with the keys ``contract_name``, ``token``,
        ``comments_feature``, ``ast_features``, ``cfg_features`` and ``y``.
        Records follow the order returned by ``os.listdir(comments_dir)``.

    Raises:
        ValueError: If a ``.json`` file name in either directory does not
            have an integer stem.
        OSError: If ``check_dir`` or ``comments_dir`` cannot be listed.
    """
    if check_dir is None:
        # Historical default; pass ``check_dir`` explicitly on other machines.
        check_dir = r'E:\2024\experiment_code_clone\total4\all_features\comments_topic_embedding'

    available = {
        int(name[:-5]) for name in os.listdir(check_dir) if name.endswith('.json')
    }
    # Kept as a set so the membership test inside the loop is O(1).
    rest_files = set(range(1, 3337)) - available

    mixed_features = []
    for file in os.listdir(comments_dir):
        if not file.endswith(".json"):
            continue

        stem = file[:-5]
        file_num = int(stem)
        contract_name = stem + ".sol"

        token_list = []
        feature_list = []
        if file_num not in rest_files:
            comment_file = os.path.join(comments_dir, file)
            # A zero-byte file is not valid JSON; treat it as "no comments".
            if os.path.getsize(comment_file) != 0:
                with open(comment_file, 'r') as f:
                    comments = json.load(f)
                for word_embedding in comments:
                    token_list.append(word_embedding['token'])
                    feature_list.append(word_embedding['features'])

        # Contracts without AST/CFG features get a 512-element zero vector.
        if contract_name in ast_features:
            ast_feats = ast_features[contract_name]
        else:
            ast_feats = [0] * 512

        if contract_name in cfg_features:
            cfg_feats = cfg_features[contract_name]
        else:
            cfg_feats = [0] * 512

        # Files numbered 299..1060 form the positive class
        # (presumably the reentrancy subset -- TODO confirm).
        y = 1 if 299 <= file_num <= 1060 else 0

        mixed_features.append({
            "contract_name": contract_name,
            "token": token_list,
            "comments_feature": feature_list,
            "ast_features": ast_feats,
            "cfg_features": cfg_feats,
            "y": y,
        })
    print(len(mixed_features))
    return mixed_features


def process_ast_features(ast_path="./source_code2ast_features/best_encoded_features.json"):
    """Load the pre-computed AST features from a JSON file.

    Args:
        ast_path: Path of the JSON file to read. Defaults to the location
            that used to be hard-coded in this function, so calling it
            without arguments behaves as before.

    Returns:
        The decoded JSON content, unchanged. The number of top-level entries
        is printed before returning.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(ast_path) as f:
        ast_features = json.load(f)
    print(len(ast_features))
    return ast_features


def process_cfg_features(cfg_dir):
    """Gather CFG features from the JSON files one directory level below ``cfg_dir``.

    Only immediate sub-directories of ``cfg_dir`` are visited, and only their
    ``.json`` files are read. Each file must decode to a mapping from contract
    name to features; all mappings are merged into one dict, with later files
    overwriting earlier entries that share a contract name.

    Args:
        cfg_dir: Root directory containing one sub-directory per group of
            CFG feature files.

    Returns:
        A dict mapping contract name to its features. Its size is printed
        before returning.
    """
    merged = {}
    for entry in os.listdir(cfg_dir):
        group_dir = os.path.join(cfg_dir, entry)
        if not os.path.isdir(group_dir):
            # Plain files directly under cfg_dir are not feature groups.
            continue
        json_names = [name for name in os.listdir(group_dir) if name.endswith(".json")]
        for json_name in json_names:
            with open(os.path.join(group_dir, json_name)) as handle:
                per_file = json.load(handle)
            merged.update(per_file.items())
    print(len(merged))
    return merged


def get_all_features(comments_dir, cfg_dir):
    """Load AST and CFG features, then merge them with the comment embeddings.

    Args:
        comments_dir: Directory passed on to ``process_comments_features``.
        cfg_dir: Directory passed on to ``process_cfg_features``.

    Returns:
        The list of per-contract records built by
        ``process_comments_features``.
    """
    # AST features are loaded first, then CFG features, before merging.
    ast_table = process_ast_features()
    cfg_table = process_cfg_features(cfg_dir)
    return process_comments_features(comments_dir, ast_table, cfg_table)


# Unused helper, kept commented out for reference: maps a file number to a category directory name.
# def get_dirname(file_num):
#     model_dirname = ''
#     if file_num <= 41:
#         model_dirname = "access_control"
#     elif file_num <= 1060 and file_num >= 299:
#         model_dirname = "reentrancy"
#     elif file_num <= 2918 and file_num >= 1061:
#         model_dirname = "wild-clean"
#     elif file_num <= 3336 and file_num >= 2919:
#         model_dirname = "external_call"
#     elif file_num <= 298 and file_num >= 42:
#         model_dirname = "delegatecall"
#     return model_dirname


if __name__ == '__main__':
    # Script entry point: build the merged feature records and dump them
    # as a single JSON file.
    comments_dir = './comments_topic_embedding'
    cfg_dir = './features-cfg'
    output_path = "dataset/all_features.json"

    # Ensure the output directory exists before the expensive processing,
    # so the results are not lost to a FileNotFoundError at write time.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    mixed_features = get_all_features(comments_dir, cfg_dir)

    with open(output_path, "w") as f:
        json.dump(mixed_features, f)
