# import json
# from collections import Counter
# import numpy as np
# import matplotlib.pyplot as plt
# from wordcloud import WordCloud
# import re
#
#
# # 读取数据文件
# def read_data(file_path):
#     with open(file_path, 'r') as f:
#         data = json.load(f)
#     return data
#
#
# # 提取 token 并生成词云图
# def plot_token_frequency(data):
#     all_tokens = []
#     for item in data:
#         all_tokens.extend(item['token'])
#
#     # 替换特定的 token
#     replacement_dict = {
#         'smartcon': 'smart contract',
#         'math': 'safe math',
#         'safe': 'safe math',
#         'operations': 'math operations',
#         # '20': 'erc20'
#     }
#     replaced_tokens = [replacement_dict.get(token, token) for token in all_tokens]
#
#     # 排除特定词语和数字
#     exclude_words = ['see', 'contract', 'functions', 'ethereum', 'title', 'safety', 'yes', 'no', 'two', 'title',
#                      'param', 'smart contract', 'would']
#     filtered_tokens = [token for token in replaced_tokens if
#                        token.lower() not in exclude_words and not re.fullmatch(r'\d+', token)]
#
#     token_freq = Counter(filtered_tokens)
#
#     # 生成词云图
#     wordcloud = WordCloud(width=1200, height=600, background_color='white').generate_from_frequencies(token_freq)
#
#     plt.figure(figsize=(10, 6))
#     plt.imshow(wordcloud, interpolation='bilinear')
#     plt.axis('off')
#     plt.title('Token Frequency')
#     plt.tight_layout()
#     plt.show()
#
#     return token_freq
#
#
# # 主函数
# def main():
#     # 读取数据
#     data = read_data('all_features.json')
#
#     # 显示 token 频率的词云图（排除指定词语）
#     token_freq = plot_token_frequency(data)
#
#
# if __name__ == "__main__":
#     main()



import json
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import re


def read_data(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, which is not UTF-8 on every system.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Token rewrites applied before filtering. Lookup is an exact, case-sensitive
# match on the raw token.
_TOKEN_REPLACEMENTS = {
    'smartcon': 'smart contract',
    'math': 'safe math',
    'safe': 'safe math',
    'operations': 'math operations',
    # '20': 'erc20'  (disabled)
}

# Words left out of the cloud; compared against the lower-cased token.
# 'smart contract' is listed here, so tokens rewritten from 'smartcon' are
# dropped as well.
_EXCLUDED_WORDS = frozenset({
    'see', 'contract', 'functions', 'ethereum', 'title', 'safety', 'yes',
    'no', 'two', 'param', 'smart contract', 'would',
})

# Matches tokens that consist solely of digits.
_DIGITS_ONLY = re.compile(r'\d+')


def _count_tokens(data):
    """Return a Counter of the tokens in ``data`` after rewriting and filtering."""
    counts = Counter()
    for item in data:
        for token in item['token']:
            token = _TOKEN_REPLACEMENTS.get(token, token)
            if token.lower() in _EXCLUDED_WORDS or _DIGITS_ONLY.fullmatch(token):
                continue
            counts[token] += 1
    return counts


def plot_token_frequency(data, output_path='wordcloud.pdf'):
    """Draw a word cloud of token frequencies, save it as a PDF and show it.

    Every item's ``'token'`` list is collected, selected tokens are rewritten
    (e.g. ``'math'`` -> ``'safe math'``), and excluded words and purely
    numeric tokens are dropped before counting.

    Args:
        data: Iterable of mappings, each with a ``'token'`` key holding an
            iterable of string tokens.
        output_path: Where the figure is saved. The file is always written in
            PDF format. Defaults to ``'wordcloud.pdf'``.

    Returns:
        A ``Counter`` mapping each remaining token to its number of
        occurrences. If it is empty, nothing is drawn or saved.

    Raises:
        KeyError: If an item has no ``'token'`` key.
    """
    token_freq = _count_tokens(data)

    if not token_freq:
        # WordCloud raises ValueError when given no words, so skip plotting
        # and hand back the empty counter instead of crashing.
        import warnings
        warnings.warn('No tokens left after filtering; word cloud not generated.')
        return token_freq

    # Build the word cloud image from the frequency table.
    wordcloud = WordCloud(width=2000, height=1000, background_color='white').generate_from_frequencies(token_freq)

    fig = plt.figure(figsize=(24, 16))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')

    # Figure title.
    plt.title('Comment Word Cloud', fontsize=24, fontweight='bold', pad=20)

    # Save before show(): interactive backends may discard the figure once
    # its window is closed.
    plt.savefig(output_path, format='pdf', bbox_inches='tight')

    plt.show()

    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    return token_freq


def main():
    """Script entry point: load the feature dataset and plot its token word cloud."""
    # The word cloud is built from the tokens in the loaded records; the
    # frequency table returned by the plotting step is not needed here.
    plot_token_frequency(read_data('dataset/all_features.json'))


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
