import json
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize

# Fetch the Punkt tokenizer data that word_tokenize needs at runtime.
# NOTE(review): newer NLTK releases may require the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')


# Decode explicitly as UTF-8 (the standard JSON encoding) instead of relying on
# the platform's locale default, which can fail or garble non-ASCII text.
with open("conversations_cleaned.json", "r", encoding="utf-8") as f:
    conversations = json.load(f)

# Quick sanity check of the top-level structure of the loaded data.
print(conversations.keys())
print(len(conversations["conversations_by_topic"]))  # 26 per the original author's note

# Collect per-dialogue statistics, both overall and grouped by the class of
# the other speaker:
#   dialogue_len   / dialogue_len_per_type   -> number of entries in the dialogue
#   dialogue_token / dialogue_token_per_type -> word_tokenize token count
dialogue_len = []
dialogue_token = []
dialogue_len_per_type = {}
dialogue_token_per_type = {}
for conversation in conversations["conversations_by_topic"]:
    # Schema notes kept from the original author's debugging prints
    # (not re-verified here):
    #   conversation keys:  topic, main_speaker, dialogues (5 dialogues)
    #   main_speaker keys:  name, background
    #   dialogue keys:      other_speaker, conversation
    #   other_speaker keys: name, background, class
    for dialogue in conversation["dialogues"]:
        speaker_class = dialogue["other_speaker"]["class"]
        turns = dialogue["conversation"]
        n_turns = len(turns)

        # Join turns with a space so the last word of one turn does not fuse
        # with the first word of the next (which would undercount tokens).
        text = " ".join(turn["text"] for turn in turns)
        # Tokenize once and reuse the count for both the overall and the
        # per-class statistics.
        n_tokens = len(word_tokenize(text))

        dialogue_len_per_type.setdefault(speaker_class, []).append(n_turns)
        dialogue_token_per_type.setdefault(speaker_class, []).append(n_tokens)
        dialogue_len.append(n_turns)
        dialogue_token.append(n_tokens)

        # Report unusually long dialogues (more than 30 entries).
        if n_turns > 30:
            print(n_turns, conversation["topic"], dialogue["other_speaker"]["name"])

# Print two reports with an identical layout: first the dialogue lengths, then
# the token counts. Each report shows the overall count, mean and max/min,
# followed by the same figures for every speaker class.
for heading, overall, by_class in (
    ("Dialogue Length per Type", dialogue_len, dialogue_len_per_type),
    ("Dialogue Token per Type", dialogue_token, dialogue_token_per_type),
):
    print(heading)
    total = len(overall)
    print(total)
    print(sum(overall) / total)
    print(max(overall), min(overall))

    # The blank line printed after each class separates it from the next
    # class and from the following report.
    for class_type in by_class:
        lengths = by_class[class_type]
        count = len(lengths)
        mean = sum(lengths) / count
        print(f"Class Type: {class_type}", count)
        print(f"Average Length: {mean}")
        print(f"Max Length: {max(lengths)}")
        print(f"Min Length: {min(lengths)}")
        print()


# # Plot a histogram of dialogue lengths
# plt.hist(dialogue_len, bins=20, edgecolor='black')
# plt.title('Distribution of Dialogue Lengths')
# plt.xlabel('Dialogue Length')
# plt.ylabel('Frequency')
# plt.show()

