from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import glob
import json
from tqdm import tqdm
# Script: plot a histogram of how many tokens each stored question occupies
# when encoded with the Qwen3-32B tokenizer.

MODEL_NAME = "Qwen/Qwen3-32B"
JSON_PATTERN = "result/finaldata2/**/*.json"
OUTPUT_PATH = "question.png"


def find_json_files(pattern=JSON_PATTERN):
    """Return the paths matching *pattern*, sorted for reproducible ordering.

    ``recursive=True`` is required for ``**`` to match any number of nested
    directories; without it ``glob`` treats ``**`` like a plain ``*`` and only
    files exactly one directory level deep are found.
    """
    return sorted(glob.glob(pattern, recursive=True))


def load_questions(json_files):
    """Read the ``'question'`` value from every JSON file in *json_files*.

    Files whose top-level value is not an object, or that lack a
    ``'question'`` key, are reported on stdout and skipped so that one stray
    file does not abort the whole run.

    Returns:
        A list of the ``'question'`` values, in the order of *json_files*.

    Raises:
        OSError: if a file cannot be opened.
        json.JSONDecodeError: if a file does not contain valid JSON.
    """
    questions = []
    for json_file in json_files:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict) or "question" not in data:
            print(f"Skipping {json_file}: no 'question' field")
            continue
        questions.append(data["question"])
    return questions


if __name__ == "__main__":
    # NOTE(review): trust_remote_code=True allows code shipped with the model
    # repository to run locally; confirm it is actually needed for this model.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    json_list = find_json_files()
    texts = load_questions(json_list)
    if not texts:
        # Fail with an actionable message instead of an IndexError below.
        raise SystemExit(f"No questions found for pattern {JSON_PATTERN!r}")
    print(texts[0], len(texts))

    # Length of the encoded id sequence for each question; whether special
    # tokens are included depends on the tokenizer's defaults.
    token_counts = [
        len(tokenizer.encode(text)) for text in tqdm(texts, desc="Encoding texts")
    ]

    plt.hist(token_counts, bins=50, edgecolor="black")
    plt.xlabel("Token number")
    plt.ylabel("Problem number")
    plt.title("Question Token (Qwen3-32B)")
    plt.savefig(OUTPUT_PATH, dpi=300, bbox_inches="tight")
