from datasets import load_dataset
from transformers import GPT2TokenizerFast
from collections import Counter

# Stream the OpenWebText training split and tally how often each GPT-2
# token id occurs across all documents.
dataset = load_dataset("openwebtext", split="train", streaming=True)
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
token_counts = Counter()

# Report progress only every PROGRESS_EVERY documents: a print per document
# floods stdout and adds avoidable I/O to every iteration of the loop.
PROGRESS_EVERY = 10_000

i = 0  # number of documents processed; stays 0 if the stream is empty
for i, example in enumerate(dataset, start=1):
    text = example["text"]
    # add_special_tokens=False so that only ids produced from the document
    # text itself are counted.
    tokens = tokenizer(text, add_special_tokens=False)["input_ids"]
    token_counts.update(tokens)
    if i % PROGRESS_EVERY == 0:
        print(i)

# Show the final document count even when it is not a multiple of
# PROGRESS_EVERY (nothing is printed for an empty stream).
if i % PROGRESS_EVERY:
    print(i)

# Build a dense id -> count table covering every id in the tokenizer's base
# vocabulary, so that ids which never occurred are listed with a count of 0.
vocab_size = tokenizer.vocab_size  # expected to be 50257 for GPT-2
# Subscripting a Counter yields 0 for a missing key without inserting it.
token_frequencies = {tid: token_counts[tid] for tid in range(vocab_size)}

# Print one line per token id so the counts can be checked by eye.
for token_id, freq in token_frequencies.items():
    token_str = tokenizer.decode([token_id])
    # !r escapes newlines, tabs and other control characters, so every token
    # stays on exactly one output line and whitespace-only tokens are visible.
    print(f"Token ID {token_id}: {token_str!r} -> Frequency: {freq}")
# TODO: write token_frequencies to a file instead; that is easier than
# reading this many lines from the terminal.
