from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer
import tiktoken
import json
import base64

def _iter_texts_up_to(examples, target_bytes: int):
    """Yield each example's ``"text"`` until the byte budget is reached.

    The size of every text is measured as its UTF-8 encoded length. The
    example that makes the running total reach or exceed *target_bytes* is
    still yielded, and iteration stops right after it.
    """
    seen_bytes = 0
    for example in examples:
        text = example["text"]
        seen_bytes += len(text.encode("utf-8"))
        yield text
        if seen_bytes >= target_bytes:
            return


def train_tokenizer(
    *,
    target_bytes: int = 20_000_000_000,
    vocab_size: int = 16384,
    min_frequency: int = 2,
    output_path: str = "tokenizer.json",
) -> None:
    """Train a byte-level BPE tokenizer on the ``pg19`` train split and save it.

    Texts are taken from the dataset in order until their cumulative UTF-8
    size reaches *target_bytes* (or the dataset is exhausted), then fed to
    ``ByteLevelBPETokenizer.train_from_iterator``. The trained tokenizer is
    written to *output_path*.

    Args:
        target_bytes: Byte budget (UTF-8) of text to sample for training.
        vocab_size: Vocabulary size passed to the trainer.
        min_frequency: ``min_frequency`` value passed to the trainer.
        output_path: File the trained tokenizer is saved to.
    """
    dataset = load_dataset("pg19", split="train")

    # Stream the sampled texts straight into the trainer. Collecting them in
    # a list first would keep up to `target_bytes` of text in memory at once.
    sampled_texts = _iter_texts_up_to(dataset, target_bytes)

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train_from_iterator(
        sampled_texts,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=["<|endoftext|>", "<|mask|>"],
    )
    tokenizer.save(output_path)

def main() -> None:
    """Script entry point: run tokenizer training."""
    train_tokenizer()
    

# Run training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
