from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
# from nltk.corpus import movie_reviews
import nltk
nltk.download('movie_reviews')

import os

# Prepare the training corpus from the NLTK movie_reviews corpus: collect the
# raw text of every positive and negative review and dump it to a text file.
# Only the review text is kept; the sentiment category is not written out.
pos_reviews = nltk.corpus.movie_reviews.fileids('pos')
neg_reviews = nltk.corpus.movie_reviews.fileids('neg')

# Positive reviews come first, followed by the negative ones.
train_data = [
    nltk.corpus.movie_reviews.raw(fileid)
    for fileid in [*pos_reviews, *neg_reviews]
]

train_path = os.path.join(".", "imdb_train.txt")

# Use an explicit UTF-8 encoding instead of the platform's locale default so
# the file is written the same way on every OS and non-ASCII characters in a
# review cannot raise UnicodeEncodeError.
with open(train_path, "w", encoding="utf-8") as f:
    # A newline is appended after each review to separate consecutive reviews.
    f.writelines(example + "\n" for example in train_data)

# Build an untrained tokenizer around a BPE model that uses "<unk>" as its
# unknown token, and attach the Whitespace pre-tokenizer to it.
bpe_model = BPE(unk_token="<unk>")
tokenizer = Tokenizer(bpe_model)
tokenizer.pre_tokenizer = Whitespace()

# Trainer configuration: target vocabulary size, minimum frequency, and the
# special tokens to register in the vocabulary.
trainer = BpeTrainer(
    vocab_size=10000,
    min_frequency=2,
    special_tokens=["<unk>", "<bos>", "<eos>"],
)

# Fit the tokenizer on the corpus file written to train_path.
tokenizer.train([train_path], trainer=trainer)

# print(tokenizer)

# # Encode some text using the trained BPE model
# encoded_text = tokenizer.encode("This is some text to encode.").ids
# print(encoded_text)