import random
import numpy as np
import gc
from tinypy_code_tracer_tokenizer import TinypyTokenizer

# Fix the seed of both random number generators so that repeated runs are
# reproducible. The stdlib `random` module and NumPy keep separate global
# state, so each one is seeded explicitly with the same value.
seed = 42
random.seed(seed)
np.random.seed(seed)

# Initialize the tokenizer
print("[*] Initializing the tokenizer ...")
tpt = TinypyTokenizer()

# Creating the train dataset: source text file and the destination file
# that receives its tokenized form.
print("[*] Creating the train dataset ...")
train_out_path = "./data/train.txt"
train_bin_out_path = "./data/train.bin"

# Tokenize train.txt into train.bin and print whatever encode_to_file returns.
# The progress message names train.bin so that it matches the path actually
# passed to encode_to_file (it used to mention val.bin, which is not written
# here).
print("[*] We generate the tokenized file of train.txt in train.bin")
print(tpt.encode_to_file(train_out_path, train_bin_out_path))

# Write the vocabulary size to vocab_size.txt: the number of entries in
# tpt.keywords, stored as a decimal string with no trailing newline.
print("[*] Creating the vocab_size.txt file ...")
voc_size_path = "./data/vocab_size.txt"
with open(voc_size_path, "w") as vocab_file:
    vocab_file.write(f"{len(tpt.keywords)}")