from datasets import load_dataset
from transformers import AutoTokenizer
import draftretriever
from tqdm import tqdm
import json

# Index file path for the datastore. Nothing in the visible part of this
# script reads it. NOTE(review): presumably kept for draftretriever — confirm.
datastore_path = "./datastore_chat_small.idx"

# Tokenizer whose vocabulary is used to split each chat message into tokens.
tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")

# Training split of the ShareGPT/Vicuna dataset; the row count is reported
# up front and reused as the progress-bar total.
dataset = load_dataset("Aeala/ShareGPT_Vicuna_unfiltered", split="train")
total_length = len(dataset)
print("Number of samples:", total_length)

# Single flat list of normalized token strings, accumulated across every
# message of every conversation. Message and conversation boundaries are NOT
# preserved, because tokens are added with `extend`.
# NOTE(review): the original comments described a list of lists; confirm that
# the flat layout is what the consumer of the JSON file expects.
tokenized_samples = []

for conversation in tqdm(dataset, total=total_length):
    for message in conversation['conversations']:
        # Encode the message text to ids, then map the ids back to their
        # vocabulary token strings. Every id returned by `encode` is kept.
        token_ids = tokenizer.encode(message['value'])

        # Normalize each token: strip all leading '▁' (U+2581) characters and
        # lowercase the rest. A token made only of '▁' becomes ''.
        tokenized_samples.extend(
            piece.lstrip('▁').lower()
            for piece in tokenizer.convert_ids_to_tokens(token_ids)
        )

# Serialize the flat list of token strings as a single JSON array.
# The encoding is given explicitly so the file does not depend on the
# platform's locale default. json.dump escapes non-ASCII characters by
# default, so the bytes written are plain ASCII either way.
with open("../llm_judge/tokenized_data.json", "w", encoding="utf-8") as f:
    json.dump(tokenized_samples, f)

# Plain string: the message has no placeholders, so no f-prefix is needed.
print("Data has been written to tokenized_data.json")

