from transformers import BertTokenizer
import os
import csv
from tqdm import tqdm

if __name__ == '__main__':
    # Convert every split of a sentiment corpus into fixed-width rows of BERT
    # token ids and store them as CSV (one sentence per row), so later stages
    # can load the ids without running the tokenizer again.
    dataset = 'amazon'
    max_length = 43  # number of token ids in every output row
    data_dir = 'data'
    saving_data_dir = 'data/data_tensor_{}'.format(dataset)
    os.makedirs(saving_data_dir, exist_ok=True)
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data_suffixes = ['dev.0', 'dev.1', 'train.0', 'train.1', 'test.1', 'test.0']
    for data_suffix in tqdm(data_suffixes):
        source_path = os.path.join(data_dir, dataset, 'sentiment.{}'.format(data_suffix))
        tensors = []
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, which can fail or mis-decode non-ASCII review text.
        # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm.
        with open(source_path, 'r', encoding='utf-8') as file:
            # Stream the file line by line rather than loading it all at once.
            for line in file:
                tokenized_string = tokenizer.encode(line, max_length=max_length)
                # Only the 'yelp' corpus keeps every sentence. For any other
                # corpus a sequence that reaches max_length is skipped.
                # NOTE(review): presumably because such a sequence may have
                # been truncated by encode() -- confirm the intent.
                if dataset != 'yelp' and len(tokenized_string) >= max_length:
                    continue
                # Right-pad with the tokenizer's pad id up to max_length.
                padding = [tokenizer.pad_token_id] * (max_length - len(tokenized_string))
                tensors.append(tokenized_string + padding)
        target_path = os.path.join(saving_data_dir, 'tensor_sentiment.{}'.format(data_suffix))
        # newline="" lets the csv module control line endings itself.
        with open(target_path, 'w', newline="") as file:
            writer = csv.writer(file)
            writer.writerows(tensors)
