from transformers import BertTokenizer
import os
import csv
from tqdm import tqdm

def pad_to_length(token_ids, length, pad_id):
    """Return a new list: *token_ids* right-padded with *pad_id* up to *length*.

    Args:
        token_ids: List of token ids for one encoded line.
        length: Target row length.
        pad_id: Id appended as padding.

    Returns:
        A new list. If ``token_ids`` already has ``length`` or more items it
        is returned unpadded (and NOT truncated), because multiplying a list
        by a non-positive count yields an empty list.
    """
    return token_ids + [pad_id] * (length - len(token_ids))


def main():
    """Encode every split/attribute text file with BERT and save the ids as CSV.

    For each split in ('valid', 'train', 'test') and each attribute in
    ('male', 'female'), reads
    ``data/multiple_attribute/processed/<split>.fader.with_cat.proc.80000processed_<attribute>``,
    encodes each line with the ``bert-base-uncased`` tokenizer, pads the ids
    to ``max_length`` and writes one CSV row per input line to
    ``data/multiple_attribute/tensor_sentiment.<split>_<attribute>``.
    """
    max_length = 150
    data_dir = 'data'
    # Derive both directories from data_dir instead of repeating the literal.
    saving_data_dir = os.path.join(data_dir, 'multiple_attribute')
    processed_dir = os.path.join(saving_data_dir, 'processed')
    os.makedirs(saving_data_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data_suffixes = ['valid', 'train', 'test']
    attributes = ['male', 'female']

    for data_suffix in tqdm(data_suffixes):
        for attribute in attributes:
            source_path = os.path.join(
                processed_dir,
                '{}.fader.with_cat.proc.80000processed_{}'.format(data_suffix, attribute))
            # Explicit encoding so the decoded text does not depend on the
            # platform locale. Assumes the corpus is UTF-8 -- TODO confirm.
            with open(source_path, 'r', encoding='utf-8') as file:
                # Read everything up front so the inner progress bar knows its total.
                lines = file.readlines()

            # NOTE(review): newer transformers releases may warn when
            # max_length is given without an explicit truncation argument;
            # confirm against the installed version before changing this call.
            tensors = [
                pad_to_length(tokenizer.encode(line, max_length=max_length),
                              max_length, tokenizer.pad_token_id)
                for line in tqdm(lines)
            ]

            target_path = os.path.join(
                saving_data_dir, 'tensor_sentiment.{}_{}'.format(data_suffix, attribute))
            # newline="" lets the csv module control line endings itself.
            with open(target_path, 'w', newline="", encoding='utf-8') as file:
                csv.writer(file).writerows(tensors)


if __name__ == '__main__':
    main()
