from transformers import BertTokenizer
import os
from tqdm import tqdm
from collections import Counter

def length_stats(lines, encode, max_length):
    """Summarize encoded lengths for an iterable of text lines.

    Args:
        lines: Iterable of strings; each item is passed to ``encode`` as-is.
        encode: Callable mapping one string to a sized sequence (anything
            ``len()`` accepts).
        max_length: Threshold; a line counts as "over" only when its
            encoded length is strictly greater than this value.

    Returns:
        A ``(over_threshold, longest)`` tuple: how many lines exceed
        ``max_length`` and the largest encoded length seen. Both are 0 when
        ``lines`` is empty.
    """
    lengths = [len(encode(line)) for line in lines]
    over_threshold = sum(length > max_length for length in lengths)
    # default=0 keeps an empty input from raising ValueError in max().
    longest = max(lengths, default=0)
    return over_threshold, longest


if __name__ == '__main__':
    dataset = 'amazon'
    max_length = 43
    data_dir = '../'
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    data_suffixes = ['dev.0', 'dev.1', 'train.0', 'train.1', 'test.1', 'test.0']
    for data_suffix in tqdm(data_suffixes):
        path = os.path.join(data_dir, dataset, 'sentiment.{}'.format(data_suffix))
        # Explicit encoding so decoding does not depend on the platform locale.
        # NOTE(review): assumes the corpus files are UTF-8 - confirm.
        with open(path, 'r', encoding='utf-8') as file:
            # Stream the file line by line instead of loading it with readlines().
            over_threshold, longest = length_stats(
                file, tokenizer.encode, max_length
            )
        print('Threshold', over_threshold)
        print('Max length', longest)
