import os
from tqdm import tqdm
import random


def _read_sentences(path, max_length):
    """Return the newline-terminated first tab field of each short line in `path`."""
    sentences = []
    with open(path, 'r') as file:
        for raw_line in file:
            # Taking the first tab-separated field drops the line terminator
            # whenever a tab is present; strip it in the untabbed case too so
            # every sentence can be re-terminated uniformly below.
            text = raw_line.split('\t')[0].rstrip('\n')
            if len(text.split(' ')) < max_length:
                sentences.append(text + '\n')
    return sentences


def open_data(split, data_suffixes, max_length=100):
    """Load a class-balanced, shuffled list of sentences for one data split.

    For every suffix the file
    ``<data_dir>/<split>.fader.with_cat.proc.80000processed_<suffix>`` is read,
    where ``data_dir`` is the module-level global. Only the first
    tab-separated field of each line is kept, and sentences with
    ``max_length`` or more space-separated tokens are dropped.

    Args:
        split: Name of the split used as the file-name prefix (e.g. 'train').
        data_suffixes: Iterable of file-name suffixes, one per class. When it
            is empty or None, 'male' and 'female' are used.
        max_length: Exclusive upper bound on the number of space-separated
            tokens a sentence may have.

    Returns:
        A list of newline-terminated sentences containing the same number of
        entries from every suffix (the size of the smallest class), in random
        order.

    Raises:
        OSError: If one of the input files cannot be opened.
    """
    suffixes = list(data_suffixes) if data_suffixes else ['male', 'female']
    name_template = '{}.fader.with_cat.proc.80000processed_{}'

    sentences_per_suffix = [
        _read_sentences(os.path.join(data_dir, name_template.format(split, suffix)),
                        max_length)
        for suffix in suffixes
    ]

    # Truncate every class to the size of the smallest one so the result is balanced.
    per_class = min(len(sentences) for sentences in sentences_per_suffix)

    balanced = []
    # Later suffixes go first, matching the historical pre-shuffle ordering
    # (female before male) so seeded runs shuffle identically.
    for sentences in reversed(sentences_per_suffix):
        balanced.extend(sentences[:per_class])

    random.shuffle(balanced)
    return balanced


if __name__ == '__main__':
    # Build one gender-balanced text file per split for GPT-2 training.
    # `data_dir` is assigned at module level because open_data() reads it as a global.
    data_dir = 'multiple_attribute/processed'
    # Kept for reference; it is not passed to open_data(), which applies its own limit.
    max_length = 100
    data_suffixes = ['male', 'female']
    splits = ['test', 'valid', 'train']
    for split in tqdm(splits):
        dev_lines = open_data(split, data_suffixes)
        output_path = os.path.join(data_dir, 'gender.{}_for_gpt2.txt'.format(split))
        with open(output_path, 'w') as file:
            # writelines() adds no separators, so make sure every sentence ends
            # with exactly one newline instead of being glued to the next one.
            file.writelines(line.rstrip('\n') + '\n' for line in dev_lines)
