import os
import json
import pandas as pd
from tqdm import tqdm
from collections import Counter

# Languages to convert. Iterated directly so every entry is processed
# (an index loop over a shorter range silently skipped the last one).
LANGUAGES = ('en', 'es', 'fr', 'ja', 'ko', 'zh')

# Number of equally sized JSON files written per language.
NUMBER_OF_CHUNKS = 20

INPUT_TEMPLATE = "data/pawx/x-final/{}/test_2k.tsv"
OUTPUT_DIR = 'data/pawx/final_data'
OUTPUT_TEMPLATE = 'data/pawx/final_data/test_{}_formated_{}.json'


def build_records(rows):
    """Return a dict mapping a running index to one record per input row.

    Args:
        rows: Iterable of ``(q1, q2, duplicate)`` triples.

    Returns:
        ``{index: {"q1": ..., "q2": ..., "duplicate": ..., "scores": {}}}``
        with indices starting at 0 in input order. Every record gets its own
        empty ``scores`` dict.
    """
    return {
        index: {"q1": q1, "q2": q2, "duplicate": label, "scores": {}}
        for index, (q1, q2, label) in enumerate(rows)
    }


def split_into_chunks(records, number_of_chunks=NUMBER_OF_CHUNKS):
    """Split ``records`` into ``number_of_chunks`` equally sized dicts.

    Args:
        records: Dict to split; insertion order is preserved.
        number_of_chunks: How many consecutive slices to produce.

    Returns:
        A list of ``number_of_chunks`` dicts that together contain every item
        of ``records`` exactly once.

    Raises:
        ValueError: If ``number_of_chunks`` is smaller than 1 or does not
            divide ``len(records)`` evenly.
    """
    if number_of_chunks < 1:
        raise ValueError("number_of_chunks must be at least 1")
    total = len(records)
    if total % number_of_chunks != 0:
        raise ValueError(
            "{} records cannot be split evenly into {} chunks".format(
                total, number_of_chunks))
    chunk_length = total // number_of_chunks
    # Materialise the items once instead of once per chunk.
    items = list(records.items())
    return [
        dict(items[position * chunk_length:(position + 1) * chunk_length])
        for position in range(number_of_chunks)
    ]


def convert_language(lang):
    """Convert one language's test TSV into ``NUMBER_OF_CHUNKS`` JSON files.

    Reads ``INPUT_TEMPLATE`` formatted with ``lang`` as a tab-separated file,
    replaces missing cells with 0, and uses the ``sentence1``, ``sentence2``
    and ``label`` columns. Each chunk is written to ``OUTPUT_TEMPLATE``
    formatted with ``lang`` and the chunk index.

    Raises:
        ValueError: If the number of rows is not a multiple of
            ``NUMBER_OF_CHUNKS``; nothing is written in that case.
    """
    df = pd.read_csv(INPUT_TEMPLATE.format(lang), sep="\t").fillna(0)
    q1 = df.sentence1.values.tolist()
    q2 = df.sentence2.values.tolist()
    duplicate = df.label.values.tolist()
    print('Statistic Counters', Counter(duplicate))

    # All three lists come from columns of the same frame, so they always
    # have the same length and zip() drops nothing.
    records = build_records(tqdm(zip(q1, q2, duplicate), total=len(q1)))
    print('Number of dic', len(records))
    print('Chunk length {}'.format(len(records) / NUMBER_OF_CHUNKS))

    # Validate and split before touching the file system.
    chunks = split_into_chunks(records, NUMBER_OF_CHUNKS)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    for chunk_index, chunk in enumerate(chunks):
        with open(OUTPUT_TEMPLATE.format(lang, chunk_index), "w") as file:
            json.dump(chunk, file)


def main():
    """Convert the test split of every language in ``LANGUAGES``."""
    for lang in LANGUAGES:
        convert_language(lang)


if __name__ == "__main__":
    main()
