import os

import nltk
import pickle
import sys
import tarfile
from multiprocessing import Process, Queue, Manager, current_process

# Extend the import search path with a project checkout.
# NOTE(review): nothing visible in this file imports from that path -- confirm it is still needed.
sys.path.append('/xxx/aaa/eva-clip/EVA-CLIP/rei')

# Directory whose entries are handed to the workers as tar archive names.
dataset_dir = r'/xxx/public_data/CC3M/CC3M_290W'

# Shared-object manager used to create the cross-process dict/list/Value containers.
# NOTE(review): this is created at import time, outside the `__main__` guard; under
# the "spawn" start method child processes re-import this module -- confirm the
# script is only run with "fork".
manager = Manager()


def _record_sentence(sentence, dict_sentence_to_nouns, dict_nouns_to_count, global_repeat):
    """POS-tag one caption and record its nouns in the shared containers."""
    tokens = nltk.word_tokenize(sentence)
    tagged_sent = nltk.pos_tag(tokens, tagset='universal')

    if sentence in dict_sentence_to_nouns:
        # Caption already seen: count the repeat; its noun list is rebuilt below.
        # NOTE(review): this read-modify-write on the shared value is not guarded
        # by a lock, so concurrent workers can lose increments.
        global_repeat.value += 1
        if global_repeat.value % 1000 == 0:
            print('global_repeat:{} total sentence dict:{}'.format(global_repeat.value,
                                                                   len(dict_sentence_to_nouns)))

    # Collect the nouns locally and publish them with a single assignment
    # instead of creating a nested manager list and appending noun by noun
    # (one inter-process round trip per append).
    nouns = [word for word, item_tag in tagged_sent if item_tag == 'NOUN']
    dict_sentence_to_nouns[sentence] = nouns

    for word in nouns:
        # NOTE(review): get-then-set on the shared dict is not atomic across
        # processes; exact counts would need a shared lock.
        dict_nouns_to_count[word] = dict_nouns_to_count.get(word, 0) + 1


def _process_tar(rank, item_tar, dict_sentence_to_nouns, dict_nouns_to_count, global_repeat):
    """Read every ``.txt`` member of one archive and record its caption."""
    path_tar = os.path.join(dataset_dir, item_tar)
    with tarfile.open(path_tar, 'r') as tar:
        for index, member in enumerate(tar.getmembers()):
            if index % 1000 == 0:
                # Periodic progress report, including the shared dict sizes.
                print('rank:{}, tar:{}, index:{}'.format(rank, item_tar, index))
                print('rank:{} len dict_sentence_to_nouns:{}, len dict_nouns_to_count:{}'.format(
                    rank, len(dict_sentence_to_nouns), len(dict_nouns_to_count)))

            # Only members whose name ends with .txt are read.
            if not member.name.endswith('.txt'):
                continue

            f = tar.extractfile(member)
            if f is None:
                # extractfile() returned no file object for this member.
                continue
            with f:
                sentence = f.read().decode('utf-8')

            _record_sentence(sentence, dict_sentence_to_nouns, dict_nouns_to_count, global_repeat)


def worker_process(rank, work_queue, dict_sentence_to_nouns, dict_nouns_to_count, global_repeat):
    """Consume tar archive names from *work_queue* until a ``None`` sentinel arrives.

    For every ``.txt`` member of each archive the text is tokenized and
    POS-tagged with nltk (universal tagset); the words tagged ``NOUN`` are
    stored under the caption in *dict_sentence_to_nouns* and counted in
    *dict_nouns_to_count*.  A caption that is already present increments
    *global_repeat*; its noun list is overwritten and its nouns are counted
    again.

    Args:
        rank: Worker index, used only in log output.
        work_queue: Queue yielding archive names relative to ``dataset_dir``,
            terminated by ``None``.
        dict_sentence_to_nouns: Shared mapping caption -> list of nouns.
        dict_nouns_to_count: Shared mapping noun -> occurrence count.
        global_repeat: Shared object whose ``.value`` counts repeated captions.

    An entry that is a directory or cannot be read as a tar archive is
    logged and skipped so that one bad entry does not terminate the worker.
    """
    print('rank:{}, start'.format(rank))
    while True:
        item_tar = work_queue.get()
        # Check for the sentinel before logging so that no "process tar:None"
        # line is printed.
        if item_tar is None:
            break
        print('rank:{}, process tar:{}'.format(rank, item_tar))
        try:
            _process_tar(rank, item_tar, dict_sentence_to_nouns, dict_nouns_to_count, global_repeat)
        except (tarfile.TarError, IsADirectoryError) as err:
            print('rank:{}, skip tar:{}, error:{!r}'.format(rank, item_tar, err))

    print('rank:{}, finish'.format(rank))


def main(num_workers=10):
    """Extract nouns from every tar archive in ``dataset_dir`` and pickle the result.

    Archive names are placed on a queue and consumed by *num_workers*
    ``worker_process`` processes that fill two manager-backed dictionaries.
    After all workers have exited, the dictionaries are copied into plain
    Python objects and written to ``dict_sentences_nouns.pkl`` as the list
    ``[noun -> count, caption -> list of nouns]``.

    Args:
        num_workers: Number of worker processes to start (default 10).
    """
    # Queue only regular files that really are tar archives; any other
    # directory entry would make tarfile.open() raise inside a worker.
    all_files = [
        name for name in sorted(os.listdir(dataset_dir))
        if os.path.isfile(os.path.join(dataset_dir, name))
        and tarfile.is_tarfile(os.path.join(dataset_dir, name))
    ]

    dict_sentence_to_nouns = manager.dict()
    dict_nouns_to_count = manager.dict()
    global_repeat = manager.Value('i', 0)

    queue = Queue()
    for tar_file in all_files:
        queue.put(tar_file)
    # One None sentinel per worker, queued after all real work items.
    for _ in range(num_workers):
        queue.put(None)

    all_workers = []
    for rank in range(num_workers):
        item_process = Process(target=worker_process,
                               args=(rank, queue, dict_sentence_to_nouns, dict_nouns_to_count, global_repeat))
        item_process.start()
        all_workers.append(item_process)

    # Wait for all worker processes to finish.
    for worker in all_workers:
        worker.join()

    # Materialize the proxies as plain objects. copy()/items() fetch the whole
    # mapping in one manager call, whereas dict(proxy) looks up every key with
    # a separate round trip.
    nouns_to_count = dict_nouns_to_count.copy()
    sentence_to_nouns = {
        sentence: list(nouns)
        for sentence, nouns in dict_sentence_to_nouns.items()
    }

    with open('dict_sentences_nouns.pkl', 'wb') as f:
        pickle.dump([nouns_to_count, sentence_to_nouns], f)


# Run the extraction only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
