import json
import pickle

import joblib
import numpy
import os
import tarfile
import sys

# Make the directory three levels above this file importable.
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

# JSON file holding a list of records, each with 'url' and 'caption' keys.
path_json = r'/xxx/public_data/CC12M/recaption/filtered_cc12m.json'
# Directory that is scanned for tar archives further down in the script.
dataset_dir = r'/xxx/public_data/CC12M/cc12m-wds'

# JSON text is UTF-8; pass the encoding explicitly so decoding does not depend
# on the platform's locale default.
with open(path_json, 'r', encoding='utf-8') as f:
    list_all_url_captions = json.load(f)

# Map every url to a one-element caption list so that more captions can be
# appended later. If a url occurs more than once in the JSON file, the last
# record's caption is the one that is kept.
dict_url_to_caption = {
    item['url']: [item['caption']] for item in list_all_url_captions
}
print('total number dict:{}'.format(len(dict_url_to_caption)))

# os.listdir returns entries in arbitrary order; sort them so archives are
# processed (and captions appended) in a reproducible order across runs.
all_tars = sorted(os.listdir(dataset_dir))
print('number tars:{}'.format(len(all_tars)))
for item_tar in all_tars:
    print('process:{}'.format(item_tar))
    if not item_tar.endswith('tar'):
        continue
    path_tar = os.path.join(dataset_dir, item_tar)
    with tarfile.open(path_tar, 'r') as tar:
        # Iterate the archive lazily instead of materialising every member
        # up front with getmembers().
        for member in tar:
            if not member.name.endswith('.json'):
                continue
            member_file = tar.extractfile(member)
            # extractfile returns None for members that are not regular
            # files or links; there is nothing to read for those.
            if member_file is None:
                continue
            # Close the extracted file object as soon as it has been read.
            with member_file:
                dict_item = json.loads(member_file.read())
            # Append to the url's existing caption list, creating the list
            # for urls that have not been seen before.
            dict_url_to_caption.setdefault(dict_item['url'], []).append(dict_item['caption'])

# Number of captions collected for each url.
captions_per_url = [len(captions) for captions in dict_url_to_caption.values()]
print(f'max len:{numpy.max(captions_per_url)}')
print(f'mean count:{numpy.mean(captions_per_url)}')
print(f'min count:{numpy.min(captions_per_url)}')

print(f'total size:{len(dict_url_to_caption)}')
# Persist the url -> caption-list mapping with joblib.
with open('/xxx/public_data/CC12M/recaption/origin_mix_recaption.joblib', 'wb') as out_file:
    joblib.dump(dict_url_to_caption, out_file)
