import json
import pickle

import tqdm
import os
import tarfile
import sys

sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

# Build a URL -> [caption] lookup and pickle it.
#
# Captions listed in the recaption JSON take priority; captions found in the
# per-sample JSON members of the tar archives are only used for URLs that the
# recaption JSON does not cover.

# JSON file holding a list of records, each read below via 'url' and 'caption'.
path_json = r'/xxx/public_data/CC3M/recaption/filtered_cc3m_sbu.json'
# Directory scanned for tar archives (alternative location kept for reference).
# dataset_dir = r'/xxx/public_data/CC3M/CC3M_290W'
dataset_dir = r'/dockerdata/CC3M_290W'
# Destination of the pickled mapping.
path_output = '/xxx/public_data/CC3M/CC3M_AUGs/recaption.pkl'

# JSON text is UTF-8; state it explicitly instead of relying on the locale.
with open(path_json, 'r', encoding='utf-8') as json_file:
    list_all_url_captions = json.load(json_file)

# Each value is a one-element list. If a URL appears more than once in the
# JSON, the last record wins (plain dict assignment semantics).
dict_url_to_caption = {
    item['url']: [item['caption']] for item in list_all_url_captions
}
print('total number dict:{}'.format(len(dict_url_to_caption)))

# os.listdir returns entries in arbitrary order. Because the first archive to
# provide a URL wins, sort the listing so repeated runs give the same result.
all_tars = sorted(os.listdir(dataset_dir))
for item_tar in all_tars:
    if not item_tar.endswith('tar'):
        continue
    print('process:{}'.format(item_tar))
    path_tar = os.path.join(dataset_dir, item_tar)
    with tarfile.open(path_tar, 'r') as tar:
        # Iterate the archive directly so members are handled as their headers
        # are read, rather than building the complete member list up front.
        for member in tar:
            if not member.name.endswith('.json'):
                continue
            member_file = tar.extractfile(member)
            if member_file is None:
                # extractfile() yields None for members with no file data.
                continue
            with member_file:
                dict_item = json.load(member_file)
            url = dict_item['url']
            caption = dict_item['caption']
            # Never overwrite an entry that already exists: recaption JSON
            # entries and earlier archives keep precedence.
            dict_url_to_caption.setdefault(url, [caption])

print('total size:{}'.format(len(dict_url_to_caption)))
with open(path_output, 'wb') as pickle_file:
    pickle.dump(dict_url_to_caption, pickle_file)
