import os
import json
import time
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

base_path = ''


def process_inat_image(inat_chunk):
    """Build an id -> relative path mapping for a chunk of iNaturalist records.

    Args:
        inat_chunk: Iterable of dict-like records, each providing an ``'id'``
            and a ``'file_name'`` entry.

    Returns:
        A dict mapping every record's ``'id'`` to its ``'file_name'`` joined
        under the ``'inaturalist'`` directory. If an id occurs more than once,
        the last record wins.
    """
    return {
        record['id']: os.path.join('inaturalist', record['file_name'])
        for record in inat_chunk
    }

def process_landmark_image(landmark_chunk):
    """Build an id -> relative path mapping for a chunk of landmark rows.

    Args:
        landmark_chunk: A pandas DataFrame slice with an ``'images'`` column
            whose cells hold space-separated image ids.

    Returns:
        A dict mapping each image id to
        ``landmarks/train/<c0>/<c1>/<c2>/<id>.jpg``, where ``c0``..``c2`` are
        the first three characters of the id.
    """
    path_by_image_id = {}

    for images_field in landmark_chunk['images']:
        for image_id in images_field.split(' '):
            # The first three characters of the id select the nested
            # sub-directories; ids shorter than three characters raise
            # IndexError here.
            first, second, third = image_id[0], image_id[1], image_id[2]
            file_name = image_id + '.jpg'
            path_by_image_id[image_id] = os.path.join(
                'landmarks', 'train', first, second, third, file_name
            )

    return path_by_image_id


# Here, we generate a mapping from the $dataset_image_ids$ of encyclopedic-vqa to the actual image paths of the corresponding $dataset_name$.
# In addition, we collect which images are actually used, in order to delete the unused images and save space.
def _split_into_chunks(rows, num_chunks):
    """Split *rows* into at most ``num_chunks`` contiguous, non-empty slices.

    The first ``num_chunks - 1`` slices hold ``len(rows) // num_chunks`` rows
    each and the last slice takes the remainder. Works for anything that
    supports ``len()`` and positional slicing (lists, DataFrames).
    """
    chunk_size = len(rows) // num_chunks
    chunks = [rows[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks - 1)]
    chunks.append(rows[(num_chunks - 1) * chunk_size:])
    # Skip empty slices (they occur when there are fewer rows than workers)
    # so no worker is started for nothing.
    return [chunk for chunk in chunks if len(chunk)]


def _map_chunks_in_pool(worker, chunks, num_workers, desc):
    """Run *worker* on every chunk in a process pool and merge the returned dicts."""
    merged = {}
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        futures = [executor.submit(worker, chunk) for chunk in chunks]

        for future in tqdm(as_completed(futures), total=len(futures), desc=desc):
            merged.update(future.result())
    return merged


if __name__ == '__main__':

    # Load inaturalist data files (train images first, then val images).
    inat = []
    for split_file in ('train.json', 'val.json'):
        # Use a context manager so the file handle is closed after parsing.
        with open(os.path.join(base_path, 'inaturalist', split_file), 'r') as f:
            inat.extend(json.load(f)['images'])

    # Load google-landmarks data files
    landmark = pd.read_csv(os.path.join(base_path, 'landmarks', 'train_clean.csv'))

    # os.cpu_count() may return None when the count cannot be determined.
    # NOTE(review): the original comment justified the cap of 32 with
    # "Too many CPU causes 426 Client error, since there are too many requests";
    # nothing in this script issues requests, so confirm whether the cap is
    # still needed before changing it.
    num_workers = min(os.cpu_count() or 1, 32)
    global_id_to_path_mapping = {}

    # Chunk the landmark table by its OWN length (previously len(inat) was
    # used, which skewed the work distribution across workers).
    landmark_chunks = _split_into_chunks(landmark, num_workers)

    start = time.time()
    global_id_to_path_mapping.update(
        _map_chunks_in_pool(process_landmark_image, landmark_chunks, num_workers, "Processing landmark chunks")
    )
    time_spent = (time.time() - start) / 3600
    print(f"Finished with Landmark for {time_spent} hours")

    inat_chunks = _split_into_chunks(inat, num_workers)

    start = time.time()
    global_id_to_path_mapping.update(
        _map_chunks_in_pool(process_inat_image, inat_chunks, num_workers, "Processing inat chunks")
    )
    time_spent = (time.time() - start) / 3600
    print(f"Finished with iNat for {time_spent} hours")

    # Persist the combined landmark + iNaturalist mapping.
    with open(os.path.join(base_path, 'dataset_id_to_path.json'), 'w') as f:
        json.dump(global_id_to_path_mapping, f)
