import os
import glob
import json

import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm
from PIL import Image
from io import BytesIO

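# Convert the parquet files of AToMiC-Images-v0.2, which contain the images of the KB, into jpg files.
# Outputs:
# 1. image_url_to_id.json: dictionary mapping each image_url to its '$image_id$.jpg' file name
#    (the KB contains image_urls, not image_ids)
# 2. $image_id$.jpg: one JPEG per image, saved in the same directory as the parquet files
# Note: each parquet file is deleted once its images have been extracted.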

base_path = ''


def load_url_mapping(mapping_path, journal_path):
    """Rebuild the image_url -> file name mapping left behind by earlier runs.

    Entries are read from the final JSON file (if it exists) and then from
    the journal (if it exists), so journal entries win on duplicate URLs.
    Blank or unparsable journal lines are skipped.

    Args:
        mapping_path: Path of the final JSON mapping file.
        journal_path: Path of the line-oriented journal file.

    Returns:
        A dict of the recovered entries; empty when neither file exists.

    Raises:
        ValueError: If the final JSON mapping file exists but is not valid JSON.
    """
    mapping = {}

    if os.path.exists(mapping_path):
        with open(mapping_path, encoding='utf-8') as f:
            mapping.update(json.load(f))

    if os.path.exists(journal_path):
        with open(journal_path, encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    mapping.update(json.loads(line))
                except ValueError:
                    # A record cut short by an interrupted write. Its parquet
                    # file is only removed after a complete write, so that
                    # file is still on disk and gets converted again.
                    continue

    return mapping


def append_to_journal(journal_path, entries):
    """Append one parquet file's mapping entries to the journal as a JSON line.

    The data is flushed and fsynced before returning, so the caller can
    delete the source parquet file afterwards.

    Args:
        journal_path: Path of the journal file (created if missing).
        entries: Dict of image_url -> file name for a single parquet file.
    """
    with open(journal_path, 'a', encoding='utf-8') as f:
        # The leading newline terminates any half-written record left by an
        # interrupted run, keeping this record on a line of its own.
        f.write('\n' + json.dumps(entries))
        f.flush()
        os.fsync(f.fileno())


def write_url_mapping(mapping_path, mapping):
    """Write the mapping as JSON, replacing any previous file in one step.

    The JSON is written to '<mapping_path>.tmp' first and then moved over
    mapping_path with os.replace, so a failed dump never leaves a truncated
    mapping file behind.

    Args:
        mapping_path: Destination path of the JSON file.
        mapping: Dict of image_url -> file name to serialize.
    """
    tmp_path = mapping_path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(mapping, f)
    os.replace(tmp_path, mapping_path)


def convert_parquet_file(pq_file, image_dir):
    """Save every image stored in one parquet file as an RGB JPEG.

    Args:
        pq_file: Path of the parquet file to read.
        image_dir: Directory that receives the '<image_id>.jpg' files.

    Returns:
        Dict mapping each row's image_url to its '<image_id>.jpg' file name.
    """
    # Only these three columns are used, so skip loading the others.
    pq_df = pd.read_parquet(pq_file, columns=['image_id', 'image_url', 'image'])

    entries = {}
    rows = zip(pq_df['image_id'], pq_df['image_url'], pq_df['image'])
    for image_id, image_url, image_data in rows:
        entries[image_url] = image_id + '.jpg'

        # Decode the raw bytes with PIL and re-encode them as an RGB JPEG.
        with Image.open(BytesIO(image_data['bytes'])) as image:
            image.convert('RGB').save(os.path.join(image_dir, f'{image_id}.jpg'), 'JPEG')

    return entries


if __name__ == '__main__':

    image_dir = os.path.join(base_path, 'AToMiC-Images-v0.2', 'data')
    mapping_path = os.path.join(base_path, 'image_url_to_id.json')
    journal_path = mapping_path + '.journal'

    pq_file_list = sorted(glob.glob(os.path.join(image_dir, 'train-**.parquet')))

    # Start from whatever earlier (possibly interrupted) runs already recorded:
    # their parquet files are gone, so those entries cannot be regenerated.
    image_url_to_id = load_url_mapping(mapping_path, journal_path)

    for pq_file in tqdm(pq_file_list):

        entries = convert_parquet_file(pq_file, image_dir)
        image_url_to_id.update(entries)

        # Persist this file's entries before deleting it.
        append_to_journal(journal_path, entries)

        os.remove(pq_file)
        print(f'Removed {pq_file}!')

    write_url_mapping(mapping_path, image_url_to_id)

    # The journal is redundant once the final mapping is on disk.
    if os.path.exists(journal_path):
        os.remove(journal_path)

    print('Done!')
