# Convert VLMEvalKit tsv files to HF datasets and save images to disk for image tokenization
import os
import torch
import random
import numpy as np
from PIL import Image
from io import BytesIO
import datasets
import logging
from datasets import load_dataset, load_from_disk
from tqdm.auto import tqdm
import pandas as pd
import multiprocessing
from utils import dataset_URLs, download_file, decode_base64_to_image_file
import argparse


def tsv_to_jsonl(tsv_path, jsonl_path):
    """Rewrite a tab-separated file as JSON lines, ordered by its ``index`` column.

    Args:
        tsv_path: Path of the tab-separated input file; it must contain an
            ``index`` column.
        jsonl_path: Destination path; one JSON record is written per row.

    Returns:
        A dict mapping each ``index`` value below 1e6 to the zero-based
        position of its row in the sorted output. Rows whose ``index`` is
        1e6 or larger are written to the file but left out of the mapping.
    """
    table = pd.read_csv(tsv_path, sep='\t')
    print(table.columns)

    # Sort first so that positions in the mapping match line numbers in the output file.
    table = table.sort_values(by=['index'])
    table.to_json(jsonl_path, orient='records', lines=True)

    position_by_index = {}
    for position, row_index in enumerate(table['index'].tolist()):
        if row_index < 1e6:
            position_by_index[row_index] = position
    return position_by_index

def parse_args(argv=None):
    """Parse the command-line options and print basic hardware diagnostics.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the behaviour of calling
            ``parse_args()`` without arguments.

    Returns:
        argparse.Namespace with the attributes ``output_dir``,
        ``dataset_name``, ``dataset_name_list``, ``seed``,
        ``process_batch_size`` and ``process_num_workers``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir', type=str, default='YOUR_ROOT_PATH/data/MLLM/Evaluation', help='path to save the output')
    parser.add_argument('--dataset_name', type=str, default='MMBench_DEV_EN', help='dataset name')
    parser.add_argument('--dataset_name_list', type=str, default='MMBench_DEV_EN,MMBench_TEST_EN,MMBench_DEV_CN,MMBench_TEST_CN,CCBench', help='dataset name list')
    parser.add_argument('--seed', type=int, default=42, help='random seed')
    parser.add_argument('--process_batch_size', type=int, default=200, help='process batch size')
    parser.add_argument('--process_num_workers', type=int, default=multiprocessing.cpu_count(), help='preprocessing num workers')
    print('Number of available cores:', multiprocessing.cpu_count())
    print('Number of available gpus:', torch.cuda.device_count())

    try:
        print('GPU model name:', torch.cuda.get_device_name(0))
        print('GPU memory size:', torch.cuda.get_device_properties(0).total_memory / 1024 / 1024 / 1024, 'GB')
    except Exception:
        # The GPU probe is purely informational, so any failure is reported
        # instead of raised. `Exception` (rather than a bare `except`) keeps
        # KeyboardInterrupt and SystemExit propagating.
        print('No GPU available.')

    return parser.parse_args(argv)

def _convert_dataset(args, dataset_name):
    """Download one dataset, write its images to disk and save it as an HF dataset.

    Files are laid out under ``<args.output_dir>/<dataset_name>/`` in the
    sub-directories ``origin`` (downloaded file and its ``.jsonl`` conversion),
    ``images`` (decoded image files) and ``datasets`` (saved HF dataset).

    Raises:
        KeyError: if ``dataset_name`` has no entry in ``dataset_URLs``.
        ValueError: if the downloaded file is not a ``.tsv`` file, because the
            index-to-position mapping is only produced by ``tsv_to_jsonl``.
    """
    # Rows whose index is >= this offset reuse the image of row `index % offset`
    # and carry no image of their own to decode.
    # NOTE(review): presumably the VLMEvalKit convention for duplicated
    # (circular-evaluation) questions -- confirm against the data source.
    index_offset = 1_000_000
    tsv_suffix = '.tsv'

    try:
        dataset_url = dataset_URLs[dataset_name]
    except KeyError as err:
        raise KeyError(f"Unknown dataset name {dataset_name!r}: no entry in `dataset_URLs`.") from err

    origin_path = os.path.join(args.output_dir, dataset_name, 'origin')
    os.makedirs(origin_path, exist_ok=True)
    dataset_file_name = dataset_url.split('/')[-1]
    dataset_file_path = os.path.join(origin_path, dataset_file_name)
    if not os.path.exists(dataset_file_path):
        download_file(dataset_url, dataset_file_path)

    if not dataset_file_path.endswith(tsv_suffix):
        raise ValueError(
            f"Expected a .tsv source file for dataset {dataset_name!r}, got {dataset_file_path!r}."
        )
    # Swap only the trailing suffix; `str.replace` would also rewrite a
    # '.tsv' occurring inside a directory name of the path.
    jsonl_file_path = dataset_file_path[:-len(tsv_suffix)] + '.jsonl'
    index2idx = tsv_to_jsonl(dataset_file_path, jsonl_file_path)

    raw_datasets = load_dataset(
        "json",
        data_files=jsonl_file_path,
    )

    hfdatasets_path = os.path.join(args.output_dir, dataset_name, 'datasets')
    images_path = os.path.join(args.output_dir, dataset_name, 'images')
    os.makedirs(hfdatasets_path, exist_ok=True)
    os.makedirs(images_path, exist_ok=True)

    def process_images(examples):
        """Add `image_index` / `local_image_path` columns and write the images of one batch."""
        examples['image_index'] = [index2idx[index % index_offset] for index in examples['index']]
        examples['local_image_path'] = [
            os.path.join(images_path, f'{str(image_index).zfill(7)}.jpg')
            for image_index in examples['image_index']
        ]
        for index, image, local_image_path in zip(
            examples['index'], examples['image'], examples['local_image_path']
        ):
            # Only rows below the offset are decoded; the others point at the
            # file written for their base row.
            if index < index_offset:
                decode_base64_to_image_file(image, local_image_path)
        return examples

    convert_datasets = raw_datasets.map(
        process_images,
        batched=True,
        batch_size=args.process_batch_size,
        num_proc=args.process_num_workers,  # may need to comment out if processing gets stuck
        remove_columns=['image'],
        desc="Save image bytes to files",
    )

    print(convert_datasets)
    print(convert_datasets['train'].column_names)

    # A cache of image tensors is very large (e.g. 3M images => 2TB).
    # If your images are stored as bytes, you may have to convert them to tensors first.
    # If your images are stored as files, it is better to open them directly before tokenization.
    convert_datasets.save_to_disk(
        hfdatasets_path, max_shard_size="20GB"
    )


def main():
    """Convert every requested dataset to an HF dataset with images saved on disk.

    Seeds ``random``, NumPy and torch from ``--seed``, then processes each name
    in ``--dataset_name_list`` (comma separated). ``--dataset_name`` is used
    only when the list contains no names.

    Raises:
        SystemExit: if no output directory is configured.
        KeyError: if a dataset name has no entry in ``dataset_URLs``.
        ValueError: if a downloaded source file is not a ``.tsv`` file.
    """
    args = parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if args.output_dir is None:
        # Raising with a message exits with a non-zero status, unlike a bare `exit()`.
        raise SystemExit("No `--output_dir` specified: there is nowhere to save the converted datasets.")
    os.makedirs(args.output_dir, exist_ok=True)

    # Tolerate spaces and empty entries, e.g. "A, B," -> ["A", "B"].
    dataset_name_list = [
        name.strip() for name in (args.dataset_name_list or '').split(',') if name.strip()
    ]
    if not dataset_name_list:
        dataset_name_list = [args.dataset_name]

    for dataset_name in dataset_name_list:
        _convert_dataset(args, dataset_name)

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
