import os
from datasets import load_dataset, DatasetDict

# Usage: python3 mllmsd/datamodules/download-datasets.py

# 0. Config
# Dataset repository identifier handed to load_dataset for every subset.
repo = "lmms-lab/LMMs-Eval-Lite"

# Root directory under which each subset is saved; created up front if missing.
save_dir = "/XXXX-5/home-XXXX-3/data/MSD/datasets/"
os.makedirs(save_dir, exist_ok=True)

"""
Note: subset names must match the repo's builder config names exactly.
Requesting 'ok_vqa_val_2014' (extra underscore) instead of 'ok_vqa_val2014' fails with:

ValueError: BuilderConfig 'ok_vqa_val_2014' not found. Available: 
['ai2d', 'chartqa', 'coco2017_cap_val', 'docvqa_val', 'flickr30k_test', 'gqa', 'infovqa_val', 'mmbench_cn_dev', 'mmbench_en_dev', 'nocaps_val', 'ok_vqa_val2014', 'refcoco_bbox_val', 'seedbench', 'textcaps_val', 'textvqa_val', 'vizwiz_vqa_val', 'vqav2_val']
"""

# Subset (builder config) names of `repo` that this script loads and saves.
dataset_openended = [
    "chartqa",
    "docvqa_val",
    "infovqa_val",
    "ok_vqa_val2014",
    "textvqa_val",
    "vizwiz_vqa_val",
    "vqav2_val",
]

# Load each listed subset, show its schema once, and store its 'lite' split
# on disk under the split name 'test'.
for subset in dataset_openended:
    loaded_splits = load_dataset(repo, subset)

    # Print the features of the first split only (if there is one).
    if loaded_splits:
        first_split = next(iter(loaded_splits.values()))
        print(first_split.features)

    # Re-key the 'lite' split as 'test' and write it to <save_dir>/<subset>.
    renamed_splits = DatasetDict({'test': loaded_splits['lite']})
    renamed_splits.save_to_disk(os.path.join(save_dir, subset))



def _apply_tiny_data_filter(map_datasets, tiny_map):
    for split in map_datasets.keys():
        map_datasets[split] = map_datasets[split].select(range(tiny_map[split]))
    return map_datasets
"""
Example usage (kept for reference, not executed):
tiny_map = {'train': 80, 'validation': 10, 'test': 10}
map_datasets = _apply_tiny_data_filter(map_datasets, tiny_map)
"""