#!/usr/bin/env python
# coding: utf-8

import os
from datasets import load_dataset

def download_vqa(dataset_name, cache_dir=None, trust_remote_code=True):
    """Download (or load from cache) a VQA dataset from the Hugging Face Hub.

    Progress and errors are reported with ``print``. A failure while loading
    is reported and swallowed (best-effort), and ``None`` is returned so the
    caller can detect it.

    Args:
        dataset_name (str): The name of the dataset on Hugging Face Hub
            (e.g., "HuggingFaceM4/VQAv2").
        cache_dir (str, optional): Directory to cache the datasets. It is
            created if it does not exist. A falsy value (the default) leaves
            the cache location to the Hugging Face default.
        trust_remote_code (bool, optional): Forwarded unchanged to
            ``load_dataset``. Defaults to True, which matches what this
            function has always passed. NOTE(review): per the ``datasets``
            documentation this permits running the dataset repository's own
            loading script locally; pass False for repositories you do not
            trust.

    Returns:
        The object returned by ``load_dataset`` on success, or ``None`` if
        loading raised an exception.
    """
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
        print(f"Using cache directory: {cache_dir}")
    else:
        print("Using default Hugging Face cache directory.")

    print(f"\nDownloading VQA dataset: {dataset_name}...")
    try:
        # load_dataset downloads the data only if it is not already cached.
        dataset = load_dataset(
            dataset_name,
            cache_dir=cache_dir,
            trust_remote_code=trust_remote_code,
        )
    except Exception as e:
        # Deliberately broad: this is the script's top-level boundary and a
        # failed download should be reported, not crash the caller.
        print(f"Error downloading VQA dataset {dataset_name}: {e}")
        return None

    print(f"Successfully downloaded/loaded VQA dataset: {dataset_name}")
    print(f"Dataset info: {dataset}")
    return dataset

if __name__ == "__main__":
    # Hub identifier of the VQA v2 dataset to fetch.
    vqa_dataset_name = "HuggingFaceM4/VQAv2"

    # None selects the default Hugging Face cache. Assign a path here
    # (e.g. "/home/ubuntu/ecam_project/data/hf_cache") to keep the cache
    # inside the project instead.
    cache_directory = None

    print("Starting download for VQA dataset: " + vqa_dataset_name)
    download_vqa(vqa_dataset_name, cache_dir=cache_directory)
    print("\nVQA dataset download process finished.")

