#!/usr/bin/env python
# coding: utf-8

import os
from datasets import load_dataset

def download_glue_subsets(subsets, base_save_dir):
    """Download the given GLUE subsets and save each one to disk.

    Every subset is loaded with ``load_dataset("glue", subset)`` and written
    with ``save_to_disk`` to ``<base_save_dir>/<subset>``. Processing is
    best-effort: an error on one subset is printed and the remaining subsets
    are still processed.

    Args:
        subsets (list | str): GLUE subset names (e.g., ["mnli", "rte", "qnli"]).
            A single name passed as a plain string is treated as a
            one-element list instead of being iterated character by
            character.
        base_save_dir (str): Base directory to save the processed datasets.
            It is created if it does not exist yet.

    Returns:
        list: Names of the subsets that could not be downloaded or saved, in
        the order they were attempted. Empty when every subset succeeded.
    """
    # Iterating a bare string would yield single characters ("m", "n", ...),
    # each of which would be attempted as a subset name.
    if isinstance(subsets, str):
        subsets = [subsets]

    os.makedirs(base_save_dir, exist_ok=True)
    print(f"Saving datasets to subdirectories under: {base_save_dir}")

    failed_subsets = []
    for subset in subsets:
        print(f"\nProcessing GLUE subset: {subset}...")
        try:
            # Load dataset - this will download if not cached
            dataset = load_dataset("glue", subset)
            print(f"Successfully downloaded/loaded GLUE subset: {subset}")
            print(f"Dataset info: {dataset}")

            # Save the dataset to the specified directory
            subset_save_dir = os.path.join(base_save_dir, subset)
            print(f"Saving {subset} dataset to {subset_save_dir}...")
            dataset.save_to_disk(subset_save_dir)
            print(f"Successfully saved {subset} dataset.")

        except Exception as e:
            # Deliberately broad: one bad subset must not abort the rest.
            # The failure is recorded so the caller can react to it.
            print(f"Error processing GLUE subset {subset}: {e}")
            failed_subsets.append(subset)

    return failed_subsets

if __name__ == "__main__":
    # Script entry point. The subsets and the target directory used to be
    # hard-coded (the latter as a machine-specific absolute path); both can
    # now be overridden on the command line. The defaults are the original
    # values, so running the script without arguments behaves as before.
    import argparse  # only the script entry point needs it

    parser = argparse.ArgumentParser(
        description="Download GLUE subsets and save them to disk."
    )
    parser.add_argument(
        "--subsets",
        nargs="+",
        default=["mnli", "rte", "qnli"],
        help="GLUE subset names to download (default: mnli rte qnli).",
    )
    parser.add_argument(
        "--save-dir",
        # Default base directory within the project
        default="/home/ubuntu/ecam_project/data/real_world/glue",
        help="Base directory under which each subset gets its own subdirectory.",
    )
    args = parser.parse_args()

    glue_subsets_to_download = args.subsets
    save_directory = args.save_dir

    print(f"Starting download and save process for GLUE subsets: {glue_subsets_to_download}")
    download_glue_subsets(glue_subsets_to_download, base_save_dir=save_directory)
    print("\nGLUE dataset download and save process finished.")

