import os
from datasets import load_dataset, Dataset
from tqdm import tqdm
import pdb

def download_dataset(
    dataset_name: str = "zhouliang/BlueMO",
    cache_dir: str = "/lustre/fast/fast/txiao/zly/intrinsic_lean/benchmarks",
) -> None:
    """Download a dataset from the Hugging Face Hub and export it to parquet.

    Every split returned by ``load_dataset`` is written to
    ``<cache_dir>/<repo name>/<split>.parquet``, where ``<repo name>`` is
    ``dataset_name`` with its leading ``namespace/`` removed.

    Args:
        dataset_name: Hub repository id, e.g. ``"namespace/name"``. A bare
            ``"name"`` without a namespace is accepted as well.
        cache_dir: Directory used both as the Hugging Face download cache
            and as the parent of the parquet output directory.
    """
    # Kept for backward compatibility (e.g. child processes inherit it).
    # On its own this is not enough: `datasets` is imported at the top of
    # the file and resolves its cache location at import time, so the
    # cache directory is also passed explicitly to `load_dataset` below.
    os.environ["HF_DATASETS_CACHE"] = cache_dir

    print("Downloading dataset...")
    ds = load_dataset(dataset_name, cache_dir=cache_dir)

    # Use only the repository part of the id as the folder name; `[-1]`
    # (instead of `[1]`) also works for ids that have no namespace.
    repo_name = dataset_name.split("/", 1)[-1].replace("/", "_")
    output_dir = os.path.join(cache_dir, repo_name)
    # Make sure the target directory exists before any split is written.
    os.makedirs(output_dir, exist_ok=True)

    # Save each split to parquet format
    for split_name, split_data in ds.items():
        parquet_path = os.path.join(output_dir, f"{split_name}.parquet")
        print(f"Saving {split_name} split to {parquet_path}")
        split_data.to_parquet(parquet_path)
        print(f"Saved {split_name} split successfully")


# Script entry point: run the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_dataset()