#!/usr/bin/env python
# coding: utf-8

import os
from datasets import load_dataset

def download_clutrr(cache_dir=None, *, config_name="gen_train234_test2to10",
                    dataset_name="CLUTRR/v1"):
    """Download the CLUTRR dataset (or load it from the local cache).

    Progress and errors are reported with ``print``. Failures are not raised:
    the error is printed and ``None`` is returned, so a caller can check the
    return value to find out whether the dataset is available.

    Args:
        cache_dir (str, optional): Directory to cache the datasets. It is
            created if it does not exist yet. Defaults to the Hugging Face
            default cache directory.
        config_name (str, optional): Dataset configuration passed to
            ``load_dataset`` as ``name``. The default was picked for the
            2-4 hop setting of the experiment guide (per the original
            author's note).
        dataset_name (str, optional): Dataset identifier passed to
            ``load_dataset``.

    Returns:
        The object returned by ``load_dataset`` on success, or ``None`` if
        loading failed.
    """
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
        print(f"Using cache directory: {cache_dir}")
    else:
        print("Using default Hugging Face cache directory.")

    print(f"\nDownloading CLUTRR dataset: {dataset_name}...")
    try:
        # Only the load itself is guarded; it downloads when nothing is cached.
        dataset = load_dataset(dataset_name, name=config_name, cache_dir=cache_dir)
    except Exception as e:
        # Best-effort script boundary: report the problem instead of crashing.
        print(f"Error downloading CLUTRR dataset {dataset_name}: {e}")
        return None

    print(f"Successfully downloaded/loaded CLUTRR dataset: {dataset_name} (config: {config_name})")
    print(f"Dataset info: {dataset}")
    return dataset

if __name__ == "__main__":
    # None means "use the default cache". To keep the cache inside the
    # project instead, assign a path here, e.g.
    # "/home/ubuntu/ecam_project/data/hf_cache".
    cache_directory = None

    print("Starting download for CLUTRR dataset")
    download_clutrr(cache_dir=cache_directory)
    print("\nCLUTRR dataset download process finished.")

