"""
Utility functions for downloading and processing the IFEVAL dataset from Hugging Face.
"""

import os
from datasets import load_dataset
import pandas as pd
from typing import Optional, Union, Dict, List, Any


def download_ifeval(
    cache_dir: Optional[str] = None,
    split: str = "train",
    subset: Optional[str] = None
) -> pd.DataFrame:
    """
    Load one split of the IFEVAL dataset from Hugging Face as a DataFrame.

    Args:
        cache_dir: Forwarded to ``load_dataset`` as ``cache_dir``. When None,
            the library picks its own cache location.
        split: Name of the dataset split to load (defaults to 'train').
        subset: Forwarded to ``load_dataset`` as its second positional
            argument. None means no specific subset is requested.

    Returns:
        A pandas DataFrame built from the loaded split.
    """
    # Gather the keyword options first so the actual call stays on one line.
    load_options = {"split": split, "cache_dir": cache_dir}
    hf_split = load_dataset("google/ifeval", subset, **load_options)

    # A DataFrame is easier to slice and inspect than the raw dataset object.
    df = pd.DataFrame(hf_split)

    # Report how many examples were loaded.
    print(f"Downloaded IFEVAL dataset ({split} split) with {len(df)} examples")
    return df


def get_ifeval_categories() -> List[str]:
    """
    Returns the list of instruction categories present in the IFEVAL dataset.

    The 'train' split is loaded and its ``instruction_id_list`` column is
    scanned. Each instruction id is assumed to have the form
    ``"<category>:<name>"``; the category is the text before the first colon
    (an id without a colon counts as a category on its own).

    Returns:
        Alphabetically sorted list of the distinct category names.
    """
    # The whole split is loaded (not just metadata): the categories are derived
    # from the instruction ids stored in the rows themselves.
    dataset = load_dataset("google/ifeval", split="train")

    # Each row holds a list of instruction ids; flatten them and keep only the
    # category prefix. A set removes the duplicates across rows.
    categories = {
        instruction_id.split(":")[0]
        for instruction_ids in dataset["instruction_id_list"]
        for instruction_id in instruction_ids
    }

    # Sort so callers get a stable order from run to run.
    return sorted(categories)
    
   
if __name__ == "__main__":
    # Example usage: print every instruction category found in the train split
    # together with the instruction names that belong to it.
    from collections import defaultdict
    from pprint import pprint

    df = download_ifeval(split="train")

    # One list of instruction ids per example (ids look like "category:name").
    instruction_id_lists = df['instruction_id_list'].tolist()

    # category -> set of instruction names; the set de-duplicates as we go.
    names_by_category = defaultdict(set)
    for instruction_ids in instruction_id_lists:
        for instruction_id in instruction_ids:
            parts = instruction_id.split(':')
            # Touching the key registers the category even if no name follows.
            names = names_by_category[parts[0]]
            # Guard against ids without a colon instead of raising IndexError.
            if len(parts) > 1:
                names.add(parts[1])

    # Convert to a plain dict of sorted lists so the printed output is
    # identical from run to run (set iteration order is not stable).
    all_type = {
        category: sorted(names)
        for category, names in names_by_category.items()
    }
    pprint(all_type)
    
