"""
OvertonBench dataset loader for the main pipeline.

- Most users: leave DATASET unset → data loads from Hugging Face (in memory);
  optionally set DATASET_SPLIT to choose the split (default "full").
- Own data: set DATASET in .env to your CSV path → that file is loaded (must exist).
"""

import os
from pathlib import Path

# python-dotenv is an optional dependency: when it is installed, load_dotenv()
# runs at import time so that settings such as DATASET / DATASET_SPLIT kept in a
# .env file are visible through os.environ to the functions below.
try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    # No python-dotenv available: rely on the process environment as-is.
    pass


def load_overtonbench(split="full"):
    """
    Fetch the "elinorpd/overtonbench" dataset from Hugging Face as a pandas DataFrame.

    split: name of the split to load; documented values are "full",
        "modelslant" and "prism". A falsy value (None or "") means "full".
        Surrounding whitespace is removed and the name is lower-cased before
        it is handed to ``datasets.load_dataset``.
    """
    # Imported lazily so the module can be imported without `datasets` installed.
    from datasets import load_dataset

    requested = split if split else "full"
    normalized = requested.strip().lower()
    return load_dataset("elinorpd/overtonbench", split=normalized).to_pandas()


def get_overtonbench_data(path=None, source_split=None):
    """
    Return the benchmark dataset as a pandas DataFrame.

    Resolution order:

    - If ``path`` is None, the DATASET environment variable is used in its place.
    - If the resulting path is None or blank: load from Hugging Face using the
      first non-blank value among ``source_split``, the DATASET_SPLIT
      environment variable, and "full".
    - Otherwise: load that CSV with ``pandas.read_csv`` (the file must exist);
      ``source_split`` is ignored.

    path: CSV location (string or path-like), or None to defer to DATASET.
        Leading/trailing whitespace is stripped.
    source_split: Hugging Face split name; only used when no CSV path is set.

    Raises:
        FileNotFoundError: a CSV path is configured but nothing exists there.
        IsADirectoryError: the configured path points at a directory.
    """
    if path is None:
        path = os.environ.get("DATASET")
    path = "" if path is None else str(path).strip()

    if not path:
        # Skip blank candidates (e.g. DATASET_SPLIT=" ") so they fall through
        # to the next source instead of reaching the loader as an empty split.
        split = "full"
        for candidate in (source_split, os.environ.get("DATASET_SPLIT")):
            if candidate and str(candidate).strip():
                split = candidate
                break
        return load_overtonbench(split=split)

    hint = (
        f"Unset DATASET to load from Hugging Face, or set DATASET to an existing CSV path."
    )
    csv_path = Path(path)
    if not csv_path.exists():
        raise FileNotFoundError(f"Dataset path not found: {path}. " + hint)
    if csv_path.is_dir():
        # exists() is also true for directories; fail here with guidance rather
        # than with a platform-dependent error raised from inside pandas.
        raise IsADirectoryError(
            f"Dataset path is a directory, not a CSV file: {path}. " + hint
        )

    # Imported lazily, matching the rest of the module's heavy dependencies.
    import pandas as pd
    return pd.read_csv(csv_path)
