#!/usr/bin/env python3
"""load_dataset.py

Usage:
    python load_dataset.py -n DATASET_ID [-s COL1,COL2,...] [-p SPLIT]

This script loads a **Hugging Face Datasets** dataset given with `-n/--name` and
(optionally) keeps only the columns passed through the comma-separated list
`-s/--subset`. By default it loads the *train* split, which can be changed with
`-p/--split`.

Examples
--------
Load the IMDb movie-review dataset (train split):

```bash
python load_dataset.py -n imdb
```

Load the GLUE MRPC task and keep only the two input sentences and the label:

```bash
python load_dataset.py -n glue/mrpc -p train -s sentence1,sentence2,label
```
"""
from __future__ import annotations

import argparse
import sys
from typing import List

import subprocess

from datasets import load_dataset, Dataset


def _subset_dataset(ds: Dataset, cols: List[str]) -> Dataset:
    """Restrict *ds* to the columns named in *cols*.

    Exits the program (via ``sys.exit``) with a message listing every
    requested column that is absent from ``ds.column_names``.

    NOTE(review): the two code paths may order the resulting columns
    differently (requested order vs. the dataset's own order) -- confirm
    against the Datasets documentation before relying on column order.
    """
    unknown = [name for name in cols if name not in ds.column_names]
    if unknown:
        sys.exit(f"Columns not found in dataset: {', '.join(unknown)}")

    if not hasattr(ds, "select_columns"):
        # No `select_columns` on this object (presumably an older Datasets
        # release): get the same effect by dropping every column that was
        # not requested.
        unwanted = [name for name in ds.column_names if name not in cols]
        return ds.remove_columns(unwanted)

    return ds.select_columns(cols)  # type: ignore[attr-defined]


def parse_args(argv: List[str] | None = None) -> argparse.Namespace:
    """Parse the command-line arguments of the script.

    The dataset identifier may be given either positionally or through
    ``-n/--name`` (the form shown in the script's usage text); both end up in
    ``args.name``.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        A namespace with the attributes ``name``, ``split`` and ``subset``.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when no dataset
            identifier is given, when the positional and ``-n`` values
            disagree, or on any other usage error.
    """
    parser = argparse.ArgumentParser(
        description="Load a Hugging Face dataset and optionally subset its columns.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "name",
        nargs="?",  # optional here because -n/--name is an accepted alternative
        default=None,
        metavar="DATASET_ID",
        help="Dataset identifier (e.g. 'imdb', 'glue/mrpc', local path, or HF hub URL).",
    )
    parser.add_argument(
        "-n",
        "--name",
        # Separate dest: sharing "name" with the positional would let one
        # argument's default overwrite the other's parsed value.
        dest="name_option",
        default=None,
        metavar="DATASET_ID",
        help="Alternative to the positional DATASET_ID.",
    )
    parser.add_argument(
        "-p",
        "--split",
        default="train",
        metavar="SPLIT",
        help="Dataset split to load (e.g. train, validation, test).",
    )
    parser.add_argument(
        "-s",
        "--subset",
        default=None,
        metavar="SUB1,SUB2,...",
        help="Comma-separated list of subsets to keep. Mandatory for datasets with no default subset.",
    )
    args = parser.parse_args(argv)

    # Fold the two spellings of the identifier into the single `name` attribute.
    flagged_name = args.name_option
    del args.name_option
    if args.name is None:
        args.name = flagged_name
    elif flagged_name is not None and flagged_name != args.name:
        parser.error(
            f"conflicting dataset identifiers: '{args.name}' (positional) "
            f"and '{flagged_name}' (-n/--name)"
        )
    if args.name is None:
        parser.error("a dataset identifier is required (DATASET_ID or -n/--name)")
    return args


def _split_csv_option(raw: str | None) -> List[str | None]:
    """Split a comma-separated CLI value into stripped, non-empty tokens.

    Returns ``[None]`` when *raw* is ``None`` or contains no tokens, so the
    caller can loop over the result and pass ``None`` on as "not specified".
    """
    if raw is None:
        return [None]
    tokens = [token.strip() for token in raw.split(",")]
    return [token for token in tokens if token] or [None]


def main(argv: List[str] | None = None) -> None:
    """Fetch a dataset, load every requested subset/split and print a summary.

    Args:
        argv: Command-line arguments forwarded to ``parse_args``; ``None``
            means the process arguments are used.

    Raises:
        SystemExit: If the ``hf download`` command cannot be started or
            finishes with a non-zero exit status.
    """
    import os  # local import: only needed for the local-path check below

    args = parse_args(argv)

    # A dataset that already exists on disk cannot (and need not) be fetched
    # from the Hub; running `hf download` on a path would only abort the script.
    if not os.path.exists(args.name):
        try:
            subprocess.run(
                ["hf", "download", args.name, "--repo-type", "dataset"],
                check=True,
            )
        except (OSError, subprocess.SubprocessError) as exc:
            # OSError: the `hf` executable is missing or not runnable;
            # SubprocessError: the command ran but reported a failure.
            sys.exit(f"Failed to download dataset '{args.name}': {exc}")

    subsets = _split_csv_option(args.subset)
    splits = _split_csv_option(args.split)

    for split in splits:
        for subset in subsets:
            ds: Dataset = load_dataset(args.name, subset, split=split)  # type: ignore[arg-type]

            # Summarise inside the inner loop so that every loaded
            # subset/split combination is reported, not just the last one.
            print(f"-----------------------------------\nLoaded Hugging Face dataset summary: {args.name} ({subset}) \n-----------------------------------")
            print(ds)
            print(f"Rows: {len(ds):,}; Columns: {', '.join(ds.column_names)}")


if __name__ == "__main__":
    main()
