import datasets
from huggingface_hub import HfApi, HfFileSystem
import pyarrow.parquet as pq

# BibTeX entry for the dataset; handed to `datasets.DatasetInfo(citation=...)`
# by the builder's `_info`.
_CITATION = """\
@misc{anonymous_smel_2025,
  author={Anonymous},
  title={SMEL Dataset},
  year={2025},
  publisher={Hugging Face Hub},
}
"""

# Free-text metadata surfaced through `datasets.DatasetInfo`.
_DESCRIPTION = "The SMeL Test"
_HOMEPAGE = "https://huggingface.co/datasets/anonymous/smel"
# NOTE(review): Hub license identifiers are usually hyphenated
# (e.g. "cc-by-4.0") -- confirm this spelling is intended.
_LICENSE = "cc_by_4.0"
# `<namespace>/<name>` of the dataset repo that hosts the parquet files.
_HF_DATASET_REPO_ID = "anonymous/smel"
# Prefix for direct-download URLs on the `main` revision; a repo-relative file
# path is appended to it when building the list of files to download.
_BASE_URL_RESOLVE = f"https://huggingface.co/datasets/{_HF_DATASET_REPO_ID}/resolve/main/"

# Schema shared by the question-answering configs. Every column is a plain
# string; the tuple fixes the column order.
_FEATURES_QA = datasets.Features(
    {
        column: datasets.Value("string")
        for column in (
            "source",
            "question",
            "answer",
            "passage",
            "entity",
            "entity_type",
        )
    }
)

# Schema of the summarization config. Again all-string columns, in this order.
_FEATURES_SUMMARIZATION = datasets.Features(
    {
        column: datasets.Value("string")
        for column in (
            "source",
            "entity",
            "entity_type",
            "fact",
            "passage",
        )
    }
)

class SmelConfig(datasets.BuilderConfig):
    """Builder configuration for one SMeL subset.

    Args:
        name: Name of the configuration (used to select it when loading).
        description: Human-readable description of the subset.
        features: `datasets.Features` schema of the examples in this subset.
        data_path: Path, relative to the dataset repo root, of the directory
            that holds this subset's parquet files.
        **kwargs: Additional keyword arguments forwarded to
            `datasets.BuilderConfig` (for example `version`).
    """

    def __init__(self, name, description, features, data_path, **kwargs):
        # Forward name/description/kwargs to the base class so that it is
        # initialised with the real values rather than its defaults, and so
        # that extra options are honoured instead of being silently dropped.
        super().__init__(name=name, description=description, **kwargs)
        self.features = features
        self.data_path = data_path


class SmelDataset(datasets.GeneratorBasedBuilder):
    """SMeL dataset builder.

    Each configuration points at a directory of parquet files inside the
    dataset repo; every parquet file found there is downloaded and exposed
    through a single ``test`` split.
    """

    # Configs created on the fly must carry `features` and `data_path` too.
    BUILDER_CONFIG_CLASS = SmelConfig

    BUILDER_CONFIGS = [
        SmelConfig(
            name="summarization",
            description="Summarization tasks.",
            features=_FEATURES_SUMMARIZATION,
            data_path="summarization/",
        ),
        SmelConfig(
            name="qa_rewritten",
            description="Rewritten Question Answering tasks.",
            features=_FEATURES_QA,
            data_path="qa/rewritten",
        ),
        SmelConfig(
            name="qa_corrupted",
            description="Corrupted Question Answering tasks.",
            features=_FEATURES_QA,
            data_path="qa/corrupted",
        ),
    ]

    def _info(self):
        """Return the dataset metadata, using the selected config's schema."""
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=self.config.features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Return the split generators for the selected config.

        Lists the config's directory in the dataset repo, downloads every
        ``.parquet`` file found there and exposes them as the ``test`` split.

        Args:
            dl_manager: Download manager supplied by `datasets`.

        Returns:
            A one-element list holding the ``test`` `SplitGenerator`.

        Raises:
            FileNotFoundError: If the directory contains no parquet files.
        """
        fs = HfFileSystem()
        repo_root = f"datasets/{_HF_DATASET_REPO_ID}/"
        data_dir = repo_root + self.config.data_path

        # Sort so that file order -- and therefore the example keys assigned
        # in `_generate_examples` -- does not depend on listing order.
        parquet_files = sorted(
            path for path in fs.ls(data_dir, detail=False) if path.endswith(".parquet")
        )
        if not parquet_files:
            raise FileNotFoundError(f"No parquet files found under '{data_dir}'.")

        # Strip only a *leading* repo prefix to get the repo-relative path;
        # `str.replace` would also rewrite a match in the middle of a path.
        data_urls = [
            _BASE_URL_RESOLVE
            + (path[len(repo_root):] if path.startswith(repo_root) else path)
            for path in parquet_files
        ]

        downloaded_files = dl_manager.download_and_extract(data_urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "filepaths": downloaded_files,
                },
            ),
        ]

    def _generate_examples(self, filepaths):
        """Yield ``(key, example)`` pairs from the downloaded parquet files.

        Args:
            filepaths: Local paths of the parquet files to read, in order.

        Yields:
            Tuples of a running integer key (unique across all files) and a
            dict mapping column names to the row's values.
        """
        example_idx = 0
        for filepath in filepaths:
            # Convert rows straight to Python dicts; this avoids the pandas
            # round-trip and the per-row Series that `iterrows()` builds.
            for example in pq.read_table(filepath).to_pylist():
                yield example_idx, example
                example_idx += 1
