
















import json

import datasets

# NOTE(review): the citation literal is missing from this copy of the file
# (the assignment had no right-hand side, which is a syntax error). An empty
# string keeps the module importable; restore the BibTeX entry from upstream.
_CITATION = ""

# NOTE(review): the description literal is missing as well. This placeholder
# only restates what the builder configurations in this file say about the
# data; replace it with the upstream description when available.
_DESCRIPTION = (
    "LAMBADA test set as pre-processed by OpenAI, together with translated "
    "variants selectable by configuration name."
)

# NOTE(review): the original literal was unterminated; the string is closed
# at the point where it was cut off. Confirm nothing is missing from the URL.
_HOMEPAGE = "https://zenodo.org/record/2630551"


# No license string is declared for this dataset.
_LICENSE = ""

# Download location of the JSON Lines file for each builder configuration,
# keyed by configuration name.
# NOTE(review): the translated files are fetched over plain HTTP.
_URLS = {
    "default": "https://openaipublic.blob.core.windows.net/gpt-2/data/lambada_test.jsonl",
    "en": "http://eaidata.bmk.sh/data/lambada_test_en.jsonl",
    "fr": "http://eaidata.bmk.sh/data/lambada_test_fr.jsonl",
    "de": "http://eaidata.bmk.sh/data/lambada_test_de.jsonl",
    "it": "http://eaidata.bmk.sh/data/lambada_test_it.jsonl",
    "es": "http://eaidata.bmk.sh/data/lambada_test_es.jsonl",
}


class LambadaOpenAI(datasets.GeneratorBasedBuilder):
    """Dataset builder for the LAMBADA JSON Lines files listed in ``_URLS``.

    Each configuration name selects one URL from ``_URLS``. The downloaded
    file is exposed as a single validation split whose examples carry one
    string feature, ``text``.
    """

    VERSION = datasets.Version("0.0.1")

    # One configuration per key of ``_URLS``; the names must stay in sync
    # because ``_split_generators`` looks the URL up by configuration name.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="default",
            version=VERSION,
            description="Pre-processed English LAMBADA dataset from OpenAI",
        ),
        datasets.BuilderConfig(
            name="en",
            version=VERSION,
            description="The English translated LAMBADA OpenAI dataset",
        ),
        datasets.BuilderConfig(
            name="fr",
            version=VERSION,
            description="The French translated LAMBADA OpenAI dataset",
        ),
        datasets.BuilderConfig(
            name="de",
            version=VERSION,
            description="The German translated LAMBADA OpenAI dataset",
        ),
        datasets.BuilderConfig(
            name="it",
            version=VERSION,
            description="The Italian translated LAMBADA OpenAI dataset",
        ),
        datasets.BuilderConfig(
            name="es",
            version=VERSION,
            description="The Spanish translated LAMBADA OpenAI dataset",
        ),
    ]

    DEFAULT_CONFIG_NAME = "default"

    def _info(self):
        """Return the ``DatasetInfo`` for the selected configuration.

        The description is the module-level description followed by the
        description of the active configuration on a new line.
        """
        features = datasets.Features(
            {
                "text": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download the file for the active configuration and declare its split.

        Args:
            dl_manager: Download manager supplied by ``datasets``; only its
                ``download_and_extract`` method is used.

        Returns:
            A list with a single ``SplitGenerator`` for the validation split.

        Raises:
            KeyError: If the configuration name has no entry in ``_URLS``.
        """
        url = _URLS[self.config.name]
        data_path = dl_manager.download_and_extract(url)
        return [
            datasets.SplitGenerator(
                # NOTE(review): every URL in ``_URLS`` points at a file named
                # ``lambada_test*.jsonl`` yet it is published here as the
                # validation split - confirm this is intended.
                name=datasets.Split.VALIDATION,
                # These keyword arguments are forwarded to
                # ``_generate_examples``.
                gen_kwargs={
                    "filepath": data_path,
                    "split": "validation",
                },
            ),
        ]

    def _generate_examples(self, filepath, split):
        """Yield ``(key, example)`` pairs from a JSON Lines file.

        Args:
            filepath: Path of the local JSON Lines file to read.
            split: Split name passed through ``gen_kwargs``; not used here.

        Yields:
            Tuples of the zero-based line index and a dict holding the
            ``"text"`` field of the JSON object found on that line.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError: If a JSON object has no ``"text"`` field.
        """
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                # Blank lines (for example a trailing empty line) carry no
                # record; skipping them avoids a JSONDecodeError while the
                # keys of real records stay equal to their line index.
                if not row.strip():
                    continue
                data = json.loads(row)
                yield key, {"text": data["text"]}
