















import json

import datasets

_CITATION = 

_DESCRIPTION = 

# Project landing page; handed to DatasetInfo as `homepage`.
_HOMEPAGE = "https://pile.eleuther.ai/"


# No license text is declared here; the empty string is handed to DatasetInfo as is.
_LICENSE = ""

# Download location of each split's archive (`.jsonl.zst` files), keyed by split name.
# NOTE(review): both URLs use plain HTTP — confirm whether an HTTPS mirror exists.
_URLS = {
    "validation": "http://eaidata.bmk.sh/data/pile/val.jsonl.zst",
    "test": "http://eaidata.bmk.sh/data/pile/test.jsonl.zst",
}

_NAMES = {
    "pile_arxiv": "ArXiv",
    "pile_books3": "Books3",
    "pile_bookcorpus2": "BookCorpus2",
    "pile_dm-mathematics": "DM Mathematics",
    "pile_enron": "Enron Emails",
    "pile_europarl": "EuroParl",
    "pile_freelaw": "FreeLaw",
    "pile_github": "Github",
    "pile_gutenberg": "Gutenberg (PG-19)",
    "pile_hackernews": "HackerNews",
    "pile_nih-exporter": "NIH ExPorter",
    "pile_opensubtitles": "OpenSubtitles",
    "pile_openwebtext2": "OpenWebText2",
    "pile_philpapers": "PhilPapers",
    "pile_pile-cc": "Pile-CC",
    "pile_pubmed-abstracts": "PubMed Abstracts",
    "pile_pubmed-central": "PubMed Central",
    "pile_stackexchange": "StackExchange",
    "pile_upsto": "USPTO Backgrounds",
    "pile_ubuntu-irc": "Ubuntu IRC",
    "pile_wikipedia": "Wikipedia (en)",
    "pile_youtubesubtitles": "YoutubeSubtitles",
}


class Pile(datasets.GeneratorBasedBuilder):
    """Dataset builder that exposes each Pile subset as its own configuration.

    Every key of ``_NAMES`` becomes one builder config. Only the validation
    and test archives listed in ``_URLS`` are downloaded. A row is emitted
    when its ``meta.pile_set_name`` equals the subset label mapped to the
    selected config name.
    """

    VERSION = datasets.Version("0.0.1")

    # A class body is not an enclosing scope for a comprehension: only the
    # outermost iterable is evaluated with class attributes visible. VERSION
    # is therefore threaded in through the first ``for`` clause.
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(name=name, version=version, description=label)
        for version in [VERSION]
        for name, label in _NAMES.items()
    ]

    def _info(self) -> datasets.DatasetInfo:
        """Describe the dataset: a single string feature named ``text``.

        The description is the module description followed by the selected
        config's subset label on a new line.
        """
        features = datasets.Features(
            {
                "text": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=f"{_DESCRIPTION}\n{self.config.description}",
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Download and extract both archives and declare the splits.

        Args:
            dl_manager: Download manager supplied by ``datasets``; its
                ``download_and_extract`` is called with a ``{split: url}``
                mapping and the result is indexed by the same split names.

        Returns:
            A list with the TEST and VALIDATION split generators, in that
            order. Each one forwards the extracted file path and the split
            name to ``_generate_examples``.
        """
        urls = {"validation": _URLS["validation"], "test": _URLS["test"]}
        data_dir = dl_manager.download_and_extract(urls)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": data_dir["test"], "split": "test"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "filepath": data_dir["validation"],
                    "split": "validation",
                },
            ),
        ]

    def _generate_examples(self, filepath: str, split: str):
        """Yield ``(key, example)`` pairs for rows of the selected subset.

        Args:
            filepath: Path to the extracted file, read as UTF-8 with one JSON
                document per line.
            split: Split name passed through ``gen_kwargs``; not used here.

        Yields:
            ``(line_index, {"text": ...})`` for every row whose
            ``meta.pile_set_name`` matches the selected subset. The key is the
            row's position in the file, so keys of one subset are unique but
            not contiguous.

        Raises:
            KeyError: If the config name is not a key of ``_NAMES``, or a row
                lacks the ``meta``/``pile_set_name``/``text`` fields.
        """
        # The wanted label is constant for the whole pass, so resolve it once
        # instead of repeating the lookups for every row of the file.
        wanted = _NAMES[self.config.name]
        with open(filepath, encoding="utf-8") as f:
            for key, row in enumerate(f):
                data = json.loads(row)
                if data["meta"]["pile_set_name"] != wanted:
                    continue
                yield key, {
                    "text": data["text"],
                }
