{"pile_arxiv": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nArXiv", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_arxiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 113218251, "num_examples": 2407, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 115653720, "num_examples": 2434, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 228871971, "size_in_bytes": 1160030307}, "pile_books3": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBooks3", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_books3", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 150095743, "num_examples": 269, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 177359876, "num_examples": 301, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 327455619, "size_in_bytes": 1258613955}, "pile_bookcorpus2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nBookCorpus2", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_bookcorpus2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 9680652, "num_examples": 28, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9776271, "num_examples": 26, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 19456923, "size_in_bytes": 950615259}, "pile_dm-mathematics": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nDM Mathematics", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_dm-mathematics", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 15756556, "num_examples": 1922, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 16453386, "num_examples": 2007, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 32209942, "size_in_bytes": 963368278}, "pile_enron": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEnron Emails", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_enron", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 1638859, "num_examples": 1010, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 1556487, "num_examples": 947, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 3195346, "size_in_bytes": 934353682}, "pile_europarl": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nEuroParl", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_europarl", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8789652, "num_examples": 157, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9111791, "num_examples": 133, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17901443, "size_in_bytes": 949059779}, "pile_freelaw": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nFreeLaw", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_freelaw", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 80808693, "num_examples": 5101, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 80363814, "num_examples": 5094, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 161172507, "size_in_bytes": 1092330843}, "pile_github": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGithub", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_github", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 95654706, "num_examples": 18195, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 97179576, "num_examples": 18337, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 192834282, "size_in_bytes": 1123992618}, "pile_gutenberg": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nGutenberg (PG-19)", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_gutenberg", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 30243176, "num_examples": 80, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 24685980, "num_examples": 60, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 54929156, "size_in_bytes": 986087492}, "pile_hackernews": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nHackerNews", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_hackernews", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 8124255, "num_examples": 1632, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 9803822, "num_examples": 1619, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 17928077, "size_in_bytes": 949086413}, "pile_nih-exporter": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nNIH ExPorter", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_nih-exporter", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 3928804, "num_examples": 1884, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 3927967, "num_examples": 1825, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 7856771, "size_in_bytes": 939015107}, "pile_opensubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenSubtitles", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_opensubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 21008996, "num_examples": 642, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 19622904, "num_examples": 621, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 40631900, "size_in_bytes": 971790236}, "pile_openwebtext2": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nOpenWebText2", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_openwebtext2", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 128624303, "num_examples": 32925, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 131554302, "num_examples": 33400, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 260178605, "size_in_bytes": 1191336941}, "pile_philpapers": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPhilPapers", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_philpapers", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5090158, "num_examples": 68, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 6499078, "num_examples": 64, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 11589236, "size_in_bytes": 942747572}, "pile_pile-cc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPile-CC", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pile-cc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 235004043, "num_examples": 52790, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 233535650, "num_examples": 52792, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 468539693, "size_in_bytes": 1399698029}, "pile_pubmed-abstracts": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Abstracts", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-abstracts", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 39908950, "num_examples": 29895, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 40008336, "num_examples": 29871, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 79917286, "size_in_bytes": 1011075622}, "pile_pubmed-central": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nPubMed Central", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_pubmed-central", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 187251519, "num_examples": 5911, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 184791818, "num_examples": 5977, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 372043337, "size_in_bytes": 1303201673}, "pile_stackexchange": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nStackExchange", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_stackexchange", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 66441557, "num_examples": 30378, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 66011397, "num_examples": 29950, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 132452954, "size_in_bytes": 1063611290}, "pile_upsto": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUSPTO Backgrounds", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_upsto", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 47345405, "num_examples": 11415, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 48122320, "num_examples": 11387, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 95467725, "size_in_bytes": 1026626061}, "pile_ubuntu-irc": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nUbuntu IRC", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_ubuntu-irc", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 5694218, "num_examples": 22, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 7410104, "num_examples": 21, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 13104322, "size_in_bytes": 944262658}, "pile_wikipedia": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nWikipedia (en)", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_wikipedia", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 52166968, "num_examples": 17511, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 53186137, "num_examples": 17478, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 105353105, "size_in_bytes": 1036511441}, "pile_youtubesubtitles": {"description": "The Pile is a 825 GiB diverse, open source language modeling data set that consists\nof 22 smaller, high-quality datasets combined together. To score well on Pile\nBPB (bits per byte), a model must be able to understand many disparate domains\nincluding books, github repositories, webpages, chat logs, and medical, physics,\nmath, computer science, and philosophy papers.\n\nYoutubeSubtitles", "citation": "@article{pile,\n  title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling},\n  author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor},\n  journal={arXiv preprint arXiv:2101.00027},\n  year={2020}\n}\n", "homepage": "https://pile.eleuther.ai/", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "pile", "config_name": "pile_youtubesubtitles", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"test": {"name": "test", "num_bytes": 7377448, "num_examples": 342, "dataset_name": "pile"}, "validation": {"name": "validation", "num_bytes": 8937546, "num_examples": 326, "dataset_name": "pile"}}, "download_checksums": {"https://the-eye.eu/public/AI/pile/val.jsonl.zst": {"num_bytes": 470907480, "checksum": "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92"}, "https://the-eye.eu/public/AI/pile/test.jsonl.zst": {"num_bytes": 460250856, "checksum": "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e"}}, "download_size": 931158336, "post_processing_size": null, "dataset_size": 16314994, "size_in_bytes": 947473330}}
