{"description": "This is the Penn Treebank Project: Release 2 CDROM, featuring a million words of 1989 Wall Street Journal material. This corpus has been annotated for part-of-speech (POS) information. In addition, over half of it has been annotated for skeletal syntactic structure.\n", "citation": "@article{marcus-etal-1993-building,\n    title = \"Building a Large Annotated Corpus of {E}nglish: The {P}enn {T}reebank\",\n    author = \"Marcus, Mitchell P.  and\n      Santorini, Beatrice  and\n      Marcinkiewicz, Mary Ann\",\n    journal = \"Computational Linguistics\",\n    volume = \"19\",\n    number = \"2\",\n    year = \"1993\",\n    url = \"https://www.aclweb.org/anthology/J93-2004\",\n    pages = \"313--330\",\n}\n", "homepage": "https://catalog.ldc.upenn.edu/LDC99T42", "license": "LDC User Agreement for Non-Members", "features": {"sentence": {"dtype": "string", "_type": "Value"}}, "builder_name": "ptb_text_only", "dataset_name": "ptb_text_only", "config_name": "penn_treebank", "version": {"version_str": "1.1.0", "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 5143686, "num_examples": 42068, "dataset_name": "ptb_text_only"}, "test": {"name": "test", "num_bytes": 453706, "num_examples": 3761, "dataset_name": "ptb_text_only"}, "validation": {"name": "validation", "num_bytes": 403152, "num_examples": 3370, "dataset_name": "ptb_text_only"}}, "download_checksums": {"https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt": {"num_bytes": 5101618, "checksum": null}, "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt": {"num_bytes": 399782, "checksum": null}, "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt": {"num_bytes": 449945, "checksum": null}}, "download_size": 5951345, "dataset_size": 6000544, "size_in_bytes": 11951889}