{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "cr": "http://mlcommons.org/croissant/",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "dataBiases": "cr:dataBiases",
    "dataCollection": "cr:dataCollection",
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "dct": "http://purl.org/dc/terms/",
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "personalSensitiveInformation": "cr:personalSensitiveInformation",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:regex",
    "repeated": "cr:repeated",
    "replace": "cr:replace",
    "sc": "https://schema.org/",
    "separator": "cr:separator",
    "source": "cr:source",
    "subField": "cr:subField",
    "transform": "cr:transform"
  },
  "@type": "sc:Dataset",
  "distribution": [
    {
      "@type": "cr:FileObject",
      "@id": "repo",
      "name": "repo",
      "description": "The Hugging Face git repository.",
      "contentUrl": "https://huggingface.co/datasets/blevy41/BeanCounter/tree/refs%2Fconvert%2Fparquet",
      "encodingFormat": "git+https",
      "sha256": "https://github.com/mlcommons/croissant/issues/80"
    },
    {
      "@type": "cr:FileSet",
      "@id": "parquet-files-for-config-default",
      "name": "parquet-files-for-config-default",
      "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/datasets-server/parquet).",
      "containedIn": {
        "@id": "repo"
      },
      "encodingFormat": "application/x-parquet",
      "includes": "default/*/*.parquet"
    },
    {
      "@type": "cr:FileSet",
      "@id": "parquet-files-for-config-clean",
      "name": "parquet-files-for-config-clean",
      "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/datasets-server/parquet).",
      "containedIn": {
        "@id": "repo"
      },
      "encodingFormat": "application/x-parquet",
      "includes": "clean/*/*.parquet"
    }
  ],
  "recordSet": [
    {
      "@type": "cr:RecordSet",
      "@id": "default",
      "name": "default",
      "description": "blevy41/BeanCounter - 'default' subset (first 5GB)\n\nAdditional information:\n- 2 splits: train, validation\n- 2 skipped columns: date, ts_accept",
      "field": [
        {
          "@type": "cr:Field",
          "@id": "default/accession",
          "name": "default/accession",
          "description": "Column 'accession' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-default"
            },
            "extract": {
              "column": "accession"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "default/filename",
          "name": "default/filename",
          "description": "Column 'filename' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-default"
            },
            "extract": {
              "column": "filename"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "default/text",
          "name": "default/text",
          "description": "Column 'text' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-default"
            },
            "extract": {
              "column": "text"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "default/type_filing",
          "name": "default/type_filing",
          "description": "Column 'type_filing' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-default"
            },
            "extract": {
              "column": "type_filing"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "default/type_attachment",
          "name": "default/type_attachment",
          "description": "Column 'type_attachment' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-default"
            },
            "extract": {
              "column": "type_attachment"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "default/form_type",
          "name": "default/form_type",
          "description": "Column 'form_type' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-default"
            },
            "extract": {
              "column": "form_type"
            }
          }
        }
      ]
    },
    {
      "@type": "cr:RecordSet",
      "@id": "clean",
      "name": "clean",
      "description": "blevy41/BeanCounter - 'clean' subset (first 5GB)\n\nAdditional information:\n- 2 splits: train, validation\n- 2 skipped columns: date, ts_accept",
      "field": [
        {
          "@type": "cr:Field",
          "@id": "clean/accession",
          "name": "clean/accession",
          "description": "Column 'accession' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-clean"
            },
            "extract": {
              "column": "accession"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "clean/filename",
          "name": "clean/filename",
          "description": "Column 'filename' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-clean"
            },
            "extract": {
              "column": "filename"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "clean/text",
          "name": "clean/text",
          "description": "Column 'text' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-clean"
            },
            "extract": {
              "column": "text"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "clean/type_filing",
          "name": "clean/type_filing",
          "description": "Column 'type_filing' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-clean"
            },
            "extract": {
              "column": "type_filing"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "clean/type_attachment",
          "name": "clean/type_attachment",
          "description": "Column 'type_attachment' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-clean"
            },
            "extract": {
              "column": "type_attachment"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "clean/form_type",
          "name": "clean/form_type",
          "description": "Column 'form_type' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-clean"
            },
            "extract": {
              "column": "form_type"
            }
          }
        }
      ]
    }
  ],
  "conformsTo": "http://mlcommons.org/croissant/1.0",
  "name": "BeanCounter",
  "description": "\n\t\n\t\t\n\t\n\t\n\t\t\ud83e\uded8\ud83e\uddee BeanCounter\n\t\n\n\n\t\n\t\t\n\t\n\t\n\t\tDatset Summary\n\t\n\nBeanCounter is a low-toxicity, large-scale, and open dataset of business-oriented text. See Wang and Levy (2024) for details of the data collection, analysis, and some explorations of using the data for continued pre-training.\nThe data is sourced from the Electronic Data Gathering and Retrieval (EDGAR) system operated by the United States Securities and Exchange Commission (SEC). Specifically all filings submitted to EDGAR from 1996\u2026 See the full description on the dataset page: https://huggingface.co/datasets/blevy41/BeanCounter.",
  "alternateName": [
    "blevy41/BeanCounter"
  ],
  "creator": {
    "@type": "Person",
    "name": "Bradford Levy",
    "url": "https://huggingface.co/blevy41"
  },
  "keywords": [
    "\ud83c\uddfa\ud83c\uddf8 Region: US"
  ],
  "url": "https://huggingface.co/datasets/blevy41/BeanCounter"
}
