{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "cr": "http://mlcommons.org/croissant/",
    "rai": "http://mlcommons.org/croissant/RAI/",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "dct": "http://purl.org/dc/terms/",
    "examples": {
      "@id": "cr:examples",
      "@type": "@json"
    },
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:regex",
    "repeated": "cr:repeated",
    "replace": "cr:replace",
    "sc": "https://schema.org/",
    "separator": "cr:separator",
    "source": "cr:source",
    "subField": "cr:subField",
    "transform": "cr:transform"
  },
  "@type": "sc:Dataset",
  "name": "WikiDBs",
  "description": "Deep learning on tabular data, and particularly tabular representation learning, has recently gained growing interest.\n However, representation learning for relational databases with multiple tables is still an underexplored area, which may be attributed to the lack of openly available resources.\n To support the development of foundation models for tabular data and relational databases, we introduce WikiDBs, a novel open-source corpus of 100,000 relational databases.\n    Each database consists of multiple tables connected by foreign keys.\n    The corpus is based on Wikidata and follows the characteristics of real-world databases.\n    In this paper, we describe the dataset and our method for creating it.\n    By making our code publicly available, we enable others to create tailored versions of the dataset, for example, by creating databases in different languages.\n    Finally, we conduct a set of initial experiments to showcase how WikiDBs can be used to train for the tasks of missing value imputation and the prediction of column and table names.\n    ",
  "conformsTo": "http://mlcommons.org/croissant/1.0",
  "citeAs": "@article{vogel2024wikidbs, title={WikiDBs: A Large-Scale Corpus Of Relational Databases from Wikidata}\n , author={Liane Vogel, Jan-Micha Bodensohn, Carsten Binnig}, year={2024}}",
  "url": "will_follow",
  "distribution": [
    {
      "@type": "cr:FileSet",
      "@id": "schema-files",
      "name": "schema-files",
      "description": "schema files contain details on each table and foreign key relationships",
      "encodingFormat": "application/json",
      "includes": "./*/schema.json"
    },
    {
      "@type": "cr:FileSet",
      "@id": "table-csv-files",
      "name": "table-csv-files",
      "description": "csv files that contain each table in the database",
      "encodingFormat": "application/csv",
      "includes": "./*/tables/*.csv"
    },
    {
      "@type": "cr:FileSet",
      "@id": "renaming-files",
      "name": "renaming-files",
      "description": "json files with names before and after paraphrasing",
      "encodingFormat": "application/json",
      "includes": "./*/renaming.csv"
    },
    {
      "@type": "cr:FileSet",
      "@id": "schema-diagram-files",
      "name": "schema-diagram-files",
      "description": "erd diagrams as visualization of the database schema",
      "encodingFormat": "application/pdf",
      "includes": "./*/schema_diagram.pdf"
    }
  ],
  "recordSet": [
    {
      "@type": "cr:RecordSet",
      "@id": "database",
      "name": "database",
      "field": [
        {
          "@type": "cr:Field",
          "@id": "database/schema",
          "name": "schema",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "schema-files"
            },
            "extract": {
              "jsonPath": "schema.json"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "database/tables",
          "name": "tables",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "table-csv-files"
            },
            "extract": {
              "column": "tables"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "database/tables-with-item-ids",
          "name": "tables-with-item-ids",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "table-csv-files"
            },
            "extract": {
              "column": "tables_with_item_ids"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "database/renaming",
          "name": "paraphrasing-info",
          "description": "Names before and after the paraphrasing",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "renaming-files"
            },
            "extract": {
              "jsonPath": "renaming.json"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "database/schema-diagram-pdf",
          "name": "schema-diagram",
          "description": "Erd diagram of the database schema",
          "dataType": "sc:ImageObject",
          "source": {
            "fileSet": {
              "@id": "schema-diagram-files"
            },
            "extract": {
              "column": "schema-diagram-pdf"
            }
          }
        }
      ]
    }
  ]
}
