---
annotations_creators:
- no-annotation
language_creators:
- crowdsourced
pretty_name: Wikipedia
paperswithcode_id: null
license:
- cc-by-sa-3.0
- gfdl
task_categories:
- text-generation
- fill-mask
task_ids:
- language-modeling
- masked-language-modeling
source_datasets:
- original
multilinguality:
- multilingual
size_categories:
- n<1K
- 1K<n<10K
- 10K<n<100K
- 100K<n<1M
- 1M<n<10M
language:
- aa
- ab
- ace
- af
- ak
- als
- am
- an
- ang
- ar
- arc
- arz
- as
- ast
- atj
- av
- ay
- az
- azb
- ba
- bar
- bcl
- be
- bg
- bh
- bi
- bjn
- bm
- bn
- bo
- bpy
- br
- bs
- bug
- bxr
- ca
- cbk
- cdo
- ce
- ceb
- ch
- cho
- chr
- chy
- ckb
- co
- cr
- crh
- cs
- csb
- cu
- cv
- cy
- da
- de
- din
- diq
- dsb
- dty
- dv
- dz
- ee
- el
- eml
- en
- eo
- es
- et
- eu
- ext
- fa
- ff
- fi
- fj
- fo
- fr
- frp
- frr
- fur
- fy
- ga
- gag
- gan
- gd
- gl
- glk
- gn
- gom
- gor
- got
- gu
- gv
- ha
- hak
- haw
- he
- hi
- hif
- ho
- hr
- hsb
- ht
- hu
- hy
- ia
- id
- ie
- ig
- ii
- ik
- ilo
- inh
- io
- is
- it
- iu
- ja
- jam
- jbo
- jv
- ka
- kaa
- kab
- kbd
- kbp
- kg
- ki
- kj
- kk
- kl
- km
- kn
- ko
- koi
- krc
- ks
- ksh
- ku
- kv
- kw
- ky
- la
- lad
- lb
- lbe
- lez
- lfn
- lg
- li
- lij
- lmo
- ln
- lo
- lrc
- lt
- ltg
- lv
- lzh
- mai
- mdf
- mg
- mh
- mhr
- mi
- min
- mk
- ml
- mn
- mr
- mrj
- ms
- mt
- mus
- mwl
- my
- myv
- mzn
- na
- nah
- nan
- nap
- nds
- ne
- new
- ng
- nl
- nn
- 'no'
- nov
- nrf
- nso
- nv
- ny
- oc
- olo
- om
- or
- os
- pa
- pag
- pam
- pap
- pcd
- pdc
- pfl
- pi
- pih
- pl
- pms
- pnb
- pnt
- ps
- pt
- qu
- rm
- rmy
- rn
- ro
- ru
- rue
- rup
- rw
- sa
- sah
- sat
- sc
- scn
- sco
- sd
- se
- sg
- sgs
- sh
- si
- sk
- sl
- sm
- sn
- so
- sq
- sr
- srn
- ss
- st
- stq
- su
- sv
- sw
- szl
- ta
- tcy
- tdt
- te
- tg
- th
- ti
- tk
- tl
- tn
- to
- tpi
- tr
- ts
- tt
- tum
- tw
- ty
- tyv
- udm
- ug
- uk
- ur
- uz
- ve
- vec
- vep
- vi
- vls
- vo
- vro
- wa
- war
- wo
- wuu
- xal
- xh
- xmf
- yi
- yo
- yue
- za
- zea
- zh
- zu
language_bcp47:
- nds-nl
dataset_info:
- config_name: 20220301.de
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 8905282792
    num_examples: 2665357
  download_size: 6523215105
  dataset_size: 8905282792
- config_name: 20220301.en
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 20275516160
    num_examples: 6458670
  download_size: 20598313936
  dataset_size: 20275516160
- config_name: 20220301.fr
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 7375920768
    num_examples: 2402095
  download_size: 5602565274
  dataset_size: 7375920768
- config_name: 20220301.frr
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 9129760
    num_examples: 15199
  download_size: 12438017
  dataset_size: 9129760
- config_name: 20220301.it
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 4539944448
    num_examples: 1743035
  download_size: 3516441239
  dataset_size: 4539944448
- config_name: 20220301.simple
  features:
  - name: id
    dtype: string
  - name: url
    dtype: string
  - name: title
    dtype: string
  - name: text
    dtype: string
  splits:
  - name: train
    num_bytes: 235072360
    num_examples: 205328
  download_size: 239682796
  dataset_size: 235072360
config_names:
- 20220301.aa
- 20220301.ab
- 20220301.ace
- 20220301.ady
- 20220301.af
- 20220301.ak
- 20220301.als
- 20220301.am
- 20220301.an
- 20220301.ang
- 20220301.ar
- 20220301.arc
- 20220301.arz
- 20220301.as
- 20220301.ast
- 20220301.atj
- 20220301.av
- 20220301.ay
- 20220301.az
- 20220301.azb
- 20220301.ba
- 20220301.bar
- 20220301.bat-smg
- 20220301.bcl
- 20220301.be
- 20220301.be-x-old
- 20220301.bg
- 20220301.bh
- 20220301.bi
- 20220301.bjn
- 20220301.bm
- 20220301.bn
- 20220301.bo
- 20220301.bpy
- 20220301.br
- 20220301.bs
- 20220301.bug
- 20220301.bxr
- 20220301.ca
- 20220301.cbk-zam
- 20220301.cdo
- 20220301.ce
- 20220301.ceb
- 20220301.ch
- 20220301.cho
- 20220301.chr
- 20220301.chy
- 20220301.ckb
- 20220301.co
- 20220301.cr
- 20220301.crh
- 20220301.cs
- 20220301.csb
- 20220301.cu
- 20220301.cv
- 20220301.cy
- 20220301.da
- 20220301.de
- 20220301.din
- 20220301.diq
- 20220301.dsb
- 20220301.dty
- 20220301.dv
- 20220301.dz
- 20220301.ee
- 20220301.el
- 20220301.eml
- 20220301.en
- 20220301.eo
- 20220301.es
- 20220301.et
- 20220301.eu
- 20220301.ext
- 20220301.fa
- 20220301.ff
- 20220301.fi
- 20220301.fiu-vro
- 20220301.fj
- 20220301.fo
- 20220301.fr
- 20220301.frp
- 20220301.frr
- 20220301.fur
- 20220301.fy
- 20220301.ga
- 20220301.gag
- 20220301.gan
- 20220301.gd
- 20220301.gl
- 20220301.glk
- 20220301.gn
- 20220301.gom
- 20220301.gor
- 20220301.got
- 20220301.gu
- 20220301.gv
- 20220301.ha
- 20220301.hak
- 20220301.haw
- 20220301.he
- 20220301.hi
- 20220301.hif
- 20220301.ho
- 20220301.hr
- 20220301.hsb
- 20220301.ht
- 20220301.hu
- 20220301.hy
- 20220301.ia
- 20220301.id
- 20220301.ie
- 20220301.ig
- 20220301.ii
- 20220301.ik
- 20220301.ilo
- 20220301.inh
- 20220301.io
- 20220301.is
- 20220301.it
- 20220301.iu
- 20220301.ja
- 20220301.jam
- 20220301.jbo
- 20220301.jv
- 20220301.ka
- 20220301.kaa
- 20220301.kab
- 20220301.kbd
- 20220301.kbp
- 20220301.kg
- 20220301.ki
- 20220301.kj
- 20220301.kk
- 20220301.kl
- 20220301.km
- 20220301.kn
- 20220301.ko
- 20220301.koi
- 20220301.krc
- 20220301.ks
- 20220301.ksh
- 20220301.ku
- 20220301.kv
- 20220301.kw
- 20220301.ky
- 20220301.la
- 20220301.lad
- 20220301.lb
- 20220301.lbe
- 20220301.lez
- 20220301.lfn
- 20220301.lg
- 20220301.li
- 20220301.lij
- 20220301.lmo
- 20220301.ln
- 20220301.lo
- 20220301.lrc
- 20220301.lt
- 20220301.ltg
- 20220301.lv
- 20220301.mai
- 20220301.map-bms
- 20220301.mdf
- 20220301.mg
- 20220301.mh
- 20220301.mhr
- 20220301.mi
- 20220301.min
- 20220301.mk
- 20220301.ml
- 20220301.mn
- 20220301.mr
- 20220301.mrj
- 20220301.ms
- 20220301.mt
- 20220301.mus
- 20220301.mwl
- 20220301.my
- 20220301.myv
- 20220301.mzn
- 20220301.na
- 20220301.nah
- 20220301.nap
- 20220301.nds
- 20220301.nds-nl
- 20220301.ne
- 20220301.new
- 20220301.ng
- 20220301.nl
- 20220301.nn
- 20220301.no
- 20220301.nov
- 20220301.nrm
- 20220301.nso
- 20220301.nv
- 20220301.ny
- 20220301.oc
- 20220301.olo
- 20220301.om
- 20220301.or
- 20220301.os
- 20220301.pa
- 20220301.pag
- 20220301.pam
- 20220301.pap
- 20220301.pcd
- 20220301.pdc
- 20220301.pfl
- 20220301.pi
- 20220301.pih
- 20220301.pl
- 20220301.pms
- 20220301.pnb
- 20220301.pnt
- 20220301.ps
- 20220301.pt
- 20220301.qu
- 20220301.rm
- 20220301.rmy
- 20220301.rn
- 20220301.ro
- 20220301.roa-rup
- 20220301.roa-tara
- 20220301.ru
- 20220301.rue
- 20220301.rw
- 20220301.sa
- 20220301.sah
- 20220301.sat
- 20220301.sc
- 20220301.scn
- 20220301.sco
- 20220301.sd
- 20220301.se
- 20220301.sg
- 20220301.sh
- 20220301.si
- 20220301.simple
- 20220301.sk
- 20220301.sl
- 20220301.sm
- 20220301.sn
- 20220301.so
- 20220301.sq
- 20220301.sr
- 20220301.srn
- 20220301.ss
- 20220301.st
- 20220301.stq
- 20220301.su
- 20220301.sv
- 20220301.sw
- 20220301.szl
- 20220301.ta
- 20220301.tcy
- 20220301.te
- 20220301.tet
- 20220301.tg
- 20220301.th
- 20220301.ti
- 20220301.tk
- 20220301.tl
- 20220301.tn
- 20220301.to
- 20220301.tpi
- 20220301.tr
- 20220301.ts
- 20220301.tt
- 20220301.tum
- 20220301.tw
- 20220301.ty
- 20220301.tyv
- 20220301.udm
- 20220301.ug
- 20220301.uk
- 20220301.ur
- 20220301.uz
- 20220301.ve
- 20220301.vec
- 20220301.vep
- 20220301.vi
- 20220301.vls
- 20220301.vo
- 20220301.wa
- 20220301.war
- 20220301.wo
- 20220301.wuu
- 20220301.xal
- 20220301.xh
- 20220301.xmf
- 20220301.yi
- 20220301.yo
- 20220301.za
- 20220301.zea
- 20220301.zh
- 20220301.zh-classical
- 20220301.zh-min-nan
- 20220301.zh-yue
- 20220301.zu
---

# Dataset Card for Wikipedia

## Table of Contents
- [Dataset Description](#dataset-description)
  - [Dataset Summary](#dataset-summary)
  - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
  - [Languages](#languages)
- [Dataset Structure](#dataset-structure)
  - [Data Instances](#data-instances)
  - [Data Fields](#data-fields)
  - [Data Splits](#data-splits)
- [Dataset Creation](#dataset-creation)
  - [Curation Rationale](#curation-rationale)
  - [Source Data](#source-data)
  - [Annotations](#annotations)
  - [Personal and Sensitive Information](#personal-and-sensitive-information)
- [Considerations for Using the Data](#considerations-for-using-the-data)
  - [Social Impact of Dataset](#social-impact-of-dataset)
  - [Discussion of Biases](#discussion-of-biases)
  - [Other Known Limitations](#other-known-limitations)
- [Additional Information](#additional-information)
  - [Dataset Curators](#dataset-curators)
  - [Licensing Information](#licensing-information)
  - [Citation Information](#citation-information)
  - [Contributions](#contributions)

## Dataset Description

- **Homepage:** [https://dumps.wikimedia.org](https://dumps.wikimedia.org)
- **Repository:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
- **Paper:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)
- **Point of Contact:** [More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

### Dataset Summary

Wikipedia dataset containing cleaned articles in all languages.
The datasets are built from the Wikipedia dumps
(https://dumps.wikimedia.org/), with one subset (configuration) per language. Each example
contains the content of one full Wikipedia article, cleaned to strip
wiki markup and unwanted sections (references, etc.).

The articles are parsed using the ``mwparserfromhell`` tool.

To load this dataset you need to install Apache Beam and ``mwparserfromhell`` first:

```
pip install apache_beam mwparserfromhell
```

Then, you can load any subset of Wikipedia per language and per date this way:

```python
from datasets import load_dataset

load_dataset("wikipedia", language="sw", date="20220120", beam_runner=...)
```
Here, `beam_runner` can be any runner supported by Apache Beam for (distributed) data processing
(see the [runner capability matrix](https://beam.apache.org/documentation/runners/capability-matrix/)).
Pass `"DirectRunner"` to run the processing locally on your machine.

You can find the full list of languages and dates [here](https://dumps.wikimedia.org/backup-index.html).

Some subsets of Wikipedia have already been processed by Hugging Face, and you can load them directly with:
```python
from datasets import load_dataset

load_dataset("wikipedia", "20220301.en")
```

The list of pre-processed subsets is:
- "20220301.de"
- "20220301.en"
- "20220301.fr"
- "20220301.frr"
- "20220301.it"
- "20220301.simple"

### Supported Tasks and Leaderboards

The dataset is generally used for Language Modeling.

### Languages

You can find the list of languages [here](https://meta.wikimedia.org/wiki/List_of_Wikipedias).

## Dataset Structure

### Data Instances

An example looks as follows:

```
{'id': '1',
 'url': 'https://simple.wikipedia.org/wiki/April',
 'title': 'April',
 'text': 'April is the fourth month...'
}
```

Some subsets of Wikipedia have already been processed by HuggingFace, as you can see below:

#### 20220301.de

- **Size of downloaded dataset files:** 6.84 GB
- **Size of the generated dataset:** 9.34 GB
- **Total amount of disk used:** 16.18 GB

#### 20220301.en

- **Size of downloaded dataset files:** 21.60 GB
- **Size of the generated dataset:** 21.26 GB
- **Total amount of disk used:** 42.86 GB

#### 20220301.fr

- **Size of downloaded dataset files:** 5.87 GB
- **Size of the generated dataset:** 7.73 GB
- **Total amount of disk used:** 13.61 GB

#### 20220301.frr

- **Size of downloaded dataset files:** 13.04 MB
- **Size of the generated dataset:** 9.57 MB
- **Total amount of disk used:** 22.62 MB

#### 20220301.it

- **Size of downloaded dataset files:** 3.69 GB
- **Size of the generated dataset:** 4.76 GB
- **Total amount of disk used:** 8.45 GB

#### 20220301.simple

- **Size of downloaded dataset files:** 251.32 MB
- **Size of the generated dataset:** 246.49 MB
- **Total amount of disk used:** 497.82 MB

### Data Fields

The data fields are the same among all configurations:

- `id` (`str`): ID of the article.
- `url` (`str`): URL of the article.
- `title` (`str`): Title of the article.
- `text` (`str`): Text content of the article.

### Data Splits

Here is the number of examples for several configurations:

| name            |   train |
|-----------------|--------:|
| 20220301.de     | 2665357 |
| 20220301.en     | 6458670 |
| 20220301.fr     | 2402095 |
| 20220301.frr    |   15199 |
| 20220301.it     | 1743035 |
| 20220301.simple |  205328 |

## Dataset Creation

### Curation Rationale

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

### Source Data

#### Initial Data Collection and Normalization

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

#### Who are the source language producers?

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

### Annotations

#### Annotation process

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

#### Who are the annotators?

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

### Personal and Sensitive Information

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

## Considerations for Using the Data

### Social Impact of Dataset

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

### Discussion of Biases

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

### Other Known Limitations

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

## Additional Information

### Dataset Curators

[More Information Needed](https://github.com/huggingface/datasets/blob/master/CONTRIBUTING.md#how-to-contribute-to-the-dataset-cards)

### Licensing Information

Most of Wikipedia's text and many of its images are co-licensed under the
[Creative Commons Attribution-ShareAlike 3.0 Unported License](https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License)
(CC BY-SA) and the [GNU Free Documentation License](https://en.wikipedia.org/wiki/Wikipedia:Text_of_the_GNU_Free_Documentation_License)
(GFDL) (unversioned, with no invariant sections, front-cover texts, or back-cover texts). 

Some text has been imported only under CC BY-SA and CC BY-SA-compatible licenses and cannot be reused under GFDL; such
text will be identified on the page footer, in the page history, or on the discussion page of the article that utilizes
the text.

### Citation Information

```
@ONLINE{wikidump,
    author = "Wikimedia Foundation",
    title  = "Wikimedia Downloads",
    url    = "https://dumps.wikimedia.org"
}
```

### Contributions

Thanks to [@lewtun](https://github.com/lewtun), [@mariamabarham](https://github.com/mariamabarham), [@thomwolf](https://github.com/thomwolf), [@lhoestq](https://github.com/lhoestq), [@patrickvonplaten](https://github.com/patrickvonplaten) for adding this dataset.