# The wikipedia en dataset, drawn from its Hugging Face mirror
# Data-source entry for the English Wikipedia dump.
wikipedia:
  # Where the dataset is fetched from.
  provider: huggingface
  # Dump snapshot and language; quoted so the value is always read as a string.
  partition: '20220301.en'
  split: train

  # Canonical lowercase boolean (same parsed value as the former `False`).
  # NOTE(review): presumably selects a full download over streaming - confirm
  # against the loader.
  streaming: false

  # Source-specific cleaning rules.
  # NOTE(review): presumably names the column(s) dropped from each record; a
  # single scalar is given here - verify whether the consumer also accepts a
  # list.
  remove_columns: title
  # NOTE(review): presumably 0 means successive entries are not concatenated -
  # TODO confirm.
  concatenate_successive_entries: 0
