# The Pile, streamed from Hugging Face using the new streaming support.
# This deduplicated version should be 1.2T.
EleutherAI/pile:
  # Where the dataset comes from and which part of it is selected.
  provider: huggingface
  subsets: all
  split: train

  # Streaming mode is enabled for this source.
  streaming: true

  # source-specific cleaning rules?
  # No columns are listed for removal. The value is an explicit null, which is
  # what a bare `remove_columns:` parses to.
  # NOTE(review): if the consumer expects a list here, `[]` may be what is
  # intended -- confirm against the loader.
  remove_columns: null
  concatenate_successive_entries: 0  # cannot concat when streaming
