# The BigCode StarCoder dataset (bigcode/starcoderdata), as hosted on HF
# Dataset entry keyed by its dataset id.
bigcode/starcoderdata:
  provider: huggingface
  # Active partition; the commented-out lines are alternatives (enable one).
  # partition: python
  # partition: git-commits-cleaned
  partition: java
  split: train

  streaming: false  # alternative: true

  # source-specific cleaning rules?
  # Explicit null: same parsed value as a bare `remove_columns:` key.
  # NOTE(review): if the consumer expects a list, use `[]` instead -- confirm.
  remove_columns: null
  # NOTE(review): 0 presumably disables concatenation -- confirm with consumer.
  concatenate_successive_entries: 0
