# Data-mix definition: one mix named "english_5p" built from a single dataset
# entry (a chunk of SlimPajama).
# NOTE(review): the "()" keys presumably name the class the config loader
# instantiates, with the sibling keys passed as its arguments -- confirm
# against the loader in dataset_construction.
data_mix:
  (): dataset_construction.DataMix
  name: "english_5p"
  shuffle: false
  # Local output directory; loading from it is turned off below.
  local_save_dir: data/
  load_from_local_save_dir: false
  compute_dataset_stats: true
  keep_separated_datasets_in_dataset_dict: false
  # Test-set deduplication is disabled and no n-gram file is supplied.
  deduplicate_test_set: false
  ngram_path_for_extra_deduplication: null
  # Quoted so the size stays a string ("2GB").
  max_shard_size: "2GB"
  datasets:
    # SlimPajama, restricted to the train/chunk1 directory via
    # dataset_kwargs.data_dir.
    - (): dataset_construction.DatasetConfig
      dataset_path: cerebras/SlimPajama-627B
      train_split: "train"
      test_split: "test"
      build_test_set_from_train: false
      # NOTE(review): data_dir points at train/chunk1 while test_split is
      # "test" -- confirm that the test split resolves with this data_dir.
      dataset_kwargs:
        data_dir: train/chunk1

# Tokenizer identifier.
# NOTE(review): looks like a Hugging Face Hub model id -- confirm how the
# consumer resolves it.
tokenizer: "mistralai/Mistral-7B-v0.1"
