# Top-level description of the data mix to build.
# NOTE(review): the `()` key appears to name the class that the config loader
# instantiates, with the sibling keys passed as its arguments — confirm
# against the loader.
data_mix:
  (): dataset_construction.DataMix
  # Identifier of this mix.
  name: "english_627B"
  shuffle: false
  # NOTE(review): presumably toggles reusing a mix previously written to
  # `local_save_dir` instead of rebuilding it — confirm.
  load_from_local_save_dir: false
  # Cluster-specific absolute path; adjust when running elsewhere.
  local_save_dir: /gpfsscratch/rech/qts/ucg53vj/croissant_data
  compute_dataset_stats: true
  keep_separated_datasets_in_dataset_dict: false
  deduplicate_test_set: false
  # No n-gram file is supplied (null).
  ngram_path_for_extra_deduplication: null
  # Size given as a string with a unit suffix; presumably the limit for each
  # shard written out — confirm against the consumer.
  max_shard_size: "5GB"
  # Datasets that make up the mix, one `DatasetConfig` entry per list item.
  datasets:
    # SlimPajama, split into 100 consecutive 1% slices: train[0%:1%] through
    # train[99%:100%], with the matching test[...] range in each entry.
    # Every slice entry uses the same `dataset_path`, `load_from_disk: true`,
    # `build_test_set_from_train: false` and the same SlimPajamaMapper; only
    # the two split ranges differ from one entry to the next.
    # When adding or removing slices, keep the ranges contiguous and
    # non-overlapping.
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[0%:1%]"
      test_split: "test[0%:1%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[1%:2%]"
      test_split: "test[1%:2%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[2%:3%]"
      test_split: "test[2%:3%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[3%:4%]"
      test_split: "test[3%:4%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[4%:5%]"
      test_split: "test[4%:5%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[5%:6%]"
      test_split: "test[5%:6%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[6%:7%]"
      test_split: "test[6%:7%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[7%:8%]"
      test_split: "test[7%:8%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[8%:9%]"
      test_split: "test[8%:9%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[9%:10%]"
      test_split: "test[9%:10%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[10%:11%]"
      test_split: "test[10%:11%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[11%:12%]"
      test_split: "test[11%:12%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[12%:13%]"
      test_split: "test[12%:13%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[13%:14%]"
      test_split: "test[13%:14%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[14%:15%]"
      test_split: "test[14%:15%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[15%:16%]"
      test_split: "test[15%:16%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[16%:17%]"
      test_split: "test[16%:17%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[17%:18%]"
      test_split: "test[17%:18%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[18%:19%]"
      test_split: "test[18%:19%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[19%:20%]"
      test_split: "test[19%:20%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[20%:21%]"
      test_split: "test[20%:21%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[21%:22%]"
      test_split: "test[21%:22%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[22%:23%]"
      test_split: "test[22%:23%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[23%:24%]"
      test_split: "test[23%:24%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[24%:25%]"
      test_split: "test[24%:25%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[25%:26%]"
      test_split: "test[25%:26%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[26%:27%]"
      test_split: "test[26%:27%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[27%:28%]"
      test_split: "test[27%:28%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[28%:29%]"
      test_split: "test[28%:29%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[29%:30%]"
      test_split: "test[29%:30%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[30%:31%]"
      test_split: "test[30%:31%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[31%:32%]"
      test_split: "test[31%:32%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[32%:33%]"
      test_split: "test[32%:33%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[33%:34%]"
      test_split: "test[33%:34%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[34%:35%]"
      test_split: "test[34%:35%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[35%:36%]"
      test_split: "test[35%:36%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[36%:37%]"
      test_split: "test[36%:37%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[37%:38%]"
      test_split: "test[37%:38%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[38%:39%]"
      test_split: "test[38%:39%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[39%:40%]"
      test_split: "test[39%:40%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[40%:41%]"
      test_split: "test[40%:41%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[41%:42%]"
      test_split: "test[41%:42%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[42%:43%]"
      test_split: "test[42%:43%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[43%:44%]"
      test_split: "test[43%:44%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[44%:45%]"
      test_split: "test[44%:45%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[45%:46%]"
      test_split: "test[45%:46%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[46%:47%]"
      test_split: "test[46%:47%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[47%:48%]"
      test_split: "test[47%:48%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[48%:49%]"
      test_split: "test[48%:49%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[49%:50%]"
      test_split: "test[49%:50%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[50%:51%]"
      test_split: "test[50%:51%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[51%:52%]"
      test_split: "test[51%:52%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[52%:53%]"
      test_split: "test[52%:53%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[53%:54%]"
      test_split: "test[53%:54%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[54%:55%]"
      test_split: "test[54%:55%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[55%:56%]"
      test_split: "test[55%:56%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[56%:57%]"
      test_split: "test[56%:57%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[57%:58%]"
      test_split: "test[57%:58%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[58%:59%]"
      test_split: "test[58%:59%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[59%:60%]"
      test_split: "test[59%:60%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[60%:61%]"
      test_split: "test[60%:61%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[61%:62%]"
      test_split: "test[61%:62%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[62%:63%]"
      test_split: "test[62%:63%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[63%:64%]"
      test_split: "test[63%:64%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[64%:65%]"
      test_split: "test[64%:65%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[65%:66%]"
      test_split: "test[65%:66%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[66%:67%]"
      test_split: "test[66%:67%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[67%:68%]"
      test_split: "test[67%:68%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[68%:69%]"
      test_split: "test[68%:69%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[69%:70%]"
      test_split: "test[69%:70%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[70%:71%]"
      test_split: "test[70%:71%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[71%:72%]"
      test_split: "test[71%:72%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[72%:73%]"
      test_split: "test[72%:73%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[73%:74%]"
      test_split: "test[73%:74%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[74%:75%]"
      test_split: "test[74%:75%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[75%:76%]"
      test_split: "test[75%:76%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[76%:77%]"
      test_split: "test[76%:77%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[77%:78%]"
      test_split: "test[77%:78%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[78%:79%]"
      test_split: "test[78%:79%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[79%:80%]"
      test_split: "test[79%:80%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[80%:81%]"
      test_split: "test[80%:81%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[81%:82%]"
      test_split: "test[81%:82%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[82%:83%]"
      test_split: "test[82%:83%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[83%:84%]"
      test_split: "test[83%:84%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[84%:85%]"
      test_split: "test[84%:85%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[85%:86%]"
      test_split: "test[85%:86%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[86%:87%]"
      test_split: "test[86%:87%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[87%:88%]"
      test_split: "test[87%:88%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[88%:89%]"
      test_split: "test[88%:89%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[89%:90%]"
      test_split: "test[89%:90%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[90%:91%]"
      test_split: "test[90%:91%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[91%:92%]"
      test_split: "test[91%:92%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[92%:93%]"
      test_split: "test[92%:93%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[93%:94%]"
      test_split: "test[93%:94%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[94%:95%]"
      test_split: "test[94%:95%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[95%:96%]"
      test_split: "test[95%:96%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[96%:97%]"
      test_split: "test[96%:97%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[97%:98%]"
      test_split: "test[97%:98%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[98%:99%]"
      test_split: "test[98%:99%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsstore/rech/qts/ucg53vj/local-slim-lf
      train_split: "train[99%:100%]"
      test_split: "test[99%:100%]"
      build_test_set_from_train: false
      load_from_disk: true
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
    # Gutenberg canaries dataset, added after the SlimPajama slices.
    # Unlike the slice entries it sets no `test_split`, `load_from_disk` or
    # `preprocessing_function`; whatever defaults DatasetConfig defines for
    # those apply — verify.
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/gutenberg_canaries
      train_split: "train"
      # NOTE(review): a test set is requested from train but with 0 examples;
      # presumably this keeps every canary in the training data — confirm how
      # DatasetConfig handles `num_test_examples: 0`.
      build_test_set_from_train: true
      num_test_examples: 0

# Tokenizer for this mix. The active value is a cluster-specific local path;
# the commented-out line below is an alternative identifier left disabled.
# tokenizer: "mistralai/Mistral-7B-v0.1"
tokenizer: "/gpfsstore/rech/qts/ucg53vj/tok-fr-en-code"
