# Data-mix configuration named "english_660B_2".
# NOTE(review): each "()" key presumably names the class that the config
# loader instantiates from the sibling keys - confirm against the loader.
data_mix:
  (): dataset_construction.DataMix
  name: "english_660B_2"
  shuffle: false
  load_from_local_save_dir: false
  local_save_dir: /gpfsscratch/rech/qts/ucg53vj/croissant_data
  compute_dataset_stats: true
  keep_separated_datasets_in_dataset_dict: false
  deduplicate_test_set: false
  ngram_path_for_extra_deduplication: null
  max_shard_size: "5GB"
  # One DatasetConfig entry per data_files glob.
  datasets:
    # SlimPajama-627B, train/chunk6: ten entries, one per final digit (0-9)
    # of the file name before ".jsonl.zst". Every entry uses the same
    # dataset_path, SlimPajamaMapper and SlimPajamaFilter; only the
    # data_files glob differs.
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*0.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*1.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*2.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*3.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*4.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*5.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*6.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*7.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*8.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk6/*9.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    # SlimPajama-627B, train/chunk7: ten entries, one per final digit (0-9)
    # of the file name before ".jsonl.zst". Every entry uses the same
    # dataset_path, SlimPajamaMapper and SlimPajamaFilter; only the
    # data_files glob differs.
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*0.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*1.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*2.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*3.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*4.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*5.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*6.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*7.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*8.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk7/*9.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    # SlimPajama-627B, train/chunk8: ten entries, one per final digit (0-9)
    # of the file name before ".jsonl.zst". Every entry uses the same
    # dataset_path, SlimPajamaMapper and SlimPajamaFilter; only the
    # data_files glob differs.
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*0.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*1.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*2.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*3.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*4.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*5.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*6.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*7.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*8.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk8/*9.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    # SlimPajama-627B, train/chunk9: ten entries, one per final digit (0-9)
    # of the file name before ".jsonl.zst". Every entry uses the same
    # dataset_path, SlimPajamaMapper and SlimPajamaFilter; only the
    # data_files glob differs.
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*0.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*1.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*2.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*3.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*4.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*5.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*6.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*7.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*8.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk9/*9.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    # SlimPajama-627B, train/chunk10: ten entries, one per final digit (0-9)
    # of the file name before ".jsonl.zst". Every entry uses the same
    # dataset_path, SlimPajamaMapper and SlimPajamaFilter; only the
    # data_files glob differs.
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*0.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*1.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*2.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*3.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*4.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*5.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*6.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*7.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*8.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter
    - (): dataset_construction.DatasetConfig
      dataset_path: /gpfsscratch/rech/qts/ucg53vj/SlimPajama-627B
      train_split: "train"
      load_from_disk: false
      dataset_kwargs:
        data_files: train/chunk10/*9.jsonl.zst
      preprocessing_function:
        (): dataset_collection.english.slimpajama.SlimPajamaMapper
      filtering_function:
        (): dataset_collection.english.slimpajama.SlimPajamaFilter

# Tokenizer to use. The active value is a local filesystem path; the
# commented-out line is a disabled alternative value.
# NOTE(review): the alternative looks like a Hugging Face Hub model id - confirm.
# tokenizer: "mistralai/Mistral-7B-v0.1"
tokenizer: "/gpfsstore/rech/qts/ucg53vj/tok-fr-en-code"
