# Data mix definition for the "pretraining_testing" configuration.
# NOTE(review): the `()` key presumably names the Python callable that the config
# loader instantiates with the sibling keys as arguments — confirm against the loader.
data_mix:
  (): dataset_construction.DataMix
  name: "pretraining_testing"
  # Relative path; presumably resolved against the working directory — TODO confirm.
  local_save_dir: data/
  load_from_local_save_dir: false
  shuffle: false
  compute_dataset_stats: false
  keep_separated_datasets_in_dataset_dict: true
  deduplicate_test_set: false
  # Explicit null: no n-gram path is configured here.
  ngram_path_for_extra_deduplication: null
  # One list item per dataset; commented-out items are excluded from the parsed list.
  datasets:
# Translation datasets
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>opus100-en-fr
#      test_split: "test"
#      build_test_set_from_train: false
#      num_train_examples:  1000
#      num_test_examples: 1000
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>europarl-en-fr
#      build_test_set_from_train: true
#      num_test_examples: 1000
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>wmt-en-fr
#      test_split: "test"
#      num_train_examples:  1000
#      num_test_examples: 100
# Speech datasets
    # Example counts for this entry: 1000 train / 100 test.
    # NOTE(review): build_test_set_from_train presumably takes the test set from the
    # train split — confirm in DatasetConfig.
    - (): dataset_construction.DatasetConfig
      dataset_path: <anon>french_librispeech_text_only
      build_test_set_from_train: true
      num_train_examples: 1000
      num_test_examples: 100
    # Only a test-example count is set here; no num_train_examples key is given
    # (how the consumer treats the missing key is not visible in this file).
    - (): dataset_construction.DatasetConfig
      dataset_path: <anon>french_podcasts
      build_test_set_from_train: true
      num_test_examples: 100
      # Both hooks reference classes from the same french_transcribed_podcast module;
      # each is declared with only a `()` key, i.e. without extra arguments.
      filtering_function:
        (): dataset_collection.french.french_transcribed_podcast.PodcastFilter
      preprocessing_function:
        (): dataset_collection.french.french_transcribed_podcast.PodcastMapper
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>french_open_subtitles
#      num_train_examples:  1000
#      num_test_examples: 100
#      build_test_set_from_train: true

# Books
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>project_gutenberg
#      train_split: fr
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
      # here we should preprocess
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>ProjectGutenberg_fr
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 10
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>bnf_gallica
#      build_test_set_from_train: true
#      num_train_examples:  20
#      num_test_examples: 10
# Dila data
    # Example counts for this entry: 10000 train / 100 test, with the
    # needs_internal_deduplication flag switched on.
    - (): dataset_construction.DatasetConfig
      dataset_path: <anon>dila_legifrance
      build_test_set_from_train: true
      num_train_examples: 10000
      num_test_examples: 100
      needs_internal_deduplication: true
    # Entry with a nested filter definition: the filter itself takes a nested
    # `model` object plus a `perplexity_bounds` value.
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/DEBATS_opendata
      build_test_set_from_train: true
      # Train-example count is deliberately left disabled for this entry.
      # num_train_examples: 10000
      num_test_examples: 100
      needs_internal_deduplication: true
      filtering_function:
        (): dataset_preprocessing.perplexity.perplexity_filter.PerplexityFilter
        model:
          (): dataset_preprocessing.perplexity.model.KenlmModel
          model_dataset: "wikipedia"
          language: "fr"
        # NOTE(review): a standard YAML parser reads `(10, 1000)` as the plain string
        # "(10, 1000)", not as a pair of numbers. Confirm that PerplexityFilter (or the
        # config loader) converts it; otherwise a sequence such as [10, 1000] is needed.
        perplexity_bounds: (10, 1000)

#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'teven/code_contests'
#      train_split: "train[:5%]"
#      build_test_set_from_train: true
#      # build_test_set_from_train: false
#      # test_split: "valid"
#      filtering_function:
#        (): dataset_collection.code.deepmind_code_contest.CodeContestFilter
#      preprocessing_function:
#        (): dataset_collection.code.deepmind_code_contest.CodeContestMapper
#    - (): dataset_construction.DatasetConfig
#      dataset_path: Nicolas-BZRD/BALO_opendata
#      build_test_set_from_train: true
#      num_train_examples: 1000
#      num_test_examples: 100
      # needs a preprocessor to have an id_column
# PDF dataset
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>illuin_layout_dataset_text_only
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#      needs_internal_deduplication: true
# Wiki
#    - (): dataset_construction.DatasetConfig
#      dataset_path: <anon>wikisource_fr
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#    - (): dataset_construction.DatasetConfig
#      dataset_path: wikipedia
#      dataset_name: "20220301.fr"
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
# Internet dumps
#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'oscar-corpus/OSCAR-2301'
#      dataset_name: fr
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#      filtering_function:
#        (): dataset_collection.french.oscar.OscarFilter
# Code
#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'bigcode/starcoderdata'
#      dataset_kwargs:
#        data_dir: python
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'bigcode/starcoderdata'
#      dataset_kwargs:
#        data_dir: markdown
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'bigcode/starcoderdata'
#      dataset_kwargs:
#        data_dir: jupyter-scripts-dedup-filtered
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'bigcode/starcoderdata'
#      dataset_kwargs:
#        data_dir: jupyter-structured-clean-dedup
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'bigcode/starcoderdata'
#      dataset_kwargs:
#        data_dir: json
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'teven/stackexchange'
#      dataset_kwargs:
#        data_dir: json
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100
#    - (): dataset_construction.DatasetConfig
#      dataset_path: 'vikp/pypi_clean'
#      dataset_kwargs:
#        data_dir: json
#      build_test_set_from_train: true
#      num_train_examples:  1000
#      num_test_examples: 100


# Tokenizer identifier, given as a quoted string at the top level (outside data_mix).
# NOTE(review): looks like a Hugging Face Hub model id — confirm how the consumer resolves it.
tokenizer: "mistralai/Mistral-7B-v0.1"
