# Data-mix definition: one DataMix object plus the DatasetConfig entries listed
# under `datasets`.
# NOTE(review): the `()` key presumably names the class the config loader
# instantiates, with the sibling keys passed as its arguments - confirm against
# the loader.
data_mix:
  (): dataset_construction.DataMix
  name: "tok_fr"
  shuffle: false
  compute_dataset_stats: true
  local_save_dir: data/
  load_from_local_save_dir: false
  keep_separated_datasets_in_dataset_dict: true
  deduplicate_test_set: false
  ngram_path_for_extra_deduplication: null
  max_shard_size: "2GB"
  # NOTE(review): the `train[:N%]` values used below look like Hugging Face
  # `datasets` split-slicing syntax (first N% of the train split) - confirm how
  # the loader interprets them.
  datasets:
    # Translation datasets
    # Disabled entries (kept commented out; strip the leading "# " to re-enable):
    # - (): dataset_construction.DatasetConfig
    #   dataset_path: <anon>opus100-en-fr
    #   test_split: "test"
    #   build_test_set_from_train: false
    #   num_train_examples: 1000
    #   num_test_examples: 1000
    # - (): dataset_construction.DatasetConfig
    #   dataset_path: <anon>europarl-en-fr
    #   build_test_set_from_train: true
    #   num_test_examples: 1000
    - (): dataset_construction.DatasetConfig
      dataset_path: <anon>wmt-en-fr
      train_split: "train[:10%]"
      test_split: "test"
    # Theses (the only entry here with both a filtering and a preprocessing function)
    - (): dataset_construction.DatasetConfig
      dataset_path: <anon>theses_fr_2013_2023
      build_test_set_from_train: true
      filtering_function:
        (): dataset_collection.french.theses.ThesisFilter
      preprocessing_function:
        (): dataset_collection.french.theses.ThesisMapper
    # DILA data (Nicolas-BZRD/*_opendata)
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/LEGI_opendata  # <anon>dila_legifrance
      build_test_set_from_train: true
      train_split: "train[:20%]"
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/BALO_opendata
      build_test_set_from_train: true
      train_split: "train[:10%]"
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/JADE_opendata
      build_test_set_from_train: true
      train_split: "train[:10%]"
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/DOLE_opendata
      build_test_set_from_train: true
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/SARDE_opendata
      build_test_set_from_train: true
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/QR_opendata
      build_test_set_from_train: true
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/JORF_opendata
      build_test_set_from_train: true
      train_split: "train[:10%]"
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/INCA_opendata
      build_test_set_from_train: true
      train_split: "train[:10%]"
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/ACCO_opendata
      build_test_set_from_train: true
      train_split: "train[:10%]"
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/KALI_opendata
      build_test_set_from_train: true
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/DEBATS_opendata
      build_test_set_from_train: true
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/CNIL_opendata
      build_test_set_from_train: true
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/CAPP_opendata
      build_test_set_from_train: true
      train_split: "train[:10%]"
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/CASS_opendata
      build_test_set_from_train: true
    - (): dataset_construction.DatasetConfig
      dataset_path: Nicolas-BZRD/CONSTIT_opendata
      build_test_set_from_train: true
      # needs a preprocessor to have an id_column
      # NOTE(review): unclear whether the note above refers to CONSTIT_opendata
      # or to the PDF dataset that follows - confirm.
    # PDF dataset
    - (): dataset_construction.DatasetConfig
      dataset_path: <anon>illuin_layout_dataset_text_only
      build_test_set_from_train: true
    # Wiki
    - (): dataset_construction.DatasetConfig
      dataset_path: <anon>wikisource_fr
      train_split: "train[:10%]"
      build_test_set_from_train: true
    - (): dataset_construction.DatasetConfig
      dataset_path: wikipedia
      dataset_name: "20220301.fr"
      build_test_set_from_train: true
    # Internet dumps
    - (): dataset_construction.DatasetConfig
      dataset_path: oscar-corpus/OSCAR-2301
      dataset_name: "fr"
      train_split: "train[:10%]"
      build_test_set_from_train: true
      num_test_examples: 10000
      filtering_function:
        (): dataset_collection.french.oscar.OscarFilter
