# Text normalization toggles.
# Warning (from the original author's note): if
# cond_remove_non_printing_characters is set to true, the characters it
# treats as non-printing include " " and "\n" -- presumably spaces and
# newlines get removed too; confirm against the consumer before enabling.
cond_remove_non_printing_characters: false
cond_standardize_whitespace: true
# --- Node-level filters ---
# Each `cond_check_*` flag toggles one check; the keys listed right after it
# are that check's parameters. Judging by the names, `*_min_cutoff` is a lower
# bound and `*_max_cutoff` an upper bound (TODO: confirm inclusivity in the
# code that consumes this file).

# Word count. Integers are written without `_` digit separators: only YAML 1.1
# loaders resolve `5_000` as an integer, YAML 1.2 loaders read it as a string.
cond_check_number_words_node_level: true
number_words_node_level_min_cutoff: 4
number_words_node_level_max_cutoff: 5000

# Character repetition ratio.
cond_check_character_repetition_ratio_node_level: true
character_repetition_length_node_level: 10
character_repetition_node_level_max_cutoff: 0.1

# Word repetition ratio.
cond_check_word_repetition_ratio_node_level: true
word_repetition_length_node_level: 5
word_repetition_node_level_max_cutoff: 0.1

# Special character ratio.
cond_check_special_character_ratio_node_level: true
special_character_ratio_node_level_max_cutoff: 0.3

# Stopword ratio.
cond_check_stopword_ratio_node_level: true
stopword_ratio_node_level_min_cutoff: 0.3

# Flagged word ratio.
cond_check_flagged_word_ratio_node_level: true
flagged_word_ratio_node_level_max_cutoff: 0.02

# Punctuation ratio. The `min_number_words_to_check_*` key presumably gates
# the check on a minimum word count -- verify against the consumer.
cond_check_punctuation_ratio_node_level: true
min_number_words_to_check_punctuation_ratio_node_level: 12
punctuation_ratio_node_level_min_cutoff: 0.001

# Common word ratio.
cond_check_common_word_ratio_node_level: true
common_word_ratio_node_level_min_cutoff: 0.8

# Language identification: switched off at node level. The cutoff is kept
# alongside the flag (presumably unused while the flag is false).
cond_check_lang_id_node_level: false
lang_id_node_level_min_cutoff: 0.8

# Perplexity score: switched off at node level. The cutoff is kept alongside
# the flag (presumably unused while the flag is false).
cond_check_perplexity_score_node_level: false
perplexity_score_node_level_max_cutoff: 1500
# --- Image-count filter ---
# `cond_check_number_images` toggles the check; the two cutoffs are its
# parameters. The names suggest a lower and an upper bound on the number of
# images (TODO: confirm inclusivity and scope in the consuming code).
cond_check_number_images: true
number_images_min_cutoff: 1
number_images_max_cutoff: 30
# --- Document-level filters ---
# Each `cond_check_*` flag toggles one check; the keys listed right after it
# are that check's parameters. Judging by the names, `*_min_cutoff` is a lower
# bound and `*_max_cutoff` an upper bound (TODO: confirm inclusivity in the
# code that consumes this file).

# Word count. Integers are written without `_` digit separators: only YAML 1.1
# loaders resolve `10_000` as an integer, YAML 1.2 loaders read it as a string.
cond_check_number_words_doc_level: true
number_words_doc_level_min_cutoff: 10
number_words_doc_level_max_cutoff: 10000

# Character repetition ratio.
cond_check_character_repetition_ratio_doc_level: true
character_repetition_length_doc_level: 10
character_repetition_doc_level_max_cutoff: 0.2

# Word repetition ratio.
cond_check_word_repetition_ratio_doc_level: true
word_repetition_length_doc_level: 5
word_repetition_doc_level_max_cutoff: 0.3

# Special character ratio.
cond_check_special_character_ratio_doc_level: true
special_character_ratio_doc_level_max_cutoff: 0.3

# Stopword ratio.
cond_check_stopword_ratio_doc_level: true
stopword_ratio_doc_level_min_cutoff: 0.35

# Flagged word ratio.
cond_check_flagged_word_ratio_doc_level: true
flagged_word_ratio_doc_level_max_cutoff: 0.01

# Punctuation ratio.
cond_check_punctuation_ratio_doc_level: true
punctuation_ratio_doc_level_min_cutoff: 0.03

# Common word ratio.
cond_check_common_word_ratio_doc_level: true
common_word_ratio_doc_level_min_cutoff: 0.90

# Language identification.
cond_check_lang_id_doc_level: true
lang_id_doc_level_min_cutoff: 0.8

# Perplexity score.
cond_check_perplexity_score_doc_level: true
perplexity_score_doc_level_max_cutoff: 1500
