# You can also draw in raw dumps of Wikipedia for almost any language.
# Change sources.raw_wiki_dump.language to preprocess a different language.
# Name of this data config.
name: wiki_dump
# Defaults list: selects the single entry `raw_wiki_dump` under `sources`.
defaults:
  - sources: [raw_wiki_dump]

# Preprocessing
# NOTE(review): the meaning of each key is inferred from its name only;
# confirm against the preprocessing code that consumes this config.
normalizer:
  # All four normalizer switches are set to False in this config.
  force_lowercase: False
  strip_accents: False
  force_english_keyboard: False
  whitespace_escape: False
# Presumably selects a byte-pair-encoding tokenizer; other accepted values are
# not visible in this file.
tokenizer: BPE
# 65536 == 2**16.
vocab_size: 65536

# Dataset Formation
seq_length: 128
include_cls_token_in_corpus: False
include_sep_token_in_corpus: True
use_type_ids: False
# The two limits below use exponent notation. They are written with an explicit
# decimal point and a signed exponent so that YAML 1.1 resolvers (e.g. plain
# PyYAML), which load `1e14` / `85e6` as strings, also read them as floats.
# The numeric values are unchanged (1e14 and 85e6).
max_entries_in_raw_dataset: 1.0e+14  # Draw in as much data as possible
max_seq_in_tokenized_dataset: 85.0e+6  # Select only this many tokenized sequences.
# max_seq_in_tokenized_dataset should be just slightly more than budget * 60 * 60 * expected tokens/sec for the single epoch of training
# NOTE(review): budget * 60 * 60 * tokens/sec yields a token count, whereas this
# key counts sequences; presumably the product still has to be divided by
# seq_length. Confirm against the training code.

# Data Cleaning:
# NOTE(review): the semantics of these keys are not visible in this file;
# confirm against the cleaning code before relying on the notes below.
named_entity_simplification: False
remove_whitespaces: False
# `self` is a plain string here, not a boolean; which other values are accepted
# cannot be told from this file.
remove_trash: self
trash_cutoff: 0.3 # Less aggressive, might have less text for some languages
deduplicate_entries: False
# Presumably only consulted when deduplicate_entries is enabled; verify.
deduplication_threshold: 75

# Data Order:
# Only the value `randomized` appears here; other supported orderings are not
# visible in this file.
ordering: randomized
