# The "default" BERT dataset, assembled from the bookcorpus and wikipedia sources.
name: bookcorpus-wikitext
defaults:
  - sources: [bookcorpus, wikipedia]

# Preprocessing
# The normalizer section is ignored; the default bert normalizer is used
# instead. Each key is left null, and the trailing comment keeps the value
# noted for it (presumably what the default normalizer applies).
normalizer:
  force_lowercase: null  # True
  strip_accents: null  # True
  force_english_keyboard: null  # False
  whitespace_escape: null  # False
tokenizer: bert-base-uncased
vocab_size: 30522

# Dataset Formation
seq_length: 512
# The include_*_token_in_corpus keys are left null: they are ignored and the
# default post_processor is used instead.
include_cls_token_in_corpus: null  # True, but ignored and the default post_processor is used
include_sep_token_in_corpus: null  # True, but ignored and the default post_processor is used
use_type_ids: null  # True
# The two caps below are spelled 1.0e+14 (decimal point and signed exponent)
# so that every YAML loader reads them as floats; a bare 1e14 is read as the
# string "1e14" by YAML 1.1 loaders such as PyYAML.
max_entries_in_raw_dataset: 1.0e+14  # Select no more than this number of examples from the dataset
max_seq_in_tokenized_dataset: 1.0e+14  # Select only this many tokenized sequences.
# max_seq_in_tokenized_dataset should be just slightly more than
# budget * 60 * 60 * expected tokens/sec for the single epoch of training.
# NOTE(review): that product is a token count while this key counts sequences;
# presumably it should be divided by seq_length - confirm.

# Data Cleaning:
# Every cleaning toggle below is switched off. trash_cutoff and
# deduplication_threshold presumably only matter once remove_trash /
# deduplicate_entries are enabled - confirm against the consumer.
named_entity_simplification: false
remove_whitespaces: false
remove_trash: false
trash_cutoff: 0.3
deduplicate_entries: false
deduplication_threshold: 100

# Data Order:
# Currently randomized; a curriculum ordering could be used instead.
ordering: randomized
