# Dataset configuration identified by the name `proofpile`.
name: proofpile
# One `sources` entry holding a single-item list: `proofpiledata`.
# NOTE(review): this has the shape of a config-group selection (e.g. a Hydra
# defaults list) - confirm against the loader that consumes this file.
defaults:
  - sources:
      - proofpiledata

# Preprocessing
# Text-normalization switches; all three are turned off in this config.
normalizer:
  force_lowercase: False
  strip_accents: False
  force_english_keyboard: False
# Tokenizer identifier.
# NOTE(review): looks like a model-hub id (org/name) - confirm how the loader resolves it.
tokenizer: EleutherAI/llemma_34b
# 49152 = 3 * 2^14. Alternative value kept for reference: 32768 (= 2^15).
# NOTE(review): confirm this value is consistent with the vocabulary of the
# tokenizer named above, or is intentionally different from it.
vocab_size: 49152

# Dataset Formation
# Length of each sequence (presumably counted in tokens - confirm against the data pipeline).
seq_length: 512
include_eot_token_in_corpus: True

# Select only this many examples from the raw dataset. 10e5 = 1,000,000.
# Alternative kept for reference: 10e6 (= 10,000,000). 20e6 are OK if all are chosen.
# Oversample if filtering.
# NOTE(review): exponent notation without a decimal point (10e5, 5e4) is a float under
# YAML 1.2 but a plain string under YAML 1.1 resolvers such as PyYAML's default one -
# confirm the loader in use yields a number here.
max_entries_in_raw_dataset: 10e5
# Select only this many tokenized sequences. 5e4 = 50,000. Alternative kept for reference: 5e5.
max_seq_in_tokenized_dataset: 5e4
# max_seq_in_tokenized_dataset should be just slightly more than
# budget * 60 * 60 * expected tokens/sec for the single epoch of training.
# NOTE(review): that product reads as a token count (with budget in hours); presumably it
# has to be divided by seq_length to compare against a number of sequences - confirm.

# Data Cleaning:
# Both cleaning switches (remove_trash, deduplicate_entries) are off in this config.
# NOTE(review): trash_cutoff and deduplication_threshold are presumably only consulted
# when their respective switch is True; their units/scales are not defined here - confirm.
remove_trash: False
trash_cutoff: 0.25
deduplicate_entries: False
deduplication_threshold: 75

# Data Order:
# Ordering mode applied to the data; set to `randomized` for now.
ordering: randomized

# Validation Split
# Number of sequences to reserve for validation (4096 = 2^12).
validation_seqs: 4096
