# Training split: one numeric value per dataset, keyed
# `train_the_pile_<subset>` (17 entries).
# NOTE(review): the values look like per-dataset sampling weights; they sum
# to 0.7868, not 1.0 -- confirm whether the consumer renormalizes them.
# NOTE(review): key order is not alphabetical here; confirm whether the
# consumer depends on it before reordering.
train:
  train_the_pile_arxiv: 0.1052
  train_the_pile_freelaw: 0.0386
  train_the_pile_nih_exporter: 0.0052
  train_the_pile_pubmed_central: 0.1071
  train_the_pile_wikipedia_en: 0.0919
  train_the_pile_dm_mathematics: 0.0198
  train_the_pile_github: 0.0427
  train_the_pile_philpapers: 0.0027
  train_the_pile_stackexchange: 0.0929
  train_the_pile_enron_emails: 0.0030
  train_the_pile_gutenberg_pg_19: 0.0199
  train_the_pile_pile_cc: 0.1121
  train_the_pile_ubuntu_irc: 0.0074
  train_the_pile_europarl: 0.0043
  train_the_pile_hackernews: 0.0075
  train_the_pile_pubmed_abstracts: 0.0845
  train_the_pile_uspto_backgrounds: 0.0420
# Validation split: one entry per dataset, keyed `valid_the_pile_<subset>`
# (17 entries, listed alphabetically). Every entry is set to 1.0, and the
# subsets are the same 17 that appear under `train`.
# NOTE(review): presumably 1.0 means each validation set is used in full /
# weighted equally -- confirm against the consumer.
valid:
  valid_the_pile_arxiv: 1.0
  valid_the_pile_dm_mathematics: 1.0
  valid_the_pile_enron_emails: 1.0
  valid_the_pile_europarl: 1.0
  valid_the_pile_freelaw: 1.0
  valid_the_pile_github: 1.0
  valid_the_pile_gutenberg_pg_19: 1.0
  valid_the_pile_hackernews: 1.0
  valid_the_pile_nih_exporter: 1.0
  valid_the_pile_philpapers: 1.0
  valid_the_pile_pile_cc: 1.0
  valid_the_pile_pubmed_abstracts: 1.0
  valid_the_pile_pubmed_central: 1.0
  valid_the_pile_stackexchange: 1.0
  valid_the_pile_ubuntu_irc: 1.0
  valid_the_pile_uspto_backgrounds: 1.0
  valid_the_pile_wikipedia_en: 1.0
# Top-level scalar settings that follow the train/valid maps.
# NOTE(review): the names suggest parameters of the procedure that generated
# or consumes the mixture weights (random seed, temperature, a strength
# range, a weight floor, a sample multiplier, a usage cap). Their exact
# semantics are not visible in this file -- confirm against the consumer.
seed: 42
temperature: 0.5
# min_strength / max_strength bound a range of 0.1 .. 5.0.
min_strength: 0.1
max_strength: 5.0
# Every `train` value in this file (smallest: 0.0027) is above this 0.0002.
# NOTE(review): presumably a lower bound on a non-zero weight -- confirm.
minimum: 0.0002
sample_multiplier: 100
maximum_usage: 15

# Model selection and training-run settings.
model_name: tinyllama_60M_kv_head_bias
# NOTE(review): total_devices and num_of_devices are both 4; confirm how the
# consumer distinguishes them (e.g. cluster-wide vs. per-node) before
# changing only one.
total_devices: 4
num_of_devices: 4
# 512 = micro_batch_size (16) x num_of_devices (4) x 8.
# NOTE(review): presumably the factor 8 is gradient-accumulation steps --
# confirm.
global_batch_size: 512
micro_batch_size: 16
# Save and eval intervals are both 1000 steps, i.e. 20 intervals in
# max_step (20000).
max_step: 20000
save_step_interval: 1000
eval_step_interval: 1000
# NOTE(review): min_lr equals learning_rate (0.0004), so any schedule that
# decays between these two values would be flat after warmup -- confirm a
# constant learning rate is intended.
learning_rate: 0.0004
min_lr: 0.0004
warmup_steps: 100