# Basic hyperparameters for standard BERT pretraining.
# The goal here is to keep "impl" implementation details separate from "train" hyperparameters.

# Identifier of this training configuration.
name: bert-base

# Default selections for the `optim` and `optim_mod` groups
# (presumably a Hydra defaults list -- confirm against the config loader).
defaults:
  - optim: adam
  - optim_mod: disabled

# Layers listed here receive no weight decay.
limited_decay_keys:
  - bias
  - LayerNorm.bias
  - LayerNorm.weight
  - norm

# Optimizer hyperparameters for the `optim` group.
optim:
  # Floats in exponent notation carry an explicit decimal point: YAML 1.1
  # resolvers (e.g. plain PyYAML) read `2e-3` / `1e-6` as strings, not floats.
  lr: 2.0e-3
  eps: 1.0e-6
  weight_decay: 0.01
  # Two-element list of beta coefficients.
  betas:
    - 0.9
    - 0.98

# Steps:
# NOTE(review): 0.06 reads like a fraction of the total steps (i.e. 6%),
# not 0.06 percentage points -- confirm against the scheduler code.
warmup_steps: 0.06
cooldown_steps: 0
# Counted in microbatch steps. This is an upper limit that is usually never reached.
steps: 900_000
scheduler: budget-linear

# Training settings:
# 4096 is the value intended for mbs=128 (presumably the micro-batch size -- confirm).
batch_size: 4096
batch_size_ramp: 0

# Explicit null instead of a bare empty value, so it is clear that no value is
# set on purpose (presumably this disables gradient clipping -- confirm).
gradient_clipping: null
# Default BERT trains with dropout layers enabled in pretrain.
pretrain_in_train_mode: true

# Pretraining objective settings. Booleans use the canonical lowercase
# `true` / `false` spelling.
objective:
  name: masked-lm
  mlm_probability: 0.15
  use_80_20_rule: true
  disable_mlm: false
  token_drop: 0.0
reverse_dataset_order: false

# `${budget}` is an interpolation that pulls in a `budget` value defined elsewhere
# (presumably OmegaConf/Hydra, resolved from the config root -- confirm).
budget: ${budget}
