# Basic hyperparameters for normal BERT pretraining.
# Working hard here to keep "impl" implementation details separate from "train" abstract hyperparameters.

# Name of this hyperparameter configuration.
name: common

# Default selections for the `optim` and `optim_mod` entries.
# NOTE(review): presumably Hydra-style config-group defaults, so entry order
# and option names matter to the loader - confirm against the consumer.
defaults:
  - optim: adam_classic
  - optim_mod: disabled

# Optimizer settings specified directly in this file.
# NOTE(review): presumably merged over the `optim: adam_classic` default
# selected in `defaults` - confirm against the config loader.
optim:
  # Learning rate. Written with an explicit mantissa dot so that YAML 1.1
  # loaders (e.g. PyYAML) resolve it as a float; a bare `1e-4` is read as the
  # string "1e-4" by such loaders.
  lr: 1.0e-4

# No weight decay for these layers.
limited_decay_keys: [bias, LayerNorm.bias, LayerNorm.weight, norm]

# Steps:
# Both `warmup_steps` and `steps` are counted in microbatch steps.
warmup_steps: 80_000 # These are microbatch steps
cooldown_steps: 0
# Microbatch steps at bs=64. The original 1 million steps for BERT are
# recovered via the factor 512/64=8 (1 million x 8 = 8 million).
steps: 8_000_000
scheduler: polynomial-decay

# Training setting:
# Full sequence as input to the model (interpolates `data.seq_length`).
stream_depth: ${data.seq_length}
batch_size: 512
batch_size_ramp: 0

# Explicit null: no gradient clipping value is configured here. A bare
# `gradient_clipping:` also loads as null, but reads like a forgotten value.
gradient_clipping: null
pretrain_in_train_mode: true  # default BERT trains with dropout layers
reverse_dataset_order: false

# Both values are interpolations of the identically named `budget` and
# `overall_budget` keys.
# NOTE(review): presumably resolved against the root of the composed config
# (this file being nested under a group) - confirm.
budget: ${budget}
overall_budget: ${overall_budget}
