# Version 2 of the changes to the BERT training hyperparameters.
# Moves to large one-cycle learning rates and batch-size ramping.

# Identifier of this hyperparameter configuration.
name: bert-o2

# Default selections for the `optim` and `optim_mod` groups.
# NOTE(review): presumably a Hydra-style defaults list — confirm against the config loader.
defaults:
  - optim: adam
  - optim_mod: disabled

# Optimizer hyperparameters set by this configuration.
optim:
  lr: 1e-3
  weight_decay: 0.01

# Keys for which no weight decay is applied.
limited_decay_keys:
  - bias
  - LayerNorm.bias
  - LayerNorm.weight
  - norm

# Step counts and scheduler selection:
warmup_steps: 0
cooldown_steps: 0
steps: 600000 # these are microbatch steps
# NOTE(review): presumably the 1-cycle schedule mentioned in the file header — confirm.
scheduler: budget-one-cycle

# Training setting:
batch_size: 1536
# NOTE(review): presumably the span over which the batch size is ramped up
# (see "batch size ramping" in the file header) — confirm the unit.
batch_size_ramp: 300000

# Explicit null: no clipping value is configured here.
gradient_clipping: null
pretrain_in_train_mode: false # default BERT trains with dropout layers enabled in pretrain

# Training objective settings.
objective:
  name: masked-lm
  mlm_probability: 0.15
  use_80_20_rule: true
  disable_mlm: false
  token_drop: 0.0

reverse_dataset_order: false

# NOTE(review): `${budget}` looks like an OmegaConf/Hydra-style interpolation of a
# `budget` value defined elsewhere in the composed config — confirm where it is set.
budget: ${budget}
# gradinit:
#   enabled: False
# eta: 1.0
# tau: 1e-3 # step size
# steps: 50
# min_scale: 1e-3
# max_scale: 1e3
# step_type: sign-grad # sign-grad or grad
# second_order: False
