# Label built from three sibling keys of this node using relative
# interpolation (`${.key}`; OmegaConf-style syntax -- confirm the loader).
name: '${.optimizer}_${.learning_rate}_${.scheduler}'

# Optimizer choice and its hyperparameters.
optimizer: 'adamw'
betas: [0.9, 0.99]
# NOTE(review): 0.0 looks like a placeholder meant to be overridden -- confirm.
learning_rate: 0.0
weight_decay: 0.0

# Learning-rate schedule.
# NOTE(review): unit of `warmup` (steps vs. epochs) is not visible here.
warmup: 0
scheduler: 'cosine'

# Gradient clipping value (kept as an integer, exactly as before).
grad_clip: 1


# Staged-training settings: one learning rate plus optional optimizer
# overrides for two parameter groups ("common" and "new").
staged_training:
  # LR for COMMON/pretrained parameters.
  # Written with an explicit decimal point: a bare `1e-5` is read as a
  # *string* by YAML 1.1 loaders (e.g. plain PyYAML), while `1.0e-5` is a
  # float under every common loader.
  threshold_lr: 1.0e-5

  # Optional: Override settings for common parameters
  common_params_kwargs:
    weight_decay: 0.0
    betas: [0.9, 0.99]

  # Optional: Override settings for new parameters
  new_params_kwargs:
    weight_decay: 0.0
    betas: [0.9, 0.99]