# Training hyperparameters: a base learning rate plus the optimizer and
# learning-rate scheduler to use. Each component is described by a `name`,
# a list of positional `args`, and a mapping of keyword `kwargs`.
# NOTE(review): presumably the training code resolves `name` to a class and
# calls it with these args/kwargs - confirm against the config loader.
train:
  lr: 1.0e-3

  optimizer:
    name: DSGD
    args: []
    kwargs:
      # Alternative values kept for reference (disabled):
      # beta: 0.9
      # weight_decay: 5.0e-3
      beta: 0.999
      weight_decay: 5.0e-8

  scheduler:
    # NOTE(review): the keyword names below look like those of timm's
    # CosineLRScheduler - confirm which implementation is actually loaded.
    # No cycle length (e.g. `t_initial`) is set here and `args` is empty, so
    # it is presumably supplied by the training code - TODO confirm.
    name: CosineLRScheduler
    args: []
    kwargs:
      lr_min: 1.0e-4
      # Unit of `warmup_t` (epochs vs. steps) is not visible here - verify.
      warmup_t: 10
      warmup_lr_init: 5.0e-5
      warmup_prefix: true