# Optimizer settings: PSGD with a cosine learning-rate schedule.
# NOTE(review): the `${.key}` syntax looks like OmegaConf/Hydra relative
# interpolation -- confirm which loader consumes this file.

# Label assembled from the sibling `optimizer`, `learning_rate` and
# `scheduler` values defined below.
name: "${.optimizer}_${.learning_rate}_${.scheduler}"
optimizer: "psgd"
# Single-element list; presumably the momentum coefficient -- TODO confirm.
betas: [0.9]
# Presumably the preconditioner is refreshed every N steps, with the
# interval adjusted by the decay factor -- verify against the optimizer.
precondition_frequency: 64
precondition_frequency_decay: 0.999
# Presumably an upper bound on preconditioner dimension -- TODO confirm.
precondition_size: 4096
merge_dims: false
# Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. plain
# PyYAML) resolve a bare `1e-5` as a string rather than a float, while
# `1.0e-5` is a float under both YAML 1.1 and 1.2.
learning_rate: 1.0e-5
weight_decay: 0.0
# Zero here presumably disables warmup -- TODO confirm units (steps?).
warmup: 0
scheduler: "cosine"
# Presumably the gradient-clipping threshold -- TODO confirm norm vs. value.
grad_clip: 1
