# Optimizer hyperparameter settings.
# NOTE(review): the `${.key}` references look like OmegaConf/Hydra relative
# interpolation of sibling keys in this mapping — confirm against the loader.

# Label assembled from the optimizer, learning_rate and scheduler values below.
name: "${.optimizer}_${.learning_rate}_${.scheduler}"
optimizer: "soap"
# Three coefficients; which quantity each one controls is defined by the
# optimizer implementation — TODO confirm the expected order.
betas: [0.9, 0.99, 0.99]
# NOTE(review): presumably preconditioner settings consumed by the optimizer;
# their units (e.g. steps, max dimension) are not visible here — verify.
precondition_frequency: 16
precondition_warmup: 0
precondition_size: 4096
merge_dims: false
# Written with an explicit mantissa dot on purpose: YAML 1.1 loaders such as
# PyYAML resolve a bare `1e-4` as the string "1e-4" instead of a float.
learning_rate: 1.0e-4
weight_decay: 0.0
warmup: 0
scheduler: "cosine"
grad_clip: 1
