---
# Flat key/value settings. Judging by the key names these configure a model
# training run.
# NOTE(review): the code that consumes this file is not visible from here;
# confirm the exact meaning and accepted values of each key against it.

# Data loading (presumably forwarded to a data loader; verify against caller).
batch_size: 3
n_epochs: 1
num_workers: 1
pin_memory: true

# Optimizer.
# Exponent-notation floats are written with an explicit decimal point on
# purpose: YAML 1.1 loaders (e.g. PyYAML) resolve a bare `5e-5` as the
# *string* "5e-5", whereas `5.0e-5` is a float under both YAML 1.1 and 1.2.
optimizer: "adam"
learning_rate: 5.0e-5
weight_decay: 1.0e-5

# Learning-rate schedule.
# `min_lr` looks like the floor used by the "cosine_with_min_lr" scheduler
# TODO confirm. Keep the decimal point here too (see note above).
scheduler: "cosine_with_min_lr"
min_lr: 4.0e-6

# Gradient clipping; `clip_to` is presumably the clipping threshold applied
# when `clip_grad` is true. Verify whether it is a norm or a value bound.
clip_grad: true
clip_to: 1.0

# Loss configuration. "none" is a quoted string here, not a YAML null.
gradnorm_balancer: "none"
loss_type: "mse"

# Checkpointing.
use_latest_checkpoint: false