# Optimizer settings in Hydra's `_target_` style: the keys other than
# `_target_` and `_convert_` are meant to be passed as keyword arguments
# to torch.optim.Adam.
_target_: torch.optim.Adam
# Convert config containers (e.g. the `betas` list below) to plain Python
# objects before they are handed to the target.
_convert_: 'all'

# Learning rate. Written with an explicit mantissa dot so that YAML 1.1
# loaders (e.g. plain PyYAML) resolve it as a float, not the string "5e-5".
lr: 5.0e-5
# L2 norm penalty. Same float-notation precaution as `lr`.
weight_decay: 5.0e-4
# Coefficients used for computing running averages of the gradient and
# its square.
betas: [0.9, 0.999]
