# Training-run settings.
#
# NOTE(review): the `???` values and the `${...}` reference below look like
# OmegaConf/Hydra syntax, where `???` marks a mandatory value that must be
# supplied by another config or a command-line override, and `${path}` is an
# interpolation resolved on access. Confirm against the code that loads
# this file.
#
# The group labels are inferred from the key names only; verify the exact
# meaning and units of each key against the consuming training script.

# Distributed-training switch (presumably a boolean; no default given here).
ddp: ???

# Evaluation and logging cadence.
eval_interval: ???
log_interval: ???
eval_iters: ???
eval_only: false

# Checkpointing.
always_save_checkpoint: ???
always_interval: ???

# Target device string.
device: cuda:0

# Batching.
gradient_accumulation_steps: ???
batch_size: ???

# Optimizer settings.
learning_rate: 0.0001
max_iters: ???
weight_decay: 0.1
beta1: 0.9
beta2: 0.95
grad_clip: 1.0

# Learning-rate schedule.
decay_lr: true
warmup_iters: ???
min_lr: 0.0
# Follows max_iters via interpolation rather than a second hand-set value.
# The path is absolute (`train.max_iters`), so it only resolves when this
# mapping sits under a top-level `train` key in the composed config —
# TODO confirm that is how this file is mounted.
lr_decay_iters: ${train.max_iters}
