---
# Flat key/value settings; appears to be a training-run configuration.
# NOTE(review): the code that consumes this file is not visible here, so the
# notes on individual keys describe apparent intent — confirm against it.

# presumably -1 means "no epoch limit" so that `steps` bounds the run — confirm
epochs: -1
batch_size: 16  # NOTE(review): per-device or global? verify against consumer
accelerator: gpu
steps: 375000
# plain string, not a number; looks like a bf16 mixed-precision mode — confirm
precision: bf16-mixed
use_gradient_checkpointing: false
compile_modules: true
num_gpus: 2
size: base  # presumably selects a model size variant — verify against consumer
average_top_k_layers: 8