# Run-level settings: random seed and input sequence length.
# NOTE(review): the unit of `seq_length` (presumably tokens) is not visible
# in this file — confirm against the code that consumes it.
seed: 0
seq_length: 256

# Batch layout: samples per device and number of devices.
# NOTE(review): the effective batch is presumably
# per_device_batch_size * n_device * gradient_accumulation_steps —
# verify with the training loop that reads this config.
per_device_batch_size: 8
n_device: 1


# --- Optimizer -------------------------------------------------------------
optim: adamw_torch_fused
adam_beta1: 0.9
adam_beta2: 0.95
weight_decay: 0.1
# Kept in plain decimal form on purpose: exponent notation such as `1e-5`
# is loaded as a string by YAML 1.1 resolvers (e.g. PyYAML).
learning_rate: 0.00001

# --- Schedule and logging --------------------------------------------------
# NOTE(review): `steps` is presumably the total number of training steps and
# `logging_steps` a logging interval — confirm with the consumer.
steps: 500
warmup_steps: 0
logging_steps: 10

# --- Data ------------------------------------------------------------------
# NOTE(review): semantics of `random_concat_ratio` are not visible in this
# file; presumably a fraction in [0, 1] — confirm with the data pipeline.
random_concat_ratio: 0.2


# Evaluation and checkpoint cadence; both are set to the same value.
# NOTE(review): presumably intervals measured in training steps — confirm.
eval_steps: 100
save_steps: 100


# Memory / gradient handling.
gradient_checkpointing: false
gradient_accumulation_steps: 4
max_grad_norm: 1.0
# NOTE(review): `ep_size` is presumably an expert-parallel group size; it is
# 2 while `n_device` is 1 in this file — confirm the combination is intended.
ep_size: 2