# Training-run hyperparameters, stored as a flat key/value mapping.
# NOTE(review): the script that consumes this file is not visible from here;
# any meaning attributed to a key below is inferred from its name and should
# be confirmed against that script.

# Reproducibility seed and sequence length (presumably in tokens; confirm).
seed: 0
seq_length: 256

# Batch layout.
# NOTE(review): per_device_batch_size originally carried a bare "#1" note,
# presumably an alternative value of 1 used in another run; confirm or drop.
per_device_batch_size: 32
n_device: 1

# Optimizer and schedule settings.
optim: adamw_torch_fused
steps: 500
# 1e-5, written in plain decimal so every YAML loader reads it as a float.
learning_rate: 0.00001
weight_decay: 0.1
warmup_steps: 0
logging_steps: 10
adam_beta1: 0.9
adam_beta2: 0.95
# NOTE(review): looks like a data-pipeline ratio (0.2 = 20%); the exact
# semantics are defined by the consumer; confirm.
random_concat_ratio: 0.2

# Evaluation and checkpoint cadence; both values are 100, and steps is 500.
eval_steps: 100
save_steps: 100

# Memory and gradient handling.
gradient_checkpointing: false
gradient_accumulation_steps: 1
max_grad_norm: 1.0
# NOTE(review): if ep_size is an expert-parallel group size, a value of 2 may
# not be compatible with n_device: 1 above; confirm with the consumer.
ep_size: 2