---
# Model hyperparameters.
# NOTE(review): the grouping labels below are inferred from the key names
# only; confirm the exact semantics against the code that loads this file.

# Layer widths.
# NOTE(review): ffn_hidden_size (4128) is not 4 * hidden_size (4096);
# confirm that this value is intentional.
hidden_size: 1024
ffn_hidden_size: 4128

# Layer count.
num_layers: 12

# Attention heads.
# hidden_size / num_attention_heads = 1024 / 8 = 128.
# num_kv_heads is set equal to num_attention_heads in this config.
num_attention_heads: 8
num_kv_heads: 8

# Sequence length.
seq_length: 2048