# Model hyperparameter configuration.
# Keys are grouped by topic for readability; mapping order carries no meaning.
# NOTE(review): the values resemble a Llama-3.1-8B-style setup -- confirm
# against the checkpoint / loader that consumes this file.

# --- Model family and precision ---
model_arch: 'llama'
bf16: true

# --- Transformer dimensions ---
num_layers: 32
hidden_size: 4096
ffn_hidden_size: 14336
padded_vocab_size: 128256
init_method_std: 0.02

# --- Attention ---
num_attention_heads: 32
group_query_attention: true
# 32 attention heads over 8 query groups.
num_query_groups: 8
# Equals hidden_size / num_attention_heads (4096 / 32).
kv_channels: 128
attention_dropout: 0.0
masked_softmax_fusion: false

# --- Positional encoding ---
position_embedding_type: 'rope'
rotary_base: 500000
max_position_embeddings: 131072
use_rope_scaling: true
rope_scaling_factor: 8.0

# --- Normalization, activation, dropout ---
normalization: 'RMSNorm'
# Keep the decimal point and signed exponent so that YAML 1.1 loaders
# resolve this value as a float rather than a string.
norm_epsilon: 1.0e-05
swiglu: true
hidden_dropout: 0.0

# --- Linear layers and embeddings ---
add_bias_linear: false
add_qkv_bias: false
untie_embeddings_and_output_weights: true