---
# Architecture hyperparameters for a 'qwen3' model.
# NOTE(review): the values look like the published Qwen3-8B configuration;
# confirm against the upstream model config before changing any of them.

# Architecture family and compute precision flag.
model_arch: 'qwen3'
bf16: true

# Network dimensions and vocabulary.
num_layers: 36
hidden_size: 4096
ffn_hidden_size: 12288  # 3 x hidden_size
padded_vocab_size: 151936
untie_embeddings_and_output_weights: true

# Attention.
num_attention_heads: 32
# num_attention_heads x kv_channels = 32 x 128 = 4096, equal to hidden_size.
kv_channels: 128
group_query_attention: true
# num_attention_heads / num_query_groups = 32 / 8 = 4.
num_query_groups: 8
qk_layernorm: true
add_qkv_bias: false
masked_softmax_fusion: false

# Position embeddings.
position_embedding_type: 'rope'
rotary_base: 1000000
max_position_embeddings: 40960  # 40 x 1024

# Normalization and MLP.
normalization: 'RMSNorm'
# Keep the decimal point and the signed exponent: YAML 1.1 loaders such as
# PyYAML read a bare form like 1e-6 as a string rather than a float.
norm_epsilon: 1.0e-06
swiglu: true
add_bias_linear: false

# Initialization and dropout.
init_method_std: 0.02
attention_dropout: 0.0
hidden_dropout: 0.0
