# Model hyperparameters for the `qwen3_32b` entry.
# NOTE(review): key names look like Megatron-style arguments; confirm the
# exact semantics of each key against the code that loads this file.
qwen3_32b:
  # Implementation switch (name suggests Megatron-Core models -- confirm).
  use_mcore_models: true

  # Layer count and widths.
  num_layers: 64
  hidden_size: 5120
  ffn_hidden_size: 25600

  # Attention. 64 heads with 8 query groups, i.e. 64 / 8 = 8 heads per
  # group, assuming heads are split evenly across groups -- TODO confirm.
  num_attention_heads: 64
  group_query_attention: true
  num_query_groups: 8
  # Set explicitly; this is NOT hidden_size / num_attention_heads
  # (5120 / 64 = 80), so it must not be left to a derived default.
  kv_channels: 128
  qk_layernorm: true
  attention_bias: false
  attention_softmax_in_fp32: true

  # Position encoding.
  position_embedding_type: rope
  rotary_base: 1000000
  max_position_embeddings: 40960

  # Vocabulary and embeddings. With a divisor of 1 the vocabulary size
  # presumably receives no extra padding -- verify against the consumer.
  padded_vocab_size: 151936
  make_vocab_size_divisible_by: 1
  untie_embeddings_and_output_weights: true

  # Normalization, activation, and linear-layer bias.
  normalization: RMSNorm
  swiglu: true
  disable_bias_linear: true

  # Runtime toggle (gradient-accumulation fusion is switched off).
  no_gradient_accumulation_fusion: true