# Hyper-parameter preset keyed by model name. The consumer of this mapping
# is not visible from this file; section notes below describe only what the
# key names and values show.
deepseekv3_671b:
  use_mcore_models: true
  # Two-item list: a dotted module path and a name.
  # NOTE(review): presumably resolved as <module>.<attribute> by the loader;
  # confirm against the code that reads `spec`.
  spec:
    - mindspeed_llm.tasks.models.spec.deepseek_spec
    - layer_spec

  # Core transformer dimensions.
  num_layers: 61
  num_experts: 256
  moe_ffn_hidden_size: 2048
  ffn_hidden_size: 18432
  num_attention_heads: 128
  hidden_size: 7168
  untie_embeddings_and_output_weights: true
  disable_bias_linear: true

  # Multi-latent attention head sizes and low-rank projection ranks.
  multi_latent_attention: true
  qk_pos_emb_head_dim: 64
  qk_head_dim: 128
  q_lora_rank: 1536
  kv_lora_rank: 512
  v_head_dim: 128
  qk_layernorm: true

  # Mixture-of-experts layout, token dispatch and router settings.
  moe_grouped_gemm: true
  moe_permutation_async_comm: true
  moe_token_dispatcher_type: alltoall_seq
  # NOTE(review): presumably the first 3 layers stay dense and every layer
  # after that is MoE (freq 1); confirm against the layer-spec builder.
  first_k_dense_replace: 3
  moe_layer_freq: 1
  n_shared_experts: 1
  moe_router_topk: 8
  moe_router_load_balancing_type: none
  moe_router_num_groups: 8
  moe_router_group_topk: 4
  moe_router_topk_scaling_factor: 2.5
  seq_aux: true
  norm_topk_prob: true
  moe_router_score_function: sigmoid
  moe_router_enable_expert_bias: true

  # Rotary position embedding with YaRN scaling; 4096 * 40 = 163840, which
  # matches max_position_embeddings below.
  position_embedding_type: rope
  use_rotary_position_embeddings: true
  rotary_base: 10000
  rope_scaling_type: yarn
  beta_fast: 32
  beta_slow: 1
  rope_scaling_factor: 40
  rope_scaling_mscale: 1.0
  rope_scaling_mscale_all_dim: 1.0
  rope_scaling_original_max_position_embeddings: 4096
  max_position_embeddings: 163840
  # Vocabulary size is given already padded; divisor 1 adds no extra padding.
  padded_vocab_size: 129280
  make_vocab_size_divisible_by: 1

  # Normalisation and activation.
  normalization: RMSNorm
  # Keep the explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML) resolve a
  # bare `1e-6` as a string, whereas `1.0e-6` is a float under 1.1 and 1.2.
  norm_epsilon: 1.0e-6
  swiglu: true
