# Environment variable settings attached to this test case.
# NOTE(review): presumably exported by the test harness before the run starts —
# confirm against the harness that consumes this file.
ENV_VARS:
  # Quoted so the leading ':' can never be mistaken for YAML syntax; the
  # resulting value is still the string :4096:8.
  CUBLAS_WORKSPACE_CONFIG: ':4096:8'
  CUDA_DEVICE_MAX_CONNECTIONS: 1
  NCCL_ALGO: Ring
  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
# Test-case descriptors. These are independent top-level scalars, so their
# relative order carries no meaning.
MODE: inference
TEST_TYPE: frozen-start
# Command-line style flags for this test case, grouped by topic.
# Every key/value pair is independent; the grouping is for readability only.
# NOTE(review): presumably each entry is forwarded as "--flag value" (or a bare
# "--flag" for true) to the inference entry point — confirm against the harness.
MODEL_ARGS:
  # ---- Logging / TensorBoard ----
  --log-interval: 1
  --timing-log-level: 2
  --log-num-zeros-in-grad: true
  --log-validation-ppl-to-tensorboard: true
  --log-timers-to-tensorboard: true
  --log-memory-to-tensorboard: true

  # ---- Checkpoint loading ----
  # The mount paths used below are defined in the top-level recipe
  # tests/test_utils/recipes/gpt-static-inference.yaml
  --load: ${CHECKPOINT_LOAD_PATH}/deepseek_16b_pyt/model/checkpoints
  --use-checkpoint-args: true
  --no-use-tokenizer-model-from-checkpoint-args: true
  --no-load-optim: true
  --ckpt-format: torch_dist
  --ckpt-fully-parallel-save: true
  --ckpt-fully-parallel-load: true
  --ckpt-assume-constant-structure: true
  --dist-ckpt-strictness: log_unexpected
  --save-interval: 2000

  # ---- Tokenizer ----
  --tokenizer-type: TikTokenizer
  --tiktoken-pattern: v2
  --tokenizer-model: ${DATA_PATH}/deepseek_16b_pyt/tokenizer/multiMixV8.gpt4o_nc_sd.500000.128k.vocab.json

  # ---- Distributed setup / parallelism ----
  --distributed-backend: nccl
  --tensor-model-parallel-size: 1
  --pipeline-model-parallel-size: 1
  --expert-model-parallel-size: 1

  # ---- Implementation, precision and runtime switches ----
  --use-mcore-models: true
  --transformer-impl: transformer_engine
  --bf16: true
  --attention-backend: flash
  --flash-decode: true
  --deterministic-mode: true
  --no-create-attention-mask-in-dataloader: true
  --num-workers: 8

  # ---- Mixture-of-experts ----
  --num-experts: 64
  --moe-router-topk: 6
  --moe-router-score-function: sigmoid
  --moe-router-load-balancing-type: seq_aux_loss
  --moe-aux-loss-coeff: 1e-3
  --moe-z-loss-coeff: 0
  --moe-token-dispatcher-type: alltoall
  --moe-grouped-gemm: true
  --moe-ffn-hidden-size: 1408
  --moe-shared-expert-intermediate-size: 2816

  # ---- Model architecture ----
  --num-layers: 27
  --hidden-size: 2048
  --ffn-hidden-size: 10944
  --num-attention-heads: 16
  --kv-channels: 128
  --normalization: RMSNorm
  --swiglu: true
  --disable-bias-linear: true
  --untie-embeddings-and-output-weights: true
  --init-method-std: 0.014
  --position-embedding-type: rope
  --rotary-base: 1000000
  --rotary-percent: 1.0
  --attention-dropout: 0.0
  --hidden-dropout: 0.0
  --seq-length: 4096
  --max-position-embeddings: 4096

  # ---- Inference run ----
  # Batch size is deliberately 1: this forces consecutive inference calls, so
  # the test also covers state cleanup between calls.
  --micro-batch-size: 1
  --inference-max-requests: 1
  --inference-max-seq-length: 4096
  --max-tokens-to-oom: 3600000
  --num-tokens-to-generate: 80
  --temperature: 1.0
  --top_k: 1
  --return-log-probs: true
  --prompt-file: ./tests/functional_tests/test_cases/gpt/gpt_static_inference_tp1_pp1_16b_multiprompt_tokensmatch/test_prompts.jsonl
  --output-path: ${TENSORBOARD_PATH}
# Names of the result fields listed for this test case.
# NOTE(review): the --prompt-file path names this case "..._tokensmatch" while
# the only metric listed here is generated_text — confirm this is the intended
# field to compare.
METRICS:
  - 'generated_text'
