{
    "data-path": "/path/to/training/data.jsonl",
    "tensor-model-parallel-size": 4,
    "pipeline-model-parallel-size": 9,
    "context-parallel-size": 1,
    "expert-model-parallel-size": 1,
    "global-batch-size": 256,
    "micro-batch-size": 1,
    "seq-length": 16384,
    "train-epochs": 3,
    "log-interval": 1,
    "save-interval": 500,
    "eval-interval": 3000,
    "eval-iters": 10,
    "split": "1000,0,0",
    "dataloader-type": "cyclic",
    "data-impl": "mmap",
    "bf16": "",
    "attention-softmax-in-fp32": "",
    "seed": 7,
    "distributed-timeout-minutes": 120,
    "load-inequal-bucket-checkpoint": "",
    "bucket-size": 400000000,
    "use-flash-attn": "",
    "legacy-version": "0.30",
    "flash-attn-deterministic": "",
    "varlen-attention": "",
    "deterministic-mode": "",
    "use-grouped-gemm": "",
    "grouped-gemm-init-same-with-separate-expserts": "",
    "sequence-parallel": "",

    "use-distributed-optimizer": "",
    "optimizer": "adam",
    "adam-beta1": 0.9,
    "adam-beta2": 0.95,
    "lr": 5e-06,
    "min-lr": 0,
    "lr-decay-style": "cosine",
    "lr-warmup-fraction": 0.03,
    "mup-lr-scale-cond": "",
    "clip-grad": 1,
    "weight-decay": 0.0001,

    "base-size": 768,
    "disable-bias-linear": "",
    "tokenizer-type": "BloomTokenizer",
    "untie-embeddings-and-output-weights": "",
    "make-vocab-size-divisible-by": 1,
    "attention-dropout": 0.0,
    "hidden-dropout": 0.0,
    "hf-type": "llama_moe",
    "swiglu": "",
    "num-experts": "1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64 1 64",
    "moe-topk": 3,
    "expert-interval": 1,
    "router": "'fast_token_choose'",
    "router-dtype": "float32",
    "use-mcore-models": "",
    "avg-loss-comm-optimize": "",
    "use-load-balance-loss": "",
    "load-balance-loss-type": "both",
    "moe-loss-coeff": 0.001,
    "only-z-loss-coeff": 0.1,
    "eod-mask-loss": "",
    "log-batch-size-to-tensorboard": "",
    "log-memory-to-tensorboard": "",
    "experiment_tracking.board": "True",
    "use-gigaflops-pre-token": "",
    "param_report": 50,
    "simultaneous-writing-native-tensorboard": "",
    "multi-rank-tracking-enable": "",

    "parallel-loader-num": 4,
    "distributed-checkpointing": "",

    "npu-mlp-embedding": "",
    "output-logits-chunk-sum-cnt": 32,
    "memory-optimize-in-sp": "",
    "memory-optimize-in-1f2b": "",
    "cuda-event-trace": "",
    "dynamic-cuda-event-trace": "",
    "cp-alltoall-solution": "",
    "swiglu-recompute": "",
    "recompute-granularity": "full",
    "recompute-method": "block",
    "recompute-num-layers": 4,
    "balance-embedding-stage": "",
    "capacity-factor": 64,
    "get-batch-deferred-on-this-cp-rank": "",
    "splicing": "",
    "splicing-weight": ""
}