# Environment settings for this test case.
# NOTE(review): the program that reads this file is not shown here;
# presumably these are exported before the run is launched - confirm.
ENV_VARS:
  CUDA_DEVICE_MAX_CONNECTIONS: 1
  # NOTE(review): 0 presumably goes hand in hand with
  # `--deterministic-mode: true` under MODEL_ARGS - confirm.
  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
  # Quoted so the leading `^` and `:` are unmistakably part of the strings.
  NCCL_ALGO: '^NVLS'
  CUBLAS_WORKSPACE_CONFIG: ':4096:8'
# Command-line style options for the model under test, one `--flag: value`
# pair per entry.
# NOTE(review): the consumer of this mapping is outside this file; presumably
# each entry is forwarded as a CLI argument, with `true` marking a bare
# switch - confirm before changing any value's spelling.
MODEL_ARGS:
  # Model shape: 64 attention heads x 64 kv-channels = 4096 = hidden-size.
  --encoder-num-layers: 24
  --decoder-num-layers: 24
  --hidden-size: 4096
  --num-attention-heads: 64
  --kv-channels: 64
  --ffn-hidden-size: 10240
  # Sequence lengths; max-position-embeddings equals encoder-seq-length (512).
  --encoder-seq-length: 512
  --decoder-seq-length: 128
  --max-position-embeddings: 512
  # Parallelism and batch sizes.
  --tensor-model-parallel-size: 4
  --pipeline-model-parallel-size: 1
  --micro-batch-size: 4
  --global-batch-size: 8
  # Optimiser and learning-rate schedule. lr-decay-iters (100) is larger than
  # train-iters (25).
  --lr: 0.0001
  --train-iters: 25
  --lr-decay-iters: 100
  --lr-decay-style: linear
  --min-lr: 0.00001
  # Written as 0.01 instead of a dot-less exponent literal: the exponent form
  # is a float under YAML 1.2 but stays a string under YAML 1.1 loaders.
  --weight-decay: 0.01
  --lr-warmup-fraction: 0.01
  --clip-grad: 1.0
  # Precision, vocabulary, initialisation and implementation choices.
  --bf16: true
  --vocab-extra-ids: 100
  --init-method-std: 0.015
  --transformer-impl: transformer_engine
  # Dataset and tokenizer inputs. The ${...} placeholders are not expanded by
  # YAML itself; presumably the consumer substitutes them - confirm.
  --data-path: ${DATA_PATH}/my-t5_00_text_document
  --vocab-file: ${DATA_PATH}/bert-large-cased-vocab.txt
  --tokenizer-type: BertWordPieceCase
  --calculate-per-token-loss: true
  --split: 99982,9,9
  # Checkpoint and TensorBoard locations, supplied through placeholders.
  --save: ${CHECKPOINT_SAVE_PATH}
  --load: ${CHECKPOINT_LOAD_PATH}
  --tensorboard-dir: ${TENSORBOARD_PATH}
  # Logging switches.
  --log-params-norm: true
  --log-num-zeros-in-grad: true
  --log-validation-ppl-to-tensorboard: true
  --log-timers-to-tensorboard: true
  --timing-log-level: 2
  # Intervals. save-interval (10000) and eval-interval (1000) both exceed
  # train-iters (25).
  # NOTE(review): presumably intentional so that no periodic save or
  # evaluation fires mid-run - confirm.
  --log-interval: 1
  --save-interval: 10000
  --eval-interval: 1000
  --eval-iters: 10
  # Backend, caching, determinism and checkpoint format.
  --distributed-backend: nccl
  --data-cache-path: ${DATA_CACHE_PATH}
  --deterministic-mode: true
  --ckpt-format: torch_dist
  --log-memory-to-tensorboard: true
# Kind of test this case represents.
# NOTE(review): the set of accepted values is defined by the consumer, which
# is not visible in this file - confirm before changing.
TEST_TYPE: regular
# Metric names associated with this test case.
# NOTE(review): presumably these are the series the harness collects and
# checks; the names must match what the run emits - confirm.
METRICS:
  - 'iteration-time'
  - 'lm loss'
  - 'num-zeros'
  - 'mem-allocated-bytes'
  - 'mem-max-allocated-bytes'
