# Batch-size, iteration-count and evaluation-cadence flags, kept as one string.
# Inside the double quotes the shell removes every backslash-newline pair, so
# the value is the flags separated by spaces/tabs, with one leading and one
# trailing newline.
# The global batch (240) is 8x the micro batch (30).
# NOTE(review): presumably expanded unquoted onto a training command line
# elsewhere; the consumer is not visible here -- confirm.
TRAINING_ARGS="
	--micro-batch-size 30 \
	--global-batch-size 240 \
	--train-iters 18150 \
	--eval-interval 500 \
	--eval-iters 10
"

# Dataset split, tokenizer and vocabulary flags, kept as one string.
# Inside the double quotes the shell removes every backslash-newline pair, so
# the value is the flags separated by spaces/tabs, with one leading and one
# trailing newline.
# The last flag line deliberately carries no trailing backslash: a continuation
# there would only swallow the final newline and leave a dangling space.
# The three paths are placeholders to be replaced with real locations.
# NOTE(review): presumably expanded unquoted onto a training command line
# elsewhere; the consumer is not visible here -- confirm.
DATA_ARGS="
	--split 98,2,0 \
	--tokenizer-type GPT2BPETokenizer \
	--vocab-size 32768 \
	--make-vocab-size-divisible-by 64 \
	--data-path ./path_to_dir \
	--vocab-file ./path_to_vocab.json \
	--merge-file ./path_to_merges.json
"

# Model shape and architecture flags, kept as one string (backslash-newline
# pairs inside the double quotes are removed by the shell).
# Numeric relationships visible here: FFN hidden size 2560 is 4x the hidden
# size 640; max position embeddings 4096 is 2x the sequence length 2048;
# 640 / 10 heads = 64 (assumes heads split the hidden size evenly -- confirm).
# NOTE(review): both --position-embedding-type rope and --no-position-embedding
# are passed; how the consumer reconciles the two is not visible here -- confirm.
NETWORK_ARGS="
	--num-layers 10 \
	--hidden-size 640 \
	--ffn-hidden-size 2560 \
	--num-attention-heads 10 \
	--seq-length 2048 \
	--max-position-embeddings 4096 \
	--position-embedding-type rope \
	--no-position-embedding \
	--normalization RMSNorm \
	--swiglu \
	--untie-embeddings-and-output-weights \
	--disable-bias-linear
"

# Mixture-of-experts and router flags, kept as one string (backslash-newline
# pairs inside the double quotes are removed by the shell).
# --moe-router-soft-topk-alpha and --moe-router-soft-topk-schedule-iters each
# receive two values; every other valued flag here receives one.
# NOTE(review): the soft-topk options are not described anywhere in this file;
# their meaning and valid ranges must be checked against the launcher that
# parses them.
MOE_ARGS="
	--num-experts 32 \
	--expert-model-parallel-size 1 \
	--moe-router-topk 1 \
	--moe-router-dtype fp32 \
	--moe-router-score-function soft-topk \
	--moe-router-load-balancing-type aux_loss \
	--moe-aux-loss-coeff 0.01 \
	--moe-grouped-gemm \
	--moe-input-jitter-eps 0.01 \
	--moe-router-soft-topk-alpha 5.0 1.0 \
	--moe-router-soft-topk-schedule-iters 0.0 1.0 \
	--moe-router-soft-topk-hard-threshold-coeff 4.0 \
	--moe-router-soft-topk-routing-scores-threshold 1.1 \
	--moe-router-tokens-dist-metrics
"

# Parallelism flags, kept as one string (backslash-newline pairs inside the
# double quotes are removed by the shell).
# Tensor and pipeline model-parallel sizes are both set to 1.
# NOTE(review): --sequence-parallel is passed together with a tensor parallel
# size of 1; confirm the consumer does not simply ignore it in that case.
MODEL_PARALLEL_ARGS="
	--use-distributed-optimizer \
	--tensor-model-parallel-size 1 \
	--pipeline-model-parallel-size 1 \
	--sequence-parallel
"

# Throughput/memory flags, kept as one string (backslash-newline pairs inside
# the double quotes are removed by the shell).
# NOTE(review): --recompute-activations and --recompute-granularity selective
# are both passed; they may be redundant with each other -- confirm against the
# consumer's argument handling.
PERFORMANCE_ARGS="
	--use-flash-attn \
	--recompute-activations \
	--recompute-granularity selective \
	--overlap-grad-reduce \
	--overlap-param-gather
"

# Numeric-precision flags, kept as one string (the backslash-newline pair
# inside the double quotes is removed by the shell): bf16 plus an fp32
# attention softmax.
MIXED_PRECISION_ARGS="
	--bf16 \
	--attention-softmax-in-fp32
"

# Learning-rate schedule flags, kept as one string (backslash-newline pairs
# inside the double quotes are removed by the shell).
# The minimum rate 1e-05 is 1/100 of the peak rate 1e-03; the warmup is given
# as a fraction (0.01) rather than an iteration count.
LEARNING_RATE_ARGS="
	--lr 1e-03 \
	--lr-decay-style cosine \
	--lr-warmup-fraction 0.01 \
	--min-lr 1e-05
"

# Weight-initialisation spread and random seed, kept as one string (the
# backslash-newline pair inside the double quotes is removed by the shell).
INITIALIZATION_ARGS="
	--init-method-std 0.02 \
	--seed 42
"

# Dropout, weight-decay, Adam-beta and gradient-clipping flags, kept as one
# string (backslash-newline pairs inside the double quotes are removed by the
# shell). Both dropout rates are set to 0.0.
REGULARIZATION_ARGS="
	--attention-dropout 0.0 \
	--hidden-dropout 0.0 \
	--weight-decay 0.1 \
	--adam-beta1 0.9 \
	--adam-beta2 0.95 \
	--clip-grad 1.0
"

# Checkpoint cadence and destination, kept as one string (the backslash-newline
# pair inside the double quotes is removed by the shell).
# The last flag line deliberately carries no trailing backslash: a continuation
# there would only swallow the final newline and leave a dangling space.
# --save-interval matches the 18150 passed to --train-iters in TRAINING_ARGS,
# so presumably only an end-of-run checkpoint is written -- confirm.
# The save path is a placeholder to be replaced with a real directory.
CHECKPOINTING_ARGS="
	--save-interval 18150 \
	--save ./path_to_checkpoint_dir
"

# Progress, timing, TensorBoard and Weights & Biases logging flags, kept as one
# string (backslash-newline pairs inside the double quotes are removed by the
# shell).
# Every flag line is indented with exactly one tab so the value contains no
# doubled blanks.
# The experiment, project and entity names are placeholders to be replaced.
# NOTE(review): presumably expanded unquoted onto a training command line
# elsewhere; the consumer is not visible here -- confirm.
LOGGING_ARGS="
	--log-progress \
	--log-interval 1 \
	--timing-log-level 0 \
	--tensorboard-dir ./tensorboard \
	--tensorboard-log-interval 1 \
	--wandb-exp-name exp_name \
	--wandb-project project_name \
	--wandb-entity entity_name \
	--wandb-save-dir ./wandb
"