# Training data sources; this file declares a single dataset entry.
datasets:
  - class_name: MegatronDataset
    data_name: Megatron
    data_sampling_ratio: 1
    class_args:
      # Both path-like fields are set to the literal string `dummy`.
      data_path: dummy
      data_cache_path: dummy
      # Quoted so the comma-separated triple is unambiguously a string.
      split: "100,0,0"
      sequence_length: 512
      eval_steps: 2

# Tokenizer selection.
tokenizer_args:
  # Slash-separated identifier; presumably a Hugging Face Hub repo id — confirm.
  tokenizer_name: EleutherAI/gpt-neox-20b

# Model definition: the model class plus the config it is built from.
model_args:
  model_class: AutoModelForCausalLM
  pretrained_config:
    activation_function: swiglu
    add_bias: false
    # All three *_pdrop values are set to 0.
    attn_pdrop: 0
    embd_pdrop: 0
    resid_pdrop: 0
    initializer_range: 0.1
    # Written with an explicit mantissa dot: YAML 1.1 resolvers (e.g. PyYAML)
    # only recognise exponent notation as a float when the mantissa contains
    # ".", otherwise the value is loaded as a string.
    layer_norm_epsilon: 1.0e-05
    model_type: rnn_dolomite
    n_embd: 1536
    n_head: 6
    num_experts: 64
    num_experts_per_tok: 8
    n_inner: 512
    num_shared_experts: 2
    n_layer: 4
    n_positions: 8192
    # Four characters, matching n_layer: 4 — presumably one per layer; confirm.
    attention_pattern: ddda
    use_short_conv: true
    use_gate: false
    normalization_function: rmsnorm
    # NOTE(review): rope_theta is set although position_embedding_type is
    # `nope`; presumably it is ignored in that mode — confirm.
    position_embedding_type: nope
    rope_theta: 10000
    attention_head_type: mqa
    scale_attn_weights: true
    vocab_size: 50304
    # bos, eos and pad all share token id 0.
    bos_token_id: 0
    eos_token_id: 0
    pad_token_id: 0
    # m_* multipliers used together with init_method: mup.
    m_width: 6
    m_emb: 12
    m_residual: 0.22
    init_method: mup
    tie_word_embeddings: true
    upcast_logits_for_loss: true

# Training mode for this run.
tuning_args:
  tuning_method: pretraining

# Checkpoint output settings.
save_args:
  # Set to the literal string `dummy`.
  save_path: dummy
  # Presumably measured in training steps — confirm against the consumer.
  save_interval: 500

# Run length, batch shape and evaluation cadence.
training_parameters:
  # Run length and batch shape.
  num_training_steps: 25000
  micro_batch_size: 128
  gradient_accumulation_steps: 1
  # Evaluation is switched off; eval_interval (2500000) is also larger than
  # num_training_steps (25000).
  eval_during_training: false
  eval_interval: 2500000

# Optimizer selection and the keyword arguments passed to it.
optimizer_args:
  params_group_method: mup
  class_name: TorchAdamW
  class_args:
    # Exponent-notation values carry an explicit mantissa dot: YAML 1.1
    # resolvers (e.g. PyYAML) load `3e-4` / `1e-10` as strings, whereas
    # `3.0e-4` / `1.0e-10` are floats under both YAML 1.1 and 1.2.
    lr: 3.0e-4
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    eps: 1.0e-10

# Learning-rate schedule: warmup, optional constant phase, then cosine decay.
# NOTE(review): warmup (2500) + constant (0) + decay (97500) = 100000 steps,
# while training_parameters.num_training_steps is 25000 — confirm the schedule
# is intentionally longer than the run.
lr_scheduler_args:
  lr_decay_style: cosine
  num_warmup_steps: 2500
  num_constant_steps: 0
  num_decay_steps: 97500

# Mixed-precision setting for the run.
mixed_precision_args:
  dtype: bf16

# Distributed-training layout.
distributed_args:
  distributed_backend: torch
  # Presumably selects the FSDP implementation version — confirm.
  fsdp_algorithm: 2
  # Presumably the sharding stage — confirm against the consumer.
  stage: 3
  tensor_parallel_size: 2
