# Model architecture hyperparameters.
model:
  # Ids run 0..151644, so training.eos_id (151643) and data.mask_id (151644)
  # are the two highest ids that fit in this vocabulary.
  vocab_size: 151645
  hidden_size: 512
  intermediate_size: 1536
  num_layers: 14
  num_attention_heads: 8
  # Same value as num_attention_heads.
  num_kv_heads: 8
  max_position: 512
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve a bare `1e-6` to the string "1e-6" instead of a float.
  rms_norm_eps: 1.0e-6
  dropout: 0.0
  # The LM head weights are tied because the Qwen2 vocabulary is large
  # (~150k tokens).
  tie_lm_head: true

# Training-loop settings: strategy, K schedule and optimiser hyperparameters.
training:
  strategy: "progressive"
  mode: "confidence_collapse"
  confidence_threshold: 0.9
  eos_id: 151643
  interval_change: false
  # Matches the first value of the first k_schedule entry ([5, 0]).
  K: 5
  block_size: 256
  # NOTE(review): each entry looks like [K, step at which that K starts];
  # the first column grows by 3 and the second by 30000 per entry.
  # Confirm the pair order against the trainer before editing.
  k_schedule:
    - [5, 0]
    - [8, 30000]
    - [11, 60000]
    - [14, 90000]
    - [17, 120000]
    - [20, 150000]
    - [23, 180000]
    - [26, 210000]
    - [29, 240000]
    - [32, 270000]
    - [35, 300000]
    - [38, 330000]
    - [41, 360000]
  reweighting: "none"
  data_ratio: 1.0
  # Written with an explicit mantissa dot: YAML 1.1 loaders such as PyYAML
  # resolve a bare `3e-4` to the string "3e-4" instead of a float.
  learning_rate: 3.0e-4
  weight_decay: 0.01
  num_epochs: 20
  max_grad_norm: 1.0
  # Also referenced by data.training.per_gpu_batch_size.
  batch_size: 32
  warmup_steps: 1000
  logging_steps: 100
  eval_steps: 5000
  save_steps: 20000
  # Same value as MDLM. Leave empty to disable EMA.
  ema: 0.9999

# Validation and sampling settings.
validation:
  # Set to the string "none": the GSM8K eval loop is self-contained.
  val_dir: "none"
  sampling:
    temperature: 0.0
    confidence:
      - "top_k"
    unmasking_num:
      - 2
      - 3
  
# Dataset and data-loading settings.
data:
  mask_id: 151644
  dataset: "tinygsm"
  data_dir: "data/tiny_gsm"
  val_ratio: 0.02
  seed: 2026
  training:
    # `${...}` reference to training.batch_size, resolved by the config
    # loader (presumably OmegaConf/Hydra-style interpolation -- confirm).
    per_gpu_batch_size: "${training.batch_size}"
    cpus: 4

# Weights & Biases settings.
wandb:
  # NOTE(review): presumably toggles W&B logging on/off -- confirm in the
  # training script.
  wandb: true
  # Placeholder value; replace with your own W&B entity.
  entity: "your-entity-here"
  project: "mdm-pretraining"
  name: "puma"