# Top-level run settings: task name, output/log directories, and random seed.
# NOTE(review): `task` presumably selects the same-named `classification`
# section in this file — confirm against the config loader.
task: classification
output_dir: outputs_smoke
log_dir: logs_smoke
seed: 42

# Dataset and data-loading settings for the classification task.
classification:
  dataset_name: glue
  subset: sst2
  # NOTE(review): presumably an on-disk copy of the dataset — confirm how the
  # loader chooses between this path and dataset_name/subset.
  local_path: data/glue_sst2_local
  max_seq_len: 64
  batch_size: 32
  shuffle: true
  num_workers: 0     # Windows: can be reduced to save memory

# Model architecture settings.
model:
  vocab_name: bert-base-uncased
  d_model: 128
  num_heads: 4
  num_layers: 2
  num_classes: 2
  # MoE (presumably mixture-of-experts) layer settings.
  moe:
    num_experts: 1      # equivalent to disabling MoE
    init_groups: 1
    rank: 4
    top_k: 1
    # All int8 options are switched off in this config.
    enable_int8: false
    weight_only_int8: false
    try_full_int8: false

# Optimizer hyperparameters and training-loop schedule.
train:
  total_steps: 400
  lr: 2.0e-4
  betas: [0.9, 0.98]
  weight_decay: 0.01
  warmup_steps: 40          # 10% of total_steps
  grad_clip: 1.0
  eval_interval: 40
  save_interval: 400        # same value as total_steps
  early_stop_patience: 5
  balance_loss_weight: 0.0  # turned off for now

# Quantization schedule.
# NOTE(review): quantize_at_step (999999) is far beyond train.total_steps (400),
# so quantization presumably never triggers in this run — confirm against the
# trainer.
quantization:
  quantize_at_step: 999999
  finetune_extra_steps: 0
  freeze_after_quant: false
  unfreeze_at_step: null

# Clustering / offload schedule.
deguc_schedule:
  clustering_interval: 999999   # effectively disables clustering
  # NOTE(review): same sentinel value as clustering_interval; presumably this
  # disables offloading as well — confirm.
  offload_interval: 999999
  min_offload_rate: 0.0005

# Distributed-training settings; switched off in this config.
distributed:
  enable: false
  backend: nccl

# NOTE(review): "amp" presumably stands for automatic mixed precision — confirm.
# The flag here is spelled `enabled`, whereas the `distributed` section uses
# `enable`; keep each spelling in sync with whatever the consumer reads.
amp:
  enabled: true
