# Top-level run settings.
# NOTE(review): the value of `task` matches the name of the `classification`
# section in this file; presumably it selects which task section is read.
# Confirm against the config loader.
task: classification
# Both directories are relative paths; the base they resolve against
# (working directory vs. config location) is not visible here. TODO confirm.
output_dir: outputs_8g_cls
log_dir: logs
# Presumably the global RNG seed for reproducibility; verify which
# libraries the training script actually seeds.
seed: 42

# Dataset and data-loading settings for the classification task.
# NOTE(review): `glue` / `sst2` looks like the GLUE SST-2 benchmark, which is
# a two-class task and would agree with `model.num_classes: 2`. Confirm.
classification:
  dataset_name: glue
  subset: sst2
  # Relative path. Presumably a local copy of the dataset; whether it takes
  # precedence over dataset_name/subset is decided by the loader. TODO confirm.
  local_path: data/glue_sst2_local
  # Presumably measured in tokens of the `model.vocab_name` tokenizer.
  max_seq_len: 128
  # Not visible here whether this is per process or global when
  # `distributed.enable` is true (currently false).
  batch_size: 32
  # Presumably applies to the training split only; verify that evaluation
  # data is not shuffled.
  shuffle: true
  num_workers: 2

# Model architecture settings.
model:
  # Presumably the name of a pretrained tokenizer/vocabulary; only the
  # vocabulary is named here, the dimensions below are set independently.
  vocab_name: bert-base-uncased
  # d_model / num_heads = 256 / 8 = 32, so d_model divides evenly by
  # num_heads (assuming the usual per-head split; confirm in model code).
  d_model: 256
  num_heads: 8
  num_layers: 4
  # Two output classes.
  num_classes: 2
  # Mixture-of-experts settings.
  moe:
    # num_experts / init_groups = 64 / 4 = 16.
    # NOTE(review): presumably experts start in 4 groups of 16; confirm.
    num_experts: 64
    init_groups: 4
    # Presumably a low-rank dimension for the experts. TODO confirm.
    rank: 8
    # Presumably the number of experts routed per token. TODO confirm.
    top_k: 2
    # NOTE(review): `enable_int8` is false while `weight_only_int8` is true.
    # Presumably the two sub-options below are ignored while the master
    # switch is off; verify the loader does not act on them independently.
    enable_int8: false
    weight_only_int8: true
    try_full_int8: false

# Optimisation and training-loop settings.
train:
  total_steps: 2000
  # Written with a decimal point and a signed exponent, the form that
  # YAML 1.1 loaders such as PyYAML resolve as a float (a bare `3e-4`
  # would load as a string there).
  lr: 3.0e-4
  # Two-element list; presumably the (beta1, beta2) pair of an Adam-style
  # optimizer. Confirm against the optimizer construction.
  betas: [0.9, 0.98]
  weight_decay: 0.01
  # 400 of 2000 total steps, i.e. the first 20% of training.
  warmup_steps: 400
  # Presumably a gradient-norm threshold (not a per-value clamp). Confirm.
  grad_clip: 1.0
  # 2000 / 200 = at most 10 evaluations over a full run.
  eval_interval: 200
  # 2000 / 1000 = at most 2 periodic saves over a full run.
  save_interval: 1000
  # NOTE(review): unit is not visible here; presumably counted in
  # evaluations (5 x 200 = 1000 steps) rather than in steps. Confirm.
  early_stop_patience: 5
  # Presumably the weight of an auxiliary expert load-balancing loss for
  # the `model.moe` router. TODO confirm.
  balance_loss_weight: 0.02

# Quantization schedule.
quantization:
  # Sentinel far beyond `train.total_steps` (2000), so this step is never
  # reached in a normal run. The original note here read "shutdown", i.e.
  # quantization is intended to stay switched off.
  quantize_at_step: 999999
  # The remaining keys are presumably only consulted once quantization has
  # happened, so they should be inert with the sentinel above. Confirm.
  finetune_extra_steps: 0
  freeze_after_quant: false
  # Explicit null: no unfreeze step is configured.
  unfreeze_at_step: null

# Schedule for periodic clustering and offloading.
# NOTE(review): what "deguc" stands for, and what is clustered or offloaded,
# is not defined anywhere in this file; see the code that consumes it.
deguc_schedule:
  # Presumably in training steps; equal to `train.eval_interval` (200).
  clustering_interval: 200
  # Presumably in training steps; not a multiple of clustering_interval,
  # so the two events coincide only every 1000 steps if both count from 0.
  offload_interval: 250
  # Presumably a lower bound expressed as a fraction (0.0005 = 0.05%).
  # TODO confirm the unit and what it is a rate of.
  min_offload_rate: 0.0005

# Distributed-training settings.
distributed:
  # Currently off. Note the key is `enable` here but `enabled` under `amp`;
  # the loader reads these exact names, so keep each spelling as it is.
  enable: false
  # Presumably only used when `enable` is true. `nccl` is a GPU collective
  # backend name (e.g. for torch.distributed); confirm the consumer.
  backend: nccl

# Automatic mixed precision (presumably; "amp" is not expanded in this file).
amp:
  # Currently off. Spelled `enabled` here, unlike `distributed.enable`.
  enabled: false
