# Top-level values shared across sections.
# max_seq_len is referenced by model.max_seq_len through ${max_seq_len}.
max_seq_len: 4096
# Presumably the seed used for random number generation; the consumer is not
# visible in this file -- confirm against the training entry point.
global_seed: 17


# Model section: checkpoint location, architecture sizes, and mask settings.
model:
  name: llama3_8b
  # Absolute, machine-specific checkpoint path; adjust per environment.
  path: /root/Share/reweight/llama3/llama-3-8b/consolidated.00.pth
  init_device: "cpu"
  d_model: 4096
  n_heads: 32
  n_layers: 32
  intermediate_size: 14336
  # Resolved from the top-level max_seq_len via interpolation.
  max_seq_len: ${max_seq_len}
  vocab_size: 128256
  attn_pdrop: 0.0
  attn_impl: flash
  # Scientific-notation floats are written with an explicit mantissa decimal
  # point (1.0e-5, not 1e-5) so YAML 1.1 loaders resolve them as floats rather
  # than strings; this matches the form already used for optimizer.eps.
  rms_norm_eps: 1.0e-5
  # Mask settings (presumably for structured pruning -- confirm with consumer).
  mask:
    mask_lr: 2.0e-3
    start_sparsity: 0.4
    target_sparsity: 0.5
    # Alternative value: ["layer"]. Note: "hidden" has some problems.
    pruning_modules: ["head", "intermediate"]
    # NOTE(review): the "ba" suffix presumably denotes batches -- confirm.
    warmup_steps: 320ba



# Learning-rate scheduler section.
scheduler:
  name: cosine_with_warmup
  # NOTE(review): this is a bare integer, whereas model.mask.warmup_steps uses
  # a unit-suffixed value (320ba). Confirm whether the consumer expects a unit
  # suffix here as well.
  t_warmup: 100
  alpha_f: 0.1

# Optimizer section.
optimizer:
  name: decoupled_adamw
  # Written with an explicit mantissa decimal point (1.0e-4, not 1e-4) so that
  # YAML 1.1 loaders resolve it as a float rather than a string, consistent
  # with the form used for eps below.
  lr: 1.0e-4
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  weight_decay: 0.0


