# Top-level values shared across sections.
# max_seq_len is referenced from model.max_seq_len via ${max_seq_len}.
max_seq_len: 4096
# NOTE(review): presumably the seed for run-wide random number generation; confirm against the consumer.
global_seed: 17

# Model section: architecture hyperparameters plus the pruning-mask settings.
# Lines marked NOTE(review) are assumptions to confirm against the code that
# consumes this file.
model:
  name: llama2_13b
  # NOTE(review): `...` looks like a placeholder for the real checkpoint and
  # tokenizer locations; confirm these are filled in or overridden before use.
  path: ...
  tokenizer_path: ...
  init_device: "cpu"
  d_model: 5120
  # Same value as n_heads below.
  n_kv_heads: 40
  n_heads: 40
  n_layers: 40
  intermediate_size: 13824
  # Interpolation of the top-level max_seq_len key. This needs an
  # interpolation-aware loader (OmegaConf-style syntax); a plain YAML loader
  # leaves it as the literal string "${max_seq_len}".
  max_seq_len: ${max_seq_len}
  vocab_size: 32000
  attn_pdrop: 0.0
  attn_impl: norm
  # Written with an explicit decimal point so YAML 1.1 loaders (e.g. PyYAML)
  # resolve it as a float; a bare `1e-5` is read as a string there.
  rms_norm_eps: 1.0e-5
  multiple_of: 256
  max_batch_size: 8
  mask:
    # Explicit decimal point for the same YAML 1.1 float-resolution reason.
    mask_lr: 5.0e-3
    # NOTE(review): start_sparsity equals target_sparsity; confirm intended.
    start_sparsity: 0.5
    target_sparsity: 0.5
    pruning_modules: ["head", "intermediate"]

# Learning-rate scheduler selection and its parameters.
scheduler:
  name: cosine
  # NOTE(review): bare integer with no unit suffix; confirm the consumer
  # accepts this form (some trainers expect a time string here).
  t_warmup: 0
  # NOTE(review): presumably the final LR multiplier of the schedule; confirm.
  alpha_f: 0.1

# Optimizer selection and its hyperparameters.
optimizer:
  name: decoupled_adamw
  # Two-element sequence of beta coefficients.
  betas:
  - 0.9
  - 0.95
  eps: 1.0e-08
  # Written with an explicit decimal point, matching `eps` above, so YAML 1.1
  # loaders (e.g. PyYAML) resolve it as a float; a bare `1e-4` is read as a
  # string there.
  weight_decay: 1.0e-4


