# Global run settings.
# Seed value for the run — presumably used to seed the RNGs for
# reproducibility; confirm against the code that loads this file.
seed: 555
# Presumably toggles Weights & Biases logging (see the `wandb` section at the
# bottom of this file) — confirm against the consumer.
use_wandb: true

# Per-layer compression settings.
# Each entry is keyed by a label (layer2/layer5/layer8) and carries its own
# `layer_idx`. All three entries below use identical settings; only the layer
# index differs, so keep them in sync when changing a value.
#
# Field notes (semantics are defined by the consuming code — confirm there):
#   forward / backward           compressor name for each direction (here: topk)
#   forward-EF / backward-EF     presumably enables error feedback (EF)
#   *-EF-method                  error-feedback variant; EF21 presumably refers
#                                to the EF21 algorithm — TODO confirm
#   *-params.topk                presumably the fraction of entries kept (0.5 =
#                                50%, matching "top50" in the wandb run name)
compression_config:
  layer2:
    layer_idx: 2
    # Forward-direction compression.
    forward: topk
    forward-EF: true
    forward-EF-method: EF21
    forward-params:
      topk: 0.5
    # Backward-direction compression.
    backward: topk
    backward-EF: true
    backward-EF-method: EF21
    backward-params:
      topk: 0.5
  # Same settings as layer2, applied at layer index 5.
  layer5:
    layer_idx: 5
    forward: topk
    forward-EF: true
    forward-EF-method: EF21
    forward-params:
      topk: 0.5
    backward: topk
    backward-EF: true
    backward-EF-method: EF21
    backward-params:
      topk: 0.5
  # Same settings as layer2, applied at layer index 8.
  layer8:
    layer_idx: 8
    forward: topk
    forward-EF: true
    forward-EF-method: EF21
    forward-params:
      topk: 0.5
    backward: topk
    backward-EF: true
    backward-EF-method: EF21
    backward-params:
      topk: 0.5

# Training run settings.
training:
  # Default is 5e-5. Keep the decimal form: a bare `5e-5` (no dot in the
  # mantissa) is loaded as a string, not a float, by YAML 1.1 loaders such as
  # PyYAML.
  learning_rate: 0.00005
  model: gpt2
  dataset: gsm8k
  epochs: 4
  batch_size: 8
  # Presumably the token sequence length per sample — confirm in the consumer.
  block_size: 1024
  gradient_checkpointing: false
  # Whether to use AQ-SGD. Note: use this together with
  # `forward-method = AQSGD` in the compression section above.
  # NOTE(review): no key in this file is literally named `forward-method`;
  # this presumably refers to the `forward` key under compression_config —
  # confirm.
  aq_sgd: false
  # Lazy sampling toggle and its parameters. `p_t` is presumably a
  # probability and `schedule` how it evolves over training (constant here) —
  # confirm against the consumer.
  lazy_sampling: true
  lazy_sampling_params:
    schedule: constant
    p_t: 0.5

# Weights & Biases run metadata.
wandb:
  project: gpt2-test  # alternative: gpt2-compression (presumably the non-test project — confirm)
  # Run name; appears to encode the settings in this file (forward/backward
  # top-k 50% with EF21, lazy sampling with p_t = 0.5). Update it when those
  # values change.
  name: fw-top50-ef21-bw-top50-ef21-topk-lazy-pt05-test
