# Top-level run settings.
# NOTE(review): seed is presumably used to seed the RNGs for reproducibility;
# confirm in the training script.
seed: 555
# NOTE(review): presumably toggles Weights & Biases logging; the project and
# run name are set under the top-level `wandb` key of this file.
use_wandb: true

# Per-layer compression settings. Each child key (layer4, layer8) holds one
# entry: a layer_idx plus, for each of the forward and backward directions,
# a compressor name, an EF switch, an EF method and a params mapping.
# NOTE(review): "EF" presumably stands for error feedback, and forward /
# backward presumably refer to activations / gradients at the given layer;
# confirm against the consuming code.
# NOTE(review): every direction uses natural_compress, yet forward-params
# sets `topk` while backward-params sets `k`; confirm which of these keys
# the compressor actually reads.
compression_config:
  # Entry for layer_idx 4.
  layer4:
    layer_idx: 4
    forward: natural_compress
    forward-EF: true
    forward-EF-method: EF21
    forward-params:
      topk: 0.5
      # Disabled alternative parameter:
      # k: 16
    backward: natural_compress
    backward-EF: true
    backward-EF-method: EF21
    backward-params:
      # Disabled alternative parameter:
      # topk: 0.5
      k: 16
  # Entry for layer_idx 8; apart from layer_idx, the settings are identical
  # to those of layer4.
  layer8:
    layer_idx: 8
    forward: natural_compress
    forward-EF: true
    forward-EF-method: EF21
    forward-params:
      topk: 0.5
      # Disabled alternative parameter:
      # k: 16
    backward: natural_compress
    backward-EF: true
    backward-EF-method: EF21
    backward-params:
      # Disabled alternative parameter:
      # topk: 0.5
      k: 16

# Training-run settings: optimizer step size, model/dataset names, batch
# geometry and the optional large-batch / lazy-sampling features.
training:
  # Keep this in plain decimal form: YAML 1.1 loaders such as PyYAML read an
  # exponent form without a decimal point (e.g. 6e-4) as a string, not a float.
  learning_rate: 0.0006
  model: gpt2
  dataset: openwebtext
  epochs: 1
  batch_size: 8
  # NOTE(review): presumably the sequence length in tokens; confirm.
  block_size: 1024
  gradient_checkpointing: true
  aq_sgd: false
  # large_batch is enabled; k is the only parameter supplied for it here.
  # NOTE(review): the meaning of k is not visible in this file; confirm
  # against the consuming code.
  large_batch: true
  large_batch_params:
    k: 0.6
  # lazy_sampling is disabled.
  # NOTE(review): lazy_sampling_params are presumably ignored while
  # lazy_sampling is false; confirm.
  lazy_sampling: false
  lazy_sampling_params:
    schedule: constant
    p_t: 0.999

# Output location. The value is a quoted string, so the leading "~" is a
# literal character as far as YAML is concerned (not a null).
# NOTE(review): confirm the consumer expands "~" to the home directory;
# otherwise a directory literally named "~" would be used.
output_dir: "~/pretrain_final/p06"
# Project and run name.
# NOTE(review): presumably passed to Weights & Biases and only used when
# use_wandb is true; confirm.
wandb:
  project: final-gpt2-train-from-scratch-autodl
  name: p06-restart