# Top-level run settings. The code that consumes these keys is not part of
# this file, so notes on meaning below are reviewer assumptions to confirm.
seed: 42
num_total_iters: 200
num_freq: 2
gradient: coupled
is_asyn: false
# A key named `is_init` also appears inside every entry under `layers`.
is_init: false
# NOTE(review): explicitly tagged as a float (1.0) although the `is_` prefix
# reads like a boolean flag — presumably a clipping threshold; confirm.
is_clip: !!float '1'
max_length: 32
# Same value as `seed` above.
seed_for_shuffle: 42
batch_size: 8
num_workers: 0
# Scheduler selection: `name` picks the schedule and `params` holds its
# arguments (presumably forwarded to the scheduler as-is — TODO confirm).
scheduler:
  name: cosine
  params:
    warmup_steps: 20
    # presumably the floor of the schedule as a fraction of `optimizer.params.lr`
    # — TODO confirm against the scheduler implementation
    min_lr_ratio: 0.1
# Optimizer selection: `name` picks the optimizer and `params` holds its
# arguments (presumably forwarded to the optimizer as-is — TODO confirm).
optimizer:
  name: AdamW
  params:
    lr: 0.003
    # Two-element sequence of floats.
    betas:
    - 0.9
    - 0.95
    # Equals 1e-8. Kept in plain decimal notation: a bare `1e-8` is read as a
    # string by YAML 1.1 loaders such as PyYAML.
    eps: 0.00000001
    weight_decay: 0.1
# Per-layer settings: a sequence of `name` + `params` entries. All entries
# visible in this file carry the same set of keys; only a few values differ.
layers:
# Entry for `embed_tokens`.
# NOTE(review): this is the only entry where `init_energy` (0.45) and
# `rate_rank` (0.2) differ from each other — confirm this is intended.
- name: embed_tokens
  params:
    energy: 0.999
    init_energy: 0.45
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.2
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      rate_decay: 0.02
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
# Entry for `lm_head`. Its `params` values match those of the
# `layers.0.self_attn.q_proj` entry in this file.
- name: lm_head
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      rate_decay: 0.02
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
# Entry for `layers.0.self_attn.o_proj`. Matches the q_proj/k_proj/v_proj
# entries in this file except for `beta_dict.rate_decay` (see note below).
- name: layers.0.self_attn.o_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      # NOTE(review): 0.2 here, while every other `rate_decay` in this file
      # (alpha_dict and beta_dict, all layers) is 0.02 — confirm this tenfold
      # difference is intentional and not a typo.
      rate_decay: 0.2
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
# Entry for `layers.0.self_attn.q_proj`. Its `params` values are identical to
# those of the k_proj and v_proj entries in this file.
- name: layers.0.self_attn.q_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      rate_decay: 0.02
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
# Entry for `layers.0.self_attn.k_proj`. Its `params` values are identical to
# those of the q_proj and v_proj entries in this file.
- name: layers.0.self_attn.k_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      rate_decay: 0.02
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
# Entry for `layers.0.self_attn.v_proj`. Its `params` values are identical to
# those of the q_proj and k_proj entries in this file.
- name: layers.0.self_attn.v_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      rate_decay: 0.02
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
# Entry for `layers.0.mlp.gate_proj`. `init_energy` and `rate_rank` are 0.35
# here, versus 0.15 in the self_attn and lm_head entries of this file.
- name: layers.0.mlp.gate_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.35
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      rate_decay: 0.02
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
# Entry for `layers.0.mlp.down_proj`. `init_energy` and `rate_rank` are 0.35
# here, versus 0.15 in the self_attn and lm_head entries of this file.
- name: layers.0.mlp.down_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.35
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      rate_decay: 0.02
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
# Entry for `layers.0.mlp.up_proj`. `init_energy` and `rate_rank` are 0.35
# here, versus 0.15 in the self_attn and lm_head entries of this file.
- name: layers.0.mlp.up_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.35
    rate_sparsity: 0.05
    alpha_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: adaptive
      rate_decay: 0.02
    beta_dict:
      # Equals 1e-7.
      init: 0.0000001
      mode: hard_cut
      rate_decay: 0.02
    rho_dict:
      # Equals 1e-5.
      rho: 0.00001
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      # The explicit `!!float` tag makes these load as 1500.0 and 1.0, not ints.
      coeff_rho_max: !!float '1500'
      rho_rate: !!float '1'
