# Top-level run settings.
seed: 42
name: llama_9m
training_mode: salad
num_total_iters: 100
num_freq: 2
gradient: coupled
# Boolean switches.
is_asyn: false
is_init: false
is_wandb: true
is_monitor: true
save_interval: 2
# NOTE(review): despite the is_ prefix this value is a float (1.0), not a
# boolean — presumably a clipping threshold; confirm against the consumer.
is_clip: 1.0
max_length: 256
# Same value as `seed` above (42).
seed_for_shuffle: 42
batch_size: 2
num_workers: 0
# Scheduler selection and its parameters.
scheduler:
  name: cosine
  params:
    # NOTE(review): warmup_steps (1000) is larger than num_total_iters (100)
    # set at the top of this file. If both count the same unit, warmup would
    # never complete — confirm how the consumer relates the two.
    warmup_steps: 1000
    min_lr_ratio: 0.1
# Optimizer selection and the parameters listed for it.
optimizer:
  name: AdamW
  params:
    lr: 0.008
    betas: [0.9, 0.95]
    eps: 0.00000001  # 1e-8
    weight_decay: 0.0
layers:
# embed_tokens: the only entry in this file with init_energy 0.45 and
# rho 0.0000003 (3e-7).
- name: embed_tokens
  params:
    energy: 0.999
    init_energy: 0.45
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      # NOTE(review): every other alpha_dict in this file uses drate 0.01 —
      # confirm that 1.0 is intended for this entry.
      drate: 1.0
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.0000003  # 3e-7
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# lm_head: init_energy 0.15 as in the self_attn entries of this file, but with
# its own rho (6e-8) instead of the 1e-6 used by the layers.* entries.
- name: lm_head
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.00000006  # 6e-8
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.0.self_attn.o_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.0.self_attn.o_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.0.self_attn.q_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.0.self_attn.q_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.0.self_attn.k_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.0.self_attn.k_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.0.self_attn.v_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.0.self_attn.v_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.0.mlp.gate_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.0.mlp.gate_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.0.mlp.down_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.0.mlp.down_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.0.mlp.up_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.0.mlp.up_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.1.self_attn.o_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.1.self_attn.o_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.1.self_attn.q_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.1.self_attn.q_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.1.self_attn.k_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.1.self_attn.k_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.1.self_attn.v_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.1.self_attn.v_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.1.mlp.gate_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.1.mlp.gate_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.1.mlp.down_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.1.mlp.down_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.1.mlp.up_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.1.mlp.up_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.2.self_attn.o_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.2.self_attn.o_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.2.self_attn.q_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.2.self_attn.q_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.2.self_attn.k_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.2.self_attn.k_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.2.self_attn.v_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.2.self_attn.v_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.2.mlp.gate_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.2.mlp.gate_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.2.mlp.down_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.2.mlp.down_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.2.mlp.up_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.2.mlp.up_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.3.self_attn.o_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.3.self_attn.o_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.3.self_attn.q_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.3.self_attn.q_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.3.self_attn.k_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.3.self_attn.k_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.3.self_attn.v_proj: values identical to the other self_attn entries
# in this file (init_energy 0.15; mlp entries use 0.35).
- name: layers.3.self_attn.v_proj
  params:
    energy: 0.999
    init_energy: 0.15
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.3.mlp.gate_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.3.mlp.gate_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.3.mlp.down_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.3.mlp.down_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
# layers.3.mlp.up_proj: values identical to the other mlp entries in this
# file (init_energy 0.35; self_attn entries use 0.15).
- name: layers.3.mlp.up_proj
  params:
    energy: 0.999
    init_energy: 0.35
    is_init: false
    iter_max: 1
    tol: 0.001
    rate_rank: 0.15
    rate_sparsity: 0.05
    alpha_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.2
      drate: 0.01
    beta_dict:
      init: 0.0
      mode: adaptive
      rate_decay: 0.002
      drate: 0.01
    rho_dict:
      rho: 0.000001  # 1e-6
      mode: fixed
      start_epoch: 2
      coeff_rho: 0.1
      coeff_rho_min: 0.01
      coeff_rho_max: 1500.0
      rho_rate: 1.0
