---
# Hyperparameter configuration: algorithm "PMET" applied to model "gptj-6B".
# NOTE(review): the exact meaning of every value is defined by the code that
# loads this file; the comments below only describe what is visible here and
# mark assumptions that should be confirmed against that loader.

# --- Run identification and environment ---
alg_name: "PMET"
model_name: "gptj-6B"
stats_dir: "./data/status"
device: 0

# --- Layer / token selection ---
layers: [3, 4, 5, 6, 7, 8]
clamp_norm_factor: 0.75
layer_selection: "all"
fact_token: "subject_last"

# --- "v_*" optimisation settings ---
v_num_grad_steps: 30
# Written as 0.2 instead of 2e-1: YAML 1.1 resolvers (e.g. PyYAML) only
# recognise exponent notation as a float when the mantissa contains a decimal
# point, so `2e-1` would be loaded as the string "2e-1".
v_lr: 0.2
v_loss_layer: 27
v_weight_decay: 0.5
kl_factor: 1

# --- "mom2_*" settings ---
mom2_adjustment: true
mom2_update_weight: 15000
mom2_dataset: "wikipedia"
mom2_n_samples: 100000
mom2_dtype: "float32"

# --- Module-name templates ---
# `{}` is a placeholder filled in by the consumer (presumably with a layer
# index from `layers` — confirm against the loader).
# NOTE(review): both the singular `rewrite_module_tmp` and the plural
# `rewrite_module_tmps` are present; keep both until it is confirmed which
# one the consumer reads.
rewrite_module_tmp: "transformer.h.{}.mlp.fc_out"
rewrite_module_tmps: ["transformer.h.{}.mlp.fc_out"]
layer_module_tmp: "transformer.h.{}"
mlp_module_tmp: "transformer.h.{}.mlp.fc_out"
attn_module_tmp: "transformer.h.{}.attn.out_proj"
ln_f_module: "transformer.ln_f"
lm_head_module: "lm_head"

# --- Parallelism ---
# This key was previously declared twice with the same value; duplicate keys
# are invalid YAML and are silently resolved as last-wins by most parsers.
model_parallel: false

# --- Loss weighting ---
nll_loss_factor: 1
attn_W_loss_weight: 0
# NOTE(review): "droupout" looks like a misspelling of "dropout". The key is
# kept verbatim because the consuming code looks it up by this exact name.
attnW_droupout: 0
attn_K_loss_weight: 0
last_hid_restrain_weight: 0
# NOTE(review): a bare `None` is NOT null in YAML — it loads as the string
# "None". The two values below are quoted to make that explicit without
# changing what the consumer receives. If a real null is intended, switch
# them to `null` after confirming how the loader tests these fields.
last_hid_restrain_layers: "None"
high_attn_range: "None"
loss_type: "KL"