{
    "normal": [
        "model.transformer.wte.weight"
    ],
    "no_weight_decay": [
        "model.transformer.h.0.ln_1.weight",
        "model.transformer.h.0.ln_2.weight",
        "model.transformer.h.1.ln_1.weight",
        "model.transformer.h.1.ln_2.weight",
        "model.transformer.h.2.ln_1.weight",
        "model.transformer.h.2.ln_2.weight",
        "model.transformer.h.3.ln_1.weight",
        "model.transformer.h.3.ln_2.weight",
        "model.transformer.ln_f.weight",
        "model.transformer.h.0.attn.c_attn.bias",
        "model.transformer.h.0.attn.c_proj.bias",
        "model.transformer.h.0.moe.c_fc.bias",
        "model.transformer.h.0.moe.c_proj.bias",
        "model.transformer.h.0.mlp.c_fc.bias",
        "model.transformer.h.0.mlp.c_proj.bias",
        "model.transformer.h.1.attn.c_attn.bias",
        "model.transformer.h.1.attn.c_proj.bias",
        "model.transformer.h.1.moe.c_fc.bias",
        "model.transformer.h.1.moe.c_proj.bias",
        "model.transformer.h.1.mlp.c_fc.bias",
        "model.transformer.h.1.mlp.c_proj.bias",
        "model.transformer.h.2.attn.c_attn.bias",
        "model.transformer.h.2.attn.c_proj.bias",
        "model.transformer.h.2.moe.c_fc.bias",
        "model.transformer.h.2.moe.c_proj.bias",
        "model.transformer.h.2.mlp.c_fc.bias",
        "model.transformer.h.2.mlp.c_proj.bias",
        "model.transformer.h.3.attn.c_attn.bias",
        "model.transformer.h.3.attn.c_proj.bias",
        "model.transformer.h.3.moe.c_fc.bias",
        "model.transformer.h.3.moe.c_proj.bias",
        "model.transformer.h.3.mlp.c_fc.bias",
        "model.transformer.h.3.mlp.c_proj.bias"
    ],
    "mup": [
        "model.transformer.h.0.attn.c_attn.weight",
        "model.transformer.h.0.attn.c_proj.weight",
        "model.transformer.h.0.moe.gate.weight",
        "model.transformer.h.0.moe.c_fc.weight",
        "model.transformer.h.0.moe.c_proj.weight",
        "model.transformer.h.0.mlp.c_fc.weight",
        "model.transformer.h.0.mlp.c_proj.weight",
        "model.transformer.h.1.attn.c_attn.weight",
        "model.transformer.h.1.attn.c_proj.weight",
        "model.transformer.h.1.moe.gate.weight",
        "model.transformer.h.1.moe.c_fc.weight",
        "model.transformer.h.1.moe.c_proj.weight",
        "model.transformer.h.1.mlp.c_fc.weight",
        "model.transformer.h.1.mlp.c_proj.weight",
        "model.transformer.h.2.attn.c_attn.weight",
        "model.transformer.h.2.attn.c_proj.weight",
        "model.transformer.h.2.moe.gate.weight",
        "model.transformer.h.2.moe.c_fc.weight",
        "model.transformer.h.2.moe.c_proj.weight",
        "model.transformer.h.2.mlp.c_fc.weight",
        "model.transformer.h.2.mlp.c_proj.weight",
        "model.transformer.h.3.attn.c_attn.weight",
        "model.transformer.h.3.attn.c_proj.weight",
        "model.transformer.h.3.moe.gate.weight",
        "model.transformer.h.3.moe.c_fc.weight",
        "model.transformer.h.3.moe.c_proj.weight",
        "model.transformer.h.3.mlp.c_fc.weight",
        "model.transformer.h.3.mlp.c_proj.weight"
    ]
}
