{
    "normal": [
        "model.transformer.wte.weight"
    ],
    "no_weight_decay": [
        "model.transformer.h.0.ln_1.weight",
        "model.transformer.h.0.ln_2.weight",
        "model.transformer.h.1.ln_1.weight",
        "model.transformer.h.1.sequence_mixer.A_log",
        "model.transformer.h.1.sequence_mixer.D",
        "model.transformer.h.1.sequence_mixer.norm.weight",
        "model.transformer.h.1.ln_2.weight",
        "model.transformer.ln_f.weight",
        "model.transformer.h.0.sequence_mixer.c_attn.bias",
        "model.transformer.h.0.sequence_mixer.c_proj.bias",
        "model.transformer.h.0.mlp_block.c_fc.bias",
        "model.transformer.h.0.mlp_block.c_proj.bias",
        "model.transformer.h.1.sequence_mixer.dt_bias",
        "model.transformer.h.1.sequence_mixer.conv1d.bias",
        "model.transformer.h.1.sequence_mixer.in_proj.bias",
        "model.transformer.h.1.sequence_mixer.out_proj.bias",
        "model.transformer.h.1.mlp_block.c_fc.bias",
        "model.transformer.h.1.mlp_block.c_fc_shared.bias",
        "model.transformer.h.1.mlp_block.c_proj.bias",
        "model.transformer.h.1.mlp_block.c_proj_shared.bias"
    ],
    "mup": [
        "model.transformer.h.0.sequence_mixer.c_attn.weight",
        "model.transformer.h.0.sequence_mixer.c_proj.weight",
        "model.transformer.h.0.mlp_block.c_fc.weight",
        "model.transformer.h.0.mlp_block.c_proj.weight",
        "model.transformer.h.1.sequence_mixer.conv1d.weight",
        "model.transformer.h.1.sequence_mixer.in_proj.weight",
        "model.transformer.h.1.sequence_mixer.out_proj.weight",
        "model.transformer.h.1.mlp_block.gate.weight",
        "model.transformer.h.1.mlp_block.c_fc.weight",
        "model.transformer.h.1.mlp_block.c_fc_shared.weight",
        "model.transformer.h.1.mlp_block.c_proj.weight",
        "model.transformer.h.1.mlp_block.c_proj_shared.weight"
    ]
}
