# Model hyperparameter presets, keyed by a short run name.
#
# Key scheme for the small models: "l<n_layers>_h<n_heads>[_attn][_wd]".
#   * "_attn" marks the attention-only variant (attn_only=True).
#   * "_wd" entries carry exactly the same fields as their base entry in
#     this table.
#     NOTE(review): presumably the "_wd" suffix selects weight decay
#     somewhere outside this table -- confirm against the training code.


def _small_model_config(n_layers, n_heads, attn_only):
    """Return a fresh preset dict for one 512-wide GELU model.

    Args:
        n_layers: Number of layers (2, 4 or 8 in the table below).
        n_heads: Number of attention heads (8 or 16 in the table below).
        attn_only: Value stored under ``attn_only``; also appends
            ``_Attn_only`` to ``model_name`` when true.

    Returns:
        A new dict on every call, so entries never share state.
    """
    d_model = 512
    model_name = f"GELU_{n_layers}L{d_model}W_C4_Code_{n_heads}H"
    if attn_only:
        model_name += "_Attn_only"
    return dict(
        n_layers=n_layers,
        d_model=d_model,
        n_heads=n_heads,
        # Heads split the model width evenly: 8 -> 64, 16 -> 32.
        d_head=d_model // n_heads,
        d_mlp=2048,
        n_ctx=1024,
        act_fn="gelu",
        d_vocab=48262,
        tokenizer_name="NeelNanda/gpt-neox-tokenizer-digits",
        model_name=model_name,
        attn_only=attn_only,
        lr=1e-4,
    )


def _gpt2_config():
    """Return a fresh preset dict for the 12-layer ``gpt2`` entry."""
    return dict(
        n_layers=12,
        d_model=768,
        n_heads=12,
        d_head=64,
        d_mlp=3072,
        n_ctx=1024,
        act_fn="gelu",
        d_vocab=50257,
        tokenizer_name="gpt2",
        model_name="gpt2",
        attn_only=False,
        lr=5e-5,
    )


def _build_configs():
    """Assemble the preset table in its canonical insertion order.

    Order: all base presets (8-head models, then 16-head models, each by
    increasing depth with the MLP variant before the attention-only one),
    then ``gpt2``; afterwards the same sequence again with ``_wd`` appended
    to every key.
    """
    table = {}
    for suffix in ("", "_wd"):
        for n_heads in (8, 16):
            for n_layers in (2, 4, 8):
                for attn_only in (False, True):
                    attn_tag = "_attn" if attn_only else ""
                    key = f"l{n_layers}_h{n_heads}{attn_tag}{suffix}"
                    table[key] = _small_model_config(
                        n_layers, n_heads, attn_only
                    )
        table["gpt2" + suffix] = _gpt2_config()
    return table


CONFIGS = _build_configs()

