{
    "model": {
        "n_layers": 3,
        "d_token": 64,
        "n_heads": 8,
        "attention_dropout": 0.2,
        "residual_dropout": 0.0,
        "activation": "relu",
        "prenormalization": false,
        "initialization": "kaiming",
        "kv_compression": null,
        "kv_compression_sharing": null
    },
    "training": {
        "lr": 0.0001,
        "weight_decay": 1e-05
    }
}