{
	"d_model": 1024,
	"n_layer": 20,
	"vocab_size": 5000,
	"ssm_cfg": {"d_state": 16, "d_conv": 4, "layer": "Mamba2"},
	"d_intermediate": 0,
	"attn_layer_idx": [],
	"attn_cfg": {},
	"rms_norm": true,
	"residual_in_fp32": true,
	"fused_add_norm": true,
	"pad_vocab_size_multiple": 8,
	"tie_embeddings": true,
	"layer_block": [
		"mamba",
		"mamba",
		"mamba",
		"mamba",
		"transformer",
		"mamba",
		"mamba",
		"mamba",
		"mamba",
		"transformer",
		"mamba",
		"mamba",
		"mamba",
		"mamba",
		"transformer",
		"mamba",
		"mamba",
		"mamba",
		"mamba",
		"transformer"
	],
	"transformer": {
		"architectures": [
			"GPTNeoXForCausalLM"
		],
		"bos_token_id": 0,
		"eos_token_id": 0,
		"hidden_act": "gelu",
		"hidden_size": 1024,
		"initializer_range": 0.02,
		"intermediate_size": 4096,
		"layer_norm_eps": 1e-05,
		"max_position_embeddings": 2048,
		"model_type": "gpt_neox",
		"num_attention_heads": 16,
		"num_hidden_layers": 12,
		"rotary_emb_base": 10000,
		"rotary_pct": 0.25,
		"tie_word_embeddings": false,
		"torch_dtype": "float16",
		"transformers_version": "4.24.0",
		"use_cache": true,
		"use_parallel_residual": true,
		"vocab_size": 1000,
		"_attn_implementation": "flash_attention_2",
		"rms_norm": true
	}
}