{
  "dataset": "240-mammalian",
  "task": "glm",
  "model_type": "jamba",
  "model_config": {
    "hidden_size": 512,
    "intermediate_size": 512,
    "num_hidden_layers": 8,
    "num_attention_heads": 8,
    "num_key_value_heads": 8,
    "use_mamba_kernels": true,
    "mamba_d_state": 16,
    "mamba_d_conv": 4,
    "mamba_expand": 2,
    "mamba_dt_rank": "auto",
    "output_router_logits": true,
    "use_cache": false
  },
  "aux_loss_weight": 0.001,
  "epochs": 100000,
  "max_tokens": 20000,
  "max_len": 2048,
  "pad_to_multiple_of": 8,
  "max_batch_size": 6000,
  "bucket_size": 1000,
  "lr": 1e-4,
  "warmup_steps": 16000,
  "flip_prob": 0.5,
  "activation_checkpointing": 0
}