{
  "model": {
    "Feature": 960,
    "ATTN Feature": 960,
    "FFN Feature": 3840,
    "Head Count": 12,
    "Decoder Count": 16,
    "Init Scalar": 0.03125,
    "RoPE Base": 100000,
    "Max Length": 8192
  },
  "pretrain": {
    "Peak LR": 0.003,
    "Accumulation": 16,
    "Grads Clipping": 1,
    "Weight Decay": 0.00001,
    "Total Steps": 40000,
    "Warmup Steps": 4000,
    "Anneal Steps": 8000,
    "Batch Size": 256,
    "Context Length": 512
  },
  "sst": {
    "Peak LR": 0.00015,
    "Accumulation": 16,
    "Grads Clipping": 1,
    "Weight Decay": 0.00001,
    "Total Steps": 12000,
    "Warmup Steps": 4000,
    "Batch Size": 128,
    "Context Length": 1024
  }
}