{
    "model_type": "FlashSTU",
    "dim": 896,
    "num_heads": 8,
    "num_layers": 12,
    "seq_len": 8192,
    "weight_tying": true,
    "window_size": 1024,
    "vocab_size": 200064,
    "mlp_scale": 12,
    "bias": false,
    "dropout": 0.1,
    "num_eigh": 24,
    "use_hankel_L": false,
    "num_epochs": 1,
    "global_bsz": 524288,
    "bsz": 8,
    "warmup_steps": 19070,
    "eval_period": 50,
    "save_period": 19000,
    "max_lr": 5.0e-3,
    "min_lr": 3.0e-5,
    "max_norm": 1.0,
    "dilation": 2,
    "fsdp": true,
    "ddp": false,
    "mixed_precision": true,
    "torch_dtype": "bfloat16",
    "cpu_offload": false,
    "sharding_strategy": "full_shard",
    "state_dict_type": "full",
    "auto_wrap_policy": "partial",
    "backward_prefetch": "backward_pre",
    "forward_prefetch": false,
    "sync_module_states": true,
    "use_orig_params": true,
    "device_id": null,
    "precision": {
      "param": "bfloat16",
      "reduce": "bfloat16",
      "buffer": "bfloat16"
    },
    "fsdp_modules": [
      "STULayer",
      "AttentionLayer"
    ],
    "use_activation_checkpointing": true,
    "use_flash_fft": true,
    "use_approx": true,
    "use_attn": false,
    "softcap": 50.0,
    "theta": 10000.0,
    "use_alibi": false,
    "torch_compile": false
}
  