{
  "model": {
    "name": "minigpt-linear",
    "architecture": "linear",
    "maxlen": 1024,
    "vocab_size": 50257,
    "embed_dim": 1024,
    "num_heads": 16,
    "feed_forward_dim": 1024,
    "num_transformer_blocks": 24,
    "dropout_rate": 0.1
  },
  "training": {
    "batch_size": 64,
    "precision": "bfloat16",
    "learning_rate": 0.003,
    "max_tokens_to_process": 1000000000,
    "eval_interval": 10000,
    "eval_steps": 1000,
    "val_set_size": 10000,
    "checkpoint_interval": 10000,
    "optimizer": "novograd",
    "lr_scheduler": "cosine",
    "lr_scheduler_alpha": 0.1,
    "lr_scheduler_warmup_steps": 1000,
    "momentum": 0.9,
    "weight_decay": 0.01
  },
  "data": {
    "dataset_name": "HuggingFaceFW/fineweb",
    "split": "train",
    "streaming": true,
    "tokenizer_name": "gpt2"
  },
  "logging": {
    "wandb_project": "large-model-training",
    "checkpoint_dir": "./checkpoints",
    "log_level": "INFO",
    "log_file": null
  },
  "device": {
    "mesh_shape": [2, 4],
    "auto_detect_mesh": false
  }
}