{
  "model": {
    "name": "minigpt-linear",
    "architecture": "linear",
    "maxlen": 1024,
    "vocab_size": 50257,
    "embed_dim": 768,
    "num_heads": 12,
    "feed_forward_dim": 768,
    "num_transformer_blocks": 12,
    "dropout_rate": 0.1
  },
  "training": {
    "batch_size": 32,
    "learning_rate": 0.002,
    "max_tokens_to_process": 1000000000,
    "eval_interval": 2000,
    "eval_steps": 1000,
    "val_set_size": 20000,
    "checkpoint_interval": 10000,
    "optimizer": "adamw",
    "lr_scheduler": "cosine",
    "lr_scheduler_alpha": 0.1,
    "lr_scheduler_warmup_steps": null,
    "momentum": 0.9,
    "weight_decay": 0.01
  },
  "data": {
    "dataset_name": "HuggingFaceFW/fineweb",
    "split": "train",
    "streaming": true,
    "tokenizer_name": "gpt2"
  },
  "logging": {
    "wandb_project": "aether-cosine-training",
    "checkpoint_dir": "./checkpoints",
    "log_level": "INFO",
    "log_file": null
  }
}