# Dataset selection and train/validation split settings.
dataset:
  # Identifier of the dataset to build; presumably looked up by name in the
  # training code — confirm against the dataset registry.
  name: mod_subtract_dataset
  # NOTE(review): presumably the modulus of the modular-arithmetic task —
  # confirm against the dataset implementation.
  p: 96
  # NOTE(review): presumably the fraction of examples assigned to the
  # training split (0.4 = 40%) — confirm how the remainder is used.
  frac_train: 0.4
# Model selection, transformer hyperparameters, and checkpoint loading.
model:
  # Identifier of the model to build; presumably looked up by name in the
  # training code — confirm against the model registry.
  name: grokk_model
  transformer_config:
    # NOTE(review): presumably the maximum input sequence length in tokens —
    # confirm against the dataset's example format.
    max_length: 5
    # With the values below, heads * attn_dim = 4 * 32 = 128 = hidden_dim.
    heads: 4
    hidden_dim: 128
    attn_dim: 32
    # 512 = 4 * hidden_dim.
    intermediate_dim: 512
    num_blocks: 2
    # NOTE(review): presumably how many times the stack of blocks is applied;
    # 1 would mean no repetition — confirm against the model code.
    block_repeats: 1
    dropout: 0.1
    # NOTE(review): presumably selects pre-layer-norm residual blocks —
    # confirm against the model code.
    pre_norm: true
  # null means no checkpoint path is configured; presumably the model then
  # starts from fresh initialization — confirm against the loader.
  checkpoint_path: null
  # NOTE(review): presumably forwarded to the state-dict loader's strictness
  # flag; only relevant when checkpoint_path is set — confirm.
  strict_load: true
# Optimization and evaluation-schedule settings for the training loop.
train:
  # NOTE(review): presumably the number of data-loader worker processes;
  # 0 would mean loading in the main process — confirm against the loader.
  num_workers: 0
  # Batch size.
  bsize: 512
  lr: 0.001
  weight_decay: 0.0
  # NOTE(review): presumably the optimizer's (beta1, beta2) pair — confirm
  # which optimizer the training code constructs.
  betas:
  - 0.9
  - 0.98
  # NOTE(review): presumably the number of learning-rate warmup steps —
  # confirm against the scheduler.
  warmup_steps: 10
  # NOTE(review): presumably evaluation runs every `eval_every` training
  # steps over `eval_batches` batches — confirm against the training loop.
  eval_every: 10
  eval_batches: 8
  # Written as an integer: a step count is a whole number, and integer-only
  # consumers (e.g. range() or progress-bar totals) reject floats.
  max_steps: 1000000
# Experiment-tracking settings.
wandb:
  # NOTE(review): presumably toggles Weights & Biases logging on or off —
  # confirm against the training script.
  use_wandb: true
  # NOTE(review): presumably the W&B project that runs are logged under —
  # confirm against the wandb initialization call.
  wandb_project: grokking_replica
