# Dataset selection and train/held-out split settings.
dataset:
  # Dataset identifier; presumably looked up by name in the training
  # code — verify against the loader.
  name: mod_division_dataset
  # NOTE(review): looks like the modulus of the modular-division task
  # (97 is prime) — confirm against the dataset implementation.
  p: 97
  # Presumably the fraction of examples used for training, with the
  # remainder held out — TODO confirm.
  frac_train: 0.3
# Model selection, transformer hyperparameters, and checkpoint loading.
model:
  # Model identifier; presumably looked up by name in the training
  # code — verify against the model factory.
  name: grokk_model
  # Architecture hyperparameters for the transformer.
  transformer_config:
    # Presumably the maximum input sequence length in tokens — confirm.
    max_length: 5
    # heads * attn_dim (4 * 32) equals hidden_dim (128); presumably
    # attn_dim is the per-head width — confirm before changing only one
    # of these three values.
    heads: 4
    hidden_dim: 128
    attn_dim: 32
    # Presumably the feed-forward inner width (4x hidden_dim) — confirm.
    intermediate_dim: 512
    num_blocks: 2
    # NOTE(review): presumably how many times the block stack is
    # re-applied; 1 would mean no repetition — confirm.
    block_repeats: 1
    dropout: 0.1
    # Presumably selects pre-layer-norm over post-layer-norm — confirm.
    pre_norm: true
  # null: no checkpoint path is configured.
  checkpoint_path: null
  # Presumably forwarded to the checkpoint loader's strict-key check;
  # likely unused while checkpoint_path is null — confirm.
  strict_load: true
# Optimisation and training-loop settings.
train:
  # Presumably the number of data-loader worker processes; 0 usually
  # means loading happens in the main process — confirm.
  num_workers: 0
  # Batch size.
  bsize: 512
  # Learning rate.
  lr: 0.001
  weight_decay: 0.0
  # Presumably the (beta1, beta2) coefficients of an Adam-style
  # optimiser — confirm against the optimiser construction.
  betas:
  - 0.9
  - 0.98
  # Presumably the number of learning-rate warmup steps — confirm.
  warmup_steps: 10
  # Presumably: evaluate every 100 training steps, using 8 batches per
  # evaluation — confirm.
  eval_every: 100
  eval_batches: 8
  # Step budget, written as an integer because it is a count. The
  # value is numerically unchanged from the former 1000000.0; an
  # integer also works in integer-only contexts (e.g. range()).
  max_steps: 1000000
  # Random seed.
  seed: 0
# Experiment-tracking settings (Weights & Biases, judging by the key
# names — confirm against the logging code).
wandb:
  # Toggle for wandb logging; set to false to disable it, assuming the
  # training code honours this flag.
  use_wandb: true
  # Presumably the wandb project that runs are logged under — confirm.
  wandb_project: grokking_replica
# Settings for the "wd" diagnostic.
# NOTE(review): the abbreviation "wd" is not expanded anywhere in this
# file, and it is a separate section from train.weight_decay — confirm
# what it stands for in the consuming code before editing.
wd:
  enabled: true
  # NOTE(review): meanings of num_pairs, resolution and max_degree are
  # not visible from this file — document them once confirmed.
  num_pairs: 200
  resolution: 64
  max_degree: 40
  # Presumably: log this diagnostic on every evaluation (1 = each one;
  # see train.eval_every) — confirm.
  log_every_eval: 1
  # pca_k presumably only takes effect when use_pca is true — confirm.
  use_pca: false
  pca_k: 10
# Settings for the sharpness diagnostic.
sharpness:
  enabled: true
  # NOTE(review): presumably the perturbation radius used when
  # measuring sharpness (SAM-style rho) — confirm.
  rho: 0.05
  # Presumably: log this diagnostic on every evaluation (1 = each one;
  # see train.eval_every) — confirm.
  log_every_eval: 1
