# NOTE(review): presumably the schema/format version of this config file as
# written by the W&B client - confirm before changing.
wandb_version: 1

# Run-environment metadata. The section is a mapping with a `desc` field
# (null here) and a `value` payload holding the actual entries.
# NOTE(review): the leading underscore suggests this section is written by the
# W&B client rather than by hand - confirm before editing.
_wandb:
  desc: null
  value:
    # Contains two dots, so YAML loads it as a string, not a float.
    cli_version: 0.10.31
    framework: torch
    is_jupyter_run: false
    is_kaggle_kernel: false
    # Contains two dots, so YAML loads it as a string, not a float.
    python_version: 3.8.10
    # Mapping keyed by integers; the meaning of each key is not defined in this
    # file. Entries 4 and 5 repeat python_version and cli_version from above.
    # NOTE(review): presumably internal telemetry codes - confirm.
    t:
      1:
      - 1
      4: 3.8.10
      5: 0.10.31
      8:
      - 5
# Dataset settings, wrapped in a `desc` (null here) / `value` pair.
dataset:
  desc: null
  value:
    # NOTE(review): presumably the fraction of examples assigned to the
    # training split (0.4 here) - confirm against the data loader.
    frac_train: 0.4
    name: mod_subtract_dataset
    # NOTE(review): presumably the modulus of the modular-subtraction task
    # named above - confirm.
    p: 96
# Model settings, wrapped in a `desc` (null here) / `value` pair.
model:
  desc: null
  value:
    # null: no checkpoint path is set in this config.
    checkpoint_path: null
    name: grokk_model
    # NOTE(review): presumably only consulted when checkpoint_path is set -
    # confirm against the loading code.
    strict_load: true
    transformer_config:
      # With the values below, attn_dim * heads equals hidden_dim
      # (32 * 4 = 128) and intermediate_dim is 4 * hidden_dim (512).
      # NOTE(review): whether the model requires these relations is not
      # visible here - confirm before changing one value in isolation.
      attn_dim: 32
      block_repeats: 1
      dropout: 0.1
      heads: 4
      hidden_dim: 128
      intermediate_dim: 512
      max_length: 5
      num_blocks: 2
      pre_norm: true
# Training settings, wrapped in a `desc` (null here) / `value` pair.
train:
  desc: null
  value:
    # Two-element list (0.9, 0.98).
    # NOTE(review): presumably Adam-style beta coefficients - confirm.
    betas:
    - 0.9
    - 0.98
    bsize: 512
    eval_batches: 8
    eval_every: 10
    lr: 0.001
    # Stored as a float (1000000.0), not an integer.
    # NOTE(review): a consumer that needs an integer step count would have to
    # cast this value - verify against the training loop.
    max_steps: 1000000.0
    num_workers: 0
    warmup_steps: 10
    # NOTE(review): 0.0 presumably means weight decay is disabled for this
    # run - confirm.
    weight_decay: 0.0
# Experiment-logging settings, wrapped in a `desc` (null here) / `value` pair.
wandb:
  desc: null
  value:
    # NOTE(review): presumably toggles W&B logging for the run - confirm.
    use_wandb: true
    # NOTE(review): presumably the W&B project the run is logged under -
    # confirm.
    wandb_project: grokking_replica
