# Run metadata stored under the `_wandb` key: client version, a snapshot of the
# launch environment, and telemetry codes. Looks machine-generated — presumably
# not meant to be edited by hand (TODO confirm).
_wandb:
    value:
        cli_version: 0.24.0
        # Environment snapshot, keyed by the same id that appears as `writerId` below.
        e:
            lth835r39ezfa8k9aer1jm62klihjamm:
                # Command-line overrides the program was launched with; the same
                # values appear in the `dataset`, `train` and `wd` sections of this file.
                args:
                    - dataset=mod_division_dataset
                    - dataset.p=97
                    - dataset.frac_train=0.3
                    - wd.use_pca=false
                    - wd.num_pairs=200
                    - train.eval_every=100
                cpu_count: 32
                cpu_count_logical: 64
                cudaVersion: "12.6"
                # Sizes are stored as quoted strings; presumably bytes — TODO confirm.
                disk:
                    /:
                        total: "942910771200"
                        used: "425209159680"
                # NOTE(review): personal email address — consider redacting before
                # sharing this file outside the team.
                email: lixx22@mails.tsinghua.edu.cn
                executable: /data/home/xxx/anaconda3/envs/xxx_pytorch/bin/python
                git:
                    commit: afcebbace2522d82f3837f5ff79cb147f793fd6e
                    remote: https://github.com/Sea-Snell/grokking.git
                # `gpu` names only the A100, but the `gpu_nvidia` list below is
                # mixed: three A100 entries followed by three V100 entries
                # (six in total, matching `gpu_count`).
                gpu: NVIDIA A100 80GB PCIe
                gpu_count: 6
                gpu_nvidia:
                    - architecture: Ampere
                      cudaCores: 6912
                      # 85899345920 = 80 * 2^30
                      memoryTotal: "85899345920"
                      name: NVIDIA A100 80GB PCIe
                      uuid: GPU-69b6fe5d-9ac0-132b-73fc-4ccedc445c80
                    - architecture: Ampere
                      cudaCores: 6912
                      memoryTotal: "85899345920"
                      name: NVIDIA A100 80GB PCIe
                      uuid: GPU-d71ced92-168e-89f4-b010-d898d86d4647
                    - architecture: Ampere
                      cudaCores: 6912
                      memoryTotal: "85899345920"
                      name: NVIDIA A100 80GB PCIe
                      uuid: GPU-17e1e0b7-1cd1-7776-1c30-a29affe4a6b9
                    - architecture: Volta
                      cudaCores: 5120
                      # 34359738368 = 32 * 2^30
                      memoryTotal: "34359738368"
                      name: Tesla V100-SXM2-32GB
                      uuid: GPU-08f9551c-2d2d-bd14-93d4-d47ae3910d65
                    - architecture: Volta
                      cudaCores: 5120
                      memoryTotal: "34359738368"
                      name: Tesla V100-SXM2-32GB
                      uuid: GPU-bb8592d4-5b73-18b7-7ca4-aaa24c65bb71
                    - architecture: Volta
                      cudaCores: 5120
                      memoryTotal: "34359738368"
                      name: Tesla V100-SXM2-32GB
                      uuid: GPU-b20fc8ff-7249-af87-3c77-eb3f468e021e
                host: user-NF5468M5
                memory:
                    total: "404941676544"
                os: Linux-5.15.0-136-generic-x86_64-with-glibc2.10
                program: train_grokk.py
                python: CPython 3.8.10
                # The directory name carries a 17-46-11 time while `startedAt` is
                # 09:46:13Z; presumably the path uses local time (UTC+8) — TODO confirm.
                root: /data/home/xxx/research/code/dbViz/grokking/scripts/outputs/2026-01-14/17-46-11
                startedAt: "2026-01-14T09:46:13.192517Z"
                writerId: lth835r39ezfa8k9aer1jm62klihjamm
        m: []
        python_version: 3.8.10
        # NOTE(review): the numeric keys and list values under `t` look like
        # telemetry codes; their meaning is not evident from this file.
        t:
            "1":
                - 1
                - 50
                - 51
            "2":
                - 1
                - 50
                - 51
            "3":
                - 13
                - 16
            "4": 3.8.10
            "5": 0.24.0
            "12": 0.24.0
            "13": linux-x86_64
# Dataset selection. All three values match the `dataset=...`, `dataset.p=...`
# and `dataset.frac_train=...` overrides recorded in the run args.
dataset:
    value:
        # presumably the fraction of examples used for training — TODO confirm
        frac_train: 0.3
        name: mod_division_dataset
        # presumably the modulus for the modular-division task — TODO confirm
        p: 97
# Model selection and transformer hyperparameters.
model:
    value:
        # null: no checkpoint path is set for this run.
        checkpoint_path: null
        name: grokk_model
        strict_load: true
        # NOTE(review): the exact meaning of each field below is defined by the
        # model code, which is not visible here; names are taken at face value.
        transformer_config:
            attn_dim: 32
            block_repeats: 1
            dropout: 0.1
            heads: 4
            hidden_dim: 128
            intermediate_dim: 512
            max_length: 5
            num_blocks: 2
            pre_norm: true
# Settings for the `sharpness` section (enabled for this run).
sharpness:
    value:
        enabled: true
        # presumably "log at every N-th evaluation" — TODO confirm against the training script
        log_every_eval: 1
        # presumably a perturbation radius for the sharpness estimate — TODO confirm
        rho: 0.05
# Optimizer and training-loop settings.
train:
    value:
        # Two-element list; presumably Adam-style (beta1, beta2) — confirm in
        # the training script.
        betas:
            - 0.9
            - 0.98
        bsize: 512
        eval_batches: 8
        # Matches the `train.eval_every=100` override recorded in the run args.
        eval_every: 100
        lr: 0.001
        # Written with an explicit decimal point on purpose: YAML 1.1 loaders
        # such as PyYAML resolve a dot-less `1e+06` as the string "1e+06",
        # not the float 1000000.0. `1.0e+06` is a float under both 1.1 and 1.2.
        max_steps: 1.0e+06
        num_workers: 0
        seed: 0
        warmup_steps: 10
        weight_decay: 0
# Experiment-tracking settings: the on/off switch and the target project name.
wandb:
    value:
        use_wandb: true
        wandb_project: grokking_replica
# Settings for the `wd` section (enabled for this run).
# NOTE(review): what `wd` abbreviates is not evident from this file; it is a
# separate section from `train.weight_decay`.
wd:
    value:
        enabled: true
        log_every_eval: 1
        max_degree: 40
        # Matches the `wd.num_pairs=200` override recorded in the run args.
        num_pairs: 200
        # NOTE(review): presumably only relevant when `use_pca` is true, which
        # it is not for this run — TODO confirm.
        pca_k: 10
        resolution: 64
        # Matches the `wd.use_pca=false` override recorded in the run args.
        use_pca: false
