{
  "results": {
    "gsm8k": {
      "alias": "gsm8k",
      "exact_match,strict-match": 0.7012888551933283,
      "exact_match_stderr,strict-match": 0.012607137125693625,
      "exact_match,flexible-extract": 0.7862016679302501,
      "exact_match_stderr,flexible-extract": 0.011293054698635034
    }
  },
  "group_subtasks": {
    "gsm8k": []
  },
  "configs": {
    "gsm8k": {
      "task": "gsm8k",
      "tag": [
        "math_word_problems"
      ],
      "dataset_path": "gsm8k",
      "dataset_name": "main",
      "training_split": "train",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Question: {{question}}\nAnswer:",
      "doc_to_target": "{{answer}}",
      "unsafe_code": false,
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 5,
      "metric_list": [
        {
          "metric": "exact_match",
          "aggregation": "mean",
          "higher_is_better": true,
          "ignore_case": true,
          "ignore_punctuation": false,
          "regexes_to_ignore": [
            ",",
            "\\$",
            "(?s).*#### ",
            "\\.$"
          ]
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "Question:",
          "</s>",
          "<|im_end|>"
        ],
        "do_sample": false,
        "temperature": 0.0
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "strict-match",
          "filter": [
            {
              "function": "regex",
              "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
            },
            {
              "function": "take_first"
            }
          ]
        },
        {
          "name": "flexible-extract",
          "filter": [
            {
              "function": "regex",
              "group_select": -1,
              "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 3.0,
        "model_path": "GSAI-ML/LLaDA-8B-Instruct",
        "gen_length": 1024,
        "steps": 1024,
        "block_length": 8,
        "remasking": "bst"
      }
    }
  },
  "versions": {
    "gsm8k": 3.0
  },
  "n-shot": {
    "gsm8k": 5
  },
  "higher_is_better": {
    "gsm8k": {
      "exact_match": true
    }
  },
  "n-samples": {
    "gsm8k": {
      "original": 1319,
      "effective": 1319
    }
  },
  "config": {
    "model": "llada_dist",
    "model_args": "model_path=GSAI-ML/LLaDA-8B-Instruct,gen_length=1024,steps=1024,block_length=8,remasking=bst",
    "batch_size": 1,
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null,
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "9b919e8",
  "date": 1754402634.276131,
  "pretty_env_info": "PyTorch version: 2.4.0\nIs debug build: False\nCUDA used to build PyTorch: 12.1\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.4 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: Could not collect\nLibc version: glibc-2.35\n\nPython version: 3.12.9 | packaged by conda-forge | (main, Mar  4 2025, 22:48:41) [GCC 13.3.0] (64-bit runtime)\nPython platform: Linux-5.15.0-210.163.7.el8uek.x86_64-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: Could not collect\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: GPU 0: NVIDIA A100-SXM4-80GB\nNvidia driver version: 560.35.03\ncuDNN version: Probably one of the following:\n/usr/lib/x86_64-linux-gnu/libcudnn.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.9.0\n/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.9.0\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture:                         x86_64\nCPU op-mode(s):                       32-bit, 64-bit\nAddress sizes:                        48 bits physical, 48 bits virtual\nByte Order:                           Little Endian\nCPU(s):                               256\nOn-line CPU(s) list:                  0-254\nOff-line CPU(s) list:                 255\nVendor ID:                            AuthenticAMD\nModel name:                           AMD EPYC 7J13 64-Core Processor\nCPU family:                           25\nModel:                                1\nThread(s) per core:                   2\nCore(s) per socket:                   64\nSocket(s):                            2\nStepping:                             1\nFrequency boost:                      enabled\nCPU max MHz:                          3673.0950\nCPU min MHz:                          0.0000\nBogoMIPS:                             4900.16\nFlags:                                fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 invpcid_single hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local nt_good clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin brs arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip pku ospke vaes vpclmulqdq rdpid overflow_recov succor smca fsrm\nVirtualization:                       AMD-V\nL1d cache:                            4 MiB (128 instances)\nL1i cache:                            4 MiB (128 instances)\nL2 cache:                             64 MiB (128 instances)\nL3 cache:                             512 MiB (16 instances)\nNUMA node(s):                         8\nNUMA node0 CPU(s):                    0-15,128-143\nNUMA node1 CPU(s):                    16-31,144-159\nNUMA node2 CPU(s):                    32-47,160-175\nNUMA node3 CPU(s):                    48-63,176-191\nNUMA node4 CPU(s):                    64-79,192-207\nNUMA node5 CPU(s):                    80-95,208-223\nNUMA node6 CPU(s):                    96-111,224-239\nNUMA node7 CPU(s):                    112-127,240-254\nVulnerability Gather data sampling:   Not affected\nVulnerability Itlb multihit:          Not affected\nVulnerability L1tf:                   Not affected\nVulnerability Mds:                    Not affected\nVulnerability Meltdown:               Not affected\nVulnerability Mmio stale data:        Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed:               Not affected\nVulnerability Spec rstack overflow:   Mitigation; safe RET\nVulnerability Spec store bypass:      Mitigation; Speculative Store Bypass disabled via prctl\nVulnerability Spectre v1:             Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2:             Mitigation; Retpolines; IBPB conditional; IBRS_FW; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds:                  Not affected\nVulnerability Tsx async abort:        Not affected\n\nVersions of relevant libraries:\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==2.0.2\n[pip3] torch==2.4.0\n[pip3] torchaudio==2.4.0\n[pip3] torchvision==0.19.0\n[pip3] triton==3.0.0\n[conda] blas                      2.116                       mkl    conda-forge\n[conda] blas-devel                3.9.0            16_linux64_mkl    conda-forge\n[conda] libblas                   3.9.0            16_linux64_mkl    conda-forge\n[conda] libcblas                  3.9.0            16_linux64_mkl    conda-forge\n[conda] liblapack                 3.9.0            16_linux64_mkl    conda-forge\n[conda] liblapacke                3.9.0            16_linux64_mkl    conda-forge\n[conda] libopenvino-pytorch-frontend 2025.0.0             h5888daf_1    conda-forge\n[conda] mkl                       2022.1.0           h84fe81f_915    conda-forge\n[conda] mkl-devel                 2022.1.0           ha770c72_916    conda-forge\n[conda] mkl-include               2022.1.0           h84fe81f_915    conda-forge\n[conda] numpy                     2.0.2           py312h58c1407_1    conda-forge\n[conda] pytorch                   2.4.0           py3.12_cuda12.1_cudnn9.1.0_0    pytorch\n[conda] pytorch-cuda              12.1                 ha16c6d3_6    pytorch\n[conda] pytorch-mutex             1.0                        cuda    pytorch\n[conda] torchaudio                2.4.0               py312_cu121    pytorch\n[conda] torchtriton               3.0.0                     py312    pytorch\n[conda] torchvision               0.19.0              py312_cu121    pytorch",
  "transformers_version": "4.54.1",
  "lm_eval_version": "0.4.9.1",
  "upper_git_hash": null,
  "tokenizer_pad_token": [
    "<|endoftext|>",
    "126081"
  ],
  "tokenizer_eos_token": [
    "<|endoftext|>",
    "126081"
  ],
  "tokenizer_bos_token": [
    "<|startoftext|>",
    "126080"
  ],
  "eot_token_id": null,
  "max_length": 4096,
  "task_hashes": {},
  "model_source": "llada_dist",
  "model_name": "GSAI-ML/LLaDA-8B-Instruct",
  "model_name_sanitized": "GSAI-ML__LLaDA-8B-Instruct",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 5611026.857887003,
  "end_time": 5656432.83792529,
  "total_evaluation_time_seconds": "45405.98003828712"
}