{
  "arc_challenge": {
    "alias": "arc_challenge",
    "acc,none": 0.4308873720136519,
    "acc_stderr,none": 0.014471133392642483,
    "acc_norm,none": 0.447098976109215,
    "acc_norm_stderr,none": 0.01452938016052684
  },
  "arc_easy": {
    "alias": "arc_easy",
    "acc,none": 0.6868686868686869,
    "acc_stderr,none": 0.00951630387930954,
    "acc_norm,none": 0.5643939393939394,
    "acc_norm_stderr,none": 0.010174341733665219
  },
  "boolq": {
    "alias": "boolq",
    "acc,none": 0.8131498470948012,
    "acc_stderr,none": 0.006817490804551452
  },
  "gsm8k": {
    "alias": "gsm8k",
    "exact_match,strict-match": 0.0,
    "exact_match_stderr,strict-match": 0.0,
    "exact_match,flexible-extract": 0.645185746777862,
    "exact_match_stderr,flexible-extract": 0.013179083387979192
  },
  "hellaswag": {
    "alias": "hellaswag",
    "acc,none": 0.5103565026887075,
    "acc_stderr,none": 0.004988710917169335,
    "acc_norm,none": 0.6462856004779924,
    "acc_norm_stderr,none": 0.004771447244095127
  },
  "mmlu": {
    "acc,none": 0.6660019940179461,
    "acc_stderr,none": 0.003767670416970162,
    "alias": "mmlu"
  },
  "mmlu_humanities": {
    "acc,none": 0.5725823591923486,
    "acc_stderr,none": 0.0067816181322413955,
    "alias": " - humanities"
  },
  "mmlu_formal_logic": {
    "alias": "  - formal_logic",
    "acc,none": 0.5238095238095238,
    "acc_stderr,none": 0.04467062628403273
  },
  "mmlu_high_school_european_history": {
    "alias": "  - high_school_european_history",
    "acc,none": 0.7272727272727273,
    "acc_stderr,none": 0.0347769116216366
  },
  "mmlu_high_school_us_history": {
    "alias": "  - high_school_us_history",
    "acc,none": 0.7696078431372549,
    "acc_stderr,none": 0.029554292605695053
  },
  "mmlu_high_school_world_history": {
    "alias": "  - high_school_world_history",
    "acc,none": 0.7848101265822784,
    "acc_stderr,none": 0.026750826994676177
  },
  "mmlu_international_law": {
    "alias": "  - international_law",
    "acc,none": 0.768595041322314,
    "acc_stderr,none": 0.0384985609879409
  },
  "mmlu_jurisprudence": {
    "alias": "  - jurisprudence",
    "acc,none": 0.7685185185185185,
    "acc_stderr,none": 0.04077494709252626
  },
  "mmlu_logical_fallacies": {
    "alias": "  - logical_fallacies",
    "acc,none": 0.8098159509202454,
    "acc_stderr,none": 0.030833491146281245
  },
  "mmlu_moral_disputes": {
    "alias": "  - moral_disputes",
    "acc,none": 0.6791907514450867,
    "acc_stderr,none": 0.025131000233647904
  },
  "mmlu_moral_scenarios": {
    "alias": "  - moral_scenarios",
    "acc,none": 0.3664804469273743,
    "acc_stderr,none": 0.01611523550486547
  },
  "mmlu_philosophy": {
    "alias": "  - philosophy",
    "acc,none": 0.7202572347266881,
    "acc_stderr,none": 0.025494259350694902
  },
  "mmlu_prehistory": {
    "alias": "  - prehistory",
    "acc,none": 0.7716049382716049,
    "acc_stderr,none": 0.023358211840626267
  },
  "mmlu_professional_law": {
    "alias": "  - professional_law",
    "acc,none": 0.4452411994784876,
    "acc_stderr,none": 0.012693421303973294
  },
  "mmlu_world_religions": {
    "alias": "  - world_religions",
    "acc,none": 0.8011695906432749,
    "acc_stderr,none": 0.030611116557432528
  },
  "mmlu_other": {
    "acc,none": 0.7061474090762794,
    "acc_stderr,none": 0.007898677944834398,
    "alias": " - other"
  },
  "mmlu_business_ethics": {
    "alias": "  - business_ethics",
    "acc,none": 0.72,
    "acc_stderr,none": 0.04512608598542129
  },
  "mmlu_clinical_knowledge": {
    "alias": "  - clinical_knowledge",
    "acc,none": 0.7245283018867924,
    "acc_stderr,none": 0.027495663683724057
  },
  "mmlu_college_medicine": {
    "alias": "  - college_medicine",
    "acc,none": 0.6473988439306358,
    "acc_stderr,none": 0.03643037168958548
  },
  "mmlu_global_facts": {
    "alias": "  - global_facts",
    "acc,none": 0.35,
    "acc_stderr,none": 0.047937248544110196
  },
  "mmlu_human_aging": {
    "alias": "  - human_aging",
    "acc,none": 0.6591928251121076,
    "acc_stderr,none": 0.03181149747055359
  },
  "mmlu_management": {
    "alias": "  - management",
    "acc,none": 0.7378640776699029,
    "acc_stderr,none": 0.04354631077260595
  },
  "mmlu_marketing": {
    "alias": "  - marketing",
    "acc,none": 0.8888888888888888,
    "acc_stderr,none": 0.020588491316092358
  },
  "mmlu_medical_genetics": {
    "alias": "  - medical_genetics",
    "acc,none": 0.75,
    "acc_stderr,none": 0.04351941398892446
  },
  "mmlu_miscellaneous": {
    "alias": "  - miscellaneous",
    "acc,none": 0.7931034482758621,
    "acc_stderr,none": 0.014485656041669195
  },
  "mmlu_nutrition": {
    "alias": "  - nutrition",
    "acc,none": 0.7189542483660131,
    "acc_stderr,none": 0.025738854797818716
  },
  "mmlu_professional_accounting": {
    "alias": "  - professional_accounting",
    "acc,none": 0.5283687943262412,
    "acc_stderr,none": 0.029779450957303062
  },
  "mmlu_professional_medicine": {
    "alias": "  - professional_medicine",
    "acc,none": 0.7536764705882353,
    "acc_stderr,none": 0.02617343857052
  },
  "mmlu_virology": {
    "alias": "  - virology",
    "acc,none": 0.4939759036144578,
    "acc_stderr,none": 0.03892212195333045
  },
  "mmlu_social_sciences": {
    "acc,none": 0.780630484237894,
    "acc_stderr,none": 0.0073633147283634764,
    "alias": " - social sciences"
  },
  "mmlu_econometrics": {
    "alias": "  - econometrics",
    "acc,none": 0.6052631578947368,
    "acc_stderr,none": 0.045981880578165414
  },
  "mmlu_high_school_geography": {
    "alias": "  - high_school_geography",
    "acc,none": 0.8181818181818182,
    "acc_stderr,none": 0.0274796030105388
  },
  "mmlu_high_school_government_and_politics": {
    "alias": "  - high_school_government_and_politics",
    "acc,none": 0.8549222797927462,
    "acc_stderr,none": 0.025416343096306426
  },
  "mmlu_high_school_macroeconomics": {
    "alias": "  - high_school_macroeconomics",
    "acc,none": 0.7512820512820513,
    "acc_stderr,none": 0.0219169577092138
  },
  "mmlu_high_school_microeconomics": {
    "alias": "  - high_school_microeconomics",
    "acc,none": 0.8487394957983193,
    "acc_stderr,none": 0.023274255898707942
  },
  "mmlu_high_school_psychology": {
    "alias": "  - high_school_psychology",
    "acc,none": 0.8697247706422019,
    "acc_stderr,none": 0.014431862852473247
  },
  "mmlu_human_sexuality": {
    "alias": "  - human_sexuality",
    "acc,none": 0.7099236641221374,
    "acc_stderr,none": 0.03980066246467766
  },
  "mmlu_professional_psychology": {
    "alias": "  - professional_psychology",
    "acc,none": 0.7075163398692811,
    "acc_stderr,none": 0.018403415710109804
  },
  "mmlu_public_relations": {
    "alias": "  - public_relations",
    "acc,none": 0.7,
    "acc_stderr,none": 0.04389311454644286
  },
  "mmlu_security_studies": {
    "alias": "  - security_studies",
    "acc,none": 0.7755102040816326,
    "acc_stderr,none": 0.02671143055553841
  },
  "mmlu_sociology": {
    "alias": "  - sociology",
    "acc,none": 0.8009950248756219,
    "acc_stderr,none": 0.028231365092758406
  },
  "mmlu_us_foreign_policy": {
    "alias": "  - us_foreign_policy",
    "acc,none": 0.83,
    "acc_stderr,none": 0.03775251680686371
  },
  "mmlu_stem": {
    "acc,none": 0.6539803361877576,
    "acc_stderr,none": 0.008180544241806063,
    "alias": " - stem"
  },
  "mmlu_abstract_algebra": {
    "alias": "  - abstract_algebra",
    "acc,none": 0.51,
    "acc_stderr,none": 0.05024183937956913
  },
  "mmlu_anatomy": {
    "alias": "  - anatomy",
    "acc,none": 0.6888888888888889,
    "acc_stderr,none": 0.03999262876617722
  },
  "mmlu_astronomy": {
    "alias": "  - astronomy",
    "acc,none": 0.7697368421052632,
    "acc_stderr,none": 0.034260594244031654
  },
  "mmlu_college_biology": {
    "alias": "  - college_biology",
    "acc,none": 0.8125,
    "acc_stderr,none": 0.032639560491693344
  },
  "mmlu_college_chemistry": {
    "alias": "  - college_chemistry",
    "acc,none": 0.5,
    "acc_stderr,none": 0.050251890762960605
  },
  "mmlu_college_computer_science": {
    "alias": "  - college_computer_science",
    "acc,none": 0.55,
    "acc_stderr,none": 0.04999999999999999
  },
  "mmlu_college_mathematics": {
    "alias": "  - college_mathematics",
    "acc,none": 0.46,
    "acc_stderr,none": 0.05009082659620333
  },
  "mmlu_college_physics": {
    "alias": "  - college_physics",
    "acc,none": 0.5196078431372549,
    "acc_stderr,none": 0.04971358884367406
  },
  "mmlu_computer_security": {
    "alias": "  - computer_security",
    "acc,none": 0.76,
    "acc_stderr,none": 0.042923469599092816
  },
  "mmlu_conceptual_physics": {
    "alias": "  - conceptual_physics",
    "acc,none": 0.7659574468085106,
    "acc_stderr,none": 0.027678452578212387
  },
  "mmlu_electrical_engineering": {
    "alias": "  - electrical_engineering",
    "acc,none": 0.7103448275862069,
    "acc_stderr,none": 0.03780019230438015
  },
  "mmlu_elementary_mathematics": {
    "alias": "  - elementary_mathematics",
    "acc,none": 0.6164021164021164,
    "acc_stderr,none": 0.025043757318520193
  },
  "mmlu_high_school_biology": {
    "alias": "  - high_school_biology",
    "acc,none": 0.8580645161290322,
    "acc_stderr,none": 0.01985300367655978
  },
  "mmlu_high_school_chemistry": {
    "alias": "  - high_school_chemistry",
    "acc,none": 0.6699507389162561,
    "acc_stderr,none": 0.033085304262282574
  },
  "mmlu_high_school_computer_science": {
    "alias": "  - high_school_computer_science",
    "acc,none": 0.81,
    "acc_stderr,none": 0.039427724440366234
  },
  "mmlu_high_school_mathematics": {
    "alias": "  - high_school_mathematics",
    "acc,none": 0.4444444444444444,
    "acc_stderr,none": 0.030296771286067323
  },
  "mmlu_high_school_physics": {
    "alias": "  - high_school_physics",
    "acc,none": 0.5298013245033113,
    "acc_stderr,none": 0.040752249922169775
  },
  "mmlu_high_school_statistics": {
    "alias": "  - high_school_statistics",
    "acc,none": 0.6481481481481481,
    "acc_stderr,none": 0.032568505702936484
  },
  "mmlu_machine_learning": {
    "alias": "  - machine_learning",
    "acc,none": 0.5803571428571429,
    "acc_stderr,none": 0.04684099321077106
  },
  "piqa": {
    "alias": "piqa",
    "acc,none": 0.7546245919477693,
    "acc_stderr,none": 0.010039831320422396,
    "acc_norm,none": 0.7573449401523396,
    "acc_norm_stderr,none": 0.010002002569708707
  },
  "social_iqa": {
    "alias": "social_iqa",
    "acc,none": 0.44933469805527126,
    "acc_stderr,none": 0.011255834856639554
  }
}