{
  "arc_challenge": {
    "alias": "arc_challenge",
    "acc,none": 0.41552901023890787,
    "acc_stderr,none": 0.014401366641216391,
    "acc_norm,none": 0.41723549488054607,
    "acc_norm_stderr,none": 0.01440982551840308
  },
  "arc_easy": {
    "alias": "arc_easy",
    "acc,none": 0.6035353535353535,
    "acc_stderr,none": 0.010037412763064526,
    "acc_norm,none": 0.4898989898989899,
    "acc_norm_stderr,none": 0.010257689687458356
  },
  "boolq": {
    "alias": "boolq",
    "acc,none": 0.7599388379204893,
    "acc_stderr,none": 0.007470382031100501
  },
  "gsm8k": {
    "alias": "gsm8k",
    "exact_match,strict-match": 0.0,
    "exact_match_stderr,strict-match": 0.0,
    "exact_match,flexible-extract": 0.645185746777862,
    "exact_match_stderr,flexible-extract": 0.01317908338797919
  },
  "hellaswag": {
    "alias": "hellaswag",
    "acc,none": 0.5284803823939455,
    "acc_stderr,none": 0.004981680090303709,
    "acc_norm,none": 0.6516630153355906,
    "acc_norm_stderr,none": 0.004754697013354969
  },
  "mmlu": {
    "acc,none": 0.6420737786640079,
    "acc_stderr,none": 0.0038401957183727973,
    "alias": "mmlu"
  },
  "mmlu_humanities": {
    "acc,none": 0.5674814027630181,
    "acc_stderr,none": 0.006848211483250616,
    "alias": " - humanities"
  },
  "mmlu_formal_logic": {
    "alias": "  - formal_logic",
    "acc,none": 0.5317460317460317,
    "acc_stderr,none": 0.04463112720677171
  },
  "mmlu_high_school_european_history": {
    "alias": "  - high_school_european_history",
    "acc,none": 0.7151515151515152,
    "acc_stderr,none": 0.03524390844511781
  },
  "mmlu_high_school_us_history": {
    "alias": "  - high_school_us_history",
    "acc,none": 0.7647058823529411,
    "acc_stderr,none": 0.029771775228145628
  },
  "mmlu_high_school_world_history": {
    "alias": "  - high_school_world_history",
    "acc,none": 0.7848101265822784,
    "acc_stderr,none": 0.026750826994676173
  },
  "mmlu_international_law": {
    "alias": "  - international_law",
    "acc,none": 0.7768595041322314,
    "acc_stderr,none": 0.03800754475228733
  },
  "mmlu_jurisprudence": {
    "alias": "  - jurisprudence",
    "acc,none": 0.7129629629629629,
    "acc_stderr,none": 0.043733130409147614
  },
  "mmlu_logical_fallacies": {
    "alias": "  - logical_fallacies",
    "acc,none": 0.7607361963190185,
    "acc_stderr,none": 0.033519538795212696
  },
  "mmlu_moral_disputes": {
    "alias": "  - moral_disputes",
    "acc,none": 0.6994219653179191,
    "acc_stderr,none": 0.02468531686725781
  },
  "mmlu_moral_scenarios": {
    "alias": "  - moral_scenarios",
    "acc,none": 0.3675977653631285,
    "acc_stderr,none": 0.016125543823552958
  },
  "mmlu_philosophy": {
    "alias": "  - philosophy",
    "acc,none": 0.662379421221865,
    "acc_stderr,none": 0.026858825879488554
  },
  "mmlu_prehistory": {
    "alias": "  - prehistory",
    "acc,none": 0.7561728395061729,
    "acc_stderr,none": 0.02389187954195961
  },
  "mmlu_professional_law": {
    "alias": "  - professional_law",
    "acc,none": 0.4530638852672751,
    "acc_stderr,none": 0.012713845972358985
  },
  "mmlu_world_religions": {
    "alias": "  - world_religions",
    "acc,none": 0.7660818713450293,
    "acc_stderr,none": 0.03246721765117826
  },
  "mmlu_other": {
    "acc,none": 0.6884454457676215,
    "acc_stderr,none": 0.008065108188581402,
    "alias": " - other"
  },
  "mmlu_business_ethics": {
    "alias": "  - business_ethics",
    "acc,none": 0.71,
    "acc_stderr,none": 0.045604802157206845
  },
  "mmlu_clinical_knowledge": {
    "alias": "  - clinical_knowledge",
    "acc,none": 0.6754716981132075,
    "acc_stderr,none": 0.028815615713432118
  },
  "mmlu_college_medicine": {
    "alias": "  - college_medicine",
    "acc,none": 0.630057803468208,
    "acc_stderr,none": 0.03681229633394319
  },
  "mmlu_global_facts": {
    "alias": "  - global_facts",
    "acc,none": 0.42,
    "acc_stderr,none": 0.04960449637488584
  },
  "mmlu_human_aging": {
    "alias": "  - human_aging",
    "acc,none": 0.6681614349775785,
    "acc_stderr,none": 0.031602951437766785
  },
  "mmlu_management": {
    "alias": "  - management",
    "acc,none": 0.7281553398058253,
    "acc_stderr,none": 0.044052680241409216
  },
  "mmlu_marketing": {
    "alias": "  - marketing",
    "acc,none": 0.9017094017094017,
    "acc_stderr,none": 0.019503444900757567
  },
  "mmlu_medical_genetics": {
    "alias": "  - medical_genetics",
    "acc,none": 0.74,
    "acc_stderr,none": 0.0440844002276808
  },
  "mmlu_miscellaneous": {
    "alias": "  - miscellaneous",
    "acc,none": 0.7701149425287356,
    "acc_stderr,none": 0.015046301846691805
  },
  "mmlu_nutrition": {
    "alias": "  - nutrition",
    "acc,none": 0.7091503267973857,
    "acc_stderr,none": 0.02600480036395213
  },
  "mmlu_professional_accounting": {
    "alias": "  - professional_accounting",
    "acc,none": 0.5035460992907801,
    "acc_stderr,none": 0.02982674915328092
  },
  "mmlu_professional_medicine": {
    "alias": "  - professional_medicine",
    "acc,none": 0.6875,
    "acc_stderr,none": 0.02815637344037142
  },
  "mmlu_virology": {
    "alias": "  - virology",
    "acc,none": 0.4819277108433735,
    "acc_stderr,none": 0.03889951252827216
  },
  "mmlu_social_sciences": {
    "acc,none": 0.7487812804679883,
    "acc_stderr,none": 0.007712009677159702,
    "alias": " - social sciences"
  },
  "mmlu_econometrics": {
    "alias": "  - econometrics",
    "acc,none": 0.5526315789473685,
    "acc_stderr,none": 0.046774730044912
  },
  "mmlu_high_school_geography": {
    "alias": "  - high_school_geography",
    "acc,none": 0.803030303030303,
    "acc_stderr,none": 0.028335609732463362
  },
  "mmlu_high_school_government_and_politics": {
    "alias": "  - high_school_government_and_politics",
    "acc,none": 0.8134715025906736,
    "acc_stderr,none": 0.02811209121011746
  },
  "mmlu_high_school_macroeconomics": {
    "alias": "  - high_school_macroeconomics",
    "acc,none": 0.6974358974358974,
    "acc_stderr,none": 0.023290888053772725
  },
  "mmlu_high_school_microeconomics": {
    "alias": "  - high_school_microeconomics",
    "acc,none": 0.819327731092437,
    "acc_stderr,none": 0.024991964966600742
  },
  "mmlu_high_school_psychology": {
    "alias": "  - high_school_psychology",
    "acc,none": 0.8330275229357799,
    "acc_stderr,none": 0.01599015488507337
  },
  "mmlu_human_sexuality": {
    "alias": "  - human_sexuality",
    "acc,none": 0.7099236641221374,
    "acc_stderr,none": 0.03980066246467766
  },
  "mmlu_professional_psychology": {
    "alias": "  - professional_psychology",
    "acc,none": 0.684640522875817,
    "acc_stderr,none": 0.01879808628488689
  },
  "mmlu_public_relations": {
    "alias": "  - public_relations",
    "acc,none": 0.6454545454545455,
    "acc_stderr,none": 0.04582004841505415
  },
  "mmlu_security_studies": {
    "alias": "  - security_studies",
    "acc,none": 0.7061224489795919,
    "acc_stderr,none": 0.02916273841024977
  },
  "mmlu_sociology": {
    "alias": "  - sociology",
    "acc,none": 0.8208955223880597,
    "acc_stderr,none": 0.027113286753111837
  },
  "mmlu_us_foreign_policy": {
    "alias": "  - us_foreign_policy",
    "acc,none": 0.83,
    "acc_stderr,none": 0.03775251680686371
  },
  "mmlu_stem": {
    "acc,none": 0.6035521725340945,
    "acc_stderr,none": 0.008261787871761215,
    "alias": " - stem"
  },
  "mmlu_abstract_algebra": {
    "alias": "  - abstract_algebra",
    "acc,none": 0.35,
    "acc_stderr,none": 0.04793724854411019
  },
  "mmlu_anatomy": {
    "alias": "  - anatomy",
    "acc,none": 0.6592592592592592,
    "acc_stderr,none": 0.040943762699967946
  },
  "mmlu_astronomy": {
    "alias": "  - astronomy",
    "acc,none": 0.7763157894736842,
    "acc_stderr,none": 0.03391160934343603
  },
  "mmlu_college_biology": {
    "alias": "  - college_biology",
    "acc,none": 0.8125,
    "acc_stderr,none": 0.032639560491693344
  },
  "mmlu_college_chemistry": {
    "alias": "  - college_chemistry",
    "acc,none": 0.43,
    "acc_stderr,none": 0.049756985195624284
  },
  "mmlu_college_computer_science": {
    "alias": "  - college_computer_science",
    "acc,none": 0.55,
    "acc_stderr,none": 0.049999999999999996
  },
  "mmlu_college_mathematics": {
    "alias": "  - college_mathematics",
    "acc,none": 0.39,
    "acc_stderr,none": 0.04902071300001974
  },
  "mmlu_college_physics": {
    "alias": "  - college_physics",
    "acc,none": 0.4117647058823529,
    "acc_stderr,none": 0.04897104952726368
  },
  "mmlu_computer_security": {
    "alias": "  - computer_security",
    "acc,none": 0.74,
    "acc_stderr,none": 0.04408440022768077
  },
  "mmlu_conceptual_physics": {
    "alias": "  - conceptual_physics",
    "acc,none": 0.7361702127659574,
    "acc_stderr,none": 0.028809989854102953
  },
  "mmlu_electrical_engineering": {
    "alias": "  - electrical_engineering",
    "acc,none": 0.6275862068965518,
    "acc_stderr,none": 0.04028731532947559
  },
  "mmlu_elementary_mathematics": {
    "alias": "  - elementary_mathematics",
    "acc,none": 0.5634920634920635,
    "acc_stderr,none": 0.02554284681740051
  },
  "mmlu_high_school_biology": {
    "alias": "  - high_school_biology",
    "acc,none": 0.8225806451612904,
    "acc_stderr,none": 0.02173254068932927
  },
  "mmlu_high_school_chemistry": {
    "alias": "  - high_school_chemistry",
    "acc,none": 0.6600985221674877,
    "acc_stderr,none": 0.033327690684107895
  },
  "mmlu_high_school_computer_science": {
    "alias": "  - high_school_computer_science",
    "acc,none": 0.81,
    "acc_stderr,none": 0.03942772444036623
  },
  "mmlu_high_school_mathematics": {
    "alias": "  - high_school_mathematics",
    "acc,none": 0.32222222222222224,
    "acc_stderr,none": 0.028493465091028604
  },
  "mmlu_high_school_physics": {
    "alias": "  - high_school_physics",
    "acc,none": 0.48344370860927155,
    "acc_stderr,none": 0.04080244185628972
  },
  "mmlu_high_school_statistics": {
    "alias": "  - high_school_statistics",
    "acc,none": 0.5509259259259259,
    "acc_stderr,none": 0.03392238405321617
  },
  "mmlu_machine_learning": {
    "alias": "  - machine_learning",
    "acc,none": 0.5803571428571429,
    "acc_stderr,none": 0.04684099321077106
  },
  "piqa": {
    "alias": "piqa",
    "acc,none": 0.7682263329706203,
    "acc_stderr,none": 0.009845143772794026,
    "acc_norm,none": 0.7562568008705114,
    "acc_norm_stderr,none": 0.01001719947150062
  },
  "social_iqa": {
    "alias": "social_iqa",
    "acc,none": 0.43244626407369496,
    "acc_stderr,none": 0.011210331273967561
  }
}