,ARC,HellaSwag,MMLU,TruthfulQA,Winogrande,GSM8K,hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions,IFEval,BBH,MATH Lvl 5,GPQA,MUSR,MMLU-PRO,bbh_boolean_expressions,bbh_causal_judgement,bbh_date_understanding,bbh_disambiguation_qa,bbh_formal_fallacies,bbh_geometric_shapes,bbh_hyperbaton,bbh_logical_deduction_five_objects,bbh_logical_deduction_seven_objects,bbh_logical_deduction_three_objects,bbh_movie_recommendation,bbh_navigate,bbh_object_counting,bbh_penguins_in_a_table,bbh_reasoning_about_colored_objects,bbh_ruin_names,bbh_salient_translation_error_detection,bbh_snarks,bbh_sports_understanding,bbh_temporal_sequences,bbh_tracking_shuffled_objects_five_objects,bbh_tracking_shuffled_objects_seven_objects,bbh_tracking_shuffled_objects_three_objects,bbh_web_of_lies,gpqa_diamond,gpqa_extended,gpqa_main,math_algebra_hard,math_counting_and_prob_hard,math_geometry_hard,math_intermediate_algebra_hard,math_num_theory_hard,math_prealgebra_hard,math_precalculus_hard,musr_murder_mysteries,musr_object_placements,musr_team_allocation
0,0.2048,0.2491432,0.2303,0.317364,0.47909999999999997,0.0,0.17,0.1439407407407407,0.1447368421052631,0.15,0.1924528301886792,0.1875,0.14,0.15,0.18,0.1676300578034682,0.1372549019607843,0.16,0.1872340425531914,0.17045614035087714,0.1724137931034483,0.1995555555555555,0.1269841269841269,0.14,0.1741935483870967,0.1477832512315271,0.15,0.1844363636363636,0.1616161616161616,0.1709844559585492,0.19341538461538457,0.2,0.180672268907563,0.1788079470198675,0.1871559633027523,0.1388888888888889,0.1911764705882352,0.1940928270042194,0.1076233183856502,0.16360305343511444,0.140495867768595,0.1759259259259259,0.1840490797546012,0.1517857142857142,0.145631067961165,0.1709401709401709,0.17,0.1992337164750957,0.1907514450867052,0.18293631284916195,0.2026143790849673,0.180064308681672,0.1975308641975308,0.19656737588652481,0.2235984354628422,0.1580882352941176,0.21149019607843134,0.1636363636363636,0.1428571428571428,0.1890547263681592,0.17,0.16525301204819273,0.175438596491228,0.06805,0.29,0.0,0.23115,0.33,0.11,0.4223,0.4154545454545454,0.12584,0.28138,0.44446,0.0,0.42506,0.15476,0.12092,0.30446,0.16938,0.41646,0.0,0.15304794520547943,0.08892,0.06,0.11338000000000001,0.3558707865168539,0.456,0.032,0.10892,0.092,0.23645999999999998,0.45076,0.19755050505050498,0.21639194139194132,0.20887276785714284,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.46846000000000004,0.204921875,0.23246
