leaderboard|arc:challenge|25|0
leaderboard|truthfulqa:mc|0|0
leaderboard|hellaswag|10|0
leaderboard|mmlu:college_chemistry|5|0
leaderboard|mmlu:us_foreign_policy|5|0
lighteval|agieval:aqua-rat|0|0
lighteval|agieval:logiqa-en|0|0
lighteval|agieval:lsat-ar|0|0
lighteval|agieval:lsat-lr|0|0
lighteval|agieval:lsat-rc|0|0
lighteval|agieval:sat-en-without-passage|0|0
lighteval|agieval:sat-en|0|0
lighteval|bigbench:causal_judgment|3|0
lighteval|bigbench:date_understanding|3|0
lighteval|bigbench:disambiguation_qa|3|0
lighteval|bigbench:geometric_shapes|3|0
lighteval|bigbench:logical_deduction_five_objects|3|0
lighteval|bigbench:logical_deduction_seven_objects|3|0
lighteval|bigbench:movie_recommendation|3|0
lighteval|bigbench:navigate|3|0
lighteval|bigbench:ruin_names|3|0
lighteval|bigbench:salient_translation_error_detection|3|0
lighteval|bigbench:snarks|3|0
lighteval|bigbench:temporal_sequences|3|0
lighteval|bigbench:tracking_shuffled_objects_five_objects|3|0
lighteval|bigbench:tracking_shuffled_objects_seven_objects|3|0
test|gsm8k|0|1
