# Task name -> selector string, one entry per evaluation task.
# Each value is two comma-separated tokens; presumably "<metric>,<filter>"
# as used for result keys by the evaluation harness -- TODO confirm against
# the code that reads this file.

# Generative / chain-of-thought tasks.
# NOTE(review): the entry below spells its second token `flexible_extract`
# (underscore) while the gpqa and bbh entries use `flexible-extract`
# (hyphen). Verify against the gsm8k_llama task definition before changing.
gsm8k_llama: 'exact_match,flexible_extract'
gpqa_main_cot_zeroshot: 'exact_match,flexible-extract'
minerva_math: 'math_verify,none'
mmlu_cot_llama: 'exact_match,strict_match'
mmlu_pro_llama: 'exact_match,strict_match'
bbh_cot_zeroshot: 'exact_match,flexible-extract'

# `*_llm_as_a_judge` variants; all four use `acc` with `strict_match`.
mmlu_cot_llama_llm_as_a_judge: 'acc,strict_match'
mmlu_pro_llama_llm_as_a_judge: 'acc,strict_match'
bbh_cot_zeroshot_llm_as_a_judge: 'acc,strict_match'
gpqa_main_cot_zeroshot_llm_as_a_judge: 'acc,strict_match'

# Code tasks. The first token is spelled differently per task
# (`pass@1` vs `pass_at_1`); both are kept exactly as given.
humaneval_instruct: 'pass@1,create_test'
mbpp_instruct: 'pass_at_1,extract_code'