# Each entry maps a task name to a two-item list; presumably
# [metric name, filter name] -- confirm against the code that loads this file.
# The filter is spelled with a hyphen ('flexible-extract') to match the other
# entries in this file that use the same filter.
gsm8k_llama:
  - 'exact_match'
  - 'flexible-extract'
# Two-item list for the GPQA (main, zero-shot CoT) task; presumably
# [metric name, filter name] -- confirm against the consumer of this file.
gpqa_main_cot_zeroshot:
  - 'exact_match'
  - 'flexible-extract'
# Two-item list for the minerva_math task; presumably [metric name, filter name]
# -- confirm against the consumer of this file.
# The second item is the literal string 'none' (quoted so it cannot be mistaken
# for a YAML null by a reader).
minerva_math:
  - 'math_verify'
  - 'none'
# MMLU-family entries. Both share the same two-item list; presumably
# [metric name, filter name] -- confirm against the consumer of this file.
# NOTE(review): 'strict_match' uses an underscore while other entries in this
# file use the hyphenated 'flexible-extract'; presumably each mirrors the name
# defined by its task -- verify before normalizing.
mmlu_cot_llama:
  - 'exact_match'
  - 'strict_match'
mmlu_pro_llama:
  - 'exact_match'
  - 'strict_match'
# Two-item list for the BBH (zero-shot CoT) task; presumably
# [metric name, filter name] -- confirm against the consumer of this file.
bbh_cot_zeroshot:
  - 'exact_match'
  - 'flexible-extract'
# '*_llm_as_a_judge' entries. All four share the same two-item list
# ('acc', 'strict_match'); presumably [metric name, filter name] -- confirm
# against the consumer of this file.
mmlu_cot_llama_llm_as_a_judge:
  - 'acc'
  - 'strict_match'
mmlu_pro_llama_llm_as_a_judge:
  - 'acc'
  - 'strict_match'
bbh_cot_zeroshot_llm_as_a_judge:
  - 'acc'
  - 'strict_match'
gpqa_main_cot_zeroshot_llm_as_a_judge:
  - 'acc'
  - 'strict_match'
# Code-generation entries; each maps to a two-item list, presumably
# [metric name, filter name] -- confirm against the consumer of this file.
# NOTE(review): the first items are spelled differently ('pass@1' vs
# 'pass_at_1'); presumably each mirrors the metric name reported by its own
# task -- verify before normalizing them.
humaneval_instruct:
  - 'pass@1'
  - 'create_test'
mbpp_instruct:
  - 'pass_at_1'
  - 'extract_code'