
 gpt-4-0613 & math\_word\_problem\_generation & 57.1 & 51.1 & -- & -- \\
 gpt-4-0613 & finegrained\_fact\_verification & 9.6 & 3.1 & 7.7 & -- \\
 gpt-4-0613 & answerability\_classification & 9.8 & -- & -- & 12.9 \\
 Llama-2-70b-chat-hf & math\_word\_problem\_generation & 91.5 & 82.4 & -- & -- \\
 Llama-2-70b-chat-hf & finegrained\_fact\_verification & 22.5 & 29.6 & 34.9 & -- \\
 Llama-2-70b-chat-hf & answerability\_classification & 63.7 & -- & -- & 41.5 \\
