
 gpt-4-0613 & math\_word\_problem\_generation & 42.9 & 37.5 & -- & -- \\
 gpt-4-0613 & finegrained\_fact\_verification & 52.2 & 34.4 & 33.2 & -- \\
 gpt-4-0613 & answerability\_classification & 30.4 & -- & -- & 27.9 \\
 Llama-2-70b-chat-hf & math\_word\_problem\_generation & 75.3 & 78.5 & -- & -- \\
 Llama-2-70b-chat-hf & finegrained\_fact\_verification & 34.8 & 34.5 & 31.5 & -- \\
 Llama-2-70b-chat-hf & answerability\_classification & 16.7 & -- & -- & 14.0 \\
