
 gpt-4-0613 & math\_word\_problem\_generation & 40.2 & 39.4 & -- & -- \\
 gpt-4-0613 & finegrained\_fact\_verification & 41.2 & 50.0 & 39.4 & -- \\
 gpt-4-0613 & answerability\_classification & 37.5 & -- & -- & 31.2 \\
 Llama-2-70b-chat-hf & math\_word\_problem\_generation & 49.1 & 46.5 & -- & -- \\
 Llama-2-70b-chat-hf & finegrained\_fact\_verification & 32.3 & 33.1 & 30.8 & -- \\
 Llama-2-70b-chat-hf & answerability\_classification & 7.5 & -- & -- & 10.4 \\
