
 gpt-4-0613 & math\_word\_problem\_generation & 47.3 & 49.6 & -- & -- \\
 gpt-4-0613 & finegrained\_fact\_verification & 71.3 & 53.1 & 57.7 & -- \\
 gpt-4-0613 & answerability\_classification & 63.4 & -- & -- & 52.1 \\
 Llama-2-70b-chat-hf & math\_word\_problem\_generation & 50.0 & 49.3 & -- & -- \\
 Llama-2-70b-chat-hf & finegrained\_fact\_verification & 64.0 & 63.4 & 58.6 & -- \\
 Llama-2-70b-chat-hf & answerability\_classification & 47.1 & -- & -- & 43.3 \\
