
 gpt-4-0613 & math\_word\_problem\_generation & 51.8 & 50.4 & -- & -- \\
 gpt-4-0613 & finegrained\_fact\_verification & 68.4 & 53.1 & 59.1 & -- \\
 gpt-4-0613 & answerability\_classification & 62.5 & -- & -- & 56.7 \\
 Llama-2-70b-chat-hf & math\_word\_problem\_generation & 49.7 & 50.5 & -- & -- \\
 Llama-2-70b-chat-hf & finegrained\_fact\_verification & 66.6 & 63.0 & 56.8 & -- \\
 Llama-2-70b-chat-hf & answerability\_classification & 22.9 & -- & -- & 24.7 \\
