
 gpt-4-0613 & math\_word\_problem\_generation & 51.8 & 54.2 & -- & -- \\
 gpt-4-0613 & finegrained\_fact\_verification & 77.2 & 65.6 & 66.8 & -- \\
 gpt-4-0613 & answerability\_classification & 78.6 & -- & -- & 80.8 \\
 Llama-2-70b-chat-hf & math\_word\_problem\_generation & 52.5 & 50.7 & -- & -- \\
 Llama-2-70b-chat-hf & finegrained\_fact\_verification & 79.5 & 77.1 & 74.7 & -- \\
 Llama-2-70b-chat-hf & answerability\_classification & 82.1 & -- & -- & 77.7 \\
