
                    gpt-4-0613 & math\_word\_problem\_generation &                           51.8 &                           54.2 &                             -- &                             --  \\
                    gpt-4-0613 & finegrained\_fact\_verification &                           77.2 &                           65.6 &                           66.8 &                             --  \\
                    gpt-4-0613 &   answerability\_classification &                           78.6 &                             -- &                             -- &                           80.8  \\
           Llama-2-70b-chat-hf & math\_word\_problem\_generation &                           52.5 &                           50.7 &                             -- &                             --  \\
           Llama-2-70b-chat-hf & finegrained\_fact\_verification &                           79.5 &                           77.1 &                           74.7 &                             --  \\
           Llama-2-70b-chat-hf &   answerability\_classification &                           82.1 &                             -- &                             -- &                           77.7  \\
