
                    gpt-4-0613 & math\_word\_problem\_generation &                           78.6 &                           57.2 &                             -- &                             --  \\
                    gpt-4-0613 & finegrained\_fact\_verification &                           14.0 &                           18.8 &                           11.5 &                             --  \\
                    gpt-4-0613 &   answerability\_classification &                           10.7 &                             -- &                             -- &                           14.2  \\
           Llama-2-70b-chat-hf & math\_word\_problem\_generation &                           96.2 &                           88.1 &                             -- &                             --  \\
           Llama-2-70b-chat-hf & finegrained\_fact\_verification &                           55.6 &                           53.2 &                           64.4 &                             --  \\
           Llama-2-70b-chat-hf &   answerability\_classification &                           79.2 &                             -- &                             -- &                           59.1  \\
