
                    gpt-4-0613 & math\_word\_problem\_generation &                           42.9 &                           37.5 &                             -- &                             --  \\
                    gpt-4-0613 & finegrained\_fact\_verification &                           52.2 &                           34.4 &                           33.2 &                             --  \\
                    gpt-4-0613 &   answerability\_classification &                           30.4 &                             -- &                             -- &                           27.9  \\
           Llama-2-70b-chat-hf & math\_word\_problem\_generation &                           75.3 &                           78.5 &                             -- &                             --  \\
           Llama-2-70b-chat-hf & finegrained\_fact\_verification &                           34.8 &                           34.5 &                           31.5 &                             --  \\
           Llama-2-70b-chat-hf &   answerability\_classification &                           16.7 &                             -- &                             -- &                           14.0  \\
