
                    gpt-4-0613 &  math\_word\_problem\_generation &                           69.6 &                           64.8 &                             -- &                             --  \\
                    gpt-4-0613 &  finegrained\_fact\_verification &                           48.5 &                           18.8 &                           33.2 &                             --  \\
                    gpt-4-0613 &    answerability\_classification &                           29.5 &                             -- &                             -- &                           16.7  \\
           Llama-2-70b-chat-hf &  math\_word\_problem\_generation &                           73.1 &                           69.3 &                             -- &                             --  \\
           Llama-2-70b-chat-hf &  finegrained\_fact\_verification &                           23.0 &                           28.9 &                           26.7 &                             --  \\
           Llama-2-70b-chat-hf &    answerability\_classification &                            1.2 &                             -- &                             -- &                            2.4  \\
