
                    gpt-4-0613 &  math\_word\_problem\_generation &                           57.1 &                           51.1 &                             -- &                             --  \\
                    gpt-4-0613 &  finegrained\_fact\_verification &                            9.6 &                            3.1 &                            7.7 &                             --  \\
                    gpt-4-0613 &    answerability\_classification &                            9.8 &                             -- &                             -- &                           12.9  \\
           Llama-2-70b-chat-hf &  math\_word\_problem\_generation &                           91.5 &                           82.4 &                             -- &                             --  \\
           Llama-2-70b-chat-hf &  finegrained\_fact\_verification &                           22.5 &                           29.6 &                           34.9 &                             --  \\
           Llama-2-70b-chat-hf &    answerability\_classification &                           63.7 &                             -- &                             -- &                           41.5  \\
