
                    gpt-4-0613 & math\_word\_problem\_generation &                           40.2 &                           39.4 &                             -- &                             --  \\
                    gpt-4-0613 & finegrained\_fact\_verification &                           41.2 &                           50.0 &                           39.4 &                             --  \\
                    gpt-4-0613 &   answerability\_classification &                           37.5 &                             -- &                             -- &                           31.2  \\
           Llama-2-70b-chat-hf & math\_word\_problem\_generation &                           49.1 &                           46.5 &                             -- &                             --  \\
           Llama-2-70b-chat-hf & finegrained\_fact\_verification &                           32.3 &                           33.1 &                           30.8 &                             --  \\
           Llama-2-70b-chat-hf &   answerability\_classification &                            7.5 &                             -- &                             -- &                           10.4  \\
