
                    gpt-4-0613 & math\_word\_problem\_generation &                           37.5 &                           37.5 &                             -- &                             --  \\
                    gpt-4-0613 & finegrained\_fact\_verification &                           44.1 &                           53.1 &                           41.8 &                             --  \\
                    gpt-4-0613 &   answerability\_classification &                           41.1 &                             -- &                             -- &                           27.5  \\
           Llama-2-70b-chat-hf & math\_word\_problem\_generation &                           49.7 &                           47.8 &                             -- &                             --  \\
           Llama-2-70b-chat-hf & finegrained\_fact\_verification &                           27.8 &                           26.4 &                           22.9 &                             --  \\
           Llama-2-70b-chat-hf &   answerability\_classification &                           20.4 &                             -- &                             -- &                           19.5  \\
