                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                               &     \scalebox{0.9}[1]{MathGen} &                            6.6 & \cellcolor[RGB]{211,211,211}{-26.7} & \cellcolor[RGB]{211,211,211}{-0.3} &                            3.2 & \cellcolor[RGB]{211,211,211}{-4.7} &                           13.7 & \cellcolor[RGB]{211,211,211}{-0.3} & \cellcolor[RGB]{211,211,211}{-21.9} &                            9.3 &                            2.5 &                            0.2  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                            5.0 & \cellcolor[RGB]{211,211,211}{-3.3} & \cellcolor[RGB]{211,211,211}{-0.3} &                            6.9 & \cellcolor[RGB]{211,211,211}{-8.3} &                           13.2 & \cellcolor[RGB]{211,211,211}{-8.4} & \cellcolor[RGB]{211,211,211}{-15.0} & \cellcolor[RGB]{211,211,211}{-30.8} & \cellcolor[RGB]{211,211,211}{-4.4} &                            7.9  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                            3.3 & \cellcolor[RGB]{211,211,211}{-2.8} &                            2.3 & \cellcolor[RGB]{211,211,211}{-2.3} & \cellcolor[RGB]{211,211,211}{-7.1} & \cellcolor[RGB]{211,211,211}{-1.7} & \cellcolor[RGB]{211,211,211}{-9.9} & \cellcolor[RGB]{211,211,211}{-31.2} & \cellcolor[RGB]{211,211,211}{-5.5} & \cellcolor[RGB]{211,211,211}{-6.0} &                            6.5  \\
                               &     \scalebox{0.9}[1]{MathGen} &                           17.3 & \cellcolor[RGB]{211,211,211}{-31.5} &                            0.7 &                            3.6 & \cellcolor[RGB]{211,211,211}{-5.2} &                            1.9 & \cellcolor[RGB]{211,211,211}{-4.0} & \cellcolor[RGB]{211,211,211}{-22.4} &                            5.2 & \cellcolor[RGB]{211,211,211}{-1.4} & \cellcolor[RGB]{211,211,211}{-0.2}  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                            0.6 & \cellcolor[RGB]{211,211,211}{-3.1} &                            0.4 &                           12.9 & \cellcolor[RGB]{211,211,211}{-54.2} &                            9.9 & \cellcolor[RGB]{211,211,211}{-37.9} & \cellcolor[RGB]{211,211,211}{-18.5} &                            6.2 & \cellcolor[RGB]{211,211,211}{-7.0} & \cellcolor[RGB]{211,211,211}{-6.6}  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                            1.1 & \cellcolor[RGB]{211,211,211}{-8.9} &                            5.2 & \cellcolor[RGB]{211,211,211}{-21.0} & \cellcolor[RGB]{211,211,211}{-42.2} &                           14.4 & \cellcolor[RGB]{211,211,211}{-5.8} & \cellcolor[RGB]{211,211,211}{-17.8} & \cellcolor[RGB]{211,211,211}{-7.2} & \cellcolor[RGB]{211,211,211}{-4.6} & \cellcolor[RGB]{211,211,211}{-1.2}  \\
