                 % Table body: header row followed by one row per dataset. Columns are
                 % "initial model", "dataset", eleven model columns, then the "random" and
                 % "human" reference columns.
                 %
                 % Formatting conventions visible in the rows below:
                 %   - \textbf marks the highest value among the model columns of a row.
                 %   - \cellcolor[RGB]{211,211,211} (light gray) shades model cells whose value is
                 %     strictly below that row's "random" value. Bold cells are never shaded,
                 %     even when they fall below "random" (e.g. 49.6 vs 55.7 in the first
                 %     FgFactV row).
                 %   - Dataset names are horizontally compressed with \scalebox{0.9}[1].
                 %
                 % Requires graphicx (\scalebox) and colortbl, or xcolor with the [table]
                 % option (\cellcolor).
                 %
                 % NOTE(review): the "initial model" column is empty in every row, and the
                 % MathGen/FgFactV/AnsCls triple appears twice -- presumably one group per
                 % initial model, with the group label lost (e.g. a dropped \multirow).
                 % TODO confirm against the generating script and restore the labels.
                 % NOTE(review): values look like percentages (0.0-100.0) -- confirm the metric.
                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                 % --- First dataset group (initial-model label missing; see NOTE above) ---
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{7.4} & \cellcolor[RGB]{211,211,211}{29.4} & \cellcolor[RGB]{211,211,211}{5.1} & \cellcolor[RGB]{211,211,211}{26.1} & \cellcolor[RGB]{211,211,211}{22.0} & \cellcolor[RGB]{211,211,211}{12.2} & \cellcolor[RGB]{211,211,211}{38.8} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{36.2} &                           55.0 &                  \textbf{62.8} &                           55.0 &                           90.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                  \textbf{49.6} & \cellcolor[RGB]{211,211,211}{45.7} & \cellcolor[RGB]{211,211,211}{48.6} & \cellcolor[RGB]{211,211,211}{10.6} & \cellcolor[RGB]{211,211,211}{33.3} & \cellcolor[RGB]{211,211,211}{7.3} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{27.1} & \cellcolor[RGB]{211,211,211}{33.0} & \cellcolor[RGB]{211,211,211}{9.8} & \cellcolor[RGB]{211,211,211}{7.3} &                           55.7 &                           95.5  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{43.2} &                  \textbf{55.6} & \cellcolor[RGB]{211,211,211}{54.0} & \cellcolor[RGB]{211,211,211}{24.2} & \cellcolor[RGB]{211,211,211}{26.5} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{2.4} & \cellcolor[RGB]{211,211,211}{19.6} & \cellcolor[RGB]{211,211,211}{29.4} & \cellcolor[RGB]{211,211,211}{15.7} & \cellcolor[RGB]{211,211,211}{23.7} &                           57.9 &                           85.7  \\
                 % --- Second dataset group (initial-model label missing; see NOTE above) ---
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{4.8} & \cellcolor[RGB]{211,211,211}{35.9} & \cellcolor[RGB]{211,211,211}{8.0} & \cellcolor[RGB]{211,211,211}{28.4} & \cellcolor[RGB]{211,211,211}{12.5} & \cellcolor[RGB]{211,211,211}{29.8} & \cellcolor[RGB]{211,211,211}{48.2} & \cellcolor[RGB]{211,211,211}{12.5} &                           75.4 &                           83.9 &                  \textbf{88.2} &                           75.0 &                           98.3  \\
                               &     \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{42.8} &                  \textbf{69.1} & \cellcolor[RGB]{211,211,211}{68.2} & \cellcolor[RGB]{211,211,211}{6.2} & \cellcolor[RGB]{211,211,211}{23.1} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{1.6} & \cellcolor[RGB]{211,211,211}{14.6} & \cellcolor[RGB]{211,211,211}{16.1} & \cellcolor[RGB]{211,211,211}{18.2} & \cellcolor[RGB]{211,211,211}{56.5} &                           78.8 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{11.9} &                  \textbf{80.2} & \cellcolor[RGB]{211,211,211}{3.1} & \cellcolor[RGB]{211,211,211}{3.2} & \cellcolor[RGB]{211,211,211}{3.1} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{3.1} & \cellcolor[RGB]{211,211,211}{14.9} & \cellcolor[RGB]{211,211,211}{65.2} & \cellcolor[RGB]{211,211,211}{71.1} &                           77.5 &                           98.3  \\
