                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                               &     \scalebox{0.9}[1]{MathGen} &                           80.5 &                           85.7 &                 \textbf{100.0} & \cellcolor[RGB]{211,211,211}{32.5} & \cellcolor[RGB]{211,211,211}{19.5} & \cellcolor[RGB]{211,211,211}{26.0} &                           57.1 &                           74.0 &                           58.4 & \cellcolor[RGB]{211,211,211}{49.4} &                           64.9 &                           55.0 &                           81.8  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           67.9 &                           92.3 &                  \textbf{98.7} & \cellcolor[RGB]{211,211,211}{20.5} & \cellcolor[RGB]{211,211,211}{51.3} & \cellcolor[RGB]{211,211,211}{11.5} & \cellcolor[RGB]{211,211,211}{38.5} &                           57.7 &                           59.0 & \cellcolor[RGB]{211,211,211}{5.1} & \cellcolor[RGB]{211,211,211}{14.1} &                           55.7 &                           95.5  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                           65.4 &                  \textbf{98.8} &                           91.4 & \cellcolor[RGB]{211,211,211}{27.2} & \cellcolor[RGB]{211,211,211}{37.0} & \cellcolor[RGB]{211,211,211}{16.0} & \cellcolor[RGB]{211,211,211}{16.0} & \cellcolor[RGB]{211,211,211}{45.7} & \cellcolor[RGB]{211,211,211}{34.6} & \cellcolor[RGB]{211,211,211}{11.1} & \cellcolor[RGB]{211,211,211}{12.3} &                           57.9 &                           78.3  \\
                               &     \scalebox{0.9}[1]{MathGen} &                           89.2 &                           82.5 &                  \textbf{99.2} & \cellcolor[RGB]{211,211,211}{43.3} & \cellcolor[RGB]{211,211,211}{25.8} & \cellcolor[RGB]{211,211,211}{48.3} & \cellcolor[RGB]{211,211,211}{65.0} & \cellcolor[RGB]{211,211,211}{74.2} &                           87.5 &                           83.3 &                           88.3 &                           75.0 &                           96.7  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           80.2 &                           89.7 &                  \textbf{92.1} & \cellcolor[RGB]{211,211,211}{10.3} & \cellcolor[RGB]{211,211,211}{44.4} & \cellcolor[RGB]{211,211,211}{6.3} & \cellcolor[RGB]{211,211,211}{17.5} & \cellcolor[RGB]{211,211,211}{46.0} & \cellcolor[RGB]{211,211,211}{60.3} & \cellcolor[RGB]{211,211,211}{24.6} & \cellcolor[RGB]{211,211,211}{51.6} &                           78.8 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{30.6} &                  \textbf{95.2} &                           79.8 & \cellcolor[RGB]{211,211,211}{28.2} & \cellcolor[RGB]{211,211,211}{3.2} & \cellcolor[RGB]{211,211,211}{2.4} & \cellcolor[RGB]{211,211,211}{0.8} & \cellcolor[RGB]{211,211,211}{12.1} & \cellcolor[RGB]{211,211,211}{20.2} & \cellcolor[RGB]{211,211,211}{45.2} & \cellcolor[RGB]{211,211,211}{58.9} &                           77.5 &                          100.0  \\
