                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{50.3} & \cellcolor[RGB]{211,211,211}{53.2} &                  \textbf{75.3} & \cellcolor[RGB]{211,211,211}{37.7} & \cellcolor[RGB]{211,211,211}{49.0} & \cellcolor[RGB]{211,211,211}{23.4} &                           65.3 & \cellcolor[RGB]{211,211,211}{39.9} & \cellcolor[RGB]{211,211,211}{38.3} & \cellcolor[RGB]{211,211,211}{51.6} &                           61.0 &                           55.0 &                           81.8  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           62.8 &                           71.5 &                  \textbf{83.3} & \cellcolor[RGB]{211,211,211}{42.6} &                           61.9 & \cellcolor[RGB]{211,211,211}{17.6} & \cellcolor[RGB]{211,211,211}{38.5} & \cellcolor[RGB]{211,211,211}{40.4} & \cellcolor[RGB]{211,211,211}{40.1} & \cellcolor[RGB]{211,211,211}{6.7} & \cellcolor[RGB]{211,211,211}{11.5} &                           55.7 &                           95.5  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                           58.6 &                  \textbf{80.2} &                           78.7 & \cellcolor[RGB]{211,211,211}{30.9} & \cellcolor[RGB]{211,211,211}{54.6} & \cellcolor[RGB]{211,211,211}{9.9} & \cellcolor[RGB]{211,211,211}{18.8} & \cellcolor[RGB]{211,211,211}{31.8} & \cellcolor[RGB]{211,211,211}{26.9} & \cellcolor[RGB]{211,211,211}{12.3} & \cellcolor[RGB]{211,211,211}{13.6} &                           57.9 &                           78.3  \\
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{50.6} & \cellcolor[RGB]{211,211,211}{50.4} & \cellcolor[RGB]{211,211,211}{72.7} & \cellcolor[RGB]{211,211,211}{45.8} & \cellcolor[RGB]{211,211,211}{47.9} & \cellcolor[RGB]{211,211,211}{39.2} & \cellcolor[RGB]{211,211,211}{67.1} & \cellcolor[RGB]{211,211,211}{47.3} & \cellcolor[RGB]{211,211,211}{74.6} &                           82.7 &                  \textbf{87.9} &                           75.0 &                           96.7  \\
                               &     \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{61.9} & \cellcolor[RGB]{211,211,211}{77.4} &                  \textbf{82.5} & \cellcolor[RGB]{211,211,211}{24.8} & \cellcolor[RGB]{211,211,211}{61.1} & \cellcolor[RGB]{211,211,211}{11.1} & \cellcolor[RGB]{211,211,211}{24.2} & \cellcolor[RGB]{211,211,211}{31.7} & \cellcolor[RGB]{211,211,211}{32.5} & \cellcolor[RGB]{211,211,211}{25.2} & \cellcolor[RGB]{211,211,211}{54.0} &                           78.8 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{24.2} &                  \textbf{78.2} & \cellcolor[RGB]{211,211,211}{46.4} & \cellcolor[RGB]{211,211,211}{20.6} & \cellcolor[RGB]{211,211,211}{45.0} & \cellcolor[RGB]{211,211,211}{2.4} & \cellcolor[RGB]{211,211,211}{2.0} & \cellcolor[RGB]{211,211,211}{9.5} & \cellcolor[RGB]{211,211,211}{15.1} & \cellcolor[RGB]{211,211,211}{48.4} & \cellcolor[RGB]{211,211,211}{63.9} &                           77.5 &                          100.0  \\
