                 % Body rows of a results table: one header row followed by six data rows.
                 % Column layout: initial model | dataset | 11 model columns | random | human.
                 % Requires \cellcolor (colortbl, or xcolor with the [table] option) and
                 % \scalebox (graphicx) to be loaded by the including document.
                 %
                 % Markup conventions as they appear in the rows below:
                 %   * \textbf{...} marks the highest value among the 11 model columns of a row
                 %     (the `random` and `human` columns are never bolded, even when higher).
                 %   * \cellcolor[RGB]{211,211,211}{...} (light gray) shades model cells whose value
                 %     is at or below the `random` value of the same row.
                 %     NOTE(review): ties are shaded inconsistently -- the 55.0 cell in the first
                 %     MathGen row equals `random` and is shaded, while the 57.9 cell in the first
                 %     AnsCls row equals `random` and is not. Presumably shading was decided on
                 %     unrounded scores; confirm against the generating script.
                 %   * \scalebox{0.9}[1]{...} narrows the dataset labels horizontally to 90%.
                 %
                 % NOTE(review): the `initial model` cell is empty in every row, so the two
                 % MathGen/FgFactV/AnsCls groups cannot be told apart from this file alone.
                 % Presumably a group label (e.g. a \multirow) was dropped or is added
                 % elsewhere -- confirm before publishing.
                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                 % First group of three datasets (initial-model label missing, see note above).
                               &     \scalebox{0.9}[1]{MathGen} &                           57.5 & \cellcolor[RGB]{211,211,211}{55.0} &                           67.5 &                           72.4 &                           66.4 &                           76.3 &                           69.4 & \cellcolor[RGB]{211,211,211}{43.8} &                  \textbf{90.8} &                           90.3 &                           80.9 &                           55.0 &                          100.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           56.5 & \cellcolor[RGB]{211,211,211}{53.1} & \cellcolor[RGB]{211,211,211}{55.5} & \cellcolor[RGB]{211,211,211}{52.1} & \cellcolor[RGB]{211,211,211}{55.2} &                           64.3 & \cellcolor[RGB]{211,211,211}{46.9} &                           61.8 &                           72.3 &                  \textbf{88.3} &                           80.4 &                           55.7 &                           95.5  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                           59.6 &                           57.9 &                           59.4 &                           60.4 &                           65.0 & \cellcolor[RGB]{211,211,211}{55.7} &                           62.2 &                           73.7 &                           70.4 &                           90.5 &                  \textbf{91.6} &                           57.9 &                           94.7  \\
                 % Second group: same three datasets repeated (initial-model label missing, see note above).
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{72.3} & \cellcolor[RGB]{211,211,211}{74.5} &                           84.4 &                           87.0 &                           92.9 &                  \textbf{93.5} &                           82.9 &                           87.3 &                           91.4 &                           93.2 &                           90.3 &                           75.0 &                          100.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           82.1 &                           79.8 &                           80.5 &                           96.3 &                           81.5 & \cellcolor[RGB]{211,211,211}{73.6} &                           98.7 &                           84.7 &                  \textbf{99.2} &                           84.7 &                           92.4 &                           78.8 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{77.3} &                           78.9 & \cellcolor[RGB]{211,211,211}{73.6} &                           83.6 & \cellcolor[RGB]{211,211,211}{72.6} & \cellcolor[RGB]{211,211,211}{70.0} & \cellcolor[RGB]{211,211,211}{67.5} & \cellcolor[RGB]{211,211,211}{67.6} &                 \textbf{100.0} &                           95.3 &                           93.8 &                           77.5 &                           96.7  \\
