                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                               &     \scalebox{0.9}[1]{MathGen} &                           75.0 & \cellcolor[RGB]{211,211,211}{50.0} &                 \textbf{100.0} &                           80.0 &                           71.4 &                          100.0 &                           76.9 & \cellcolor[RGB]{211,211,211}{0.0} &                          100.0 &                           93.8 &                           86.4 &                           55.0 &                          100.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           60.0 & \cellcolor[RGB]{211,211,211}{51.6} & \cellcolor[RGB]{211,211,211}{53.0} & \cellcolor[RGB]{211,211,211}{31.2} & \cellcolor[RGB]{211,211,211}{52.8} &                           75.0 & \cellcolor[RGB]{211,211,211}{0.0} &                           72.2 &                           84.2 &                 \textbf{100.0} &                           75.0 &                           55.7 &                           95.5  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                           61.4 & \cellcolor[RGB]{211,211,211}{55.6} &                           66.1 &                           66.7 &                           76.5 & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{50.0} &                           81.8 &                           71.4 &                           87.5 &                  \textbf{91.7} &                           57.9 &                           94.7  \\
                               &     \scalebox{0.9}[1]{MathGen} &                           75.0 &                           77.8 &                 \textbf{100.0} &                           95.2 &                          100.0 &                          100.0 &                           87.0 &                          100.0 &                           94.9 &                           93.8 &                           92.7 &                           75.0 &                          100.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{78.7} &                           79.4 &                           83.0 &                 \textbf{100.0} &                           81.0 & \cellcolor[RGB]{211,211,211}{0.0} &                          100.0 &                           90.9 &                          100.0 & \cellcolor[RGB]{211,211,211}{76.5} &                           89.7 &                           78.8 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{72.7} &                           80.5 & \cellcolor[RGB]{211,211,211}{50.0} &                 \textbf{100.0} & \cellcolor[RGB]{211,211,211}{66.7} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{0.0} & \cellcolor[RGB]{211,211,211}{50.0} &                          100.0 &                           96.8 &                           95.9 &                           77.5 &                           96.7  \\
