                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{46.4} & \cellcolor[RGB]{211,211,211}{51.6} &                           55.0 &                           63.6 &                           56.7 &                           67.3 &                           63.0 &                           57.6 &                           86.8 &                  \textbf{87.3} &                           74.6 &                           55.0 &                          100.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{55.0} &                           55.9 &                           55.7 &                           57.9 &                           56.0 &                           57.1 &                           60.4 &                           58.3 &                           71.0 &                  \textbf{87.5} &                           73.3 &                           55.7 &                           95.5  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                           60.6 &                           58.4 & \cellcolor[RGB]{211,211,211}{57.1} & \cellcolor[RGB]{211,211,211}{56.9} & \cellcolor[RGB]{211,211,211}{57.1} &                           70.0 &                           58.5 &                           71.2 &                           65.0 &                  \textbf{92.3} &                  \textbf{92.3} &                           57.9 &                           94.7  \\
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{56.9} & \cellcolor[RGB]{211,211,211}{72.8} &                           77.1 &                           77.2 &                           83.2 &                           87.2 &                           80.0 &                           80.5 &                           90.5 &                  \textbf{92.0} &                           87.2 &                           75.0 &                          100.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           83.2 &                           79.4 & \cellcolor[RGB]{211,211,211}{78.5} &                           88.9 &                           82.5 &                           94.6 &                  \textbf{98.7} &                           80.9 &                           98.2 &                           86.4 &                           92.4 &                           78.8 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                           79.6 &                           78.6 &                           81.0 &                           79.1 &                           78.7 &                           80.0 & \cellcolor[RGB]{211,211,211}{70.0} & \cellcolor[RGB]{211,211,211}{72.2} &                 \textbf{100.0} &                           95.2 &                           92.0 &                           77.5 &                           96.7  \\
