                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{46.4} & \cellcolor[RGB]{211,211,211}{44.3} & \cellcolor[RGB]{211,211,211}{46.4} &                           51.4 & \cellcolor[RGB]{211,211,211}{49.3} & \cellcolor[RGB]{211,211,211}{48.6} &                           55.0 & \cellcolor[RGB]{211,211,211}{44.3} &                           57.1 &                           65.0 &                  \textbf{67.9} &                           50.5 &                           88.2  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           52.1 & \cellcolor[RGB]{211,211,211}{44.3} & \cellcolor[RGB]{211,211,211}{47.1} & \cellcolor[RGB]{211,211,211}{38.6} & \cellcolor[RGB]{211,211,211}{45.0} & \cellcolor[RGB]{211,211,211}{45.7} & \cellcolor[RGB]{211,211,211}{42.9} & \cellcolor[RGB]{211,211,211}{50.0} &                  \textbf{53.6} & \cellcolor[RGB]{211,211,211}{47.1} & \cellcolor[RGB]{211,211,211}{45.7} &                           50.7 &                           94.3  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{49.3} & \cellcolor[RGB]{211,211,211}{47.9} &                  \textbf{55.0} & \cellcolor[RGB]{211,211,211}{43.6} & \cellcolor[RGB]{211,211,211}{48.6} & \cellcolor[RGB]{211,211,211}{42.1} & \cellcolor[RGB]{211,211,211}{42.1} & \cellcolor[RGB]{211,211,211}{47.1} & \cellcolor[RGB]{211,211,211}{48.6} & \cellcolor[RGB]{211,211,211}{46.4} & \cellcolor[RGB]{211,211,211}{49.3} &                           51.2 &                           82.9  \\
                               &     \scalebox{0.9}[1]{MathGen} & \cellcolor[RGB]{211,211,211}{26.2} & \cellcolor[RGB]{211,211,211}{36.9} & \cellcolor[RGB]{211,211,211}{28.1} & \cellcolor[RGB]{211,211,211}{36.2} & \cellcolor[RGB]{211,211,211}{30.0} & \cellcolor[RGB]{211,211,211}{38.1} & \cellcolor[RGB]{211,211,211}{46.2} & \cellcolor[RGB]{211,211,211}{30.0} &                           69.4 &                           78.1 &                  \textbf{83.1} &                           62.5 &                           97.1  \\
                               &     \scalebox{0.9}[1]{FgFactV} & \cellcolor[RGB]{211,211,211}{38.1} & \cellcolor[RGB]{211,211,211}{56.9} & \cellcolor[RGB]{211,211,211}{\textbf{57.5}} & \cellcolor[RGB]{211,211,211}{23.8} & \cellcolor[RGB]{211,211,211}{29.4} & \cellcolor[RGB]{211,211,211}{21.2} & \cellcolor[RGB]{211,211,211}{21.9} & \cellcolor[RGB]{211,211,211}{26.9} & \cellcolor[RGB]{211,211,211}{28.1} & \cellcolor[RGB]{211,211,211}{26.9} & \cellcolor[RGB]{211,211,211}{50.0} &                           66.5 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{25.6} &                  \textbf{69.4} & \cellcolor[RGB]{211,211,211}{22.5} & \cellcolor[RGB]{211,211,211}{20.6} & \cellcolor[RGB]{211,211,211}{23.1} & \cellcolor[RGB]{211,211,211}{22.5} & \cellcolor[RGB]{211,211,211}{22.5} & \cellcolor[RGB]{211,211,211}{22.5} & \cellcolor[RGB]{211,211,211}{28.7} & \cellcolor[RGB]{211,211,211}{59.4} & \cellcolor[RGB]{211,211,211}{64.4} &                           65.1 &                           97.1  \\
