                 initial model &                        dataset &                    gemma-7b-it &            Llama-2-13b-chat-hf &            Llama-2-70b-chat-hf &     Mixtral-8x7B-Instruct-v0.1 &               Qwen1.5-14B-Chat &               Qwen1.5-72B-Chat &             gpt-3.5-turbo-0125 &             gemini-1.0-pro-001 &         claude-3-opus-20240229 &                     gpt-4-0613 &             gpt-4-0125-preview &                         random &                          human  \\
% Column layout (15 columns, per the header row above):
%   initial model | dataset | 11 model columns | random | human
% Cell markup used in the data rows below:
%   \textbf{...}                       -- the highest value among the 11 model columns of that row
%   \cellcolor[RGB]{211,211,211}{...}  -- light-gray background; in every row shown here it marks
%                                         a value lower than that row's "random" column
%   \scalebox{0.9}[1]{...}             -- squeezes the dataset label to 90% width, height unchanged
% The preamble must provide \cellcolor (colortbl, or xcolor with the [table] option) and \scalebox (graphicx).
% NOTE(review): the "initial model" cell is empty in every data row, yet the three datasets
% (MathGen, FgFactV, AnsCls) appear twice. Presumably each group of three rows belongs to a
% different initial model whose label (e.g. a \multirow cell) is missing or supplied elsewhere -- confirm.
                               &     \scalebox{0.9}[1]{MathGen} &                           64.9 &                           69.1 &                           71.0 & \cellcolor[RGB]{211,211,211}{45.0} & \cellcolor[RGB]{211,211,211}{29.7} & \cellcolor[RGB]{211,211,211}{37.4} &                           64.7 &                           67.1 &                           67.7 &                           63.9 &                  \textbf{72.5} &                           55.0 &                           90.0  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           62.4 &                           69.2 &                  \textbf{71.6} & \cellcolor[RGB]{211,211,211}{29.9} & \cellcolor[RGB]{211,211,211}{53.3} & \cellcolor[RGB]{211,211,211}{19.4} & \cellcolor[RGB]{211,211,211}{50.0} &                           58.4 &                           63.4 & \cellcolor[RGB]{211,211,211}{9.6} & \cellcolor[RGB]{211,211,211}{24.7} &                           55.7 &                           95.5  \\
                               &      \scalebox{0.9}[1]{AnsCls} &                           61.3 &                  \textbf{74.1} &                           70.8 & \cellcolor[RGB]{211,211,211}{37.0} & \cellcolor[RGB]{211,211,211}{47.6} & \cellcolor[RGB]{211,211,211}{26.8} & \cellcolor[RGB]{211,211,211}{26.5} & \cellcolor[RGB]{211,211,211}{53.6} & \cellcolor[RGB]{211,211,211}{46.3} & \cellcolor[RGB]{211,211,211}{19.8} & \cellcolor[RGB]{211,211,211}{21.7} &                           57.9 &                           85.7  \\
% Second group of three rows: same datasets and column layout as the three rows above.
                               &     \scalebox{0.9}[1]{MathGen} &                           84.6 &                           79.5 &                           85.3 & \cellcolor[RGB]{211,211,211}{58.8} & \cellcolor[RGB]{211,211,211}{40.8} & \cellcolor[RGB]{211,211,211}{63.7} & \cellcolor[RGB]{211,211,211}{72.2} & \cellcolor[RGB]{211,211,211}{74.5} &                           86.8 &                           88.1 &                  \textbf{90.2} &                           75.0 &                           98.3  \\
                               &     \scalebox{0.9}[1]{FgFactV} &                           80.2 &                           84.6 &                  \textbf{85.3} & \cellcolor[RGB]{211,211,211}{18.7} & \cellcolor[RGB]{211,211,211}{57.1} & \cellcolor[RGB]{211,211,211}{11.9} & \cellcolor[RGB]{211,211,211}{29.7} & \cellcolor[RGB]{211,211,211}{59.2} & \cellcolor[RGB]{211,211,211}{74.9} & \cellcolor[RGB]{211,211,211}{38.3} & \cellcolor[RGB]{211,211,211}{66.7} &                           78.8 &                          100.0  \\
                               &      \scalebox{0.9}[1]{AnsCls} & \cellcolor[RGB]{211,211,211}{44.4} &                  \textbf{85.2} &                           81.8 & \cellcolor[RGB]{211,211,211}{41.7} & \cellcolor[RGB]{211,211,211}{6.2} & \cellcolor[RGB]{211,211,211}{4.7} & \cellcolor[RGB]{211,211,211}{1.6} & \cellcolor[RGB]{211,211,211}{20.5} & \cellcolor[RGB]{211,211,211}{33.6} & \cellcolor[RGB]{211,211,211}{61.2} & \cellcolor[RGB]{211,211,211}{73.0} &                           77.5 &                           98.3  \\
